trace_event_perf.c

/*
 * trace event based perf event profiling/tracing
 *
 * Copyright (C) 2009 Red Hat Inc, Peter Zijlstra <pzijlstr@redhat.com>
 * Copyright (C) 2009-2010 Frederic Weisbecker <fweisbec@gmail.com>
 */

#include <linux/module.h>
#include <linux/kprobes.h>
#include "trace.h"

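/*
 * Per-cpu scratch buffers used to build raw sample payloads, one per
 * perf recursion context so nested events cannot clobber each other.
 */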
static char __percpu *perf_trace_buf[PERF_NR_CONTEXTS];

/*
 * Force it to be aligned to unsigned long to avoid misaligned access
 * surprises
 */
typedef typeof(unsigned long [PERF_MAX_TRACE_SIZE / sizeof(unsigned long)])
	perf_trace_t;

/* Count the events in use (per event id, not per instance) */
static int	total_ref_count;

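/*
 * Decide whether @p_event may use @tp_event: run the event's own
 * ->perf_perm() hook, restrict the function trace event and raw
 * tracepoint payloads to sufficiently privileged users, and always
 * allow pure counting (no PERF_SAMPLE_RAW).
 */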
static int perf_trace_event_perm(struct ftrace_event_call *tp_event,
				 struct perf_event *p_event)
{
	if (tp_event->perf_perm) {
		int ret = tp_event->perf_perm(tp_event, p_event);
		if (ret)
			return ret;
	}

	/* The ftrace function trace is allowed only for root. */
	if (ftrace_event_is_function(tp_event)) {
		if (perf_paranoid_tracepoint_raw() && !capable(CAP_SYS_ADMIN))
			return -EPERM;

		/*
		 * We don't allow user space callchains for the function
		 * trace event, due to issues with page faults while tracing
		 * the page fault handler and its overall tricky nature.
		 */
		if (!p_event->attr.exclude_callchain_user)
			return -EINVAL;

		/*
		 * Same reason to disable user stack dump as for user space
		 * callchains above.
		 */
		if (p_event->attr.sample_type & PERF_SAMPLE_STACK_USER)
			return -EINVAL;
	}

	/* No tracing, just counting, so no obvious leak */
	if (!(p_event->attr.sample_type & PERF_SAMPLE_RAW))
		return 0;

	/* Some events are ok to be traced by non-root users... */
	if (p_event->attach_state == PERF_ATTACH_TASK) {
		if (tp_event->flags & TRACE_EVENT_FL_CAP_ANY)
			return 0;
	}

	/*
	 * ...otherwise raw tracepoint data can be a severe data leak;
	 * only allow root to have these.
	 */
	if (perf_paranoid_tracepoint_raw() && !capable(CAP_SYS_ADMIN))
		return -EPERM;

	return 0;
}

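/*
 * First-time registration of a trace event for perf: allocate the
 * per-cpu hlist of attached perf events, allocate the shared per-cpu
 * sample buffers if this is the first trace event system-wide, then
 * call the class ->reg() hook.  Later callers only bump perf_refcount.
 */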
static int perf_trace_event_reg(struct ftrace_event_call *tp_event,
				struct perf_event *p_event)
{
	struct hlist_head __percpu *list;
	int ret = -ENOMEM;
	int cpu;

	p_event->tp_event = tp_event;
	if (tp_event->perf_refcount++ > 0)
		return 0;

	list = alloc_percpu(struct hlist_head);
	if (!list)
		goto fail;

	for_each_possible_cpu(cpu)
		INIT_HLIST_HEAD(per_cpu_ptr(list, cpu));

	tp_event->perf_events = list;

	if (!total_ref_count) {
		char __percpu *buf;
		int i;

		for (i = 0; i < PERF_NR_CONTEXTS; i++) {
			buf = (char __percpu *)alloc_percpu(perf_trace_t);
			if (!buf)
				goto fail;

			perf_trace_buf[i] = buf;
		}
	}

	ret = tp_event->class->reg(tp_event, TRACE_REG_PERF_REGISTER, NULL);
	if (ret)
		goto fail;

	total_ref_count++;
	return 0;

fail:
	if (!total_ref_count) {
		int i;

		for (i = 0; i < PERF_NR_CONTEXTS; i++) {
			free_percpu(perf_trace_buf[i]);
			perf_trace_buf[i] = NULL;
		}
	}

	if (!--tp_event->perf_refcount) {
		free_percpu(tp_event->perf_events);
		tp_event->perf_events = NULL;
	}

	return ret;
}

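/*
 * Drop one reference on the trace event.  On the last one, unregister
 * from the class, wait for in-flight callbacks, and free the per-cpu
 * lists (and the shared buffers once no trace event uses them).  The
 * module reference taken in perf_trace_init() is always released.
 */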
static void perf_trace_event_unreg(struct perf_event *p_event)
{
	struct ftrace_event_call *tp_event = p_event->tp_event;
	int i;

	if (--tp_event->perf_refcount > 0)
		goto out;

	tp_event->class->reg(tp_event, TRACE_REG_PERF_UNREGISTER, NULL);

	/*
	 * Ensure our callback won't be called anymore. The buffers
	 * will be freed after that.
	 */
	tracepoint_synchronize_unregister();

	free_percpu(tp_event->perf_events);
	tp_event->perf_events = NULL;

	if (!--total_ref_count) {
		for (i = 0; i < PERF_NR_CONTEXTS; i++) {
			free_percpu(perf_trace_buf[i]);
			perf_trace_buf[i] = NULL;
		}
	}
out:
	module_put(tp_event->mod);
}

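/* Thin wrappers forwarding perf open/close to the event class' ->reg() hook. */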
static int perf_trace_event_open(struct perf_event *p_event)
{
	struct ftrace_event_call *tp_event = p_event->tp_event;
	return tp_event->class->reg(tp_event, TRACE_REG_PERF_OPEN, p_event);
}

static void perf_trace_event_close(struct perf_event *p_event)
{
	struct ftrace_event_call *tp_event = p_event->tp_event;
	tp_event->class->reg(tp_event, TRACE_REG_PERF_CLOSE, p_event);
}

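/*
 * Full setup path for one perf event: permission check, registration
 * and open.  A failed open unwinds the registration.
 */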
static int perf_trace_event_init(struct ftrace_event_call *tp_event,
				 struct perf_event *p_event)
{
	int ret;

	ret = perf_trace_event_perm(tp_event, p_event);
	if (ret)
		return ret;

	ret = perf_trace_event_reg(tp_event, p_event);
	if (ret)
		return ret;

	ret = perf_trace_event_open(p_event);
	if (ret) {
		perf_trace_event_unreg(p_event);
		return ret;
	}

	return 0;
}

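/*
 * Entry point for the perf core: look up the trace event whose id
 * matches attr.config, pin its module and initialize the perf event,
 * all under event_mutex.
 */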
int perf_trace_init(struct perf_event *p_event)
{
	struct ftrace_event_call *tp_event;
	u64 event_id = p_event->attr.config;
	int ret = -EINVAL;

	mutex_lock(&event_mutex);
	list_for_each_entry(tp_event, &ftrace_events, list) {
		if (tp_event->event.type == event_id &&
		    tp_event->class && tp_event->class->reg &&
		    try_module_get(tp_event->mod)) {
			ret = perf_trace_event_init(tp_event, p_event);
			if (ret)
				module_put(tp_event->mod);
			break;
		}
	}
	mutex_unlock(&event_mutex);

	return ret;
}

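/* Counterpart of perf_trace_init(): close and unregister under event_mutex. */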
void perf_trace_destroy(struct perf_event *p_event)
{
	mutex_lock(&event_mutex);
	perf_trace_event_close(p_event);
	perf_trace_event_unreg(p_event);
	mutex_unlock(&event_mutex);
}

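/*
 * Event is being scheduled in on this CPU: put it on the per-cpu hlist
 * so the tracepoint callback can find it, honour PERF_EF_START, then
 * notify the class.
 */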
int perf_trace_add(struct perf_event *p_event, int flags)
{
	struct ftrace_event_call *tp_event = p_event->tp_event;
	struct hlist_head __percpu *pcpu_list;
	struct hlist_head *list;

	pcpu_list = tp_event->perf_events;
	if (WARN_ON_ONCE(!pcpu_list))
		return -EINVAL;

	if (!(flags & PERF_EF_START))
		p_event->hw.state = PERF_HES_STOPPED;

	list = this_cpu_ptr(pcpu_list);
	hlist_add_head_rcu(&p_event->hlist_entry, list);

	return tp_event->class->reg(tp_event, TRACE_REG_PERF_ADD, p_event);
}

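/* Scheduled out on this CPU: unhook from the per-cpu hlist and notify the class. */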
void perf_trace_del(struct perf_event *p_event, int flags)
{
	struct ftrace_event_call *tp_event = p_event->tp_event;
	hlist_del_rcu(&p_event->hlist_entry);
	tp_event->class->reg(tp_event, TRACE_REG_PERF_DEL, p_event);
}

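/*
 * Reserve the per-cpu buffer for the current recursion context, fill
 * in the common trace_entry header and hand the buffer to the caller;
 * the recursion context is returned through @rctxp so the caller can
 * pass it on to perf_trace_buf_submit().
 */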
__kprobes void *perf_trace_buf_prepare(int size, unsigned short type,
				       struct pt_regs *regs, int *rctxp)
{
	struct trace_entry *entry;
	unsigned long flags;
	char *raw_data;
	int pc;

	BUILD_BUG_ON(PERF_MAX_TRACE_SIZE % sizeof(unsigned long));

	if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE,
			"perf buffer not large enough"))
		return NULL;

	pc = preempt_count();

	*rctxp = perf_swevent_get_recursion_context();
	if (*rctxp < 0)
		return NULL;

	raw_data = this_cpu_ptr(perf_trace_buf[*rctxp]);

	/* zero the dead bytes from align to not leak stack to user */
	memset(&raw_data[size - sizeof(u64)], 0, sizeof(u64));

	entry = (struct trace_entry *)raw_data;
	local_save_flags(flags);
	tracing_generic_entry_update(entry, flags, pc);
	entry->type = type;

	return raw_data;
}
EXPORT_SYMBOL_GPL(perf_trace_buf_prepare);

#ifdef CONFIG_FUNCTION_TRACER
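/*
 * ftrace callback for perf function-trace events: build a TRACE_FN
 * entry in the perf buffer and submit it against this CPU's list of
 * attached events.
 */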
static void
perf_ftrace_function_call(unsigned long ip, unsigned long parent_ip,
			  struct ftrace_ops *ops, struct pt_regs *pt_regs)
{
	struct ftrace_entry *entry;
	struct hlist_head *head;
	struct pt_regs regs;
	int rctx;

	head = this_cpu_ptr(event_function.perf_events);
	if (hlist_empty(head))
		return;

#define ENTRY_SIZE (ALIGN(sizeof(struct ftrace_entry) + sizeof(u32), \
		    sizeof(u64)) - sizeof(u32))

	BUILD_BUG_ON(ENTRY_SIZE > PERF_MAX_TRACE_SIZE);

	perf_fetch_caller_regs(&regs);

	entry = perf_trace_buf_prepare(ENTRY_SIZE, TRACE_FN, NULL, &rctx);
	if (!entry)
		return;

	entry->ip = ip;
	entry->parent_ip = parent_ip;
	perf_trace_buf_submit(entry, ENTRY_SIZE, rctx, 0,
			      1, &regs, head, NULL);

#undef ENTRY_SIZE
}

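/*
 * Per-event ftrace_ops management: open/close register and unregister
 * the ops (marked FTRACE_OPS_FL_CONTROL), add/del toggle them locally
 * on the current CPU.
 */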
static int perf_ftrace_function_register(struct perf_event *event)
{
	struct ftrace_ops *ops = &event->ftrace_ops;

	ops->flags |= FTRACE_OPS_FL_CONTROL;
	ops->func = perf_ftrace_function_call;
	return register_ftrace_function(ops);
}

static int perf_ftrace_function_unregister(struct perf_event *event)
{
	struct ftrace_ops *ops = &event->ftrace_ops;
	int ret = unregister_ftrace_function(ops);

	ftrace_free_filter(ops);
	return ret;
}

static void perf_ftrace_function_enable(struct perf_event *event)
{
	ftrace_function_local_enable(&event->ftrace_ops);
}

static void perf_ftrace_function_disable(struct perf_event *event)
{
	ftrace_function_local_disable(&event->ftrace_ops);
}

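/*
 * ->reg() implementation for the function trace event: map the
 * TRACE_REG_PERF_* requests onto the ftrace_ops helpers above.
 */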
int perf_ftrace_event_register(struct ftrace_event_call *call,
			       enum trace_reg type, void *data)
{
	switch (type) {
	case TRACE_REG_REGISTER:
	case TRACE_REG_UNREGISTER:
		break;
	case TRACE_REG_PERF_REGISTER:
	case TRACE_REG_PERF_UNREGISTER:
		return 0;
	case TRACE_REG_PERF_OPEN:
		return perf_ftrace_function_register(data);
	case TRACE_REG_PERF_CLOSE:
		return perf_ftrace_function_unregister(data);
	case TRACE_REG_PERF_ADD:
		perf_ftrace_function_enable(data);
		return 0;
	case TRACE_REG_PERF_DEL:
		perf_ftrace_function_disable(data);
		return 0;
	}

	return -EINVAL;
}
#endif /* CONFIG_FUNCTION_TRACER */