stat-shadow.c

// SPDX-License-Identifier: GPL-2.0
#include <stdio.h>
#include "evsel.h"
#include "stat.h"
#include "color.h"
#include "pmu.h"
#include "rblist.h"
#include "evlist.h"
#include "expr.h"
#include "metricgroup.h"

/*
 * AGGR_GLOBAL: Use CPU 0
 * AGGR_SOCKET: Use first CPU of socket
 * AGGR_CORE: Use first CPU of core
 * AGGR_NONE: Use matching CPU
 * AGGR_THREAD: Not supported?
 */

static bool have_frontend_stalled;

struct runtime_stat rt_stat;
struct stats walltime_nsecs_stats;

struct saved_value {
	struct rb_node rb_node;
	struct perf_evsel *evsel;
	enum stat_type type;
	int ctx;
	int cpu;
	struct runtime_stat *stat;
	struct stats stats;
};

static int saved_value_cmp(struct rb_node *rb_node, const void *entry)
{
	struct saved_value *a = container_of(rb_node,
			struct saved_value,
			rb_node);
	const struct saved_value *b = entry;

	if (a->cpu != b->cpu)
		return a->cpu - b->cpu;

	/*
	 * Previously the rbtree was used to link generic metrics.
	 * The keys were evsel/cpu. Now the rbtree is extended to support
	 * per-thread shadow stats. For shadow stats case, the keys
	 * are cpu/type/ctx/stat (evsel is NULL). For generic metrics
	 * case, the keys are still evsel/cpu (type/ctx/stat are 0 or NULL).
	 */
	if (a->type != b->type)
		return a->type - b->type;

	if (a->ctx != b->ctx)
		return a->ctx - b->ctx;

	if (a->evsel == NULL && b->evsel == NULL) {
		if (a->stat == b->stat)
			return 0;

		if ((char *)a->stat < (char *)b->stat)
			return -1;

		return 1;
	}

	if (a->evsel == b->evsel)
		return 0;

	if ((char *)a->evsel < (char *)b->evsel)
		return -1;

	return +1;
}

static struct rb_node *saved_value_new(struct rblist *rblist __maybe_unused,
		const void *entry)
{
	struct saved_value *nd = malloc(sizeof(struct saved_value));

	if (!nd)
		return NULL;

	memcpy(nd, entry, sizeof(struct saved_value));
	return &nd->rb_node;
}

static void saved_value_delete(struct rblist *rblist __maybe_unused,
		struct rb_node *rb_node)
{
	struct saved_value *v;

	BUG_ON(!rb_node);
	v = container_of(rb_node, struct saved_value, rb_node);
	free(v);
}

static struct saved_value *saved_value_lookup(struct perf_evsel *evsel,
		int cpu,
		bool create,
		enum stat_type type,
		int ctx,
		struct runtime_stat *st)
{
	struct rblist *rblist;
	struct rb_node *nd;
	struct saved_value dm = {
		.cpu = cpu,
		.evsel = evsel,
		.type = type,
		.ctx = ctx,
		.stat = st,
	};

	rblist = &st->value_list;

	nd = rblist__find(rblist, &dm);
	if (nd)
		return container_of(nd, struct saved_value, rb_node);
	if (create) {
		rblist__add_node(rblist, &dm);
		nd = rblist__find(rblist, &dm);
		if (nd)
			return container_of(nd, struct saved_value, rb_node);
	}
	return NULL;
}
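
/*
 * Illustrative sketch only (not referenced elsewhere in this file): the two
 * key shapes the tree supports, per the comment in saved_value_cmp(). A
 * generic-metric value is keyed by evsel/cpu; a per-thread shadow stat is
 * keyed by cpu/type/ctx with a NULL evsel.
 */
#if 0
static void saved_value_lookup_example(struct runtime_stat *st,
		struct perf_evsel *evsel)
{
	/* generic metric value: keyed by evsel + cpu */
	struct saved_value *m = saved_value_lookup(evsel, 0, false,
			STAT_NONE, 0, st);
	/* shadow stat value: evsel is NULL, keyed by cpu + type + ctx */
	struct saved_value *s = saved_value_lookup(NULL, 0, false,
			STAT_CYCLES, 0, st);

	(void)m;
	(void)s;
}
#endif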

void runtime_stat__init(struct runtime_stat *st)
{
	struct rblist *rblist = &st->value_list;

	rblist__init(rblist);
	rblist->node_cmp = saved_value_cmp;
	rblist->node_new = saved_value_new;
	rblist->node_delete = saved_value_delete;
}

void runtime_stat__exit(struct runtime_stat *st)
{
	rblist__exit(&st->value_list);
}

void perf_stat__init_shadow_stats(void)
{
	have_frontend_stalled = pmu_have_event("cpu", "stalled-cycles-frontend");
	runtime_stat__init(&rt_stat);
}

static int evsel_context(struct perf_evsel *evsel)
{
	int ctx = 0;

	if (evsel->attr.exclude_kernel)
		ctx |= CTX_BIT_KERNEL;
	if (evsel->attr.exclude_user)
		ctx |= CTX_BIT_USER;
	if (evsel->attr.exclude_hv)
		ctx |= CTX_BIT_HV;
	if (evsel->attr.exclude_host)
		ctx |= CTX_BIT_HOST;
	if (evsel->attr.exclude_idle)
		ctx |= CTX_BIT_IDLE;

	return ctx;
}

static void reset_stat(struct runtime_stat *st)
{
	struct rblist *rblist;
	struct rb_node *pos, *next;

	rblist = &st->value_list;
	next = rb_first(&rblist->entries);
	while (next) {
		pos = next;
		next = rb_next(pos);
		memset(&container_of(pos, struct saved_value, rb_node)->stats,
				0,
				sizeof(struct stats));
	}
}

void perf_stat__reset_shadow_stats(void)
{
	reset_stat(&rt_stat);
	memset(&walltime_nsecs_stats, 0, sizeof(walltime_nsecs_stats));
}

void perf_stat__reset_shadow_per_stat(struct runtime_stat *st)
{
	reset_stat(st);
}

static void update_runtime_stat(struct runtime_stat *st,
		enum stat_type type,
		int ctx, int cpu, u64 count)
{
	struct saved_value *v = saved_value_lookup(NULL, cpu, true,
			type, ctx, st);

	if (v)
		update_stats(&v->stats, count);
}

/*
 * Update various tracking values we maintain to print
 * more semantic information such as miss/hit ratios,
 * instruction rates, etc:
 */
void perf_stat__update_shadow_stats(struct perf_evsel *counter, u64 count,
		int cpu, struct runtime_stat *st)
{
	int ctx = evsel_context(counter);

	count *= counter->scale;

	if (perf_evsel__match(counter, SOFTWARE, SW_TASK_CLOCK) ||
	    perf_evsel__match(counter, SOFTWARE, SW_CPU_CLOCK))
		update_runtime_stat(st, STAT_NSECS, 0, cpu, count);
	else if (perf_evsel__match(counter, HARDWARE, HW_CPU_CYCLES))
		update_runtime_stat(st, STAT_CYCLES, ctx, cpu, count);
	else if (perf_stat_evsel__is(counter, CYCLES_IN_TX))
		update_runtime_stat(st, STAT_CYCLES_IN_TX, ctx, cpu, count);
	else if (perf_stat_evsel__is(counter, TRANSACTION_START))
		update_runtime_stat(st, STAT_TRANSACTION, ctx, cpu, count);
	else if (perf_stat_evsel__is(counter, ELISION_START))
		update_runtime_stat(st, STAT_ELISION, ctx, cpu, count);
	else if (perf_stat_evsel__is(counter, TOPDOWN_TOTAL_SLOTS))
		update_runtime_stat(st, STAT_TOPDOWN_TOTAL_SLOTS,
				ctx, cpu, count);
	else if (perf_stat_evsel__is(counter, TOPDOWN_SLOTS_ISSUED))
		update_runtime_stat(st, STAT_TOPDOWN_SLOTS_ISSUED,
				ctx, cpu, count);
	else if (perf_stat_evsel__is(counter, TOPDOWN_SLOTS_RETIRED))
		update_runtime_stat(st, STAT_TOPDOWN_SLOTS_RETIRED,
				ctx, cpu, count);
	else if (perf_stat_evsel__is(counter, TOPDOWN_FETCH_BUBBLES))
		update_runtime_stat(st, STAT_TOPDOWN_FETCH_BUBBLES,
				ctx, cpu, count);
	else if (perf_stat_evsel__is(counter, TOPDOWN_RECOVERY_BUBBLES))
		update_runtime_stat(st, STAT_TOPDOWN_RECOVERY_BUBBLES,
				ctx, cpu, count);
	else if (perf_evsel__match(counter, HARDWARE, HW_STALLED_CYCLES_FRONTEND))
		update_runtime_stat(st, STAT_STALLED_CYCLES_FRONT,
				ctx, cpu, count);
	else if (perf_evsel__match(counter, HARDWARE, HW_STALLED_CYCLES_BACKEND))
		update_runtime_stat(st, STAT_STALLED_CYCLES_BACK,
				ctx, cpu, count);
	else if (perf_evsel__match(counter, HARDWARE, HW_BRANCH_INSTRUCTIONS))
		update_runtime_stat(st, STAT_BRANCHES, ctx, cpu, count);
	else if (perf_evsel__match(counter, HARDWARE, HW_CACHE_REFERENCES))
		update_runtime_stat(st, STAT_CACHEREFS, ctx, cpu, count);
	else if (perf_evsel__match(counter, HW_CACHE, HW_CACHE_L1D))
		update_runtime_stat(st, STAT_L1_DCACHE, ctx, cpu, count);
	else if (perf_evsel__match(counter, HW_CACHE, HW_CACHE_L1I))
		update_runtime_stat(st, STAT_L1_ICACHE, ctx, cpu, count);
	else if (perf_evsel__match(counter, HW_CACHE, HW_CACHE_LL))
		update_runtime_stat(st, STAT_LL_CACHE, ctx, cpu, count);
	else if (perf_evsel__match(counter, HW_CACHE, HW_CACHE_DTLB))
		update_runtime_stat(st, STAT_DTLB_CACHE, ctx, cpu, count);
	else if (perf_evsel__match(counter, HW_CACHE, HW_CACHE_ITLB))
		update_runtime_stat(st, STAT_ITLB_CACHE, ctx, cpu, count);
	else if (perf_stat_evsel__is(counter, SMI_NUM))
		update_runtime_stat(st, STAT_SMI_NUM, ctx, cpu, count);
	else if (perf_stat_evsel__is(counter, APERF))
		update_runtime_stat(st, STAT_APERF, ctx, cpu, count);

	if (counter->collect_stat) {
		struct saved_value *v = saved_value_lookup(counter, cpu, true,
				STAT_NONE, 0, st);
		update_stats(&v->stats, count);
	}
}
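
/*
 * Worked example (illustrative numbers) of how the values saved above are
 * consumed: assuming a cycles counter and an instructions counter were both
 * fed through perf_stat__update_shadow_stats(),
 *
 *	cycles saved under STAT_CYCLES:  2,000,000
 *	instructions sample (avg):       3,000,000
 *	insn per cycle = 3,000,000 / 2,000,000 = 1.50
 *
 * perf_stat__print_shadow_stats() below performs exactly this division via
 * runtime_stat_avg(st, STAT_CYCLES, ctx, cpu).
 */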

/* used for get_ratio_color() */
enum grc_type {
	GRC_STALLED_CYCLES_FE,
	GRC_STALLED_CYCLES_BE,
	GRC_CACHE_MISSES,
	GRC_MAX_NR
};

static const char *get_ratio_color(enum grc_type type, double ratio)
{
	static const double grc_table[GRC_MAX_NR][3] = {
		[GRC_STALLED_CYCLES_FE] = { 50.0, 30.0, 10.0 },
		[GRC_STALLED_CYCLES_BE] = { 75.0, 50.0, 20.0 },
		[GRC_CACHE_MISSES]	= { 20.0, 10.0, 5.0 },
	};
	const char *color = PERF_COLOR_NORMAL;

	if (ratio > grc_table[type][0])
		color = PERF_COLOR_RED;
	else if (ratio > grc_table[type][1])
		color = PERF_COLOR_MAGENTA;
	else if (ratio > grc_table[type][2])
		color = PERF_COLOR_YELLOW;

	return color;
}
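
/*
 * Example reading of the table above: a frontend stall ratio of 35% is above
 * the 30.0 threshold but below 50.0, so
 * get_ratio_color(GRC_STALLED_CYCLES_FE, 35.0) returns PERF_COLOR_MAGENTA.
 */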

static struct perf_evsel *perf_stat__find_event(struct perf_evlist *evsel_list,
		const char *name)
{
	struct perf_evsel *c2;

	evlist__for_each_entry (evsel_list, c2) {
		if (!strcasecmp(c2->name, name))
			return c2;
	}
	return NULL;
}

/*
 * Mark the events that MetricExpr expressions reference, and link the
 * counters that use them to those events.
 */
void perf_stat__collect_metric_expr(struct perf_evlist *evsel_list)
{
	struct perf_evsel *counter, *leader, **metric_events, *oc;
	bool found;
	const char **metric_names;
	int i;
	int num_metric_names;

	evlist__for_each_entry(evsel_list, counter) {
		bool invalid = false;

		leader = counter->leader;
		if (!counter->metric_expr)
			continue;
		metric_events = counter->metric_events;
		if (!metric_events) {
			if (expr__find_other(counter->metric_expr, counter->name,
					&metric_names, &num_metric_names) < 0)
				continue;

			metric_events = calloc(sizeof(struct perf_evsel *),
					num_metric_names + 1);
			if (!metric_events)
				return;
			counter->metric_events = metric_events;
		}

		for (i = 0; i < num_metric_names; i++) {
			found = false;
			if (leader) {
				/* Search in group */
				for_each_group_member (oc, leader) {
					if (!strcasecmp(oc->name, metric_names[i])) {
						found = true;
						break;
					}
				}
			}
			if (!found) {
				/* Search ignoring groups */
				oc = perf_stat__find_event(evsel_list, metric_names[i]);
			}
			if (!oc) {
				/* Deduping one is good enough to handle duplicated PMUs. */
				static char *printed;

				/*
				 * Adding events automatically would be difficult, because
				 * it would risk creating groups that are not schedulable.
				 * perf stat doesn't understand all the scheduling constraints
				 * of events. So we ask the user instead to add the missing
				 * events.
				 */
				if (!printed || strcasecmp(printed, metric_names[i])) {
					fprintf(stderr,
						"Add %s event to groups to get metric expression for %s\n",
						metric_names[i],
						counter->name);
					printed = strdup(metric_names[i]);
				}
				invalid = true;
				continue;
			}
			metric_events[i] = oc;
			oc->collect_stat = true;
		}
		metric_events[i] = NULL;
		free(metric_names);
		if (invalid) {
			free(metric_events);
			counter->metric_events = NULL;
			counter->metric_expr = NULL;
		}
	}
}
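
/*
 * Illustrative example (event names hypothetical): for a counter
 * "inst_retired.any" whose MetricExpr is
 * "inst_retired.any / cpu_clk_unhalted.thread", the pass above extracts the
 * other name "cpu_clk_unhalted.thread", looks for such an event first in the
 * counter's group and then anywhere in the evlist, stores it in
 * counter->metric_events[] and sets its collect_stat flag, so that
 * perf_stat__update_shadow_stats() saves its values for generic_metric()
 * to use when evaluating the expression.
 */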

static double runtime_stat_avg(struct runtime_stat *st,
		enum stat_type type, int ctx, int cpu)
{
	struct saved_value *v;

	v = saved_value_lookup(NULL, cpu, false, type, ctx, st);
	if (!v)
		return 0.0;

	return avg_stats(&v->stats);
}

static double runtime_stat_n(struct runtime_stat *st,
		enum stat_type type, int ctx, int cpu)
{
	struct saved_value *v;

	v = saved_value_lookup(NULL, cpu, false, type, ctx, st);
	if (!v)
		return 0.0;

	return v->stats.n;
}

static void print_stalled_cycles_frontend(struct perf_stat_config *config,
		int cpu,
		struct perf_evsel *evsel, double avg,
		struct perf_stat_output_ctx *out,
		struct runtime_stat *st)
{
	double total, ratio = 0.0;
	const char *color;
	int ctx = evsel_context(evsel);

	total = runtime_stat_avg(st, STAT_CYCLES, ctx, cpu);

	if (total)
		ratio = avg / total * 100.0;

	color = get_ratio_color(GRC_STALLED_CYCLES_FE, ratio);

	if (ratio)
		out->print_metric(config, out->ctx, color, "%7.2f%%", "frontend cycles idle",
				ratio);
	else
		out->print_metric(config, out->ctx, NULL, NULL, "frontend cycles idle", 0);
}

static void print_stalled_cycles_backend(struct perf_stat_config *config,
		int cpu,
		struct perf_evsel *evsel, double avg,
		struct perf_stat_output_ctx *out,
		struct runtime_stat *st)
{
	double total, ratio = 0.0;
	const char *color;
	int ctx = evsel_context(evsel);

	total = runtime_stat_avg(st, STAT_CYCLES, ctx, cpu);

	if (total)
		ratio = avg / total * 100.0;

	color = get_ratio_color(GRC_STALLED_CYCLES_BE, ratio);

	out->print_metric(config, out->ctx, color, "%7.2f%%", "backend cycles idle", ratio);
}

static void print_branch_misses(struct perf_stat_config *config,
		int cpu,
		struct perf_evsel *evsel,
		double avg,
		struct perf_stat_output_ctx *out,
		struct runtime_stat *st)
{
	double total, ratio = 0.0;
	const char *color;
	int ctx = evsel_context(evsel);

	total = runtime_stat_avg(st, STAT_BRANCHES, ctx, cpu);

	if (total)
		ratio = avg / total * 100.0;

	color = get_ratio_color(GRC_CACHE_MISSES, ratio);

	out->print_metric(config, out->ctx, color, "%7.2f%%", "of all branches", ratio);
}

static void print_l1_dcache_misses(struct perf_stat_config *config,
		int cpu,
		struct perf_evsel *evsel,
		double avg,
		struct perf_stat_output_ctx *out,
		struct runtime_stat *st)
{
	double total, ratio = 0.0;
	const char *color;
	int ctx = evsel_context(evsel);

	total = runtime_stat_avg(st, STAT_L1_DCACHE, ctx, cpu);

	if (total)
		ratio = avg / total * 100.0;

	color = get_ratio_color(GRC_CACHE_MISSES, ratio);

	out->print_metric(config, out->ctx, color, "%7.2f%%", "of all L1-dcache hits", ratio);
}

static void print_l1_icache_misses(struct perf_stat_config *config,
		int cpu,
		struct perf_evsel *evsel,
		double avg,
		struct perf_stat_output_ctx *out,
		struct runtime_stat *st)
{
	double total, ratio = 0.0;
	const char *color;
	int ctx = evsel_context(evsel);

	total = runtime_stat_avg(st, STAT_L1_ICACHE, ctx, cpu);

	if (total)
		ratio = avg / total * 100.0;

	color = get_ratio_color(GRC_CACHE_MISSES, ratio);

	out->print_metric(config, out->ctx, color, "%7.2f%%", "of all L1-icache hits", ratio);
}

static void print_dtlb_cache_misses(struct perf_stat_config *config,
		int cpu,
		struct perf_evsel *evsel,
		double avg,
		struct perf_stat_output_ctx *out,
		struct runtime_stat *st)
{
	double total, ratio = 0.0;
	const char *color;
	int ctx = evsel_context(evsel);

	total = runtime_stat_avg(st, STAT_DTLB_CACHE, ctx, cpu);

	if (total)
		ratio = avg / total * 100.0;

	color = get_ratio_color(GRC_CACHE_MISSES, ratio);

	out->print_metric(config, out->ctx, color, "%7.2f%%", "of all dTLB cache hits", ratio);
}

static void print_itlb_cache_misses(struct perf_stat_config *config,
		int cpu,
		struct perf_evsel *evsel,
		double avg,
		struct perf_stat_output_ctx *out,
		struct runtime_stat *st)
{
	double total, ratio = 0.0;
	const char *color;
	int ctx = evsel_context(evsel);

	total = runtime_stat_avg(st, STAT_ITLB_CACHE, ctx, cpu);

	if (total)
		ratio = avg / total * 100.0;

	color = get_ratio_color(GRC_CACHE_MISSES, ratio);

	out->print_metric(config, out->ctx, color, "%7.2f%%", "of all iTLB cache hits", ratio);
}

static void print_ll_cache_misses(struct perf_stat_config *config,
		int cpu,
		struct perf_evsel *evsel,
		double avg,
		struct perf_stat_output_ctx *out,
		struct runtime_stat *st)
{
	double total, ratio = 0.0;
	const char *color;
	int ctx = evsel_context(evsel);

	total = runtime_stat_avg(st, STAT_LL_CACHE, ctx, cpu);

	if (total)
		ratio = avg / total * 100.0;

	color = get_ratio_color(GRC_CACHE_MISSES, ratio);

	out->print_metric(config, out->ctx, color, "%7.2f%%", "of all LL-cache hits", ratio);
}

/*
 * High level "TopDown" CPU core pipeline bottleneck breakdown.
 *
 * Basic concept following
 * Yasin, "A Top Down Method for Performance Analysis and Counter Architecture",
 * ISPASS 2014.
 *
 * The CPU pipeline is divided into 4 areas that can be bottlenecks:
 *
 * Frontend -> Backend -> Retiring
 * BadSpeculation in addition means out of order execution that is thrown away
 * (for example on branch mispredictions).
 * Frontend is instruction decoding.
 * Backend is execution, like computation and accessing data in memory.
 * Retiring is good execution that is not directly bottlenecked.
 *
 * The formulas are computed in slots.
 * A slot is a pipeline entry available each cycle, one per unit of pipeline
 * width (for example, a 4-wide pipeline has 4 slots per cycle).
 *
 * Formulas:
 * BadSpeculation = ((SlotsIssued - SlotsRetired) + RecoveryBubbles) /
 *			TotalSlots
 * Retiring = SlotsRetired / TotalSlots
 * FrontendBound = FetchBubbles / TotalSlots
 * BackendBound = 1.0 - BadSpeculation - Retiring - FrontendBound
 *
 * The kernel provides the mapping to the low level CPU events and any scaling
 * needed for the CPU pipeline width, for example:
 *
 * TotalSlots = Cycles * 4
 *
 * The scaling factor is communicated in the sysfs unit.
 *
 * In some cases the CPU may not be able to measure all the formulas due to
 * missing events. In this case multiple formulas are combined where possible.
 *
 * Full TopDown supports more levels to sub-divide each area: for example
 * BackendBound into computing bound and memory bound. For now we only
 * support Level 1 TopDown.
 */
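
/*
 * Worked example with illustrative numbers for a 4-wide pipeline:
 *
 *	Cycles          = 1000		TotalSlots   = 4 * 1000 = 4000
 *	SlotsIssued     = 3000		SlotsRetired = 2500
 *	RecoveryBubbles = 300		FetchBubbles = 600
 *
 *	BadSpeculation  = ((3000 - 2500) + 300) / 4000 = 0.200
 *	Retiring        = 2500 / 4000                  = 0.625
 *	FrontendBound   = 600 / 4000                   = 0.150
 *	BackendBound    = 1.0 - 0.200 - 0.625 - 0.150  = 0.025
 */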

static double sanitize_val(double x)
{
	if (x < 0 && x >= -0.02)
		return 0.0;
	return x;
}

static double td_total_slots(int ctx, int cpu, struct runtime_stat *st)
{
	return runtime_stat_avg(st, STAT_TOPDOWN_TOTAL_SLOTS, ctx, cpu);
}

static double td_bad_spec(int ctx, int cpu, struct runtime_stat *st)
{
	double bad_spec = 0;
	double total_slots;
	double total;

	total = runtime_stat_avg(st, STAT_TOPDOWN_SLOTS_ISSUED, ctx, cpu) -
		runtime_stat_avg(st, STAT_TOPDOWN_SLOTS_RETIRED, ctx, cpu) +
		runtime_stat_avg(st, STAT_TOPDOWN_RECOVERY_BUBBLES, ctx, cpu);

	total_slots = td_total_slots(ctx, cpu, st);
	if (total_slots)
		bad_spec = total / total_slots;
	return sanitize_val(bad_spec);
}

static double td_retiring(int ctx, int cpu, struct runtime_stat *st)
{
	double retiring = 0;
	double total_slots = td_total_slots(ctx, cpu, st);
	double ret_slots = runtime_stat_avg(st, STAT_TOPDOWN_SLOTS_RETIRED,
			ctx, cpu);

	if (total_slots)
		retiring = ret_slots / total_slots;
	return retiring;
}

static double td_fe_bound(int ctx, int cpu, struct runtime_stat *st)
{
	double fe_bound = 0;
	double total_slots = td_total_slots(ctx, cpu, st);
	double fetch_bub = runtime_stat_avg(st, STAT_TOPDOWN_FETCH_BUBBLES,
			ctx, cpu);

	if (total_slots)
		fe_bound = fetch_bub / total_slots;
	return fe_bound;
}

static double td_be_bound(int ctx, int cpu, struct runtime_stat *st)
{
	double sum = (td_fe_bound(ctx, cpu, st) +
		      td_bad_spec(ctx, cpu, st) +
		      td_retiring(ctx, cpu, st));
	if (sum == 0)
		return 0;
	return sanitize_val(1.0 - sum);
}

static void print_smi_cost(struct perf_stat_config *config,
		int cpu, struct perf_evsel *evsel,
		struct perf_stat_output_ctx *out,
		struct runtime_stat *st)
{
	double smi_num, aperf, cycles, cost = 0.0;
	int ctx = evsel_context(evsel);
	const char *color = NULL;

	smi_num = runtime_stat_avg(st, STAT_SMI_NUM, ctx, cpu);
	aperf = runtime_stat_avg(st, STAT_APERF, ctx, cpu);
	cycles = runtime_stat_avg(st, STAT_CYCLES, ctx, cpu);

	if ((cycles == 0) || (aperf == 0))
		return;

	if (smi_num)
		cost = (aperf - cycles) / aperf * 100.00;

	if (cost > 10)
		color = PERF_COLOR_RED;
	out->print_metric(config, out->ctx, color, "%8.1f%%", "SMI cycles%", cost);
	out->print_metric(config, out->ctx, NULL, "%4.0f", "SMI#", smi_num);
}
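
/*
 * Worked example (illustrative numbers): if at least one SMI occurred and
 * aperf = 1,000,000 while cycles = 850,000, then
 * cost = (1,000,000 - 850,000) / 1,000,000 * 100 = 15.0%, which exceeds the
 * 10% threshold above and is therefore printed in red.
 */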

static void generic_metric(struct perf_stat_config *config,
		const char *metric_expr,
		struct perf_evsel **metric_events,
		char *name,
		const char *metric_name,
		double avg,
		int cpu,
		struct perf_stat_output_ctx *out,
		struct runtime_stat *st)
{
	print_metric_t print_metric = out->print_metric;
	struct parse_ctx pctx;
	double ratio;
	int i;
	void *ctxp = out->ctx;

	expr__ctx_init(&pctx);
	expr__add_id(&pctx, name, avg);
	for (i = 0; metric_events[i]; i++) {
		struct saved_value *v;
		struct stats *stats;
		double scale;

		if (!strcmp(metric_events[i]->name, "duration_time")) {
			stats = &walltime_nsecs_stats;
			scale = 1e-9;
		} else {
			v = saved_value_lookup(metric_events[i], cpu, false,
					STAT_NONE, 0, st);
			if (!v)
				break;
			stats = &v->stats;
			scale = 1.0;
		}
		expr__add_id(&pctx, metric_events[i]->name, avg_stats(stats)*scale);
	}
	if (!metric_events[i]) {
		const char *p = metric_expr;

		if (expr__parse(&ratio, &pctx, &p) == 0)
			print_metric(config, ctxp, NULL, "%8.1f",
				metric_name ?
				metric_name :
				out->force_header ? name : "",
				ratio);
		else
			print_metric(config, ctxp, NULL, NULL,
				out->force_header ?
				(metric_name ? metric_name : name) : "", 0);
	} else
		print_metric(config, ctxp, NULL, NULL, "", 0);
}
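
/*
 * Minimal sketch of an evaluation (metric and event names hypothetical): for
 * a counter "l1d.replacement" with
 *
 *	metric_expr   = "l1d.replacement / inst_retired.any"
 *	metric_events = { <evsel for inst_retired.any>, NULL }
 *
 * the parse context receives "l1d.replacement" = avg of this counter and
 * "inst_retired.any" = avg_stats() of its saved shadow value, and
 * expr__parse() reduces the expression to the single ratio that is printed.
 */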

void perf_stat__print_shadow_stats(struct perf_stat_config *config,
		struct perf_evsel *evsel,
		double avg, int cpu,
		struct perf_stat_output_ctx *out,
		struct rblist *metric_events,
		struct runtime_stat *st)
{
	void *ctxp = out->ctx;
	print_metric_t print_metric = out->print_metric;
	double total, ratio = 0.0, total2;
	const char *color = NULL;
	int ctx = evsel_context(evsel);
	struct metric_event *me;
	int num = 1;

	if (perf_evsel__match(evsel, HARDWARE, HW_INSTRUCTIONS)) {
		total = runtime_stat_avg(st, STAT_CYCLES, ctx, cpu);

		if (total) {
			ratio = avg / total;
			print_metric(config, ctxp, NULL, "%7.2f ",
					"insn per cycle", ratio);
		} else {
			print_metric(config, ctxp, NULL, NULL, "insn per cycle", 0);
		}

		total = runtime_stat_avg(st, STAT_STALLED_CYCLES_FRONT,
				ctx, cpu);

		total = max(total, runtime_stat_avg(st,
				STAT_STALLED_CYCLES_BACK,
				ctx, cpu));

		if (total && avg) {
			out->new_line(config, ctxp);
			ratio = total / avg;
			print_metric(config, ctxp, NULL, "%7.2f ",
					"stalled cycles per insn",
					ratio);
		} else if (have_frontend_stalled) {
			print_metric(config, ctxp, NULL, NULL,
					"stalled cycles per insn", 0);
		}
	} else if (perf_evsel__match(evsel, HARDWARE, HW_BRANCH_MISSES)) {
		if (runtime_stat_n(st, STAT_BRANCHES, ctx, cpu) != 0)
			print_branch_misses(config, cpu, evsel, avg, out, st);
		else
			print_metric(config, ctxp, NULL, NULL, "of all branches", 0);
	} else if (
		evsel->attr.type == PERF_TYPE_HW_CACHE &&
		evsel->attr.config == (PERF_COUNT_HW_CACHE_L1D |
					((PERF_COUNT_HW_CACHE_OP_READ) << 8) |
					((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16))) {
		if (runtime_stat_n(st, STAT_L1_DCACHE, ctx, cpu) != 0)
			print_l1_dcache_misses(config, cpu, evsel, avg, out, st);
		else
			print_metric(config, ctxp, NULL, NULL, "of all L1-dcache hits", 0);
	} else if (
		evsel->attr.type == PERF_TYPE_HW_CACHE &&
		evsel->attr.config == (PERF_COUNT_HW_CACHE_L1I |
					((PERF_COUNT_HW_CACHE_OP_READ) << 8) |
					((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16))) {
		if (runtime_stat_n(st, STAT_L1_ICACHE, ctx, cpu) != 0)
			print_l1_icache_misses(config, cpu, evsel, avg, out, st);
		else
			print_metric(config, ctxp, NULL, NULL, "of all L1-icache hits", 0);
	} else if (
		evsel->attr.type == PERF_TYPE_HW_CACHE &&
		evsel->attr.config == (PERF_COUNT_HW_CACHE_DTLB |
					((PERF_COUNT_HW_CACHE_OP_READ) << 8) |
					((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16))) {
		if (runtime_stat_n(st, STAT_DTLB_CACHE, ctx, cpu) != 0)
			print_dtlb_cache_misses(config, cpu, evsel, avg, out, st);
		else
			print_metric(config, ctxp, NULL, NULL, "of all dTLB cache hits", 0);
	} else if (
		evsel->attr.type == PERF_TYPE_HW_CACHE &&
		evsel->attr.config == (PERF_COUNT_HW_CACHE_ITLB |
					((PERF_COUNT_HW_CACHE_OP_READ) << 8) |
					((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16))) {
		if (runtime_stat_n(st, STAT_ITLB_CACHE, ctx, cpu) != 0)
			print_itlb_cache_misses(config, cpu, evsel, avg, out, st);
		else
			print_metric(config, ctxp, NULL, NULL, "of all iTLB cache hits", 0);
	} else if (
		evsel->attr.type == PERF_TYPE_HW_CACHE &&
		evsel->attr.config == (PERF_COUNT_HW_CACHE_LL |
					((PERF_COUNT_HW_CACHE_OP_READ) << 8) |
					((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16))) {
		if (runtime_stat_n(st, STAT_LL_CACHE, ctx, cpu) != 0)
			print_ll_cache_misses(config, cpu, evsel, avg, out, st);
		else
			print_metric(config, ctxp, NULL, NULL, "of all LL-cache hits", 0);
	} else if (perf_evsel__match(evsel, HARDWARE, HW_CACHE_MISSES)) {
		total = runtime_stat_avg(st, STAT_CACHEREFS, ctx, cpu);

		if (total)
			ratio = avg * 100 / total;

		if (runtime_stat_n(st, STAT_CACHEREFS, ctx, cpu) != 0)
			print_metric(config, ctxp, NULL, "%8.3f %%",
					"of all cache refs", ratio);
		else
			print_metric(config, ctxp, NULL, NULL, "of all cache refs", 0);
	} else if (perf_evsel__match(evsel, HARDWARE, HW_STALLED_CYCLES_FRONTEND)) {
		print_stalled_cycles_frontend(config, cpu, evsel, avg, out, st);
	} else if (perf_evsel__match(evsel, HARDWARE, HW_STALLED_CYCLES_BACKEND)) {
		print_stalled_cycles_backend(config, cpu, evsel, avg, out, st);
	} else if (perf_evsel__match(evsel, HARDWARE, HW_CPU_CYCLES)) {
		total = runtime_stat_avg(st, STAT_NSECS, 0, cpu);

		if (total) {
			ratio = avg / total;
			print_metric(config, ctxp, NULL, "%8.3f", "GHz", ratio);
		} else {
			print_metric(config, ctxp, NULL, NULL, "GHz", 0);
		}
	} else if (perf_stat_evsel__is(evsel, CYCLES_IN_TX)) {
		total = runtime_stat_avg(st, STAT_CYCLES, ctx, cpu);

		if (total)
			print_metric(config, ctxp, NULL,
					"%7.2f%%", "transactional cycles",
					100.0 * (avg / total));
		else
			print_metric(config, ctxp, NULL, NULL, "transactional cycles",
					0);
	} else if (perf_stat_evsel__is(evsel, CYCLES_IN_TX_CP)) {
		total = runtime_stat_avg(st, STAT_CYCLES, ctx, cpu);
		total2 = runtime_stat_avg(st, STAT_CYCLES_IN_TX, ctx, cpu);

		if (total2 < avg)
			total2 = avg;
		if (total)
			print_metric(config, ctxp, NULL, "%7.2f%%", "aborted cycles",
					100.0 * ((total2-avg) / total));
		else
			print_metric(config, ctxp, NULL, NULL, "aborted cycles", 0);
	} else if (perf_stat_evsel__is(evsel, TRANSACTION_START)) {
		total = runtime_stat_avg(st, STAT_CYCLES_IN_TX,
				ctx, cpu);

		if (avg)
			ratio = total / avg;

		if (runtime_stat_n(st, STAT_CYCLES_IN_TX, ctx, cpu) != 0)
			print_metric(config, ctxp, NULL, "%8.0f",
					"cycles / transaction", ratio);
		else
			print_metric(config, ctxp, NULL, NULL, "cycles / transaction",
					0);
	} else if (perf_stat_evsel__is(evsel, ELISION_START)) {
		total = runtime_stat_avg(st, STAT_CYCLES_IN_TX,
				ctx, cpu);

		if (avg)
			ratio = total / avg;

		print_metric(config, ctxp, NULL, "%8.0f", "cycles / elision", ratio);
	} else if (perf_evsel__is_clock(evsel)) {
		if ((ratio = avg_stats(&walltime_nsecs_stats)) != 0)
			print_metric(config, ctxp, NULL, "%8.3f", "CPUs utilized",
					avg / (ratio * evsel->scale));
		else
			print_metric(config, ctxp, NULL, NULL, "CPUs utilized", 0);
	} else if (perf_stat_evsel__is(evsel, TOPDOWN_FETCH_BUBBLES)) {
		double fe_bound = td_fe_bound(ctx, cpu, st);

		if (fe_bound > 0.2)
			color = PERF_COLOR_RED;
		print_metric(config, ctxp, color, "%8.1f%%", "frontend bound",
				fe_bound * 100.);
	} else if (perf_stat_evsel__is(evsel, TOPDOWN_SLOTS_RETIRED)) {
		double retiring = td_retiring(ctx, cpu, st);

		if (retiring > 0.7)
			color = PERF_COLOR_GREEN;
		print_metric(config, ctxp, color, "%8.1f%%", "retiring",
				retiring * 100.);
	} else if (perf_stat_evsel__is(evsel, TOPDOWN_RECOVERY_BUBBLES)) {
		double bad_spec = td_bad_spec(ctx, cpu, st);

		if (bad_spec > 0.1)
			color = PERF_COLOR_RED;
		print_metric(config, ctxp, color, "%8.1f%%", "bad speculation",
				bad_spec * 100.);
	} else if (perf_stat_evsel__is(evsel, TOPDOWN_SLOTS_ISSUED)) {
		double be_bound = td_be_bound(ctx, cpu, st);
		const char *name = "backend bound";
		static int have_recovery_bubbles = -1;

		/* In case the CPU does not support topdown-recovery-bubbles */
		if (have_recovery_bubbles < 0)
			have_recovery_bubbles = pmu_have_event("cpu",
					"topdown-recovery-bubbles");
		if (!have_recovery_bubbles)
			name = "backend bound/bad spec";

		if (be_bound > 0.2)
			color = PERF_COLOR_RED;
		if (td_total_slots(ctx, cpu, st) > 0)
			print_metric(config, ctxp, color, "%8.1f%%", name,
					be_bound * 100.);
		else
			print_metric(config, ctxp, NULL, NULL, name, 0);
	} else if (evsel->metric_expr) {
		generic_metric(config, evsel->metric_expr, evsel->metric_events, evsel->name,
				evsel->metric_name, avg, cpu, out, st);
	} else if (runtime_stat_n(st, STAT_NSECS, 0, cpu) != 0) {
		char unit = 'M';
		char unit_buf[10];

		total = runtime_stat_avg(st, STAT_NSECS, 0, cpu);

		if (total)
			ratio = 1000.0 * avg / total;
		if (ratio < 0.001) {
			ratio *= 1000;
			unit = 'K';
		}
		snprintf(unit_buf, sizeof(unit_buf), "%c/sec", unit);
		print_metric(config, ctxp, NULL, "%8.3f", unit_buf, ratio);
	} else if (perf_stat_evsel__is(evsel, SMI_NUM)) {
		print_smi_cost(config, cpu, evsel, out, st);
	} else {
		num = 0;
	}

	if ((me = metricgroup__lookup(metric_events, evsel, false)) != NULL) {
		struct metric_expr *mexp;

		list_for_each_entry (mexp, &me->head, nd) {
			if (num++ > 0)
				out->new_line(config, ctxp);
			generic_metric(config, mexp->metric_expr, mexp->metric_events,
					evsel->name, mexp->metric_name,
					avg, cpu, out, st);
		}
	}

	if (num == 0)
		print_metric(config, ctxp, NULL, NULL, NULL, 0);
}