hv-24x7.c

  1. /*
  2. * Hypervisor supplied "24x7" performance counter support
  3. *
  4. * Author: Cody P Schafer <cody@linux.vnet.ibm.com>
  5. * Copyright 2014 IBM Corporation.
  6. *
  7. * This program is free software; you can redistribute it and/or
  8. * modify it under the terms of the GNU General Public License
  9. * as published by the Free Software Foundation; either version
  10. * 2 of the License, or (at your option) any later version.
  11. */
  12. #define pr_fmt(fmt) "hv-24x7: " fmt
  13. #include <linux/perf_event.h>
  14. #include <linux/rbtree.h>
  15. #include <linux/module.h>
  16. #include <linux/slab.h>
  17. #include <linux/vmalloc.h>
  18. #include <asm/cputhreads.h>
  19. #include <asm/firmware.h>
  20. #include <asm/hvcall.h>
  21. #include <asm/io.h>
  22. #include <linux/byteorder/generic.h>
  23. #include "hv-24x7.h"
  24. #include "hv-24x7-catalog.h"
  25. #include "hv-common.h"
  26. /* Version of the 24x7 hypervisor API that we should use in this machine. */
  27. static int interface_version;
  28. /* Whether we have to aggregate result data for some domains. */
  29. static bool aggregate_result_elements;
  30. static bool domain_is_valid(unsigned domain)
  31. {
  32. switch (domain) {
  33. #define DOMAIN(n, v, x, c) \
  34. case HV_PERF_DOMAIN_##n: \
  35. /* fall through */
  36. #include "hv-24x7-domains.h"
  37. #undef DOMAIN
  38. return true;
  39. default:
  40. return false;
  41. }
  42. }
  43. static bool is_physical_domain(unsigned domain)
  44. {
  45. switch (domain) {
  46. #define DOMAIN(n, v, x, c) \
  47. case HV_PERF_DOMAIN_##n: \
  48. return c;
  49. #include "hv-24x7-domains.h"
  50. #undef DOMAIN
  51. default:
  52. return false;
  53. }
  54. }
  55. /* Domains for which more than one result element is returned for each event. */
  56. static bool domain_needs_aggregation(unsigned int domain)
  57. {
  58. return aggregate_result_elements &&
  59. (domain == HV_PERF_DOMAIN_PHYS_CORE ||
  60. (domain >= HV_PERF_DOMAIN_VCPU_HOME_CORE &&
  61. domain <= HV_PERF_DOMAIN_VCPU_REMOTE_NODE));
  62. }
  63. static const char *domain_name(unsigned domain)
  64. {
  65. if (!domain_is_valid(domain))
  66. return NULL;
  67. switch (domain) {
  68. case HV_PERF_DOMAIN_PHYS_CHIP: return "Physical Chip";
  69. case HV_PERF_DOMAIN_PHYS_CORE: return "Physical Core";
  70. case HV_PERF_DOMAIN_VCPU_HOME_CORE: return "VCPU Home Core";
  71. case HV_PERF_DOMAIN_VCPU_HOME_CHIP: return "VCPU Home Chip";
  72. case HV_PERF_DOMAIN_VCPU_HOME_NODE: return "VCPU Home Node";
  73. case HV_PERF_DOMAIN_VCPU_REMOTE_NODE: return "VCPU Remote Node";
  74. }
  75. WARN_ON_ONCE(domain);
  76. return NULL;
  77. }
  78. static bool catalog_entry_domain_is_valid(unsigned domain)
  79. {
  80. /* POWER8 doesn't support virtual domains. */
  81. if (interface_version == 1)
  82. return is_physical_domain(domain);
  83. else
  84. return domain_is_valid(domain);
  85. }
  86. /*
  87. * TODO: Merging events:
  88. * - Think of the hcall as an interface to a 4d array of counters:
  89. * - x = domains
  90. * - y = indexes in the domain (core, chip, vcpu, node, etc)
  91. * - z = offset into the counter space
  92. * - w = lpars (guest vms, "logical partitions")
  93. * - A single request is: x,y,y_last,z,z_last,w,w_last
  94. * - this means we can retrieve a rectangle of counters in y,z for a single x.
  95. *
  96. * - Things to consider (ignoring w):
  97. * - input cost_per_request = 16
  98. * - output cost_per_result(ys,zs) = 8 + 8 * ys + ys * zs
  99. * - limited number of requests per hcall (must fit into 4K bytes)
  100. * - 4k - 16 [buffer header] = 16 [request size] * request_count
  101. * - so at most 255 requests per hcall
  102. * - sometimes it will be more efficient to read extra data and discard
  103. */
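/*
 * Worked example for the sizing notes above (numbers illustrative):
 * reading a 4-core by 2-counter rectangle in a single request costs 16
 * bytes of input and 8 + 8 * 4 + 4 * 2 = 48 bytes of output, so batching
 * neighbouring offsets is usually cheaper than one request per counter.
 */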
  104. /*
  105. * Example usage:
  106. * perf stat -e 'hv_24x7/domain=2,offset=8,vcpu=0,lpar=0xffffffff/'
  107. */
  108. /* u3 0-6, one of HV_24X7_PERF_DOMAIN */
  109. EVENT_DEFINE_RANGE_FORMAT(domain, config, 0, 3);
  110. /* u16 */
  111. EVENT_DEFINE_RANGE_FORMAT(core, config, 16, 31);
  112. EVENT_DEFINE_RANGE_FORMAT(chip, config, 16, 31);
  113. EVENT_DEFINE_RANGE_FORMAT(vcpu, config, 16, 31);
  114. /* u32, see "data_offset" */
  115. EVENT_DEFINE_RANGE_FORMAT(offset, config, 32, 63);
  116. /* u16 */
  117. EVENT_DEFINE_RANGE_FORMAT(lpar, config1, 0, 15);
  118. EVENT_DEFINE_RANGE(reserved1, config, 4, 15);
  119. EVENT_DEFINE_RANGE(reserved2, config1, 16, 63);
  120. EVENT_DEFINE_RANGE(reserved3, config2, 0, 63);
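/*
 * Illustrative encoding using the bit ranges above: the event string
 * "domain=2,offset=0x8,core=0,lpar=0x0" ends up as
 *   attr.config  = (0x8ULL << 32) | (0 << 16) | 2
 *   attr.config1 = 0
 * with all reserved ranges left at zero.
 */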
  121. static struct attribute *format_attrs[] = {
  122. &format_attr_domain.attr,
  123. &format_attr_offset.attr,
  124. &format_attr_core.attr,
  125. &format_attr_chip.attr,
  126. &format_attr_vcpu.attr,
  127. &format_attr_lpar.attr,
  128. NULL,
  129. };
  130. static struct attribute_group format_group = {
  131. .name = "format",
  132. .attrs = format_attrs,
  133. };
  134. static struct attribute_group event_group = {
  135. .name = "events",
  136. /* .attrs is set in init */
  137. };
  138. static struct attribute_group event_desc_group = {
  139. .name = "event_descs",
  140. /* .attrs is set in init */
  141. };
  142. static struct attribute_group event_long_desc_group = {
  143. .name = "event_long_descs",
  144. /* .attrs is set in init */
  145. };
  146. static struct kmem_cache *hv_page_cache;
  147. DEFINE_PER_CPU(int, hv_24x7_txn_flags);
  148. DEFINE_PER_CPU(int, hv_24x7_txn_err);
  149. struct hv_24x7_hw {
  150. struct perf_event *events[255];
  151. };
  152. DEFINE_PER_CPU(struct hv_24x7_hw, hv_24x7_hw);
  153. /*
  154. * request_buffer and result_buffer are not required to be 4k aligned,
  155. * but are not allowed to cross any 4k boundary. Aligning them to 4k is
  156. * the simplest way to ensure that.
  157. */
  158. #define H24x7_DATA_BUFFER_SIZE 4096
  159. DEFINE_PER_CPU(char, hv_24x7_reqb[H24x7_DATA_BUFFER_SIZE]) __aligned(4096);
  160. DEFINE_PER_CPU(char, hv_24x7_resb[H24x7_DATA_BUFFER_SIZE]) __aligned(4096);
  161. static unsigned int max_num_requests(int interface_version)
  162. {
  163. return (H24x7_DATA_BUFFER_SIZE - sizeof(struct hv_24x7_request_buffer))
  164. / H24x7_REQUEST_SIZE(interface_version);
  165. }
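/*
 * With the v1 request layout this works out to 255 requests per hcall
 * (4096-byte buffer, 16-byte header, 16 bytes per request, matching the
 * sizing notes near the top of the file); v2 requests carry additional
 * thread-group fields, so fewer of them fit in one buffer.
 */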
  166. static char *event_name(struct hv_24x7_event_data *ev, int *len)
  167. {
  168. *len = be16_to_cpu(ev->event_name_len) - 2;
  169. return (char *)ev->remainder;
  170. }
  171. static char *event_desc(struct hv_24x7_event_data *ev, int *len)
  172. {
  173. unsigned nl = be16_to_cpu(ev->event_name_len);
  174. __be16 *desc_len = (__be16 *)(ev->remainder + nl - 2);
  175. *len = be16_to_cpu(*desc_len) - 2;
  176. return (char *)ev->remainder + nl;
  177. }
  178. static char *event_long_desc(struct hv_24x7_event_data *ev, int *len)
  179. {
  180. unsigned nl = be16_to_cpu(ev->event_name_len);
  181. __be16 *desc_len_ = (__be16 *)(ev->remainder + nl - 2);
  182. unsigned desc_len = be16_to_cpu(*desc_len_);
  183. __be16 *long_desc_len = (__be16 *)(ev->remainder + nl + desc_len - 2);
  184. *len = be16_to_cpu(*long_desc_len) - 2;
  185. return (char *)ev->remainder + nl + desc_len;
  186. }
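/*
 * The three helpers above walk the variable-length tail of an event
 * entry. Roughly, with each length field counting itself plus its
 * string:
 *
 *   remainder: [name bytes]            event_name_len - 2 bytes
 *              [desc_len]              __be16
 *              [desc bytes]            desc_len - 2 bytes
 *              [long_desc_len]         __be16
 *              [long desc bytes]       long_desc_len - 2 bytes
 */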
  187. static bool event_fixed_portion_is_within(struct hv_24x7_event_data *ev,
  188. void *end)
  189. {
  190. void *start = ev;
  191. return (start + offsetof(struct hv_24x7_event_data, remainder)) < end;
  192. }
  193. /*
  194. * Things we don't check:
  195. * - padding for desc, name, and long/detailed desc is required to be '\0'
  196. * bytes.
  197. *
  198. * Return NULL if we pass end;
  199. * otherwise return the address of the byte just following the event.
  200. */
  201. static void *event_end(struct hv_24x7_event_data *ev, void *end)
  202. {
  203. void *start = ev;
  204. __be16 *dl_, *ldl_;
  205. unsigned dl, ldl;
  206. unsigned nl = be16_to_cpu(ev->event_name_len);
  207. if (nl < 2) {
  208. pr_debug("%s: name length too short: %d", __func__, nl);
  209. return NULL;
  210. }
  211. if (start + nl > end) {
  212. pr_debug("%s: start=%p + nl=%u > end=%p",
  213. __func__, start, nl, end);
  214. return NULL;
  215. }
  216. dl_ = (__be16 *)(ev->remainder + nl - 2);
  217. if (!IS_ALIGNED((uintptr_t)dl_, 2))
  218. pr_warn("desc len not aligned %p", dl_);
  219. dl = be16_to_cpu(*dl_);
  220. if (dl < 2) {
  221. pr_debug("%s: desc len too short: %d", __func__, dl);
  222. return NULL;
  223. }
  224. if (start + nl + dl > end) {
  225. pr_debug("%s: (start=%p + nl=%u + dl=%u)=%p > end=%p",
  226. __func__, start, nl, dl, start + nl + dl, end);
  227. return NULL;
  228. }
  229. ldl_ = (__be16 *)(ev->remainder + nl + dl - 2);
  230. if (!IS_ALIGNED((uintptr_t)ldl_, 2))
  231. pr_warn("long desc len not aligned %p", ldl_);
  232. ldl = be16_to_cpu(*ldl_);
  233. if (ldl < 2) {
  234. pr_debug("%s: long desc len too short (ldl=%u)",
  235. __func__, ldl);
  236. return NULL;
  237. }
  238. if (start + nl + dl + ldl > end) {
  239. pr_debug("%s: start=%p + nl=%u + dl=%u + ldl=%u > end=%p",
  240. __func__, start, nl, dl, ldl, end);
  241. return NULL;
  242. }
  243. return start + nl + dl + ldl;
  244. }
  245. static long h_get_24x7_catalog_page_(unsigned long phys_4096,
  246. unsigned long version, unsigned long index)
  247. {
  248. pr_devel("h_get_24x7_catalog_page(0x%lx, %lu, %lu)",
  249. phys_4096, version, index);
  250. WARN_ON(!IS_ALIGNED(phys_4096, 4096));
  251. return plpar_hcall_norets(H_GET_24X7_CATALOG_PAGE,
  252. phys_4096, version, index);
  253. }
  254. static long h_get_24x7_catalog_page(char page[], u64 version, u32 index)
  255. {
  256. return h_get_24x7_catalog_page_(virt_to_phys(page),
  257. version, index);
  258. }
  259. /*
  260. * Each event we find in the catalog will have a sysfs entry. Format the
  261. * data for this sysfs entry based on the event's domain.
  262. *
  263. * Events belonging to the Chip domain can only be monitored in that domain.
  264. * i.e. the domain for these events is a fixed/known value.
  265. *
  266. * Events belonging to the Core domain can be monitored either in the physical
  267. * core or in one of the virtual CPU domains. So the domain value for these
  268. * events must be specified by the user (i.e. it is a required parameter). Format
  269. * the Core events with 'domain=?' so the perf-tool can error check required
  270. * parameters.
  271. *
  272. * NOTE: For the Core domain events, rather than making domain a required
  273. * parameter we could default it to PHYS_CORE and allow users to
  274. * override the domain to one of the VCPU domains.
  275. *
  276. * However, this can make the interface a little inconsistent.
  277. *
  278. * If we set domain=2 (PHYS_CORE) and allow the user to override this field,
  279. * the user may be tempted to also modify the "offset=x" field, which
  280. * can lead to confusing usage. Consider the HPM_PCYC (offset=0x18) and
  281. * HPM_INST (offset=0x20) events. With:
  282. *
  283. * perf stat -e hv_24x7/HPM_PCYC,offset=0x20/
  284. *
  285. * we end up monitoring HPM_INST, while the command line has HPM_PCYC.
  286. *
  287. * By not assigning a default value to the domain for the Core events,
  288. * we can have simple guidelines:
  289. *
  290. * - Specifying values for parameters with "=?" is required.
  291. *
  292. * - Specifying (i.e. overriding) values for other parameters
  293. * is undefined.
  294. */
  295. static char *event_fmt(struct hv_24x7_event_data *event, unsigned domain)
  296. {
  297. const char *sindex;
  298. const char *lpar;
  299. const char *domain_str;
  300. char buf[8];
  301. switch (domain) {
  302. case HV_PERF_DOMAIN_PHYS_CHIP:
  303. snprintf(buf, sizeof(buf), "%d", domain);
  304. domain_str = buf;
  305. lpar = "0x0";
  306. sindex = "chip";
  307. break;
  308. case HV_PERF_DOMAIN_PHYS_CORE:
  309. domain_str = "?";
  310. lpar = "0x0";
  311. sindex = "core";
  312. break;
  313. default:
  314. domain_str = "?";
  315. lpar = "?";
  316. sindex = "vcpu";
  317. }
  318. return kasprintf(GFP_KERNEL,
  319. "domain=%s,offset=0x%x,%s=?,lpar=%s",
  320. domain_str,
  321. be16_to_cpu(event->event_counter_offs) +
  322. be16_to_cpu(event->event_group_record_offs),
  323. sindex,
  324. lpar);
  325. }
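/*
 * Illustrative strings produced above (offsets made up): a chip event
 * becomes "domain=1,offset=0x18,chip=?,lpar=0x0", a core event
 * "domain=?,offset=0x20,core=?,lpar=0x0", and anything else
 * "domain=?,offset=0x28,vcpu=?,lpar=?". perf then requires the user to
 * supply every "?" field on the command line.
 */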
  326. /* Avoid trusting fw to NUL terminate strings */
  327. static char *memdup_to_str(char *maybe_str, int max_len, gfp_t gfp)
  328. {
  329. return kasprintf(gfp, "%.*s", max_len, maybe_str);
  330. }
  331. static ssize_t device_show_string(struct device *dev,
  332. struct device_attribute *attr, char *buf)
  333. {
  334. struct dev_ext_attribute *d;
  335. d = container_of(attr, struct dev_ext_attribute, attr);
  336. return sprintf(buf, "%s\n", (char *)d->var);
  337. }
  338. static struct attribute *device_str_attr_create_(char *name, char *str)
  339. {
  340. struct dev_ext_attribute *attr = kzalloc(sizeof(*attr), GFP_KERNEL);
  341. if (!attr)
  342. return NULL;
  343. sysfs_attr_init(&attr->attr.attr);
  344. attr->var = str;
  345. attr->attr.attr.name = name;
  346. attr->attr.attr.mode = 0444;
  347. attr->attr.show = device_show_string;
  348. return &attr->attr.attr;
  349. }
  350. /*
  351. * Allocate and initialize strings representing event attributes.
  352. *
  353. * NOTE: The strings allocated here are never destroyed and continue to
  354. * exist until shutdown. This is to allow us to create as many events
  355. * from the catalog as possible, even if we encounter errors with some.
  356. * In case of changes to error paths in future, these may need to be
  357. * freed by the caller.
  358. */
  359. static struct attribute *device_str_attr_create(char *name, int name_max,
  360. int name_nonce,
  361. char *str, size_t str_max)
  362. {
  363. char *n;
  364. char *s = memdup_to_str(str, str_max, GFP_KERNEL);
  365. struct attribute *a;
  366. if (!s)
  367. return NULL;
  368. if (!name_nonce)
  369. n = kasprintf(GFP_KERNEL, "%.*s", name_max, name);
  370. else
  371. n = kasprintf(GFP_KERNEL, "%.*s__%d", name_max, name,
  372. name_nonce);
  373. if (!n)
  374. goto out_s;
  375. a = device_str_attr_create_(n, s);
  376. if (!a)
  377. goto out_n;
  378. return a;
  379. out_n:
  380. kfree(n);
  381. out_s:
  382. kfree(s);
  383. return NULL;
  384. }
  385. static struct attribute *event_to_attr(unsigned ix,
  386. struct hv_24x7_event_data *event,
  387. unsigned domain,
  388. int nonce)
  389. {
  390. int event_name_len;
  391. char *ev_name, *a_ev_name, *val;
  392. struct attribute *attr;
  393. if (!domain_is_valid(domain)) {
  394. pr_warn("catalog event %u has invalid domain %u\n",
  395. ix, domain);
  396. return NULL;
  397. }
  398. val = event_fmt(event, domain);
  399. if (!val)
  400. return NULL;
  401. ev_name = event_name(event, &event_name_len);
  402. if (!nonce)
  403. a_ev_name = kasprintf(GFP_KERNEL, "%.*s",
  404. (int)event_name_len, ev_name);
  405. else
  406. a_ev_name = kasprintf(GFP_KERNEL, "%.*s__%d",
  407. (int)event_name_len, ev_name, nonce);
  408. if (!a_ev_name)
  409. goto out_val;
  410. attr = device_str_attr_create_(a_ev_name, val);
  411. if (!attr)
  412. goto out_name;
  413. return attr;
  414. out_name:
  415. kfree(a_ev_name);
  416. out_val:
  417. kfree(val);
  418. return NULL;
  419. }
  420. static struct attribute *event_to_desc_attr(struct hv_24x7_event_data *event,
  421. int nonce)
  422. {
  423. int nl, dl;
  424. char *name = event_name(event, &nl);
  425. char *desc = event_desc(event, &dl);
  426. /* If there isn't a description, don't create the sysfs file */
  427. if (!dl)
  428. return NULL;
  429. return device_str_attr_create(name, nl, nonce, desc, dl);
  430. }
  431. static struct attribute *
  432. event_to_long_desc_attr(struct hv_24x7_event_data *event, int nonce)
  433. {
  434. int nl, dl;
  435. char *name = event_name(event, &nl);
  436. char *desc = event_long_desc(event, &dl);
  437. /* If there isn't a description, don't create the sysfs file */
  438. if (!dl)
  439. return NULL;
  440. return device_str_attr_create(name, nl, nonce, desc, dl);
  441. }
  442. static int event_data_to_attrs(unsigned ix, struct attribute **attrs,
  443. struct hv_24x7_event_data *event, int nonce)
  444. {
  445. *attrs = event_to_attr(ix, event, event->domain, nonce);
  446. if (!*attrs)
  447. return -1;
  448. return 0;
  449. }
  450. /* */
  451. struct event_uniq {
  452. struct rb_node node;
  453. const char *name;
  454. int nl;
  455. unsigned ct;
  456. unsigned domain;
  457. };
  458. static int memord(const void *d1, size_t s1, const void *d2, size_t s2)
  459. {
  460. if (s1 < s2)
  461. return 1;
  462. if (s1 > s2)
  463. return -1;
  464. return memcmp(d1, d2, s1);
  465. }
  466. static int ev_uniq_ord(const void *v1, size_t s1, unsigned d1, const void *v2,
  467. size_t s2, unsigned d2)
  468. {
  469. int r = memord(v1, s1, v2, s2);
  470. if (r)
  471. return r;
  472. if (d1 > d2)
  473. return 1;
  474. if (d2 > d1)
  475. return -1;
  476. return 0;
  477. }
  478. static int event_uniq_add(struct rb_root *root, const char *name, int nl,
  479. unsigned domain)
  480. {
  481. struct rb_node **new = &(root->rb_node), *parent = NULL;
  482. struct event_uniq *data;
  483. /* Figure out where to put new node */
  484. while (*new) {
  485. struct event_uniq *it;
  486. int result;
  487. it = container_of(*new, struct event_uniq, node);
  488. result = ev_uniq_ord(name, nl, domain, it->name, it->nl,
  489. it->domain);
  490. parent = *new;
  491. if (result < 0)
  492. new = &((*new)->rb_left);
  493. else if (result > 0)
  494. new = &((*new)->rb_right);
  495. else {
  496. it->ct++;
  497. pr_info("found a duplicate event %.*s, ct=%u\n", nl,
  498. name, it->ct);
  499. return it->ct;
  500. }
  501. }
  502. data = kmalloc(sizeof(*data), GFP_KERNEL);
  503. if (!data)
  504. return -ENOMEM;
  505. *data = (struct event_uniq) {
  506. .name = name,
  507. .nl = nl,
  508. .ct = 0,
  509. .domain = domain,
  510. };
  511. /* Add new node and rebalance tree. */
  512. rb_link_node(&data->node, parent, new);
  513. rb_insert_color(&data->node, root);
  514. /* data->ct */
  515. return 0;
  516. }
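/*
 * The count returned above is used as the sysfs name "nonce": the first
 * event with a given (name, domain) pair keeps its plain name, while
 * later duplicates get a "__<count>" suffix, so a second HPM_PCYC would
 * show up as HPM_PCYC__1.
 */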
  517. static void event_uniq_destroy(struct rb_root *root)
  518. {
  519. /*
  520. * the strings we point to are in the giant block of memory filled by
  521. * the catalog, and are freed separately.
  522. */
  523. struct event_uniq *pos, *n;
  524. rbtree_postorder_for_each_entry_safe(pos, n, root, node)
  525. kfree(pos);
  526. }
  527. /*
  528. * ensure the event structure's sizes are self-consistent and don't cause us to
  529. * read outside of the event
  530. *
  531. * On success, return the event length in bytes.
  532. * Otherwise, return -1 (and print as appropriate).
  533. */
  534. static ssize_t catalog_event_len_validate(struct hv_24x7_event_data *event,
  535. size_t event_idx,
  536. size_t event_data_bytes,
  537. size_t event_entry_count,
  538. size_t offset, void *end)
  539. {
  540. ssize_t ev_len;
  541. void *ev_end, *calc_ev_end;
  542. if (offset >= event_data_bytes)
  543. return -1;
  544. if (event_idx >= event_entry_count) {
  545. pr_devel("catalog event data has %zu bytes of padding after last event\n",
  546. event_data_bytes - offset);
  547. return -1;
  548. }
  549. if (!event_fixed_portion_is_within(event, end)) {
  550. pr_warn("event %zu fixed portion is not within range\n",
  551. event_idx);
  552. return -1;
  553. }
  554. ev_len = be16_to_cpu(event->length);
  555. if (ev_len % 16)
  556. pr_info("event %zu has length %zu not divisible by 16: event=%pK\n",
  557. event_idx, ev_len, event);
  558. ev_end = (__u8 *)event + ev_len;
  559. if (ev_end > end) {
  560. pr_warn("event %zu has .length=%zu, ends after buffer end: ev_end=%pK > end=%pK, offset=%zu\n",
  561. event_idx, ev_len, ev_end, end,
  562. offset);
  563. return -1;
  564. }
  565. calc_ev_end = event_end(event, end);
  566. if (!calc_ev_end) {
  567. pr_warn("event %zu has a calculated length which exceeds buffer length %zu: event=%pK end=%pK, offset=%zu\n",
  568. event_idx, event_data_bytes, event, end,
  569. offset);
  570. return -1;
  571. }
  572. if (calc_ev_end > ev_end) {
  573. pr_warn("event %zu exceeds its own length: event=%pK, end=%pK, offset=%zu, calc_ev_end=%pK\n",
  574. event_idx, event, ev_end, offset, calc_ev_end);
  575. return -1;
  576. }
  577. return ev_len;
  578. }
  579. #define MAX_4K (SIZE_MAX / 4096)
  580. static int create_events_from_catalog(struct attribute ***events_,
  581. struct attribute ***event_descs_,
  582. struct attribute ***event_long_descs_)
  583. {
  584. long hret;
  585. size_t catalog_len, catalog_page_len, event_entry_count,
  586. event_data_len, event_data_offs,
  587. event_data_bytes, junk_events, event_idx, event_attr_ct, i,
  588. attr_max, event_idx_last, desc_ct, long_desc_ct;
  589. ssize_t ct, ev_len;
  590. uint64_t catalog_version_num;
  591. struct attribute **events, **event_descs, **event_long_descs;
  592. struct hv_24x7_catalog_page_0 *page_0 =
  593. kmem_cache_alloc(hv_page_cache, GFP_KERNEL);
  594. void *page = page_0;
  595. void *event_data, *end;
  596. struct hv_24x7_event_data *event;
  597. struct rb_root ev_uniq = RB_ROOT;
  598. int ret = 0;
  599. if (!page) {
  600. ret = -ENOMEM;
  601. goto e_out;
  602. }
  603. hret = h_get_24x7_catalog_page(page, 0, 0);
  604. if (hret) {
  605. ret = -EIO;
  606. goto e_free;
  607. }
  608. catalog_version_num = be64_to_cpu(page_0->version);
  609. catalog_page_len = be32_to_cpu(page_0->length);
  610. if (MAX_4K < catalog_page_len) {
  611. pr_err("invalid page count: %zu\n", catalog_page_len);
  612. ret = -EIO;
  613. goto e_free;
  614. }
  615. catalog_len = catalog_page_len * 4096;
  616. event_entry_count = be16_to_cpu(page_0->event_entry_count);
  617. event_data_offs = be16_to_cpu(page_0->event_data_offs);
  618. event_data_len = be16_to_cpu(page_0->event_data_len);
  619. pr_devel("cv %llu cl %zu eec %zu edo %zu edl %zu\n",
  620. catalog_version_num, catalog_len,
  621. event_entry_count, event_data_offs, event_data_len);
  622. if ((MAX_4K < event_data_len)
  623. || (MAX_4K < event_data_offs)
  624. || (MAX_4K - event_data_offs < event_data_len)) {
  625. pr_err("invalid event data offs %zu and/or len %zu\n",
  626. event_data_offs, event_data_len);
  627. ret = -EIO;
  628. goto e_free;
  629. }
  630. if ((event_data_offs + event_data_len) > catalog_page_len) {
  631. pr_err("event data %zu-%zu does not fit inside catalog 0-%zu\n",
  632. event_data_offs,
  633. event_data_offs + event_data_len,
  634. catalog_page_len);
  635. ret = -EIO;
  636. goto e_free;
  637. }
  638. if (SIZE_MAX - 1 < event_entry_count) {
  639. pr_err("event_entry_count %zu is invalid\n", event_entry_count);
  640. ret = -EIO;
  641. goto e_free;
  642. }
  643. event_data_bytes = event_data_len * 4096;
  644. /*
  645. * Event data can span several pages, and events can cross between these
  646. * pages. Use vmalloc to make this easier.
  647. */
  648. event_data = vmalloc(event_data_bytes);
  649. if (!event_data) {
  650. pr_err("could not allocate event data\n");
  651. ret = -ENOMEM;
  652. goto e_free;
  653. }
  654. end = event_data + event_data_bytes;
  655. /*
  656. * using vmalloc_to_phys() like this only works if PAGE_SIZE is
  657. * divisible by 4096
  658. */
  659. BUILD_BUG_ON(PAGE_SIZE % 4096);
  660. for (i = 0; i < event_data_len; i++) {
  661. hret = h_get_24x7_catalog_page_(
  662. vmalloc_to_phys(event_data + i * 4096),
  663. catalog_version_num,
  664. i + event_data_offs);
  665. if (hret) {
  666. pr_err("Failed to get event data in page %zu: rc=%ld\n",
  667. i + event_data_offs, hret);
  668. ret = -EIO;
  669. goto e_event_data;
  670. }
  671. }
  672. /*
  673. * scan the catalog to determine the number of attributes we need, and
  674. * verify it at the same time.
  675. */
  676. for (junk_events = 0, event = event_data, event_idx = 0, attr_max = 0;
  677. ;
  678. event_idx++, event = (void *)event + ev_len) {
  679. size_t offset = (void *)event - (void *)event_data;
  680. char *name;
  681. int nl;
  682. ev_len = catalog_event_len_validate(event, event_idx,
  683. event_data_bytes,
  684. event_entry_count,
  685. offset, end);
  686. if (ev_len < 0)
  687. break;
  688. name = event_name(event, &nl);
  689. if (event->event_group_record_len == 0) {
  690. pr_devel("invalid event %zu (%.*s): group_record_len == 0, skipping\n",
  691. event_idx, nl, name);
  692. junk_events++;
  693. continue;
  694. }
  695. if (!catalog_entry_domain_is_valid(event->domain)) {
  696. pr_info("event %zu (%.*s) has invalid domain %d\n",
  697. event_idx, nl, name, event->domain);
  698. junk_events++;
  699. continue;
  700. }
  701. attr_max++;
  702. }
  703. event_idx_last = event_idx;
  704. if (event_idx_last != event_entry_count)
  705. pr_warn("event buffer ended before listed # of events were parsed (got %zu, wanted %zu, junk %zu)\n",
  706. event_idx_last, event_entry_count, junk_events);
  707. events = kmalloc_array(attr_max + 1, sizeof(*events), GFP_KERNEL);
  708. if (!events) {
  709. ret = -ENOMEM;
  710. goto e_event_data;
  711. }
  712. event_descs = kmalloc_array(event_idx + 1, sizeof(*event_descs),
  713. GFP_KERNEL);
  714. if (!event_descs) {
  715. ret = -ENOMEM;
  716. goto e_event_attrs;
  717. }
  718. event_long_descs = kmalloc_array(event_idx + 1,
  719. sizeof(*event_long_descs), GFP_KERNEL);
  720. if (!event_long_descs) {
  721. ret = -ENOMEM;
  722. goto e_event_descs;
  723. }
  724. /* Iterate over the catalog filling in the attribute vector */
  725. for (junk_events = 0, event_attr_ct = 0, desc_ct = 0, long_desc_ct = 0,
  726. event = event_data, event_idx = 0;
  727. event_idx < event_idx_last;
  728. event_idx++, ev_len = be16_to_cpu(event->length),
  729. event = (void *)event + ev_len) {
  730. char *name;
  731. int nl;
  732. int nonce;
  733. /*
  734. * These are the only "bad" events that are intermixed and that
  735. * we can ignore without issue. Make sure to skip them here.
  736. */
  737. if (event->event_group_record_len == 0)
  738. continue;
  739. if (!catalog_entry_domain_is_valid(event->domain))
  740. continue;
  741. name = event_name(event, &nl);
  742. nonce = event_uniq_add(&ev_uniq, name, nl, event->domain);
  743. ct = event_data_to_attrs(event_idx, events + event_attr_ct,
  744. event, nonce);
  745. if (ct < 0) {
  746. pr_warn("event %zu (%.*s) creation failure, skipping\n",
  747. event_idx, nl, name);
  748. junk_events++;
  749. } else {
  750. event_attr_ct++;
  751. event_descs[desc_ct] = event_to_desc_attr(event, nonce);
  752. if (event_descs[desc_ct])
  753. desc_ct++;
  754. event_long_descs[long_desc_ct] =
  755. event_to_long_desc_attr(event, nonce);
  756. if (event_long_descs[long_desc_ct])
  757. long_desc_ct++;
  758. }
  759. }
  760. pr_info("read %zu catalog entries, created %zu event attrs (%zu failures), %zu descs\n",
  761. event_idx, event_attr_ct, junk_events, desc_ct);
  762. events[event_attr_ct] = NULL;
  763. event_descs[desc_ct] = NULL;
  764. event_long_descs[long_desc_ct] = NULL;
  765. event_uniq_destroy(&ev_uniq);
  766. vfree(event_data);
  767. kmem_cache_free(hv_page_cache, page);
  768. *events_ = events;
  769. *event_descs_ = event_descs;
  770. *event_long_descs_ = event_long_descs;
  771. return 0;
  772. e_event_descs:
  773. kfree(event_descs);
  774. e_event_attrs:
  775. kfree(events);
  776. e_event_data:
  777. vfree(event_data);
  778. e_free:
  779. kmem_cache_free(hv_page_cache, page);
  780. e_out:
  781. *events_ = NULL;
  782. *event_descs_ = NULL;
  783. *event_long_descs_ = NULL;
  784. return ret;
  785. }
  786. static ssize_t catalog_read(struct file *filp, struct kobject *kobj,
  787. struct bin_attribute *bin_attr, char *buf,
  788. loff_t offset, size_t count)
  789. {
  790. long hret;
  791. ssize_t ret = 0;
  792. size_t catalog_len = 0, catalog_page_len = 0;
  793. loff_t page_offset = 0;
  794. loff_t offset_in_page;
  795. size_t copy_len;
  796. uint64_t catalog_version_num = 0;
  797. void *page = kmem_cache_alloc(hv_page_cache, GFP_USER);
  798. struct hv_24x7_catalog_page_0 *page_0 = page;
  799. if (!page)
  800. return -ENOMEM;
  801. hret = h_get_24x7_catalog_page(page, 0, 0);
  802. if (hret) {
  803. ret = -EIO;
  804. goto e_free;
  805. }
  806. catalog_version_num = be64_to_cpu(page_0->version);
  807. catalog_page_len = be32_to_cpu(page_0->length);
  808. catalog_len = catalog_page_len * 4096;
  809. page_offset = offset / 4096;
  810. offset_in_page = offset % 4096;
  811. if (page_offset >= catalog_page_len)
  812. goto e_free;
  813. if (page_offset != 0) {
  814. hret = h_get_24x7_catalog_page(page, catalog_version_num,
  815. page_offset);
  816. if (hret) {
  817. ret = -EIO;
  818. goto e_free;
  819. }
  820. }
  821. copy_len = 4096 - offset_in_page;
  822. if (copy_len > count)
  823. copy_len = count;
  824. memcpy(buf, page+offset_in_page, copy_len);
  825. ret = copy_len;
  826. e_free:
  827. if (hret)
  828. pr_err("h_get_24x7_catalog_page(ver=%lld, page=%lld) failed:"
  829. " rc=%ld\n",
  830. catalog_version_num, page_offset, hret);
  831. kmem_cache_free(hv_page_cache, page);
  832. pr_devel("catalog_read: offset=%lld(%lld) count=%zu "
  833. "catalog_len=%zu(%zu) => %zd\n", offset, page_offset,
  834. count, catalog_len, catalog_page_len, ret);
  835. return ret;
  836. }
  837. static ssize_t domains_show(struct device *dev, struct device_attribute *attr,
  838. char *page)
  839. {
  840. int d, n, count = 0;
  841. const char *str;
  842. for (d = 0; d < HV_PERF_DOMAIN_MAX; d++) {
  843. str = domain_name(d);
  844. if (!str)
  845. continue;
  846. n = sprintf(page, "%d: %s\n", d, str);
  847. if (n < 0)
  848. break;
  849. count += n;
  850. page += n;
  851. }
  852. return count;
  853. }
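/*
 * Reading this attribute (interface/domains in the PMU's sysfs
 * directory) yields one line per known domain, e.g.:
 *   1: Physical Chip
 *   2: Physical Core
 *   3: VCPU Home Core
 *   ...
 */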
  854. #define PAGE_0_ATTR(_name, _fmt, _expr) \
  855. static ssize_t _name##_show(struct device *dev, \
  856. struct device_attribute *dev_attr, \
  857. char *buf) \
  858. { \
  859. long hret; \
  860. ssize_t ret = 0; \
  861. void *page = kmem_cache_alloc(hv_page_cache, GFP_USER); \
  862. struct hv_24x7_catalog_page_0 *page_0 = page; \
  863. if (!page) \
  864. return -ENOMEM; \
  865. hret = h_get_24x7_catalog_page(page, 0, 0); \
  866. if (hret) { \
  867. ret = -EIO; \
  868. goto e_free; \
  869. } \
  870. ret = sprintf(buf, _fmt, _expr); \
  871. e_free: \
  872. kmem_cache_free(hv_page_cache, page); \
  873. return ret; \
  874. } \
  875. static DEVICE_ATTR_RO(_name)
  876. PAGE_0_ATTR(catalog_version, "%lld\n",
  877. (unsigned long long)be64_to_cpu(page_0->version));
  878. PAGE_0_ATTR(catalog_len, "%lld\n",
  879. (unsigned long long)be32_to_cpu(page_0->length) * 4096);
  880. static BIN_ATTR_RO(catalog, 0/* real length varies */);
  881. static DEVICE_ATTR_RO(domains);
  882. static struct bin_attribute *if_bin_attrs[] = {
  883. &bin_attr_catalog,
  884. NULL,
  885. };
  886. static struct attribute *if_attrs[] = {
  887. &dev_attr_catalog_len.attr,
  888. &dev_attr_catalog_version.attr,
  889. &dev_attr_domains.attr,
  890. NULL,
  891. };
  892. static struct attribute_group if_group = {
  893. .name = "interface",
  894. .bin_attrs = if_bin_attrs,
  895. .attrs = if_attrs,
  896. };
  897. static const struct attribute_group *attr_groups[] = {
  898. &format_group,
  899. &event_group,
  900. &event_desc_group,
  901. &event_long_desc_group,
  902. &if_group,
  903. NULL,
  904. };
  905. /*
  906. * Start the process for a new H_GET_24x7_DATA hcall.
  907. */
  908. static void init_24x7_request(struct hv_24x7_request_buffer *request_buffer,
  909. struct hv_24x7_data_result_buffer *result_buffer)
  910. {
  911. memset(request_buffer, 0, H24x7_DATA_BUFFER_SIZE);
  912. memset(result_buffer, 0, H24x7_DATA_BUFFER_SIZE);
  913. request_buffer->interface_version = interface_version;
  914. /* memset above set request_buffer->num_requests to 0 */
  915. }
  916. /*
  917. * Commit (i.e. perform) the H_GET_24x7_DATA hcall using the data collected
  918. * by 'init_24x7_request()' and 'add_event_to_24x7_request()'.
  919. */
  920. static int make_24x7_request(struct hv_24x7_request_buffer *request_buffer,
  921. struct hv_24x7_data_result_buffer *result_buffer)
  922. {
  923. long ret;
  924. /*
  925. * NOTE: Due to variable number of array elements in request and
  926. * result buffer(s), sizeof() is not reliable. Use the actual
  927. * allocated buffer size, H24x7_DATA_BUFFER_SIZE.
  928. */
  929. ret = plpar_hcall_norets(H_GET_24X7_DATA,
  930. virt_to_phys(request_buffer), H24x7_DATA_BUFFER_SIZE,
  931. virt_to_phys(result_buffer), H24x7_DATA_BUFFER_SIZE);
  932. if (ret) {
  933. struct hv_24x7_request *req;
  934. req = request_buffer->requests;
  935. pr_notice_ratelimited("hcall failed: [%d %#x %#x %d] => ret 0x%lx (%ld) detail=0x%x failing ix=%x\n",
  936. req->performance_domain, req->data_offset,
  937. req->starting_ix, req->starting_lpar_ix,
  938. ret, ret, result_buffer->detailed_rc,
  939. result_buffer->failing_request_ix);
  940. return -EIO;
  941. }
  942. return 0;
  943. }
  944. /*
  945. * Add the given @event to the next slot in the 24x7 request_buffer.
  946. *
  947. * Note that the H_GET_24X7_DATA hcall allows reading several counters'
  948. * values in a single HCALL. We expect the caller to add events to the
  949. * request buffer one by one, make the HCALL and process the results.
  950. */
  951. static int add_event_to_24x7_request(struct perf_event *event,
  952. struct hv_24x7_request_buffer *request_buffer)
  953. {
  954. u16 idx;
  955. int i;
  956. size_t req_size;
  957. struct hv_24x7_request *req;
  958. if (request_buffer->num_requests >=
  959. max_num_requests(request_buffer->interface_version)) {
  960. pr_devel("Too many requests for 24x7 HCALL %d\n",
  961. request_buffer->num_requests);
  962. return -EINVAL;
  963. }
  964. switch (event_get_domain(event)) {
  965. case HV_PERF_DOMAIN_PHYS_CHIP:
  966. idx = event_get_chip(event);
  967. break;
  968. case HV_PERF_DOMAIN_PHYS_CORE:
  969. idx = event_get_core(event);
  970. break;
  971. default:
  972. idx = event_get_vcpu(event);
  973. }
  974. req_size = H24x7_REQUEST_SIZE(request_buffer->interface_version);
  975. i = request_buffer->num_requests++;
  976. req = (void *) request_buffer->requests + i * req_size;
  977. req->performance_domain = event_get_domain(event);
  978. req->data_size = cpu_to_be16(8);
  979. req->data_offset = cpu_to_be32(event_get_offset(event));
  980. req->starting_lpar_ix = cpu_to_be16(event_get_lpar(event));
  981. req->max_num_lpars = cpu_to_be16(1);
  982. req->starting_ix = cpu_to_be16(idx);
  983. req->max_ix = cpu_to_be16(1);
  984. if (request_buffer->interface_version > 1) {
  985. if (domain_needs_aggregation(req->performance_domain))
  986. req->max_num_thread_groups = -1;
  987. else if (req->performance_domain != HV_PERF_DOMAIN_PHYS_CHIP) {
  988. req->starting_thread_group_ix = idx % 2;
  989. req->max_num_thread_groups = 1;
  990. }
  991. }
  992. return 0;
  993. }
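/*
 * For instance (values illustrative), a PHYS_CORE event with core=3,
 * offset=0x20 and lpar=0x0 becomes a request with performance_domain=2,
 * data_offset=0x20, starting_ix=3, starting_lpar_ix=0 and both
 * max_num_lpars and max_ix set to 1, i.e. a single 8-byte counter read.
 */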
  994. /**
  995. * get_count_from_result - get event count from all result elements in result
  996. *
  997. * If the event corresponding to this result needs aggregation of the result
  998. * element values, then this function does that.
  999. *
  1000. * @event: Event associated with @res.
  1001. * @resb: Result buffer containing @res.
  1002. * @res: Result to work on.
  1003. * @countp: Output variable containing the event count.
  1004. * @next: Optional output variable pointing to the next result in @resb.
  1005. */
  1006. static int get_count_from_result(struct perf_event *event,
  1007. struct hv_24x7_data_result_buffer *resb,
  1008. struct hv_24x7_result *res, u64 *countp,
  1009. struct hv_24x7_result **next)
  1010. {
  1011. u16 num_elements = be16_to_cpu(res->num_elements_returned);
  1012. u16 data_size = be16_to_cpu(res->result_element_data_size);
  1013. unsigned int data_offset;
  1014. void *element_data;
  1015. int i;
  1016. u64 count;
  1017. /*
  1018. * We can bail out early if the result is empty.
  1019. */
  1020. if (!num_elements) {
  1021. pr_debug("Result of request %hhu is empty, nothing to do\n",
  1022. res->result_ix);
  1023. if (next)
  1024. *next = (struct hv_24x7_result *) res->elements;
  1025. return -ENODATA;
  1026. }
  1027. /*
  1028. * Since we always specify 1 as the maximum for the smallest resource
  1029. * we're requesting, there should be only one element per result.
  1030. * Except when an event needs aggregation, in which case there are more.
  1031. */
  1032. if (num_elements != 1 &&
  1033. !domain_needs_aggregation(event_get_domain(event))) {
  1034. pr_err("Error: result of request %hhu has %hu elements\n",
  1035. res->result_ix, num_elements);
  1036. return -EIO;
  1037. }
  1038. if (data_size != sizeof(u64)) {
  1039. pr_debug("Error: result of request %hhu has data of %hu bytes\n",
  1040. res->result_ix, data_size);
  1041. return -ENOTSUPP;
  1042. }
  1043. if (resb->interface_version == 1)
  1044. data_offset = offsetof(struct hv_24x7_result_element_v1,
  1045. element_data);
  1046. else
  1047. data_offset = offsetof(struct hv_24x7_result_element_v2,
  1048. element_data);
  1049. /* Go through the result elements in the result. */
  1050. for (i = count = 0, element_data = res->elements + data_offset;
  1051. i < num_elements;
  1052. i++, element_data += data_size + data_offset)
  1053. count += be64_to_cpu(*((u64 *) element_data));
  1054. *countp = count;
  1055. /* The next result is after the last result element. */
  1056. if (next)
  1057. *next = element_data - data_offset;
  1058. return 0;
  1059. }
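/*
 * Each result in the buffer is therefore a small header followed by
 * num_elements_returned elements of (element header + 8-byte counter).
 * Summing the 8-byte values gives the event count, and the address just
 * past the last element (returned via *next) is where the following
 * result starts when several requests were batched into one hcall.
 */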
  1060. static int single_24x7_request(struct perf_event *event, u64 *count)
  1061. {
  1062. int ret;
  1063. struct hv_24x7_request_buffer *request_buffer;
  1064. struct hv_24x7_data_result_buffer *result_buffer;
  1065. BUILD_BUG_ON(sizeof(*request_buffer) > 4096);
  1066. BUILD_BUG_ON(sizeof(*result_buffer) > 4096);
  1067. request_buffer = (void *)get_cpu_var(hv_24x7_reqb);
  1068. result_buffer = (void *)get_cpu_var(hv_24x7_resb);
  1069. init_24x7_request(request_buffer, result_buffer);
  1070. ret = add_event_to_24x7_request(event, request_buffer);
  1071. if (ret)
  1072. goto out;
  1073. ret = make_24x7_request(request_buffer, result_buffer);
  1074. if (ret)
  1075. goto out;
  1076. /* process result from hcall */
  1077. ret = get_count_from_result(event, result_buffer,
  1078. result_buffer->results, count, NULL);
  1079. out:
  1080. put_cpu_var(hv_24x7_reqb);
  1081. put_cpu_var(hv_24x7_resb);
  1082. return ret;
  1083. }
  1084. static int h_24x7_event_init(struct perf_event *event)
  1085. {
  1086. struct hv_perf_caps caps;
  1087. unsigned domain;
  1088. unsigned long hret;
  1089. u64 ct;
  1090. /* Not our event */
  1091. if (event->attr.type != event->pmu->type)
  1092. return -ENOENT;
  1093. /* Unused areas must be 0 */
  1094. if (event_get_reserved1(event) ||
  1095. event_get_reserved2(event) ||
  1096. event_get_reserved3(event)) {
  1097. pr_devel("reserved set when forbidden 0x%llx(0x%llx) 0x%llx(0x%llx) 0x%llx(0x%llx)\n",
  1098. event->attr.config,
  1099. event_get_reserved1(event),
  1100. event->attr.config1,
  1101. event_get_reserved2(event),
  1102. event->attr.config2,
  1103. event_get_reserved3(event));
  1104. return -EINVAL;
  1105. }
  1106. /* unsupported modes and filters */
  1107. if (event->attr.exclude_user ||
  1108. event->attr.exclude_kernel ||
  1109. event->attr.exclude_hv ||
  1110. event->attr.exclude_idle ||
  1111. event->attr.exclude_host ||
  1112. event->attr.exclude_guest)
  1113. return -EINVAL;
  1114. /* no branch sampling */
  1115. if (has_branch_stack(event))
  1116. return -EOPNOTSUPP;
  1117. /* offset must be 8 byte aligned */
  1118. if (event_get_offset(event) % 8) {
  1119. pr_devel("bad alignment\n");
  1120. return -EINVAL;
  1121. }
  1122. domain = event_get_domain(event);
  1123. if (domain >= HV_PERF_DOMAIN_MAX) {
  1124. pr_devel("invalid domain %d\n", domain);
  1125. return -EINVAL;
  1126. }
  1127. hret = hv_perf_caps_get(&caps);
  1128. if (hret) {
  1129. pr_devel("could not get capabilities: rc=%ld\n", hret);
  1130. return -EIO;
  1131. }
  1132. /* Physical domains & other lpars require extra capabilities */
  1133. if (!caps.collect_privileged && (is_physical_domain(domain) ||
  1134. (event_get_lpar(event) != event_get_lpar_max()))) {
  1135. pr_devel("hv permissions disallow: is_physical_domain:%d, lpar=0x%llx\n",
  1136. is_physical_domain(domain),
  1137. event_get_lpar(event));
  1138. return -EACCES;
  1139. }
  1140. /* Get the initial value of the counter for this event */
  1141. if (single_24x7_request(event, &ct)) {
  1142. pr_devel("test hcall failed\n");
  1143. return -EIO;
  1144. }
  1145. (void)local64_xchg(&event->hw.prev_count, ct);
  1146. return 0;
  1147. }
  1148. static u64 h_24x7_get_value(struct perf_event *event)
  1149. {
  1150. u64 ct;
  1151. if (single_24x7_request(event, &ct))
  1152. /* We checked this in event init, shouldn't fail here... */
  1153. return 0;
  1154. return ct;
  1155. }
  1156. static void update_event_count(struct perf_event *event, u64 now)
  1157. {
  1158. s64 prev;
  1159. prev = local64_xchg(&event->hw.prev_count, now);
  1160. local64_add(now - prev, &event->count);
  1161. }
  1162. static void h_24x7_event_read(struct perf_event *event)
  1163. {
  1164. u64 now;
  1165. struct hv_24x7_request_buffer *request_buffer;
  1166. struct hv_24x7_hw *h24x7hw;
  1167. int txn_flags;
  1168. txn_flags = __this_cpu_read(hv_24x7_txn_flags);
  1169. /*
  1170. * If in a READ transaction, add this counter to the list of
  1171. * counters to read during the next HCALL (i.e. commit_txn()).
  1172. * If not in a READ transaction, go ahead and make the HCALL
  1173. * to read this counter by itself.
  1174. */
  1175. if (txn_flags & PERF_PMU_TXN_READ) {
  1176. int i;
  1177. int ret;
  1178. if (__this_cpu_read(hv_24x7_txn_err))
  1179. return;
  1180. request_buffer = (void *)get_cpu_var(hv_24x7_reqb);
  1181. ret = add_event_to_24x7_request(event, request_buffer);
  1182. if (ret) {
  1183. __this_cpu_write(hv_24x7_txn_err, ret);
  1184. } else {
  1185. /*
  1186. * Associate the event with the HCALL request index,
  1187. * so ->commit_txn() can quickly find/update count.
  1188. */
  1189. i = request_buffer->num_requests - 1;
  1190. h24x7hw = &get_cpu_var(hv_24x7_hw);
  1191. h24x7hw->events[i] = event;
  1192. put_cpu_var(h24x7hw);
  1193. /*
  1194. * Clear the event count so we can compute the _change_
  1195. * in the 24x7 raw counter value at the end of the txn.
  1196. *
  1197. * Note that we could alternatively read the 24x7 value
  1198. * now and save its value in event->hw.prev_count. But
  1199. * that would require issuing a hcall, which would then
  1200. * defeat the purpose of using the txn interface.
  1201. */
  1202. local64_set(&event->count, 0);
  1203. }
  1204. put_cpu_var(hv_24x7_reqb);
  1205. } else {
  1206. now = h_24x7_get_value(event);
  1207. update_event_count(event, now);
  1208. }
  1209. }
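/*
 * Putting the pieces together, a perf READ transaction looks like:
 *   start_txn(PERF_PMU_TXN_READ)  -> reset the request buffer
 *   read(event), repeated         -> queue one request per event
 *   commit_txn()                  -> one H_GET_24X7_DATA hcall, then
 *                                    update every queued event's count
 * so reading N counters costs a single hypervisor call instead of N.
 */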
  1210. static void h_24x7_event_start(struct perf_event *event, int flags)
  1211. {
  1212. if (flags & PERF_EF_RELOAD)
  1213. local64_set(&event->hw.prev_count, h_24x7_get_value(event));
  1214. }
  1215. static void h_24x7_event_stop(struct perf_event *event, int flags)
  1216. {
  1217. h_24x7_event_read(event);
  1218. }
  1219. static int h_24x7_event_add(struct perf_event *event, int flags)
  1220. {
  1221. if (flags & PERF_EF_START)
  1222. h_24x7_event_start(event, flags);
  1223. return 0;
  1224. }
  1225. /*
  1226. * 24x7 counters only support READ transactions. They are
  1227. * always counting and don't need/support ADD transactions.
  1228. * Cache the flags, but otherwise ignore transactions that
  1229. * are not PERF_PMU_TXN_READ.
  1230. */
  1231. static void h_24x7_event_start_txn(struct pmu *pmu, unsigned int flags)
  1232. {
  1233. struct hv_24x7_request_buffer *request_buffer;
  1234. struct hv_24x7_data_result_buffer *result_buffer;
  1235. /* We should not be called if we are already in a txn */
  1236. WARN_ON_ONCE(__this_cpu_read(hv_24x7_txn_flags));
  1237. __this_cpu_write(hv_24x7_txn_flags, flags);
  1238. if (flags & ~PERF_PMU_TXN_READ)
  1239. return;
  1240. request_buffer = (void *)get_cpu_var(hv_24x7_reqb);
  1241. result_buffer = (void *)get_cpu_var(hv_24x7_resb);
  1242. init_24x7_request(request_buffer, result_buffer);
  1243. put_cpu_var(hv_24x7_resb);
  1244. put_cpu_var(hv_24x7_reqb);
  1245. }
  1246. /*
  1247. * Clean up transaction state.
  1248. *
  1249. * NOTE: Ignore state of request and result buffers for now.
  1250. * We will initialize them during the next read/txn.
  1251. */
  1252. static void reset_txn(void)
  1253. {
  1254. __this_cpu_write(hv_24x7_txn_flags, 0);
  1255. __this_cpu_write(hv_24x7_txn_err, 0);
  1256. }
  1257. /*
  1258. * 24x7 counters only support READ transactions. They are always counting
  1259. * and don't need/support ADD transactions. Clear ->txn_flags but otherwise
  1260. * ignore transactions that are not of type PERF_PMU_TXN_READ.
  1261. *
  1262. * For READ transactions, submit all pending 24x7 requests (i.e. requests
  1263. * that were queued by h_24x7_event_read()), to the hypervisor and update
  1264. * the event counts.
  1265. */
  1266. static int h_24x7_event_commit_txn(struct pmu *pmu)
  1267. {
  1268. struct hv_24x7_request_buffer *request_buffer;
  1269. struct hv_24x7_data_result_buffer *result_buffer;
  1270. struct hv_24x7_result *res, *next_res;
  1271. u64 count;
  1272. int i, ret, txn_flags;
  1273. struct hv_24x7_hw *h24x7hw;
  1274. txn_flags = __this_cpu_read(hv_24x7_txn_flags);
  1275. WARN_ON_ONCE(!txn_flags);
  1276. ret = 0;
  1277. if (txn_flags & ~PERF_PMU_TXN_READ)
  1278. goto out;
  1279. ret = __this_cpu_read(hv_24x7_txn_err);
  1280. if (ret)
  1281. goto out;
  1282. request_buffer = (void *)get_cpu_var(hv_24x7_reqb);
  1283. result_buffer = (void *)get_cpu_var(hv_24x7_resb);
  1284. ret = make_24x7_request(request_buffer, result_buffer);
  1285. if (ret)
  1286. goto put_reqb;
  1287. h24x7hw = &get_cpu_var(hv_24x7_hw);
  1288. /* Go through results in the result buffer to update event counts. */
  1289. for (i = 0, res = result_buffer->results;
  1290. i < result_buffer->num_results; i++, res = next_res) {
  1291. struct perf_event *event = h24x7hw->events[res->result_ix];
  1292. ret = get_count_from_result(event, result_buffer, res, &count,
  1293. &next_res);
  1294. if (ret)
  1295. break;
  1296. update_event_count(event, count);
  1297. }
  1298. put_cpu_var(hv_24x7_hw);
  1299. put_reqb:
  1300. put_cpu_var(hv_24x7_resb);
  1301. put_cpu_var(hv_24x7_reqb);
  1302. out:
  1303. reset_txn();
  1304. return ret;
  1305. }
  1306. /*
  1307. * 24x7 counters only support READ transactions. They are always counting
  1308. * and don't need/support ADD transactions. However, regardless of the type
  1309. * of transaction, all we need to do is clean up, so we don't have to check
  1310. * the type of transaction.
  1311. */
  1312. static void h_24x7_event_cancel_txn(struct pmu *pmu)
  1313. {
  1314. WARN_ON_ONCE(!__this_cpu_read(hv_24x7_txn_flags));
  1315. reset_txn();
  1316. }
  1317. static struct pmu h_24x7_pmu = {
  1318. .task_ctx_nr = perf_invalid_context,
  1319. .name = "hv_24x7",
  1320. .attr_groups = attr_groups,
  1321. .event_init = h_24x7_event_init,
  1322. .add = h_24x7_event_add,
  1323. .del = h_24x7_event_stop,
  1324. .start = h_24x7_event_start,
  1325. .stop = h_24x7_event_stop,
  1326. .read = h_24x7_event_read,
  1327. .start_txn = h_24x7_event_start_txn,
  1328. .commit_txn = h_24x7_event_commit_txn,
  1329. .cancel_txn = h_24x7_event_cancel_txn,
  1330. };
  1331. static int hv_24x7_init(void)
  1332. {
  1333. int r;
  1334. unsigned long hret;
  1335. struct hv_perf_caps caps;
  1336. if (!firmware_has_feature(FW_FEATURE_LPAR)) {
  1337. pr_debug("not a virtualized system, not enabling\n");
  1338. return -ENODEV;
  1339. } else if (!cur_cpu_spec->oprofile_cpu_type)
  1340. return -ENODEV;
  1341. /* POWER8 only supports v1, while POWER9 only supports v2. */
  1342. if (!strcmp(cur_cpu_spec->oprofile_cpu_type, "ppc64/power8"))
  1343. interface_version = 1;
  1344. else {
  1345. interface_version = 2;
  1346. /* SMT8 in POWER9 needs to aggregate result elements. */
  1347. if (threads_per_core == 8)
  1348. aggregate_result_elements = true;
  1349. }
  1350. hret = hv_perf_caps_get(&caps);
  1351. if (hret) {
  1352. pr_debug("could not obtain capabilities, not enabling, rc=%ld\n",
  1353. hret);
  1354. return -ENODEV;
  1355. }
  1356. hv_page_cache = kmem_cache_create("hv-page-4096", 4096, 4096, 0, NULL);
  1357. if (!hv_page_cache)
  1358. return -ENOMEM;
  1359. /* sampling not supported */
  1360. h_24x7_pmu.capabilities |= PERF_PMU_CAP_NO_INTERRUPT;
  1361. r = create_events_from_catalog(&event_group.attrs,
  1362. &event_desc_group.attrs,
  1363. &event_long_desc_group.attrs);
  1364. if (r)
  1365. return r;
  1366. r = perf_pmu_register(&h_24x7_pmu, h_24x7_pmu.name, -1);
  1367. if (r)
  1368. return r;
  1369. return 0;
  1370. }
  1371. device_initcall(hv_24x7_init);