/*
 * perf_event_intel_rapl.c: support Intel RAPL energy consumption counters
 * Copyright (C) 2013 Google, Inc., Stephane Eranian
 *
 * Intel RAPL interface is specified in the IA-32 Manual Vol3b
 * section 14.7.1 (September 2013)
 *
 * RAPL provides more controls than just reporting energy consumption;
 * however, here we only expose the 3 energy consumption free running
 * counters (pp0, pkg, dram).
 *
 * Each of those counters increments in a power unit defined by the
 * RAPL_POWER_UNIT MSR. On SandyBridge, this unit is 1/(2^16) Joules
 * but it can vary.
 *
 * Counter to rapl events mappings:
 *
 *  pp0 counter: consumption of all physical cores (power plane 0)
 *        event: rapl_energy_cores
 *    perf code: 0x1
 *
 *  pkg counter: consumption of the whole processor package
 *        event: rapl_energy_pkg
 *    perf code: 0x2
 *
 * dram counter: consumption of the dram domain (servers only)
 *        event: rapl_energy_dram
 *    perf code: 0x3
 *
 *  pp1 counter: consumption of the built-in GPU domain (clients only)
 *        event: rapl_energy_gpu
 *    perf code: 0x4
 *
 * We manage those counters as free running (read-only). They may be
 * used simultaneously by other tools, such as turbostat.
 *
 * The events only support system-wide mode counting. There is no
 * sampling support because it does not make sense and is not
 * supported by the RAPL hardware.
 *
 * Because we want to avoid floating-point operations in the kernel,
 * the events are all reported in fixed point arithmetic (32.32).
 * Tools must adjust the counts to convert them to Watts using
 * the duration of the measurement. Tools may use a function such as
 * ldexp(raw_count, -32);
 */
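/*
 * For illustration, a minimal userspace sketch of the conversion
 * described above. The read_power_event() helper and the elapsed-time
 * bookkeeping are hypothetical placeholders for perf_event_open()
 * plumbing; only the 32.32 fixed point math is dictated by this driver:
 *
 *      u64 raw_count = read_power_event();         // hypothetical
 *      double joules = ldexp((double)raw_count, -32);
 *      double watts  = joules / elapsed_seconds;
 */
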
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/perf_event.h>
#include <asm/cpu_device_id.h>
#include "perf_event.h"

/*
 * RAPL energy status counters
 */
#define RAPL_IDX_PP0_NRG_STAT   0       /* all cores */
#define INTEL_RAPL_PP0          0x1     /* pseudo-encoding */
#define RAPL_IDX_PKG_NRG_STAT   1       /* entire package */
#define INTEL_RAPL_PKG          0x2     /* pseudo-encoding */
#define RAPL_IDX_RAM_NRG_STAT   2       /* DRAM */
#define INTEL_RAPL_RAM          0x3     /* pseudo-encoding */
#define RAPL_IDX_PP1_NRG_STAT   3       /* gpu */
#define INTEL_RAPL_PP1          0x4     /* pseudo-encoding */

#define NR_RAPL_DOMAINS         0x4

static const char *const rapl_domain_names[NR_RAPL_DOMAINS] __initconst = {
        "pp0-core",
        "package",
        "dram",
        "pp1-gpu",
};

/* Clients have PP0, PKG, PP1 */
#define RAPL_IDX_CLN    (1<<RAPL_IDX_PP0_NRG_STAT|\
                         1<<RAPL_IDX_PKG_NRG_STAT|\
                         1<<RAPL_IDX_PP1_NRG_STAT)

/* Servers have PP0, PKG, RAM */
#define RAPL_IDX_SRV    (1<<RAPL_IDX_PP0_NRG_STAT|\
                         1<<RAPL_IDX_PKG_NRG_STAT|\
                         1<<RAPL_IDX_RAM_NRG_STAT)

/* Haswell clients have PP0, PKG, RAM, PP1 */
#define RAPL_IDX_HSW    (1<<RAPL_IDX_PP0_NRG_STAT|\
                         1<<RAPL_IDX_PKG_NRG_STAT|\
                         1<<RAPL_IDX_RAM_NRG_STAT|\
                         1<<RAPL_IDX_PP1_NRG_STAT)

/* Knights Landing has PKG, RAM */
#define RAPL_IDX_KNL    (1<<RAPL_IDX_PKG_NRG_STAT|\
                         1<<RAPL_IDX_RAM_NRG_STAT)

/*
 * event code: LSB 8 bits, passed in attr->config
 * any other bit is reserved
 */
#define RAPL_EVENT_MASK 0xFFULL

#define DEFINE_RAPL_FORMAT_ATTR(_var, _name, _format)           \
static ssize_t __rapl_##_var##_show(struct kobject *kobj,       \
                                struct kobj_attribute *attr,    \
                                char *page)                     \
{                                                               \
        BUILD_BUG_ON(sizeof(_format) >= PAGE_SIZE);             \
        return sprintf(page, _format "\n");                     \
}                                                               \
static struct kobj_attribute format_attr_##_var =               \
        __ATTR(_name, 0444, __rapl_##_var##_show, NULL)

#define RAPL_CNTR_WIDTH 32 /* 32-bit rapl counters */

#define RAPL_EVENT_ATTR_STR(_name, v, str)                              \
static struct perf_pmu_events_attr event_attr_##v = {                  \
        .attr           = __ATTR(_name, 0444, perf_event_sysfs_show, NULL), \
        .id             = 0,                                            \
        .event_str      = str,                                          \
};

struct rapl_pmu {
        spinlock_t       lock;
        int              n_active; /* number of active events */
        struct list_head active_list;
        struct pmu       *pmu; /* pointer to rapl_pmu_class */
        ktime_t          timer_interval; /* in ktime_t unit */
        struct hrtimer   hrtimer;
};

static int rapl_hw_unit[NR_RAPL_DOMAINS] __read_mostly; /* 1/2^hw_unit Joule */
static struct pmu rapl_pmu_class;
static cpumask_t rapl_cpu_mask;
static int rapl_cntr_mask;

static DEFINE_PER_CPU(struct rapl_pmu *, rapl_pmu);
static DEFINE_PER_CPU(struct rapl_pmu *, rapl_pmu_to_free);

static struct x86_pmu_quirk *rapl_quirks;

static inline u64 rapl_read_counter(struct perf_event *event)
{
        u64 raw;

        rdmsrl(event->hw.event_base, raw);
        return raw;
}

#define rapl_add_quirk(func_)                                           \
do {                                                                    \
        static struct x86_pmu_quirk __quirk __initdata = {              \
                .func = func_,                                          \
        };                                                              \
        __quirk.next = rapl_quirks;                                     \
        rapl_quirks = &__quirk;                                         \
} while (0)

static inline u64 rapl_scale(u64 v, int cfg)
{
        if (cfg > NR_RAPL_DOMAINS) {
                pr_warn("invalid domain %d, failed to scale data\n", cfg);
                return v;
        }
        /*
         * scale delta to smallest unit (1/2^32)
         * users must then scale back: count * 1/2^32 to get Joules,
         * or use ldexp(count, -32).
         * Watts = Joules/Time delta
         */
        return v << (32 - rapl_hw_unit[cfg - 1]);
}

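/*
 * Worked example of the scaling above, assuming the common energy
 * status unit of 2^-16 J (i.e. rapl_hw_unit[cfg - 1] == 16): a raw
 * delta of 1 is shifted left by 32 - 16 = 16 bits, becoming 2^16
 * counts of 2^-32 J each, which is exactly the 2^-16 J the MSR
 * reported, now expressed in the uniform 2^-32 J unit that the sysfs
 * scale attribute advertises.
 */
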
static u64 rapl_event_update(struct perf_event *event)
{
        struct hw_perf_event *hwc = &event->hw;
        u64 prev_raw_count, new_raw_count;
        s64 delta, sdelta;
        int shift = RAPL_CNTR_WIDTH;

again:
        prev_raw_count = local64_read(&hwc->prev_count);
        rdmsrl(event->hw.event_base, new_raw_count);

        if (local64_cmpxchg(&hwc->prev_count, prev_raw_count,
                            new_raw_count) != prev_raw_count) {
                cpu_relax();
                goto again;
        }

        /*
         * Now we have the new raw value and have updated the prev
         * timestamp already. We can now calculate the elapsed delta
         * (event-)time and add that to the generic event.
         *
         * Careful, not all hw sign-extends above the physical width
         * of the count.
         */
        delta = (new_raw_count << shift) - (prev_raw_count << shift);
        delta >>= shift;

        sdelta = rapl_scale(delta, event->hw.config);

        local64_add(sdelta, &event->count);

        return new_raw_count;
}

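/*
 * The shift pair above is the usual idiom for a counter narrower than
 * 64 bits: with shift == 32, both raw values are moved into the upper
 * half, the subtraction wraps correctly modulo 2^32, and the
 * arithmetic right shift brings the difference back down. For example,
 * a counter that wrapped from 0xffffffff to 0x00000001 yields
 * (1 << 32) - (0xffffffff << 32) = 2 << 32 in u64 arithmetic, i.e. a
 * delta of 2 after the signed shift.
 */
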
static void rapl_start_hrtimer(struct rapl_pmu *pmu)
{
        hrtimer_start(&pmu->hrtimer, pmu->timer_interval,
                      HRTIMER_MODE_REL_PINNED);
}

static void rapl_stop_hrtimer(struct rapl_pmu *pmu)
{
        hrtimer_cancel(&pmu->hrtimer);
}

static enum hrtimer_restart rapl_hrtimer_handle(struct hrtimer *hrtimer)
{
        struct rapl_pmu *pmu = __this_cpu_read(rapl_pmu);
        struct perf_event *event;
        unsigned long flags;

        if (!pmu->n_active)
                return HRTIMER_NORESTART;

        spin_lock_irqsave(&pmu->lock, flags);

        list_for_each_entry(event, &pmu->active_list, active_entry)
                rapl_event_update(event);

        spin_unlock_irqrestore(&pmu->lock, flags);

        hrtimer_forward_now(hrtimer, pmu->timer_interval);

        return HRTIMER_RESTART;
}

static void rapl_hrtimer_init(struct rapl_pmu *pmu)
{
        struct hrtimer *hr = &pmu->hrtimer;

        hrtimer_init(hr, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
        hr->function = rapl_hrtimer_handle;
}

static void __rapl_pmu_event_start(struct rapl_pmu *pmu,
                                   struct perf_event *event)
{
        if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED)))
                return;

        event->hw.state = 0;

        list_add_tail(&event->active_entry, &pmu->active_list);

        local64_set(&event->hw.prev_count, rapl_read_counter(event));

        pmu->n_active++;
        if (pmu->n_active == 1)
                rapl_start_hrtimer(pmu);
}

static void rapl_pmu_event_start(struct perf_event *event, int mode)
{
        struct rapl_pmu *pmu = __this_cpu_read(rapl_pmu);
        unsigned long flags;

        spin_lock_irqsave(&pmu->lock, flags);
        __rapl_pmu_event_start(pmu, event);
        spin_unlock_irqrestore(&pmu->lock, flags);
}

static void rapl_pmu_event_stop(struct perf_event *event, int mode)
{
        struct rapl_pmu *pmu = __this_cpu_read(rapl_pmu);
        struct hw_perf_event *hwc = &event->hw;
        unsigned long flags;

        spin_lock_irqsave(&pmu->lock, flags);

        /* mark event as deactivated and stopped */
        if (!(hwc->state & PERF_HES_STOPPED)) {
                WARN_ON_ONCE(pmu->n_active <= 0);
                pmu->n_active--;
                if (pmu->n_active == 0)
                        rapl_stop_hrtimer(pmu);

                list_del(&event->active_entry);

                WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
                hwc->state |= PERF_HES_STOPPED;
        }

        /* check if update of sw counter is necessary */
        if ((mode & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) {
                /*
                 * Drain the remaining delta count out of an event
                 * that we are disabling:
                 */
                rapl_event_update(event);
                hwc->state |= PERF_HES_UPTODATE;
        }

        spin_unlock_irqrestore(&pmu->lock, flags);
}

static int rapl_pmu_event_add(struct perf_event *event, int mode)
{
        struct rapl_pmu *pmu = __this_cpu_read(rapl_pmu);
        struct hw_perf_event *hwc = &event->hw;
        unsigned long flags;

        spin_lock_irqsave(&pmu->lock, flags);

        hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED;

        if (mode & PERF_EF_START)
                __rapl_pmu_event_start(pmu, event);

        spin_unlock_irqrestore(&pmu->lock, flags);

        return 0;
}

static void rapl_pmu_event_del(struct perf_event *event, int flags)
{
        rapl_pmu_event_stop(event, PERF_EF_UPDATE);
}

static int rapl_pmu_event_init(struct perf_event *event)
{
        u64 cfg = event->attr.config & RAPL_EVENT_MASK;
        int bit, msr, ret = 0;

        /* only look at RAPL events */
        if (event->attr.type != rapl_pmu_class.type)
                return -ENOENT;

        /* check only supported bits are set */
        if (event->attr.config & ~RAPL_EVENT_MASK)
                return -EINVAL;

        /*
         * check event is known (determines counter)
         */
        switch (cfg) {
        case INTEL_RAPL_PP0:
                bit = RAPL_IDX_PP0_NRG_STAT;
                msr = MSR_PP0_ENERGY_STATUS;
                break;
        case INTEL_RAPL_PKG:
                bit = RAPL_IDX_PKG_NRG_STAT;
                msr = MSR_PKG_ENERGY_STATUS;
                break;
        case INTEL_RAPL_RAM:
                bit = RAPL_IDX_RAM_NRG_STAT;
                msr = MSR_DRAM_ENERGY_STATUS;
                break;
        case INTEL_RAPL_PP1:
                bit = RAPL_IDX_PP1_NRG_STAT;
                msr = MSR_PP1_ENERGY_STATUS;
                break;
        default:
                return -EINVAL;
        }

        /* check event supported */
        if (!(rapl_cntr_mask & (1 << bit)))
                return -EINVAL;

        /* unsupported modes and filters */
        if (event->attr.exclude_user   ||
            event->attr.exclude_kernel ||
            event->attr.exclude_hv     ||
            event->attr.exclude_idle   ||
            event->attr.exclude_host   ||
            event->attr.exclude_guest  ||
            event->attr.sample_period) /* no sampling */
                return -EINVAL;

        /* must be done before validate_group */
        event->hw.event_base = msr;
        event->hw.config = cfg;
        event->hw.idx = bit;

        return ret;
}

static void rapl_pmu_event_read(struct perf_event *event)
{
        rapl_event_update(event);
}

static ssize_t rapl_get_attr_cpumask(struct device *dev,
                                     struct device_attribute *attr, char *buf)
{
        return cpumap_print_to_pagebuf(true, buf, &rapl_cpu_mask);
}

static DEVICE_ATTR(cpumask, S_IRUGO, rapl_get_attr_cpumask, NULL);

static struct attribute *rapl_pmu_attrs[] = {
        &dev_attr_cpumask.attr,
        NULL,
};

static struct attribute_group rapl_pmu_attr_group = {
        .attrs = rapl_pmu_attrs,
};

RAPL_EVENT_ATTR_STR(energy-cores, rapl_cores, "event=0x01");
RAPL_EVENT_ATTR_STR(energy-pkg  ,   rapl_pkg, "event=0x02");
RAPL_EVENT_ATTR_STR(energy-ram  ,   rapl_ram, "event=0x03");
RAPL_EVENT_ATTR_STR(energy-gpu  ,   rapl_gpu, "event=0x04");

RAPL_EVENT_ATTR_STR(energy-cores.unit, rapl_cores_unit, "Joules");
RAPL_EVENT_ATTR_STR(energy-pkg.unit  ,   rapl_pkg_unit, "Joules");
RAPL_EVENT_ATTR_STR(energy-ram.unit  ,   rapl_ram_unit, "Joules");
RAPL_EVENT_ATTR_STR(energy-gpu.unit  ,   rapl_gpu_unit, "Joules");

/*
 * we compute in 0.23 nJ increments regardless of MSR
 */
RAPL_EVENT_ATTR_STR(energy-cores.scale, rapl_cores_scale, "2.3283064365386962890625e-10");
RAPL_EVENT_ATTR_STR(energy-pkg.scale  ,   rapl_pkg_scale, "2.3283064365386962890625e-10");
RAPL_EVENT_ATTR_STR(energy-ram.scale  ,   rapl_ram_scale, "2.3283064365386962890625e-10");
RAPL_EVENT_ATTR_STR(energy-gpu.scale  ,   rapl_gpu_scale, "2.3283064365386962890625e-10");

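/*
 * The scale string above is exactly 2^-32 (1/4294967296 =
 * 2.3283064365386962890625e-10 J), matching the ldexp(count, -32)
 * conversion documented at the top of this file: the perf tool
 * multiplies the raw 2^-32 J counts by this value to print Joules.
 */
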
static struct attribute *rapl_events_srv_attr[] = {
        EVENT_PTR(rapl_cores),
        EVENT_PTR(rapl_pkg),
        EVENT_PTR(rapl_ram),

        EVENT_PTR(rapl_cores_unit),
        EVENT_PTR(rapl_pkg_unit),
        EVENT_PTR(rapl_ram_unit),

        EVENT_PTR(rapl_cores_scale),
        EVENT_PTR(rapl_pkg_scale),
        EVENT_PTR(rapl_ram_scale),
        NULL,
};

static struct attribute *rapl_events_cln_attr[] = {
        EVENT_PTR(rapl_cores),
        EVENT_PTR(rapl_pkg),
        EVENT_PTR(rapl_gpu),

        EVENT_PTR(rapl_cores_unit),
        EVENT_PTR(rapl_pkg_unit),
        EVENT_PTR(rapl_gpu_unit),

        EVENT_PTR(rapl_cores_scale),
        EVENT_PTR(rapl_pkg_scale),
        EVENT_PTR(rapl_gpu_scale),
        NULL,
};

static struct attribute *rapl_events_hsw_attr[] = {
        EVENT_PTR(rapl_cores),
        EVENT_PTR(rapl_pkg),
        EVENT_PTR(rapl_gpu),
        EVENT_PTR(rapl_ram),

        EVENT_PTR(rapl_cores_unit),
        EVENT_PTR(rapl_pkg_unit),
        EVENT_PTR(rapl_gpu_unit),
        EVENT_PTR(rapl_ram_unit),

        EVENT_PTR(rapl_cores_scale),
        EVENT_PTR(rapl_pkg_scale),
        EVENT_PTR(rapl_gpu_scale),
        EVENT_PTR(rapl_ram_scale),
        NULL,
};

static struct attribute *rapl_events_knl_attr[] = {
        EVENT_PTR(rapl_pkg),
        EVENT_PTR(rapl_ram),

        EVENT_PTR(rapl_pkg_unit),
        EVENT_PTR(rapl_ram_unit),

        EVENT_PTR(rapl_pkg_scale),
        EVENT_PTR(rapl_ram_scale),
        NULL,
};

static struct attribute_group rapl_pmu_events_group = {
        .name = "events",
        .attrs = NULL, /* patched at runtime */
};

DEFINE_RAPL_FORMAT_ATTR(event, event, "config:0-7");
static struct attribute *rapl_formats_attr[] = {
        &format_attr_event.attr,
        NULL,
};

static struct attribute_group rapl_pmu_format_group = {
        .name = "format",
        .attrs = rapl_formats_attr,
};

const struct attribute_group *rapl_attr_groups[] = {
        &rapl_pmu_attr_group,
        &rapl_pmu_format_group,
        &rapl_pmu_events_group,
        NULL,
};

static struct pmu rapl_pmu_class = {
        .attr_groups    = rapl_attr_groups,
        .task_ctx_nr    = perf_invalid_context, /* system-wide only */
        .event_init     = rapl_pmu_event_init,
        .add            = rapl_pmu_event_add, /* must have */
        .del            = rapl_pmu_event_del, /* must have */
        .start          = rapl_pmu_event_start,
        .stop           = rapl_pmu_event_stop,
        .read           = rapl_pmu_event_read,
};

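/*
 * Once this PMU is registered (as "power", see rapl_pmu_init() below),
 * the counters can be consumed system-wide from userspace, e.g.:
 *
 *      perf stat -a -e power/energy-pkg/,power/energy-cores/ sleep 1
 *
 * The perf tool applies the .unit and .scale sysfs attributes defined
 * above, so the output is reported directly in Joules.
 */
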
static void rapl_cpu_exit(int cpu)
{
        struct rapl_pmu *pmu = per_cpu(rapl_pmu, cpu);
        int i, phys_id = topology_physical_package_id(cpu);
        int target = -1;

        /* find a new cpu on same package */
        for_each_online_cpu(i) {
                if (i == cpu)
                        continue;
                if (phys_id == topology_physical_package_id(i)) {
                        target = i;
                        break;
                }
        }
        /*
         * clear cpu from cpumask
         * if was set in cpumask and still some cpu on package,
         * then move to new cpu
         */
        if (cpumask_test_and_clear_cpu(cpu, &rapl_cpu_mask) && target >= 0)
                cpumask_set_cpu(target, &rapl_cpu_mask);

        WARN_ON(cpumask_empty(&rapl_cpu_mask));
        /*
         * migrate events and context to new cpu
         */
        if (target >= 0)
                perf_pmu_migrate_context(pmu->pmu, cpu, target);

        /* cancel overflow polling timer for CPU */
        rapl_stop_hrtimer(pmu);
}

static void rapl_cpu_init(int cpu)
{
        int i, phys_id = topology_physical_package_id(cpu);

        /* check if phys_id is already covered */
        for_each_cpu(i, &rapl_cpu_mask) {
                if (phys_id == topology_physical_package_id(i))
                        return;
        }
        /* was not found, so add it */
        cpumask_set_cpu(cpu, &rapl_cpu_mask);
}

static __init void rapl_hsw_server_quirk(void)
{
        /*
         * DRAM domain on HSW server has a fixed energy unit which can
         * be different from the unit reported by the power unit MSR.
         * See "Intel Xeon Processor E5-1600 and E5-2600 v3 Product
         * Families, V2 of 2. Datasheet, September 2014, Reference
         * Number: 330784-001".
         */
        rapl_hw_unit[RAPL_IDX_RAM_NRG_STAT] = 16;
}

static int rapl_cpu_prepare(int cpu)
{
        struct rapl_pmu *pmu = per_cpu(rapl_pmu, cpu);
        int phys_id = topology_physical_package_id(cpu);
        u64 ms;

        if (pmu)
                return 0;

        if (phys_id < 0)
                return -1;

        pmu = kzalloc_node(sizeof(*pmu), GFP_KERNEL, cpu_to_node(cpu));
        if (!pmu)
                return -1;

        spin_lock_init(&pmu->lock);

        INIT_LIST_HEAD(&pmu->active_list);

        pmu->pmu = &rapl_pmu_class;

        /*
         * use reference of 200W for scaling the timeout
         * to avoid missing counter overflows.
         * 200W = 200 Joules/sec
         * divide interval by 2 to avoid lockstep (2 * 100)
         * if hw unit is 32, then we use 2 ms 1/200/2
         */
        if (rapl_hw_unit[0] < 32)
                ms = (1000 / (2 * 100)) * (1ULL << (32 - rapl_hw_unit[0] - 1));
        else
                ms = 2;

        pmu->timer_interval = ms_to_ktime(ms);

        rapl_hrtimer_init(pmu);

        /* set RAPL pmu for this cpu for now */
        per_cpu(rapl_pmu, cpu) = pmu;

        per_cpu(rapl_pmu_to_free, cpu) = NULL;

        return 0;
}

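/*
 * Worked example of the interval math above, assuming the typical
 * energy status unit of 2^-16 J (rapl_hw_unit[0] == 16): at the 200 W
 * reference, the counter gains 200 * 2^16 increments per second, so a
 * 32-bit counter wraps after roughly 2^32 / (200 * 2^16) = 327.68
 * seconds. The formula computes (1000 / 200 / 2) * 2^15 = 163840 ms,
 * i.e. half the worst-case wrap time, so no overflow goes unobserved.
 */
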
static void rapl_cpu_kfree(int cpu)
{
        struct rapl_pmu *pmu = per_cpu(rapl_pmu_to_free, cpu);

        kfree(pmu);

        per_cpu(rapl_pmu_to_free, cpu) = NULL;
}

static int rapl_cpu_dying(int cpu)
{
        struct rapl_pmu *pmu = per_cpu(rapl_pmu, cpu);

        if (!pmu)
                return 0;

        per_cpu(rapl_pmu, cpu) = NULL;

        per_cpu(rapl_pmu_to_free, cpu) = pmu;

        return 0;
}

static int rapl_cpu_notifier(struct notifier_block *self,
                             unsigned long action, void *hcpu)
{
        unsigned int cpu = (long)hcpu;

        switch (action & ~CPU_TASKS_FROZEN) {
        case CPU_UP_PREPARE:
                rapl_cpu_prepare(cpu);
                break;
        case CPU_STARTING:
                rapl_cpu_init(cpu);
                break;
        case CPU_UP_CANCELED:
        case CPU_DYING:
                rapl_cpu_dying(cpu);
                break;
        case CPU_ONLINE:
        case CPU_DEAD:
                rapl_cpu_kfree(cpu);
                break;
        case CPU_DOWN_PREPARE:
                rapl_cpu_exit(cpu);
                break;
        default:
                break;
        }

        return NOTIFY_OK;
}

static int rapl_check_hw_unit(void)
{
        u64 msr_rapl_power_unit_bits;
        int i;

        /* protect rdmsrl() to handle virtualization */
        if (rdmsrl_safe(MSR_RAPL_POWER_UNIT, &msr_rapl_power_unit_bits))
                return -1;
        for (i = 0; i < NR_RAPL_DOMAINS; i++)
                rapl_hw_unit[i] = (msr_rapl_power_unit_bits >> 8) & 0x1FULL;

        return 0;
}

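/*
 * Bits 12:8 of MSR_RAPL_POWER_UNIT hold the Energy Status Unit (ESU)
 * field, hence the ">> 8" and the 5-bit mask above. For example, an
 * ESU value of 16 means every domain counter ticks in 2^-16 J (about
 * 15.3 uJ) increments; the HSW server quirk above overrides the DRAM
 * domain because its energy unit is fixed instead.
 */
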
static const struct x86_cpu_id rapl_cpu_match[] = {
        [0] = { .vendor = X86_VENDOR_INTEL, .family = 6 },
        [1] = {},
};

static int __init rapl_pmu_init(void)
{
        struct rapl_pmu *pmu;
        int cpu, ret;
        struct x86_pmu_quirk *quirk;
        int i;

        /*
         * check for Intel processor family 6
         */
        if (!x86_match_cpu(rapl_cpu_match))
                return 0;

        /* check supported CPU */
        switch (boot_cpu_data.x86_model) {
        case 42: /* Sandy Bridge */
        case 58: /* Ivy Bridge */
                rapl_cntr_mask = RAPL_IDX_CLN;
                rapl_pmu_events_group.attrs = rapl_events_cln_attr;
                break;
        case 63: /* Haswell-Server */
                rapl_add_quirk(rapl_hsw_server_quirk);
                rapl_cntr_mask = RAPL_IDX_SRV;
                rapl_pmu_events_group.attrs = rapl_events_srv_attr;
                break;
        case 60: /* Haswell */
        case 69: /* Haswell-ULT */
        case 61: /* Broadwell */
                rapl_cntr_mask = RAPL_IDX_HSW;
                rapl_pmu_events_group.attrs = rapl_events_hsw_attr;
                break;
        case 45: /* Sandy Bridge-EP */
        case 62: /* IvyTown */
                rapl_cntr_mask = RAPL_IDX_SRV;
                rapl_pmu_events_group.attrs = rapl_events_srv_attr;
                break;
        case 87: /* Knights Landing */
                rapl_add_quirk(rapl_hsw_server_quirk);
                rapl_cntr_mask = RAPL_IDX_KNL;
                rapl_pmu_events_group.attrs = rapl_events_knl_attr;
                break;
        default:
                /* unsupported */
                return 0;
        }

        ret = rapl_check_hw_unit();
        if (ret)
                return ret;

        /* run cpu model quirks */
        for (quirk = rapl_quirks; quirk; quirk = quirk->next)
                quirk->func();

        cpu_notifier_register_begin();

        for_each_online_cpu(cpu) {
                ret = rapl_cpu_prepare(cpu);
                if (ret)
                        goto out;
                rapl_cpu_init(cpu);
        }

        __perf_cpu_notifier(rapl_cpu_notifier);

        ret = perf_pmu_register(&rapl_pmu_class, "power", -1);
        if (WARN_ON(ret)) {
                pr_info("RAPL PMU detected, registration failed (%d), RAPL PMU disabled\n", ret);
                cpu_notifier_register_done();
                return -1;
        }

        pmu = __this_cpu_read(rapl_pmu);

        pr_info("RAPL PMU detected,"
                " API unit is 2^-32 Joules,"
                " %d fixed counters"
                " %llu ms ovfl timer\n",
                hweight32(rapl_cntr_mask),
                ktime_to_ms(pmu->timer_interval));
        for (i = 0; i < NR_RAPL_DOMAINS; i++) {
                if (rapl_cntr_mask & (1 << i)) {
                        pr_info("hw unit of domain %s 2^-%d Joules\n",
                                rapl_domain_names[i], rapl_hw_unit[i]);
                }
        }
out:
        cpu_notifier_register_done();

        return 0;
}
device_initcall(rapl_pmu_init);