/*
 * perf_event_intel_rapl.c: support Intel RAPL energy consumption counters
 * Copyright (C) 2013 Google, Inc., Stephane Eranian
 *
 * Intel RAPL interface is specified in the IA-32 Manual Vol3b
 * section 14.7.1 (September 2013)
 *
 * RAPL provides more controls than just reporting energy consumption,
 * however here we only expose the 4 energy consumption free-running
 * counters (pp0, pkg, dram, pp1).
 *
 * Each of those counters increments in a power unit defined by the
 * RAPL_POWER_UNIT MSR. On SandyBridge, this unit is 1/(2^16) Joules
 * but it can vary.
 *
 * Counter to rapl events mappings:
 *
 *  pp0 counter: consumption of all physical cores (power plane 0)
 *       event: rapl_energy_cores
 *   perf code: 0x1
 *
 *  pkg counter: consumption of the whole processor package
 *       event: rapl_energy_pkg
 *   perf code: 0x2
 *
 * dram counter: consumption of the dram domain (servers only)
 *       event: rapl_energy_dram
 *   perf code: 0x3
 *
 *  pp1 counter: consumption of the built-in GPU domain (clients only)
 *       event: rapl_energy_gpu
 *   perf code: 0x4
 *
 * We manage those counters as free running (read-only). They may be
 * used simultaneously by other tools, such as turbostat.
 *
 * The events only support system-wide mode counting. There is no
 * sampling support because it does not make sense and is not
 * supported by the RAPL hardware.
 *
 * Because we want to avoid floating-point operations in the kernel,
 * the events are all reported in fixed point arithmetic (32.32).
 * Tools must adjust the counts to convert them to Watts using
 * the duration of the measurement. Tools may use a function such as
 * ldexp(raw_count, -32);
 */
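
/*
 * For illustration only (not part of this driver): a minimal user-space
 * sketch of the conversion described above. The helper names and the
 * interval variable are hypothetical; only ldexp(raw_count, -32) comes
 * from the comment above.
 *
 *	#include <math.h>
 *	#include <stdint.h>
 *
 *	static double rapl_count_to_joules(uint64_t raw_count)
 *	{
 *		return ldexp((double)raw_count, -32);	// raw * 2^-32
 *	}
 *
 *	static double rapl_average_watts(uint64_t raw_count, double secs)
 *	{
 *		return rapl_count_to_joules(raw_count) / secs;
 *	}
 */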

#include <linux/module.h>
#include <linux/slab.h>
#include <linux/perf_event.h>
#include <asm/cpu_device_id.h>
#include "perf_event.h"

/*
 * RAPL energy status counters
 */
#define RAPL_IDX_PP0_NRG_STAT	0	/* all cores */
#define INTEL_RAPL_PP0		0x1	/* pseudo-encoding */
#define RAPL_IDX_PKG_NRG_STAT	1	/* entire package */
#define INTEL_RAPL_PKG		0x2	/* pseudo-encoding */
#define RAPL_IDX_RAM_NRG_STAT	2	/* DRAM */
#define INTEL_RAPL_RAM		0x3	/* pseudo-encoding */
#define RAPL_IDX_PP1_NRG_STAT	3	/* gpu */
#define INTEL_RAPL_PP1		0x4	/* pseudo-encoding */

#define NR_RAPL_DOMAINS		0x4

static const char *rapl_domain_names[NR_RAPL_DOMAINS] __initconst = {
	"pp0-core",
	"package",
	"dram",
	"pp1-gpu",
};

/* Clients have PP0, PKG, PP1 */
#define RAPL_IDX_CLN	(1<<RAPL_IDX_PP0_NRG_STAT|\
			 1<<RAPL_IDX_PKG_NRG_STAT|\
			 1<<RAPL_IDX_PP1_NRG_STAT)

/* Servers have PP0, PKG, RAM */
#define RAPL_IDX_SRV	(1<<RAPL_IDX_PP0_NRG_STAT|\
			 1<<RAPL_IDX_PKG_NRG_STAT|\
			 1<<RAPL_IDX_RAM_NRG_STAT)

/* Haswell clients have PP0, PKG, RAM, PP1 */
#define RAPL_IDX_HSW	(1<<RAPL_IDX_PP0_NRG_STAT|\
			 1<<RAPL_IDX_PKG_NRG_STAT|\
			 1<<RAPL_IDX_RAM_NRG_STAT|\
			 1<<RAPL_IDX_PP1_NRG_STAT)

/*
 * event code: LSB 8 bits, passed in attr->config
 * any other bit is reserved
 */
#define RAPL_EVENT_MASK	0xFFULL
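
/*
 * For illustration only: from user space, a RAPL event is selected by
 * placing one of the pseudo-encodings above in the low 8 bits of
 * perf_event_attr::config, using the dynamic type of the "power" PMU
 * (readable from /sys/bus/event_source/devices/power/type). A sketch,
 * error handling omitted; power_pmu_type and cpu are hypothetical
 * variables:
 *
 *	struct perf_event_attr attr = {
 *		.type	= power_pmu_type,	// from sysfs
 *		.config	= 0x2,			// INTEL_RAPL_PKG: energy-pkg
 *		.size	= sizeof(attr),
 *	};
 *	// system-wide counting: pid = -1, cpu from the PMU's cpumask
 *	int fd = syscall(__NR_perf_event_open, &attr, -1, cpu, -1, 0);
 */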

#define DEFINE_RAPL_FORMAT_ATTR(_var, _name, _format)		\
static ssize_t __rapl_##_var##_show(struct kobject *kobj,	\
				struct kobj_attribute *attr,	\
				char *page)			\
{								\
	BUILD_BUG_ON(sizeof(_format) >= PAGE_SIZE);		\
	return sprintf(page, _format "\n");			\
}								\
static struct kobj_attribute format_attr_##_var =		\
	__ATTR(_name, 0444, __rapl_##_var##_show, NULL)

#define RAPL_EVENT_DESC(_name, _config)				\
{								\
	.attr	= __ATTR(_name, 0444, rapl_event_show, NULL),	\
	.config	= _config,					\
}

#define RAPL_CNTR_WIDTH 32 /* 32-bit rapl counters */

#define RAPL_EVENT_ATTR_STR(_name, v, str)			\
static struct perf_pmu_events_attr event_attr_##v = {		\
	.attr		= __ATTR(_name, 0444, rapl_sysfs_show, NULL),	\
	.id		= 0,					\
	.event_str	= str,					\
};

struct rapl_pmu {
	spinlock_t	 lock;
	int		 n_active;	 /* number of active events */
	struct list_head active_list;
	struct pmu	 *pmu;		 /* pointer to rapl_pmu_class */
	ktime_t		 timer_interval; /* in ktime_t unit */
	struct hrtimer	 hrtimer;
};

static int rapl_hw_unit[NR_RAPL_DOMAINS] __read_mostly; /* 1/2^hw_unit Joule */
static struct pmu rapl_pmu_class;
static cpumask_t rapl_cpu_mask;
static int rapl_cntr_mask;
static DEFINE_PER_CPU(struct rapl_pmu *, rapl_pmu);
static DEFINE_PER_CPU(struct rapl_pmu *, rapl_pmu_to_free);
static struct x86_pmu_quirk *rapl_quirks;

static inline u64 rapl_read_counter(struct perf_event *event)
{
	u64 raw;

	rdmsrl(event->hw.event_base, raw);

	return raw;
}

#define rapl_add_quirk(func_)						\
do {									\
	static struct x86_pmu_quirk __quirk __initdata = {		\
		.func = func_,						\
	};								\
	__quirk.next = rapl_quirks;					\
	rapl_quirks = &__quirk;						\
} while (0)

static inline u64 rapl_scale(u64 v, int cfg)
{
	if (cfg > NR_RAPL_DOMAINS) {
		pr_warn("invalid domain %d, failed to scale data\n", cfg);
		return v;
	}
	/*
	 * scale delta to smallest unit (1/2^32)
	 * users must then scale back: count * 2^-32 to get Joules,
	 * e.g. via ldexp(count, -32).
	 * Watts = Joules/Time delta
	 */
	return v << (32 - rapl_hw_unit[cfg - 1]);
}
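
/*
 * Worked example (illustrative): with the SandyBridge unit of 1/2^16
 * Joules (rapl_hw_unit[] entry == 16), a raw delta of 1 is returned as
 * 1 << (32 - 16) == 65536 units of 2^-32 Joules, i.e. exactly 2^-16
 * Joules. The shift only renormalizes the unit; no precision is lost.
 */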

static u64 rapl_event_update(struct perf_event *event)
{
	struct hw_perf_event *hwc = &event->hw;
	u64 prev_raw_count, new_raw_count;
	s64 delta, sdelta;
	int shift = RAPL_CNTR_WIDTH;

again:
	prev_raw_count = local64_read(&hwc->prev_count);
	rdmsrl(event->hw.event_base, new_raw_count);

	if (local64_cmpxchg(&hwc->prev_count, prev_raw_count,
			    new_raw_count) != prev_raw_count) {
		cpu_relax();
		goto again;
	}

	/*
	 * Now we have the new raw value and have updated the prev
	 * timestamp already. We can now calculate the elapsed delta
	 * (event-)time and add that to the generic event.
	 *
	 * Careful, not all hw sign-extends above the physical width
	 * of the count.
	 */
	delta = (new_raw_count << shift) - (prev_raw_count << shift);
	delta >>= shift;

	sdelta = rapl_scale(delta, event->hw.config);

	local64_add(sdelta, &event->count);

	return new_raw_count;
}
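
/*
 * Worked example (illustrative): the shift pair above discards bits
 * beyond the 32-bit counter width so wraparound yields the right delta.
 * With shift == 32, prev == 0xffffff00 and new == 0x00000010 (the
 * counter wrapped), (new << 32) - (prev << 32) leaves 0x110 in the high
 * half, and the arithmetic right shift brings it back to 0x110 == 272,
 * exactly the energy consumed across the wrap.
 */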

static void rapl_start_hrtimer(struct rapl_pmu *pmu)
{
	__hrtimer_start_range_ns(&pmu->hrtimer,
			pmu->timer_interval, 0,
			HRTIMER_MODE_REL_PINNED, 0);
}

static void rapl_stop_hrtimer(struct rapl_pmu *pmu)
{
	hrtimer_cancel(&pmu->hrtimer);
}

static enum hrtimer_restart rapl_hrtimer_handle(struct hrtimer *hrtimer)
{
	struct rapl_pmu *pmu = __this_cpu_read(rapl_pmu);
	struct perf_event *event;
	unsigned long flags;

	if (!pmu->n_active)
		return HRTIMER_NORESTART;

	spin_lock_irqsave(&pmu->lock, flags);

	list_for_each_entry(event, &pmu->active_list, active_entry) {
		rapl_event_update(event);
	}

	spin_unlock_irqrestore(&pmu->lock, flags);

	hrtimer_forward_now(hrtimer, pmu->timer_interval);

	return HRTIMER_RESTART;
}
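
/*
 * Note: this hrtimer is not sampling; it only folds the 32-bit energy
 * MSRs into the 64-bit software count often enough that no wraparound
 * is missed (see the timer_interval computation in rapl_cpu_prepare()
 * below).
 */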

static void rapl_hrtimer_init(struct rapl_pmu *pmu)
{
	struct hrtimer *hr = &pmu->hrtimer;

	hrtimer_init(hr, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
	hr->function = rapl_hrtimer_handle;
}

static void __rapl_pmu_event_start(struct rapl_pmu *pmu,
				   struct perf_event *event)
{
	if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED)))
		return;

	event->hw.state = 0;

	list_add_tail(&event->active_entry, &pmu->active_list);

	local64_set(&event->hw.prev_count, rapl_read_counter(event));

	pmu->n_active++;
	if (pmu->n_active == 1)
		rapl_start_hrtimer(pmu);
}

static void rapl_pmu_event_start(struct perf_event *event, int mode)
{
	struct rapl_pmu *pmu = __this_cpu_read(rapl_pmu);
	unsigned long flags;

	spin_lock_irqsave(&pmu->lock, flags);
	__rapl_pmu_event_start(pmu, event);
	spin_unlock_irqrestore(&pmu->lock, flags);
}

static void rapl_pmu_event_stop(struct perf_event *event, int mode)
{
	struct rapl_pmu *pmu = __this_cpu_read(rapl_pmu);
	struct hw_perf_event *hwc = &event->hw;
	unsigned long flags;

	spin_lock_irqsave(&pmu->lock, flags);

	/* mark event as deactivated and stopped */
	if (!(hwc->state & PERF_HES_STOPPED)) {
		WARN_ON_ONCE(pmu->n_active <= 0);
		pmu->n_active--;
		if (pmu->n_active == 0)
			rapl_stop_hrtimer(pmu);

		list_del(&event->active_entry);

		WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
		hwc->state |= PERF_HES_STOPPED;
	}

	/* check if update of sw counter is necessary */
	if ((mode & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) {
		/*
		 * Drain the remaining delta count out of an event
		 * that we are disabling:
		 */
		rapl_event_update(event);
		hwc->state |= PERF_HES_UPTODATE;
	}

	spin_unlock_irqrestore(&pmu->lock, flags);
}

static int rapl_pmu_event_add(struct perf_event *event, int mode)
{
	struct rapl_pmu *pmu = __this_cpu_read(rapl_pmu);
	struct hw_perf_event *hwc = &event->hw;
	unsigned long flags;

	spin_lock_irqsave(&pmu->lock, flags);

	hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED;

	if (mode & PERF_EF_START)
		__rapl_pmu_event_start(pmu, event);

	spin_unlock_irqrestore(&pmu->lock, flags);

	return 0;
}

static void rapl_pmu_event_del(struct perf_event *event, int flags)
{
	rapl_pmu_event_stop(event, PERF_EF_UPDATE);
}

static int rapl_pmu_event_init(struct perf_event *event)
{
	u64 cfg = event->attr.config & RAPL_EVENT_MASK;
	int bit, msr, ret = 0;

	/* only look at RAPL events */
	if (event->attr.type != rapl_pmu_class.type)
		return -ENOENT;

	/* check only supported bits are set */
	if (event->attr.config & ~RAPL_EVENT_MASK)
		return -EINVAL;

	/*
	 * check event is known (determines counter)
	 */
	switch (cfg) {
	case INTEL_RAPL_PP0:
		bit = RAPL_IDX_PP0_NRG_STAT;
		msr = MSR_PP0_ENERGY_STATUS;
		break;
	case INTEL_RAPL_PKG:
		bit = RAPL_IDX_PKG_NRG_STAT;
		msr = MSR_PKG_ENERGY_STATUS;
		break;
	case INTEL_RAPL_RAM:
		bit = RAPL_IDX_RAM_NRG_STAT;
		msr = MSR_DRAM_ENERGY_STATUS;
		break;
	case INTEL_RAPL_PP1:
		bit = RAPL_IDX_PP1_NRG_STAT;
		msr = MSR_PP1_ENERGY_STATUS;
		break;
	default:
		return -EINVAL;
	}

	/* check event supported */
	if (!(rapl_cntr_mask & (1 << bit)))
		return -EINVAL;

	/* unsupported modes and filters */
	if (event->attr.exclude_user   ||
	    event->attr.exclude_kernel ||
	    event->attr.exclude_hv     ||
	    event->attr.exclude_idle   ||
	    event->attr.exclude_host   ||
	    event->attr.exclude_guest  ||
	    event->attr.sample_period) /* no sampling */
		return -EINVAL;

	/* must be done before validate_group */
	event->hw.event_base = msr;
	event->hw.config = cfg;
	event->hw.idx = bit;

	return ret;
}

static void rapl_pmu_event_read(struct perf_event *event)
{
	rapl_event_update(event);
}

static ssize_t rapl_get_attr_cpumask(struct device *dev,
				struct device_attribute *attr, char *buf)
{
	return cpumap_print_to_pagebuf(true, buf, &rapl_cpu_mask);
}

static DEVICE_ATTR(cpumask, S_IRUGO, rapl_get_attr_cpumask, NULL);

static struct attribute *rapl_pmu_attrs[] = {
	&dev_attr_cpumask.attr,
	NULL,
};

static struct attribute_group rapl_pmu_attr_group = {
	.attrs = rapl_pmu_attrs,
};

static ssize_t rapl_sysfs_show(struct device *dev,
			       struct device_attribute *attr,
			       char *page)
{
	struct perf_pmu_events_attr *pmu_attr =
		container_of(attr, struct perf_pmu_events_attr, attr);

	if (pmu_attr->event_str)
		return sprintf(page, "%s", pmu_attr->event_str);

	return 0;
}

RAPL_EVENT_ATTR_STR(energy-cores, rapl_cores, "event=0x01");
RAPL_EVENT_ATTR_STR(energy-pkg  ,   rapl_pkg, "event=0x02");
RAPL_EVENT_ATTR_STR(energy-ram  ,   rapl_ram, "event=0x03");
RAPL_EVENT_ATTR_STR(energy-gpu  ,   rapl_gpu, "event=0x04");

RAPL_EVENT_ATTR_STR(energy-cores.unit, rapl_cores_unit, "Joules");
RAPL_EVENT_ATTR_STR(energy-pkg.unit  ,   rapl_pkg_unit, "Joules");
RAPL_EVENT_ATTR_STR(energy-ram.unit  ,   rapl_ram_unit, "Joules");
RAPL_EVENT_ATTR_STR(energy-gpu.unit  ,   rapl_gpu_unit, "Joules");

/*
 * we compute in ~0.23 nJ increments (2^-32 Joules) regardless of
 * the MSR unit
 */
RAPL_EVENT_ATTR_STR(energy-cores.scale, rapl_cores_scale, "2.3283064365386962890625e-10");
RAPL_EVENT_ATTR_STR(energy-pkg.scale,     rapl_pkg_scale, "2.3283064365386962890625e-10");
RAPL_EVENT_ATTR_STR(energy-ram.scale,     rapl_ram_scale, "2.3283064365386962890625e-10");
RAPL_EVENT_ATTR_STR(energy-gpu.scale,     rapl_gpu_scale, "2.3283064365386962890625e-10");
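
/*
 * Worked check (illustrative): the .scale string above is exactly
 * 2^-32 = 2.3283064365386962890625e-10, matching the fixed-point unit
 * produced by rapl_scale(). perf multiplies the raw count by this
 * scale, so "perf stat" already prints Joules.
 */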

static struct attribute *rapl_events_srv_attr[] = {
	EVENT_PTR(rapl_cores),
	EVENT_PTR(rapl_pkg),
	EVENT_PTR(rapl_ram),

	EVENT_PTR(rapl_cores_unit),
	EVENT_PTR(rapl_pkg_unit),
	EVENT_PTR(rapl_ram_unit),

	EVENT_PTR(rapl_cores_scale),
	EVENT_PTR(rapl_pkg_scale),
	EVENT_PTR(rapl_ram_scale),
	NULL,
};

static struct attribute *rapl_events_cln_attr[] = {
	EVENT_PTR(rapl_cores),
	EVENT_PTR(rapl_pkg),
	EVENT_PTR(rapl_gpu),

	EVENT_PTR(rapl_cores_unit),
	EVENT_PTR(rapl_pkg_unit),
	EVENT_PTR(rapl_gpu_unit),

	EVENT_PTR(rapl_cores_scale),
	EVENT_PTR(rapl_pkg_scale),
	EVENT_PTR(rapl_gpu_scale),
	NULL,
};

static struct attribute *rapl_events_hsw_attr[] = {
	EVENT_PTR(rapl_cores),
	EVENT_PTR(rapl_pkg),
	EVENT_PTR(rapl_gpu),
	EVENT_PTR(rapl_ram),

	EVENT_PTR(rapl_cores_unit),
	EVENT_PTR(rapl_pkg_unit),
	EVENT_PTR(rapl_gpu_unit),
	EVENT_PTR(rapl_ram_unit),

	EVENT_PTR(rapl_cores_scale),
	EVENT_PTR(rapl_pkg_scale),
	EVENT_PTR(rapl_gpu_scale),
	EVENT_PTR(rapl_ram_scale),
	NULL,
};

static struct attribute_group rapl_pmu_events_group = {
	.name = "events",
	.attrs = NULL, /* patched at runtime */
};

DEFINE_RAPL_FORMAT_ATTR(event, event, "config:0-7");
static struct attribute *rapl_formats_attr[] = {
	&format_attr_event.attr,
	NULL,
};

static struct attribute_group rapl_pmu_format_group = {
	.name = "format",
	.attrs = rapl_formats_attr,
};

const struct attribute_group *rapl_attr_groups[] = {
	&rapl_pmu_attr_group,
	&rapl_pmu_format_group,
	&rapl_pmu_events_group,
	NULL,
};

static struct pmu rapl_pmu_class = {
	.attr_groups	= rapl_attr_groups,
	.task_ctx_nr	= perf_invalid_context, /* system-wide only */
	.event_init	= rapl_pmu_event_init,
	.add		= rapl_pmu_event_add, /* must have */
	.del		= rapl_pmu_event_del, /* must have */
	.start		= rapl_pmu_event_start,
	.stop		= rapl_pmu_event_stop,
	.read		= rapl_pmu_event_read,
};
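
/*
 * Usage sketch (illustrative): once this PMU is registered as "power"
 * (see rapl_pmu_init() below), the events are visible in sysfs and can
 * be counted system-wide, e.g.:
 *
 *	# perf stat -a -e power/energy-pkg/,power/energy-cores/ sleep 10
 *
 * perf applies the .unit and .scale attributes above, so results are
 * printed in Joules.
 */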

static void rapl_cpu_exit(int cpu)
{
	struct rapl_pmu *pmu = per_cpu(rapl_pmu, cpu);
	int i, phys_id = topology_physical_package_id(cpu);
	int target = -1;

	/* find a new cpu on same package */
	for_each_online_cpu(i) {
		if (i == cpu)
			continue;
		if (phys_id == topology_physical_package_id(i)) {
			target = i;
			break;
		}
	}
	/*
	 * clear cpu from cpumask
	 * if was set in cpumask and still some cpu on package,
	 * then move to new cpu
	 */
	if (cpumask_test_and_clear_cpu(cpu, &rapl_cpu_mask) && target >= 0)
		cpumask_set_cpu(target, &rapl_cpu_mask);

	WARN_ON(cpumask_empty(&rapl_cpu_mask));
	/*
	 * migrate events and context to new cpu
	 */
	if (target >= 0)
		perf_pmu_migrate_context(pmu->pmu, cpu, target);

	/* cancel overflow polling timer for CPU */
	rapl_stop_hrtimer(pmu);
}

static void rapl_cpu_init(int cpu)
{
	int i, phys_id = topology_physical_package_id(cpu);

	/* check if phys_id is already covered */
	for_each_cpu(i, &rapl_cpu_mask) {
		if (phys_id == topology_physical_package_id(i))
			return;
	}
	/* was not found, so add it */
	cpumask_set_cpu(cpu, &rapl_cpu_mask);
}

static __init void rapl_hsw_server_quirk(void)
{
	/*
	 * DRAM domain on HSW server has a fixed energy unit which can
	 * differ from the unit in the power unit MSR. See "Intel Xeon
	 * Processor E5-1600 and E5-2600 v3 Product Families, Datasheet
	 * Volume 2 of 2", September 2014, Reference Number: 330784-001.
	 */
	rapl_hw_unit[RAPL_IDX_RAM_NRG_STAT] = 16;
}

static int rapl_cpu_prepare(int cpu)
{
	struct rapl_pmu *pmu = per_cpu(rapl_pmu, cpu);
	int phys_id = topology_physical_package_id(cpu);
	u64 ms;

	if (pmu)
		return 0;

	if (phys_id < 0)
		return -1;

	pmu = kzalloc_node(sizeof(*pmu), GFP_KERNEL, cpu_to_node(cpu));
	if (!pmu)
		return -1;

	spin_lock_init(&pmu->lock);

	INIT_LIST_HEAD(&pmu->active_list);

	pmu->pmu = &rapl_pmu_class;

	/*
	 * use reference of 200W for scaling the timeout
	 * to avoid missing counter overflows.
	 * 200W = 200 Joules/sec
	 * divide interval by 2 to avoid lockstep (2 * 100)
	 * if hw unit is 32, then we use 2 ms 1/200/2
	 */
	if (rapl_hw_unit[0] < 32)
		ms = (1000 / (2 * 100)) * (1ULL << (32 - rapl_hw_unit[0] - 1));
	else
		ms = 2;
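
	/*
	 * Worked example (illustrative): with the SandyBridge unit of
	 * 1/2^16 Joules, the 32-bit counter wraps after 2^32 * 2^-16 =
	 * 65536 Joules, i.e. ~328 s at the 200 W reference. The formula
	 * gives ms = 5 * 2^(32 - 16 - 1) = 163840 ms (~164 s), half the
	 * worst-case wrap time.
	 */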

	pmu->timer_interval = ms_to_ktime(ms);

	rapl_hrtimer_init(pmu);

	/* set RAPL pmu for this cpu for now */
	per_cpu(rapl_pmu, cpu) = pmu;
	per_cpu(rapl_pmu_to_free, cpu) = NULL;

	return 0;
}

static void rapl_cpu_kfree(int cpu)
{
	struct rapl_pmu *pmu = per_cpu(rapl_pmu_to_free, cpu);

	kfree(pmu);

	per_cpu(rapl_pmu_to_free, cpu) = NULL;
}

static int rapl_cpu_dying(int cpu)
{
	struct rapl_pmu *pmu = per_cpu(rapl_pmu, cpu);

	if (!pmu)
		return 0;

	per_cpu(rapl_pmu, cpu) = NULL;

	per_cpu(rapl_pmu_to_free, cpu) = pmu;

	return 0;
}

static int rapl_cpu_notifier(struct notifier_block *self,
			     unsigned long action, void *hcpu)
{
	unsigned int cpu = (long)hcpu;

	switch (action & ~CPU_TASKS_FROZEN) {
	case CPU_UP_PREPARE:
		rapl_cpu_prepare(cpu);
		break;
	case CPU_STARTING:
		rapl_cpu_init(cpu);
		break;
	case CPU_UP_CANCELED:
	case CPU_DYING:
		rapl_cpu_dying(cpu);
		break;
	case CPU_ONLINE:
	case CPU_DEAD:
		rapl_cpu_kfree(cpu);
		break;
	case CPU_DOWN_PREPARE:
		rapl_cpu_exit(cpu);
		break;
	default:
		break;
	}

	return NOTIFY_OK;
}

static int rapl_check_hw_unit(void)
{
	u64 msr_rapl_power_unit_bits;
	int i;

	/* protect rdmsrl() to handle virtualization */
	if (rdmsrl_safe(MSR_RAPL_POWER_UNIT, &msr_rapl_power_unit_bits))
		return -1;
	for (i = 0; i < NR_RAPL_DOMAINS; i++)
		rapl_hw_unit[i] = (msr_rapl_power_unit_bits >> 8) & 0x1FULL;

	return 0;
}
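
/*
 * Worked example (illustrative): bits 12:8 of MSR_RAPL_POWER_UNIT hold
 * the Energy Status Unit. A typical SandyBridge raw value of 0xA1003
 * (assumed here for illustration) yields (0xA1003 >> 8) & 0x1F == 0x10,
 * i.e. the energy counters tick in 1/2^16 Joule increments.
 */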

static const struct x86_cpu_id rapl_cpu_match[] = {
	[0] = { .vendor = X86_VENDOR_INTEL, .family = 6 },
	[1] = {},
};

static int __init rapl_pmu_init(void)
{
	struct rapl_pmu *pmu;
	int cpu, ret;
	struct x86_pmu_quirk *quirk;
	int i;

	/*
	 * check for Intel processor family 6
	 */
	if (!x86_match_cpu(rapl_cpu_match))
		return 0;

	/* check supported CPU */
	switch (boot_cpu_data.x86_model) {
	case 42: /* Sandy Bridge */
	case 58: /* Ivy Bridge */
		rapl_cntr_mask = RAPL_IDX_CLN;
		rapl_pmu_events_group.attrs = rapl_events_cln_attr;
		break;
	case 63: /* Haswell-Server */
		rapl_add_quirk(rapl_hsw_server_quirk);
		rapl_cntr_mask = RAPL_IDX_SRV;
		rapl_pmu_events_group.attrs = rapl_events_srv_attr;
		break;
	case 60: /* Haswell */
	case 69: /* Haswell-ULT */
	case 61: /* Broadwell */
		rapl_cntr_mask = RAPL_IDX_HSW;
		rapl_pmu_events_group.attrs = rapl_events_hsw_attr;
		break;
	case 45: /* Sandy Bridge-EP */
	case 62: /* IvyTown */
		rapl_cntr_mask = RAPL_IDX_SRV;
		rapl_pmu_events_group.attrs = rapl_events_srv_attr;
		break;
	default:
		/* unsupported */
		return 0;
	}

	ret = rapl_check_hw_unit();
	if (ret)
		return ret;

	/* run cpu model quirks */
	for (quirk = rapl_quirks; quirk; quirk = quirk->next)
		quirk->func();
	cpu_notifier_register_begin();

	for_each_online_cpu(cpu) {
		ret = rapl_cpu_prepare(cpu);
		if (ret)
			goto out;
		rapl_cpu_init(cpu);
	}

	__perf_cpu_notifier(rapl_cpu_notifier);

	ret = perf_pmu_register(&rapl_pmu_class, "power", -1);
	if (WARN_ON(ret)) {
		pr_info("RAPL PMU detected, registration failed (%d), RAPL PMU disabled\n", ret);
		cpu_notifier_register_done();
		return -1;
	}

	pmu = __this_cpu_read(rapl_pmu);

	pr_info("RAPL PMU detected,"
		" API unit is 2^-32 Joules,"
		" %d fixed counters"
		" %llu ms ovfl timer\n",
		hweight32(rapl_cntr_mask),
		ktime_to_ms(pmu->timer_interval));
	for (i = 0; i < NR_RAPL_DOMAINS; i++) {
		if (rapl_cntr_mask & (1 << i)) {
			pr_info("hw unit of domain %s 2^-%d Joules\n",
				rapl_domain_names[i], rapl_hw_unit[i]);
		}
	}
out:
	cpu_notifier_register_done();

	return 0;
}
device_initcall(rapl_pmu_init);