/*
 * perf_event_intel_rapl.c: support Intel RAPL energy consumption counters
 * Copyright (C) 2013 Google, Inc., Stephane Eranian
 *
 * Intel RAPL interface is specified in the IA-32 Manual Vol3b
 * section 14.7.1 (September 2013)
 *
 * RAPL provides more controls than just reporting energy consumption;
 * here, however, we only expose the free running energy consumption
 * counters (pp0, pkg, dram, pp1).
 *
 * Each of those counters increments in an energy unit defined by the
 * RAPL_POWER_UNIT MSR. On SandyBridge, this unit is 1/(2^16) Joules,
 * but it can vary between parts.
 *
 * Counter to rapl events mappings:
 *
 * pp0 counter: consumption of all physical cores (power plane 0)
 * event: rapl_energy_cores
 * perf code: 0x1
 *
 * pkg counter: consumption of the whole processor package
 * event: rapl_energy_pkg
 * perf code: 0x2
 *
 * dram counter: consumption of the dram domain (servers only)
 * event: rapl_energy_dram
 * perf code: 0x3
 *
 * pp1 counter: consumption of the built-in GPU domain (clients only)
 * event: rapl_energy_gpu
 * perf code: 0x4
 *
 * We manage those counters as free running (read-only). They may be
 * used simultaneously by other tools, such as turbostat.
 *
 * The events only support system-wide mode counting. There is no
 * sampling support because it does not make sense and is not
 * supported by the RAPL hardware.
 *
 * Because we want to avoid floating-point operations in the kernel,
 * the events are all reported in fixed point arithmetic (32.32).
 * Tools must scale the counts by 2^-32 to obtain Joules, for example
 * with ldexp(raw_count, -32), and divide by the duration of the
 * measurement to obtain Watts.
 */
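
/*
 * Userspace sketch (not part of this driver): one way a tool might read
 * the package energy counter through the perf syscall and convert the
 * 32.32 fixed-point value with ldexp(). The PMU type is read from sysfs
 * and config 0x2 selects rapl_energy_pkg as listed above; error handling
 * is minimal and root (or a relaxed perf_event_paranoid) is assumed.
 *
 *	#include <stdio.h>
 *	#include <stdlib.h>
 *	#include <string.h>
 *	#include <math.h>
 *	#include <unistd.h>
 *	#include <sys/syscall.h>
 *	#include <linux/perf_event.h>
 *
 *	int main(void)
 *	{
 *		struct perf_event_attr attr;
 *		long long count;
 *		int type, fd;
 *		FILE *f = fopen("/sys/bus/event_source/devices/power/type", "r");
 *
 *		if (!f || fscanf(f, "%d", &type) != 1)
 *			return 1;
 *		fclose(f);
 *
 *		memset(&attr, 0, sizeof(attr));
 *		attr.type = type;
 *		attr.size = sizeof(attr);
 *		attr.config = 0x2;			// rapl_energy_pkg
 *
 *		// system-wide counting on CPU 0 (pid == -1, cpu == 0)
 *		fd = syscall(__NR_perf_event_open, &attr, -1, 0, -1, 0);
 *		if (fd < 0)
 *			return 1;
 *
 *		sleep(1);
 *		if (read(fd, &count, sizeof(count)) != sizeof(count))
 *			return 1;
 *
 *		printf("pkg energy: %.6f Joules\n", ldexp(count, -32));
 *		close(fd);
 *		return 0;
 *	}
 */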

#include <linux/module.h>
#include <linux/slab.h>
#include <linux/perf_event.h>
#include <asm/cpu_device_id.h>
#include "perf_event.h"

/*
 * RAPL energy status counters
 */
#define RAPL_IDX_PP0_NRG_STAT	0	/* all cores */
#define INTEL_RAPL_PP0		0x1	/* pseudo-encoding */
#define RAPL_IDX_PKG_NRG_STAT	1	/* entire package */
#define INTEL_RAPL_PKG		0x2	/* pseudo-encoding */
#define RAPL_IDX_RAM_NRG_STAT	2	/* DRAM */
#define INTEL_RAPL_RAM		0x3	/* pseudo-encoding */
#define RAPL_IDX_PP1_NRG_STAT	3	/* gpu */
#define INTEL_RAPL_PP1		0x4	/* pseudo-encoding */

#define NR_RAPL_DOMAINS		0x4
static const char *rapl_domain_names[NR_RAPL_DOMAINS] __initconst = {
	"pp0-core",
	"package",
	"dram",
	"pp1-gpu",
};

/* Clients have PP0, PKG, PP1 */
#define RAPL_IDX_CLN	(1<<RAPL_IDX_PP0_NRG_STAT|\
			 1<<RAPL_IDX_PKG_NRG_STAT|\
			 1<<RAPL_IDX_PP1_NRG_STAT)

/* Servers have PP0, PKG, RAM */
#define RAPL_IDX_SRV	(1<<RAPL_IDX_PP0_NRG_STAT|\
			 1<<RAPL_IDX_PKG_NRG_STAT|\
			 1<<RAPL_IDX_RAM_NRG_STAT)

/* Haswell and Broadwell clients have PP0, PKG, RAM, PP1 */
#define RAPL_IDX_HSW	(1<<RAPL_IDX_PP0_NRG_STAT|\
			 1<<RAPL_IDX_PKG_NRG_STAT|\
			 1<<RAPL_IDX_RAM_NRG_STAT|\
			 1<<RAPL_IDX_PP1_NRG_STAT)

/* Knights Landing has PKG, RAM */
#define RAPL_IDX_KNL	(1<<RAPL_IDX_PKG_NRG_STAT|\
			 1<<RAPL_IDX_RAM_NRG_STAT)

/*
 * event code: LSB 8 bits, passed in attr->config
 * any other bit is reserved
 */
#define RAPL_EVENT_MASK	0xFFULL

#define DEFINE_RAPL_FORMAT_ATTR(_var, _name, _format)		\
static ssize_t __rapl_##_var##_show(struct kobject *kobj,	\
				struct kobj_attribute *attr,	\
				char *page)			\
{								\
	BUILD_BUG_ON(sizeof(_format) >= PAGE_SIZE);		\
	return sprintf(page, _format "\n");			\
}								\
static struct kobj_attribute format_attr_##_var =		\
	__ATTR(_name, 0444, __rapl_##_var##_show, NULL)

#define RAPL_EVENT_DESC(_name, _config)				\
{								\
	.attr	= __ATTR(_name, 0444, rapl_event_show, NULL),	\
	.config	= _config,					\
}

#define RAPL_CNTR_WIDTH 32 /* 32-bit rapl counters */

#define RAPL_EVENT_ATTR_STR(_name, v, str)				\
static struct perf_pmu_events_attr event_attr_##v = {			\
	.attr		= __ATTR(_name, 0444, rapl_sysfs_show, NULL),	\
	.id		= 0,						\
	.event_str	= str,						\
};
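
/*
 * The attributes built with the macros above surface in sysfs once the
 * PMU is registered. Example paths, shown for illustration and assuming
 * the "power" name used in rapl_pmu_init() below:
 *
 *	/sys/bus/event_source/devices/power/type
 *	/sys/bus/event_source/devices/power/format/event
 *	/sys/bus/event_source/devices/power/events/energy-pkg
 *	/sys/bus/event_source/devices/power/events/energy-pkg.unit
 *	/sys/bus/event_source/devices/power/events/energy-pkg.scale
 */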

struct rapl_pmu {
	spinlock_t	 lock;
	int		 n_active;	/* number of active events */
	struct list_head active_list;
	struct pmu	 *pmu;		/* pointer to rapl_pmu_class */
	ktime_t		 timer_interval; /* in ktime_t unit */
	struct hrtimer	 hrtimer;
};

static int rapl_hw_unit[NR_RAPL_DOMAINS] __read_mostly; /* 1/2^hw_unit Joule */
static struct pmu rapl_pmu_class;
static cpumask_t rapl_cpu_mask;
static int rapl_cntr_mask;

static DEFINE_PER_CPU(struct rapl_pmu *, rapl_pmu);
static DEFINE_PER_CPU(struct rapl_pmu *, rapl_pmu_to_free);

static struct x86_pmu_quirk *rapl_quirks;

static inline u64 rapl_read_counter(struct perf_event *event)
{
	u64 raw;

	rdmsrl(event->hw.event_base, raw);
	return raw;
}

#define rapl_add_quirk(func_)						\
do {									\
	static struct x86_pmu_quirk __quirk __initdata = {		\
		.func = func_,						\
	};								\
	__quirk.next = rapl_quirks;					\
	rapl_quirks = &__quirk;						\
} while (0)

static inline u64 rapl_scale(u64 v, int cfg)
{
	if (cfg > NR_RAPL_DOMAINS) {
		pr_warn("invalid domain %d, failed to scale data\n", cfg);
		return v;
	}
	/*
	 * scale delta to smallest unit (1/2^32)
	 * users must then scale back: count * (1/2^32) to get Joules,
	 * or use ldexp(count, -32).
	 * Watts = Joules/Time delta
	 */
	return v << (32 - rapl_hw_unit[cfg - 1]);
}
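
/*
 * Worked example of the scaling above, assuming the SandyBridge default
 * unit of 1/2^16 Joules (rapl_hw_unit[cfg - 1] == 16): a raw delta of
 * 0x100 (256 hardware units, i.e. 256/2^16 J) is shifted left by 16 and
 * returned as 0x1000000; userspace then recovers the energy with
 * ldexp(0x1000000, -32) = 1/256 J.
 */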

static u64 rapl_event_update(struct perf_event *event)
{
	struct hw_perf_event *hwc = &event->hw;
	u64 prev_raw_count, new_raw_count;
	s64 delta, sdelta;
	int shift = RAPL_CNTR_WIDTH;

again:
	prev_raw_count = local64_read(&hwc->prev_count);
	rdmsrl(event->hw.event_base, new_raw_count);

	if (local64_cmpxchg(&hwc->prev_count, prev_raw_count,
			    new_raw_count) != prev_raw_count) {
		cpu_relax();
		goto again;
	}

	/*
	 * Now we have the new raw value and have updated the prev
	 * timestamp already. We can now calculate the elapsed delta
	 * (event-)time and add that to the generic event.
	 *
	 * Careful, not all hw sign-extends above the physical width
	 * of the count.
	 */
	delta = (new_raw_count << shift) - (prev_raw_count << shift);
	delta >>= shift;

	sdelta = rapl_scale(delta, event->hw.config);

	local64_add(sdelta, &event->count);

	return new_raw_count;
}
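
/*
 * Worked example of the shift trick above, assuming a counter that is
 * only 32 bits wide (RAPL_CNTR_WIDTH): with prev_raw_count = 0xfffffff0
 * and new_raw_count = 0x00000010 after a wrap, shifting both values up
 * by 32 and the difference back down sign-extends correctly, so
 * delta = 0x20 rather than a huge negative number.
 */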

static void rapl_start_hrtimer(struct rapl_pmu *pmu)
{
	hrtimer_start(&pmu->hrtimer, pmu->timer_interval,
		      HRTIMER_MODE_REL_PINNED);
}

static void rapl_stop_hrtimer(struct rapl_pmu *pmu)
{
	hrtimer_cancel(&pmu->hrtimer);
}

static enum hrtimer_restart rapl_hrtimer_handle(struct hrtimer *hrtimer)
{
	struct rapl_pmu *pmu = __this_cpu_read(rapl_pmu);
	struct perf_event *event;
	unsigned long flags;

	if (!pmu->n_active)
		return HRTIMER_NORESTART;

	spin_lock_irqsave(&pmu->lock, flags);

	list_for_each_entry(event, &pmu->active_list, active_entry) {
		rapl_event_update(event);
	}

	spin_unlock_irqrestore(&pmu->lock, flags);

	hrtimer_forward_now(hrtimer, pmu->timer_interval);

	return HRTIMER_RESTART;
}

static void rapl_hrtimer_init(struct rapl_pmu *pmu)
{
	struct hrtimer *hr = &pmu->hrtimer;

	hrtimer_init(hr, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
	hr->function = rapl_hrtimer_handle;
}

static void __rapl_pmu_event_start(struct rapl_pmu *pmu,
				   struct perf_event *event)
{
	if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED)))
		return;

	event->hw.state = 0;

	list_add_tail(&event->active_entry, &pmu->active_list);

	local64_set(&event->hw.prev_count, rapl_read_counter(event));

	pmu->n_active++;
	if (pmu->n_active == 1)
		rapl_start_hrtimer(pmu);
}

static void rapl_pmu_event_start(struct perf_event *event, int mode)
{
	struct rapl_pmu *pmu = __this_cpu_read(rapl_pmu);
	unsigned long flags;

	spin_lock_irqsave(&pmu->lock, flags);
	__rapl_pmu_event_start(pmu, event);
	spin_unlock_irqrestore(&pmu->lock, flags);
}

static void rapl_pmu_event_stop(struct perf_event *event, int mode)
{
	struct rapl_pmu *pmu = __this_cpu_read(rapl_pmu);
	struct hw_perf_event *hwc = &event->hw;
	unsigned long flags;

	spin_lock_irqsave(&pmu->lock, flags);

	/* mark event as deactivated and stopped */
	if (!(hwc->state & PERF_HES_STOPPED)) {
		WARN_ON_ONCE(pmu->n_active <= 0);
		pmu->n_active--;
		if (pmu->n_active == 0)
			rapl_stop_hrtimer(pmu);

		list_del(&event->active_entry);

		WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
		hwc->state |= PERF_HES_STOPPED;
	}

	/* check if update of sw counter is necessary */
	if ((mode & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) {
		/*
		 * Drain the remaining delta count out of an event
		 * that we are disabling:
		 */
		rapl_event_update(event);
		hwc->state |= PERF_HES_UPTODATE;
	}

	spin_unlock_irqrestore(&pmu->lock, flags);
}

static int rapl_pmu_event_add(struct perf_event *event, int mode)
{
	struct rapl_pmu *pmu = __this_cpu_read(rapl_pmu);
	struct hw_perf_event *hwc = &event->hw;
	unsigned long flags;

	spin_lock_irqsave(&pmu->lock, flags);

	hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED;

	if (mode & PERF_EF_START)
		__rapl_pmu_event_start(pmu, event);

	spin_unlock_irqrestore(&pmu->lock, flags);

	return 0;
}

static void rapl_pmu_event_del(struct perf_event *event, int flags)
{
	rapl_pmu_event_stop(event, PERF_EF_UPDATE);
}

static int rapl_pmu_event_init(struct perf_event *event)
{
	u64 cfg = event->attr.config & RAPL_EVENT_MASK;
	int bit, msr, ret = 0;

	/* only look at RAPL events */
	if (event->attr.type != rapl_pmu_class.type)
		return -ENOENT;

	/* check only supported bits are set */
	if (event->attr.config & ~RAPL_EVENT_MASK)
		return -EINVAL;

	/*
	 * check event is known (determines counter)
	 */
	switch (cfg) {
	case INTEL_RAPL_PP0:
		bit = RAPL_IDX_PP0_NRG_STAT;
		msr = MSR_PP0_ENERGY_STATUS;
		break;
	case INTEL_RAPL_PKG:
		bit = RAPL_IDX_PKG_NRG_STAT;
		msr = MSR_PKG_ENERGY_STATUS;
		break;
	case INTEL_RAPL_RAM:
		bit = RAPL_IDX_RAM_NRG_STAT;
		msr = MSR_DRAM_ENERGY_STATUS;
		break;
	case INTEL_RAPL_PP1:
		bit = RAPL_IDX_PP1_NRG_STAT;
		msr = MSR_PP1_ENERGY_STATUS;
		break;
	default:
		return -EINVAL;
	}

	/* check event supported */
	if (!(rapl_cntr_mask & (1 << bit)))
		return -EINVAL;

	/* unsupported modes and filters */
	if (event->attr.exclude_user   ||
	    event->attr.exclude_kernel ||
	    event->attr.exclude_hv     ||
	    event->attr.exclude_idle   ||
	    event->attr.exclude_host   ||
	    event->attr.exclude_guest  ||
	    event->attr.sample_period) /* no sampling */
		return -EINVAL;

	/* must be done before validate_group */
	event->hw.event_base = msr;
	event->hw.config = cfg;
	event->hw.idx = bit;

	return ret;
}

static void rapl_pmu_event_read(struct perf_event *event)
{
	rapl_event_update(event);
}

static ssize_t rapl_get_attr_cpumask(struct device *dev,
				     struct device_attribute *attr, char *buf)
{
	return cpumap_print_to_pagebuf(true, buf, &rapl_cpu_mask);
}

static DEVICE_ATTR(cpumask, S_IRUGO, rapl_get_attr_cpumask, NULL);

static struct attribute *rapl_pmu_attrs[] = {
	&dev_attr_cpumask.attr,
	NULL,
};

static struct attribute_group rapl_pmu_attr_group = {
	.attrs = rapl_pmu_attrs,
};

static ssize_t rapl_sysfs_show(struct device *dev,
			       struct device_attribute *attr,
			       char *page)
{
	struct perf_pmu_events_attr *pmu_attr =
		container_of(attr, struct perf_pmu_events_attr, attr);

	if (pmu_attr->event_str)
		return sprintf(page, "%s", pmu_attr->event_str);

	return 0;
}

RAPL_EVENT_ATTR_STR(energy-cores, rapl_cores, "event=0x01");
RAPL_EVENT_ATTR_STR(energy-pkg  , rapl_pkg, "event=0x02");
RAPL_EVENT_ATTR_STR(energy-ram  , rapl_ram, "event=0x03");
RAPL_EVENT_ATTR_STR(energy-gpu  , rapl_gpu, "event=0x04");

RAPL_EVENT_ATTR_STR(energy-cores.unit, rapl_cores_unit, "Joules");
RAPL_EVENT_ATTR_STR(energy-pkg.unit  , rapl_pkg_unit, "Joules");
RAPL_EVENT_ATTR_STR(energy-ram.unit  , rapl_ram_unit, "Joules");
RAPL_EVENT_ATTR_STR(energy-gpu.unit  , rapl_gpu_unit, "Joules");

/*
 * we compute in 0.23 nJ increments (2^-32 Joules) regardless of the MSR unit
 */
RAPL_EVENT_ATTR_STR(energy-cores.scale, rapl_cores_scale, "2.3283064365386962890625e-10");
RAPL_EVENT_ATTR_STR(energy-pkg.scale, rapl_pkg_scale, "2.3283064365386962890625e-10");
RAPL_EVENT_ATTR_STR(energy-ram.scale, rapl_ram_scale, "2.3283064365386962890625e-10");
RAPL_EVENT_ATTR_STR(energy-gpu.scale, rapl_gpu_scale, "2.3283064365386962890625e-10");

static struct attribute *rapl_events_srv_attr[] = {
	EVENT_PTR(rapl_cores),
	EVENT_PTR(rapl_pkg),
	EVENT_PTR(rapl_ram),
	EVENT_PTR(rapl_cores_unit),
	EVENT_PTR(rapl_pkg_unit),
	EVENT_PTR(rapl_ram_unit),
	EVENT_PTR(rapl_cores_scale),
	EVENT_PTR(rapl_pkg_scale),
	EVENT_PTR(rapl_ram_scale),
	NULL,
};

static struct attribute *rapl_events_cln_attr[] = {
	EVENT_PTR(rapl_cores),
	EVENT_PTR(rapl_pkg),
	EVENT_PTR(rapl_gpu),
	EVENT_PTR(rapl_cores_unit),
	EVENT_PTR(rapl_pkg_unit),
	EVENT_PTR(rapl_gpu_unit),
	EVENT_PTR(rapl_cores_scale),
	EVENT_PTR(rapl_pkg_scale),
	EVENT_PTR(rapl_gpu_scale),
	NULL,
};

static struct attribute *rapl_events_hsw_attr[] = {
	EVENT_PTR(rapl_cores),
	EVENT_PTR(rapl_pkg),
	EVENT_PTR(rapl_gpu),
	EVENT_PTR(rapl_ram),
	EVENT_PTR(rapl_cores_unit),
	EVENT_PTR(rapl_pkg_unit),
	EVENT_PTR(rapl_gpu_unit),
	EVENT_PTR(rapl_ram_unit),
	EVENT_PTR(rapl_cores_scale),
	EVENT_PTR(rapl_pkg_scale),
	EVENT_PTR(rapl_gpu_scale),
	EVENT_PTR(rapl_ram_scale),
	NULL,
};

static struct attribute *rapl_events_knl_attr[] = {
	EVENT_PTR(rapl_pkg),
	EVENT_PTR(rapl_ram),
	EVENT_PTR(rapl_pkg_unit),
	EVENT_PTR(rapl_ram_unit),
	EVENT_PTR(rapl_pkg_scale),
	EVENT_PTR(rapl_ram_scale),
	NULL,
};

static struct attribute_group rapl_pmu_events_group = {
	.name = "events",
	.attrs = NULL, /* patched at runtime */
};

DEFINE_RAPL_FORMAT_ATTR(event, event, "config:0-7");
static struct attribute *rapl_formats_attr[] = {
	&format_attr_event.attr,
	NULL,
};

static struct attribute_group rapl_pmu_format_group = {
	.name = "format",
	.attrs = rapl_formats_attr,
};

const struct attribute_group *rapl_attr_groups[] = {
	&rapl_pmu_attr_group,
	&rapl_pmu_format_group,
	&rapl_pmu_events_group,
	NULL,
};

static struct pmu rapl_pmu_class = {
	.attr_groups	= rapl_attr_groups,
	.task_ctx_nr	= perf_invalid_context, /* system-wide only */
	.event_init	= rapl_pmu_event_init,
	.add		= rapl_pmu_event_add, /* must have */
	.del		= rapl_pmu_event_del, /* must have */
	.start		= rapl_pmu_event_start,
	.stop		= rapl_pmu_event_stop,
	.read		= rapl_pmu_event_read,
};
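
/*
 * Once this struct is registered (see rapl_pmu_init() below), the
 * counters can be read from userspace with the perf tool, for example:
 *
 *	perf stat -a -e power/energy-pkg/,power/energy-cores/ sleep 10
 *
 * The -a (system-wide) flag is required because these events do not
 * support per-task counting.
 */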

static void rapl_cpu_exit(int cpu)
{
	struct rapl_pmu *pmu = per_cpu(rapl_pmu, cpu);
	int i, phys_id = topology_physical_package_id(cpu);
	int target = -1;

	/* find a new cpu on the same package */
	for_each_online_cpu(i) {
		if (i == cpu)
			continue;
		if (phys_id == topology_physical_package_id(i)) {
			target = i;
			break;
		}
	}

	/*
	 * clear this cpu from the cpumask; if it was set and another
	 * cpu on the package is still online, hand the role over to it
	 */
	if (cpumask_test_and_clear_cpu(cpu, &rapl_cpu_mask) && target >= 0)
		cpumask_set_cpu(target, &rapl_cpu_mask);

	WARN_ON(cpumask_empty(&rapl_cpu_mask));
	/*
	 * migrate events and context to the new cpu
	 */
	if (target >= 0)
		perf_pmu_migrate_context(pmu->pmu, cpu, target);

	/* cancel overflow polling timer for this CPU */
	rapl_stop_hrtimer(pmu);
}

static void rapl_cpu_init(int cpu)
{
	int i, phys_id = topology_physical_package_id(cpu);

	/* check if phys_id is already covered */
	for_each_cpu(i, &rapl_cpu_mask) {
		if (phys_id == topology_physical_package_id(i))
			return;
	}
	/* was not found, so add it */
	cpumask_set_cpu(cpu, &rapl_cpu_mask);
}

static __init void rapl_hsw_server_quirk(void)
{
	/*
	 * The DRAM domain on HSW servers has a fixed energy unit which can
	 * differ from the unit reported by the power unit MSR. See
	 * "Intel Xeon Processor E5-1600 and E5-2600 v3 Product Families,
	 * Volume 2 of 2, Datasheet, September 2014,
	 * Reference Number: 330784-001".
	 */
	rapl_hw_unit[RAPL_IDX_RAM_NRG_STAT] = 16;
}

static int rapl_cpu_prepare(int cpu)
{
	struct rapl_pmu *pmu = per_cpu(rapl_pmu, cpu);
	int phys_id = topology_physical_package_id(cpu);
	u64 ms;

	if (pmu)
		return 0;

	if (phys_id < 0)
		return -1;

	pmu = kzalloc_node(sizeof(*pmu), GFP_KERNEL, cpu_to_node(cpu));
	if (!pmu)
		return -1;

	spin_lock_init(&pmu->lock);

	INIT_LIST_HEAD(&pmu->active_list);

	pmu->pmu = &rapl_pmu_class;

	/*
	 * Use a reference power of 200W to scale the polling timeout so
	 * that counter overflows are not missed: 200W = 200 Joules/sec.
	 * Divide the interval by 2 to avoid lockstep (2 * 100).
	 * If the hw unit is 32, then we use 2 ms (1/200/2).
	 */
	if (rapl_hw_unit[0] < 32)
		ms = (1000 / (2 * 100)) * (1ULL << (32 - rapl_hw_unit[0] - 1));
	else
		ms = 2;
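
	/*
	 * Worked example, assuming the SandyBridge default unit of
	 * 1/2^16 Joules (rapl_hw_unit[0] == 16):
	 * ms = (1000 / 200) * 2^(32 - 16 - 1) = 5 * 32768 = 163840,
	 * i.e. the overflow timer fires roughly every 164 seconds.
	 */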

	pmu->timer_interval = ms_to_ktime(ms);

	rapl_hrtimer_init(pmu);

	/* set RAPL pmu for this cpu for now */
	per_cpu(rapl_pmu, cpu) = pmu;

	per_cpu(rapl_pmu_to_free, cpu) = NULL;

	return 0;
}

static void rapl_cpu_kfree(int cpu)
{
	struct rapl_pmu *pmu = per_cpu(rapl_pmu_to_free, cpu);

	kfree(pmu);

	per_cpu(rapl_pmu_to_free, cpu) = NULL;
}

static int rapl_cpu_dying(int cpu)
{
	struct rapl_pmu *pmu = per_cpu(rapl_pmu, cpu);

	if (!pmu)
		return 0;

	per_cpu(rapl_pmu, cpu) = NULL;

	per_cpu(rapl_pmu_to_free, cpu) = pmu;

	return 0;
}

static int rapl_cpu_notifier(struct notifier_block *self,
			     unsigned long action, void *hcpu)
{
	unsigned int cpu = (long)hcpu;

	switch (action & ~CPU_TASKS_FROZEN) {
	case CPU_UP_PREPARE:
		rapl_cpu_prepare(cpu);
		break;
	case CPU_STARTING:
		rapl_cpu_init(cpu);
		break;
	case CPU_UP_CANCELED:
	case CPU_DYING:
		rapl_cpu_dying(cpu);
		break;
	case CPU_ONLINE:
	case CPU_DEAD:
		rapl_cpu_kfree(cpu);
		break;
	case CPU_DOWN_PREPARE:
		rapl_cpu_exit(cpu);
		break;
	default:
		break;
	}

	return NOTIFY_OK;
}

static int rapl_check_hw_unit(void)
{
	u64 msr_rapl_power_unit_bits;
	int i;

	/* protect rdmsrl() to handle virtualization */
	if (rdmsrl_safe(MSR_RAPL_POWER_UNIT, &msr_rapl_power_unit_bits))
		return -1;
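
	/*
	 * Bits 12:8 of MSR_RAPL_POWER_UNIT hold the energy status unit as a
	 * power of two: counters increment in 1/2^ESU Joules. The same unit
	 * is assumed for every domain here; model-specific quirks (e.g.
	 * rapl_hsw_server_quirk()) may override individual entries.
	 */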

	for (i = 0; i < NR_RAPL_DOMAINS; i++)
		rapl_hw_unit[i] = (msr_rapl_power_unit_bits >> 8) & 0x1FULL;

	return 0;
}

static const struct x86_cpu_id rapl_cpu_match[] = {
	[0] = { .vendor = X86_VENDOR_INTEL, .family = 6 },
	[1] = {},
};

static int __init rapl_pmu_init(void)
{
	struct rapl_pmu *pmu;
	int cpu, ret;
	struct x86_pmu_quirk *quirk;
	int i;

	/*
	 * check for Intel processor family 6
	 */
	if (!x86_match_cpu(rapl_cpu_match))
		return 0;

	/* check supported CPU */
	switch (boot_cpu_data.x86_model) {
	case 42: /* Sandy Bridge */
	case 58: /* Ivy Bridge */
		rapl_cntr_mask = RAPL_IDX_CLN;
		rapl_pmu_events_group.attrs = rapl_events_cln_attr;
		break;
	case 63: /* Haswell-Server */
		rapl_add_quirk(rapl_hsw_server_quirk);
		rapl_cntr_mask = RAPL_IDX_SRV;
		rapl_pmu_events_group.attrs = rapl_events_srv_attr;
		break;
	case 60: /* Haswell */
	case 69: /* Haswell-ULT */
	case 61: /* Broadwell */
		rapl_cntr_mask = RAPL_IDX_HSW;
		rapl_pmu_events_group.attrs = rapl_events_hsw_attr;
		break;
	case 45: /* Sandy Bridge-EP */
	case 62: /* IvyTown */
		rapl_cntr_mask = RAPL_IDX_SRV;
		rapl_pmu_events_group.attrs = rapl_events_srv_attr;
		break;
	case 87: /* Knights Landing */
		rapl_add_quirk(rapl_hsw_server_quirk);
		rapl_cntr_mask = RAPL_IDX_KNL;
		rapl_pmu_events_group.attrs = rapl_events_knl_attr;
		break;
	default:
		/* unsupported */
		return 0;
	}

	ret = rapl_check_hw_unit();
	if (ret)
		return ret;

	/* run cpu model quirks */
	for (quirk = rapl_quirks; quirk; quirk = quirk->next)
		quirk->func();

	cpu_notifier_register_begin();

	for_each_online_cpu(cpu) {
		ret = rapl_cpu_prepare(cpu);
		if (ret)
			goto out;
		rapl_cpu_init(cpu);
	}

	__perf_cpu_notifier(rapl_cpu_notifier);

	ret = perf_pmu_register(&rapl_pmu_class, "power", -1);
	if (WARN_ON(ret)) {
		pr_info("RAPL PMU detected, registration failed (%d), RAPL PMU disabled\n", ret);
		cpu_notifier_register_done();
		return -1;
	}

	pmu = __this_cpu_read(rapl_pmu);

	pr_info("RAPL PMU detected,"
		" API unit is 2^-32 Joules,"
		" %d fixed counters"
		" %llu ms ovfl timer\n",
		hweight32(rapl_cntr_mask),
		ktime_to_ms(pmu->timer_interval));
	for (i = 0; i < NR_RAPL_DOMAINS; i++) {
		if (rapl_cntr_mask & (1 << i)) {
			pr_info("hw unit of domain %s 2^-%d Joules\n",
				rapl_domain_names[i], rapl_hw_unit[i]);
		}
	}
out:
	cpu_notifier_register_done();

	return 0;
}
device_initcall(rapl_pmu_init);