perf_event_intel_cqm.c

/*
 * Intel Cache Quality-of-Service Monitoring (CQM) support.
 *
 * Based very, very heavily on work by Peter Zijlstra.
 */

#include <linux/perf_event.h>
#include <linux/slab.h>
#include <asm/cpu_device_id.h>
#include "perf_event.h"

#define MSR_IA32_PQR_ASSOC      0x0c8f
#define MSR_IA32_QM_CTR         0x0c8e
#define MSR_IA32_QM_EVTSEL      0x0c8d
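
/*
 * Editorial note (not from the original source): MSR_IA32_PQR_ASSOC holds
 * the RMID that tags cache allocations made by the current CPU (it is
 * written in intel_cqm_event_start() below), while the QM_EVTSEL/QM_CTR
 * pair is the read-out interface used by __rmid_read().
 */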

static unsigned int cqm_max_rmid = -1;
static unsigned int cqm_l3_scale; /* supposedly cacheline size */

struct intel_cqm_state {
        raw_spinlock_t          lock;
        int                     rmid;
        int                     cnt;
};

static DEFINE_PER_CPU(struct intel_cqm_state, cqm_state);

/*
 * Protects cache_cgroups and cqm_rmid_free_lru and cqm_rmid_limbo_lru.
 * Also protects event->hw.cqm_rmid
 *
 * Hold either for stability, both for modification of ->hw.cqm_rmid.
 */
static DEFINE_MUTEX(cache_mutex);
static DEFINE_RAW_SPINLOCK(cache_lock);

/*
 * Groups of events that have the same target(s), one RMID per group.
 */
static LIST_HEAD(cache_groups);

/*
 * Mask of CPUs for reading CQM values. We only need one per-socket.
 */
static cpumask_t cqm_cpumask;

#define RMID_VAL_ERROR          (1ULL << 63)
#define RMID_VAL_UNAVAIL        (1ULL << 62)

#define QOS_L3_OCCUP_EVENT_ID   (1 << 0)

#define QOS_EVENT_MASK          QOS_L3_OCCUP_EVENT_ID

/*
 * This is central to the rotation algorithm in __intel_cqm_rmid_rotate().
 *
 * This rmid is always free and is guaranteed to have an associated
 * near-zero occupancy value, i.e. no cachelines are tagged with this
 * RMID, once __intel_cqm_rmid_rotate() returns.
 */
static unsigned int intel_cqm_rotation_rmid;

#define INVALID_RMID            (-1)

/*
 * Is @rmid valid for programming the hardware?
 *
 * rmid 0 is reserved by the hardware for all non-monitored tasks, which
 * means that we should never come across an rmid with that value.
 * Likewise, an rmid value of -1 is used to indicate "no rmid currently
 * assigned" and is used as part of the rotation code.
 */
static inline bool __rmid_valid(unsigned int rmid)
{
        if (!rmid || rmid == INVALID_RMID)
                return false;

        return true;
}

static u64 __rmid_read(unsigned int rmid)
{
        u64 val;

        /*
         * Ignore the SDM, this thing is _NOTHING_ like a regular perfcnt,
         * it just says that to increase confusion.
         */
        wrmsr(MSR_IA32_QM_EVTSEL, QOS_L3_OCCUP_EVENT_ID, rmid);
        rdmsrl(MSR_IA32_QM_CTR, val);

        /*
         * Aside from the ERROR and UNAVAIL bits, assume this thing returns
         * the number of cachelines tagged with @rmid.
         */
        return val;
}
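
/*
 * Editorial note: the value read from MSR_IA32_QM_CTR is scaled; userspace
 * multiplies it by cqm_l3_scale (exported via the "scale"/"unit" event
 * attributes below) to get an occupancy figure in bytes, roughly:
 *
 *      bytes = __rmid_read(rmid) * cqm_l3_scale;
 *
 * assuming neither RMID_VAL_ERROR nor RMID_VAL_UNAVAIL is set.
 */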

enum rmid_recycle_state {
        RMID_YOUNG = 0,
        RMID_AVAILABLE,
        RMID_DIRTY,
};

struct cqm_rmid_entry {
        unsigned int rmid;
        enum rmid_recycle_state state;
        struct list_head list;
        unsigned long queue_time;
};

/*
 * cqm_rmid_free_lru - A least recently used list of RMIDs.
 *
 * Oldest entry at the head, newest (most recently used) entry at the
 * tail. This list is never traversed, it's only used to keep track of
 * the lru order. That is, we only pick entries off the head or insert
 * them on the tail.
 *
 * All entries on the list are 'free', and their RMIDs are not currently
 * in use. To mark an RMID as in use, remove its entry from the lru
 * list.
 *
 *
 * cqm_rmid_limbo_lru - list of currently unused but (potentially) dirty RMIDs.
 *
 * This list contains RMIDs that no one is currently using but that
 * may have a non-zero occupancy value associated with them. The
 * rotation worker moves RMIDs from the limbo list to the free list once
 * the occupancy value drops below __intel_cqm_threshold.
 *
 * Both lists are protected by cache_mutex.
 */
static LIST_HEAD(cqm_rmid_free_lru);
static LIST_HEAD(cqm_rmid_limbo_lru);

/*
 * We use a simple array of pointers so that we can lookup a struct
 * cqm_rmid_entry in O(1). This alleviates the callers of __get_rmid()
 * and __put_rmid() from having to worry about dealing with struct
 * cqm_rmid_entry - they just deal with rmids, i.e. integers.
 *
 * Once this array is initialized it is read-only. No locks are required
 * to access it.
 *
 * All entries for all RMIDs can be looked up in this array at all
 * times.
 */
static struct cqm_rmid_entry **cqm_rmid_ptrs;

static inline struct cqm_rmid_entry *__rmid_entry(int rmid)
{
        struct cqm_rmid_entry *entry;

        entry = cqm_rmid_ptrs[rmid];
        WARN_ON(entry->rmid != rmid);

        return entry;
}

/*
 * Returns < 0 on failure.
 *
 * We expect to be called with cache_mutex held.
 */
static int __get_rmid(void)
{
        struct cqm_rmid_entry *entry;

        lockdep_assert_held(&cache_mutex);

        if (list_empty(&cqm_rmid_free_lru))
                return INVALID_RMID;

        entry = list_first_entry(&cqm_rmid_free_lru, struct cqm_rmid_entry, list);
        list_del(&entry->list);

        return entry->rmid;
}

static void __put_rmid(unsigned int rmid)
{
        struct cqm_rmid_entry *entry;

        lockdep_assert_held(&cache_mutex);

        WARN_ON(!__rmid_valid(rmid));
        entry = __rmid_entry(rmid);

        entry->queue_time = jiffies;
        entry->state = RMID_YOUNG;

        list_add_tail(&entry->list, &cqm_rmid_limbo_lru);
}
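
/*
 * Editorial sketch of the RMID lifecycle implemented here: __get_rmid()
 * takes an RMID off cqm_rmid_free_lru; __put_rmid() timestamps it and
 * parks it on cqm_rmid_limbo_lru in state RMID_YOUNG; the rotation
 * worker later returns it to the free list once its occupancy has
 * dropped below __intel_cqm_threshold.
 */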

static int intel_cqm_setup_rmid_cache(void)
{
        struct cqm_rmid_entry *entry;
        unsigned int nr_rmids;
        int r = 0;

        nr_rmids = cqm_max_rmid + 1;
        cqm_rmid_ptrs = kmalloc(sizeof(struct cqm_rmid_entry *) *
                                nr_rmids, GFP_KERNEL);
        if (!cqm_rmid_ptrs)
                return -ENOMEM;

        for (; r <= cqm_max_rmid; r++) {
                struct cqm_rmid_entry *entry;

                entry = kmalloc(sizeof(*entry), GFP_KERNEL);
                if (!entry)
                        goto fail;

                INIT_LIST_HEAD(&entry->list);
                entry->rmid = r;
                cqm_rmid_ptrs[r] = entry;

                list_add_tail(&entry->list, &cqm_rmid_free_lru);
        }

        /*
         * RMID 0 is special and is always allocated. It's used for all
         * tasks that are not monitored.
         */
        entry = __rmid_entry(0);
        list_del(&entry->list);

        mutex_lock(&cache_mutex);
        intel_cqm_rotation_rmid = __get_rmid();
        mutex_unlock(&cache_mutex);

        return 0;
fail:
        while (r--)
                kfree(cqm_rmid_ptrs[r]);

        kfree(cqm_rmid_ptrs);
        return -ENOMEM;
}

/*
 * Determine if @a and @b measure the same set of tasks.
 *
 * If @a and @b measure the same set of tasks then we want to share a
 * single RMID.
 */
static bool __match_event(struct perf_event *a, struct perf_event *b)
{
        /* Per-cpu and task events don't mix */
        if ((a->attach_state & PERF_ATTACH_TASK) !=
            (b->attach_state & PERF_ATTACH_TASK))
                return false;

#ifdef CONFIG_CGROUP_PERF
        if (a->cgrp != b->cgrp)
                return false;
#endif

        /* If not task event, we're machine wide */
        if (!(b->attach_state & PERF_ATTACH_TASK))
                return true;

        /*
         * Events that target same task are placed into the same cache group.
         */
        if (a->hw.target == b->hw.target)
                return true;

        /*
         * Are we an inherited event?
         */
        if (b->parent == a)
                return true;

        return false;
}
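
/*
 * Editorial example: two llc_occupancy events opened on the same task
 * (same ->hw.target and, with CONFIG_CGROUP_PERF, the same cgroup) match
 * and therefore share one RMID; a task event and a system-wide event
 * never match.
 */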

#ifdef CONFIG_CGROUP_PERF
static inline struct perf_cgroup *event_to_cgroup(struct perf_event *event)
{
        if (event->attach_state & PERF_ATTACH_TASK)
                return perf_cgroup_from_task(event->hw.target);

        return event->cgrp;
}
#endif

/*
 * Determine if @a's tasks intersect with @b's tasks
 *
 * There are combinations of events that we explicitly prohibit,
 *
 *                 PROHIBITS
 * system-wide -> cgroup and task
 * cgroup      -> system-wide
 *             -> task in cgroup
 * task        -> system-wide
 *             -> task in cgroup
 *
 * Call this function before allocating an RMID.
 */
static bool __conflict_event(struct perf_event *a, struct perf_event *b)
{
#ifdef CONFIG_CGROUP_PERF
        /*
         * We can have any number of cgroups but only one system-wide
         * event at a time.
         */
        if (a->cgrp && b->cgrp) {
                struct perf_cgroup *ac = a->cgrp;
                struct perf_cgroup *bc = b->cgrp;

                /*
                 * This condition should have been caught in
                 * __match_event() and we should be sharing an RMID.
                 */
                WARN_ON_ONCE(ac == bc);

                if (cgroup_is_descendant(ac->css.cgroup, bc->css.cgroup) ||
                    cgroup_is_descendant(bc->css.cgroup, ac->css.cgroup))
                        return true;

                return false;
        }

        if (a->cgrp || b->cgrp) {
                struct perf_cgroup *ac, *bc;

                /*
                 * cgroup and system-wide events are mutually exclusive
                 */
                if ((a->cgrp && !(b->attach_state & PERF_ATTACH_TASK)) ||
                    (b->cgrp && !(a->attach_state & PERF_ATTACH_TASK)))
                        return true;

                /*
                 * Ensure neither event is part of the other's cgroup
                 */
                ac = event_to_cgroup(a);
                bc = event_to_cgroup(b);
                if (ac == bc)
                        return true;

                /*
                 * Must have cgroup and non-intersecting task events.
                 */
                if (!ac || !bc)
                        return false;

                /*
                 * We have cgroup and task events, and the task belongs
                 * to a cgroup. Check for overlap.
                 */
                if (cgroup_is_descendant(ac->css.cgroup, bc->css.cgroup) ||
                    cgroup_is_descendant(bc->css.cgroup, ac->css.cgroup))
                        return true;

                return false;
        }
#endif
        /*
         * If one of them is not a task, same story as above with cgroups.
         */
        if (!(a->attach_state & PERF_ATTACH_TASK) ||
            !(b->attach_state & PERF_ATTACH_TASK))
                return true;

        /*
         * Must be non-overlapping.
         */
        return false;
}

struct rmid_read {
        unsigned int rmid;
        atomic64_t value;
};

static void __intel_cqm_event_count(void *info);

/*
 * Exchange the RMID of a group of events.
 */
static unsigned int
intel_cqm_xchg_rmid(struct perf_event *group, unsigned int rmid)
{
        struct perf_event *event;
        unsigned int old_rmid = group->hw.cqm_rmid;
        struct list_head *head = &group->hw.cqm_group_entry;

        lockdep_assert_held(&cache_mutex);

        /*
         * If our RMID is being deallocated, perform a read now.
         */
        if (__rmid_valid(old_rmid) && !__rmid_valid(rmid)) {
                struct rmid_read rr = {
                        .value = ATOMIC64_INIT(0),
                        .rmid = old_rmid,
                };

                on_each_cpu_mask(&cqm_cpumask, __intel_cqm_event_count,
                                 &rr, 1);
                local64_set(&group->count, atomic64_read(&rr.value));
        }

        raw_spin_lock_irq(&cache_lock);

        group->hw.cqm_rmid = rmid;
        list_for_each_entry(event, head, hw.cqm_group_entry)
                event->hw.cqm_rmid = rmid;

        raw_spin_unlock_irq(&cache_lock);

        return old_rmid;
}
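
/*
 * Editorial note: reading the old RMID before it is taken away means the
 * group's count still reflects the last occupancy value observed while
 * the group owned an RMID, instead of silently dropping when the RMID is
 * recycled.
 */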

/*
 * If we fail to assign a new RMID for intel_cqm_rotation_rmid because
 * cachelines are still tagged with RMIDs in limbo, we progressively
 * increment the threshold until we find an RMID in limbo with <=
 * __intel_cqm_threshold lines tagged. This is designed to mitigate the
 * problem where cachelines tagged with an RMID are not steadily being
 * evicted.
 *
 * On successful rotations we decrease the threshold back towards zero.
 *
 * __intel_cqm_max_threshold provides an upper bound on the threshold,
 * and is measured in bytes because it's exposed to userland.
 */
static unsigned int __intel_cqm_threshold;
static unsigned int __intel_cqm_max_threshold;

/*
 * Test whether an RMID has a zero occupancy value on this cpu.
 */
static void intel_cqm_stable(void *arg)
{
        struct cqm_rmid_entry *entry;

        list_for_each_entry(entry, &cqm_rmid_limbo_lru, list) {
                if (entry->state != RMID_AVAILABLE)
                        break;

                if (__rmid_read(entry->rmid) > __intel_cqm_threshold)
                        entry->state = RMID_DIRTY;
        }
}

/*
 * If we have group events waiting for an RMID that don't conflict with
 * events already running, assign @rmid.
 */
static bool intel_cqm_sched_in_event(unsigned int rmid)
{
        struct perf_event *leader, *event;

        lockdep_assert_held(&cache_mutex);

        leader = list_first_entry(&cache_groups, struct perf_event,
                                  hw.cqm_groups_entry);
        event = leader;

        list_for_each_entry_continue(event, &cache_groups,
                                     hw.cqm_groups_entry) {
                if (__rmid_valid(event->hw.cqm_rmid))
                        continue;

                if (__conflict_event(event, leader))
                        continue;

                intel_cqm_xchg_rmid(event, rmid);
                return true;
        }

        return false;
}

/*
 * Initially use this constant for both the limbo queue time and the
 * rotation timer interval, pmu::hrtimer_interval_ms.
 *
 * They don't need to be the same, but the two are related since if you
 * rotate faster than you recycle RMIDs, you may run out of available
 * RMIDs.
 */
#define RMID_DEFAULT_QUEUE_TIME 250     /* ms */

static unsigned int __rmid_queue_time_ms = RMID_DEFAULT_QUEUE_TIME;

/*
 * intel_cqm_rmid_stabilize - move RMIDs from limbo to free list
 * @nr_available: number of freeable RMIDs on the limbo list
 *
 * Quiescent state; wait for all 'freed' RMIDs to become unused, i.e. no
 * cachelines are tagged with those RMIDs. After this we can reuse them
 * and know that the current set of active RMIDs is stable.
 *
 * Return %true or %false depending on whether stabilization needs to be
 * reattempted.
 *
 * If we return %true then @nr_available is updated to indicate the
 * number of RMIDs on the limbo list that have been queued for the
 * minimum queue time (RMID_AVAILABLE), but whose data occupancy values
 * are above __intel_cqm_threshold.
 */
static bool intel_cqm_rmid_stabilize(unsigned int *available)
{
        struct cqm_rmid_entry *entry, *tmp;

        lockdep_assert_held(&cache_mutex);

        *available = 0;
        list_for_each_entry(entry, &cqm_rmid_limbo_lru, list) {
                unsigned long min_queue_time;
                unsigned long now = jiffies;

                /*
                 * We hold RMIDs placed into limbo for a minimum queue
                 * time. Before the minimum queue time has elapsed we do
                 * not recycle RMIDs.
                 *
                 * The reasoning is that until a sufficient time has
                 * passed since we stopped using an RMID, any RMID
                 * placed onto the limbo list will likely still have
                 * data tagged in the cache, which means we'll probably
                 * fail to recycle it anyway.
                 *
                 * We can save ourselves an expensive IPI by skipping
                 * any RMIDs that have not been queued for the minimum
                 * time.
                 */
                min_queue_time = entry->queue_time +
                        msecs_to_jiffies(__rmid_queue_time_ms);

                if (time_after(min_queue_time, now))
                        break;

                entry->state = RMID_AVAILABLE;
                (*available)++;
        }

        /*
         * Fast return if none of the RMIDs on the limbo list have been
         * sitting on the queue for the minimum queue time.
         */
        if (!*available)
                return false;

        /*
         * Test whether an RMID is free for each package.
         */
        on_each_cpu_mask(&cqm_cpumask, intel_cqm_stable, NULL, true);

        list_for_each_entry_safe(entry, tmp, &cqm_rmid_limbo_lru, list) {
                /*
                 * Exhausted all RMIDs that have waited min queue time.
                 */
                if (entry->state == RMID_YOUNG)
                        break;

                if (entry->state == RMID_DIRTY)
                        continue;

                list_del(&entry->list); /* remove from limbo */

                /*
                 * The rotation RMID gets priority if it's
                 * currently invalid. In which case, skip adding
                 * the RMID to the free lru.
                 */
                if (!__rmid_valid(intel_cqm_rotation_rmid)) {
                        intel_cqm_rotation_rmid = entry->rmid;
                        continue;
                }

                /*
                 * If we have groups waiting for RMIDs, hand
                 * them one now provided they don't conflict.
                 */
                if (intel_cqm_sched_in_event(entry->rmid))
                        continue;

                /*
                 * Otherwise place it onto the free list.
                 */
                list_add_tail(&entry->list, &cqm_rmid_free_lru);
        }

        return __rmid_valid(intel_cqm_rotation_rmid);
}
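
/*
 * Editorial sketch of the limbo states driven above:
 *
 *      RMID_YOUNG     - queued less than __rmid_queue_time_ms, skipped
 *      RMID_AVAILABLE - old enough; occupancy checked on each package
 *      RMID_DIRTY     - still above __intel_cqm_threshold, stays in limbo
 *
 * Whatever survives as RMID_AVAILABLE is handed to the rotation RMID, to
 * a waiting group, or to the free list, in that order.
 */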

/*
 * Pick a victim group and move it to the tail of the group list.
 * @next: The first group without an RMID
 */
static void __intel_cqm_pick_and_rotate(struct perf_event *next)
{
        struct perf_event *rotor;
        unsigned int rmid;

        lockdep_assert_held(&cache_mutex);

        rotor = list_first_entry(&cache_groups, struct perf_event,
                                 hw.cqm_groups_entry);

        /*
         * The group at the front of the list should always have a valid
         * RMID. If it doesn't then no groups have RMIDs assigned and we
         * don't need to rotate the list.
         */
        if (next == rotor)
                return;

        rmid = intel_cqm_xchg_rmid(rotor, INVALID_RMID);
        __put_rmid(rmid);

        list_rotate_left(&cache_groups);
}

/*
 * Deallocate the RMIDs from any events that conflict with @event, and
 * place them on the back of the group list.
 */
static void intel_cqm_sched_out_conflicting_events(struct perf_event *event)
{
        struct perf_event *group, *g;
        unsigned int rmid;

        lockdep_assert_held(&cache_mutex);

        list_for_each_entry_safe(group, g, &cache_groups, hw.cqm_groups_entry) {
                if (group == event)
                        continue;

                rmid = group->hw.cqm_rmid;

                /*
                 * Skip events that don't have a valid RMID.
                 */
                if (!__rmid_valid(rmid))
                        continue;

                /*
                 * No conflict? No problem! Leave the event alone.
                 */
                if (!__conflict_event(group, event))
                        continue;

                intel_cqm_xchg_rmid(group, INVALID_RMID);
                __put_rmid(rmid);
        }
}

/*
 * Attempt to rotate the groups and assign new RMIDs.
 *
 * We rotate for two reasons,
 *   1. To handle the scheduling of conflicting events
 *   2. To recycle RMIDs
 *
 * Rotating RMIDs is complicated because the hardware doesn't give us
 * any clues.
 *
 * There are problems with the hardware interface; when you change the
 * task:RMID map cachelines retain their 'old' tags, giving a skewed
 * picture. In order to work around this, we must always keep one free
 * RMID - intel_cqm_rotation_rmid.
 *
 * Rotation works by taking away an RMID from a group (the old RMID),
 * and assigning the free RMID to another group (the new RMID). We must
 * then wait for the old RMID to not be used (no cachelines tagged).
 * This ensures that all cachelines are tagged with 'active' RMIDs. At
 * this point we can start reading values for the new RMID and treat the
 * old RMID as the free RMID for the next rotation.
 *
 * Return %true or %false depending on whether we did any rotating.
 */
static bool __intel_cqm_rmid_rotate(void)
{
        struct perf_event *group, *start = NULL;
        unsigned int threshold_limit;
        unsigned int nr_needed = 0;
        unsigned int nr_available;
        bool rotated = false;

        mutex_lock(&cache_mutex);

again:
        /*
         * Fast path through this function if there are no groups and no
         * RMIDs that need cleaning.
         */
        if (list_empty(&cache_groups) && list_empty(&cqm_rmid_limbo_lru))
                goto out;

        list_for_each_entry(group, &cache_groups, hw.cqm_groups_entry) {
                if (!__rmid_valid(group->hw.cqm_rmid)) {
                        if (!start)
                                start = group;
                        nr_needed++;
                }
        }

        /*
         * We have some event groups, but they all have RMIDs assigned
         * and no RMIDs need cleaning.
         */
        if (!nr_needed && list_empty(&cqm_rmid_limbo_lru))
                goto out;

        if (!nr_needed)
                goto stabilize;

        /*
         * We have more event groups without RMIDs than available RMIDs,
         * or we have event groups that conflict with the ones currently
         * scheduled.
         *
         * We force deallocate the rmid of the group at the head of
         * cache_groups. The first event group without an RMID then gets
         * assigned intel_cqm_rotation_rmid. This ensures we always make
         * forward progress.
         *
         * Rotate the cache_groups list so the previous head is now the
         * tail.
         */
        __intel_cqm_pick_and_rotate(start);

        /*
         * If the rotation is going to succeed, reduce the threshold so
         * that we don't needlessly reuse dirty RMIDs.
         */
        if (__rmid_valid(intel_cqm_rotation_rmid)) {
                intel_cqm_xchg_rmid(start, intel_cqm_rotation_rmid);
                intel_cqm_rotation_rmid = __get_rmid();

                intel_cqm_sched_out_conflicting_events(start);

                if (__intel_cqm_threshold)
                        __intel_cqm_threshold--;
        }

        rotated = true;

stabilize:
        /*
         * We now need to stabilize the RMID we freed above (if any) to
         * ensure that the next time we rotate we have an RMID with zero
         * occupancy value.
         *
         * Alternatively, if we didn't need to perform any rotation,
         * we'll have a bunch of RMIDs in limbo that need stabilizing.
         */
        threshold_limit = __intel_cqm_max_threshold / cqm_l3_scale;

        while (intel_cqm_rmid_stabilize(&nr_available) &&
               __intel_cqm_threshold < threshold_limit) {
                unsigned int steal_limit;

                /*
                 * Don't spin if nobody is actively waiting for an RMID,
                 * the rotation worker will be kicked as soon as an
                 * event needs an RMID anyway.
                 */
                if (!nr_needed)
                        break;

                /* Allow max 25% of RMIDs to be in limbo. */
                steal_limit = (cqm_max_rmid + 1) / 4;

                /*
                 * We failed to stabilize any RMIDs so our rotation
                 * logic is now stuck. In order to make forward progress
                 * we have a few options:
                 *
                 * 1. rotate ("steal") another RMID
                 * 2. increase the threshold
                 * 3. do nothing
                 *
                 * We do both of 1. and 2. until we hit the steal limit.
                 *
                 * The steal limit prevents all RMIDs ending up on the
                 * limbo list. This can happen if every RMID has a
                 * non-zero occupancy above threshold_limit, and the
                 * occupancy values aren't dropping fast enough.
                 *
                 * Note that there is prioritisation at work here - we'd
                 * rather increase the number of RMIDs on the limbo list
                 * than increase the threshold, because increasing the
                 * threshold skews the event data (because we reuse
                 * dirty RMIDs) - threshold bumps are a last resort.
                 */
                if (nr_available < steal_limit)
                        goto again;

                __intel_cqm_threshold++;
        }

out:
        mutex_unlock(&cache_mutex);
        return rotated;
}
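
/*
 * Editorial summary of one rotation pass above: (1) steal the RMID from
 * the group at the head of cache_groups, (2) hand intel_cqm_rotation_rmid
 * to the first group that needs one, (3) stabilize the limbo RMIDs until
 * a clean rotation RMID is found or the threshold/steal limits are hit.
 */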

static void intel_cqm_rmid_rotate(struct work_struct *work);

static DECLARE_DELAYED_WORK(intel_cqm_rmid_work, intel_cqm_rmid_rotate);

static struct pmu intel_cqm_pmu;

static void intel_cqm_rmid_rotate(struct work_struct *work)
{
        unsigned long delay;

        __intel_cqm_rmid_rotate();

        delay = msecs_to_jiffies(intel_cqm_pmu.hrtimer_interval_ms);
        schedule_delayed_work(&intel_cqm_rmid_work, delay);
}

/*
 * Find a group and setup RMID.
 *
 * If we're part of a group, we use the group's RMID.
 */
static void intel_cqm_setup_event(struct perf_event *event,
                                  struct perf_event **group)
{
        struct perf_event *iter;
        unsigned int rmid;
        bool conflict = false;

        list_for_each_entry(iter, &cache_groups, hw.cqm_groups_entry) {
                rmid = iter->hw.cqm_rmid;

                if (__match_event(iter, event)) {
                        /* All tasks in a group share an RMID */
                        event->hw.cqm_rmid = rmid;
                        *group = iter;
                        return;
                }

                /*
                 * We only care about conflicts for events that are
                 * actually scheduled in (and hence have a valid RMID).
                 */
                if (__conflict_event(iter, event) && __rmid_valid(rmid))
                        conflict = true;
        }

        if (conflict)
                rmid = INVALID_RMID;
        else
                rmid = __get_rmid();

        event->hw.cqm_rmid = rmid;
}

static void intel_cqm_event_read(struct perf_event *event)
{
        unsigned long flags;
        unsigned int rmid;
        u64 val;

        /*
         * Task events are handled by intel_cqm_event_count().
         */
        if (event->cpu == -1)
                return;

        raw_spin_lock_irqsave(&cache_lock, flags);
        rmid = event->hw.cqm_rmid;

        if (!__rmid_valid(rmid))
                goto out;

        val = __rmid_read(rmid);

        /*
         * Ignore this reading on error states and do not update the value.
         */
        if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL))
                goto out;

        local64_set(&event->count, val);
out:
        raw_spin_unlock_irqrestore(&cache_lock, flags);
}

static void __intel_cqm_event_count(void *info)
{
        struct rmid_read *rr = info;
        u64 val;

        val = __rmid_read(rr->rmid);

        if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL))
                return;

        atomic64_add(val, &rr->value);
}

static inline bool cqm_group_leader(struct perf_event *event)
{
        return !list_empty(&event->hw.cqm_groups_entry);
}

static u64 intel_cqm_event_count(struct perf_event *event)
{
        unsigned long flags;
        struct rmid_read rr = {
                .value = ATOMIC64_INIT(0),
        };

        /*
         * We only need to worry about task events. System-wide events
         * are handled like usual, i.e. entirely with
         * intel_cqm_event_read().
         */
        if (event->cpu != -1)
                return __perf_event_count(event);

        /*
         * Only the group leader gets to report values. This stops us
         * reporting duplicate values to userspace, and gives us a clear
         * rule for which task gets to report the values.
         *
         * Note that it is impossible to attribute these values to
         * specific packages - we forfeit that ability when we create
         * task events.
         */
        if (!cqm_group_leader(event))
                return 0;

        /*
         * Notice that we don't perform the reading of an RMID
         * atomically, because we can't hold a spin lock across the
         * IPIs.
         *
         * Speculatively perform the read, since @event might be
         * assigned a different (possibly invalid) RMID while we're
         * busy performing the IPI calls. It's therefore necessary to
         * check @event's RMID afterwards, and if it has changed,
         * discard the result of the read.
         */
        rr.rmid = ACCESS_ONCE(event->hw.cqm_rmid);

        if (!__rmid_valid(rr.rmid))
                goto out;

        on_each_cpu_mask(&cqm_cpumask, __intel_cqm_event_count, &rr, 1);

        raw_spin_lock_irqsave(&cache_lock, flags);
        if (event->hw.cqm_rmid == rr.rmid)
                local64_set(&event->count, atomic64_read(&rr.value));
        raw_spin_unlock_irqrestore(&cache_lock, flags);
out:
        return __perf_event_count(event);
}

static void intel_cqm_event_start(struct perf_event *event, int mode)
{
        struct intel_cqm_state *state = this_cpu_ptr(&cqm_state);
        unsigned int rmid = event->hw.cqm_rmid;
        unsigned long flags;

        if (!(event->hw.cqm_state & PERF_HES_STOPPED))
                return;

        event->hw.cqm_state &= ~PERF_HES_STOPPED;

        raw_spin_lock_irqsave(&state->lock, flags);

        if (state->cnt++)
                WARN_ON_ONCE(state->rmid != rmid);
        else
                WARN_ON_ONCE(state->rmid);

        state->rmid = rmid;
        wrmsrl(MSR_IA32_PQR_ASSOC, state->rmid);

        raw_spin_unlock_irqrestore(&state->lock, flags);
}

static void intel_cqm_event_stop(struct perf_event *event, int mode)
{
        struct intel_cqm_state *state = this_cpu_ptr(&cqm_state);
        unsigned long flags;

        if (event->hw.cqm_state & PERF_HES_STOPPED)
                return;

        event->hw.cqm_state |= PERF_HES_STOPPED;

        raw_spin_lock_irqsave(&state->lock, flags);
        intel_cqm_event_read(event);

        if (!--state->cnt) {
                state->rmid = 0;
                wrmsrl(MSR_IA32_PQR_ASSOC, 0);
        } else {
                WARN_ON_ONCE(!state->rmid);
        }

        raw_spin_unlock_irqrestore(&state->lock, flags);
}
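
/*
 * Editorial note: state->cnt reference-counts the PQR_ASSOC programming
 * per CPU, so nested start/stop of events sharing an RMID only write the
 * MSR on the first start and clear it on the last stop.
 */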

static int intel_cqm_event_add(struct perf_event *event, int mode)
{
        unsigned long flags;
        unsigned int rmid;

        raw_spin_lock_irqsave(&cache_lock, flags);

        event->hw.cqm_state = PERF_HES_STOPPED;
        rmid = event->hw.cqm_rmid;

        if (__rmid_valid(rmid) && (mode & PERF_EF_START))
                intel_cqm_event_start(event, mode);

        raw_spin_unlock_irqrestore(&cache_lock, flags);

        return 0;
}

static void intel_cqm_event_del(struct perf_event *event, int mode)
{
        intel_cqm_event_stop(event, mode);
}

static void intel_cqm_event_destroy(struct perf_event *event)
{
        struct perf_event *group_other = NULL;

        mutex_lock(&cache_mutex);

        /*
         * If there's another event in this group...
         */
        if (!list_empty(&event->hw.cqm_group_entry)) {
                group_other = list_first_entry(&event->hw.cqm_group_entry,
                                               struct perf_event,
                                               hw.cqm_group_entry);
                list_del(&event->hw.cqm_group_entry);
        }

        /*
         * And we're the group leader..
         */
        if (cqm_group_leader(event)) {
                /*
                 * If there was a group_other, make that leader, otherwise
                 * destroy the group and return the RMID.
                 */
                if (group_other) {
                        list_replace(&event->hw.cqm_groups_entry,
                                     &group_other->hw.cqm_groups_entry);
                } else {
                        unsigned int rmid = event->hw.cqm_rmid;

                        if (__rmid_valid(rmid))
                                __put_rmid(rmid);
                        list_del(&event->hw.cqm_groups_entry);
                }
        }

        mutex_unlock(&cache_mutex);
}

static int intel_cqm_event_init(struct perf_event *event)
{
        struct perf_event *group = NULL;
        bool rotate = false;

        if (event->attr.type != intel_cqm_pmu.type)
                return -ENOENT;

        if (event->attr.config & ~QOS_EVENT_MASK)
                return -EINVAL;

        /* unsupported modes and filters */
        if (event->attr.exclude_user   ||
            event->attr.exclude_kernel ||
            event->attr.exclude_hv     ||
            event->attr.exclude_idle   ||
            event->attr.exclude_host   ||
            event->attr.exclude_guest  ||
            event->attr.sample_period) /* no sampling */
                return -EINVAL;

        INIT_LIST_HEAD(&event->hw.cqm_group_entry);
        INIT_LIST_HEAD(&event->hw.cqm_groups_entry);

        event->destroy = intel_cqm_event_destroy;

        mutex_lock(&cache_mutex);

        /* Will also set rmid */
        intel_cqm_setup_event(event, &group);

        if (group) {
                list_add_tail(&event->hw.cqm_group_entry,
                              &group->hw.cqm_group_entry);
        } else {
                list_add_tail(&event->hw.cqm_groups_entry,
                              &cache_groups);

                /*
                 * All RMIDs are either in use or have recently been
                 * used. Kick the rotation worker to clean/free some.
                 *
                 * We only do this for the group leader, rather than for
                 * every event in a group to save on needless work.
                 */
                if (!__rmid_valid(event->hw.cqm_rmid))
                        rotate = true;
        }

        mutex_unlock(&cache_mutex);

        if (rotate)
                schedule_delayed_work(&intel_cqm_rmid_work, 0);

        return 0;
}

EVENT_ATTR_STR(llc_occupancy, intel_cqm_llc, "event=0x01");
EVENT_ATTR_STR(llc_occupancy.per-pkg, intel_cqm_llc_pkg, "1");
EVENT_ATTR_STR(llc_occupancy.unit, intel_cqm_llc_unit, "Bytes");
EVENT_ATTR_STR(llc_occupancy.scale, intel_cqm_llc_scale, NULL);
EVENT_ATTR_STR(llc_occupancy.snapshot, intel_cqm_llc_snapshot, "1");

static struct attribute *intel_cqm_events_attr[] = {
        EVENT_PTR(intel_cqm_llc),
        EVENT_PTR(intel_cqm_llc_pkg),
        EVENT_PTR(intel_cqm_llc_unit),
        EVENT_PTR(intel_cqm_llc_scale),
        EVENT_PTR(intel_cqm_llc_snapshot),
        NULL,
};

static struct attribute_group intel_cqm_events_group = {
        .name = "events",
        .attrs = intel_cqm_events_attr,
};

PMU_FORMAT_ATTR(event, "config:0-7");
static struct attribute *intel_cqm_formats_attr[] = {
        &format_attr_event.attr,
        NULL,
};

static struct attribute_group intel_cqm_format_group = {
        .name = "format",
        .attrs = intel_cqm_formats_attr,
};
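
/*
 * Editorial usage sketch (assumes a CQM-capable CPU and a perf tool that
 * understands sysfs event aliases): the event is exposed under the
 * "intel_cqm" PMU, e.g.
 *
 *      perf stat -a -e intel_cqm/llc_occupancy/ -- sleep 1
 *
 * with the reported count scaled to bytes via the "scale" and "unit"
 * attributes above.
 */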

static ssize_t
max_recycle_threshold_show(struct device *dev, struct device_attribute *attr,
                           char *page)
{
        ssize_t rv;

        mutex_lock(&cache_mutex);
        rv = snprintf(page, PAGE_SIZE-1, "%u\n", __intel_cqm_max_threshold);
        mutex_unlock(&cache_mutex);

        return rv;
}

static ssize_t
max_recycle_threshold_store(struct device *dev,
                            struct device_attribute *attr,
                            const char *buf, size_t count)
{
        unsigned int bytes, cachelines;
        int ret;

        ret = kstrtouint(buf, 0, &bytes);
        if (ret)
                return ret;

        mutex_lock(&cache_mutex);

        __intel_cqm_max_threshold = bytes;
        cachelines = bytes / cqm_l3_scale;

        /*
         * The new maximum takes effect immediately.
         */
        if (__intel_cqm_threshold > cachelines)
                __intel_cqm_threshold = cachelines;

        mutex_unlock(&cache_mutex);

        return count;
}

static DEVICE_ATTR_RW(max_recycle_threshold);

static struct attribute *intel_cqm_attrs[] = {
        &dev_attr_max_recycle_threshold.attr,
        NULL,
};

static const struct attribute_group intel_cqm_group = {
        .attrs = intel_cqm_attrs,
};

static const struct attribute_group *intel_cqm_attr_groups[] = {
        &intel_cqm_events_group,
        &intel_cqm_format_group,
        &intel_cqm_group,
        NULL,
};

static struct pmu intel_cqm_pmu = {
        .hrtimer_interval_ms = RMID_DEFAULT_QUEUE_TIME,
        .attr_groups         = intel_cqm_attr_groups,
        .task_ctx_nr         = perf_sw_context,
        .event_init          = intel_cqm_event_init,
        .add                 = intel_cqm_event_add,
        .del                 = intel_cqm_event_del,
        .start               = intel_cqm_event_start,
        .stop                = intel_cqm_event_stop,
        .read                = intel_cqm_event_read,
        .count               = intel_cqm_event_count,
};

static inline void cqm_pick_event_reader(int cpu)
{
        int phys_id = topology_physical_package_id(cpu);
        int i;

        for_each_cpu(i, &cqm_cpumask) {
                if (phys_id == topology_physical_package_id(i))
                        return; /* already got reader for this socket */
        }

        cpumask_set_cpu(cpu, &cqm_cpumask);
}
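
/*
 * Editorial note: occupancy is reported per package (see the per-pkg
 * event attribute above), so cqm_cpumask only needs one reader CPU per
 * physical package; the IPIs in __intel_cqm_event_count() and
 * intel_cqm_stable() target exactly this mask.
 */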

static void intel_cqm_cpu_prepare(unsigned int cpu)
{
        struct intel_cqm_state *state = &per_cpu(cqm_state, cpu);
        struct cpuinfo_x86 *c = &cpu_data(cpu);

        raw_spin_lock_init(&state->lock);
        state->rmid = 0;
        state->cnt = 0;

        WARN_ON(c->x86_cache_max_rmid != cqm_max_rmid);
        WARN_ON(c->x86_cache_occ_scale != cqm_l3_scale);
}

static void intel_cqm_cpu_exit(unsigned int cpu)
{
        int phys_id = topology_physical_package_id(cpu);
        int i;

        /*
         * Is @cpu a designated cqm reader?
         */
        if (!cpumask_test_and_clear_cpu(cpu, &cqm_cpumask))
                return;

        for_each_online_cpu(i) {
                if (i == cpu)
                        continue;

                if (phys_id == topology_physical_package_id(i)) {
                        cpumask_set_cpu(i, &cqm_cpumask);
                        break;
                }
        }
}

static int intel_cqm_cpu_notifier(struct notifier_block *nb,
                                  unsigned long action, void *hcpu)
{
        unsigned int cpu = (unsigned long)hcpu;

        switch (action & ~CPU_TASKS_FROZEN) {
        case CPU_UP_PREPARE:
                intel_cqm_cpu_prepare(cpu);
                break;
        case CPU_DOWN_PREPARE:
                intel_cqm_cpu_exit(cpu);
                break;
        case CPU_STARTING:
                cqm_pick_event_reader(cpu);
                break;
        }

        return NOTIFY_OK;
}

static const struct x86_cpu_id intel_cqm_match[] = {
        { .vendor = X86_VENDOR_INTEL, .feature = X86_FEATURE_CQM_OCCUP_LLC },
        {}
};

static int __init intel_cqm_init(void)
{
        char *str, scale[20];
        int i, cpu, ret;

        if (!x86_match_cpu(intel_cqm_match))
                return -ENODEV;

        cqm_l3_scale = boot_cpu_data.x86_cache_occ_scale;

        /*
         * It's possible that not all resources support the same number
         * of RMIDs. Instead of making scheduling much more complicated
         * (where we have to match a task's RMID to a cpu that supports
         * that many RMIDs) just find the minimum RMIDs supported across
         * all cpus.
         *
         * Also, check that the scales match on all cpus.
         */
        cpu_notifier_register_begin();

        for_each_online_cpu(cpu) {
                struct cpuinfo_x86 *c = &cpu_data(cpu);

                if (c->x86_cache_max_rmid < cqm_max_rmid)
                        cqm_max_rmid = c->x86_cache_max_rmid;

                if (c->x86_cache_occ_scale != cqm_l3_scale) {
                        pr_err("Multiple LLC scale values, disabling\n");
                        ret = -EINVAL;
                        goto out;
                }
        }

        /*
         * A reasonable upper limit on the max threshold is the number
         * of lines tagged per RMID if all RMIDs have the same number of
         * lines tagged in the LLC.
         *
         * For a 35MB LLC and 56 RMIDs, this is ~1.8% of the LLC.
         */
        __intel_cqm_max_threshold =
                boot_cpu_data.x86_cache_size * 1024 / (cqm_max_rmid + 1);
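
        /*
         * Editorial worked example: for the 35MB LLC / 56 RMIDs case
         * quoted above, 35 * 1024 * 1024 / 56 is roughly 640KB per RMID,
         * i.e. about 1.8% of the LLC.
         */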

        snprintf(scale, sizeof(scale), "%u", cqm_l3_scale);
        str = kstrdup(scale, GFP_KERNEL);
        if (!str) {
                ret = -ENOMEM;
                goto out;
        }

        event_attr_intel_cqm_llc_scale.event_str = str;

        ret = intel_cqm_setup_rmid_cache();
        if (ret)
                goto out;

        for_each_online_cpu(i) {
                intel_cqm_cpu_prepare(i);
                cqm_pick_event_reader(i);
        }

        __perf_cpu_notifier(intel_cqm_cpu_notifier);

        ret = perf_pmu_register(&intel_cqm_pmu, "intel_cqm", -1);
        if (ret)
                pr_err("Intel CQM perf registration failed: %d\n", ret);
        else
                pr_info("Intel CQM monitoring enabled\n");

out:
        cpu_notifier_register_done();
        return ret;
}

device_initcall(intel_cqm_init);