x86_pkg_temp_thermal.c 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583
  1. /*
  2. * x86_pkg_temp_thermal driver
  3. * Copyright (c) 2013, Intel Corporation.
  4. *
  5. * This program is free software; you can redistribute it and/or modify it
  6. * under the terms and conditions of the GNU General Public License,
  7. * version 2, as published by the Free Software Foundation.
  8. *
  9. * This program is distributed in the hope it will be useful, but WITHOUT
  10. * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11. * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
  12. * more details.
  13. *
  14. * You should have received a copy of the GNU General Public License along with
  15. * this program; if not, write to the Free Software Foundation, Inc.
  16. *
  17. */
  18. #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  19. #include <linux/module.h>
  20. #include <linux/init.h>
  21. #include <linux/err.h>
  22. #include <linux/param.h>
  23. #include <linux/device.h>
  24. #include <linux/platform_device.h>
  25. #include <linux/cpu.h>
  26. #include <linux/smp.h>
  27. #include <linux/slab.h>
  28. #include <linux/pm.h>
  29. #include <linux/thermal.h>
  30. #include <linux/debugfs.h>
  31. #include <asm/cpu_device_id.h>
  32. #include <asm/mce.h>
/*
 * Rate control delay: the idea is to introduce a debounce effect.
 * This should be long enough to avoid repeated events when a
 * threshold is set to a temperature which is constantly violated,
 * but short enough to still take timely action. The action can be
 * to remove the threshold or to change it to the next interesting
 * setting. Based on experiments, roughly every 5 seconds under
 * load gives a significant temperature change.
 */
/* Default user space notification debounce delay, milliseconds */
#define PKG_TEMP_THERMAL_NOTIFY_DELAY 5000
static int notify_delay_ms = PKG_TEMP_THERMAL_NOTIFY_DELAY;
module_param(notify_delay_ms, int, 0644);
MODULE_PARM_DESC(notify_delay_ms,
	"User space notification delay in milli seconds.");

/*
 * Number of trip points in the thermal zone. Currently it can't be
 * more than 2: the MSR can only set and deliver notifications for 2
 * thresholds. This define enforces that, in case cpuid returns a
 * wrong value for the number of thresholds.
 */
#define MAX_NUMBER_OF_TRIPS	2
/* Limit number of package temp zones (valid physical package ids) */
#define MAX_PKG_TEMP_ZONE_IDS	256
/* Per physical package instance data */
struct pkg_device {
	struct list_head		list;		/* Link in phy_dev_list */
	u16				phys_proc_id;	/* Physical package id */
	u16				cpu;		/* CPU used for MSR access */
	u32				tj_max;		/* TjMax in millidegrees C */
	u32				msr_pkg_therm_low;  /* Saved THERM_INTERRUPT MSR, low word */
	u32				msr_pkg_therm_high; /* Saved THERM_INTERRUPT MSR, high word */
	struct thermal_zone_device	*tzone;		/* Registered zone, NULL while tearing down */
	struct cpumask			cpumask;	/* Online CPUs in this package */
};
/* Zone parameters: suppress the hwmon interface for this zone */
static struct thermal_zone_params pkg_temp_tz_params = {
	.no_hwmon	= true,
};
/* List of all discovered package instances */
static LIST_HEAD(phy_dev_list);
/* Serializes interrupt notification, work and hotplug */
static DEFINE_SPINLOCK(pkg_temp_lock);
/* Protects zone operation in the work function against hotplug removal */
static DEFINE_MUTEX(thermal_zone_mutex);
/* Interrupt to work function schedule queue, one delayed work per CPU */
static DEFINE_PER_CPU(struct delayed_work, pkg_temp_thermal_threshold_work);
/*
 * Per package "work already scheduled" flags, indexed by physical package
 * id. (Re)allocated in pkg_temp_thermal_device_add() under pkg_temp_lock.
 */
static u8 *pkg_work_scheduled;
/* Highest physical package id seen so far; sizes pkg_work_scheduled */
static u16 max_phy_id;

/* Debug counters exposed via debugfs */
static struct dentry *debugfs;
static unsigned int pkg_interrupt_cnt;
static unsigned int pkg_work_cnt;
  84. static int pkg_temp_debugfs_init(void)
  85. {
  86. struct dentry *d;
  87. debugfs = debugfs_create_dir("pkg_temp_thermal", NULL);
  88. if (!debugfs)
  89. return -ENOENT;
  90. d = debugfs_create_u32("pkg_thres_interrupt", S_IRUGO, debugfs,
  91. (u32 *)&pkg_interrupt_cnt);
  92. if (!d)
  93. goto err_out;
  94. d = debugfs_create_u32("pkg_thres_work", S_IRUGO, debugfs,
  95. (u32 *)&pkg_work_cnt);
  96. if (!d)
  97. goto err_out;
  98. return 0;
  99. err_out:
  100. debugfs_remove_recursive(debugfs);
  101. return -ENOENT;
  102. }
  103. /*
  104. * Protection:
  105. *
  106. * - cpu hotplug: Read serialized by cpu hotplug lock
  107. * Write must hold pkg_temp_lock
  108. *
  109. * - Other callsites: Must hold pkg_temp_lock
  110. */
  111. static struct pkg_device *pkg_temp_thermal_get_dev(unsigned int cpu)
  112. {
  113. u16 phys_proc_id = topology_physical_package_id(cpu);
  114. struct pkg_device *pkgdev;
  115. list_for_each_entry(pkgdev, &phy_dev_list, list) {
  116. if (pkgdev->phys_proc_id == phys_proc_id)
  117. return pkgdev;
  118. }
  119. return NULL;
  120. }
/*
 * tj-max is interesting because threshold is set relative to this
 * temperature.
 */
  125. static int get_tj_max(int cpu, u32 *tj_max)
  126. {
  127. u32 eax, edx, val;
  128. int err;
  129. err = rdmsr_safe_on_cpu(cpu, MSR_IA32_TEMPERATURE_TARGET, &eax, &edx);
  130. if (err)
  131. return err;
  132. val = (eax >> 16) & 0xff;
  133. *tj_max = val * 1000;
  134. return val ? 0 : -EINVAL;
  135. }
  136. static int sys_get_curr_temp(struct thermal_zone_device *tzd, int *temp)
  137. {
  138. struct pkg_device *pkgdev = tzd->devdata;
  139. u32 eax, edx;
  140. rdmsr_on_cpu(pkgdev->cpu, MSR_IA32_PACKAGE_THERM_STATUS, &eax, &edx);
  141. if (eax & 0x80000000) {
  142. *temp = pkgdev->tj_max - ((eax >> 16) & 0x7f) * 1000;
  143. pr_debug("sys_get_curr_temp %d\n", *temp);
  144. return 0;
  145. }
  146. return -EINVAL;
  147. }
  148. static int sys_get_trip_temp(struct thermal_zone_device *tzd,
  149. int trip, int *temp)
  150. {
  151. struct pkg_device *pkgdev = tzd->devdata;
  152. unsigned long thres_reg_value;
  153. u32 mask, shift, eax, edx;
  154. int ret;
  155. if (trip >= MAX_NUMBER_OF_TRIPS)
  156. return -EINVAL;
  157. if (trip) {
  158. mask = THERM_MASK_THRESHOLD1;
  159. shift = THERM_SHIFT_THRESHOLD1;
  160. } else {
  161. mask = THERM_MASK_THRESHOLD0;
  162. shift = THERM_SHIFT_THRESHOLD0;
  163. }
  164. ret = rdmsr_on_cpu(pkgdev->cpu, MSR_IA32_PACKAGE_THERM_INTERRUPT,
  165. &eax, &edx);
  166. if (ret < 0)
  167. return ret;
  168. thres_reg_value = (eax & mask) >> shift;
  169. if (thres_reg_value)
  170. *temp = pkgdev->tj_max - thres_reg_value * 1000;
  171. else
  172. *temp = 0;
  173. pr_debug("sys_get_trip_temp %d\n", *temp);
  174. return 0;
  175. }
  176. static int
  177. sys_set_trip_temp(struct thermal_zone_device *tzd, int trip, int temp)
  178. {
  179. struct pkg_device *pkgdev = tzd->devdata;
  180. u32 l, h, mask, shift, intr;
  181. int ret;
  182. if (trip >= MAX_NUMBER_OF_TRIPS || temp >= pkgdev->tj_max)
  183. return -EINVAL;
  184. ret = rdmsr_on_cpu(pkgdev->cpu, MSR_IA32_PACKAGE_THERM_INTERRUPT,
  185. &l, &h);
  186. if (ret < 0)
  187. return ret;
  188. if (trip) {
  189. mask = THERM_MASK_THRESHOLD1;
  190. shift = THERM_SHIFT_THRESHOLD1;
  191. intr = THERM_INT_THRESHOLD1_ENABLE;
  192. } else {
  193. mask = THERM_MASK_THRESHOLD0;
  194. shift = THERM_SHIFT_THRESHOLD0;
  195. intr = THERM_INT_THRESHOLD0_ENABLE;
  196. }
  197. l &= ~mask;
  198. /*
  199. * When users space sets a trip temperature == 0, which is indication
  200. * that, it is no longer interested in receiving notifications.
  201. */
  202. if (!temp) {
  203. l &= ~intr;
  204. } else {
  205. l |= (pkgdev->tj_max - temp)/1000 << shift;
  206. l |= intr;
  207. }
  208. return wrmsr_on_cpu(pkgdev->cpu, MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h);
  209. }
  210. static int sys_get_trip_type(struct thermal_zone_device *thermal, int trip,
  211. enum thermal_trip_type *type)
  212. {
  213. *type = THERMAL_TRIP_PASSIVE;
  214. return 0;
  215. }
/* Thermal zone callback registry */
static struct thermal_zone_device_ops tzone_ops = {
	.get_temp	= sys_get_curr_temp,
	.get_trip_temp	= sys_get_trip_temp,
	.get_trip_type	= sys_get_trip_type,
	.set_trip_temp	= sys_set_trip_temp,
};
/*
 * Assigned to platform_thermal_package_rate_control at init time.
 * Returning true signals that this driver does its own rate limiting
 * of threshold notifications (see notify_delay_ms / the delayed work).
 */
static bool pkg_thermal_rate_control(void)
{
	return true;
}
  227. /* Enable threshold interrupt on local package/cpu */
  228. static inline void enable_pkg_thres_interrupt(void)
  229. {
  230. u8 thres_0, thres_1;
  231. u32 l, h;
  232. rdmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h);
  233. /* only enable/disable if it had valid threshold value */
  234. thres_0 = (l & THERM_MASK_THRESHOLD0) >> THERM_SHIFT_THRESHOLD0;
  235. thres_1 = (l & THERM_MASK_THRESHOLD1) >> THERM_SHIFT_THRESHOLD1;
  236. if (thres_0)
  237. l |= THERM_INT_THRESHOLD0_ENABLE;
  238. if (thres_1)
  239. l |= THERM_INT_THRESHOLD1_ENABLE;
  240. wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h);
  241. }
  242. /* Disable threshold interrupt on local package/cpu */
  243. static inline void disable_pkg_thres_interrupt(void)
  244. {
  245. u32 l, h;
  246. rdmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h);
  247. l &= ~(THERM_INT_THRESHOLD0_ENABLE | THERM_INT_THRESHOLD1_ENABLE);
  248. wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h);
  249. }
  250. static void pkg_temp_thermal_threshold_work_fn(struct work_struct *work)
  251. {
  252. struct thermal_zone_device *tzone = NULL;
  253. int phy_id, cpu = smp_processor_id();
  254. struct pkg_device *pkgdev;
  255. u64 msr_val, wr_val;
  256. mutex_lock(&thermal_zone_mutex);
  257. spin_lock_irq(&pkg_temp_lock);
  258. ++pkg_work_cnt;
  259. pkgdev = pkg_temp_thermal_get_dev(cpu);
  260. if (!pkgdev) {
  261. spin_unlock_irq(&pkg_temp_lock);
  262. mutex_unlock(&thermal_zone_mutex);
  263. return;
  264. }
  265. pkg_work_scheduled[phy_id] = 0;
  266. rdmsrl(MSR_IA32_PACKAGE_THERM_STATUS, msr_val);
  267. wr_val = msr_val & ~(THERM_LOG_THRESHOLD0 | THERM_LOG_THRESHOLD1);
  268. if (wr_val != msr_val) {
  269. wrmsrl(MSR_IA32_PACKAGE_THERM_STATUS, wr_val);
  270. tzone = pkgdev->tzone;
  271. }
  272. enable_pkg_thres_interrupt();
  273. spin_unlock_irq(&pkg_temp_lock);
  274. /*
  275. * If tzone is not NULL, then thermal_zone_mutex will prevent the
  276. * concurrent removal in the cpu offline callback.
  277. */
  278. if (tzone)
  279. thermal_zone_device_update(tzone, THERMAL_EVENT_UNSPECIFIED);
  280. mutex_unlock(&thermal_zone_mutex);
  281. }
/*
 * Package thermal threshold interrupt callback, installed as
 * platform_thermal_package_notify. Runs in interrupt context: it only
 * accounts the event, masks further threshold interrupts and schedules
 * the per package delayed work which does the real processing.
 */
static int pkg_thermal_notify(u64 msr_val)
{
	int cpu = smp_processor_id();
	int phy_id = topology_physical_package_id(cpu);
	struct pkg_device *pkgdev;
	unsigned long flags;

	spin_lock_irqsave(&pkg_temp_lock, flags);
	++pkg_interrupt_cnt;

	/* Reenabled from the work function after the event is handled */
	disable_pkg_thres_interrupt();

	/* Work is per package, so scheduling it once is enough. */
	pkgdev = pkg_temp_thermal_get_dev(cpu);
	if (pkgdev && pkg_work_scheduled && !pkg_work_scheduled[phy_id]) {
		pkg_work_scheduled[phy_id] = 1;
		/* Delay provides the debounce described at the top of the file */
		schedule_delayed_work_on(cpu,
				&per_cpu(pkg_temp_thermal_threshold_work, cpu),
				msecs_to_jiffies(notify_delay_ms));
	}

	spin_unlock_irqrestore(&pkg_temp_lock, flags);
	return 0;
}
/*
 * Create and register the thermal zone for the package containing @cpu.
 * Called for the first online CPU of a package. Grows the per package
 * pkg_work_scheduled array as new package ids show up.
 */
static int pkg_temp_thermal_device_add(unsigned int cpu)
{
	u32 tj_max, eax, ebx, ecx, edx;
	struct pkg_device *pkgdev;
	int thres_count, err;
	unsigned long flags;
	u8 *temp;

	/* CPUID(6) EBX[2:0]: number of programmable thermal thresholds */
	cpuid(6, &eax, &ebx, &ecx, &edx);
	thres_count = ebx & 0x07;
	if (!thres_count)
		return -ENODEV;

	/*
	 * NOTE(review): '>' lets package id == MAX_PKG_TEMP_ZONE_IDS through,
	 * which looks like an off-by-one vs. the "limit" comment; harmless
	 * because pkg_work_scheduled is sized from max_phy_id below.
	 */
	if (topology_physical_package_id(cpu) > MAX_PKG_TEMP_ZONE_IDS)
		return -ENODEV;

	/* The MSR supports at most MAX_NUMBER_OF_TRIPS thresholds */
	thres_count = clamp_val(thres_count, 0, MAX_NUMBER_OF_TRIPS);

	err = get_tj_max(cpu, &tj_max);
	if (err)
		return err;

	pkgdev = kzalloc(sizeof(*pkgdev), GFP_KERNEL);
	if (!pkgdev)
		return -ENOMEM;

	/* Grow the work-scheduled flag array to cover this package id */
	spin_lock_irqsave(&pkg_temp_lock, flags);
	if (topology_physical_package_id(cpu) > max_phy_id)
		max_phy_id = topology_physical_package_id(cpu);
	temp = krealloc(pkg_work_scheduled,
			(max_phy_id+1) * sizeof(u8), GFP_ATOMIC);
	if (!temp) {
		spin_unlock_irqrestore(&pkg_temp_lock, flags);
		kfree(pkgdev);
		return -ENOMEM;
	}
	pkg_work_scheduled = temp;
	pkg_work_scheduled[topology_physical_package_id(cpu)] = 0;
	spin_unlock_irqrestore(&pkg_temp_lock, flags);

	pkgdev->phys_proc_id = topology_physical_package_id(cpu);
	pkgdev->cpu = cpu;
	pkgdev->tj_max = tj_max;
	/* Trip mask: both trips writable when two thresholds are available */
	pkgdev->tzone = thermal_zone_device_register("x86_pkg_temp",
			thres_count,
			(thres_count == MAX_NUMBER_OF_TRIPS) ? 0x03 : 0x01,
			pkgdev, &tzone_ops, &pkg_temp_tz_params, 0, 0);
	if (IS_ERR(pkgdev->tzone)) {
		err = PTR_ERR(pkgdev->tzone);
		kfree(pkgdev);
		return err;
	}

	/* Store MSR value for package thermal interrupt, to restore at exit */
	rdmsr_on_cpu(cpu, MSR_IA32_PACKAGE_THERM_INTERRUPT,
		     &pkgdev->msr_pkg_therm_low,
		     &pkgdev->msr_pkg_therm_high);

	cpumask_set_cpu(cpu, &pkgdev->cpumask);
	spin_lock_irq(&pkg_temp_lock);
	list_add_tail(&pkgdev->list, &phy_dev_list);
	spin_unlock_irq(&pkg_temp_lock);
	return 0;
}
/*
 * CPU offline callback. Drops @cpu from its package's cpumask; when the
 * last CPU of a package goes away, unregisters the thermal zone, restores
 * the saved interrupt MSR and frees the package instance.
 */
static void put_core_offline(unsigned int cpu)
{
	struct pkg_device *pkgdev = pkg_temp_thermal_get_dev(cpu);
	bool lastcpu;
	int target;

	if (!pkgdev)
		return;

	/* lastcpu is true when no other CPU of this package remains */
	target = cpumask_any_but(&pkgdev->cpumask, cpu);
	cpumask_clear_cpu(cpu, &pkgdev->cpumask);
	lastcpu = target >= nr_cpu_ids;

	/*
	 * Remove the sysfs files, if this is the last cpu in the package
	 * before doing further cleanups.
	 */
	if (lastcpu) {
		struct thermal_zone_device *tzone = pkgdev->tzone;

		/*
		 * We must protect against a work function calling
		 * thermal_zone_update, after/while unregister. We null out
		 * the pointer under the zone mutex, so the worker function
		 * won't try to call.
		 */
		mutex_lock(&thermal_zone_mutex);
		pkgdev->tzone = NULL;
		mutex_unlock(&thermal_zone_mutex);

		thermal_zone_device_unregister(tzone);
	}

	/*
	 * If this is the last CPU in the package, restore the interrupt
	 * MSR and remove the package reference from the array.
	 */
	if (lastcpu) {
		/* Protect against work and interrupts */
		spin_lock_irq(&pkg_temp_lock);
		list_del(&pkgdev->list);
		/*
		 * After this point nothing touches the MSR anymore. We
		 * must drop the lock to make the cross cpu call. This goes
		 * away once we move that code to the hotplug state machine.
		 */
		spin_unlock_irq(&pkg_temp_lock);
		wrmsr_on_cpu(cpu, MSR_IA32_PACKAGE_THERM_INTERRUPT,
			     pkgdev->msr_pkg_therm_low,
			     pkgdev->msr_pkg_therm_high);
		kfree(pkgdev);
	}

	/*
	 * Note, this is broken when work was really scheduled on the
	 * outgoing cpu because this will leave the work_scheduled flag set
	 * and the thermal interrupts disabled. Will be fixed in the next
	 * step as there is no way to fix it in a sane way with the per cpu
	 * work nonsense.
	 */
	cancel_delayed_work_sync(&per_cpu(pkg_temp_thermal_threshold_work, cpu));
}
  412. static int get_core_online(unsigned int cpu)
  413. {
  414. struct pkg_device *pkgdev = pkg_temp_thermal_get_dev(cpu);
  415. struct cpuinfo_x86 *c = &cpu_data(cpu);
  416. /* Paranoia check */
  417. if (!cpu_has(c, X86_FEATURE_DTHERM) || !cpu_has(c, X86_FEATURE_PTS))
  418. return -ENODEV;
  419. INIT_DELAYED_WORK(&per_cpu(pkg_temp_thermal_threshold_work, cpu),
  420. pkg_temp_thermal_threshold_work_fn);
  421. /* If the package exists, nothing to do */
  422. if (pkgdev) {
  423. cpumask_set_cpu(cpu, &pkgdev->cpumask);
  424. return 0;
  425. }
  426. return pkg_temp_thermal_device_add(cpu);
  427. }
/*
 * CPU hotplug notifier: keep the package bookkeeping in sync with CPU
 * online/offline transitions. Errors from get_core_online() are ignored
 * deliberately; a failed package add just means no zone for it.
 */
static int pkg_temp_thermal_cpu_callback(struct notifier_block *nfb,
					 unsigned long action, void *hcpu)
{
	unsigned int cpu = (unsigned long) hcpu;

	switch (action & ~CPU_TASKS_FROZEN) {
	case CPU_ONLINE:
	case CPU_DOWN_FAILED:
		/* DOWN_FAILED: re-add the CPU removed in DOWN_PREPARE */
		get_core_online(cpu);
		break;
	case CPU_DOWN_PREPARE:
		put_core_offline(cpu);
		break;
	}
	return NOTIFY_OK;
}
/* Hotplug notifier, registered in module init */
static struct notifier_block pkg_temp_thermal_notifier __refdata = {
	.notifier_call = pkg_temp_thermal_cpu_callback,
};

/* Match any Intel CPU advertising Package Thermal Status (PTS) */
static const struct x86_cpu_id __initconst pkg_temp_thermal_ids[] = {
	{ X86_VENDOR_INTEL, X86_FAMILY_ANY, X86_MODEL_ANY, X86_FEATURE_PTS },
	{}
};
MODULE_DEVICE_TABLE(x86cpu, pkg_temp_thermal_ids);
/*
 * Module init: register each online CPU's package, install the hotplug
 * notifier and hook the platform thermal interrupt callbacks. On any
 * per-CPU failure the whole init is rolled back.
 */
static int __init pkg_temp_thermal_init(void)
{
	int i;

	if (!x86_match_cpu(pkg_temp_thermal_ids))
		return -ENODEV;

	/* Hold off hotplug while we walk the online CPUs */
	cpu_notifier_register_begin();
	for_each_online_cpu(i)
		if (get_core_online(i))
			goto err_ret;
	__register_hotcpu_notifier(&pkg_temp_thermal_notifier);
	cpu_notifier_register_done();

	/* Hook the package thermal interrupt into this driver */
	platform_thermal_package_notify = pkg_thermal_notify;
	platform_thermal_package_rate_control = pkg_thermal_rate_control;

	/* Don't care if it fails */
	pkg_temp_debugfs_init();
	return 0;

err_ret:
	/* Tear down whatever was set up before the failure */
	for_each_online_cpu(i)
		put_core_offline(i);
	cpu_notifier_register_done();
	kfree(pkg_work_scheduled);
	return -ENODEV;
}
module_init(pkg_temp_thermal_init)
/*
 * Module exit: detach the platform interrupt callbacks first so no new
 * events arrive, then tear down all package zones and the debugfs tree.
 */
static void __exit pkg_temp_thermal_exit(void)
{
	int i;

	platform_thermal_package_notify = NULL;
	platform_thermal_package_rate_control = NULL;

	cpu_notifier_register_begin();
	__unregister_hotcpu_notifier(&pkg_temp_thermal_notifier);
	for_each_online_cpu(i)
		put_core_offline(i);
	cpu_notifier_register_done();

	kfree(pkg_work_scheduled);
	debugfs_remove_recursive(debugfs);
}
module_exit(pkg_temp_thermal_exit)

MODULE_DESCRIPTION("X86 PKG TEMP Thermal Driver");
MODULE_AUTHOR("Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>");
MODULE_LICENSE("GPL v2");