/*
 *  kernel/sched/proc.c
 *
 *  Kernel load calculations, forked from sched/core.c
 */

#include <linux/export.h>

#include "sched.h"

/*
 * Global load-average calculations
 *
 * We take a distributed and async approach to calculating the global load-avg
 * in order to minimize overhead.
 *
 * The global load average is an exponentially decaying average of nr_running +
 * nr_uninterruptible.
 *
 * Once every LOAD_FREQ:
 *
 *   nr_active = 0;
 *   for_each_possible_cpu(cpu)
 *       nr_active += cpu_of(cpu)->nr_running + cpu_of(cpu)->nr_uninterruptible;
 *
 *   avenrun[n] = avenrun[0] * exp_n + nr_active * (1 - exp_n)
 *
 * Due to a number of reasons the above turns into the mess below:
 *
 *  - for_each_possible_cpu() is prohibitively expensive on machines with a
 *    serious number of cpus, therefore we need to take a distributed approach
 *    to calculating nr_active.
 *
 *        \Sum_i x_i(t) = \Sum_i x_i(t) - x_i(t_0) | x_i(t_0) := 0
 *                      = \Sum_i { \Sum_j=1 x_i(t_j) - x_i(t_j-1) }
 *
 *    So assuming nr_active := 0 when we start out -- true by definition -- we
 *    can simply take per-cpu deltas and fold those into a global accumulator
 *    to obtain the same result. See calc_load_fold_active().
 *
 *    Furthermore, in order to avoid synchronizing all per-cpu delta folding
 *    across the machine, we assume 10 ticks is sufficient time for every
 *    cpu to have completed this task.
 *
 *    This places an upper-bound on the IRQ-off latency of the machine. Then
 *    again, being late doesn't lose the delta, it just wrecks the sample.
 *
 *  - cpu_rq()->nr_uninterruptible isn't accurately tracked per-cpu because
 *    this would add another cross-cpu cacheline miss and atomic operation
 *    to the wakeup path. Instead we increment on whatever cpu the task ran
 *    when it went into uninterruptible state and decrement on whatever cpu
 *    did the wakeup. This means that only the sum of nr_uninterruptible over
 *    all cpus yields the correct result.
 *
 *  This covers the NO_HZ=n code; for extra headaches, see the comment below.
 */
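
/*
 * A concrete (made-up) illustration of the folding identity above: with two
 * cpus starting from x_i(t_0) := 0, suppose cpu0 goes 0 -> 3 runnable (delta
 * +3) while cpu1 goes 0 -> 2 -> 1 (deltas +2, -1). Folding each delta into
 * the global accumulator as it is observed gives 3 + 2 - 1 = 4, which equals
 * the instantaneous \Sum_i x_i(t) without ever walking all cpus in one go.
 */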

/* Variables and functions for calc_load */
atomic_long_t calc_load_tasks;
unsigned long calc_load_update;
unsigned long avenrun[3];
EXPORT_SYMBOL(avenrun); /* should be removed */

/**
 * get_avenrun - get the load average array
 * @loads:  pointer to dest load array
 * @offset: offset to add
 * @shift:  shift count to shift the result left
 *
 * These values are estimates at best, so no need for locking.
 */
void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
{
        loads[0] = (avenrun[0] + offset) << shift;
        loads[1] = (avenrun[1] + offset) << shift;
        loads[2] = (avenrun[2] + offset) << shift;
}
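
/*
 * For reference, avenrun[] holds FSHIFT-bit fixed-point values (FIXED_1 ==
 * 1 << FSHIFT, i.e. 2048 for the usual FSHIFT == 11 from <linux/sched.h>).
 * A reader such as /proc/loadavg typically decodes them roughly like:
 *
 *      get_avenrun(avnrun, FIXED_1/200, 0);
 *      whole = avnrun[0] >> FSHIFT;
 *      frac  = ((avnrun[0] & (FIXED_1 - 1)) * 100) >> FSHIFT;
 *
 * where the FIXED_1/200 offset rounds to the nearest hundredth.
 */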

long calc_load_fold_active(struct rq *this_rq)
{
        long nr_active, delta = 0;

        nr_active = this_rq->nr_running;
        nr_active += (long) this_rq->nr_uninterruptible;

        if (nr_active != this_rq->calc_load_active) {
                delta = nr_active - this_rq->calc_load_active;
                this_rq->calc_load_active = nr_active;
        }

        return delta;
}
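
/*
 * Example: if this rq last reported calc_load_active == 3 and now has four
 * runnable plus one uninterruptible task, calc_load_fold_active() records 5
 * and returns a delta of +2, which the caller adds to calc_load_tasks.
 */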

/*
 * a1 = a0 * e + a * (1 - e)
 */
static unsigned long
calc_load(unsigned long load, unsigned long exp, unsigned long active)
{
        load *= exp;
        load += active * (FIXED_1 - exp);
        load += 1UL << (FSHIFT - 1);
        return load >> FSHIFT;
}
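
/*
 * Worked example (assuming FSHIFT == 11, FIXED_1 == 2048 and EXP_1 == 1884
 * as defined in <linux/sched.h>): with load == 0 and two tasks active, i.e.
 * active == 2 * FIXED_1 == 4096:
 *
 *      load = (0 * 1884 + 4096 * (2048 - 1884) + 1024) >> 11 = 328
 *
 * 328/2048 ~= 0.16, so one 5-second LOAD_FREQ sample moves the 1-minute
 * average roughly 8% of the way from 0 toward the instantaneous load of 2.
 */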

#ifdef CONFIG_NO_HZ_COMMON
/*
 * Handle NO_HZ for the global load-average.
 *
 * Since the above described distributed algorithm to compute the global
 * load-average relies on per-cpu sampling from the tick, it is affected by
 * NO_HZ.
 *
 * The basic idea is to fold the nr_active delta into a global idle-delta upon
 * entering NO_HZ state such that we can include this as an 'extra' cpu delta
 * when we read the global state.
 *
 * Obviously reality has to ruin such a delightfully simple scheme:
 *
 *  - When we go NO_HZ idle during the window, we can negate our sample
 *    contribution, causing under-accounting.
 *
 *    We avoid this by keeping two idle-delta counters and flipping them
 *    when the window starts, thus separating old and new NO_HZ load.
 *
 *    The only trick is the slight shift in index flip for read vs write.
 *
 *        0s            5s            10s           15s
 *          +10           +10           +10           +10
 *        |-|-----------|-|-----------|-|-----------|-|
 *    r:0 0 1           1 0           0 1           1 0
 *    w:0 1 1           0 0           1 1           0 0
 *
 *    This ensures we'll fold the old idle contribution in this window while
 *    accumulating the new one.
 *
 *  - When we wake up from NO_HZ idle during the window, we push up our
 *    contribution, since we effectively move our sample point to a known
 *    busy state.
 *
 *    This is solved by pushing the window forward, and thus skipping the
 *    sample, for this cpu (effectively using the idle-delta for this cpu which
 *    was in effect at the time the window opened). This also solves the issue
 *    of having to deal with a cpu having been in NOHZ idle for multiple
 *    LOAD_FREQ intervals.
 *
 *    When making the ILB scale, we should try to pull this in as well.
 */
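
/*
 * Reading the diagram above: right after a window opens, writers already use
 * the flipped slot (calc_load_write_idx() sees jiffies >= calc_load_update)
 * while the reader still uses the old one, so calc_global_load() folds only
 * the deltas accumulated during the previous window and freshly-idle cpus
 * deposit theirs into the slot that will be read in the next window.
 */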

static atomic_long_t calc_load_idle[2];
static int calc_load_idx;

static inline int calc_load_write_idx(void)
{
        int idx = calc_load_idx;

        /*
         * See calc_global_nohz(), if we observe the new index, we also
         * need to observe the new update time.
         */
        smp_rmb();

        /*
         * If the folding window started, make sure we start writing in the
         * next idle-delta.
         */
        if (!time_before(jiffies, calc_load_update))
                idx++;

        return idx & 1;
}

static inline int calc_load_read_idx(void)
{
        return calc_load_idx & 1;
}

void calc_load_enter_idle(void)
{
        struct rq *this_rq = this_rq();
        long delta;

        /*
         * We're going into NOHZ mode, if there's any pending delta, fold it
         * into the pending idle delta.
         */
        delta = calc_load_fold_active(this_rq);
        if (delta) {
                int idx = calc_load_write_idx();
                atomic_long_add(delta, &calc_load_idle[idx]);
        }
}

void calc_load_exit_idle(void)
{
        struct rq *this_rq = this_rq();

        /*
         * If we're still before the sample window, we're done.
         */
        if (time_before(jiffies, this_rq->calc_load_update))
                return;

        /*
         * We woke inside or after the sample window, this means we're already
         * accounted through the nohz accounting, so skip the entire deal and
         * sync up for the next window.
         */
        this_rq->calc_load_update = calc_load_update;
        if (time_before(jiffies, this_rq->calc_load_update + 10))
                this_rq->calc_load_update += LOAD_FREQ;
}

static long calc_load_fold_idle(void)
{
        int idx = calc_load_read_idx();
        long delta = 0;

        if (atomic_long_read(&calc_load_idle[idx]))
                delta = atomic_long_xchg(&calc_load_idle[idx], 0);

        return delta;
}

/**
 * fixed_power_int - compute: x^n, in O(log n) time
 *
 * @x:         base of the power
 * @frac_bits: fractional bits of @x
 * @n:         power to raise @x to.
 *
 * By exploiting the relation between the definition of the natural power
 * function: x^n := x*x*...*x (x multiplied by itself for n times), and
 * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i,
 * (where: n_i \elem {0, 1}, the binary vector representing n),
 * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is
 * of course trivially computable in O(log_2 n), the length of our binary
 * vector.
 */
static unsigned long
fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n)
{
        unsigned long result = 1UL << frac_bits;

        if (n) {
                for (;;) {
                        if (n & 1) {
                                result *= x;
                                result += 1UL << (frac_bits - 1);
                                result >>= frac_bits;
                        }
                        n >>= 1;
                        if (!n)
                                break;
                        x *= x;
                        x += 1UL << (frac_bits - 1);
                        x >>= frac_bits;
                }
        }

        return result;
}
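
/*
 * Example: n == 13 == 0b1101. The loop squares x into x^2, x^4, x^8 and
 * multiplies the set bits into the result: result = x^1 * x^4 * x^8 = x^13,
 * i.e. 3 squarings plus 3 multiplies instead of 12 naive multiplies, with
 * each multiply renormalized (rounded) back to frac_bits fractional bits.
 */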

/*
 * a1 = a0 * e + a * (1 - e)
 *
 * a2 = a1 * e + a * (1 - e)
 *    = (a0 * e + a * (1 - e)) * e + a * (1 - e)
 *    = a0 * e^2 + a * (1 - e) * (1 + e)
 *
 * a3 = a2 * e + a * (1 - e)
 *    = (a0 * e^2 + a * (1 - e) * (1 + e)) * e + a * (1 - e)
 *    = a0 * e^3 + a * (1 - e) * (1 + e + e^2)
 *
 *  ...
 *
 * an = a0 * e^n + a * (1 - e) * (1 + e + ... + e^n-1) [1]
 *    = a0 * e^n + a * (1 - e) * (1 - e^n)/(1 - e)
 *    = a0 * e^n + a * (1 - e^n)
 *
 * [1] application of the geometric series:
 *
 *              n         1 - x^(n+1)
 *     S_n := \Sum x^i = -------------
 *             i=0          1 - x
 */
static unsigned long
calc_load_n(unsigned long load, unsigned long exp,
            unsigned long active, unsigned int n)
{
        return calc_load(load, fixed_power_int(exp, FSHIFT, n), active);
}
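
/*
 * For instance, if we are n == 3 LOAD_FREQ windows behind with no tasks
 * active, calc_load_n() applies e^3 in a single step via fixed_power_int().
 * With EXP_1 == 1884/2048 ~= 0.92 that is e^3 ~= 0.78, so the 1-minute
 * average decays by roughly 22% across the three missed 5-second windows.
 */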

/*
 * NO_HZ can leave us missing all per-cpu ticks calling
 * calc_load_account_active(), but since an idle CPU folds its delta into
 * calc_load_idle[] via calc_load_enter_idle(), all we need to do is fold
 * in the pending idle delta if our idle period crossed a load cycle boundary.
 *
 * Once we've updated the global active value, we need to apply the exponential
 * weights adjusted to the number of cycles missed.
 */
static void calc_global_nohz(void)
{
        long delta, active, n;

        if (!time_before(jiffies, calc_load_update + 10)) {
                /*
                 * Catch-up, fold however many we are behind still
                 */
                delta = jiffies - calc_load_update - 10;
                n = 1 + (delta / LOAD_FREQ);

                active = atomic_long_read(&calc_load_tasks);
                active = active > 0 ? active * FIXED_1 : 0;

                avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n);
                avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n);
                avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n);

                calc_load_update += n * LOAD_FREQ;
        }

        /*
         * Flip the idle index...
         *
         * Make sure we first write the new time then flip the index, so that
         * calc_load_write_idx() will see the new time when it reads the new
         * index, this avoids a double flip messing things up.
         */
        smp_wmb();
        calc_load_idx++;
}
#else /* !CONFIG_NO_HZ_COMMON */

static inline long calc_load_fold_idle(void) { return 0; }
static inline void calc_global_nohz(void) { }

#endif /* CONFIG_NO_HZ_COMMON */

/*
 * calc_global_load - update the avenrun load estimates 10 ticks after the
 * CPUs have updated calc_load_tasks.
 */
void calc_global_load(unsigned long ticks)
{
        long active, delta;

        if (time_before(jiffies, calc_load_update + 10))
                return;

        /*
         * Fold the 'old' idle-delta to include all NO_HZ cpus.
         */
        delta = calc_load_fold_idle();
        if (delta)
                atomic_long_add(delta, &calc_load_tasks);

        active = atomic_long_read(&calc_load_tasks);
        active = active > 0 ? active * FIXED_1 : 0;

        avenrun[0] = calc_load(avenrun[0], EXP_1, active);
        avenrun[1] = calc_load(avenrun[1], EXP_5, active);
        avenrun[2] = calc_load(avenrun[2], EXP_15, active);

        calc_load_update += LOAD_FREQ;

        /*
         * In case we idled for multiple LOAD_FREQ intervals, catch up in bulk.
         */
        calc_global_nohz();
}

/*
 * Called from update_cpu_load_active() to periodically update this CPU's
 * active count.
 */
static void calc_load_account_active(struct rq *this_rq)
{
        long delta;

        if (time_before(jiffies, this_rq->calc_load_update))
                return;

        delta = calc_load_fold_active(this_rq);
        if (delta)
                atomic_long_add(delta, &calc_load_tasks);

        this_rq->calc_load_update += LOAD_FREQ;
}

/*
 * End of global load-average stuff
 */

/*
 * The exact cpuload at various idx values, calculated at every tick, would be:
 *
 *   load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load
 *
 * If a cpu misses updates for n-1 ticks (as it was idle) and the update gets
 * called on the nth tick when the cpu may be busy, then we have:
 *
 *   load = ((2^idx - 1) / 2^idx)^(n-1) * load
 *   load = ((2^idx - 1) / 2^idx) * load + 1 / 2^idx * cur_load
 *
 * decay_load_missed() below does an efficient calculation of
 *
 *   load = ((2^idx - 1) / 2^idx)^(n-1) * load
 *
 * avoiding the 0..n-1 loop doing load = ((2^idx - 1) / 2^idx) * load.
 *
 * The calculation is approximated on a 128 point scale.
 * degrade_zero_ticks is the number of ticks after which load at any
 * particular idx is approximated to be zero.
 * degrade_factor is a precomputed table, a row for each load idx.
 * Each column corresponds to the degradation factor for a power of two ticks,
 * based on the 128 point scale.
 * Example:
 * row 2, col 3 (=12) says that the degradation at load idx 2 after
 * 8 ticks is 12/128 (which is an approximation of the exact factor 3^8/4^8).
 *
 * With these power-of-2 load factors, we can degrade the load n times
 * by looking at the 1 bits in n and doing as many mult/shifts instead of
 * the n mult/shifts needed by the exact degradation.
 */
#define DEGRADE_SHIFT           7
static const unsigned char
                degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128};
static const unsigned char
                degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = {
                                        {0, 0, 0, 0, 0, 0, 0, 0},
                                        {64, 32, 8, 0, 0, 0, 0, 0},
                                        {96, 72, 40, 12, 1, 0, 0},
                                        {112, 98, 75, 43, 15, 1, 0},
                                        {120, 112, 98, 76, 45, 16, 2} };

/*
 * Update cpu_load for any missed ticks due to tickless idle. The backlog
 * accumulates while the CPU is idle, so we just decay the old load without
 * adding any new load.
 */
static unsigned long
decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
{
        int j = 0;

        if (!missed_updates)
                return load;

        if (missed_updates >= degrade_zero_ticks[idx])
                return 0;

        if (idx == 1)
                return load >> missed_updates;

        while (missed_updates) {
                if (missed_updates % 2)
                        load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT;

                missed_updates >>= 1;
                j++;
        }
        return load;
}
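
/*
 * Example: decaying at idx == 2 for missed_updates == 5 (0b101) multiplies
 * by degrade_factor[2][0] == 96 for bit 0 and degrade_factor[2][2] == 40 for
 * bit 2: load * 96/128 * 40/128 ~= 0.23 * load, close to the exact
 * (3/4)^5 ~= 0.24, using two multiplies instead of five.
 */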

/*
 * Update rq->cpu_load[] statistics. This function is usually called every
 * scheduler tick (TICK_NSEC). With tickless idle this will not be called
 * every tick. We fix it up based on jiffies.
 */
static void __update_cpu_load(struct rq *this_rq, unsigned long this_load,
                              unsigned long pending_updates)
{
        int i, scale;

        this_rq->nr_load_updates++;

        /* Update our load: */
        this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */
        for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
                unsigned long old_load, new_load;

                /* scale is effectively 1 << i now, and >> i divides by scale */

                old_load = this_rq->cpu_load[i];
                old_load = decay_load_missed(old_load, pending_updates - 1, i);
                new_load = this_load;
                /*
                 * Round up the averaging division if load is increasing. This
                 * prevents us from getting stuck on 9 if the load is 10, for
                 * example.
                 */
                if (new_load > old_load)
                        new_load += scale - 1;

                this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i;
        }

        sched_avg_update(this_rq);
}
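
/*
 * Example of the rounding above: for i == 2 (scale == 4) the update is
 * cpu_load[2] = (3 * old + new) >> 2. With old == 9 and this_load == 10,
 * new is bumped to 13 so we get (27 + 13) >> 2 == 10 rather than being
 * stuck at (27 + 10) >> 2 == 9 forever.
 */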

#ifdef CONFIG_SMP
static inline unsigned long get_rq_runnable_load(struct rq *rq)
{
        return rq->cfs.runnable_load_avg;
}
#else
static inline unsigned long get_rq_runnable_load(struct rq *rq)
{
        return rq->load.weight;
}
#endif

#ifdef CONFIG_NO_HZ_COMMON
/*
 * There is no sane way to deal with nohz on smp when using jiffies because the
 * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading,
 * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}.
 *
 * Therefore we cannot use the delta approach from the regular tick since that
 * would seriously skew the load calculation. However we'll make do for those
 * updates happening while idle (nohz_idle_balance) or coming out of idle
 * (tick_nohz_idle_exit).
 *
 * This means we might still be one tick off for nohz periods.
 */

/*
 * Called from nohz_idle_balance() to update the load ratings before doing the
 * idle balance.
 */
void update_idle_cpu_load(struct rq *this_rq)
{
        unsigned long curr_jiffies = ACCESS_ONCE(jiffies);
        unsigned long load = get_rq_runnable_load(this_rq);
        unsigned long pending_updates;

        /*
         * Bail if there's load or we're actually up-to-date.
         */
        if (load || curr_jiffies == this_rq->last_load_update_tick)
                return;

        pending_updates = curr_jiffies - this_rq->last_load_update_tick;
        this_rq->last_load_update_tick = curr_jiffies;

        __update_cpu_load(this_rq, load, pending_updates);
}

/*
 * Called from tick_nohz_idle_exit() -- try and fix up the ticks we missed.
 */
void update_cpu_load_nohz(void)
{
        struct rq *this_rq = this_rq();
        unsigned long curr_jiffies = ACCESS_ONCE(jiffies);
        unsigned long pending_updates;

        if (curr_jiffies == this_rq->last_load_update_tick)
                return;

        raw_spin_lock(&this_rq->lock);
        pending_updates = curr_jiffies - this_rq->last_load_update_tick;
        if (pending_updates) {
                this_rq->last_load_update_tick = curr_jiffies;
                /*
                 * We were idle, this means load 0. The current load might be
                 * !0 due to remote wakeups and the sort.
                 */
                __update_cpu_load(this_rq, 0, pending_updates);
        }
        raw_spin_unlock(&this_rq->lock);
}
#endif /* CONFIG_NO_HZ_COMMON */

/*
 * Called from scheduler_tick()
 */
void update_cpu_load_active(struct rq *this_rq)
{
        unsigned long load = get_rq_runnable_load(this_rq);

        /*
         * See the mess around update_idle_cpu_load() / update_cpu_load_nohz().
         */
        this_rq->last_load_update_tick = jiffies;
        __update_cpu_load(this_rq, load, 1);

        calc_load_account_active(this_rq);
}