rstat.c
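
/*
 * rstat - hierarchical per-CPU statistics infrastructure for cgroups.
 *
 * Stat updates are recorded per CPU: an updating CPU links the cgroup and
 * its ancestors into its per-cpu "updated" tree under a per-cpu lock, and
 * the accumulated deltas are only folded into the global counters when a
 * flusher walks that tree.  The second half of this file implements the
 * basic cputime statistics on top of this machinery.
 */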

#include "cgroup-internal.h"

#include <linux/sched/cputime.h>

static DEFINE_MUTEX(cgroup_rstat_mutex);
static DEFINE_PER_CPU(raw_spinlock_t, cgroup_rstat_cpu_lock);

static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu);
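
/* return @cgrp's rstat_cpu structure for @cpu */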
static struct cgroup_rstat_cpu *cgroup_rstat_cpu(struct cgroup *cgrp, int cpu)
{
	return per_cpu_ptr(cgrp->rstat_cpu, cpu);
}

/**
 * cgroup_rstat_cpu_updated - keep track of updated rstat_cpu
 * @cgrp: target cgroup
 * @cpu: cpu on which rstat_cpu was updated
 *
 * @cgrp's rstat_cpu on @cpu was updated.  Put it on the parent's matching
 * rstat_cpu->updated_children list.  See the comment on top of
 * cgroup_rstat_cpu definition for details.
 */
static void cgroup_rstat_cpu_updated(struct cgroup *cgrp, int cpu)
{
	raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu);
	struct cgroup *parent;
	unsigned long flags;

	/*
	 * Speculative already-on-list test.  This may race leading to
	 * temporary inaccuracies, which is fine.
	 *
	 * Because @parent's updated_children is terminated with @parent
	 * instead of NULL, we can tell whether @cgrp is on the list by
	 * testing the next pointer for NULL.
	 */
	if (cgroup_rstat_cpu(cgrp, cpu)->updated_next)
		return;

	raw_spin_lock_irqsave(cpu_lock, flags);

	/* put @cgrp and all ancestors on the corresponding updated lists */
	for (parent = cgroup_parent(cgrp); parent;
	     cgrp = parent, parent = cgroup_parent(cgrp)) {
		struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu);
		struct cgroup_rstat_cpu *prstatc = cgroup_rstat_cpu(parent, cpu);

		/*
		 * Both additions and removals are bottom-up.  If a cgroup
		 * is already in the tree, all ancestors are.
		 */
		if (rstatc->updated_next)
			break;

		rstatc->updated_next = prstatc->updated_children;
		prstatc->updated_children = cgrp;
	}

	raw_spin_unlock_irqrestore(cpu_lock, flags);
}

/**
 * cgroup_rstat_cpu_pop_updated - iterate and dismantle rstat_cpu updated tree
 * @pos: current position
 * @root: root of the tree to traverse
 * @cpu: target cpu
 *
 * Walks the updated rstat_cpu tree on @cpu from @root.  A %NULL @pos starts
 * the traversal and a %NULL return indicates the end.  During traversal,
 * each returned cgroup is unlinked from the tree.  Must be called with the
 * matching cgroup_rstat_cpu_lock held.
 *
 * The only ordering guarantee is that, for a parent and a child pair
 * covered by a given traversal, if a child is visited, its parent is
 * guaranteed to be visited afterwards.
 */
static struct cgroup *cgroup_rstat_cpu_pop_updated(struct cgroup *pos,
						   struct cgroup *root, int cpu)
{
	struct cgroup_rstat_cpu *rstatc;
	struct cgroup *parent;

	if (pos == root)
		return NULL;

	/*
	 * We're going to walk down to the first leaf and visit/remove it.
	 * We can pick any unvisited node as the starting point.
	 */
	if (!pos)
		pos = root;
	else
		pos = cgroup_parent(pos);

	/* walk down to the first leaf */
	while (true) {
		rstatc = cgroup_rstat_cpu(pos, cpu);
		if (rstatc->updated_children == pos)
			break;
		pos = rstatc->updated_children;
	}

	/*
	 * Unlink @pos from the tree.  As the updated_children list is
	 * singly linked, we have to walk it to find the removal point.
	 * However, due to the way we traverse, @pos will be the first
	 * child in most cases.  The only exception is @root.
	 */
	parent = cgroup_parent(pos);
	if (parent && rstatc->updated_next) {
		struct cgroup_rstat_cpu *prstatc = cgroup_rstat_cpu(parent, cpu);
		struct cgroup_rstat_cpu *nrstatc;
		struct cgroup **nextp;

		nextp = &prstatc->updated_children;
		while (true) {
			nrstatc = cgroup_rstat_cpu(*nextp, cpu);
			if (*nextp == pos)
				break;

			WARN_ON_ONCE(*nextp == parent);
			nextp = &nrstatc->updated_next;
		}

		*nextp = rstatc->updated_next;
		rstatc->updated_next = NULL;
	}

	return pos;
}

/* see cgroup_rstat_flush() */
static void cgroup_rstat_flush_locked(struct cgroup *cgrp)
{
	int cpu;

	lockdep_assert_held(&cgroup_rstat_mutex);

	for_each_possible_cpu(cpu) {
		raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock,
						       cpu);
		struct cgroup *pos = NULL;

		raw_spin_lock_irq(cpu_lock);
		while ((pos = cgroup_rstat_cpu_pop_updated(pos, cgrp, cpu)))
			cgroup_base_stat_flush(pos, cpu);
		raw_spin_unlock_irq(cpu_lock);
	}
}

/**
 * cgroup_rstat_flush - flush stats in @cgrp's subtree
 * @cgrp: target cgroup
 *
 * Collect all per-cpu stats in @cgrp's subtree into the global counters
 * and propagate them upwards.  After this function returns, all cgroups in
 * the subtree have up-to-date ->stat.
 *
 * This also gets all cgroups in the subtree including @cgrp off the
 * ->updated_children lists.
 */
void cgroup_rstat_flush(struct cgroup *cgrp)
{
	mutex_lock(&cgroup_rstat_mutex);
	cgroup_rstat_flush_locked(cgrp);
	mutex_unlock(&cgroup_rstat_mutex);
}
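
/*
 * Set up @cgrp's per-cpu rstat state: allocate ->rstat_cpu unless it is
 * preallocated (the root cgroup), and initialize each CPU's
 * updated_children list to the self-terminated state.
 */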
int cgroup_rstat_init(struct cgroup *cgrp)
{
	int cpu;

	/* the root cgrp has rstat_cpu preallocated */
	if (!cgrp->rstat_cpu) {
		cgrp->rstat_cpu = alloc_percpu(struct cgroup_rstat_cpu);
		if (!cgrp->rstat_cpu)
			return -ENOMEM;
	}

	/* ->updated_children list is self-terminated */
	for_each_possible_cpu(cpu) {
		struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu);

		rstatc->updated_children = cgrp;
		u64_stats_init(&rstatc->bsync);
	}

	return 0;
}
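
/*
 * Flush any remaining per-cpu stats into the global counters and free
 * @cgrp's per-cpu rstat state.  Warns and bails out if @cgrp is still
 * linked on any CPU's updated tree after the flush.
 */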
void cgroup_rstat_exit(struct cgroup *cgrp)
{
	int cpu;

	cgroup_rstat_flush(cgrp);

	/* sanity check */
	for_each_possible_cpu(cpu) {
		struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu);

		if (WARN_ON_ONCE(rstatc->updated_children != cgrp) ||
		    WARN_ON_ONCE(rstatc->updated_next))
			return;
	}

	free_percpu(cgrp->rstat_cpu);
	cgrp->rstat_cpu = NULL;
}
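
/*
 * Boot-time initialization: set up the per-cpu locks and the rstat state
 * of the default hierarchy's root cgroup.
 */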
void __init cgroup_rstat_boot(void)
{
	int cpu;

	for_each_possible_cpu(cpu)
		raw_spin_lock_init(per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu));

	BUG_ON(cgroup_rstat_init(&cgrp_dfl_root.cgrp));
}

/*
 * Functions for cgroup basic resource statistics implemented on top of
 * rstat.
 */
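
/* add each cputime field of @src_bstat into @dst_bstat */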
static void cgroup_base_stat_accumulate(struct cgroup_base_stat *dst_bstat,
					struct cgroup_base_stat *src_bstat)
{
	dst_bstat->cputime.utime += src_bstat->cputime.utime;
	dst_bstat->cputime.stime += src_bstat->cputime.stime;
	dst_bstat->cputime.sum_exec_runtime += src_bstat->cputime.sum_exec_runtime;
}
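
/*
 * Fold @cgrp's per-cpu base stats for @cpu into the global counters:
 * snapshot the current per-cpu cputime, compute the delta against the
 * last flushed snapshot, fold in any pending contribution from
 * already-flushed descendants, and propagate the result into @cgrp's
 * ->bstat and the parent's ->pending_bstat.
 */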
static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu)
{
	struct cgroup *parent = cgroup_parent(cgrp);
	struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu);
	struct task_cputime *last_cputime = &rstatc->last_bstat.cputime;
	struct task_cputime cputime;
	struct cgroup_base_stat delta;
	unsigned seq;

	lockdep_assert_held(&cgroup_rstat_mutex);

	/* fetch the current per-cpu values */
	do {
		seq = __u64_stats_fetch_begin(&rstatc->bsync);
		cputime = rstatc->bstat.cputime;
	} while (__u64_stats_fetch_retry(&rstatc->bsync, seq));

	/* accumulate the deltas to propagate */
	delta.cputime.utime = cputime.utime - last_cputime->utime;
	delta.cputime.stime = cputime.stime - last_cputime->stime;
	delta.cputime.sum_exec_runtime = cputime.sum_exec_runtime -
					 last_cputime->sum_exec_runtime;
	*last_cputime = cputime;

	/* transfer the pending stat into delta */
	cgroup_base_stat_accumulate(&delta, &cgrp->pending_bstat);
	memset(&cgrp->pending_bstat, 0, sizeof(cgrp->pending_bstat));

	/* propagate delta into the global stat and the parent's pending */
	cgroup_base_stat_accumulate(&cgrp->bstat, &delta);
	if (parent)
		cgroup_base_stat_accumulate(&parent->pending_bstat, &delta);
}
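
/*
 * Start accounting on the local CPU: pin the CPU (get_cpu_ptr() disables
 * preemption) and open a u64_stats write section on this CPU's bstat.
 */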
static struct cgroup_rstat_cpu *
cgroup_base_stat_cputime_account_begin(struct cgroup *cgrp)
{
	struct cgroup_rstat_cpu *rstatc;

	rstatc = get_cpu_ptr(cgrp->rstat_cpu);
	u64_stats_update_begin(&rstatc->bsync);
	return rstatc;
}
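
/*
 * Close the u64_stats write section, mark @cgrp as updated on this CPU so
 * a later flush will pick the change up, and re-enable preemption.
 */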
static void cgroup_base_stat_cputime_account_end(struct cgroup *cgrp,
						 struct cgroup_rstat_cpu *rstatc)
{
	u64_stats_update_end(&rstatc->bsync);
	cgroup_rstat_cpu_updated(cgrp, smp_processor_id());
	put_cpu_ptr(rstatc);
}
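
/* charge @delta_exec of CPU time to @cgrp's per-cpu sum_exec_runtime */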
void __cgroup_account_cputime(struct cgroup *cgrp, u64 delta_exec)
{
	struct cgroup_rstat_cpu *rstatc;

	rstatc = cgroup_base_stat_cputime_account_begin(cgrp);
	rstatc->bstat.cputime.sum_exec_runtime += delta_exec;
	cgroup_base_stat_cputime_account_end(cgrp, rstatc);
}
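
/*
 * Charge @delta_exec to the utime or stime bucket of @cgrp's per-cpu stats
 * depending on @index: USER/NICE count as user time, SYSTEM/IRQ/SOFTIRQ as
 * system time, everything else is ignored.
 */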
void __cgroup_account_cputime_field(struct cgroup *cgrp,
				    enum cpu_usage_stat index, u64 delta_exec)
{
	struct cgroup_rstat_cpu *rstatc;

	rstatc = cgroup_base_stat_cputime_account_begin(cgrp);

	switch (index) {
	case CPUTIME_USER:
	case CPUTIME_NICE:
		rstatc->bstat.cputime.utime += delta_exec;
		break;
	case CPUTIME_SYSTEM:
	case CPUTIME_IRQ:
	case CPUTIME_SOFTIRQ:
		rstatc->bstat.cputime.stime += delta_exec;
		break;
	default:
		break;
	}

	cgroup_base_stat_cputime_account_end(cgrp, rstatc);
}
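
/*
 * Show the accumulated base cputime of the cgroup behind @seq as
 * usage_usec/user_usec/system_usec.  Flushes the rstat tree first so the
 * numbers are up to date; the root cgroup (no parent) is skipped.
 */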
void cgroup_base_stat_cputime_show(struct seq_file *seq)
{
	struct cgroup *cgrp = seq_css(seq)->cgroup;
	u64 usage, utime, stime;

	if (!cgroup_parent(cgrp))
		return;

	mutex_lock(&cgroup_rstat_mutex);
	cgroup_rstat_flush_locked(cgrp);
	usage = cgrp->bstat.cputime.sum_exec_runtime;
	cputime_adjust(&cgrp->bstat.cputime, &cgrp->prev_cputime, &utime, &stime);
	mutex_unlock(&cgroup_rstat_mutex);

	do_div(usage, NSEC_PER_USEC);
	do_div(utime, NSEC_PER_USEC);
	do_div(stime, NSEC_PER_USEC);

	seq_printf(seq, "usage_usec %llu\n"
		   "user_usec %llu\n"
		   "system_usec %llu\n",
		   usage, utime, stime);
}