cgroup-v1.c

#include "cgroup-internal.h"

#include <linux/ctype.h>
#include <linux/kmod.h>
#include <linux/sort.h>
#include <linux/delay.h>
#include <linux/mm.h>
#include <linux/sched/signal.h>
#include <linux/sched/task.h>
#include <linux/magic.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/delayacct.h>
#include <linux/pid_namespace.h>
#include <linux/cgroupstats.h>

#include <trace/events/cgroup.h>

/*
 * pidlists linger the following amount before being destroyed.  The goal
 * is avoiding frequent destruction in the middle of consecutive read calls.
 * Expiring in the middle is a performance problem, not a correctness one.
 * 1 sec should be enough.
 */
#define CGROUP_PIDLIST_DESTROY_DELAY	HZ

/* Controllers blocked by the commandline in v1 */
static u16 cgroup_no_v1_mask;

/*
 * pidlist destructions need to be flushed on cgroup destruction.  Use a
 * separate workqueue as flush domain.
 */
static struct workqueue_struct *cgroup_pidlist_destroy_wq;

/*
 * Protects cgroup_subsys->release_agent_path.  Modifying it also requires
 * cgroup_mutex.  Reading requires either cgroup_mutex or this spinlock.
 */
static DEFINE_SPINLOCK(release_agent_path_lock);

bool cgroup1_ssid_disabled(int ssid)
{
	return cgroup_no_v1_mask & (1 << ssid);
}

/**
 * cgroup_attach_task_all - attach task 'tsk' to all cgroups of task 'from'
 * @from: attach to all cgroups of a given task
 * @tsk: the task to be attached
 */
int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
{
	struct cgroup_root *root;
	int retval = 0;

	mutex_lock(&cgroup_mutex);
	percpu_down_write(&cgroup_threadgroup_rwsem);
	for_each_root(root) {
		struct cgroup *from_cgrp;

		if (root == &cgrp_dfl_root)
			continue;

		spin_lock_irq(&css_set_lock);
		from_cgrp = task_cgroup_from_root(from, root);
		spin_unlock_irq(&css_set_lock);

		retval = cgroup_attach_task(from_cgrp, tsk, false);
		if (retval)
			break;
	}
	percpu_up_write(&cgroup_threadgroup_rwsem);
	mutex_unlock(&cgroup_mutex);

	return retval;
}
EXPORT_SYMBOL_GPL(cgroup_attach_task_all);
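
/*
 * For illustration only (a hypothetical caller, not from this file): a
 * module that spawns a helper task and wants it accounted alongside its
 * creator in every mounted v1 hierarchy could do
 *
 *	ret = cgroup_attach_task_all(current, helper_task);
 *
 * where "helper_task" is whatever task_struct the caller just created.
 */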

/**
 * cgroup_transfer_tasks - move tasks from one cgroup to another
 * @to: cgroup to which the tasks will be moved
 * @from: cgroup in which the tasks currently reside
 *
 * Locking rules between cgroup_post_fork() and the migration path
 * guarantee that, if a task is forking while being migrated, the new child
 * is guaranteed to be either visible in the source cgroup after the
 * parent's migration is complete or put into the target cgroup.  No task
 * can slip out of migration through forking.
 */
int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
{
	DEFINE_CGROUP_MGCTX(mgctx);
	struct cgrp_cset_link *link;
	struct css_task_iter it;
	struct task_struct *task;
	int ret;

	if (cgroup_on_dfl(to))
		return -EINVAL;

	if (!cgroup_may_migrate_to(to))
		return -EBUSY;

	mutex_lock(&cgroup_mutex);
	percpu_down_write(&cgroup_threadgroup_rwsem);

	/* all tasks in @from are being moved, all csets are source */
	spin_lock_irq(&css_set_lock);
	list_for_each_entry(link, &from->cset_links, cset_link)
		cgroup_migrate_add_src(link->cset, to, &mgctx);
	spin_unlock_irq(&css_set_lock);

	ret = cgroup_migrate_prepare_dst(&mgctx);
	if (ret)
		goto out_err;

	/*
	 * Migrate tasks one-by-one until @from is empty.  This fails iff
	 * ->can_attach() fails.
	 */
	do {
		css_task_iter_start(&from->self, &it);
		task = css_task_iter_next(&it);
		if (task)
			get_task_struct(task);
		css_task_iter_end(&it);

		if (task) {
			ret = cgroup_migrate(task, false, &mgctx);
			if (!ret)
				trace_cgroup_transfer_tasks(to, task, false);
			put_task_struct(task);
		}
	} while (task && !ret);
out_err:
	cgroup_migrate_finish(&mgctx);
	percpu_up_write(&cgroup_threadgroup_rwsem);
	mutex_unlock(&cgroup_mutex);
	return ret;
}

/*
 * Stuff for reading the 'tasks'/'procs' files.
 *
 * Reading this file can return large amounts of data if a cgroup has
 * *lots* of attached tasks.  So it may need several calls to read(),
 * but we cannot guarantee that the information we produce is correct
 * unless we produce it entirely atomically.
 */

/* which pidlist file are we talking about? */
enum cgroup_filetype {
	CGROUP_FILE_PROCS,
	CGROUP_FILE_TASKS,
};

/*
 * A pidlist is a list of pids that virtually represents the contents of one
 * of the cgroup files ("procs" or "tasks").  We keep a list of such pidlists,
 * a pair (one each for procs, tasks) for each pid namespace that's relevant
 * to the cgroup.
 */
struct cgroup_pidlist {
	/*
	 * used to find which pidlist is wanted.  doesn't change as long as
	 * this particular list stays in the list.
	 */
	struct { enum cgroup_filetype type; struct pid_namespace *ns; } key;
	/* array of xids */
	pid_t *list;
	/* how many elements the above list has */
	int length;
	/* each of these stored in a list by its cgroup */
	struct list_head links;
	/* pointer to the cgroup we belong to, for list removal purposes */
	struct cgroup *owner;
	/* for delayed destruction */
	struct delayed_work destroy_dwork;
};

/*
 * The following two functions "fix" the issue where there are more pids
 * than kmalloc will give memory for; in such cases, we use vmalloc/vfree.
 * TODO: replace with a kernel-wide solution to this problem
 */
#define PIDLIST_TOO_LARGE(c) ((c) * sizeof(pid_t) > (PAGE_SIZE * 2))
static void *pidlist_allocate(int count)
{
	if (PIDLIST_TOO_LARGE(count))
		return vmalloc(count * sizeof(pid_t));
	else
		return kmalloc(count * sizeof(pid_t), GFP_KERNEL);
}

static void pidlist_free(void *p)
{
	kvfree(p);
}

/*
 * Used to destroy all pidlists lingering waiting for destroy timer.  None
 * should be left afterwards.
 */
void cgroup1_pidlist_destroy_all(struct cgroup *cgrp)
{
	struct cgroup_pidlist *l, *tmp_l;

	mutex_lock(&cgrp->pidlist_mutex);
	list_for_each_entry_safe(l, tmp_l, &cgrp->pidlists, links)
		mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork, 0);
	mutex_unlock(&cgrp->pidlist_mutex);

	flush_workqueue(cgroup_pidlist_destroy_wq);
	BUG_ON(!list_empty(&cgrp->pidlists));
}

static void cgroup_pidlist_destroy_work_fn(struct work_struct *work)
{
	struct delayed_work *dwork = to_delayed_work(work);
	struct cgroup_pidlist *l = container_of(dwork, struct cgroup_pidlist,
						destroy_dwork);
	struct cgroup_pidlist *tofree = NULL;

	mutex_lock(&l->owner->pidlist_mutex);

	/*
	 * Destroy iff we didn't get queued again.  The state won't change
	 * as destroy_dwork can only be queued while locked.
	 */
	if (!delayed_work_pending(dwork)) {
		list_del(&l->links);
		pidlist_free(l->list);
		put_pid_ns(l->key.ns);
		tofree = l;
	}

	mutex_unlock(&l->owner->pidlist_mutex);
	kfree(tofree);
}

/*
 * pidlist_uniq - given a kmalloc()ed list, strip out all duplicate entries
 * Returns the number of unique elements.
 */
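/*
 * Worked example (editorial note, not from the original source): a sorted
 * input of {1, 1, 2, 3, 3} with length 5 is compacted in place to
 * {1, 2, 3, ...} and the function returns 3.
 */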
static int pidlist_uniq(pid_t *list, int length)
{
	int src, dest = 1;

	/*
	 * we presume the 0th element is unique, so i starts at 1.  trivial
	 * edge cases first; no work needs to be done for either
	 */
	if (length == 0 || length == 1)
		return length;
	/* src and dest walk down the list; dest counts unique elements */
	for (src = 1; src < length; src++) {
		/* find next unique element */
		while (list[src] == list[src-1]) {
			src++;
			if (src == length)
				goto after;
		}
		/* dest always points to where the next unique element goes */
		list[dest] = list[src];
		dest++;
	}
after:
	return dest;
}

/*
 * The two pid files - task and cgroup.procs - guaranteed that the result
 * is sorted, which forced this whole pidlist fiasco.  As pid order is
 * different per namespace, each namespace needs differently sorted list,
 * making it impossible to use, for example, single rbtree of member tasks
 * sorted by task pointer.  As pidlists can be fairly large, allocating one
 * per open file is dangerous, so cgroup had to implement shared pool of
 * pidlists keyed by cgroup and namespace.
 */
static int cmppid(const void *a, const void *b)
{
	return *(pid_t *)a - *(pid_t *)b;
}

static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
						  enum cgroup_filetype type)
{
	struct cgroup_pidlist *l;
	/* don't need task_nsproxy() if we're looking at ourself */
	struct pid_namespace *ns = task_active_pid_ns(current);

	lockdep_assert_held(&cgrp->pidlist_mutex);

	list_for_each_entry(l, &cgrp->pidlists, links)
		if (l->key.type == type && l->key.ns == ns)
			return l;
	return NULL;
}

/*
 * find the appropriate pidlist for our purpose (given procs vs tasks)
 * returns with the lock on that pidlist already held, and takes care
 * of the use count, or returns NULL with no locks held if we're out of
 * memory.
 */
static struct cgroup_pidlist *cgroup_pidlist_find_create(struct cgroup *cgrp,
						enum cgroup_filetype type)
{
	struct cgroup_pidlist *l;

	lockdep_assert_held(&cgrp->pidlist_mutex);

	l = cgroup_pidlist_find(cgrp, type);
	if (l)
		return l;

	/* entry not found; create a new one */
	l = kzalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL);
	if (!l)
		return l;

	INIT_DELAYED_WORK(&l->destroy_dwork, cgroup_pidlist_destroy_work_fn);
	l->key.type = type;
	/* don't need task_nsproxy() if we're looking at ourself */
	l->key.ns = get_pid_ns(task_active_pid_ns(current));
	l->owner = cgrp;
	list_add(&l->links, &cgrp->pidlists);
	return l;
}

/**
 * cgroup_task_count - count the number of tasks in a cgroup.
 * @cgrp: the cgroup in question
 *
 * Return the number of tasks in the cgroup.  The returned number can be
 * higher than the actual number of tasks due to css_set references from
 * namespace roots and temporary usages.
 */
static int cgroup_task_count(const struct cgroup *cgrp)
{
	int count = 0;
	struct cgrp_cset_link *link;

	spin_lock_irq(&css_set_lock);
	list_for_each_entry(link, &cgrp->cset_links, cset_link)
		count += atomic_read(&link->cset->refcount);
	spin_unlock_irq(&css_set_lock);
	return count;
}

/*
 * Load a cgroup's pidarray with either procs' tgids or tasks' pids
 */
static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
			      struct cgroup_pidlist **lp)
{
	pid_t *array;
	int length;
	int pid, n = 0; /* used for populating the array */
	struct css_task_iter it;
	struct task_struct *tsk;
	struct cgroup_pidlist *l;

	lockdep_assert_held(&cgrp->pidlist_mutex);

	/*
	 * If cgroup gets more users after we read count, we won't have
	 * enough space - tough.  This race is indistinguishable to the
	 * caller from the case that the additional cgroup users didn't
	 * show up until sometime later on.
	 */
	length = cgroup_task_count(cgrp);
	array = pidlist_allocate(length);
	if (!array)
		return -ENOMEM;
	/* now, populate the array */
	css_task_iter_start(&cgrp->self, &it);
	while ((tsk = css_task_iter_next(&it))) {
		if (unlikely(n == length))
			break;
		/* get tgid or pid for procs or tasks file respectively */
		if (type == CGROUP_FILE_PROCS)
			pid = task_tgid_vnr(tsk);
		else
			pid = task_pid_vnr(tsk);
		if (pid > 0) /* make sure to only use valid results */
			array[n++] = pid;
	}
	css_task_iter_end(&it);
	length = n;
	/* now sort & (if procs) strip out duplicates */
	sort(array, length, sizeof(pid_t), cmppid, NULL);
	if (type == CGROUP_FILE_PROCS)
		length = pidlist_uniq(array, length);

	l = cgroup_pidlist_find_create(cgrp, type);
	if (!l) {
		pidlist_free(array);
		return -ENOMEM;
	}

	/* store array, freeing old if necessary */
	pidlist_free(l->list);
	l->list = array;
	l->length = length;
	*lp = l;
	return 0;
}
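
/*
 * For illustration (editorial note): in a cgroup whose only member process
 * has tgid 100 and two threads with pids 100 and 101, loading the
 * CGROUP_FILE_PROCS pidlist yields list = {100} (tgids, duplicates
 * stripped), while CGROUP_FILE_TASKS yields list = {100, 101}, one entry
 * per thread.
 */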

/*
 * seq_file methods for the tasks/procs files.  The seq_file position is the
 * next pid to display; the seq_file iterator is a pointer to the pid
 * in the cgroup->l->list array.
 */
static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos)
{
	/*
	 * Initially we receive a position value that corresponds to
	 * one more than the last pid shown (or 0 on the first call or
	 * after a seek to the start).  Use a binary-search to find the
	 * next pid to display, if any
	 */
	struct kernfs_open_file *of = s->private;
	struct cgroup *cgrp = seq_css(s)->cgroup;
	struct cgroup_pidlist *l;
	enum cgroup_filetype type = seq_cft(s)->private;
	int index = 0, pid = *pos;
	int *iter, ret;

	mutex_lock(&cgrp->pidlist_mutex);

	/*
	 * !NULL @of->priv indicates that this isn't the first start()
	 * after open.  If the matching pidlist is around, we can use that.
	 * Look for it.  Note that @of->priv can't be used directly.  It
	 * could already have been destroyed.
	 */
	if (of->priv)
		of->priv = cgroup_pidlist_find(cgrp, type);

	/*
	 * Either this is the first start() after open or the matching
	 * pidlist has been destroyed in between.  Create a new one.
	 */
	if (!of->priv) {
		ret = pidlist_array_load(cgrp, type,
					 (struct cgroup_pidlist **)&of->priv);
		if (ret)
			return ERR_PTR(ret);
	}
	l = of->priv;

	if (pid) {
		int end = l->length;

		while (index < end) {
			int mid = (index + end) / 2;
			if (l->list[mid] == pid) {
				index = mid;
				break;
			} else if (l->list[mid] <= pid)
				index = mid + 1;
			else
				end = mid;
		}
	}
	/* If we're off the end of the array, we're done */
	if (index >= l->length)
		return NULL;
	/* Update the abstract position to be the actual pid that we found */
	iter = l->list + index;
	*pos = *iter;
	return iter;
}

static void cgroup_pidlist_stop(struct seq_file *s, void *v)
{
	struct kernfs_open_file *of = s->private;
	struct cgroup_pidlist *l = of->priv;

	if (l)
		mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork,
				 CGROUP_PIDLIST_DESTROY_DELAY);
	mutex_unlock(&seq_css(s)->cgroup->pidlist_mutex);
}

static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos)
{
	struct kernfs_open_file *of = s->private;
	struct cgroup_pidlist *l = of->priv;
	pid_t *p = v;
	pid_t *end = l->list + l->length;

	/*
	 * Advance to the next pid in the array.  If this goes off the
	 * end, we're done
	 */
	p++;
	if (p >= end) {
		return NULL;
	} else {
		*pos = *p;
		return p;
	}
}

static int cgroup_pidlist_show(struct seq_file *s, void *v)
{
	seq_printf(s, "%d\n", *(int *)v);
	return 0;
}

static ssize_t cgroup_tasks_write(struct kernfs_open_file *of,
				  char *buf, size_t nbytes, loff_t off)
{
	return __cgroup_procs_write(of, buf, nbytes, off, false);
}

static ssize_t cgroup_release_agent_write(struct kernfs_open_file *of,
					  char *buf, size_t nbytes, loff_t off)
{
	struct cgroup *cgrp;

	BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX);

	cgrp = cgroup_kn_lock_live(of->kn, false);
	if (!cgrp)
		return -ENODEV;
	spin_lock(&release_agent_path_lock);
	strlcpy(cgrp->root->release_agent_path, strstrip(buf),
		sizeof(cgrp->root->release_agent_path));
	spin_unlock(&release_agent_path_lock);
	cgroup_kn_unlock(of->kn);
	return nbytes;
}

static int cgroup_release_agent_show(struct seq_file *seq, void *v)
{
	struct cgroup *cgrp = seq_css(seq)->cgroup;

	spin_lock(&release_agent_path_lock);
	seq_puts(seq, cgrp->root->release_agent_path);
	spin_unlock(&release_agent_path_lock);
	seq_putc(seq, '\n');
	return 0;
}

static int cgroup_sane_behavior_show(struct seq_file *seq, void *v)
{
	seq_puts(seq, "0\n");
	return 0;
}

static u64 cgroup_read_notify_on_release(struct cgroup_subsys_state *css,
					 struct cftype *cft)
{
	return notify_on_release(css->cgroup);
}

static int cgroup_write_notify_on_release(struct cgroup_subsys_state *css,
					  struct cftype *cft, u64 val)
{
	if (val)
		set_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags);
	else
		clear_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags);
	return 0;
}

static u64 cgroup_clone_children_read(struct cgroup_subsys_state *css,
				      struct cftype *cft)
{
	return test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
}

static int cgroup_clone_children_write(struct cgroup_subsys_state *css,
				       struct cftype *cft, u64 val)
{
	if (val)
		set_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
	else
		clear_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
	return 0;
}

/* cgroup core interface files for the legacy hierarchies */
struct cftype cgroup1_base_files[] = {
	{
		.name = "cgroup.procs",
		.seq_start = cgroup_pidlist_start,
		.seq_next = cgroup_pidlist_next,
		.seq_stop = cgroup_pidlist_stop,
		.seq_show = cgroup_pidlist_show,
		.private = CGROUP_FILE_PROCS,
		.write = cgroup_procs_write,
	},
	{
		.name = "cgroup.clone_children",
		.read_u64 = cgroup_clone_children_read,
		.write_u64 = cgroup_clone_children_write,
	},
	{
		.name = "cgroup.sane_behavior",
		.flags = CFTYPE_ONLY_ON_ROOT,
		.seq_show = cgroup_sane_behavior_show,
	},
	{
		.name = "tasks",
		.seq_start = cgroup_pidlist_start,
		.seq_next = cgroup_pidlist_next,
		.seq_stop = cgroup_pidlist_stop,
		.seq_show = cgroup_pidlist_show,
		.private = CGROUP_FILE_TASKS,
		.write = cgroup_tasks_write,
	},
	{
		.name = "notify_on_release",
		.read_u64 = cgroup_read_notify_on_release,
		.write_u64 = cgroup_write_notify_on_release,
	},
	{
		.name = "release_agent",
		.flags = CFTYPE_ONLY_ON_ROOT,
		.seq_show = cgroup_release_agent_show,
		.write = cgroup_release_agent_write,
		.max_write_len = PATH_MAX - 1,
	},
	{ }	/* terminate */
};
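
/*
 * For illustration (editorial note): from userspace these cftypes appear as
 * files in every v1 hierarchy directory.  Assuming a hierarchy mounted at
 * /sys/fs/cgroup/cpu (a hypothetical mount point):
 *
 *	cat /sys/fs/cgroup/cpu/tasks			# one pid per line
 *	echo 1234 > /sys/fs/cgroup/cpu/cgroup.procs	# move a process in
 *
 * "release_agent" and "cgroup.sane_behavior" carry CFTYPE_ONLY_ON_ROOT and
 * therefore exist only in the hierarchy root directory.
 */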

/* Display information about each subsystem and each hierarchy */
static int proc_cgroupstats_show(struct seq_file *m, void *v)
{
	struct cgroup_subsys *ss;
	int i;

	seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n");
	/*
	 * ideally we don't want subsystems moving around while we do this.
	 * cgroup_mutex is also necessary to guarantee an atomic snapshot of
	 * subsys/hierarchy state.
	 */
	mutex_lock(&cgroup_mutex);

	for_each_subsys(ss, i)
		seq_printf(m, "%s\t%d\t%d\t%d\n",
			   ss->legacy_name, ss->root->hierarchy_id,
			   atomic_read(&ss->root->nr_cgrps),
			   cgroup_ssid_enabled(i));

	mutex_unlock(&cgroup_mutex);
	return 0;
}

static int cgroupstats_open(struct inode *inode, struct file *file)
{
	return single_open(file, proc_cgroupstats_show, NULL);
}

const struct file_operations proc_cgroupstats_operations = {
	.open = cgroupstats_open,
	.read = seq_read,
	.llseek = seq_lseek,
	.release = single_release,
};

/**
 * cgroupstats_build - build and fill cgroupstats
 * @stats: cgroupstats to fill information into
 * @dentry: A dentry entry belonging to the cgroup for which stats have
 * been requested.
 *
 * Build and fill cgroupstats so that taskstats can export it to user
 * space.
 */
int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
{
	struct kernfs_node *kn = kernfs_node_from_dentry(dentry);
	struct cgroup *cgrp;
	struct css_task_iter it;
	struct task_struct *tsk;

	/* it should be kernfs_node belonging to cgroupfs and is a directory */
	if (dentry->d_sb->s_type != &cgroup_fs_type || !kn ||
	    kernfs_type(kn) != KERNFS_DIR)
		return -EINVAL;

	mutex_lock(&cgroup_mutex);

	/*
	 * We aren't being called from kernfs and there's no guarantee on
	 * @kn->priv's validity.  For this and css_tryget_online_from_dir(),
	 * @kn->priv is RCU safe.  Let's do the RCU dancing.
	 */
	rcu_read_lock();
	cgrp = rcu_dereference(*(void __rcu __force **)&kn->priv);
	if (!cgrp || cgroup_is_dead(cgrp)) {
		rcu_read_unlock();
		mutex_unlock(&cgroup_mutex);
		return -ENOENT;
	}
	rcu_read_unlock();

	css_task_iter_start(&cgrp->self, &it);
	while ((tsk = css_task_iter_next(&it))) {
		switch (tsk->state) {
		case TASK_RUNNING:
			stats->nr_running++;
			break;
		case TASK_INTERRUPTIBLE:
			stats->nr_sleeping++;
			break;
		case TASK_UNINTERRUPTIBLE:
			stats->nr_uninterruptible++;
			break;
		case TASK_STOPPED:
			stats->nr_stopped++;
			break;
		default:
			if (delayacct_is_task_waiting_on_io(tsk))
				stats->nr_io_wait++;
			break;
		}
	}
	css_task_iter_end(&it);

	mutex_unlock(&cgroup_mutex);
	return 0;
}

void cgroup1_check_for_release(struct cgroup *cgrp)
{
	if (notify_on_release(cgrp) && !cgroup_is_populated(cgrp) &&
	    !css_has_online_children(&cgrp->self) && !cgroup_is_dead(cgrp))
		schedule_work(&cgrp->release_agent_work);
}

/*
 * Notify userspace when a cgroup is released, by running the
 * configured release agent with the name of the cgroup (path
 * relative to the root of cgroup file system) as the argument.
 *
 * Most likely, this user command will try to rmdir this cgroup.
 *
 * This races with the possibility that some other task will be
 * attached to this cgroup before it is removed, or that some other
 * user task will 'mkdir' a child cgroup of this cgroup.  That's ok.
 * The presumed 'rmdir' will fail quietly if this cgroup is no longer
 * unused, and this cgroup will be reprieved from its death sentence,
 * to continue to serve a useful existence.  Next time it's released,
 * we will get notified again, if it still has 'notify_on_release' set.
 *
 * The final arg to call_usermodehelper() is UMH_WAIT_EXEC, which
 * means only wait until the task is successfully execve()'d.  The
 * separate release agent task is forked by call_usermodehelper(),
 * then control in this thread returns here, without waiting for the
 * release agent task.  We don't bother to wait because the caller of
 * this routine has no use for the exit status of the release agent
 * task, so no sense holding our caller up for that.
 */
void cgroup1_release_agent(struct work_struct *work)
{
	struct cgroup *cgrp =
		container_of(work, struct cgroup, release_agent_work);
	char *pathbuf = NULL, *agentbuf = NULL;
	char *argv[3], *envp[3];
	int ret;

	mutex_lock(&cgroup_mutex);

	pathbuf = kmalloc(PATH_MAX, GFP_KERNEL);
	agentbuf = kstrdup(cgrp->root->release_agent_path, GFP_KERNEL);
	if (!pathbuf || !agentbuf)
		goto out;

	spin_lock_irq(&css_set_lock);
	ret = cgroup_path_ns_locked(cgrp, pathbuf, PATH_MAX, &init_cgroup_ns);
	spin_unlock_irq(&css_set_lock);
	if (ret < 0 || ret >= PATH_MAX)
		goto out;

	argv[0] = agentbuf;
	argv[1] = pathbuf;
	argv[2] = NULL;

	/* minimal command environment */
	envp[0] = "HOME=/";
	envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
	envp[2] = NULL;

	mutex_unlock(&cgroup_mutex);
	call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
	goto out_free;
out:
	mutex_unlock(&cgroup_mutex);
out_free:
	kfree(agentbuf);
	kfree(pathbuf);
}
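
/*
 * For illustration (editorial note): with release_agent set to
 * "/sbin/my_agent" (a hypothetical path) and a released cgroup at
 * "/jobs/batch1" relative to the hierarchy root, the helper is invoked
 * roughly as if userspace ran:
 *
 *	HOME=/ PATH=/sbin:/bin:/usr/sbin:/usr/bin /sbin/my_agent /jobs/batch1
 */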

/*
 * cgroup_rename - Only allow simple rename of directories in place.
 */
static int cgroup1_rename(struct kernfs_node *kn, struct kernfs_node *new_parent,
			  const char *new_name_str)
{
	struct cgroup *cgrp = kn->priv;
	int ret;

	if (kernfs_type(kn) != KERNFS_DIR)
		return -ENOTDIR;
	if (kn->parent != new_parent)
		return -EIO;

	/*
	 * We're gonna grab cgroup_mutex which nests outside kernfs
	 * active_ref.  kernfs_rename() doesn't require active_ref
	 * protection.  Break them before grabbing cgroup_mutex.
	 */
	kernfs_break_active_protection(new_parent);
	kernfs_break_active_protection(kn);

	mutex_lock(&cgroup_mutex);

	ret = kernfs_rename(kn, new_parent, new_name_str);
	if (!ret)
		trace_cgroup_rename(cgrp);

	mutex_unlock(&cgroup_mutex);

	kernfs_unbreak_active_protection(kn);
	kernfs_unbreak_active_protection(new_parent);
	return ret;
}

static int cgroup1_show_options(struct seq_file *seq, struct kernfs_root *kf_root)
{
	struct cgroup_root *root = cgroup_root_from_kf(kf_root);
	struct cgroup_subsys *ss;
	int ssid;

	for_each_subsys(ss, ssid)
		if (root->subsys_mask & (1 << ssid))
			seq_show_option(seq, ss->legacy_name, NULL);
	if (root->flags & CGRP_ROOT_NOPREFIX)
		seq_puts(seq, ",noprefix");
	if (root->flags & CGRP_ROOT_XATTR)
		seq_puts(seq, ",xattr");

	spin_lock(&release_agent_path_lock);
	if (strlen(root->release_agent_path))
		seq_show_option(seq, "release_agent",
				root->release_agent_path);
	spin_unlock(&release_agent_path_lock);

	if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags))
		seq_puts(seq, ",clone_children");
	if (strlen(root->name))
		seq_show_option(seq, "name", root->name);
	return 0;
}
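
/*
 * For illustration (editorial note): a cpuset-only hierarchy mounted with
 * noprefix, clone_children and name=legacy would show up in /proc/mounts
 * with options along the lines of
 * "cpuset,noprefix,clone_children,name=legacy"; the exact ordering follows
 * the sequence of checks above.
 */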

static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
{
	char *token, *o = data;
	bool all_ss = false, one_ss = false;
	u16 mask = U16_MAX;
	struct cgroup_subsys *ss;
	int nr_opts = 0;
	int i;

#ifdef CONFIG_CPUSETS
	mask = ~((u16)1 << cpuset_cgrp_id);
#endif

	memset(opts, 0, sizeof(*opts));

	while ((token = strsep(&o, ",")) != NULL) {
		nr_opts++;

		if (!*token)
			return -EINVAL;
		if (!strcmp(token, "none")) {
			/* Explicitly have no subsystems */
			opts->none = true;
			continue;
		}
		if (!strcmp(token, "all")) {
			/* Mutually exclusive option 'all' + subsystem name */
			if (one_ss)
				return -EINVAL;
			all_ss = true;
			continue;
		}
		if (!strcmp(token, "noprefix")) {
			opts->flags |= CGRP_ROOT_NOPREFIX;
			continue;
		}
		if (!strcmp(token, "clone_children")) {
			opts->cpuset_clone_children = true;
			continue;
		}
		if (!strcmp(token, "xattr")) {
			opts->flags |= CGRP_ROOT_XATTR;
			continue;
		}
		if (!strncmp(token, "release_agent=", 14)) {
			/* Specifying two release agents is forbidden */
			if (opts->release_agent)
				return -EINVAL;
			opts->release_agent =
				kstrndup(token + 14, PATH_MAX - 1, GFP_KERNEL);
			if (!opts->release_agent)
				return -ENOMEM;
			continue;
		}
		if (!strncmp(token, "name=", 5)) {
			const char *name = token + 5;

			/* Can't specify an empty name */
			if (!strlen(name))
				return -EINVAL;
			/* Must match [\w.-]+ */
			for (i = 0; i < strlen(name); i++) {
				char c = name[i];
				if (isalnum(c))
					continue;
				if ((c == '.') || (c == '-') || (c == '_'))
					continue;
				return -EINVAL;
			}
			/* Specifying two names is forbidden */
			if (opts->name)
				return -EINVAL;
			opts->name = kstrndup(name,
					      MAX_CGROUP_ROOT_NAMELEN - 1,
					      GFP_KERNEL);
			if (!opts->name)
				return -ENOMEM;

			continue;
		}

		for_each_subsys(ss, i) {
			if (strcmp(token, ss->legacy_name))
				continue;
			if (!cgroup_ssid_enabled(i))
				continue;
			if (cgroup1_ssid_disabled(i))
				continue;

			/* Mutually exclusive option 'all' + subsystem name */
			if (all_ss)
				return -EINVAL;
			opts->subsys_mask |= (1 << i);
			one_ss = true;

			break;
		}
		if (i == CGROUP_SUBSYS_COUNT)
			return -ENOENT;
	}

	/*
	 * If the 'all' option was specified select all the subsystems,
	 * otherwise if 'none', 'name=' and a subsystem name options were
	 * not specified, let's default to 'all'
	 */
	if (all_ss || (!one_ss && !opts->none && !opts->name))
		for_each_subsys(ss, i)
			if (cgroup_ssid_enabled(i) && !cgroup1_ssid_disabled(i))
				opts->subsys_mask |= (1 << i);

	/*
	 * We either have to specify by name or by subsystems. (So all
	 * empty hierarchies must have a name).
	 */
	if (!opts->subsys_mask && !opts->name)
		return -EINVAL;

	/*
	 * Option noprefix was introduced just for backward compatibility
	 * with the old cpuset, so we allow noprefix only if mounting just
	 * the cpuset subsystem.
	 */
	if ((opts->flags & CGRP_ROOT_NOPREFIX) && (opts->subsys_mask & mask))
		return -EINVAL;

	/* Can't specify "none" and some subsystems */
	if (opts->subsys_mask && opts->none)
		return -EINVAL;

	return 0;
}
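
/*
 * For illustration (editorial note), assuming the named controllers are
 * built in and not blocked by cgroup_no_v1: mount data "cpu,cpuacct,name=legacy"
 * selects the cpu and cpuacct subsystems and sets opts->name to "legacy",
 * while "none,name=systemd" yields a named hierarchy with no controllers.
 * Passing "all", or no controller options at all, selects every enabled
 * v1 controller.
 */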

static int cgroup1_remount(struct kernfs_root *kf_root, int *flags, char *data)
{
	int ret = 0;
	struct cgroup_root *root = cgroup_root_from_kf(kf_root);
	struct cgroup_sb_opts opts;
	u16 added_mask, removed_mask;

	cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp);

	/* See what subsystems are wanted */
	ret = parse_cgroupfs_options(data, &opts);
	if (ret)
		goto out_unlock;

	if (opts.subsys_mask != root->subsys_mask || opts.release_agent)
		pr_warn("option changes via remount are deprecated (pid=%d comm=%s)\n",
			task_tgid_nr(current), current->comm);

	added_mask = opts.subsys_mask & ~root->subsys_mask;
	removed_mask = root->subsys_mask & ~opts.subsys_mask;

	/* Don't allow flags or name to change at remount */
	if ((opts.flags ^ root->flags) ||
	    (opts.name && strcmp(opts.name, root->name))) {
		pr_err("option or name mismatch, new: 0x%x \"%s\", old: 0x%x \"%s\"\n",
		       opts.flags, opts.name ?: "", root->flags, root->name);
		ret = -EINVAL;
		goto out_unlock;
	}

	/* remounting is not allowed for populated hierarchies */
	if (!list_empty(&root->cgrp.self.children)) {
		ret = -EBUSY;
		goto out_unlock;
	}

	ret = rebind_subsystems(root, added_mask);
	if (ret)
		goto out_unlock;

	WARN_ON(rebind_subsystems(&cgrp_dfl_root, removed_mask));

	if (opts.release_agent) {
		spin_lock(&release_agent_path_lock);
		strcpy(root->release_agent_path, opts.release_agent);
		spin_unlock(&release_agent_path_lock);
	}

	trace_cgroup_remount(root);

out_unlock:
	kfree(opts.release_agent);
	kfree(opts.name);
	mutex_unlock(&cgroup_mutex);
	return ret;
}

struct kernfs_syscall_ops cgroup1_kf_syscall_ops = {
	.rename			= cgroup1_rename,
	.show_options		= cgroup1_show_options,
	.remount_fs		= cgroup1_remount,
	.mkdir			= cgroup_mkdir,
	.rmdir			= cgroup_rmdir,
	.show_path		= cgroup_show_path,
};

struct dentry *cgroup1_mount(struct file_system_type *fs_type, int flags,
			     void *data, unsigned long magic,
			     struct cgroup_namespace *ns)
{
	struct super_block *pinned_sb = NULL;
	struct cgroup_sb_opts opts;
	struct cgroup_root *root;
	struct cgroup_subsys *ss;
	struct dentry *dentry;
	int i, ret;

	cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp);

	/* First find the desired set of subsystems */
	ret = parse_cgroupfs_options(data, &opts);
	if (ret)
		goto out_unlock;

	/*
	 * Destruction of cgroup root is asynchronous, so subsystems may
	 * still be dying after the previous unmount.  Let's drain the
	 * dying subsystems.  We just need to ensure that the ones
	 * unmounted previously finish dying and don't care about new ones
	 * starting.  Testing ref liveliness is good enough.
	 */
	for_each_subsys(ss, i) {
		if (!(opts.subsys_mask & (1 << i)) ||
		    ss->root == &cgrp_dfl_root)
			continue;

		if (!percpu_ref_tryget_live(&ss->root->cgrp.self.refcnt)) {
			mutex_unlock(&cgroup_mutex);
			msleep(10);
			ret = restart_syscall();
			goto out_free;
		}
		cgroup_put(&ss->root->cgrp);
	}

	for_each_root(root) {
		bool name_match = false;

		if (root == &cgrp_dfl_root)
			continue;

		/*
		 * If we asked for a name then it must match.  Also, if
		 * name matches but subsys_mask doesn't, we should fail.
		 * Remember whether name matched.
		 */
		if (opts.name) {
			if (strcmp(opts.name, root->name))
				continue;
			name_match = true;
		}

		/*
		 * If we asked for subsystems (or explicitly for no
		 * subsystems) then they must match.
		 */
		if ((opts.subsys_mask || opts.none) &&
		    (opts.subsys_mask != root->subsys_mask)) {
			if (!name_match)
				continue;
			ret = -EBUSY;
			goto out_unlock;
		}

		if (root->flags ^ opts.flags)
			pr_warn("new mount options do not match the existing superblock, will be ignored\n");

		/*
		 * We want to reuse @root whose lifetime is governed by its
		 * ->cgrp.  Let's check whether @root is alive and keep it
		 * that way.  As cgroup_kill_sb() can happen anytime, we
		 * want to block it by pinning the sb so that @root doesn't
		 * get killed before mount is complete.
		 *
		 * With the sb pinned, tryget_live can reliably indicate
		 * whether @root can be reused.  If it's being killed,
		 * drain it.  We can use wait_queue for the wait but this
		 * path is super cold.  Let's just sleep a bit and retry.
		 */
		pinned_sb = kernfs_pin_sb(root->kf_root, NULL);
		if (IS_ERR(pinned_sb) ||
		    !percpu_ref_tryget_live(&root->cgrp.self.refcnt)) {
			mutex_unlock(&cgroup_mutex);
			if (!IS_ERR_OR_NULL(pinned_sb))
				deactivate_super(pinned_sb);
			msleep(10);
			ret = restart_syscall();
			goto out_free;
		}

		ret = 0;
		goto out_unlock;
	}

	/*
	 * No such thing, create a new one.  name= matching without subsys
	 * specification is allowed for already existing hierarchies but we
	 * can't create new one without subsys specification.
	 */
	if (!opts.subsys_mask && !opts.none) {
		ret = -EINVAL;
		goto out_unlock;
	}

	/* Hierarchies may only be created in the initial cgroup namespace. */
	if (ns != &init_cgroup_ns) {
		ret = -EPERM;
		goto out_unlock;
	}

	root = kzalloc(sizeof(*root), GFP_KERNEL);
	if (!root) {
		ret = -ENOMEM;
		goto out_unlock;
	}

	init_cgroup_root(root, &opts);

	ret = cgroup_setup_root(root, opts.subsys_mask);
	if (ret)
		cgroup_free_root(root);

out_unlock:
	mutex_unlock(&cgroup_mutex);
out_free:
	kfree(opts.release_agent);
	kfree(opts.name);

	if (ret)
		return ERR_PTR(ret);

	dentry = cgroup_do_mount(&cgroup_fs_type, flags, root,
				 CGROUP_SUPER_MAGIC, ns);

	/*
	 * If @pinned_sb, we're reusing an existing root and holding an
	 * extra ref on its sb.  Mount is complete.  Put the extra ref.
	 */
	if (pinned_sb)
		deactivate_super(pinned_sb);

	return dentry;
}

static int __init cgroup1_wq_init(void)
{
	/*
	 * Used to destroy pidlists and separate to serve as flush domain.
	 * Cap @max_active to 1 too.
	 */
	cgroup_pidlist_destroy_wq = alloc_workqueue("cgroup_pidlist_destroy",
						    0, 1);
	BUG_ON(!cgroup_pidlist_destroy_wq);
	return 0;
}
core_initcall(cgroup1_wq_init);

static int __init cgroup_no_v1(char *str)
{
	struct cgroup_subsys *ss;
	char *token;
	int i;

	while ((token = strsep(&str, ",")) != NULL) {
		if (!*token)
			continue;

		if (!strcmp(token, "all")) {
			cgroup_no_v1_mask = U16_MAX;
			break;
		}

		for_each_subsys(ss, i) {
			if (strcmp(token, ss->name) &&
			    strcmp(token, ss->legacy_name))
				continue;

			cgroup_no_v1_mask |= 1 << i;
		}
	}
	return 1;
}
__setup("cgroup_no_v1=", cgroup_no_v1);
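
/*
 * For illustration (editorial note): booting with
 * "cgroup_no_v1=memory,blkio" keeps those two controllers off every v1
 * hierarchy (they remain usable on the v2 default hierarchy), and
 * "cgroup_no_v1=all" blocks every controller from v1 mounts.
 */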

#ifdef CONFIG_CGROUP_DEBUG
static struct cgroup_subsys_state *
debug_css_alloc(struct cgroup_subsys_state *parent_css)
{
	struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL);

	if (!css)
		return ERR_PTR(-ENOMEM);

	return css;
}

static void debug_css_free(struct cgroup_subsys_state *css)
{
	kfree(css);
}

static u64 debug_taskcount_read(struct cgroup_subsys_state *css,
				struct cftype *cft)
{
	return cgroup_task_count(css->cgroup);
}

static u64 current_css_set_read(struct cgroup_subsys_state *css,
				struct cftype *cft)
{
	return (u64)(unsigned long)current->cgroups;
}

static u64 current_css_set_refcount_read(struct cgroup_subsys_state *css,
					 struct cftype *cft)
{
	u64 count;

	rcu_read_lock();
	count = atomic_read(&task_css_set(current)->refcount);
	rcu_read_unlock();
	return count;
}

static int current_css_set_cg_links_read(struct seq_file *seq, void *v)
{
	struct cgrp_cset_link *link;
	struct css_set *cset;
	char *name_buf;

	name_buf = kmalloc(NAME_MAX + 1, GFP_KERNEL);
	if (!name_buf)
		return -ENOMEM;

	spin_lock_irq(&css_set_lock);
	rcu_read_lock();
	cset = rcu_dereference(current->cgroups);
	list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
		struct cgroup *c = link->cgrp;

		cgroup_name(c, name_buf, NAME_MAX + 1);
		seq_printf(seq, "Root %d group %s\n",
			   c->root->hierarchy_id, name_buf);
	}
	rcu_read_unlock();
	spin_unlock_irq(&css_set_lock);
	kfree(name_buf);
	return 0;
}

#define MAX_TASKS_SHOWN_PER_CSS 25
static int cgroup_css_links_read(struct seq_file *seq, void *v)
{
	struct cgroup_subsys_state *css = seq_css(seq);
	struct cgrp_cset_link *link;

	spin_lock_irq(&css_set_lock);
	list_for_each_entry(link, &css->cgroup->cset_links, cset_link) {
		struct css_set *cset = link->cset;
		struct task_struct *task;
		int count = 0;

		seq_printf(seq, "css_set %pK\n", cset);

		list_for_each_entry(task, &cset->tasks, cg_list) {
			if (count++ > MAX_TASKS_SHOWN_PER_CSS)
				goto overflow;
			seq_printf(seq, "  task %d\n", task_pid_vnr(task));
		}

		list_for_each_entry(task, &cset->mg_tasks, cg_list) {
			if (count++ > MAX_TASKS_SHOWN_PER_CSS)
				goto overflow;
			seq_printf(seq, "  task %d\n", task_pid_vnr(task));
		}
		continue;
	overflow:
		seq_puts(seq, "  ...\n");
	}
	spin_unlock_irq(&css_set_lock);
	return 0;
}

static u64 releasable_read(struct cgroup_subsys_state *css, struct cftype *cft)
{
	return (!cgroup_is_populated(css->cgroup) &&
		!css_has_online_children(&css->cgroup->self));
}

static struct cftype debug_files[] = {
	{
		.name = "taskcount",
		.read_u64 = debug_taskcount_read,
	},
	{
		.name = "current_css_set",
		.read_u64 = current_css_set_read,
	},
	{
		.name = "current_css_set_refcount",
		.read_u64 = current_css_set_refcount_read,
	},
	{
		.name = "current_css_set_cg_links",
		.seq_show = current_css_set_cg_links_read,
	},
	{
		.name = "cgroup_css_links",
		.seq_show = cgroup_css_links_read,
	},
	{
		.name = "releasable",
		.read_u64 = releasable_read,
	},
	{ }	/* terminate */
};

struct cgroup_subsys debug_cgrp_subsys = {
	.css_alloc = debug_css_alloc,
	.css_free = debug_css_free,
	.legacy_cftypes = debug_files,
};
#endif /* CONFIG_CGROUP_DEBUG */