hugetlb_cgroup.c

/*
 *
 * Copyright IBM Corporation, 2012
 * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of version 2.1 of the GNU Lesser General Public License
 * as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it would be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 *
 */

#include <linux/cgroup.h>
#include <linux/slab.h>
#include <linux/hugetlb.h>
#include <linux/hugetlb_cgroup.h>

struct hugetlb_cgroup {
        struct cgroup_subsys_state css;
        /*
         * the counter to account for hugepages from hugetlb.
         */
        struct res_counter hugepage[HUGE_MAX_HSTATE];
};

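/*
 * cft->private packs two values into a single int: the hstate index in the
 * upper 16 bits and the resource attribute (RES_LIMIT, RES_USAGE, ...) in
 * the lower 16 bits.
 */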
#define MEMFILE_PRIVATE(x, val)	(((x) << 16) | (val))
#define MEMFILE_IDX(val)	(((val) >> 16) & 0xffff)
#define MEMFILE_ATTR(val)	((val) & 0xffff)

static struct hugetlb_cgroup *root_h_cgroup __read_mostly;

static inline
struct hugetlb_cgroup *hugetlb_cgroup_from_css(struct cgroup_subsys_state *s)
{
        return s ? container_of(s, struct hugetlb_cgroup, css) : NULL;
}

static inline
struct hugetlb_cgroup *hugetlb_cgroup_from_task(struct task_struct *task)
{
        return hugetlb_cgroup_from_css(task_css(task, hugetlb_cgrp_id));
}

static inline bool hugetlb_cgroup_is_root(struct hugetlb_cgroup *h_cg)
{
        return (h_cg == root_h_cgroup);
}

static inline struct hugetlb_cgroup *
parent_hugetlb_cgroup(struct hugetlb_cgroup *h_cg)
{
        return hugetlb_cgroup_from_css(h_cg->css.parent);
}

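/*
 * Returns true if any hstate counter in this cgroup still reports charged
 * pages; used as the loop condition when emptying a cgroup on offline.
 */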
static inline bool hugetlb_cgroup_have_usage(struct hugetlb_cgroup *h_cg)
{
        int idx;

        for (idx = 0; idx < hugetlb_max_hstate; idx++) {
                if ((res_counter_read_u64(&h_cg->hugepage[idx], RES_USAGE)) > 0)
                        return true;
        }
        return false;
}

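/*
 * Allocate the per-cgroup state and initialise one res_counter per hstate,
 * parented to the corresponding counter of the parent cgroup so that charges
 * propagate up the hierarchy. The cgroup created without a parent css is the
 * root.
 */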
static struct cgroup_subsys_state *
hugetlb_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
{
        struct hugetlb_cgroup *parent_h_cgroup = hugetlb_cgroup_from_css(parent_css);
        struct hugetlb_cgroup *h_cgroup;
        int idx;

        h_cgroup = kzalloc(sizeof(*h_cgroup), GFP_KERNEL);
        if (!h_cgroup)
                return ERR_PTR(-ENOMEM);

        if (parent_h_cgroup) {
                for (idx = 0; idx < HUGE_MAX_HSTATE; idx++)
                        res_counter_init(&h_cgroup->hugepage[idx],
                                         &parent_h_cgroup->hugepage[idx]);
        } else {
                root_h_cgroup = h_cgroup;
                for (idx = 0; idx < HUGE_MAX_HSTATE; idx++)
                        res_counter_init(&h_cgroup->hugepage[idx], NULL);
        }
        return &h_cgroup->css;
}

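/* Release the state allocated in hugetlb_cgroup_css_alloc() */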
static void hugetlb_cgroup_css_free(struct cgroup_subsys_state *css)
{
        struct hugetlb_cgroup *h_cgroup;

        h_cgroup = hugetlb_cgroup_from_css(css);
        kfree(h_cgroup);
}

/*
 * Should be called with hugetlb_lock held.
 * Since we hold hugetlb_lock, pages cannot be moved off the active list or
 * uncharged from the cgroup, so there is no need to take a page reference
 * or test for page active here. This function cannot fail.
 */
static void hugetlb_cgroup_move_parent(int idx, struct hugetlb_cgroup *h_cg,
                                       struct page *page)
{
        int csize;
        struct res_counter *counter;
        struct res_counter *fail_res;
        struct hugetlb_cgroup *page_hcg;
        struct hugetlb_cgroup *parent = parent_hugetlb_cgroup(h_cg);

        page_hcg = hugetlb_cgroup_from_page(page);
        /*
         * We can have pages on the active list without any cgroup,
         * i.e., hugepages with fewer than 3 pages. We can safely
         * ignore those pages.
         */
        if (!page_hcg || page_hcg != h_cg)
                goto out;

        csize = PAGE_SIZE << compound_order(page);
        if (!parent) {
                parent = root_h_cgroup;
                /* root has no limit */
                res_counter_charge_nofail(&parent->hugepage[idx],
                                          csize, &fail_res);
        }
        counter = &h_cg->hugepage[idx];
        res_counter_uncharge_until(counter, counter->parent, csize);

        set_hugetlb_cgroup(page, parent);
out:
        return;
}

/*
 * Force the hugetlb cgroup to empty the hugetlb resources by moving them to
 * the parent cgroup.
 */
static void hugetlb_cgroup_css_offline(struct cgroup_subsys_state *css)
{
        struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(css);
        struct hstate *h;
        struct page *page;
        int idx;

        do {
                /* the hstate index must restart from 0 on every pass */
                idx = 0;
                for_each_hstate(h) {
                        spin_lock(&hugetlb_lock);
                        list_for_each_entry(page, &h->hugepage_activelist, lru)
                                hugetlb_cgroup_move_parent(idx, h_cg, page);

                        spin_unlock(&hugetlb_lock);
                        idx++;
                }
                cond_resched();
        } while (hugetlb_cgroup_have_usage(h_cg));
}

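/*
 * Charge nr_pages huge pages of hstate @idx against the current task's
 * cgroup. On return *ptr holds the cgroup that was charged (or NULL when no
 * charge was taken) so the caller can later commit the charge to a specific
 * page under hugetlb_lock.
 */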
int hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages,
                                 struct hugetlb_cgroup **ptr)
{
        int ret = 0;
        struct res_counter *fail_res;
        struct hugetlb_cgroup *h_cg = NULL;
        unsigned long csize = nr_pages * PAGE_SIZE;

        if (hugetlb_cgroup_disabled())
                goto done;
        /*
         * We don't charge any cgroup if the compound page has fewer
         * than 3 pages.
         */
        if (huge_page_order(&hstates[idx]) < HUGETLB_CGROUP_MIN_ORDER)
                goto done;
again:
        rcu_read_lock();
        h_cg = hugetlb_cgroup_from_task(current);
        if (!css_tryget_online(&h_cg->css)) {
                rcu_read_unlock();
                goto again;
        }
        rcu_read_unlock();

        ret = res_counter_charge(&h_cg->hugepage[idx], csize, &fail_res);
        css_put(&h_cg->css);
done:
        *ptr = h_cg;
        return ret;
}

/* Should be called with hugetlb_lock held */
void hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages,
                                  struct hugetlb_cgroup *h_cg,
                                  struct page *page)
{
        if (hugetlb_cgroup_disabled() || !h_cg)
                return;

        set_hugetlb_cgroup(page, h_cg);
        return;
}

/*
 * Should be called with hugetlb_lock held
 */
void hugetlb_cgroup_uncharge_page(int idx, unsigned long nr_pages,
                                  struct page *page)
{
        struct hugetlb_cgroup *h_cg;
        unsigned long csize = nr_pages * PAGE_SIZE;

        if (hugetlb_cgroup_disabled())
                return;
        VM_BUG_ON(!spin_is_locked(&hugetlb_lock));
        h_cg = hugetlb_cgroup_from_page(page);
        if (unlikely(!h_cg))
                return;
        set_hugetlb_cgroup(page, NULL);
        res_counter_uncharge(&h_cg->hugepage[idx], csize);
        return;
}

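/*
 * Drop a charge taken with hugetlb_cgroup_charge_cgroup() when no page ends
 * up holding it, e.g. because the huge page allocation failed.
 */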
void hugetlb_cgroup_uncharge_cgroup(int idx, unsigned long nr_pages,
                                    struct hugetlb_cgroup *h_cg)
{
        unsigned long csize = nr_pages * PAGE_SIZE;

        if (hugetlb_cgroup_disabled() || !h_cg)
                return;

        if (huge_page_order(&hstates[idx]) < HUGETLB_CGROUP_MIN_ORDER)
                return;
        res_counter_uncharge(&h_cg->hugepage[idx], csize);
        return;
}

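/* Read one counter value; hstate index and field are encoded in cft->private */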
static u64 hugetlb_cgroup_read_u64(struct cgroup_subsys_state *css,
                                   struct cftype *cft)
{
        int idx, name;
        struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(css);

        idx = MEMFILE_IDX(cft->private);
        name = MEMFILE_ATTR(cft->private);
        return res_counter_read_u64(&h_cg->hugepage[idx], name);
}

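/*
 * Handle writes to the limit_in_bytes file. The memparse-based helper
 * accepts size suffixes (e.g. "2M", "1G"); the root cgroup cannot be given
 * a limit.
 */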
static ssize_t hugetlb_cgroup_write(struct kernfs_open_file *of,
                                    char *buf, size_t nbytes, loff_t off)
{
        int idx, name, ret;
        unsigned long long val;
        struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(of_css(of));

        buf = strstrip(buf);
        idx = MEMFILE_IDX(of_cft(of)->private);
        name = MEMFILE_ATTR(of_cft(of)->private);

        switch (name) {
        case RES_LIMIT:
                if (hugetlb_cgroup_is_root(h_cg)) {
                        /* Can't set limit on root */
                        ret = -EINVAL;
                        break;
                }
                /* res_counter_memparse_write_strategy() does the parsing; reuse it */
                ret = res_counter_memparse_write_strategy(buf, &val);
                if (ret)
                        break;
                ret = res_counter_set_limit(&h_cg->hugepage[idx], val);
                break;
        default:
                ret = -EINVAL;
                break;
        }
        return ret ?: nbytes;
}

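/*
 * Handle writes to max_usage_in_bytes and failcnt: writing any value resets
 * the recorded maximum usage or the failure counter, respectively.
 */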
static ssize_t hugetlb_cgroup_reset(struct kernfs_open_file *of,
                                    char *buf, size_t nbytes, loff_t off)
{
        int idx, name, ret = 0;
        struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(of_css(of));

        idx = MEMFILE_IDX(of_cft(of)->private);
        name = MEMFILE_ATTR(of_cft(of)->private);

        switch (name) {
        case RES_MAX_USAGE:
                res_counter_reset_max(&h_cg->hugepage[idx]);
                break;
        case RES_FAILCNT:
                res_counter_reset_failcnt(&h_cg->hugepage[idx]);
                break;
        default:
                ret = -EINVAL;
                break;
        }
        return ret ?: nbytes;
}

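/* Format a huge page size as a human-readable string such as "2MB" or "1GB" */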
static char *mem_fmt(char *buf, int size, unsigned long hsize)
{
        if (hsize >= (1UL << 30))
                snprintf(buf, size, "%luGB", hsize >> 30);
        else if (hsize >= (1UL << 20))
                snprintf(buf, size, "%luMB", hsize >> 20);
        else
                snprintf(buf, size, "%luKB", hsize >> 10);
        return buf;
}

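/*
 * Register the four per-hstate control files (limit, usage, max usage and
 * failcnt), with names derived from the huge page size via mem_fmt().
 */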
static void __init __hugetlb_cgroup_file_init(int idx)
{
        char buf[32];
        struct cftype *cft;
        struct hstate *h = &hstates[idx];

        /* format the size */
        mem_fmt(buf, 32, huge_page_size(h));

        /* Add the limit file */
        cft = &h->cgroup_files[0];
        snprintf(cft->name, MAX_CFTYPE_NAME, "%s.limit_in_bytes", buf);
        cft->private = MEMFILE_PRIVATE(idx, RES_LIMIT);
        cft->read_u64 = hugetlb_cgroup_read_u64;
        cft->write = hugetlb_cgroup_write;

        /* Add the usage file */
        cft = &h->cgroup_files[1];
        snprintf(cft->name, MAX_CFTYPE_NAME, "%s.usage_in_bytes", buf);
        cft->private = MEMFILE_PRIVATE(idx, RES_USAGE);
        cft->read_u64 = hugetlb_cgroup_read_u64;

        /* Add the MAX usage file */
        cft = &h->cgroup_files[2];
        snprintf(cft->name, MAX_CFTYPE_NAME, "%s.max_usage_in_bytes", buf);
        cft->private = MEMFILE_PRIVATE(idx, RES_MAX_USAGE);
        cft->write = hugetlb_cgroup_reset;
        cft->read_u64 = hugetlb_cgroup_read_u64;

        /* Add the failcnt file */
        cft = &h->cgroup_files[3];
        snprintf(cft->name, MAX_CFTYPE_NAME, "%s.failcnt", buf);
        cft->private = MEMFILE_PRIVATE(idx, RES_FAILCNT);
        cft->write = hugetlb_cgroup_reset;
        cft->read_u64 = hugetlb_cgroup_read_u64;

        /* NULL terminate the last cft */
        cft = &h->cgroup_files[4];
        memset(cft, 0, sizeof(*cft));

        WARN_ON(cgroup_add_cftypes(&hugetlb_cgrp_subsys, h->cgroup_files));

        return;
}

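/* Called once at init time to create control files for every registered hstate */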
void __init hugetlb_cgroup_file_init(void)
{
        struct hstate *h;

        for_each_hstate(h) {
                /*
                 * Add cgroup control files only if the huge page consists
                 * of more than two normal pages. This is because we use
                 * page[2].lru.next for storing cgroup details.
                 */
                if (huge_page_order(h) >= HUGETLB_CGROUP_MIN_ORDER)
                        __hugetlb_cgroup_file_init(hstate_index(h));
        }
}

/*
 * hugetlb_lock will make sure a parallel cgroup rmdir won't happen
 * when we migrate hugepages
 */
void hugetlb_cgroup_migrate(struct page *oldhpage, struct page *newhpage)
{
        struct hugetlb_cgroup *h_cg;
        struct hstate *h = page_hstate(oldhpage);

        if (hugetlb_cgroup_disabled())
                return;

        VM_BUG_ON_PAGE(!PageHuge(oldhpage), oldhpage);
        spin_lock(&hugetlb_lock);
        h_cg = hugetlb_cgroup_from_page(oldhpage);
        set_hugetlb_cgroup(oldhpage, NULL);

        /* move the h_cg details to new cgroup */
        set_hugetlb_cgroup(newhpage, h_cg);
        list_move(&newhpage->lru, &h->hugepage_activelist);
        spin_unlock(&hugetlb_lock);
        return;
}

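/* Callbacks wired into the cgroup core for the hugetlb controller */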
struct cgroup_subsys hugetlb_cgrp_subsys = {
        .css_alloc	= hugetlb_cgroup_css_alloc,
        .css_offline	= hugetlb_cgroup_css_offline,
        .css_free	= hugetlb_cgroup_css_free,
};