/*
 * Common Block IO controller cgroup interface
 *
 * Based on ideas and code from CFQ, CFS and BFQ:
 * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
 *
 * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
 *                    Paolo Valente <paolo.valente@unimore.it>
 *
 * Copyright (C) 2009 Vivek Goyal <vgoyal@redhat.com>
 *                    Nauman Rafique <nauman@google.com>
 *
 * For policy-specific per-blkcg data:
 * Copyright (C) 2015 Paolo Valente <paolo.valente@unimore.it>
 *                    Arianna Avanzini <avanzini.arianna@gmail.com>
 */
#include <linux/ioprio.h>
#include <linux/kdev_t.h>
#include <linux/module.h>
#include <linux/sched/signal.h>
#include <linux/err.h>
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
#include <linux/slab.h>
#include <linux/genhd.h>
#include <linux/delay.h>
#include <linux/atomic.h>
#include <linux/ctype.h>
#include <linux/blk-cgroup.h>
#include <linux/tracehook.h>
#include "blk.h"

#define MAX_KEY_LEN 100
/*
 * blkcg_pol_mutex protects blkcg_policy[] and policy [de]activation.
 * blkcg_pol_register_mutex nests outside of it and synchronizes entire
 * policy [un]register operations including cgroup file additions /
 * removals.  Putting cgroup file registration outside blkcg_pol_mutex
 * allows grabbing it from cgroup callbacks.
 */
static DEFINE_MUTEX(blkcg_pol_register_mutex);
static DEFINE_MUTEX(blkcg_pol_mutex);

struct blkcg blkcg_root;
EXPORT_SYMBOL_GPL(blkcg_root);

struct cgroup_subsys_state * const blkcg_root_css = &blkcg_root.css;

static struct blkcg_policy *blkcg_policy[BLKCG_MAX_POLS];

static LIST_HEAD(all_blkcgs);		/* protected by blkcg_pol_mutex */

static bool blkcg_debug_stats = false;

static bool blkcg_policy_enabled(struct request_queue *q,
				 const struct blkcg_policy *pol)
{
	return pol && test_bit(pol->plid, q->blkcg_pols);
}
/**
 * blkg_free - free a blkg
 * @blkg: blkg to free
 *
 * Free @blkg which may be partially allocated.
 */
static void blkg_free(struct blkcg_gq *blkg)
{
	int i;

	if (!blkg)
		return;

	for (i = 0; i < BLKCG_MAX_POLS; i++)
		if (blkg->pd[i])
			blkcg_policy[i]->pd_free_fn(blkg->pd[i]);

	if (blkg->blkcg != &blkcg_root)
		blk_exit_rl(blkg->q, &blkg->rl);

	blkg_rwstat_exit(&blkg->stat_ios);
	blkg_rwstat_exit(&blkg->stat_bytes);
	kfree(blkg);
}

/**
 * blkg_alloc - allocate a blkg
 * @blkcg: block cgroup the new blkg is associated with
 * @q: request_queue the new blkg is associated with
 * @gfp_mask: allocation mask to use
 *
 * Allocate a new blkg associating @blkcg and @q.
 */
static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q,
				   gfp_t gfp_mask)
{
	struct blkcg_gq *blkg;
	int i;

	/* alloc and init base part */
	blkg = kzalloc_node(sizeof(*blkg), gfp_mask, q->node);
	if (!blkg)
		return NULL;

	if (blkg_rwstat_init(&blkg->stat_bytes, gfp_mask) ||
	    blkg_rwstat_init(&blkg->stat_ios, gfp_mask))
		goto err_free;

	blkg->q = q;
	INIT_LIST_HEAD(&blkg->q_node);
	blkg->blkcg = blkcg;
	atomic_set(&blkg->refcnt, 1);

	/* root blkg uses @q->root_rl, init rl only for !root blkgs */
	if (blkcg != &blkcg_root) {
		if (blk_init_rl(&blkg->rl, q, gfp_mask))
			goto err_free;
		blkg->rl.blkg = blkg;
	}

	for (i = 0; i < BLKCG_MAX_POLS; i++) {
		struct blkcg_policy *pol = blkcg_policy[i];
		struct blkg_policy_data *pd;

		if (!blkcg_policy_enabled(q, pol))
			continue;

		/* alloc per-policy data and attach it to blkg */
		pd = pol->pd_alloc_fn(gfp_mask, q->node);
		if (!pd)
			goto err_free;

		blkg->pd[i] = pd;
		pd->blkg = blkg;
		pd->plid = i;
	}

	return blkg;

err_free:
	blkg_free(blkg);
	return NULL;
}
struct blkcg_gq *blkg_lookup_slowpath(struct blkcg *blkcg,
				      struct request_queue *q, bool update_hint)
{
	struct blkcg_gq *blkg;

	/*
	 * Hint didn't match.  Look up from the radix tree.  Note that the
	 * hint can only be updated under queue_lock as otherwise @blkg
	 * could have already been removed from blkg_tree.  The caller is
	 * responsible for grabbing queue_lock if @update_hint.
	 */
	blkg = radix_tree_lookup(&blkcg->blkg_tree, q->id);
	if (blkg && blkg->q == q) {
		if (update_hint) {
			lockdep_assert_held(q->queue_lock);
			rcu_assign_pointer(blkcg->blkg_hint, blkg);
		}
		return blkg;
	}

	return NULL;
}
EXPORT_SYMBOL_GPL(blkg_lookup_slowpath);
/*
 * If @new_blkg is %NULL, this function tries to allocate a new one as
 * necessary using %GFP_NOWAIT.  @new_blkg is always consumed on return.
 */
static struct blkcg_gq *blkg_create(struct blkcg *blkcg,
				    struct request_queue *q,
				    struct blkcg_gq *new_blkg)
{
	struct blkcg_gq *blkg;
	struct bdi_writeback_congested *wb_congested;
	int i, ret;

	WARN_ON_ONCE(!rcu_read_lock_held());
	lockdep_assert_held(q->queue_lock);

	/* blkg holds a reference to blkcg */
	if (!css_tryget_online(&blkcg->css)) {
		ret = -ENODEV;
		goto err_free_blkg;
	}

	wb_congested = wb_congested_get_create(q->backing_dev_info,
					       blkcg->css.id,
					       GFP_NOWAIT | __GFP_NOWARN);
	if (!wb_congested) {
		ret = -ENOMEM;
		goto err_put_css;
	}

	/* allocate */
	if (!new_blkg) {
		new_blkg = blkg_alloc(blkcg, q, GFP_NOWAIT | __GFP_NOWARN);
		if (unlikely(!new_blkg)) {
			ret = -ENOMEM;
			goto err_put_congested;
		}
	}
	blkg = new_blkg;
	blkg->wb_congested = wb_congested;

	/* link parent */
	if (blkcg_parent(blkcg)) {
		blkg->parent = __blkg_lookup(blkcg_parent(blkcg), q, false);
		if (WARN_ON_ONCE(!blkg->parent)) {
			ret = -ENODEV;
			goto err_put_congested;
		}
		blkg_get(blkg->parent);
	}

	/* invoke per-policy init */
	for (i = 0; i < BLKCG_MAX_POLS; i++) {
		struct blkcg_policy *pol = blkcg_policy[i];

		if (blkg->pd[i] && pol->pd_init_fn)
			pol->pd_init_fn(blkg->pd[i]);
	}

	/* insert */
	spin_lock(&blkcg->lock);
	ret = radix_tree_insert(&blkcg->blkg_tree, q->id, blkg);
	if (likely(!ret)) {
		hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list);
		list_add(&blkg->q_node, &q->blkg_list);

		for (i = 0; i < BLKCG_MAX_POLS; i++) {
			struct blkcg_policy *pol = blkcg_policy[i];

			if (blkg->pd[i] && pol->pd_online_fn)
				pol->pd_online_fn(blkg->pd[i]);
		}
	}
	blkg->online = true;
	spin_unlock(&blkcg->lock);

	if (!ret)
		return blkg;

	/* @blkg failed to be fully initialized, use the usual release path */
	blkg_put(blkg);
	return ERR_PTR(ret);

err_put_congested:
	wb_congested_put(wb_congested);
err_put_css:
	css_put(&blkcg->css);
err_free_blkg:
	blkg_free(new_blkg);
	return ERR_PTR(ret);
}
/**
 * blkg_lookup_create - lookup blkg, try to create one if not there
 * @blkcg: blkcg of interest
 * @q: request_queue of interest
 *
 * Lookup blkg for the @blkcg - @q pair.  If it doesn't exist, try to
 * create one.  blkg creation is performed recursively from blkcg_root such
 * that all non-root blkg's have access to the parent blkg.  This function
 * should be called under RCU read lock and @q->queue_lock.
 *
 * Returns pointer to the looked up or created blkg on success, ERR_PTR()
 * value on error.  If @q is dead, returns ERR_PTR(-ENODEV).  If @q is not
 * dead and bypassing, returns ERR_PTR(-EBUSY).
 */
struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg,
				    struct request_queue *q)
{
	struct blkcg_gq *blkg;

	WARN_ON_ONCE(!rcu_read_lock_held());
	lockdep_assert_held(q->queue_lock);

	/*
	 * This could be the first entry point of blkcg implementation and
	 * we shouldn't allow anything to go through for a bypassing queue.
	 */
	if (unlikely(blk_queue_bypass(q)))
		return ERR_PTR(blk_queue_dying(q) ? -ENODEV : -EBUSY);

	blkg = __blkg_lookup(blkcg, q, true);
	if (blkg)
		return blkg;

	/*
	 * Create blkgs walking down from blkcg_root to @blkcg, so that all
	 * non-root blkgs have access to their parents.
	 */
	while (true) {
		struct blkcg *pos = blkcg;
		struct blkcg *parent = blkcg_parent(blkcg);

		while (parent && !__blkg_lookup(parent, q, false)) {
			pos = parent;
			parent = blkcg_parent(parent);
		}

		blkg = blkg_create(pos, q, NULL);
		if (pos == blkcg || IS_ERR(blkg))
			return blkg;
	}
}
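
/*
 * Example (editor's illustrative sketch, not part of the original file):
 * a typical caller in the bio submission path looks up or creates the
 * blkg for its cgroup while holding both the RCU read lock and the queue
 * lock, and falls back to the root blkg on error.  example_get_blkg() is
 * a hypothetical helper that only shows the calling convention.
 *
 *	struct blkcg_gq *example_get_blkg(struct request_queue *q,
 *					  struct blkcg *blkcg)
 *	{
 *		struct blkcg_gq *blkg;
 *
 *		rcu_read_lock();
 *		spin_lock_irq(q->queue_lock);
 *		blkg = blkg_lookup_create(blkcg, q);
 *		if (IS_ERR(blkg))
 *			blkg = q->root_blkg;
 *		spin_unlock_irq(q->queue_lock);
 *		rcu_read_unlock();
 *		return blkg;
 *	}
 */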
static void blkg_destroy(struct blkcg_gq *blkg)
{
	struct blkcg *blkcg = blkg->blkcg;
	struct blkcg_gq *parent = blkg->parent;
	int i;

	lockdep_assert_held(blkg->q->queue_lock);
	lockdep_assert_held(&blkcg->lock);

	/* Something wrong if we are trying to remove same group twice */
	WARN_ON_ONCE(list_empty(&blkg->q_node));
	WARN_ON_ONCE(hlist_unhashed(&blkg->blkcg_node));

	for (i = 0; i < BLKCG_MAX_POLS; i++) {
		struct blkcg_policy *pol = blkcg_policy[i];

		if (blkg->pd[i] && pol->pd_offline_fn)
			pol->pd_offline_fn(blkg->pd[i]);
	}

	if (parent) {
		blkg_rwstat_add_aux(&parent->stat_bytes, &blkg->stat_bytes);
		blkg_rwstat_add_aux(&parent->stat_ios, &blkg->stat_ios);
	}

	blkg->online = false;

	radix_tree_delete(&blkcg->blkg_tree, blkg->q->id);
	list_del_init(&blkg->q_node);
	hlist_del_init_rcu(&blkg->blkcg_node);

	/*
	 * Both setting lookup hint to and clearing it from @blkg are done
	 * under queue_lock.  If it's not pointing to @blkg now, it never
	 * will.  Hint assignment itself can race safely.
	 */
	if (rcu_access_pointer(blkcg->blkg_hint) == blkg)
		rcu_assign_pointer(blkcg->blkg_hint, NULL);

	/*
	 * Put the reference taken at the time of creation so that when all
	 * queues are gone, group can be destroyed.
	 */
	blkg_put(blkg);
}

/**
 * blkg_destroy_all - destroy all blkgs associated with a request_queue
 * @q: request_queue of interest
 *
 * Destroy all blkgs associated with @q.
 */
static void blkg_destroy_all(struct request_queue *q)
{
	struct blkcg_gq *blkg, *n;

	lockdep_assert_held(q->queue_lock);

	list_for_each_entry_safe(blkg, n, &q->blkg_list, q_node) {
		struct blkcg *blkcg = blkg->blkcg;

		spin_lock(&blkcg->lock);
		blkg_destroy(blkg);
		spin_unlock(&blkcg->lock);
	}

	q->root_blkg = NULL;
	q->root_rl.blkg = NULL;
}
/*
 * A group is RCU protected, but having an rcu lock does not mean that one
 * can access all the fields of blkg and assume these are valid.  For
 * example, don't try to follow throtl_data and request queue links.
 *
 * Having a reference to blkg under an rcu allows accesses to only values
 * local to groups like group stats and group rate limits.
 */
void __blkg_release_rcu(struct rcu_head *rcu_head)
{
	struct blkcg_gq *blkg = container_of(rcu_head, struct blkcg_gq, rcu_head);

	/* release the blkcg and parent blkg refs this blkg has been holding */
	css_put(&blkg->blkcg->css);
	if (blkg->parent)
		blkg_put(blkg->parent);

	wb_congested_put(blkg->wb_congested);

	blkg_free(blkg);
}
EXPORT_SYMBOL_GPL(__blkg_release_rcu);

/*
 * The next function is used by blk_queue_for_each_rl().  It's a bit tricky
 * because the root blkg uses @q->root_rl instead of its own rl.
 */
struct request_list *__blk_queue_next_rl(struct request_list *rl,
					 struct request_queue *q)
{
	struct list_head *ent;
	struct blkcg_gq *blkg;

	/*
	 * Determine the current blkg list_head.  The first entry is
	 * root_rl which is off @q->blkg_list and mapped to the head.
	 */
	if (rl == &q->root_rl) {
		ent = &q->blkg_list;
		/* There are no more block groups, hence no request lists */
		if (list_empty(ent))
			return NULL;
	} else {
		blkg = container_of(rl, struct blkcg_gq, rl);
		ent = &blkg->q_node;
	}

	/* walk to the next list_head, skip root blkcg */
	ent = ent->next;
	if (ent == &q->root_blkg->q_node)
		ent = ent->next;
	if (ent == &q->blkg_list)
		return NULL;

	blkg = container_of(ent, struct blkcg_gq, q_node);
	return &blkg->rl;
}
static int blkcg_reset_stats(struct cgroup_subsys_state *css,
			     struct cftype *cftype, u64 val)
{
	struct blkcg *blkcg = css_to_blkcg(css);
	struct blkcg_gq *blkg;
	int i;

	mutex_lock(&blkcg_pol_mutex);
	spin_lock_irq(&blkcg->lock);

	/*
	 * Note that stat reset is racy - it doesn't synchronize against
	 * stat updates.  This is a debug feature which shouldn't exist
	 * anyway.  If you get hit by a race, retry.
	 */
	hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) {
		blkg_rwstat_reset(&blkg->stat_bytes);
		blkg_rwstat_reset(&blkg->stat_ios);

		for (i = 0; i < BLKCG_MAX_POLS; i++) {
			struct blkcg_policy *pol = blkcg_policy[i];

			if (blkg->pd[i] && pol->pd_reset_stats_fn)
				pol->pd_reset_stats_fn(blkg->pd[i]);
		}
	}

	spin_unlock_irq(&blkcg->lock);
	mutex_unlock(&blkcg_pol_mutex);
	return 0;
}

const char *blkg_dev_name(struct blkcg_gq *blkg)
{
	/* some drivers (floppy) instantiate a queue w/o disk registered */
	if (blkg->q->backing_dev_info->dev)
		return dev_name(blkg->q->backing_dev_info->dev);
	return NULL;
}
EXPORT_SYMBOL_GPL(blkg_dev_name);
/**
 * blkcg_print_blkgs - helper for printing per-blkg data
 * @sf: seq_file to print to
 * @blkcg: blkcg of interest
 * @prfill: fill function to print out a blkg
 * @pol: policy in question
 * @data: data to be passed to @prfill
 * @show_total: to print out sum of prfill return values or not
 *
 * This function invokes @prfill on each blkg of @blkcg if pd for the
 * policy specified by @pol exists.  @prfill is invoked with @sf, the
 * policy data and @data and the matching queue lock held.  If @show_total
 * is %true, the sum of the return values from @prfill is printed with
 * "Total" label at the end.
 *
 * This is to be used to construct print functions for
 * cftype->read_seq_string method.
 */
void blkcg_print_blkgs(struct seq_file *sf, struct blkcg *blkcg,
		       u64 (*prfill)(struct seq_file *,
				     struct blkg_policy_data *, int),
		       const struct blkcg_policy *pol, int data,
		       bool show_total)
{
	struct blkcg_gq *blkg;
	u64 total = 0;

	rcu_read_lock();
	hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) {
		spin_lock_irq(blkg->q->queue_lock);
		if (blkcg_policy_enabled(blkg->q, pol))
			total += prfill(sf, blkg->pd[pol->plid], data);
		spin_unlock_irq(blkg->q->queue_lock);
	}
	rcu_read_unlock();

	if (show_total)
		seq_printf(sf, "Total %llu\n", (unsigned long long)total);
}
EXPORT_SYMBOL_GPL(blkcg_print_blkgs);

/**
 * __blkg_prfill_u64 - prfill helper for a single u64 value
 * @sf: seq_file to print to
 * @pd: policy private data of interest
 * @v: value to print
 *
 * Print @v to @sf for the device associated with @pd.
 */
u64 __blkg_prfill_u64(struct seq_file *sf, struct blkg_policy_data *pd, u64 v)
{
	const char *dname = blkg_dev_name(pd->blkg);

	if (!dname)
		return 0;

	seq_printf(sf, "%s %llu\n", dname, (unsigned long long)v);
	return v;
}
EXPORT_SYMBOL_GPL(__blkg_prfill_u64);
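
/*
 * Example (editor's illustrative sketch, not part of the original file):
 * a policy would typically build its cftype->seq_show callback from
 * blkcg_print_blkgs() and a prfill helper such as __blkg_prfill_u64().
 * "blkcg_policy_example", pd_to_example() and the weight field are
 * hypothetical names.
 *
 *	static u64 example_prfill_weight(struct seq_file *sf,
 *					 struct blkg_policy_data *pd, int off)
 *	{
 *		struct example_group_data *egd = pd_to_example(pd);
 *
 *		return __blkg_prfill_u64(sf, pd, egd->weight);
 *	}
 *
 *	static int example_print_weight(struct seq_file *sf, void *v)
 *	{
 *		blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
 *				  example_prfill_weight,
 *				  &blkcg_policy_example, 0, false);
 *		return 0;
 *	}
 */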
/**
 * __blkg_prfill_rwstat - prfill helper for a blkg_rwstat
 * @sf: seq_file to print to
 * @pd: policy private data of interest
 * @rwstat: rwstat to print
 *
 * Print @rwstat to @sf for the device associated with @pd.
 */
u64 __blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
			 const struct blkg_rwstat *rwstat)
{
	static const char *rwstr[] = {
		[BLKG_RWSTAT_READ]	= "Read",
		[BLKG_RWSTAT_WRITE]	= "Write",
		[BLKG_RWSTAT_SYNC]	= "Sync",
		[BLKG_RWSTAT_ASYNC]	= "Async",
		[BLKG_RWSTAT_DISCARD]	= "Discard",
	};
	const char *dname = blkg_dev_name(pd->blkg);
	u64 v;
	int i;

	if (!dname)
		return 0;

	for (i = 0; i < BLKG_RWSTAT_NR; i++)
		seq_printf(sf, "%s %s %llu\n", dname, rwstr[i],
			   (unsigned long long)atomic64_read(&rwstat->aux_cnt[i]));

	v = atomic64_read(&rwstat->aux_cnt[BLKG_RWSTAT_READ]) +
		atomic64_read(&rwstat->aux_cnt[BLKG_RWSTAT_WRITE]) +
		atomic64_read(&rwstat->aux_cnt[BLKG_RWSTAT_DISCARD]);
	seq_printf(sf, "%s Total %llu\n", dname, (unsigned long long)v);
	return v;
}
EXPORT_SYMBOL_GPL(__blkg_prfill_rwstat);
/**
 * blkg_prfill_stat - prfill callback for blkg_stat
 * @sf: seq_file to print to
 * @pd: policy private data of interest
 * @off: offset to the blkg_stat in @pd
 *
 * prfill callback for printing a blkg_stat.
 */
u64 blkg_prfill_stat(struct seq_file *sf, struct blkg_policy_data *pd, int off)
{
	return __blkg_prfill_u64(sf, pd, blkg_stat_read((void *)pd + off));
}
EXPORT_SYMBOL_GPL(blkg_prfill_stat);

/**
 * blkg_prfill_rwstat - prfill callback for blkg_rwstat
 * @sf: seq_file to print to
 * @pd: policy private data of interest
 * @off: offset to the blkg_rwstat in @pd
 *
 * prfill callback for printing a blkg_rwstat.
 */
u64 blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
		       int off)
{
	struct blkg_rwstat rwstat = blkg_rwstat_read((void *)pd + off);

	return __blkg_prfill_rwstat(sf, pd, &rwstat);
}
EXPORT_SYMBOL_GPL(blkg_prfill_rwstat);

static u64 blkg_prfill_rwstat_field(struct seq_file *sf,
				    struct blkg_policy_data *pd, int off)
{
	struct blkg_rwstat rwstat = blkg_rwstat_read((void *)pd->blkg + off);

	return __blkg_prfill_rwstat(sf, pd, &rwstat);
}
/**
 * blkg_print_stat_bytes - seq_show callback for blkg->stat_bytes
 * @sf: seq_file to print to
 * @v: unused
 *
 * To be used as cftype->seq_show to print blkg->stat_bytes.
 * cftype->private must be set to the blkcg_policy.
 */
int blkg_print_stat_bytes(struct seq_file *sf, void *v)
{
	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
			  blkg_prfill_rwstat_field, (void *)seq_cft(sf)->private,
			  offsetof(struct blkcg_gq, stat_bytes), true);
	return 0;
}
EXPORT_SYMBOL_GPL(blkg_print_stat_bytes);

/**
 * blkg_print_stat_ios - seq_show callback for blkg->stat_ios
 * @sf: seq_file to print to
 * @v: unused
 *
 * To be used as cftype->seq_show to print blkg->stat_ios.  cftype->private
 * must be set to the blkcg_policy.
 */
int blkg_print_stat_ios(struct seq_file *sf, void *v)
{
	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
			  blkg_prfill_rwstat_field, (void *)seq_cft(sf)->private,
			  offsetof(struct blkcg_gq, stat_ios), true);
	return 0;
}
EXPORT_SYMBOL_GPL(blkg_print_stat_ios);
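
/*
 * Example (editor's illustrative sketch, not part of the original file):
 * wiring the helpers above into a policy's cftype array.  As the comments
 * above note, cftype->private must carry the blkcg_policy pointer;
 * "blkcg_policy_example" and the file names are hypothetical.
 *
 *	static struct cftype example_legacy_files[] = {
 *		{
 *			.name = "example.io_service_bytes",
 *			.private = (unsigned long)&blkcg_policy_example,
 *			.seq_show = blkg_print_stat_bytes,
 *		},
 *		{
 *			.name = "example.io_serviced",
 *			.private = (unsigned long)&blkcg_policy_example,
 *			.seq_show = blkg_print_stat_ios,
 *		},
 *		{ }	/+ terminate +/
 *	};
 */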
static u64 blkg_prfill_rwstat_field_recursive(struct seq_file *sf,
					      struct blkg_policy_data *pd,
					      int off)
{
	struct blkg_rwstat rwstat = blkg_rwstat_recursive_sum(pd->blkg,
							      NULL, off);

	return __blkg_prfill_rwstat(sf, pd, &rwstat);
}

/**
 * blkg_print_stat_bytes_recursive - recursive version of blkg_print_stat_bytes
 * @sf: seq_file to print to
 * @v: unused
 */
int blkg_print_stat_bytes_recursive(struct seq_file *sf, void *v)
{
	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
			  blkg_prfill_rwstat_field_recursive,
			  (void *)seq_cft(sf)->private,
			  offsetof(struct blkcg_gq, stat_bytes), true);
	return 0;
}
EXPORT_SYMBOL_GPL(blkg_print_stat_bytes_recursive);

/**
 * blkg_print_stat_ios_recursive - recursive version of blkg_print_stat_ios
 * @sf: seq_file to print to
 * @v: unused
 */
int blkg_print_stat_ios_recursive(struct seq_file *sf, void *v)
{
	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
			  blkg_prfill_rwstat_field_recursive,
			  (void *)seq_cft(sf)->private,
			  offsetof(struct blkcg_gq, stat_ios), true);
	return 0;
}
EXPORT_SYMBOL_GPL(blkg_print_stat_ios_recursive);
/**
 * blkg_stat_recursive_sum - collect hierarchical blkg_stat
 * @blkg: blkg of interest
 * @pol: blkcg_policy which contains the blkg_stat
 * @off: offset to the blkg_stat in blkg_policy_data or @blkg
 *
 * Collect the blkg_stat specified by @blkg, @pol and @off and all its
 * online descendants and their aux counts.  The caller must be holding the
 * queue lock for online tests.
 *
 * If @pol is NULL, blkg_stat is at @off bytes into @blkg; otherwise, it is
 * at @off bytes into @blkg's blkg_policy_data of the policy.
 */
u64 blkg_stat_recursive_sum(struct blkcg_gq *blkg,
			    struct blkcg_policy *pol, int off)
{
	struct blkcg_gq *pos_blkg;
	struct cgroup_subsys_state *pos_css;
	u64 sum = 0;

	lockdep_assert_held(blkg->q->queue_lock);

	rcu_read_lock();
	blkg_for_each_descendant_pre(pos_blkg, pos_css, blkg) {
		struct blkg_stat *stat;

		if (!pos_blkg->online)
			continue;

		if (pol)
			stat = (void *)blkg_to_pd(pos_blkg, pol) + off;
		else
			stat = (void *)blkg + off;

		sum += blkg_stat_read(stat) + atomic64_read(&stat->aux_cnt);
	}
	rcu_read_unlock();

	return sum;
}
EXPORT_SYMBOL_GPL(blkg_stat_recursive_sum);

/**
 * blkg_rwstat_recursive_sum - collect hierarchical blkg_rwstat
 * @blkg: blkg of interest
 * @pol: blkcg_policy which contains the blkg_rwstat
 * @off: offset to the blkg_rwstat in blkg_policy_data or @blkg
 *
 * Collect the blkg_rwstat specified by @blkg, @pol and @off and all its
 * online descendants and their aux counts.  The caller must be holding the
 * queue lock for online tests.
 *
 * If @pol is NULL, blkg_rwstat is at @off bytes into @blkg; otherwise, it
 * is at @off bytes into @blkg's blkg_policy_data of the policy.
 */
struct blkg_rwstat blkg_rwstat_recursive_sum(struct blkcg_gq *blkg,
					     struct blkcg_policy *pol, int off)
{
	struct blkcg_gq *pos_blkg;
	struct cgroup_subsys_state *pos_css;
	struct blkg_rwstat sum = { };
	int i;

	lockdep_assert_held(blkg->q->queue_lock);

	rcu_read_lock();
	blkg_for_each_descendant_pre(pos_blkg, pos_css, blkg) {
		struct blkg_rwstat *rwstat;

		if (!pos_blkg->online)
			continue;

		if (pol)
			rwstat = (void *)blkg_to_pd(pos_blkg, pol) + off;
		else
			rwstat = (void *)pos_blkg + off;

		for (i = 0; i < BLKG_RWSTAT_NR; i++)
			atomic64_add(atomic64_read(&rwstat->aux_cnt[i]) +
				percpu_counter_sum_positive(&rwstat->cpu_cnt[i]),
				&sum.aux_cnt[i]);
	}
	rcu_read_unlock();

	return sum;
}
EXPORT_SYMBOL_GPL(blkg_rwstat_recursive_sum);
/* Performs queue bypass and policy enabled checks then looks up blkg. */
static struct blkcg_gq *blkg_lookup_check(struct blkcg *blkcg,
					  const struct blkcg_policy *pol,
					  struct request_queue *q)
{
	WARN_ON_ONCE(!rcu_read_lock_held());
	lockdep_assert_held(q->queue_lock);

	if (!blkcg_policy_enabled(q, pol))
		return ERR_PTR(-EOPNOTSUPP);

	/*
	 * This could be the first entry point of blkcg implementation and
	 * we shouldn't allow anything to go through for a bypassing queue.
	 */
	if (unlikely(blk_queue_bypass(q)))
		return ERR_PTR(blk_queue_dying(q) ? -ENODEV : -EBUSY);

	return __blkg_lookup(blkcg, q, true /* update_hint */);
}
/**
 * blkg_conf_prep - parse and prepare for per-blkg config update
 * @blkcg: target block cgroup
 * @pol: target policy
 * @input: input string
 * @ctx: blkg_conf_ctx to be filled
 *
 * Parse per-blkg config update from @input and initialize @ctx with the
 * result.  @ctx->blkg points to the blkg to be updated and @ctx->body the
 * part of @input following MAJ:MIN.  This function returns with RCU read
 * lock and queue lock held and must be paired with blkg_conf_finish().
 */
int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
		   char *input, struct blkg_conf_ctx *ctx)
	__acquires(rcu) __acquires(disk->queue->queue_lock)
{
	struct gendisk *disk;
	struct request_queue *q;
	struct blkcg_gq *blkg;
	unsigned int major, minor;
	int key_len, part, ret;
	char *body;

	if (sscanf(input, "%u:%u%n", &major, &minor, &key_len) != 2)
		return -EINVAL;

	body = input + key_len;
	if (!isspace(*body))
		return -EINVAL;
	body = skip_spaces(body);

	disk = get_gendisk(MKDEV(major, minor), &part);
	if (!disk)
		return -ENODEV;
	if (part) {
		ret = -ENODEV;
		goto fail;
	}

	q = disk->queue;

	rcu_read_lock();
	spin_lock_irq(q->queue_lock);

	blkg = blkg_lookup_check(blkcg, pol, q);
	if (IS_ERR(blkg)) {
		ret = PTR_ERR(blkg);
		goto fail_unlock;
	}

	if (blkg)
		goto success;

	/*
	 * Create blkgs walking down from blkcg_root to @blkcg, so that all
	 * non-root blkgs have access to their parents.
	 */
	while (true) {
		struct blkcg *pos = blkcg;
		struct blkcg *parent;
		struct blkcg_gq *new_blkg;

		parent = blkcg_parent(blkcg);
		while (parent && !__blkg_lookup(parent, q, false)) {
			pos = parent;
			parent = blkcg_parent(parent);
		}

		/* Drop locks to do new blkg allocation with GFP_KERNEL. */
		spin_unlock_irq(q->queue_lock);
		rcu_read_unlock();

		new_blkg = blkg_alloc(pos, q, GFP_KERNEL);
		if (unlikely(!new_blkg)) {
			ret = -ENOMEM;
			goto fail;
		}

		rcu_read_lock();
		spin_lock_irq(q->queue_lock);

		blkg = blkg_lookup_check(pos, pol, q);
		if (IS_ERR(blkg)) {
			ret = PTR_ERR(blkg);
			goto fail_unlock;
		}

		if (blkg) {
			blkg_free(new_blkg);
		} else {
			blkg = blkg_create(pos, q, new_blkg);
			if (unlikely(IS_ERR(blkg))) {
				ret = PTR_ERR(blkg);
				goto fail_unlock;
			}
		}

		if (pos == blkcg)
			goto success;
	}
success:
	ctx->disk = disk;
	ctx->blkg = blkg;
	ctx->body = body;
	return 0;

fail_unlock:
	spin_unlock_irq(q->queue_lock);
	rcu_read_unlock();
fail:
	put_disk_and_module(disk);
	/*
	 * If queue was bypassing, we should retry.  Do so after a
	 * short msleep().  It isn't strictly necessary but queue
	 * can be bypassing for some time and it's always nice to
	 * avoid busy looping.
	 */
	if (ret == -EBUSY) {
		msleep(10);
		ret = restart_syscall();
	}
	return ret;
}
EXPORT_SYMBOL_GPL(blkg_conf_prep);
/**
 * blkg_conf_finish - finish up per-blkg config update
 * @ctx: blkg_conf_ctx initialized by blkg_conf_prep()
 *
 * Finish up after per-blkg config update.  This function must be paired
 * with blkg_conf_prep().
 */
void blkg_conf_finish(struct blkg_conf_ctx *ctx)
	__releases(ctx->disk->queue->queue_lock) __releases(rcu)
{
	spin_unlock_irq(ctx->disk->queue->queue_lock);
	rcu_read_unlock();
	put_disk_and_module(ctx->disk);
}
EXPORT_SYMBOL_GPL(blkg_conf_finish);
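
/*
 * Example (editor's illustrative sketch, not part of the original file):
 * a policy's cftype write handler pairs the two helpers above.  Parsing
 * ctx.body (the part after MAJ:MIN) and updating the per-blkg data are
 * policy specific; "blkcg_policy_example" and example_update_limit() are
 * hypothetical.
 *
 *	static ssize_t example_set_limit(struct kernfs_open_file *of,
 *					 char *buf, size_t nbytes, loff_t off)
 *	{
 *		struct blkcg *blkcg = css_to_blkcg(of_css(of));
 *		struct blkg_conf_ctx ctx;
 *		u64 limit;
 *		int ret;
 *
 *		ret = blkg_conf_prep(blkcg, &blkcg_policy_example, buf, &ctx);
 *		if (ret)
 *			return ret;
 *
 *		if (sscanf(ctx.body, "%llu", &limit) == 1)
 *			example_update_limit(ctx.blkg, limit);
 *		else
 *			ret = -EINVAL;
 *
 *		blkg_conf_finish(&ctx);
 *		return ret ?: nbytes;
 *	}
 */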
static int blkcg_print_stat(struct seq_file *sf, void *v)
{
	struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
	struct blkcg_gq *blkg;

	rcu_read_lock();

	hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) {
		const char *dname;
		char *buf;
		struct blkg_rwstat rwstat;
		u64 rbytes, wbytes, rios, wios, dbytes, dios;
		size_t size = seq_get_buf(sf, &buf), off = 0;
		int i;
		bool has_stats = false;

		dname = blkg_dev_name(blkg);
		if (!dname)
			continue;

		/*
		 * Hooray string manipulation, count is the size written NOT
		 * INCLUDING THE \0, so size is now count+1 less than what we
		 * had before, but we want to start writing the next bit from
		 * the \0 so we only add count to buf.
		 */
		off += scnprintf(buf+off, size-off, "%s ", dname);

		spin_lock_irq(blkg->q->queue_lock);

		rwstat = blkg_rwstat_recursive_sum(blkg, NULL,
					offsetof(struct blkcg_gq, stat_bytes));
		rbytes = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_READ]);
		wbytes = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_WRITE]);
		dbytes = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_DISCARD]);

		rwstat = blkg_rwstat_recursive_sum(blkg, NULL,
					offsetof(struct blkcg_gq, stat_ios));
		rios = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_READ]);
		wios = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_WRITE]);
		dios = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_DISCARD]);

		spin_unlock_irq(blkg->q->queue_lock);

		if (rbytes || wbytes || rios || wios) {
			has_stats = true;
			off += scnprintf(buf+off, size-off,
					 "rbytes=%llu wbytes=%llu rios=%llu wios=%llu dbytes=%llu dios=%llu",
					 rbytes, wbytes, rios, wios,
					 dbytes, dios);
		}

		if (!blkcg_debug_stats)
			goto next;

		if (atomic_read(&blkg->use_delay)) {
			has_stats = true;
			off += scnprintf(buf+off, size-off,
					 " use_delay=%d delay_nsec=%llu",
					 atomic_read(&blkg->use_delay),
					 (unsigned long long)atomic64_read(&blkg->delay_nsec));
		}

		for (i = 0; i < BLKCG_MAX_POLS; i++) {
			struct blkcg_policy *pol = blkcg_policy[i];
			size_t written;

			if (!blkg->pd[i] || !pol->pd_stat_fn)
				continue;

			written = pol->pd_stat_fn(blkg->pd[i], buf+off, size-off);
			if (written)
				has_stats = true;
			off += written;
		}
next:
		if (has_stats) {
			off += scnprintf(buf+off, size-off, "\n");
			seq_commit(sf, off);
		}
	}

	rcu_read_unlock();
	return 0;
}
static struct cftype blkcg_files[] = {
	{
		.name = "stat",
		.flags = CFTYPE_NOT_ON_ROOT,
		.seq_show = blkcg_print_stat,
	},
	{ }	/* terminate */
};

static struct cftype blkcg_legacy_files[] = {
	{
		.name = "reset_stats",
		.write_u64 = blkcg_reset_stats,
	},
	{ }	/* terminate */
};
/*
 * blkcg destruction is a three-stage process.
 *
 * 1. Destruction starts.  The blkcg_css_offline() callback is invoked
 *    which offlines writeback.  Here we tie the next stage of blkg destruction
 *    to the completion of writeback associated with the blkcg.  This lets us
 *    avoid punting potentially large amounts of outstanding writeback to root
 *    while maintaining any ongoing policies.  The next stage is triggered when
 *    the nr_cgwbs count goes to zero.
 *
 * 2. When the nr_cgwbs count goes to zero, blkcg_destroy_blkgs() is called
 *    and handles the destruction of blkgs.  Here the css reference held by
 *    the blkg is put back eventually allowing blkcg_css_free() to be called.
 *    This work may occur in cgwb_release_workfn() on the cgwb_release
 *    workqueue.  Any submitted ios that fail to get the blkg ref will be
 *    punted to the root_blkg.
 *
 * 3. Once the blkcg ref count goes to zero, blkcg_css_free() is called.
 *    This finally frees the blkcg.
 */

/**
 * blkcg_css_offline - cgroup css_offline callback
 * @css: css of interest
 *
 * This function is called when @css is about to go away.  Here the cgwbs are
 * offlined first and only once writeback associated with the blkcg has
 * finished do we start step 2 (see above).
 */
static void blkcg_css_offline(struct cgroup_subsys_state *css)
{
	struct blkcg *blkcg = css_to_blkcg(css);

	/* this prevents anyone from attaching or migrating to this blkcg */
	wb_blkcg_offline(blkcg);

	/* put the base cgwb reference allowing step 2 to be triggered */
	blkcg_cgwb_put(blkcg);
}
/**
 * blkcg_destroy_blkgs - responsible for shooting down blkgs
 * @blkcg: blkcg of interest
 *
 * blkgs should be removed while holding both q and blkcg locks.  As blkcg lock
 * is nested inside q lock, this function performs reverse double lock dancing.
 * Destroying the blkgs releases the reference held on the blkcg's css allowing
 * blkcg_css_free to eventually be called.
 *
 * This is the blkcg counterpart of ioc_release_fn().
 */
void blkcg_destroy_blkgs(struct blkcg *blkcg)
{
	spin_lock_irq(&blkcg->lock);

	while (!hlist_empty(&blkcg->blkg_list)) {
		struct blkcg_gq *blkg = hlist_entry(blkcg->blkg_list.first,
						    struct blkcg_gq, blkcg_node);
		struct request_queue *q = blkg->q;

		if (spin_trylock(q->queue_lock)) {
			blkg_destroy(blkg);
			spin_unlock(q->queue_lock);
		} else {
			spin_unlock_irq(&blkcg->lock);
			cpu_relax();
			spin_lock_irq(&blkcg->lock);
		}
	}

	spin_unlock_irq(&blkcg->lock);
}

static void blkcg_css_free(struct cgroup_subsys_state *css)
{
	struct blkcg *blkcg = css_to_blkcg(css);
	int i;

	mutex_lock(&blkcg_pol_mutex);

	list_del(&blkcg->all_blkcgs_node);

	for (i = 0; i < BLKCG_MAX_POLS; i++)
		if (blkcg->cpd[i])
			blkcg_policy[i]->cpd_free_fn(blkcg->cpd[i]);

	mutex_unlock(&blkcg_pol_mutex);

	kfree(blkcg);
}
static struct cgroup_subsys_state *
blkcg_css_alloc(struct cgroup_subsys_state *parent_css)
{
	struct blkcg *blkcg;
	struct cgroup_subsys_state *ret;
	int i;

	mutex_lock(&blkcg_pol_mutex);

	if (!parent_css) {
		blkcg = &blkcg_root;
	} else {
		blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL);
		if (!blkcg) {
			ret = ERR_PTR(-ENOMEM);
			goto unlock;
		}
	}

	for (i = 0; i < BLKCG_MAX_POLS; i++) {
		struct blkcg_policy *pol = blkcg_policy[i];
		struct blkcg_policy_data *cpd;

		/*
		 * If the policy hasn't been attached yet, wait for it
		 * to be attached before doing anything else. Otherwise,
		 * check if the policy requires any specific per-cgroup
		 * data: if it does, allocate and initialize it.
		 */
		if (!pol || !pol->cpd_alloc_fn)
			continue;

		cpd = pol->cpd_alloc_fn(GFP_KERNEL);
		if (!cpd) {
			ret = ERR_PTR(-ENOMEM);
			goto free_pd_blkcg;
		}
		blkcg->cpd[i] = cpd;
		cpd->blkcg = blkcg;
		cpd->plid = i;
		if (pol->cpd_init_fn)
			pol->cpd_init_fn(cpd);
	}

	spin_lock_init(&blkcg->lock);
	INIT_RADIX_TREE(&blkcg->blkg_tree, GFP_NOWAIT | __GFP_NOWARN);
	INIT_HLIST_HEAD(&blkcg->blkg_list);
#ifdef CONFIG_CGROUP_WRITEBACK
	INIT_LIST_HEAD(&blkcg->cgwb_list);
	refcount_set(&blkcg->cgwb_refcnt, 1);
#endif
	list_add_tail(&blkcg->all_blkcgs_node, &all_blkcgs);

	mutex_unlock(&blkcg_pol_mutex);
	return &blkcg->css;

free_pd_blkcg:
	for (i--; i >= 0; i--)
		if (blkcg->cpd[i])
			blkcg_policy[i]->cpd_free_fn(blkcg->cpd[i]);

	if (blkcg != &blkcg_root)
		kfree(blkcg);
unlock:
	mutex_unlock(&blkcg_pol_mutex);
	return ret;
}
/**
 * blkcg_init_queue - initialize blkcg part of request queue
 * @q: request_queue to initialize
 *
 * Called from blk_alloc_queue_node(). Responsible for initializing blkcg
 * part of new request_queue @q.
 *
 * RETURNS:
 * 0 on success, -errno on failure.
 */
int blkcg_init_queue(struct request_queue *q)
{
	struct blkcg_gq *new_blkg, *blkg;
	bool preloaded;
	int ret;

	new_blkg = blkg_alloc(&blkcg_root, q, GFP_KERNEL);
	if (!new_blkg)
		return -ENOMEM;

	preloaded = !radix_tree_preload(GFP_KERNEL);

	/* Make sure the root blkg exists. */
	rcu_read_lock();
	spin_lock_irq(q->queue_lock);
	blkg = blkg_create(&blkcg_root, q, new_blkg);
	if (IS_ERR(blkg))
		goto err_unlock;
	q->root_blkg = blkg;
	q->root_rl.blkg = blkg;
	spin_unlock_irq(q->queue_lock);
	rcu_read_unlock();

	if (preloaded)
		radix_tree_preload_end();

	ret = blk_iolatency_init(q);
	if (ret) {
		spin_lock_irq(q->queue_lock);
		blkg_destroy_all(q);
		spin_unlock_irq(q->queue_lock);
		return ret;
	}

	ret = blk_throtl_init(q);
	if (ret) {
		spin_lock_irq(q->queue_lock);
		blkg_destroy_all(q);
		spin_unlock_irq(q->queue_lock);
	}
	return ret;

err_unlock:
	spin_unlock_irq(q->queue_lock);
	rcu_read_unlock();
	if (preloaded)
		radix_tree_preload_end();
	return PTR_ERR(blkg);
}
/**
 * blkcg_drain_queue - drain blkcg part of request_queue
 * @q: request_queue to drain
 *
 * Called from blk_drain_queue().  Responsible for draining blkcg part.
 */
void blkcg_drain_queue(struct request_queue *q)
{
	lockdep_assert_held(q->queue_lock);

	/*
	 * @q could be exiting and already have destroyed all blkgs as
	 * indicated by NULL root_blkg.  If so, don't confuse policies.
	 */
	if (!q->root_blkg)
		return;

	blk_throtl_drain(q);
}

/**
 * blkcg_exit_queue - exit and release blkcg part of request_queue
 * @q: request_queue being released
 *
 * Called from blk_release_queue().  Responsible for exiting blkcg part.
 */
void blkcg_exit_queue(struct request_queue *q)
{
	spin_lock_irq(q->queue_lock);
	blkg_destroy_all(q);
	spin_unlock_irq(q->queue_lock);

	blk_throtl_exit(q);
}
/*
 * We cannot support shared io contexts, as we have no means to support
 * two tasks with the same ioc in two different groups without major rework
 * of the main cic data structures.  For now we allow a task to change
 * its cgroup only if it's the only owner of its ioc.
 */
static int blkcg_can_attach(struct cgroup_taskset *tset)
{
	struct task_struct *task;
	struct cgroup_subsys_state *dst_css;
	struct io_context *ioc;
	int ret = 0;

	/* task_lock() is needed to avoid races with exit_io_context() */
	cgroup_taskset_for_each(task, dst_css, tset) {
		task_lock(task);
		ioc = task->io_context;
		if (ioc && atomic_read(&ioc->nr_tasks) > 1)
			ret = -EINVAL;
		task_unlock(task);
		if (ret)
			break;
	}
	return ret;
}
static void blkcg_bind(struct cgroup_subsys_state *root_css)
{
	int i;

	mutex_lock(&blkcg_pol_mutex);

	for (i = 0; i < BLKCG_MAX_POLS; i++) {
		struct blkcg_policy *pol = blkcg_policy[i];
		struct blkcg *blkcg;

		if (!pol || !pol->cpd_bind_fn)
			continue;

		list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node)
			if (blkcg->cpd[pol->plid])
				pol->cpd_bind_fn(blkcg->cpd[pol->plid]);
	}
	mutex_unlock(&blkcg_pol_mutex);
}

static void blkcg_exit(struct task_struct *tsk)
{
	if (tsk->throttle_queue)
		blk_put_queue(tsk->throttle_queue);
	tsk->throttle_queue = NULL;
}

struct cgroup_subsys io_cgrp_subsys = {
	.css_alloc = blkcg_css_alloc,
	.css_offline = blkcg_css_offline,
	.css_free = blkcg_css_free,
	.can_attach = blkcg_can_attach,
	.bind = blkcg_bind,
	.dfl_cftypes = blkcg_files,
	.legacy_cftypes = blkcg_legacy_files,
	.legacy_name = "blkio",
	.exit = blkcg_exit,
#ifdef CONFIG_MEMCG
	/*
	 * This ensures that, if available, memcg is automatically enabled
	 * together on the default hierarchy so that the owner cgroup can
	 * be retrieved from writeback pages.
	 */
	.depends_on = 1 << memory_cgrp_id,
#endif
};
EXPORT_SYMBOL_GPL(io_cgrp_subsys);
/**
 * blkcg_activate_policy - activate a blkcg policy on a request_queue
 * @q: request_queue of interest
 * @pol: blkcg policy to activate
 *
 * Activate @pol on @q.  Requires %GFP_KERNEL context.  @q goes through
 * bypass mode to populate its blkgs with policy_data for @pol.
 *
 * Activation happens with @q bypassed, so nobody would be accessing blkgs
 * from IO path.  Update of each blkg is protected by both queue and blkcg
 * locks so that holding either lock and testing blkcg_policy_enabled() is
 * always enough for dereferencing policy data.
 *
 * The caller is responsible for synchronizing [de]activations and policy
 * [un]registrations.  Returns 0 on success, -errno on failure.
 */
int blkcg_activate_policy(struct request_queue *q,
			  const struct blkcg_policy *pol)
{
	struct blkg_policy_data *pd_prealloc = NULL;
	struct blkcg_gq *blkg;
	int ret;

	if (blkcg_policy_enabled(q, pol))
		return 0;

	if (q->mq_ops)
		blk_mq_freeze_queue(q);
	else
		blk_queue_bypass_start(q);
pd_prealloc:
	if (!pd_prealloc) {
		pd_prealloc = pol->pd_alloc_fn(GFP_KERNEL, q->node);
		if (!pd_prealloc) {
			ret = -ENOMEM;
			goto out_bypass_end;
		}
	}

	spin_lock_irq(q->queue_lock);

	list_for_each_entry(blkg, &q->blkg_list, q_node) {
		struct blkg_policy_data *pd;

		if (blkg->pd[pol->plid])
			continue;

		pd = pol->pd_alloc_fn(GFP_NOWAIT | __GFP_NOWARN, q->node);
		if (!pd)
			swap(pd, pd_prealloc);
		if (!pd) {
			spin_unlock_irq(q->queue_lock);
			goto pd_prealloc;
		}

		blkg->pd[pol->plid] = pd;
		pd->blkg = blkg;
		pd->plid = pol->plid;
		if (pol->pd_init_fn)
			pol->pd_init_fn(pd);
	}

	__set_bit(pol->plid, q->blkcg_pols);
	ret = 0;

	spin_unlock_irq(q->queue_lock);
out_bypass_end:
	if (q->mq_ops)
		blk_mq_unfreeze_queue(q);
	else
		blk_queue_bypass_end(q);
	if (pd_prealloc)
		pol->pd_free_fn(pd_prealloc);
	return ret;
}
EXPORT_SYMBOL_GPL(blkcg_activate_policy);
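
/*
 * Example (editor's illustrative sketch, not part of the original file):
 * an elevator or IO controller typically activates its policy while
 * setting up a queue and deactivates it on teardown.  The function names
 * and "blkcg_policy_example" are hypothetical.
 *
 *	static int example_init_queue(struct request_queue *q)
 *	{
 *		return blkcg_activate_policy(q, &blkcg_policy_example);
 *	}
 *
 *	static void example_exit_queue(struct request_queue *q)
 *	{
 *		blkcg_deactivate_policy(q, &blkcg_policy_example);
 *	}
 */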
/**
 * blkcg_deactivate_policy - deactivate a blkcg policy on a request_queue
 * @q: request_queue of interest
 * @pol: blkcg policy to deactivate
 *
 * Deactivate @pol on @q.  Follows the same synchronization rules as
 * blkcg_activate_policy().
 */
void blkcg_deactivate_policy(struct request_queue *q,
			     const struct blkcg_policy *pol)
{
	struct blkcg_gq *blkg;

	if (!blkcg_policy_enabled(q, pol))
		return;

	if (q->mq_ops)
		blk_mq_freeze_queue(q);
	else
		blk_queue_bypass_start(q);

	spin_lock_irq(q->queue_lock);

	__clear_bit(pol->plid, q->blkcg_pols);

	list_for_each_entry(blkg, &q->blkg_list, q_node) {
		if (blkg->pd[pol->plid]) {
			if (pol->pd_offline_fn)
				pol->pd_offline_fn(blkg->pd[pol->plid]);
			pol->pd_free_fn(blkg->pd[pol->plid]);
			blkg->pd[pol->plid] = NULL;
		}
	}

	spin_unlock_irq(q->queue_lock);

	if (q->mq_ops)
		blk_mq_unfreeze_queue(q);
	else
		blk_queue_bypass_end(q);
}
EXPORT_SYMBOL_GPL(blkcg_deactivate_policy);
/**
 * blkcg_policy_register - register a blkcg policy
 * @pol: blkcg policy to register
 *
 * Register @pol with blkcg core.  Might sleep and @pol may be modified on
 * successful registration.  Returns 0 on success and -errno on failure.
 */
int blkcg_policy_register(struct blkcg_policy *pol)
{
	struct blkcg *blkcg;
	int i, ret;

	mutex_lock(&blkcg_pol_register_mutex);
	mutex_lock(&blkcg_pol_mutex);

	/* find an empty slot */
	ret = -ENOSPC;
	for (i = 0; i < BLKCG_MAX_POLS; i++)
		if (!blkcg_policy[i])
			break;
	if (i >= BLKCG_MAX_POLS)
		goto err_unlock;

	/* Make sure cpd/pd_alloc_fn and cpd/pd_free_fn in pairs */
	if ((!pol->cpd_alloc_fn ^ !pol->cpd_free_fn) ||
	    (!pol->pd_alloc_fn ^ !pol->pd_free_fn))
		goto err_unlock;

	/* register @pol */
	pol->plid = i;
	blkcg_policy[pol->plid] = pol;

	/* allocate and install cpd's */
	if (pol->cpd_alloc_fn) {
		list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) {
			struct blkcg_policy_data *cpd;

			cpd = pol->cpd_alloc_fn(GFP_KERNEL);
			if (!cpd)
				goto err_free_cpds;

			blkcg->cpd[pol->plid] = cpd;
			cpd->blkcg = blkcg;
			cpd->plid = pol->plid;
			pol->cpd_init_fn(cpd);
		}
	}

	mutex_unlock(&blkcg_pol_mutex);

	/* everything is in place, add intf files for the new policy */
	if (pol->dfl_cftypes)
		WARN_ON(cgroup_add_dfl_cftypes(&io_cgrp_subsys,
					       pol->dfl_cftypes));
	if (pol->legacy_cftypes)
		WARN_ON(cgroup_add_legacy_cftypes(&io_cgrp_subsys,
						  pol->legacy_cftypes));
	mutex_unlock(&blkcg_pol_register_mutex);
	return 0;

err_free_cpds:
	if (pol->cpd_free_fn) {
		list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) {
			if (blkcg->cpd[pol->plid]) {
				pol->cpd_free_fn(blkcg->cpd[pol->plid]);
				blkcg->cpd[pol->plid] = NULL;
			}
		}
	}
	blkcg_policy[pol->plid] = NULL;
err_unlock:
	mutex_unlock(&blkcg_pol_mutex);
	mutex_unlock(&blkcg_pol_register_mutex);
	return ret;
}
EXPORT_SYMBOL_GPL(blkcg_policy_register);
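
/*
 * Example (editor's illustrative sketch, not part of the original file):
 * a minimal policy definition registered from module init.  The callbacks
 * and cftype arrays are hypothetical; only the alloc/free pairing rule
 * checked above is taken from this file.
 *
 *	static struct blkcg_policy blkcg_policy_example = {
 *		.dfl_cftypes	= example_dfl_files,
 *		.legacy_cftypes	= example_legacy_files,
 *		.pd_alloc_fn	= example_pd_alloc,
 *		.pd_init_fn	= example_pd_init,
 *		.pd_free_fn	= example_pd_free,
 *	};
 *
 *	static int __init example_init(void)
 *	{
 *		return blkcg_policy_register(&blkcg_policy_example);
 *	}
 *
 *	static void __exit example_exit(void)
 *	{
 *		blkcg_policy_unregister(&blkcg_policy_example);
 *	}
 *	module_init(example_init);
 *	module_exit(example_exit);
 */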
/**
 * blkcg_policy_unregister - unregister a blkcg policy
 * @pol: blkcg policy to unregister
 *
 * Undo blkcg_policy_register(@pol).  Might sleep.
 */
void blkcg_policy_unregister(struct blkcg_policy *pol)
{
	struct blkcg *blkcg;

	mutex_lock(&blkcg_pol_register_mutex);

	if (WARN_ON(blkcg_policy[pol->plid] != pol))
		goto out_unlock;

	/* kill the intf files first */
	if (pol->dfl_cftypes)
		cgroup_rm_cftypes(pol->dfl_cftypes);
	if (pol->legacy_cftypes)
		cgroup_rm_cftypes(pol->legacy_cftypes);

	/* remove cpds and unregister */
	mutex_lock(&blkcg_pol_mutex);

	if (pol->cpd_free_fn) {
		list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) {
			if (blkcg->cpd[pol->plid]) {
				pol->cpd_free_fn(blkcg->cpd[pol->plid]);
				blkcg->cpd[pol->plid] = NULL;
			}
		}
	}
	blkcg_policy[pol->plid] = NULL;

	mutex_unlock(&blkcg_pol_mutex);
out_unlock:
	mutex_unlock(&blkcg_pol_register_mutex);
}
EXPORT_SYMBOL_GPL(blkcg_policy_unregister);
/*
 * Scale the accumulated delay based on how long it has been since we updated
 * the delay.  We only call this when we are adding delay, in case it's been a
 * while since we added delay, and when we are checking to see if we need to
 * delay a task, to account for any delays that may have occurred.
 */
static void blkcg_scale_delay(struct blkcg_gq *blkg, u64 now)
{
	u64 old = atomic64_read(&blkg->delay_start);

	/*
	 * We only want to scale down every second.  The idea here is that we
	 * want to delay people for min(delay_nsec, NSEC_PER_SEC) in a certain
	 * time window.  We only want to throttle tasks for recent delay that
	 * has occurred, in 1 second time windows since that's the maximum
	 * things can be throttled.  We save the current delay window in
	 * blkg->last_delay so we know what amount is still left to be charged
	 * to the blkg from this point onward.  blkg->last_use keeps track of
	 * the use_delay counter.  The idea is if we're unthrottling the blkg we
	 * are ok with whatever is happening now, and we can take away more of
	 * the accumulated delay as we've already throttled enough that
	 * everybody is happy with their IO latencies.
	 */
	if (time_before64(old + NSEC_PER_SEC, now) &&
	    atomic64_cmpxchg(&blkg->delay_start, old, now) == old) {
		u64 cur = atomic64_read(&blkg->delay_nsec);
		u64 sub = min_t(u64, blkg->last_delay, now - old);
		int cur_use = atomic_read(&blkg->use_delay);

		/*
		 * We've been unthrottled, subtract a larger chunk of our
		 * accumulated delay.
		 */
		if (cur_use < blkg->last_use)
			sub = max_t(u64, sub, blkg->last_delay >> 1);

		/*
		 * This shouldn't happen, but handle it anyway.  Our delay_nsec
		 * should only ever be growing except here where we subtract out
		 * min(last_delay, 1 second), but lord knows bugs happen and I'd
		 * rather not end up with negative numbers.
		 */
		if (unlikely(cur < sub)) {
			atomic64_set(&blkg->delay_nsec, 0);
			blkg->last_delay = 0;
		} else {
			atomic64_sub(sub, &blkg->delay_nsec);
			blkg->last_delay = cur - sub;
		}
		blkg->last_use = cur_use;
	}
}
/*
 * This is called when we want to actually walk up the hierarchy and check to
 * see if we need to throttle, and then actually throttle if there is some
 * accumulated delay.  This should only be called upon return to user space so
 * we're not holding some lock that would induce a priority inversion.
 */
static void blkcg_maybe_throttle_blkg(struct blkcg_gq *blkg, bool use_memdelay)
{
	u64 now = ktime_to_ns(ktime_get());
	u64 exp;
	u64 delay_nsec = 0;
	int tok;

	while (blkg->parent) {
		if (atomic_read(&blkg->use_delay)) {
			blkcg_scale_delay(blkg, now);
			delay_nsec = max_t(u64, delay_nsec,
					   atomic64_read(&blkg->delay_nsec));
		}
		blkg = blkg->parent;
	}

	if (!delay_nsec)
		return;

	/*
	 * Let's not sleep for all eternity if we've amassed a huge delay.
	 * Swapping or metadata IO can accumulate 10's of seconds worth of
	 * delay, and we want userspace to be able to do _something_ so cap
	 * the delays at 0.25s.  If there's 10's of seconds worth of delay
	 * then the tasks will be delayed for 0.25 seconds for every syscall.
	 */
	delay_nsec = min_t(u64, delay_nsec, 250 * NSEC_PER_MSEC);

	/*
	 * TODO: the use_memdelay flag is going to be for the upcoming psi stuff
	 * that hasn't landed upstream yet.  Once that stuff is in place we need
	 * to do a psi_memstall_enter/leave if memdelay is set.
	 */

	exp = ktime_add_ns(now, delay_nsec);
	tok = io_schedule_prepare();
	do {
		__set_current_state(TASK_KILLABLE);
		if (!schedule_hrtimeout(&exp, HRTIMER_MODE_ABS))
			break;
	} while (!fatal_signal_pending(current));
	io_schedule_finish(tok);
}
/**
 * blkcg_maybe_throttle_current - throttle the current task if it has been marked
 *
 * This is only called if we've been marked with set_notify_resume().  Obviously
 * we can be set_notify_resume() for reasons other than blkcg throttling, so we
 * check to see if current->throttle_queue is set and if not this doesn't do
 * anything.  This should only ever be called by the resume code; it's not meant
 * to be called by people willy-nilly as it will actually do the work to
 * throttle the task if it is set up for throttling.
 */
void blkcg_maybe_throttle_current(void)
{
	struct request_queue *q = current->throttle_queue;
	struct cgroup_subsys_state *css;
	struct blkcg *blkcg;
	struct blkcg_gq *blkg;
	bool use_memdelay = current->use_memdelay;

	if (!q)
		return;

	current->throttle_queue = NULL;
	current->use_memdelay = false;

	rcu_read_lock();
	css = kthread_blkcg();
	if (css)
		blkcg = css_to_blkcg(css);
	else
		blkcg = css_to_blkcg(task_css(current, io_cgrp_id));

	if (!blkcg)
		goto out;
	blkg = blkg_lookup(blkcg, q);
	if (!blkg)
		goto out;
	blkg = blkg_try_get(blkg);
	if (!blkg)
		goto out;
	rcu_read_unlock();

	blkcg_maybe_throttle_blkg(blkg, use_memdelay);
	blkg_put(blkg);
	blk_put_queue(q);
	return;
out:
	rcu_read_unlock();
	blk_put_queue(q);
}
EXPORT_SYMBOL_GPL(blkcg_maybe_throttle_current);
/**
 * blkcg_schedule_throttle - this task needs to check for throttling
 * @q - the request queue IO was submitted on
 * @use_memdelay - do we charge this to memory delay for PSI
 *
 * This is called by the IO controller when we know there's delay accumulated
 * for the blkg for this task.  We do not pass the blkg because there are places
 * we call this that may not have that information, the swapping code for
 * instance will only have a request_queue at that point.  This sets the
 * notify_resume for the task to check and see if it requires throttling before
 * returning to user space.
 *
 * We will only schedule once per syscall.  You can call this over and over
 * again and it will only do the check once upon return to user space, and only
 * throttle once.  If the task needs to be throttled again it'll need to be
 * re-set at the next time we see the task.
 */
void blkcg_schedule_throttle(struct request_queue *q, bool use_memdelay)
{
	if (unlikely(current->flags & PF_KTHREAD))
		return;

	if (!blk_get_queue(q))
		return;

	if (current->throttle_queue)
		blk_put_queue(current->throttle_queue);
	current->throttle_queue = q;
	if (use_memdelay)
		current->use_memdelay = use_memdelay;
	set_notify_resume(current);
}
EXPORT_SYMBOL_GPL(blkcg_schedule_throttle);
/**
 * blkcg_add_delay - add delay to this blkg
 * @blkg - blkg of interest
 * @now - the current time in nanoseconds
 * @delta - how many nanoseconds of delay to add
 *
 * Charge @delta to the blkg's current delay accumulation.  This is used to
 * throttle tasks if an IO controller thinks we need more throttling.
 */
void blkcg_add_delay(struct blkcg_gq *blkg, u64 now, u64 delta)
{
	blkcg_scale_delay(blkg, now);
	atomic64_add(delta, &blkg->delay_nsec);
}
EXPORT_SYMBOL_GPL(blkcg_add_delay);
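
/*
 * Example (editor's illustrative sketch, not part of the original file):
 * an IO controller that has measured latency above its target charges the
 * overshoot to the blkg and asks the offending task to throttle itself on
 * its way back to user space.  example_check_latency() is hypothetical.
 *
 *	static void example_check_latency(struct blkcg_gq *blkg,
 *					  struct request_queue *q,
 *					  u64 lat_nsec, u64 target_nsec)
 *	{
 *		u64 now = ktime_to_ns(ktime_get());
 *
 *		if (lat_nsec <= target_nsec)
 *			return;
 *
 *		blkcg_add_delay(blkg, now, lat_nsec - target_nsec);
 *		blkcg_schedule_throttle(q, false);
 *	}
 */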
module_param(blkcg_debug_stats, bool, 0644);
MODULE_PARM_DESC(blkcg_debug_stats, "True if you want debug stats, false if not");