bfq-iosched.h 35 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034
  1. /*
  2. * Header file for the BFQ I/O scheduler: data structures and
  3. * prototypes of interface functions among BFQ components.
  4. *
  5. * This program is free software; you can redistribute it and/or
  6. * modify it under the terms of the GNU General Public License as
  7. * published by the Free Software Foundation; either version 2 of the
  8. * License, or (at your option) any later version.
  9. *
  10. * This program is distributed in the hope that it will be useful,
  11. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  12. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  13. * General Public License for more details.
  14. */
  15. #ifndef _BFQ_H
  16. #define _BFQ_H
  17. #include <linux/blktrace_api.h>
  18. #include <linux/hrtimer.h>
  19. #include <linux/blk-cgroup.h>
  20. #define BFQ_IOPRIO_CLASSES 3
  21. #define BFQ_CL_IDLE_TIMEOUT (HZ/5)
  22. #define BFQ_MIN_WEIGHT 1
  23. #define BFQ_MAX_WEIGHT 1000
  24. #define BFQ_WEIGHT_CONVERSION_COEFF 10
  25. #define BFQ_DEFAULT_QUEUE_IOPRIO 4
  26. #define BFQ_WEIGHT_LEGACY_DFL 100
  27. #define BFQ_DEFAULT_GRP_IOPRIO 0
  28. #define BFQ_DEFAULT_GRP_CLASS IOPRIO_CLASS_BE
  29. /*
  30. * Soft real-time applications are extremely more latency sensitive
  31. * than interactive ones. Over-raise the weight of the former to
  32. * privilege them against the latter.
  33. */
  34. #define BFQ_SOFTRT_WEIGHT_FACTOR 100
  35. struct bfq_entity;
  36. /**
  37. * struct bfq_service_tree - per ioprio_class service tree.
  38. *
  39. * Each service tree represents a B-WF2Q+ scheduler on its own. Each
  40. * ioprio_class has its own independent scheduler, and so its own
  41. * bfq_service_tree. All the fields are protected by the queue lock
  42. * of the containing bfqd.
  43. */
  44. struct bfq_service_tree {
  45. /* tree for active entities (i.e., those backlogged) */
  46. struct rb_root active;
  47. /* tree for idle entities (i.e., not backlogged, with V < F_i)*/
  48. struct rb_root idle;
  49. /* idle entity with minimum F_i */
  50. struct bfq_entity *first_idle;
  51. /* idle entity with maximum F_i */
  52. struct bfq_entity *last_idle;
  53. /* scheduler virtual time */
  54. u64 vtime;
  55. /* scheduler weight sum; active and idle entities contribute to it */
  56. unsigned long wsum;
  57. };
  58. /**
  59. * struct bfq_sched_data - multi-class scheduler.
  60. *
  61. * bfq_sched_data is the basic scheduler queue. It supports three
  62. * ioprio_classes, and can be used either as a toplevel queue or as an
  63. * intermediate queue in a hierarchical setup.
  64. *
  65. * The supported ioprio_classes are the same as in CFQ, in descending
  66. * priority order, IOPRIO_CLASS_RT, IOPRIO_CLASS_BE, IOPRIO_CLASS_IDLE.
  67. * Requests from higher priority queues are served before all the
  68. * requests from lower priority queues; among requests of the same
  69. * queue requests are served according to B-WF2Q+.
  70. *
  71. * The schedule is implemented by the service trees, plus the field
  72. * @next_in_service, which points to the entity on the active trees
  73. * that will be served next, if 1) no changes in the schedule occurs
  74. * before the current in-service entity is expired, 2) the in-service
  75. * queue becomes idle when it expires, and 3) if the entity pointed by
  76. * in_service_entity is not a queue, then the in-service child entity
  77. * of the entity pointed by in_service_entity becomes idle on
  78. * expiration. This peculiar definition allows for the following
  79. * optimization, not yet exploited: while a given entity is still in
  80. * service, we already know which is the best candidate for next
  81. * service among the other active entitities in the same parent
  82. * entity. We can then quickly compare the timestamps of the
  83. * in-service entity with those of such best candidate.
  84. *
  85. * All fields are protected by the lock of the containing bfqd.
  86. */
  87. struct bfq_sched_data {
  88. /* entity in service */
  89. struct bfq_entity *in_service_entity;
  90. /* head-of-line entity (see comments above) */
  91. struct bfq_entity *next_in_service;
  92. /* array of service trees, one per ioprio_class */
  93. struct bfq_service_tree service_tree[BFQ_IOPRIO_CLASSES];
  94. /* last time CLASS_IDLE was served */
  95. unsigned long bfq_class_idle_last_service;
  96. };
  97. /**
  98. * struct bfq_weight_counter - counter of the number of all active queues
  99. * with a given weight.
  100. */
  101. struct bfq_weight_counter {
  102. unsigned int weight; /* weight of the queues this counter refers to */
  103. unsigned int num_active; /* nr of active queues with this weight */
  104. /*
  105. * Weights tree member (see bfq_data's @queue_weights_tree)
  106. */
  107. struct rb_node weights_node;
  108. };
  109. /**
  110. * struct bfq_entity - schedulable entity.
  111. *
  112. * A bfq_entity is used to represent either a bfq_queue (leaf node in the
  113. * cgroup hierarchy) or a bfq_group into the upper level scheduler. Each
  114. * entity belongs to the sched_data of the parent group in the cgroup
  115. * hierarchy. Non-leaf entities have also their own sched_data, stored
  116. * in @my_sched_data.
  117. *
  118. * Each entity stores independently its priority values; this would
  119. * allow different weights on different devices, but this
  120. * functionality is not exported to userspace by now. Priorities and
  121. * weights are updated lazily, first storing the new values into the
  122. * new_* fields, then setting the @prio_changed flag. As soon as
  123. * there is a transition in the entity state that allows the priority
  124. * update to take place the effective and the requested priority
  125. * values are synchronized.
  126. *
  127. * Unless cgroups are used, the weight value is calculated from the
  128. * ioprio to export the same interface as CFQ. When dealing with
  129. * ``well-behaved'' queues (i.e., queues that do not spend too much
  130. * time to consume their budget and have true sequential behavior, and
  131. * when there are no external factors breaking anticipation) the
  132. * relative weights at each level of the cgroups hierarchy should be
  133. * guaranteed. All the fields are protected by the queue lock of the
  134. * containing bfqd.
  135. */
  136. struct bfq_entity {
  137. /* service_tree member */
  138. struct rb_node rb_node;
  139. /*
  140. * Flag, true if the entity is on a tree (either the active or
  141. * the idle one of its service_tree) or is in service.
  142. */
  143. bool on_st;
  144. /* B-WF2Q+ start and finish timestamps [sectors/weight] */
  145. u64 start, finish;
  146. /* tree the entity is enqueued into; %NULL if not on a tree */
  147. struct rb_root *tree;
  148. /*
  149. * minimum start time of the (active) subtree rooted at this
  150. * entity; used for O(log N) lookups into active trees
  151. */
  152. u64 min_start;
  153. /* amount of service received during the last service slot */
  154. int service;
  155. /* budget, used also to calculate F_i: F_i = S_i + @budget / @weight */
  156. int budget;
  157. /* weight of the queue */
  158. int weight;
  159. /* next weight if a change is in progress */
  160. int new_weight;
  161. /* original weight, used to implement weight boosting */
  162. int orig_weight;
  163. /* parent entity, for hierarchical scheduling */
  164. struct bfq_entity *parent;
  165. /*
  166. * For non-leaf nodes in the hierarchy, the associated
  167. * scheduler queue, %NULL on leaf nodes.
  168. */
  169. struct bfq_sched_data *my_sched_data;
  170. /* the scheduler queue this entity belongs to */
  171. struct bfq_sched_data *sched_data;
  172. /* flag, set to request a weight, ioprio or ioprio_class change */
  173. int prio_changed;
  174. /* flag, set if the entity is counted in groups_with_pending_reqs */
  175. bool in_groups_with_pending_reqs;
  176. };
  177. struct bfq_group;
  178. /**
  179. * struct bfq_ttime - per process thinktime stats.
  180. */
  181. struct bfq_ttime {
  182. /* completion time of the last request */
  183. u64 last_end_request;
  184. /* total process thinktime */
  185. u64 ttime_total;
  186. /* number of thinktime samples */
  187. unsigned long ttime_samples;
  188. /* average process thinktime */
  189. u64 ttime_mean;
  190. };
  191. /**
  192. * struct bfq_queue - leaf schedulable entity.
  193. *
  194. * A bfq_queue is a leaf request queue; it can be associated with an
  195. * io_context or more, if it is async or shared between cooperating
  196. * processes. @cgroup holds a reference to the cgroup, to be sure that it
  197. * does not disappear while a bfqq still references it (mostly to avoid
  198. * races between request issuing and task migration followed by cgroup
  199. * destruction).
  200. * All the fields are protected by the queue lock of the containing bfqd.
  201. */
  202. struct bfq_queue {
  203. /* reference counter */
  204. int ref;
  205. /* parent bfq_data */
  206. struct bfq_data *bfqd;
  207. /* current ioprio and ioprio class */
  208. unsigned short ioprio, ioprio_class;
  209. /* next ioprio and ioprio class if a change is in progress */
  210. unsigned short new_ioprio, new_ioprio_class;
  211. /*
  212. * Shared bfq_queue if queue is cooperating with one or more
  213. * other queues.
  214. */
  215. struct bfq_queue *new_bfqq;
  216. /* request-position tree member (see bfq_group's @rq_pos_tree) */
  217. struct rb_node pos_node;
  218. /* request-position tree root (see bfq_group's @rq_pos_tree) */
  219. struct rb_root *pos_root;
  220. /* sorted list of pending requests */
  221. struct rb_root sort_list;
  222. /* if fifo isn't expired, next request to serve */
  223. struct request *next_rq;
  224. /* number of sync and async requests queued */
  225. int queued[2];
  226. /* number of requests currently allocated */
  227. int allocated;
  228. /* number of pending metadata requests */
  229. int meta_pending;
  230. /* fifo list of requests in sort_list */
  231. struct list_head fifo;
  232. /* entity representing this queue in the scheduler */
  233. struct bfq_entity entity;
  234. /* pointer to the weight counter associated with this entity */
  235. struct bfq_weight_counter *weight_counter;
  236. /* maximum budget allowed from the feedback mechanism */
  237. int max_budget;
  238. /* budget expiration (in jiffies) */
  239. unsigned long budget_timeout;
  240. /* number of requests on the dispatch list or inside driver */
  241. int dispatched;
  242. /* status flags */
  243. unsigned long flags;
  244. /* node for active/idle bfqq list inside parent bfqd */
  245. struct list_head bfqq_list;
  246. /* associated @bfq_ttime struct */
  247. struct bfq_ttime ttime;
  248. /* bit vector: a 1 for each seeky requests in history */
  249. u32 seek_history;
  250. /* node for the device's burst list */
  251. struct hlist_node burst_list_node;
  252. /* position of the last request enqueued */
  253. sector_t last_request_pos;
  254. /* Number of consecutive pairs of request completion and
  255. * arrival, such that the queue becomes idle after the
  256. * completion, but the next request arrives within an idle
  257. * time slice; used only if the queue's IO_bound flag has been
  258. * cleared.
  259. */
  260. unsigned int requests_within_timer;
  261. /* pid of the process owning the queue, used for logging purposes */
  262. pid_t pid;
  263. /*
  264. * Pointer to the bfq_io_cq owning the bfq_queue, set to %NULL
  265. * if the queue is shared.
  266. */
  267. struct bfq_io_cq *bic;
  268. /* current maximum weight-raising time for this queue */
  269. unsigned long wr_cur_max_time;
  270. /*
  271. * Minimum time instant such that, only if a new request is
  272. * enqueued after this time instant in an idle @bfq_queue with
  273. * no outstanding requests, then the task associated with the
  274. * queue it is deemed as soft real-time (see the comments on
  275. * the function bfq_bfqq_softrt_next_start())
  276. */
  277. unsigned long soft_rt_next_start;
  278. /*
  279. * Start time of the current weight-raising period if
  280. * the @bfq-queue is being weight-raised, otherwise
  281. * finish time of the last weight-raising period.
  282. */
  283. unsigned long last_wr_start_finish;
  284. /* factor by which the weight of this queue is multiplied */
  285. unsigned int wr_coeff;
  286. /*
  287. * Time of the last transition of the @bfq_queue from idle to
  288. * backlogged.
  289. */
  290. unsigned long last_idle_bklogged;
  291. /*
  292. * Cumulative service received from the @bfq_queue since the
  293. * last transition from idle to backlogged.
  294. */
  295. unsigned long service_from_backlogged;
  296. /*
  297. * Cumulative service received from the @bfq_queue since its
  298. * last transition to weight-raised state.
  299. */
  300. unsigned long service_from_wr;
  301. /*
  302. * Value of wr start time when switching to soft rt
  303. */
  304. unsigned long wr_start_at_switch_to_srt;
  305. unsigned long split_time; /* time of last split */
  306. unsigned long first_IO_time; /* time of first I/O for this queue */
  307. /* max service rate measured so far */
  308. u32 max_service_rate;
  309. /*
  310. * Ratio between the service received by bfqq while it is in
  311. * service, and the cumulative service (of requests of other
  312. * queues) that may be injected while bfqq is empty but still
  313. * in service. To increase precision, the coefficient is
  314. * measured in tenths of unit. Here are some example of (1)
  315. * ratios, (2) resulting percentages of service injected
  316. * w.r.t. to the total service dispatched while bfqq is in
  317. * service, and (3) corresponding values of the coefficient:
  318. * 1 (50%) -> 10
  319. * 2 (33%) -> 20
  320. * 10 (9%) -> 100
  321. * 9.9 (9%) -> 99
  322. * 1.5 (40%) -> 15
  323. * 0.5 (66%) -> 5
  324. * 0.1 (90%) -> 1
  325. *
  326. * So, if the coefficient is lower than 10, then
  327. * injected service is more than bfqq service.
  328. */
  329. unsigned int inject_coeff;
  330. /* amount of service injected in current service slot */
  331. unsigned int injected_service;
  332. };
  333. /**
  334. * struct bfq_io_cq - per (request_queue, io_context) structure.
  335. */
  336. struct bfq_io_cq {
  337. /* associated io_cq structure */
  338. struct io_cq icq; /* must be the first member */
  339. /* array of two process queues, the sync and the async */
  340. struct bfq_queue *bfqq[2];
  341. /* per (request_queue, blkcg) ioprio */
  342. int ioprio;
  343. #ifdef CONFIG_BFQ_GROUP_IOSCHED
  344. uint64_t blkcg_serial_nr; /* the current blkcg serial */
  345. #endif
  346. /*
  347. * Snapshot of the has_short_time flag before merging; taken
  348. * to remember its value while the queue is merged, so as to
  349. * be able to restore it in case of split.
  350. */
  351. bool saved_has_short_ttime;
  352. /*
  353. * Same purpose as the previous two fields for the I/O bound
  354. * classification of a queue.
  355. */
  356. bool saved_IO_bound;
  357. /*
  358. * Same purpose as the previous fields for the value of the
  359. * field keeping the queue's belonging to a large burst
  360. */
  361. bool saved_in_large_burst;
  362. /*
  363. * True if the queue belonged to a burst list before its merge
  364. * with another cooperating queue.
  365. */
  366. bool was_in_burst_list;
  367. /*
  368. * Similar to previous fields: save wr information.
  369. */
  370. unsigned long saved_wr_coeff;
  371. unsigned long saved_last_wr_start_finish;
  372. unsigned long saved_wr_start_at_switch_to_srt;
  373. unsigned int saved_wr_cur_max_time;
  374. struct bfq_ttime saved_ttime;
  375. };
  376. /**
  377. * struct bfq_data - per-device data structure.
  378. *
  379. * All the fields are protected by @lock.
  380. */
  381. struct bfq_data {
  382. /* device request queue */
  383. struct request_queue *queue;
  384. /* dispatch queue */
  385. struct list_head dispatch;
  386. /* root bfq_group for the device */
  387. struct bfq_group *root_group;
  388. /*
  389. * rbtree of weight counters of @bfq_queues, sorted by
  390. * weight. Used to keep track of whether all @bfq_queues have
  391. * the same weight. The tree contains one counter for each
  392. * distinct weight associated to some active and not
  393. * weight-raised @bfq_queue (see the comments to the functions
  394. * bfq_weights_tree_[add|remove] for further details).
  395. */
  396. struct rb_root queue_weights_tree;
  397. /*
  398. * Number of groups with at least one descendant process that
  399. * has at least one request waiting for completion. Note that
  400. * this accounts for also requests already dispatched, but not
  401. * yet completed. Therefore this number of groups may differ
  402. * (be larger) than the number of active groups, as a group is
  403. * considered active only if its corresponding entity has
  404. * descendant queues with at least one request queued. This
  405. * number is used to decide whether a scenario is symmetric.
  406. * For a detailed explanation see comments on the computation
  407. * of the variable asymmetric_scenario in the function
  408. * bfq_better_to_idle().
  409. *
  410. * However, it is hard to compute this number exactly, for
  411. * groups with multiple descendant processes. Consider a group
  412. * that is inactive, i.e., that has no descendant process with
  413. * pending I/O inside BFQ queues. Then suppose that
  414. * num_groups_with_pending_reqs is still accounting for this
  415. * group, because the group has descendant processes with some
  416. * I/O request still in flight. num_groups_with_pending_reqs
  417. * should be decremented when the in-flight request of the
  418. * last descendant process is finally completed (assuming that
  419. * nothing else has changed for the group in the meantime, in
  420. * terms of composition of the group and active/inactive state of child
  421. * groups and processes). To accomplish this, an additional
  422. * pending-request counter must be added to entities, and must
  423. * be updated correctly. To avoid this additional field and operations,
  424. * we resort to the following tradeoff between simplicity and
  425. * accuracy: for an inactive group that is still counted in
  426. * num_groups_with_pending_reqs, we decrement
  427. * num_groups_with_pending_reqs when the first descendant
  428. * process of the group remains with no request waiting for
  429. * completion.
  430. *
  431. * Even this simpler decrement strategy requires a little
  432. * carefulness: to avoid multiple decrements, we flag a group,
  433. * more precisely an entity representing a group, as still
  434. * counted in num_groups_with_pending_reqs when it becomes
  435. * inactive. Then, when the first descendant queue of the
  436. * entity remains with no request waiting for completion,
  437. * num_groups_with_pending_reqs is decremented, and this flag
  438. * is reset. After this flag is reset for the entity,
  439. * num_groups_with_pending_reqs won't be decremented any
  440. * longer in case a new descendant queue of the entity remains
  441. * with no request waiting for completion.
  442. */
  443. unsigned int num_groups_with_pending_reqs;
  444. /*
  445. * Number of bfq_queues containing requests (including the
  446. * queue in service, even if it is idling).
  447. */
  448. int busy_queues;
  449. /* number of weight-raised busy @bfq_queues */
  450. int wr_busy_queues;
  451. /* number of queued requests */
  452. int queued;
  453. /* number of requests dispatched and waiting for completion */
  454. int rq_in_driver;
  455. /*
  456. * Maximum number of requests in driver in the last
  457. * @hw_tag_samples completed requests.
  458. */
  459. int max_rq_in_driver;
  460. /* number of samples used to calculate hw_tag */
  461. int hw_tag_samples;
  462. /* flag set to one if the driver is showing a queueing behavior */
  463. int hw_tag;
  464. /* number of budgets assigned */
  465. int budgets_assigned;
  466. /*
  467. * Timer set when idling (waiting) for the next request from
  468. * the queue in service.
  469. */
  470. struct hrtimer idle_slice_timer;
  471. /* bfq_queue in service */
  472. struct bfq_queue *in_service_queue;
  473. /* on-disk position of the last served request */
  474. sector_t last_position;
  475. /* time of last request completion (ns) */
  476. u64 last_completion;
  477. /* time of first rq dispatch in current observation interval (ns) */
  478. u64 first_dispatch;
  479. /* time of last rq dispatch in current observation interval (ns) */
  480. u64 last_dispatch;
  481. /* beginning of the last budget */
  482. ktime_t last_budget_start;
  483. /* beginning of the last idle slice */
  484. ktime_t last_idling_start;
  485. /* number of samples in current observation interval */
  486. int peak_rate_samples;
  487. /* num of samples of seq dispatches in current observation interval */
  488. u32 sequential_samples;
  489. /* total num of sectors transferred in current observation interval */
  490. u64 tot_sectors_dispatched;
  491. /* max rq size seen during current observation interval (sectors) */
  492. u32 last_rq_max_size;
  493. /* time elapsed from first dispatch in current observ. interval (us) */
  494. u64 delta_from_first;
  495. /*
  496. * Current estimate of the device peak rate, measured in
  497. * [(sectors/usec) / 2^BFQ_RATE_SHIFT]. The left-shift by
  498. * BFQ_RATE_SHIFT is performed to increase precision in
  499. * fixed-point calculations.
  500. */
  501. u32 peak_rate;
  502. /* maximum budget allotted to a bfq_queue before rescheduling */
  503. int bfq_max_budget;
  504. /* list of all the bfq_queues active on the device */
  505. struct list_head active_list;
  506. /* list of all the bfq_queues idle on the device */
  507. struct list_head idle_list;
  508. /*
  509. * Timeout for async/sync requests; when it fires, requests
  510. * are served in fifo order.
  511. */
  512. u64 bfq_fifo_expire[2];
  513. /* weight of backward seeks wrt forward ones */
  514. unsigned int bfq_back_penalty;
  515. /* maximum allowed backward seek */
  516. unsigned int bfq_back_max;
  517. /* maximum idling time */
  518. u32 bfq_slice_idle;
  519. /* user-configured max budget value (0 for auto-tuning) */
  520. int bfq_user_max_budget;
  521. /*
  522. * Timeout for bfq_queues to consume their budget; used to
  523. * prevent seeky queues from imposing long latencies to
  524. * sequential or quasi-sequential ones (this also implies that
  525. * seeky queues cannot receive guarantees in the service
  526. * domain; after a timeout they are charged for the time they
  527. * have been in service, to preserve fairness among them, but
  528. * without service-domain guarantees).
  529. */
  530. unsigned int bfq_timeout;
  531. /*
  532. * Number of consecutive requests that must be issued within
  533. * the idle time slice to set again idling to a queue which
  534. * was marked as non-I/O-bound (see the definition of the
  535. * IO_bound flag for further details).
  536. */
  537. unsigned int bfq_requests_within_timer;
  538. /*
  539. * Force device idling whenever needed to provide accurate
  540. * service guarantees, without caring about throughput
  541. * issues. CAVEAT: this may even increase latencies, in case
  542. * of useless idling for processes that did stop doing I/O.
  543. */
  544. bool strict_guarantees;
  545. /*
  546. * Last time at which a queue entered the current burst of
  547. * queues being activated shortly after each other; for more
  548. * details about this and the following parameters related to
  549. * a burst of activations, see the comments on the function
  550. * bfq_handle_burst.
  551. */
  552. unsigned long last_ins_in_burst;
  553. /*
  554. * Reference time interval used to decide whether a queue has
  555. * been activated shortly after @last_ins_in_burst.
  556. */
  557. unsigned long bfq_burst_interval;
  558. /* number of queues in the current burst of queue activations */
  559. int burst_size;
  560. /* common parent entity for the queues in the burst */
  561. struct bfq_entity *burst_parent_entity;
  562. /* Maximum burst size above which the current queue-activation
  563. * burst is deemed as 'large'.
  564. */
  565. unsigned long bfq_large_burst_thresh;
  566. /* true if a large queue-activation burst is in progress */
  567. bool large_burst;
  568. /*
  569. * Head of the burst list (as for the above fields, more
  570. * details in the comments on the function bfq_handle_burst).
  571. */
  572. struct hlist_head burst_list;
  573. /* if set to true, low-latency heuristics are enabled */
  574. bool low_latency;
  575. /*
  576. * Maximum factor by which the weight of a weight-raised queue
  577. * is multiplied.
  578. */
  579. unsigned int bfq_wr_coeff;
  580. /* maximum duration of a weight-raising period (jiffies) */
  581. unsigned int bfq_wr_max_time;
  582. /* Maximum weight-raising duration for soft real-time processes */
  583. unsigned int bfq_wr_rt_max_time;
  584. /*
  585. * Minimum idle period after which weight-raising may be
  586. * reactivated for a queue (in jiffies).
  587. */
  588. unsigned int bfq_wr_min_idle_time;
  589. /*
  590. * Minimum period between request arrivals after which
  591. * weight-raising may be reactivated for an already busy async
  592. * queue (in jiffies).
  593. */
  594. unsigned long bfq_wr_min_inter_arr_async;
  595. /* Max service-rate for a soft real-time queue, in sectors/sec */
  596. unsigned int bfq_wr_max_softrt_rate;
  597. /*
  598. * Cached value of the product ref_rate*ref_wr_duration, used
  599. * for computing the maximum duration of weight raising
  600. * automatically.
  601. */
  602. u64 rate_dur_prod;
  603. /* fallback dummy bfqq for extreme OOM conditions */
  604. struct bfq_queue oom_bfqq;
  605. spinlock_t lock;
  606. /*
  607. * bic associated with the task issuing current bio for
  608. * merging. This and the next field are used as a support to
  609. * be able to perform the bic lookup, needed by bio-merge
  610. * functions, before the scheduler lock is taken, and thus
  611. * avoid taking the request-queue lock while the scheduler
  612. * lock is being held.
  613. */
  614. struct bfq_io_cq *bio_bic;
  615. /* bfqq associated with the task issuing current bio for merging */
  616. struct bfq_queue *bio_bfqq;
  617. /*
  618. * Depth limits used in bfq_limit_depth (see comments on the
  619. * function)
  620. */
  621. unsigned int word_depths[2][2];
  622. };
  623. enum bfqq_state_flags {
  624. BFQQF_just_created = 0, /* queue just allocated */
  625. BFQQF_busy, /* has requests or is in service */
  626. BFQQF_wait_request, /* waiting for a request */
  627. BFQQF_non_blocking_wait_rq, /*
  628. * waiting for a request
  629. * without idling the device
  630. */
  631. BFQQF_fifo_expire, /* FIFO checked in this slice */
  632. BFQQF_has_short_ttime, /* queue has a short think time */
  633. BFQQF_sync, /* synchronous queue */
  634. BFQQF_IO_bound, /*
  635. * bfqq has timed-out at least once
  636. * having consumed at most 2/10 of
  637. * its budget
  638. */
  639. BFQQF_in_large_burst, /*
  640. * bfqq activated in a large burst,
  641. * see comments to bfq_handle_burst.
  642. */
  643. BFQQF_softrt_update, /*
  644. * may need softrt-next-start
  645. * update
  646. */
  647. BFQQF_coop, /* bfqq is shared */
  648. BFQQF_split_coop /* shared bfqq will be split */
  649. };
  650. #define BFQ_BFQQ_FNS(name) \
  651. void bfq_mark_bfqq_##name(struct bfq_queue *bfqq); \
  652. void bfq_clear_bfqq_##name(struct bfq_queue *bfqq); \
  653. int bfq_bfqq_##name(const struct bfq_queue *bfqq);
  654. BFQ_BFQQ_FNS(just_created);
  655. BFQ_BFQQ_FNS(busy);
  656. BFQ_BFQQ_FNS(wait_request);
  657. BFQ_BFQQ_FNS(non_blocking_wait_rq);
  658. BFQ_BFQQ_FNS(fifo_expire);
  659. BFQ_BFQQ_FNS(has_short_ttime);
  660. BFQ_BFQQ_FNS(sync);
  661. BFQ_BFQQ_FNS(IO_bound);
  662. BFQ_BFQQ_FNS(in_large_burst);
  663. BFQ_BFQQ_FNS(coop);
  664. BFQ_BFQQ_FNS(split_coop);
  665. BFQ_BFQQ_FNS(softrt_update);
  666. #undef BFQ_BFQQ_FNS
  667. /* Expiration reasons. */
  668. enum bfqq_expiration {
  669. BFQQE_TOO_IDLE = 0, /*
  670. * queue has been idling for
  671. * too long
  672. */
  673. BFQQE_BUDGET_TIMEOUT, /* budget took too long to be used */
  674. BFQQE_BUDGET_EXHAUSTED, /* budget consumed */
  675. BFQQE_NO_MORE_REQUESTS, /* the queue has no more requests */
  676. BFQQE_PREEMPTED /* preemption in progress */
  677. };
  678. struct bfqg_stats {
  679. #if defined(CONFIG_BFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP)
  680. /* number of ios merged */
  681. struct blkg_rwstat merged;
  682. /* total time spent on device in ns, may not be accurate w/ queueing */
  683. struct blkg_rwstat service_time;
  684. /* total time spent waiting in scheduler queue in ns */
  685. struct blkg_rwstat wait_time;
  686. /* number of IOs queued up */
  687. struct blkg_rwstat queued;
  688. /* total disk time and nr sectors dispatched by this group */
  689. struct blkg_stat time;
  690. /* sum of number of ios queued across all samples */
  691. struct blkg_stat avg_queue_size_sum;
  692. /* count of samples taken for average */
  693. struct blkg_stat avg_queue_size_samples;
  694. /* how many times this group has been removed from service tree */
  695. struct blkg_stat dequeue;
  696. /* total time spent waiting for it to be assigned a timeslice. */
  697. struct blkg_stat group_wait_time;
  698. /* time spent idling for this blkcg_gq */
  699. struct blkg_stat idle_time;
  700. /* total time with empty current active q with other requests queued */
  701. struct blkg_stat empty_time;
  702. /* fields after this shouldn't be cleared on stat reset */
  703. u64 start_group_wait_time;
  704. u64 start_idle_time;
  705. u64 start_empty_time;
  706. uint16_t flags;
  707. #endif /* CONFIG_BFQ_GROUP_IOSCHED && CONFIG_DEBUG_BLK_CGROUP */
  708. };
  709. #ifdef CONFIG_BFQ_GROUP_IOSCHED
  710. /*
  711. * struct bfq_group_data - per-blkcg storage for the blkio subsystem.
  712. *
  713. * @ps: @blkcg_policy_storage that this structure inherits
  714. * @weight: weight of the bfq_group
  715. */
  716. struct bfq_group_data {
  717. /* must be the first member */
  718. struct blkcg_policy_data pd;
  719. unsigned int weight;
  720. };
  721. /**
  722. * struct bfq_group - per (device, cgroup) data structure.
  723. * @entity: schedulable entity to insert into the parent group sched_data.
  724. * @sched_data: own sched_data, to contain child entities (they may be
  725. * both bfq_queues and bfq_groups).
  726. * @bfqd: the bfq_data for the device this group acts upon.
  727. * @async_bfqq: array of async queues for all the tasks belonging to
  728. * the group, one queue per ioprio value per ioprio_class,
  729. * except for the idle class that has only one queue.
  730. * @async_idle_bfqq: async queue for the idle class (ioprio is ignored).
  731. * @my_entity: pointer to @entity, %NULL for the toplevel group; used
  732. * to avoid too many special cases during group creation/
  733. * migration.
  734. * @stats: stats for this bfqg.
  735. * @active_entities: number of active entities belonging to the group;
  736. * unused for the root group. Used to know whether there
  737. * are groups with more than one active @bfq_entity
  738. * (see the comments to the function
  739. * bfq_bfqq_may_idle()).
  740. * @rq_pos_tree: rbtree sorted by next_request position, used when
  741. * determining if two or more queues have interleaving
  742. * requests (see bfq_find_close_cooperator()).
  743. *
  744. * Each (device, cgroup) pair has its own bfq_group, i.e., for each cgroup
  745. * there is a set of bfq_groups, each one collecting the lower-level
  746. * entities belonging to the group that are acting on the same device.
  747. *
  748. * Locking works as follows:
  749. * o @bfqd is protected by the queue lock, RCU is used to access it
  750. * from the readers.
  751. * o All the other fields are protected by the @bfqd queue lock.
  752. */
  753. struct bfq_group {
  754. /* must be the first member */
  755. struct blkg_policy_data pd;
  756. /* cached path for this blkg (see comments in bfq_bic_update_cgroup) */
  757. char blkg_path[128];
  758. /* reference counter (see comments in bfq_bic_update_cgroup) */
  759. int ref;
  760. struct bfq_entity entity;
  761. struct bfq_sched_data sched_data;
  762. void *bfqd;
  763. struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR];
  764. struct bfq_queue *async_idle_bfqq;
  765. struct bfq_entity *my_entity;
  766. int active_entities;
  767. struct rb_root rq_pos_tree;
  768. struct bfqg_stats stats;
  769. };
  770. #else
  771. struct bfq_group {
  772. struct bfq_sched_data sched_data;
  773. struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR];
  774. struct bfq_queue *async_idle_bfqq;
  775. struct rb_root rq_pos_tree;
  776. };
  777. #endif
  778. struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity);
  779. /* --------------- main algorithm interface ----------------- */
  780. #define BFQ_SERVICE_TREE_INIT ((struct bfq_service_tree) \
  781. { RB_ROOT, RB_ROOT, NULL, NULL, 0, 0 })
  782. extern const int bfq_timeout;
  783. struct bfq_queue *bic_to_bfqq(struct bfq_io_cq *bic, bool is_sync);
  784. void bic_set_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq, bool is_sync);
  785. struct bfq_data *bic_to_bfqd(struct bfq_io_cq *bic);
  786. void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq);
  787. void bfq_weights_tree_add(struct bfq_data *bfqd, struct bfq_queue *bfqq,
  788. struct rb_root *root);
  789. void __bfq_weights_tree_remove(struct bfq_data *bfqd,
  790. struct bfq_queue *bfqq,
  791. struct rb_root *root);
  792. void bfq_weights_tree_remove(struct bfq_data *bfqd,
  793. struct bfq_queue *bfqq);
  794. void bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq,
  795. bool compensate, enum bfqq_expiration reason);
  796. void bfq_put_queue(struct bfq_queue *bfqq);
  797. void bfq_end_wr_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg);
  798. void bfq_schedule_dispatch(struct bfq_data *bfqd);
  799. void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg);
  800. /* ------------ end of main algorithm interface -------------- */
  801. /* ---------------- cgroups-support interface ---------------- */
  802. void bfqg_stats_update_io_add(struct bfq_group *bfqg, struct bfq_queue *bfqq,
  803. unsigned int op);
  804. void bfqg_stats_update_io_remove(struct bfq_group *bfqg, unsigned int op);
  805. void bfqg_stats_update_io_merged(struct bfq_group *bfqg, unsigned int op);
  806. void bfqg_stats_update_completion(struct bfq_group *bfqg, u64 start_time_ns,
  807. u64 io_start_time_ns, unsigned int op);
  808. void bfqg_stats_update_dequeue(struct bfq_group *bfqg);
  809. void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg);
  810. void bfqg_stats_update_idle_time(struct bfq_group *bfqg);
  811. void bfqg_stats_set_start_idle_time(struct bfq_group *bfqg);
  812. void bfqg_stats_update_avg_queue_size(struct bfq_group *bfqg);
  813. void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq,
  814. struct bfq_group *bfqg);
  815. void bfq_init_entity(struct bfq_entity *entity, struct bfq_group *bfqg);
  816. void bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio);
  817. void bfq_end_wr_async(struct bfq_data *bfqd);
  818. struct bfq_group *bfq_find_set_group(struct bfq_data *bfqd,
  819. struct blkcg *blkcg);
  820. struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg);
  821. struct bfq_group *bfqq_group(struct bfq_queue *bfqq);
  822. struct bfq_group *bfq_create_group_hierarchy(struct bfq_data *bfqd, int node);
  823. void bfqg_and_blkg_put(struct bfq_group *bfqg);
  824. #ifdef CONFIG_BFQ_GROUP_IOSCHED
  825. extern struct cftype bfq_blkcg_legacy_files[];
  826. extern struct cftype bfq_blkg_files[];
  827. extern struct blkcg_policy blkcg_policy_bfq;
  828. #endif
  829. /* ------------- end of cgroups-support interface ------------- */
  830. /* - interface of the internal hierarchical B-WF2Q+ scheduler - */
  831. #ifdef CONFIG_BFQ_GROUP_IOSCHED
  832. /* both next loops stop at one of the child entities of the root group */
  833. #define for_each_entity(entity) \
  834. for (; entity ; entity = entity->parent)
  835. /*
  836. * For each iteration, compute parent in advance, so as to be safe if
  837. * entity is deallocated during the iteration. Such a deallocation may
  838. * happen as a consequence of a bfq_put_queue that frees the bfq_queue
  839. * containing entity.
  840. */
  841. #define for_each_entity_safe(entity, parent) \
  842. for (; entity && ({ parent = entity->parent; 1; }); entity = parent)
  843. #else /* CONFIG_BFQ_GROUP_IOSCHED */
  844. /*
  845. * Next two macros are fake loops when cgroups support is not
  846. * enabled. I fact, in such a case, there is only one level to go up
  847. * (to reach the root group).
  848. */
  849. #define for_each_entity(entity) \
  850. for (; entity ; entity = NULL)
  851. #define for_each_entity_safe(entity, parent) \
  852. for (parent = NULL; entity ; entity = parent)
  853. #endif /* CONFIG_BFQ_GROUP_IOSCHED */
  854. struct bfq_group *bfq_bfqq_to_bfqg(struct bfq_queue *bfqq);
  855. struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity);
  856. struct bfq_service_tree *bfq_entity_service_tree(struct bfq_entity *entity);
  857. struct bfq_entity *bfq_entity_of(struct rb_node *node);
  858. unsigned short bfq_ioprio_to_weight(int ioprio);
  859. void bfq_put_idle_entity(struct bfq_service_tree *st,
  860. struct bfq_entity *entity);
  861. struct bfq_service_tree *
  862. __bfq_entity_update_weight_prio(struct bfq_service_tree *old_st,
  863. struct bfq_entity *entity,
  864. bool update_class_too);
  865. void bfq_bfqq_served(struct bfq_queue *bfqq, int served);
  866. void bfq_bfqq_charge_time(struct bfq_data *bfqd, struct bfq_queue *bfqq,
  867. unsigned long time_ms);
  868. bool __bfq_deactivate_entity(struct bfq_entity *entity,
  869. bool ins_into_idle_tree);
  870. bool next_queue_may_preempt(struct bfq_data *bfqd);
  871. struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd);
  872. void __bfq_bfqd_reset_in_service(struct bfq_data *bfqd);
  873. void bfq_deactivate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
  874. bool ins_into_idle_tree, bool expiration);
  875. void bfq_activate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq);
  876. void bfq_requeue_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
  877. bool expiration);
  878. void bfq_del_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq,
  879. bool expiration);
  880. void bfq_add_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq);
  881. /* --------------- end of interface of B-WF2Q+ ---------------- */
  882. /* Logging facilities. */
  883. #ifdef CONFIG_BFQ_GROUP_IOSCHED
  884. struct bfq_group *bfqq_group(struct bfq_queue *bfqq);
  885. #define bfq_log_bfqq(bfqd, bfqq, fmt, args...) do { \
  886. blk_add_cgroup_trace_msg((bfqd)->queue, \
  887. bfqg_to_blkg(bfqq_group(bfqq))->blkcg, \
  888. "bfq%d%c " fmt, (bfqq)->pid, \
  889. bfq_bfqq_sync((bfqq)) ? 'S' : 'A', ##args); \
  890. } while (0)
  891. #define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do { \
  892. blk_add_cgroup_trace_msg((bfqd)->queue, \
  893. bfqg_to_blkg(bfqg)->blkcg, fmt, ##args); \
  894. } while (0)
  895. #else /* CONFIG_BFQ_GROUP_IOSCHED */
  896. #define bfq_log_bfqq(bfqd, bfqq, fmt, args...) \
  897. blk_add_trace_msg((bfqd)->queue, "bfq%d%c " fmt, (bfqq)->pid, \
  898. bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \
  899. ##args)
  900. #define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do {} while (0)
  901. #endif /* CONFIG_BFQ_GROUP_IOSCHED */
  902. #define bfq_log(bfqd, fmt, args...) \
  903. blk_add_trace_msg((bfqd)->queue, "bfq " fmt, ##args)
  904. #endif /* _BFQ_H */