rt.c

  1. /*
  2. * Real-Time Scheduling Class (mapped to the SCHED_FIFO and SCHED_RR
  3. * policies)
  4. */
  5. #include "sched.h"
  6. #include <linux/slab.h>
  7. #include <linux/irq_work.h>
  8. int sched_rr_timeslice = RR_TIMESLICE;
  9. int sysctl_sched_rr_timeslice = (MSEC_PER_SEC / HZ) * RR_TIMESLICE;
  10. static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun);
  11. struct rt_bandwidth def_rt_bandwidth;
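/*
 * Bandwidth period timer: on every expiry it calls
 * do_sched_rt_period_timer() to replenish runtime and unthrottle
 * rt_rqs, and keeps re-arming itself until all serviced rt_rqs
 * report idle.
 */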
  12. static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer)
  13. {
  14. struct rt_bandwidth *rt_b =
  15. container_of(timer, struct rt_bandwidth, rt_period_timer);
  16. int idle = 0;
  17. int overrun;
  18. raw_spin_lock(&rt_b->rt_runtime_lock);
  19. for (;;) {
  20. overrun = hrtimer_forward_now(timer, rt_b->rt_period);
  21. if (!overrun)
  22. break;
  23. raw_spin_unlock(&rt_b->rt_runtime_lock);
  24. idle = do_sched_rt_period_timer(rt_b, overrun);
  25. raw_spin_lock(&rt_b->rt_runtime_lock);
  26. }
  27. if (idle)
  28. rt_b->rt_period_active = 0;
  29. raw_spin_unlock(&rt_b->rt_runtime_lock);
  30. return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
  31. }
  32. void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
  33. {
  34. rt_b->rt_period = ns_to_ktime(period);
  35. rt_b->rt_runtime = runtime;
  36. raw_spin_lock_init(&rt_b->rt_runtime_lock);
  37. hrtimer_init(&rt_b->rt_period_timer,
  38. CLOCK_MONOTONIC, HRTIMER_MODE_REL);
  39. rt_b->rt_period_timer.function = sched_rt_period_timer;
  40. }
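/*
 * Arm the bandwidth period timer, unless it is already running or
 * RT bandwidth enforcement is disabled for this rt_bandwidth.
 */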
  41. static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
  42. {
  43. if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
  44. return;
  45. raw_spin_lock(&rt_b->rt_runtime_lock);
  46. if (!rt_b->rt_period_active) {
  47. rt_b->rt_period_active = 1;
  48. /*
  49. * SCHED_DEADLINE updates the bandwidth, as a runaway
  50. * RT task alongside a DL task could hog a CPU. But DL does
  51. * not reset the period. If a deadline task was running
  52. * without an RT task running, it can cause RT tasks to
  53. * throttle when they start up. Kick the timer right away
  54. * to update the period.
  55. */
  56. hrtimer_forward_now(&rt_b->rt_period_timer, ns_to_ktime(0));
  57. hrtimer_start_expires(&rt_b->rt_period_timer, HRTIMER_MODE_ABS_PINNED);
  58. }
  59. raw_spin_unlock(&rt_b->rt_runtime_lock);
  60. }
  61. #if defined(CONFIG_SMP) && defined(HAVE_RT_PUSH_IPI)
  62. static void push_irq_work_func(struct irq_work *work);
  63. #endif
  64. void init_rt_rq(struct rt_rq *rt_rq)
  65. {
  66. struct rt_prio_array *array;
  67. int i;
  68. array = &rt_rq->active;
  69. for (i = 0; i < MAX_RT_PRIO; i++) {
  70. INIT_LIST_HEAD(array->queue + i);
  71. __clear_bit(i, array->bitmap);
  72. }
  73. /* delimiter for bitsearch: */
  74. __set_bit(MAX_RT_PRIO, array->bitmap);
  75. #if defined CONFIG_SMP
  76. rt_rq->highest_prio.curr = MAX_RT_PRIO;
  77. rt_rq->highest_prio.next = MAX_RT_PRIO;
  78. rt_rq->rt_nr_migratory = 0;
  79. rt_rq->overloaded = 0;
  80. plist_head_init(&rt_rq->pushable_tasks);
  81. #ifdef HAVE_RT_PUSH_IPI
  82. rt_rq->push_flags = 0;
  83. rt_rq->push_cpu = nr_cpu_ids;
  84. raw_spin_lock_init(&rt_rq->push_lock);
  85. init_irq_work(&rt_rq->push_work, push_irq_work_func);
  86. #endif
  87. #endif /* CONFIG_SMP */
  88. /* We start in dequeued state, because no RT tasks are queued */
  89. rt_rq->rt_queued = 0;
  90. rt_rq->rt_time = 0;
  91. rt_rq->rt_throttled = 0;
  92. rt_rq->rt_runtime = 0;
  93. raw_spin_lock_init(&rt_rq->rt_runtime_lock);
  94. }
  95. #ifdef CONFIG_RT_GROUP_SCHED
  96. static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b)
  97. {
  98. hrtimer_cancel(&rt_b->rt_period_timer);
  99. }
  100. #define rt_entity_is_task(rt_se) (!(rt_se)->my_q)
  101. static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
  102. {
  103. #ifdef CONFIG_SCHED_DEBUG
  104. WARN_ON_ONCE(!rt_entity_is_task(rt_se));
  105. #endif
  106. return container_of(rt_se, struct task_struct, rt);
  107. }
  108. static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
  109. {
  110. return rt_rq->rq;
  111. }
  112. static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
  113. {
  114. return rt_se->rt_rq;
  115. }
  116. static inline struct rq *rq_of_rt_se(struct sched_rt_entity *rt_se)
  117. {
  118. struct rt_rq *rt_rq = rt_se->rt_rq;
  119. return rt_rq->rq;
  120. }
  121. void free_rt_sched_group(struct task_group *tg)
  122. {
  123. int i;
  124. if (tg->rt_se)
  125. destroy_rt_bandwidth(&tg->rt_bandwidth);
  126. for_each_possible_cpu(i) {
  127. if (tg->rt_rq)
  128. kfree(tg->rt_rq[i]);
  129. if (tg->rt_se)
  130. kfree(tg->rt_se[i]);
  131. }
  132. kfree(tg->rt_rq);
  133. kfree(tg->rt_se);
  134. }
  135. void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
  136. struct sched_rt_entity *rt_se, int cpu,
  137. struct sched_rt_entity *parent)
  138. {
  139. struct rq *rq = cpu_rq(cpu);
  140. rt_rq->highest_prio.curr = MAX_RT_PRIO;
  141. rt_rq->rt_nr_boosted = 0;
  142. rt_rq->rq = rq;
  143. rt_rq->tg = tg;
  144. tg->rt_rq[cpu] = rt_rq;
  145. tg->rt_se[cpu] = rt_se;
  146. if (!rt_se)
  147. return;
  148. if (!parent)
  149. rt_se->rt_rq = &rq->rt;
  150. else
  151. rt_se->rt_rq = parent->my_q;
  152. rt_se->my_q = rt_rq;
  153. rt_se->parent = parent;
  154. INIT_LIST_HEAD(&rt_se->run_list);
  155. }
  156. int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
  157. {
  158. struct rt_rq *rt_rq;
  159. struct sched_rt_entity *rt_se;
  160. int i;
  161. tg->rt_rq = kzalloc(sizeof(rt_rq) * nr_cpu_ids, GFP_KERNEL);
  162. if (!tg->rt_rq)
  163. goto err;
  164. tg->rt_se = kzalloc(sizeof(rt_se) * nr_cpu_ids, GFP_KERNEL);
  165. if (!tg->rt_se)
  166. goto err;
  167. init_rt_bandwidth(&tg->rt_bandwidth,
  168. ktime_to_ns(def_rt_bandwidth.rt_period), 0);
  169. for_each_possible_cpu(i) {
  170. rt_rq = kzalloc_node(sizeof(struct rt_rq),
  171. GFP_KERNEL, cpu_to_node(i));
  172. if (!rt_rq)
  173. goto err;
  174. rt_se = kzalloc_node(sizeof(struct sched_rt_entity),
  175. GFP_KERNEL, cpu_to_node(i));
  176. if (!rt_se)
  177. goto err_free_rq;
  178. init_rt_rq(rt_rq);
  179. rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
  180. init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]);
  181. }
  182. return 1;
  183. err_free_rq:
  184. kfree(rt_rq);
  185. err:
  186. return 0;
  187. }
  188. #else /* CONFIG_RT_GROUP_SCHED */
  189. #define rt_entity_is_task(rt_se) (1)
  190. static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
  191. {
  192. return container_of(rt_se, struct task_struct, rt);
  193. }
  194. static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
  195. {
  196. return container_of(rt_rq, struct rq, rt);
  197. }
  198. static inline struct rq *rq_of_rt_se(struct sched_rt_entity *rt_se)
  199. {
  200. struct task_struct *p = rt_task_of(rt_se);
  201. return task_rq(p);
  202. }
  203. static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
  204. {
  205. struct rq *rq = rq_of_rt_se(rt_se);
  206. return &rq->rt;
  207. }
  208. void free_rt_sched_group(struct task_group *tg) { }
  209. int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
  210. {
  211. return 1;
  212. }
  213. #endif /* CONFIG_RT_GROUP_SCHED */
  214. #ifdef CONFIG_SMP
  215. static void pull_rt_task(struct rq *this_rq);
  216. static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev)
  217. {
  218. /* Try to pull RT tasks here if we lower this rq's prio */
  219. return rq->rt.highest_prio.curr > prev->prio;
  220. }
  221. static inline int rt_overloaded(struct rq *rq)
  222. {
  223. return atomic_read(&rq->rd->rto_count);
  224. }
  225. static inline void rt_set_overload(struct rq *rq)
  226. {
  227. if (!rq->online)
  228. return;
  229. cpumask_set_cpu(rq->cpu, rq->rd->rto_mask);
  230. /*
  231. * Make sure the mask is visible before we set
  232. * the overload count. That is checked to determine
  233. * if we should look at the mask. It would be a shame
  234. * if we looked at the mask, but the mask was not
  235. * updated yet.
  236. *
  237. * Matched by the barrier in pull_rt_task().
  238. */
  239. smp_wmb();
  240. atomic_inc(&rq->rd->rto_count);
  241. }
  242. static inline void rt_clear_overload(struct rq *rq)
  243. {
  244. if (!rq->online)
  245. return;
  246. /* the order here really doesn't matter */
  247. atomic_dec(&rq->rd->rto_count);
  248. cpumask_clear_cpu(rq->cpu, rq->rd->rto_mask);
  249. }
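/*
 * Track the "RT overloaded" state of this runqueue: it is overloaded
 * when it runs more than one RT task and at least one of them can
 * migrate, making it a candidate source for the push/pull logic.
 */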
  250. static void update_rt_migration(struct rt_rq *rt_rq)
  251. {
  252. if (rt_rq->rt_nr_migratory && rt_rq->rt_nr_total > 1) {
  253. if (!rt_rq->overloaded) {
  254. rt_set_overload(rq_of_rt_rq(rt_rq));
  255. rt_rq->overloaded = 1;
  256. }
  257. } else if (rt_rq->overloaded) {
  258. rt_clear_overload(rq_of_rt_rq(rt_rq));
  259. rt_rq->overloaded = 0;
  260. }
  261. }
  262. static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
  263. {
  264. struct task_struct *p;
  265. if (!rt_entity_is_task(rt_se))
  266. return;
  267. p = rt_task_of(rt_se);
  268. rt_rq = &rq_of_rt_rq(rt_rq)->rt;
  269. rt_rq->rt_nr_total++;
  270. if (p->nr_cpus_allowed > 1)
  271. rt_rq->rt_nr_migratory++;
  272. update_rt_migration(rt_rq);
  273. }
  274. static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
  275. {
  276. struct task_struct *p;
  277. if (!rt_entity_is_task(rt_se))
  278. return;
  279. p = rt_task_of(rt_se);
  280. rt_rq = &rq_of_rt_rq(rt_rq)->rt;
  281. rt_rq->rt_nr_total--;
  282. if (p->nr_cpus_allowed > 1)
  283. rt_rq->rt_nr_migratory--;
  284. update_rt_migration(rt_rq);
  285. }
  286. static inline int has_pushable_tasks(struct rq *rq)
  287. {
  288. return !plist_head_empty(&rq->rt.pushable_tasks);
  289. }
  290. static DEFINE_PER_CPU(struct callback_head, rt_push_head);
  291. static DEFINE_PER_CPU(struct callback_head, rt_pull_head);
  292. static void push_rt_tasks(struct rq *);
  293. static void pull_rt_task(struct rq *);
  294. static inline void queue_push_tasks(struct rq *rq)
  295. {
  296. if (!has_pushable_tasks(rq))
  297. return;
  298. queue_balance_callback(rq, &per_cpu(rt_push_head, rq->cpu), push_rt_tasks);
  299. }
  300. static inline void queue_pull_task(struct rq *rq)
  301. {
  302. queue_balance_callback(rq, &per_cpu(rt_pull_head, rq->cpu), pull_rt_task);
  303. }
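/*
 * The pushable_tasks plist holds the runnable RT tasks on this rq
 * that may run on more than one CPU, ordered by priority, so the
 * push logic can quickly pick the best migration candidate.
 */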
  304. static void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
  305. {
  306. plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks);
  307. plist_node_init(&p->pushable_tasks, p->prio);
  308. plist_add(&p->pushable_tasks, &rq->rt.pushable_tasks);
  309. /* Update the highest prio pushable task */
  310. if (p->prio < rq->rt.highest_prio.next)
  311. rq->rt.highest_prio.next = p->prio;
  312. }
  313. static void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
  314. {
  315. plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks);
  316. /* Update the new highest prio pushable task */
  317. if (has_pushable_tasks(rq)) {
  318. p = plist_first_entry(&rq->rt.pushable_tasks,
  319. struct task_struct, pushable_tasks);
  320. rq->rt.highest_prio.next = p->prio;
  321. } else
  322. rq->rt.highest_prio.next = MAX_RT_PRIO;
  323. }
  324. #else
  325. static inline void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
  326. {
  327. }
  328. static inline void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
  329. {
  330. }
  331. static inline
  332. void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
  333. {
  334. }
  335. static inline
  336. void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
  337. {
  338. }
  339. static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev)
  340. {
  341. return false;
  342. }
  343. static inline void pull_rt_task(struct rq *this_rq)
  344. {
  345. }
  346. static inline void queue_push_tasks(struct rq *rq)
  347. {
  348. }
  349. #endif /* CONFIG_SMP */
  350. static void enqueue_top_rt_rq(struct rt_rq *rt_rq);
  351. static void dequeue_top_rt_rq(struct rt_rq *rt_rq);
  352. static inline int on_rt_rq(struct sched_rt_entity *rt_se)
  353. {
  354. return rt_se->on_rq;
  355. }
  356. #ifdef CONFIG_RT_GROUP_SCHED
  357. static inline u64 sched_rt_runtime(struct rt_rq *rt_rq)
  358. {
  359. if (!rt_rq->tg)
  360. return RUNTIME_INF;
  361. return rt_rq->rt_runtime;
  362. }
  363. static inline u64 sched_rt_period(struct rt_rq *rt_rq)
  364. {
  365. return ktime_to_ns(rt_rq->tg->rt_bandwidth.rt_period);
  366. }
  367. typedef struct task_group *rt_rq_iter_t;
  368. static inline struct task_group *next_task_group(struct task_group *tg)
  369. {
  370. do {
  371. tg = list_entry_rcu(tg->list.next,
  372. typeof(struct task_group), list);
  373. } while (&tg->list != &task_groups && task_group_is_autogroup(tg));
  374. if (&tg->list == &task_groups)
  375. tg = NULL;
  376. return tg;
  377. }
  378. #define for_each_rt_rq(rt_rq, iter, rq) \
  379. for (iter = container_of(&task_groups, typeof(*iter), list); \
  380. (iter = next_task_group(iter)) && \
  381. (rt_rq = iter->rt_rq[cpu_of(rq)]);)
  382. #define for_each_sched_rt_entity(rt_se) \
  383. for (; rt_se; rt_se = rt_se->parent)
  384. static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
  385. {
  386. return rt_se->my_q;
  387. }
  388. static void enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags);
  389. static void dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags);
  390. static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
  391. {
  392. struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr;
  393. struct rq *rq = rq_of_rt_rq(rt_rq);
  394. struct sched_rt_entity *rt_se;
  395. int cpu = cpu_of(rq);
  396. rt_se = rt_rq->tg->rt_se[cpu];
  397. if (rt_rq->rt_nr_running) {
  398. if (!rt_se)
  399. enqueue_top_rt_rq(rt_rq);
  400. else if (!on_rt_rq(rt_se))
  401. enqueue_rt_entity(rt_se, 0);
  402. if (rt_rq->highest_prio.curr < curr->prio)
  403. resched_curr(rq);
  404. }
  405. }
  406. static void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
  407. {
  408. struct sched_rt_entity *rt_se;
  409. int cpu = cpu_of(rq_of_rt_rq(rt_rq));
  410. rt_se = rt_rq->tg->rt_se[cpu];
  411. if (!rt_se)
  412. dequeue_top_rt_rq(rt_rq);
  413. else if (on_rt_rq(rt_se))
  414. dequeue_rt_entity(rt_se, 0);
  415. }
  416. static inline int rt_rq_throttled(struct rt_rq *rt_rq)
  417. {
  418. return rt_rq->rt_throttled && !rt_rq->rt_nr_boosted;
  419. }
  420. static int rt_se_boosted(struct sched_rt_entity *rt_se)
  421. {
  422. struct rt_rq *rt_rq = group_rt_rq(rt_se);
  423. struct task_struct *p;
  424. if (rt_rq)
  425. return !!rt_rq->rt_nr_boosted;
  426. p = rt_task_of(rt_se);
  427. return p->prio != p->normal_prio;
  428. }
  429. #ifdef CONFIG_SMP
  430. static inline const struct cpumask *sched_rt_period_mask(void)
  431. {
  432. return this_rq()->rd->span;
  433. }
  434. #else
  435. static inline const struct cpumask *sched_rt_period_mask(void)
  436. {
  437. return cpu_online_mask;
  438. }
  439. #endif
  440. static inline
  441. struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu)
  442. {
  443. return container_of(rt_b, struct task_group, rt_bandwidth)->rt_rq[cpu];
  444. }
  445. static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq)
  446. {
  447. return &rt_rq->tg->rt_bandwidth;
  448. }
  449. #else /* !CONFIG_RT_GROUP_SCHED */
  450. static inline u64 sched_rt_runtime(struct rt_rq *rt_rq)
  451. {
  452. return rt_rq->rt_runtime;
  453. }
  454. static inline u64 sched_rt_period(struct rt_rq *rt_rq)
  455. {
  456. return ktime_to_ns(def_rt_bandwidth.rt_period);
  457. }
  458. typedef struct rt_rq *rt_rq_iter_t;
  459. #define for_each_rt_rq(rt_rq, iter, rq) \
  460. for ((void) iter, rt_rq = &rq->rt; rt_rq; rt_rq = NULL)
  461. #define for_each_sched_rt_entity(rt_se) \
  462. for (; rt_se; rt_se = NULL)
  463. static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
  464. {
  465. return NULL;
  466. }
  467. static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
  468. {
  469. struct rq *rq = rq_of_rt_rq(rt_rq);
  470. if (!rt_rq->rt_nr_running)
  471. return;
  472. enqueue_top_rt_rq(rt_rq);
  473. resched_curr(rq);
  474. }
  475. static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
  476. {
  477. dequeue_top_rt_rq(rt_rq);
  478. }
  479. static inline int rt_rq_throttled(struct rt_rq *rt_rq)
  480. {
  481. return rt_rq->rt_throttled;
  482. }
  483. static inline const struct cpumask *sched_rt_period_mask(void)
  484. {
  485. return cpu_online_mask;
  486. }
  487. static inline
  488. struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu)
  489. {
  490. return &cpu_rq(cpu)->rt;
  491. }
  492. static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq)
  493. {
  494. return &def_rt_bandwidth;
  495. }
  496. #endif /* CONFIG_RT_GROUP_SCHED */
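/*
 * Report whether RT runtime is still being accounted on this rt_rq:
 * either the period timer is active or the rt_rq has not yet consumed
 * its runtime budget.
 */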
  497. bool sched_rt_bandwidth_account(struct rt_rq *rt_rq)
  498. {
  499. struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
  500. return (hrtimer_active(&rt_b->rt_period_timer) ||
  501. rt_rq->rt_time < rt_b->rt_runtime);
  502. }
  503. #ifdef CONFIG_SMP
  504. /*
  505. * We ran out of runtime, see if we can borrow some from our neighbours.
  506. */
  507. static void do_balance_runtime(struct rt_rq *rt_rq)
  508. {
  509. struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
  510. struct root_domain *rd = rq_of_rt_rq(rt_rq)->rd;
  511. int i, weight;
  512. u64 rt_period;
  513. weight = cpumask_weight(rd->span);
  514. raw_spin_lock(&rt_b->rt_runtime_lock);
  515. rt_period = ktime_to_ns(rt_b->rt_period);
  516. for_each_cpu(i, rd->span) {
  517. struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);
  518. s64 diff;
  519. if (iter == rt_rq)
  520. continue;
  521. raw_spin_lock(&iter->rt_runtime_lock);
  522. /*
  523. * Either all rqs have inf runtime and there's nothing to steal
  524. * or __disable_runtime() below sets a specific rq to inf to
  525. * indicate it's been disabled and disallow stealing.
  526. */
  527. if (iter->rt_runtime == RUNTIME_INF)
  528. goto next;
  529. /*
  530. * From runqueues with spare time, take 1/n part of their
  531. * spare time, but no more than our period.
  532. */
  533. diff = iter->rt_runtime - iter->rt_time;
  534. if (diff > 0) {
  535. diff = div_u64((u64)diff, weight);
  536. if (rt_rq->rt_runtime + diff > rt_period)
  537. diff = rt_period - rt_rq->rt_runtime;
  538. iter->rt_runtime -= diff;
  539. rt_rq->rt_runtime += diff;
  540. if (rt_rq->rt_runtime == rt_period) {
  541. raw_spin_unlock(&iter->rt_runtime_lock);
  542. break;
  543. }
  544. }
  545. next:
  546. raw_spin_unlock(&iter->rt_runtime_lock);
  547. }
  548. raw_spin_unlock(&rt_b->rt_runtime_lock);
  549. }
  550. /*
  551. * Ensure this RQ takes back all the runtime it lent to its neighbours.
  552. */
  553. static void __disable_runtime(struct rq *rq)
  554. {
  555. struct root_domain *rd = rq->rd;
  556. rt_rq_iter_t iter;
  557. struct rt_rq *rt_rq;
  558. if (unlikely(!scheduler_running))
  559. return;
  560. for_each_rt_rq(rt_rq, iter, rq) {
  561. struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
  562. s64 want;
  563. int i;
  564. raw_spin_lock(&rt_b->rt_runtime_lock);
  565. raw_spin_lock(&rt_rq->rt_runtime_lock);
  566. /*
  567. * Either we're all inf and nobody needs to borrow, or we're
  568. * already disabled and thus have nothing to do, or we have
  569. * exactly the right amount of runtime to take out.
  570. */
  571. if (rt_rq->rt_runtime == RUNTIME_INF ||
  572. rt_rq->rt_runtime == rt_b->rt_runtime)
  573. goto balanced;
  574. raw_spin_unlock(&rt_rq->rt_runtime_lock);
  575. /*
  576. * Calculate the difference between what we started out with
  577. * and what we currently have; that's the amount of runtime
  578. * we lent out and now have to reclaim.
  579. */
  580. want = rt_b->rt_runtime - rt_rq->rt_runtime;
  581. /*
  582. * Greedy reclaim, take back as much as we can.
  583. */
  584. for_each_cpu(i, rd->span) {
  585. struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);
  586. s64 diff;
  587. /*
  588. * Can't reclaim from ourselves or disabled runqueues.
  589. */
  590. if (iter == rt_rq || iter->rt_runtime == RUNTIME_INF)
  591. continue;
  592. raw_spin_lock(&iter->rt_runtime_lock);
  593. if (want > 0) {
  594. diff = min_t(s64, iter->rt_runtime, want);
  595. iter->rt_runtime -= diff;
  596. want -= diff;
  597. } else {
  598. iter->rt_runtime -= want;
  599. want -= want;
  600. }
  601. raw_spin_unlock(&iter->rt_runtime_lock);
  602. if (!want)
  603. break;
  604. }
  605. raw_spin_lock(&rt_rq->rt_runtime_lock);
  606. /*
  607. * We cannot be left wanting - that would mean some runtime
  608. * leaked out of the system.
  609. */
  610. BUG_ON(want);
  611. balanced:
  612. /*
  613. * Disable all the borrow logic by pretending we have inf
  614. * runtime - in which case borrowing doesn't make sense.
  615. */
  616. rt_rq->rt_runtime = RUNTIME_INF;
  617. rt_rq->rt_throttled = 0;
  618. raw_spin_unlock(&rt_rq->rt_runtime_lock);
  619. raw_spin_unlock(&rt_b->rt_runtime_lock);
  620. /* Make rt_rq available for pick_next_task() */
  621. sched_rt_rq_enqueue(rt_rq);
  622. }
  623. }
  624. static void __enable_runtime(struct rq *rq)
  625. {
  626. rt_rq_iter_t iter;
  627. struct rt_rq *rt_rq;
  628. if (unlikely(!scheduler_running))
  629. return;
  630. /*
  631. * Reset each runqueue's bandwidth settings
  632. */
  633. for_each_rt_rq(rt_rq, iter, rq) {
  634. struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
  635. raw_spin_lock(&rt_b->rt_runtime_lock);
  636. raw_spin_lock(&rt_rq->rt_runtime_lock);
  637. rt_rq->rt_runtime = rt_b->rt_runtime;
  638. rt_rq->rt_time = 0;
  639. rt_rq->rt_throttled = 0;
  640. raw_spin_unlock(&rt_rq->rt_runtime_lock);
  641. raw_spin_unlock(&rt_b->rt_runtime_lock);
  642. }
  643. }
  644. static void balance_runtime(struct rt_rq *rt_rq)
  645. {
  646. if (!sched_feat(RT_RUNTIME_SHARE))
  647. return;
  648. if (rt_rq->rt_time > rt_rq->rt_runtime) {
  649. raw_spin_unlock(&rt_rq->rt_runtime_lock);
  650. do_balance_runtime(rt_rq);
  651. raw_spin_lock(&rt_rq->rt_runtime_lock);
  652. }
  653. }
  654. #else /* !CONFIG_SMP */
  655. static inline void balance_runtime(struct rt_rq *rt_rq) {}
  656. #endif /* CONFIG_SMP */
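/*
 * Period timer work: for each CPU in the period mask, replenish the
 * rt_rq's runtime (reducing rt_time by overrun * runtime), unthrottle
 * and re-enqueue it once it is back under its budget, returning
 * nonzero once everything is idle and the timer can stop.
 */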
  657. static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
  658. {
  659. int i, idle = 1, throttled = 0;
  660. const struct cpumask *span;
  661. span = sched_rt_period_mask();
  662. #ifdef CONFIG_RT_GROUP_SCHED
  663. /*
  664. * FIXME: isolated CPUs should really leave the root task group,
  665. * whether they are isolcpus or were isolated via cpusets, lest
  666. * the timer run on a CPU which does not service all runqueues,
  667. * potentially leaving other CPUs indefinitely throttled. If
  668. * isolation is really required, the user will turn the throttle
  669. * off to kill the perturbations it causes anyway. Meanwhile,
  670. * this maintains functionality for boot and/or troubleshooting.
  671. */
  672. if (rt_b == &root_task_group.rt_bandwidth)
  673. span = cpu_online_mask;
  674. #endif
  675. for_each_cpu(i, span) {
  676. int enqueue = 0;
  677. struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i);
  678. struct rq *rq = rq_of_rt_rq(rt_rq);
  679. int skip;
  680. /*
  681. * When span == cpu_online_mask, taking each rq->lock
  682. * can be time-consuming. Try to avoid it when possible.
  683. */
  684. raw_spin_lock(&rt_rq->rt_runtime_lock);
  685. skip = !rt_rq->rt_time && !rt_rq->rt_nr_running;
  686. raw_spin_unlock(&rt_rq->rt_runtime_lock);
  687. if (skip)
  688. continue;
  689. raw_spin_lock(&rq->lock);
  690. if (rt_rq->rt_time) {
  691. u64 runtime;
  692. raw_spin_lock(&rt_rq->rt_runtime_lock);
  693. if (rt_rq->rt_throttled)
  694. balance_runtime(rt_rq);
  695. runtime = rt_rq->rt_runtime;
  696. rt_rq->rt_time -= min(rt_rq->rt_time, overrun*runtime);
  697. if (rt_rq->rt_throttled && rt_rq->rt_time < runtime) {
  698. rt_rq->rt_throttled = 0;
  699. enqueue = 1;
  700. /*
  701. * When we're idle and a woken (rt) task is
  702. * throttled, check_preempt_curr() will set
  703. * skip_update and the time between the wakeup
  704. * and this unthrottle will get accounted as
  705. * 'runtime'.
  706. */
  707. if (rt_rq->rt_nr_running && rq->curr == rq->idle)
  708. rq_clock_skip_update(rq, false);
  709. }
  710. if (rt_rq->rt_time || rt_rq->rt_nr_running)
  711. idle = 0;
  712. raw_spin_unlock(&rt_rq->rt_runtime_lock);
  713. } else if (rt_rq->rt_nr_running) {
  714. idle = 0;
  715. if (!rt_rq_throttled(rt_rq))
  716. enqueue = 1;
  717. }
  718. if (rt_rq->rt_throttled)
  719. throttled = 1;
  720. if (enqueue)
  721. sched_rt_rq_enqueue(rt_rq);
  722. raw_spin_unlock(&rq->lock);
  723. }
  724. if (!throttled && (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF))
  725. return 1;
  726. return idle;
  727. }
  728. static inline int rt_se_prio(struct sched_rt_entity *rt_se)
  729. {
  730. #ifdef CONFIG_RT_GROUP_SCHED
  731. struct rt_rq *rt_rq = group_rt_rq(rt_se);
  732. if (rt_rq)
  733. return rt_rq->highest_prio.curr;
  734. #endif
  735. return rt_task_of(rt_se)->prio;
  736. }
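/*
 * Check whether this rt_rq has exhausted its runtime budget for the
 * current period: try to borrow runtime from other CPUs first, and if
 * the budget is still exceeded, throttle and dequeue the rt_rq.
 */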
  737. static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq)
  738. {
  739. u64 runtime = sched_rt_runtime(rt_rq);
  740. if (rt_rq->rt_throttled)
  741. return rt_rq_throttled(rt_rq);
  742. if (runtime >= sched_rt_period(rt_rq))
  743. return 0;
  744. balance_runtime(rt_rq);
  745. runtime = sched_rt_runtime(rt_rq);
  746. if (runtime == RUNTIME_INF)
  747. return 0;
  748. if (rt_rq->rt_time > runtime) {
  749. struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
  750. /*
  751. * Don't actually throttle groups that have no runtime assigned
  752. * but accrue some time due to boosting.
  753. */
  754. if (likely(rt_b->rt_runtime)) {
  755. rt_rq->rt_throttled = 1;
  756. printk_deferred_once("sched: RT throttling activated\n");
  757. } else {
  758. /*
  759. * In case we accrued time anyway, make it go away;
  760. * replenishment is pointless, since it would replenish us
  761. * with exactly 0 ns.
  762. */
  763. rt_rq->rt_time = 0;
  764. }
  765. if (rt_rq_throttled(rt_rq)) {
  766. sched_rt_rq_dequeue(rt_rq);
  767. return 1;
  768. }
  769. }
  770. return 0;
  771. }
  772. /*
  773. * Update the current task's runtime statistics. Skip current tasks that
  774. * are not in our scheduling class.
  775. */
  776. static void update_curr_rt(struct rq *rq)
  777. {
  778. struct task_struct *curr = rq->curr;
  779. struct sched_rt_entity *rt_se = &curr->rt;
  780. u64 delta_exec;
  781. if (curr->sched_class != &rt_sched_class)
  782. return;
  783. delta_exec = rq_clock_task(rq) - curr->se.exec_start;
  784. if (unlikely((s64)delta_exec <= 0))
  785. return;
  786. /* Kick cpufreq (see the comment in kernel/sched/sched.h). */
  787. cpufreq_update_util(rq, SCHED_CPUFREQ_RT);
  788. schedstat_set(curr->se.statistics.exec_max,
  789. max(curr->se.statistics.exec_max, delta_exec));
  790. curr->se.sum_exec_runtime += delta_exec;
  791. account_group_exec_runtime(curr, delta_exec);
  792. curr->se.exec_start = rq_clock_task(rq);
  793. cpuacct_charge(curr, delta_exec);
  794. sched_rt_avg_update(rq, delta_exec);
  795. if (!rt_bandwidth_enabled())
  796. return;
  797. for_each_sched_rt_entity(rt_se) {
  798. struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
  799. if (sched_rt_runtime(rt_rq) != RUNTIME_INF) {
  800. raw_spin_lock(&rt_rq->rt_runtime_lock);
  801. rt_rq->rt_time += delta_exec;
  802. if (sched_rt_runtime_exceeded(rt_rq))
  803. resched_curr(rq);
  804. raw_spin_unlock(&rt_rq->rt_runtime_lock);
  805. }
  806. }
  807. }
  808. static void
  809. dequeue_top_rt_rq(struct rt_rq *rt_rq)
  810. {
  811. struct rq *rq = rq_of_rt_rq(rt_rq);
  812. BUG_ON(&rq->rt != rt_rq);
  813. if (!rt_rq->rt_queued)
  814. return;
  815. BUG_ON(!rq->nr_running);
  816. sub_nr_running(rq, rt_rq->rt_nr_running);
  817. rt_rq->rt_queued = 0;
  818. }
  819. static void
  820. enqueue_top_rt_rq(struct rt_rq *rt_rq)
  821. {
  822. struct rq *rq = rq_of_rt_rq(rt_rq);
  823. BUG_ON(&rq->rt != rt_rq);
  824. if (rt_rq->rt_queued)
  825. return;
  826. if (rt_rq_throttled(rt_rq) || !rt_rq->rt_nr_running)
  827. return;
  828. add_nr_running(rq, rt_rq->rt_nr_running);
  829. rt_rq->rt_queued = 1;
  830. }
  831. #if defined CONFIG_SMP
  832. static void
  833. inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
  834. {
  835. struct rq *rq = rq_of_rt_rq(rt_rq);
  836. #ifdef CONFIG_RT_GROUP_SCHED
  837. /*
  838. * Change rq's cpupri only if rt_rq is the top queue.
  839. */
  840. if (&rq->rt != rt_rq)
  841. return;
  842. #endif
  843. if (rq->online && prio < prev_prio)
  844. cpupri_set(&rq->rd->cpupri, rq->cpu, prio);
  845. }
  846. static void
  847. dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
  848. {
  849. struct rq *rq = rq_of_rt_rq(rt_rq);
  850. #ifdef CONFIG_RT_GROUP_SCHED
  851. /*
  852. * Change rq's cpupri only if rt_rq is the top queue.
  853. */
  854. if (&rq->rt != rt_rq)
  855. return;
  856. #endif
  857. if (rq->online && rt_rq->highest_prio.curr != prev_prio)
  858. cpupri_set(&rq->rd->cpupri, rq->cpu, rt_rq->highest_prio.curr);
  859. }
  860. #else /* CONFIG_SMP */
  861. static inline
  862. void inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) {}
  863. static inline
  864. void dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) {}
  865. #endif /* CONFIG_SMP */
  866. #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
  867. static void
  868. inc_rt_prio(struct rt_rq *rt_rq, int prio)
  869. {
  870. int prev_prio = rt_rq->highest_prio.curr;
  871. if (prio < prev_prio)
  872. rt_rq->highest_prio.curr = prio;
  873. inc_rt_prio_smp(rt_rq, prio, prev_prio);
  874. }
  875. static void
  876. dec_rt_prio(struct rt_rq *rt_rq, int prio)
  877. {
  878. int prev_prio = rt_rq->highest_prio.curr;
  879. if (rt_rq->rt_nr_running) {
  880. WARN_ON(prio < prev_prio);
  881. /*
  882. * This may have been our highest task, and therefore
  883. * we may have some recomputation to do
  884. */
  885. if (prio == prev_prio) {
  886. struct rt_prio_array *array = &rt_rq->active;
  887. rt_rq->highest_prio.curr =
  888. sched_find_first_bit(array->bitmap);
  889. }
  890. } else
  891. rt_rq->highest_prio.curr = MAX_RT_PRIO;
  892. dec_rt_prio_smp(rt_rq, prio, prev_prio);
  893. }
  894. #else
  895. static inline void inc_rt_prio(struct rt_rq *rt_rq, int prio) {}
  896. static inline void dec_rt_prio(struct rt_rq *rt_rq, int prio) {}
  897. #endif /* CONFIG_SMP || CONFIG_RT_GROUP_SCHED */
  898. #ifdef CONFIG_RT_GROUP_SCHED
  899. static void
  900. inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
  901. {
  902. if (rt_se_boosted(rt_se))
  903. rt_rq->rt_nr_boosted++;
  904. if (rt_rq->tg)
  905. start_rt_bandwidth(&rt_rq->tg->rt_bandwidth);
  906. }
  907. static void
  908. dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
  909. {
  910. if (rt_se_boosted(rt_se))
  911. rt_rq->rt_nr_boosted--;
  912. WARN_ON(!rt_rq->rt_nr_running && rt_rq->rt_nr_boosted);
  913. }
  914. #else /* CONFIG_RT_GROUP_SCHED */
  915. static void
  916. inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
  917. {
  918. start_rt_bandwidth(&def_rt_bandwidth);
  919. }
  920. static inline
  921. void dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) {}
  922. #endif /* CONFIG_RT_GROUP_SCHED */
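/*
 * A group entity contributes its whole group's rt_nr_running to the
 * parent rt_rq; a task entity contributes exactly one.
 */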
  923. static inline
  924. unsigned int rt_se_nr_running(struct sched_rt_entity *rt_se)
  925. {
  926. struct rt_rq *group_rq = group_rt_rq(rt_se);
  927. if (group_rq)
  928. return group_rq->rt_nr_running;
  929. else
  930. return 1;
  931. }
  932. static inline
  933. unsigned int rt_se_rr_nr_running(struct sched_rt_entity *rt_se)
  934. {
  935. struct rt_rq *group_rq = group_rt_rq(rt_se);
  936. struct task_struct *tsk;
  937. if (group_rq)
  938. return group_rq->rr_nr_running;
  939. tsk = rt_task_of(rt_se);
  940. return (tsk->policy == SCHED_RR) ? 1 : 0;
  941. }
  942. static inline
  943. void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
  944. {
  945. int prio = rt_se_prio(rt_se);
  946. WARN_ON(!rt_prio(prio));
  947. rt_rq->rt_nr_running += rt_se_nr_running(rt_se);
  948. rt_rq->rr_nr_running += rt_se_rr_nr_running(rt_se);
  949. inc_rt_prio(rt_rq, prio);
  950. inc_rt_migration(rt_se, rt_rq);
  951. inc_rt_group(rt_se, rt_rq);
  952. }
  953. static inline
  954. void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
  955. {
  956. WARN_ON(!rt_prio(rt_se_prio(rt_se)));
  957. WARN_ON(!rt_rq->rt_nr_running);
  958. rt_rq->rt_nr_running -= rt_se_nr_running(rt_se);
  959. rt_rq->rr_nr_running -= rt_se_rr_nr_running(rt_se);
  960. dec_rt_prio(rt_rq, rt_se_prio(rt_se));
  961. dec_rt_migration(rt_se, rt_rq);
  962. dec_rt_group(rt_se, rt_rq);
  963. }
  964. /*
  965. * Change rt_se->run_list location unless SAVE && !MOVE
  966. *
  967. * assumes ENQUEUE/DEQUEUE flags match
  968. */
  969. static inline bool move_entity(unsigned int flags)
  970. {
  971. if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) == DEQUEUE_SAVE)
  972. return false;
  973. return true;
  974. }
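/*
 * Take the entity off its priority list and clear the corresponding
 * bit in the priority bitmap once that list becomes empty.
 */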
  975. static void __delist_rt_entity(struct sched_rt_entity *rt_se, struct rt_prio_array *array)
  976. {
  977. list_del_init(&rt_se->run_list);
  978. if (list_empty(array->queue + rt_se_prio(rt_se)))
  979. __clear_bit(rt_se_prio(rt_se), array->bitmap);
  980. rt_se->on_list = 0;
  981. }
  982. static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
  983. {
  984. struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
  985. struct rt_prio_array *array = &rt_rq->active;
  986. struct rt_rq *group_rq = group_rt_rq(rt_se);
  987. struct list_head *queue = array->queue + rt_se_prio(rt_se);
  988. /*
  989. * Don't enqueue the group if it's throttled, or when empty.
  990. * The latter is a consequence of the former when a child group
  991. * gets throttled and the current group doesn't have any other
  992. * active members.
  993. */
  994. if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running)) {
  995. if (rt_se->on_list)
  996. __delist_rt_entity(rt_se, array);
  997. return;
  998. }
  999. if (move_entity(flags)) {
  1000. WARN_ON_ONCE(rt_se->on_list);
  1001. if (flags & ENQUEUE_HEAD)
  1002. list_add(&rt_se->run_list, queue);
  1003. else
  1004. list_add_tail(&rt_se->run_list, queue);
  1005. __set_bit(rt_se_prio(rt_se), array->bitmap);
  1006. rt_se->on_list = 1;
  1007. }
  1008. rt_se->on_rq = 1;
  1009. inc_rt_tasks(rt_se, rt_rq);
  1010. }
  1011. static void __dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
  1012. {
  1013. struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
  1014. struct rt_prio_array *array = &rt_rq->active;
  1015. if (move_entity(flags)) {
  1016. WARN_ON_ONCE(!rt_se->on_list);
  1017. __delist_rt_entity(rt_se, array);
  1018. }
  1019. rt_se->on_rq = 0;
  1020. dec_rt_tasks(rt_se, rt_rq);
  1021. }
  1022. /*
  1023. * Because the prio of an upper entry depends on the lower
  1024. * entries, we must remove entries top-down.
  1025. */
  1026. static void dequeue_rt_stack(struct sched_rt_entity *rt_se, unsigned int flags)
  1027. {
  1028. struct sched_rt_entity *back = NULL;
  1029. for_each_sched_rt_entity(rt_se) {
  1030. rt_se->back = back;
  1031. back = rt_se;
  1032. }
  1033. dequeue_top_rt_rq(rt_rq_of_se(back));
  1034. for (rt_se = back; rt_se; rt_se = rt_se->back) {
  1035. if (on_rt_rq(rt_se))
  1036. __dequeue_rt_entity(rt_se, flags);
  1037. }
  1038. }
  1039. static void enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
  1040. {
  1041. struct rq *rq = rq_of_rt_se(rt_se);
  1042. dequeue_rt_stack(rt_se, flags);
  1043. for_each_sched_rt_entity(rt_se)
  1044. __enqueue_rt_entity(rt_se, flags);
  1045. enqueue_top_rt_rq(&rq->rt);
  1046. }
  1047. static void dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
  1048. {
  1049. struct rq *rq = rq_of_rt_se(rt_se);
  1050. dequeue_rt_stack(rt_se, flags);
  1051. for_each_sched_rt_entity(rt_se) {
  1052. struct rt_rq *rt_rq = group_rt_rq(rt_se);
  1053. if (rt_rq && rt_rq->rt_nr_running)
  1054. __enqueue_rt_entity(rt_se, flags);
  1055. }
  1056. enqueue_top_rt_rq(&rq->rt);
  1057. }
  1058. /*
  1059. * Adding/removing a task to/from a priority array:
  1060. */
  1061. static void
  1062. enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags)
  1063. {
  1064. struct sched_rt_entity *rt_se = &p->rt;
  1065. if (flags & ENQUEUE_WAKEUP)
  1066. rt_se->timeout = 0;
  1067. enqueue_rt_entity(rt_se, flags);
  1068. if (!task_current(rq, p) && p->nr_cpus_allowed > 1)
  1069. enqueue_pushable_task(rq, p);
  1070. }
  1071. static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
  1072. {
  1073. struct sched_rt_entity *rt_se = &p->rt;
  1074. update_curr_rt(rq);
  1075. dequeue_rt_entity(rt_se, flags);
  1076. dequeue_pushable_task(rq, p);
  1077. }
  1078. /*
  1079. * Put task to the head or the end of the run list without the overhead of
  1080. * dequeue followed by enqueue.
  1081. */
  1082. static void
  1083. requeue_rt_entity(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se, int head)
  1084. {
  1085. if (on_rt_rq(rt_se)) {
  1086. struct rt_prio_array *array = &rt_rq->active;
  1087. struct list_head *queue = array->queue + rt_se_prio(rt_se);
  1088. if (head)
  1089. list_move(&rt_se->run_list, queue);
  1090. else
  1091. list_move_tail(&rt_se->run_list, queue);
  1092. }
  1093. }
  1094. static void requeue_task_rt(struct rq *rq, struct task_struct *p, int head)
  1095. {
  1096. struct sched_rt_entity *rt_se = &p->rt;
  1097. struct rt_rq *rt_rq;
  1098. for_each_sched_rt_entity(rt_se) {
  1099. rt_rq = rt_rq_of_se(rt_se);
  1100. requeue_rt_entity(rt_rq, rt_se, head);
  1101. }
  1102. }
  1103. static void yield_task_rt(struct rq *rq)
  1104. {
  1105. requeue_task_rt(rq, rq->curr, 0);
  1106. }
  1107. #ifdef CONFIG_SMP
  1108. static int find_lowest_rq(struct task_struct *task);
  1109. static int
  1110. select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags)
  1111. {
  1112. struct task_struct *curr;
  1113. struct rq *rq;
  1114. /* For anything but wake ups, just return the task_cpu */
  1115. if (sd_flag != SD_BALANCE_WAKE && sd_flag != SD_BALANCE_FORK)
  1116. goto out;
  1117. rq = cpu_rq(cpu);
  1118. rcu_read_lock();
  1119. curr = READ_ONCE(rq->curr); /* unlocked access */
  1120. /*
  1121. * If the current task on @p's runqueue is an RT task, then
  1122. * try to see if we can wake this RT task up on another
  1123. * runqueue. Otherwise simply start this RT task
  1124. * on its current runqueue.
  1125. *
  1126. * We want to avoid overloading runqueues. If the woken
  1127. * task is a higher priority, then it will stay on this CPU
  1128. * and the lower prio task should be moved to another CPU.
  1129. * Even though this will probably make the lower prio task
  1130. * lose its cache, we do not want to bounce a higher-priority task
  1131. * around just because it gave up its CPU, perhaps for a
  1132. * lock?
  1133. *
  1134. * For equal prio tasks, we just let the scheduler sort it out.
  1135. *
  1136. * Otherwise, just let it ride on the affined RQ and the
  1137. * post-schedule router will push the preempted task away
  1138. *
  1139. * This test is optimistic, if we get it wrong the load-balancer
  1140. * will have to sort it out.
  1141. */
  1142. if (curr && unlikely(rt_task(curr)) &&
  1143. (curr->nr_cpus_allowed < 2 ||
  1144. curr->prio <= p->prio)) {
  1145. int target = find_lowest_rq(p);
  1146. /*
  1147. * Don't bother moving it if the destination CPU is
  1148. * not running a lower priority task.
  1149. */
  1150. if (target != -1 &&
  1151. p->prio < cpu_rq(target)->rt.highest_prio.curr)
  1152. cpu = target;
  1153. }
  1154. rcu_read_unlock();
  1155. out:
  1156. return cpu;
  1157. }
  1158. static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
  1159. {
  1160. /*
  1161. * Current can't be migrated, useless to reschedule,
  1162. * let's hope p can move out.
  1163. */
  1164. if (rq->curr->nr_cpus_allowed == 1 ||
  1165. !cpupri_find(&rq->rd->cpupri, rq->curr, NULL))
  1166. return;
  1167. /*
  1168. * p is migratable, so let's not schedule it and
  1169. * see if it is pushed or pulled somewhere else.
  1170. */
  1171. if (p->nr_cpus_allowed != 1
  1172. && cpupri_find(&rq->rd->cpupri, p, NULL))
  1173. return;
  1174. /*
  1175. * There appear to be other CPUs that can accept
  1176. * current and none to run 'p', so let's reschedule
  1177. * to try and push current away:
  1178. */
  1179. requeue_task_rt(rq, p, 1);
  1180. resched_curr(rq);
  1181. }
  1182. #endif /* CONFIG_SMP */
  1183. /*
  1184. * Preempt the current task with a newly woken task if needed:
  1185. */
  1186. static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int flags)
  1187. {
  1188. if (p->prio < rq->curr->prio) {
  1189. resched_curr(rq);
  1190. return;
  1191. }
  1192. #ifdef CONFIG_SMP
  1193. /*
  1194. * If:
  1195. *
  1196. * - the newly woken task is of equal priority to the current task
  1197. * - the newly woken task is non-migratable while current is migratable
  1198. * - current will be preempted on the next reschedule
  1199. *
  1200. * we should check to see if current can readily move to a different
  1201. * cpu. If so, we will reschedule to allow the push logic to try
  1202. * to move current somewhere else, making room for our non-migratable
  1203. * task.
  1204. */
  1205. if (p->prio == rq->curr->prio && !test_tsk_need_resched(rq->curr))
  1206. check_preempt_equal_prio(rq, p);
  1207. #endif
  1208. }
  1209. static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq,
  1210. struct rt_rq *rt_rq)
  1211. {
  1212. struct rt_prio_array *array = &rt_rq->active;
  1213. struct sched_rt_entity *next = NULL;
  1214. struct list_head *queue;
  1215. int idx;
  1216. idx = sched_find_first_bit(array->bitmap);
  1217. BUG_ON(idx >= MAX_RT_PRIO);
  1218. queue = array->queue + idx;
  1219. next = list_entry(queue->next, struct sched_rt_entity, run_list);
  1220. return next;
  1221. }
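/*
 * Walk down the rt_rq hierarchy, picking the highest-priority entity
 * at each level, until a task entity (leaf) is reached.
 */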
  1222. static struct task_struct *_pick_next_task_rt(struct rq *rq)
  1223. {
  1224. struct sched_rt_entity *rt_se;
  1225. struct task_struct *p;
  1226. struct rt_rq *rt_rq = &rq->rt;
  1227. do {
  1228. rt_se = pick_next_rt_entity(rq, rt_rq);
  1229. BUG_ON(!rt_se);
  1230. rt_rq = group_rt_rq(rt_se);
  1231. } while (rt_rq);
  1232. p = rt_task_of(rt_se);
  1233. p->se.exec_start = rq_clock_task(rq);
  1234. return p;
  1235. }
  1236. static struct task_struct *
  1237. pick_next_task_rt(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
  1238. {
  1239. struct task_struct *p;
  1240. struct rt_rq *rt_rq = &rq->rt;
  1241. if (need_pull_rt_task(rq, prev)) {
  1242. /*
  1243. * This is OK, because current is on_cpu, which avoids it being
  1244. * picked for load-balance and preemption/IRQs are still
  1245. * disabled avoiding further scheduler activity on it and we're
  1246. * being very careful to re-start the picking loop.
  1247. */
  1248. rq_unpin_lock(rq, rf);
  1249. pull_rt_task(rq);
  1250. rq_repin_lock(rq, rf);
  1251. /*
  1252. * pull_rt_task() can drop (and re-acquire) rq->lock; this
  1253. * means a dl or stop task can slip in, in which case we need
  1254. * to re-start task selection.
  1255. */
  1256. if (unlikely((rq->stop && task_on_rq_queued(rq->stop)) ||
  1257. rq->dl.dl_nr_running))
  1258. return RETRY_TASK;
  1259. }
  1260. /*
  1261. * We may dequeue prev's rt_rq in put_prev_task().
  1262. * So, we update time before rt_nr_running check.
  1263. */
  1264. if (prev->sched_class == &rt_sched_class)
  1265. update_curr_rt(rq);
  1266. if (!rt_rq->rt_queued)
  1267. return NULL;
  1268. put_prev_task(rq, prev);
  1269. p = _pick_next_task_rt(rq);
  1270. /* The running task is never eligible for pushing */
  1271. dequeue_pushable_task(rq, p);
  1272. queue_push_tasks(rq);
  1273. return p;
  1274. }
  1275. static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
  1276. {
  1277. update_curr_rt(rq);
  1278. /*
  1279. * The previous task needs to be made eligible for pushing
  1280. * if it is still active
  1281. */
  1282. if (on_rt_rq(&p->rt) && p->nr_cpus_allowed > 1)
  1283. enqueue_pushable_task(rq, p);
  1284. }
  1285. #ifdef CONFIG_SMP
  1286. /* Only try algorithms three times */
  1287. #define RT_MAX_TRIES 3
  1288. static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
  1289. {
  1290. if (!task_running(rq, p) &&
  1291. cpumask_test_cpu(cpu, &p->cpus_allowed))
  1292. return 1;
  1293. return 0;
  1294. }
  1295. /*
  1296. * Return the highest-priority pushable task on this rq that can run
  1297. * on @cpu, or NULL if there is none.
  1298. */
  1299. static struct task_struct *pick_highest_pushable_task(struct rq *rq, int cpu)
  1300. {
  1301. struct plist_head *head = &rq->rt.pushable_tasks;
  1302. struct task_struct *p;
  1303. if (!has_pushable_tasks(rq))
  1304. return NULL;
  1305. plist_for_each_entry(p, head, pushable_tasks) {
  1306. if (pick_rt_task(rq, p, cpu))
  1307. return p;
  1308. }
  1309. return NULL;
  1310. }
  1311. static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask);
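/*
 * Find a CPU running only lower-priority work that @task could be
 * pushed to: build the candidate mask via cpupri, then prefer the
 * task's previous CPU, then a topologically close CPU, and finally
 * any CPU left in the mask.
 */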
static int find_lowest_rq(struct task_struct *task)
{
	struct sched_domain *sd;
	struct cpumask *lowest_mask = this_cpu_cpumask_var_ptr(local_cpu_mask);
	int this_cpu = smp_processor_id();
	int cpu      = task_cpu(task);

	/* Make sure the mask is initialized first */
	if (unlikely(!lowest_mask))
		return -1;

	if (task->nr_cpus_allowed == 1)
		return -1; /* No other targets possible */

	if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask))
		return -1; /* No targets found */

	/*
	 * At this point we have built a mask of cpus representing the
	 * lowest priority tasks in the system.  Now we want to elect
	 * the best one based on our affinity and topology.
	 *
	 * We prioritize the last cpu that the task executed on since
	 * it is most likely cache-hot in that location.
	 */
	if (cpumask_test_cpu(cpu, lowest_mask))
		return cpu;

	/*
	 * Otherwise, we consult the sched_domains span maps to figure
	 * out which cpu is logically closest to our hot cache data.
	 */
	if (!cpumask_test_cpu(this_cpu, lowest_mask))
		this_cpu = -1; /* Skip this_cpu opt if not among lowest */

	rcu_read_lock();
	for_each_domain(cpu, sd) {
		if (sd->flags & SD_WAKE_AFFINE) {
			int best_cpu;

			/*
			 * "this_cpu" is cheaper to preempt than a
			 * remote processor.
			 */
			if (this_cpu != -1 &&
			    cpumask_test_cpu(this_cpu, sched_domain_span(sd))) {
				rcu_read_unlock();
				return this_cpu;
			}

			best_cpu = cpumask_first_and(lowest_mask,
						     sched_domain_span(sd));
			if (best_cpu < nr_cpu_ids) {
				rcu_read_unlock();
				return best_cpu;
			}
		}
	}
	rcu_read_unlock();

	/*
	 * And finally, if there were no matches within the domains
	 * just give the caller *something* to work with from the compatible
	 * locations.
	 */
	if (this_cpu != -1)
		return this_cpu;

	cpu = cpumask_any(lowest_mask);
	if (cpu < nr_cpu_ids)
		return cpu;

	return -1;
}
/* Will lock the rq it finds */
static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
{
	struct rq *lowest_rq = NULL;
	int tries;
	int cpu;

	for (tries = 0; tries < RT_MAX_TRIES; tries++) {
		cpu = find_lowest_rq(task);

		if ((cpu == -1) || (cpu == rq->cpu))
			break;

		lowest_rq = cpu_rq(cpu);

		if (lowest_rq->rt.highest_prio.curr <= task->prio) {
			/*
			 * Target rq has tasks of equal or higher priority,
			 * retrying does not release any lock and is unlikely
			 * to yield a different result.
			 */
			lowest_rq = NULL;
			break;
		}

		/* if the prio of this runqueue changed, try again */
		if (double_lock_balance(rq, lowest_rq)) {
			/*
			 * We had to unlock the run queue. In
			 * the mean time, task could have
			 * migrated already or had its affinity changed.
			 * Also make sure that it wasn't scheduled on its rq.
			 */
			if (unlikely(task_rq(task) != rq ||
				     !cpumask_test_cpu(lowest_rq->cpu, &task->cpus_allowed) ||
				     task_running(rq, task) ||
				     !rt_task(task) ||
				     !task_on_rq_queued(task))) {

				double_unlock_balance(rq, lowest_rq);
				lowest_rq = NULL;
				break;
			}
		}

		/* If this rq is still suitable use it. */
		if (lowest_rq->rt.highest_prio.curr > task->prio)
			break;

		/* try again */
		double_unlock_balance(rq, lowest_rq);
		lowest_rq = NULL;
	}

	return lowest_rq;
}
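
/*
 * Return the highest-priority task on this rq's pushable list, or NULL if
 * the list is empty. The BUG_ON()s below spell out the invariants that every
 * pushable task must satisfy.
 */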
static struct task_struct *pick_next_pushable_task(struct rq *rq)
{
	struct task_struct *p;

	if (!has_pushable_tasks(rq))
		return NULL;

	p = plist_first_entry(&rq->rt.pushable_tasks,
			      struct task_struct, pushable_tasks);

	BUG_ON(rq->cpu != task_cpu(p));
	BUG_ON(task_current(rq, p));
	BUG_ON(p->nr_cpus_allowed <= 1);

	BUG_ON(!task_on_rq_queued(p));
	BUG_ON(!rt_task(p));

	return p;
}
/*
 * If the current CPU has more than one RT task, see if the non-running
 * task can migrate over to a CPU that is running a task of lesser
 * priority.
 */
static int push_rt_task(struct rq *rq)
{
	struct task_struct *next_task;
	struct rq *lowest_rq;
	int ret = 0;

	if (!rq->rt.overloaded)
		return 0;

	next_task = pick_next_pushable_task(rq);
	if (!next_task)
		return 0;

retry:
	if (unlikely(next_task == rq->curr)) {
		WARN_ON(1);
		return 0;
	}

	/*
	 * It's possible that next_task slipped in with a higher
	 * priority than current. If that's the case
	 * just reschedule current.
	 */
	if (unlikely(next_task->prio < rq->curr->prio)) {
		resched_curr(rq);
		return 0;
	}

	/* We might release rq lock */
	get_task_struct(next_task);

	/* find_lock_lowest_rq locks the rq if found */
	lowest_rq = find_lock_lowest_rq(next_task, rq);
	if (!lowest_rq) {
		struct task_struct *task;
		/*
		 * find_lock_lowest_rq releases rq->lock
		 * so it is possible that next_task has migrated.
		 *
		 * We need to make sure that the task is still on the same
		 * run-queue and is also still the next task eligible for
		 * pushing.
		 */
		task = pick_next_pushable_task(rq);
		if (task == next_task) {
			/*
			 * The task hasn't migrated, and is still the next
			 * eligible task, but we failed to find a run-queue
			 * to push it to. Do not retry in this case, since
			 * other cpus will pull from us when ready.
			 */
			goto out;
		}

		if (!task)
			/* No more tasks, just exit */
			goto out;

		/*
		 * Something has shifted, try again.
		 */
		put_task_struct(next_task);
		next_task = task;
		goto retry;
	}

	deactivate_task(rq, next_task, 0);
	set_task_cpu(next_task, lowest_rq->cpu);
	activate_task(lowest_rq, next_task, 0);
	ret = 1;

	resched_curr(lowest_rq);

	double_unlock_balance(rq, lowest_rq);

out:
	put_task_struct(next_task);

	return ret;
}
static void push_rt_tasks(struct rq *rq)
{
	/* push_rt_task will return true if it moved an RT */
	while (push_rt_task(rq))
		;
}
#ifdef HAVE_RT_PUSH_IPI

/*
 * The search for the next cpu always starts at rq->cpu and ends
 * when we reach rq->cpu again. It will never return rq->cpu.
 * This returns the next cpu to check, or nr_cpu_ids if the loop
 * is complete.
 *
 * rq->rt.push_cpu holds the last cpu returned by this function,
 * or if this is the first instance, it must hold rq->cpu.
 */
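/*
 * Illustrative walk (assuming rto_mask = {1, 4, 5} and rq->cpu == 4, with
 * push_cpu initialized to rq->cpu): the first call returns 5, the second
 * wraps around the mask and returns 1, and the third sees that the next
 * candidate would be rq->cpu again and returns nr_cpu_ids to end the loop.
 */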
static int rto_next_cpu(struct rq *rq)
{
	int prev_cpu = rq->rt.push_cpu;
	int cpu;

	cpu = cpumask_next(prev_cpu, rq->rd->rto_mask);

	/*
	 * If the previous cpu is less than the rq's CPU, then it already
	 * passed the end of the mask, and has started from the beginning.
	 * We end if the next CPU is greater or equal to rq's CPU.
	 */
	if (prev_cpu < rq->cpu) {
		if (cpu >= rq->cpu)
			return nr_cpu_ids;

	} else if (cpu >= nr_cpu_ids) {
		/*
		 * We passed the end of the mask, start at the beginning.
		 * If the result is greater or equal to the rq's CPU, then
		 * the loop is finished.
		 */
		cpu = cpumask_first(rq->rd->rto_mask);
		if (cpu >= rq->cpu)
			return nr_cpu_ids;
	}
	rq->rt.push_cpu = cpu;

	/* Return cpu to let the caller know if the loop is finished or not */
	return cpu;
}
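
/*
 * Walk the overloaded CPUs (via rto_next_cpu()) and return the first one
 * whose next-highest pushable task has a higher priority than @rq's current
 * highest, i.e. a CPU that could usefully push a task towards @rq. Returns
 * nr_cpu_ids once the walk is complete.
 */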
static int find_next_push_cpu(struct rq *rq)
{
	struct rq *next_rq;
	int cpu;

	while (1) {
		cpu = rto_next_cpu(rq);
		if (cpu >= nr_cpu_ids)
			break;
		next_rq = cpu_rq(cpu);

		/* Make sure the next rq can push to this rq */
		if (next_rq->rt.highest_prio.next < rq->rt.highest_prio.curr)
			break;
	}

	return cpu;
}

#define RT_PUSH_IPI_EXECUTING		1
#define RT_PUSH_IPI_RESTART		2
/*
 * When a high priority task schedules out from a CPU and a lower priority
 * task is scheduled in, a check is made to see if there are any RT tasks
 * on other CPUs that are waiting to run because a higher priority RT task
 * is currently running on its CPU. In this case, the CPU with multiple RT
 * tasks queued on it (overloaded) needs to be notified that a CPU has opened
 * up that may be able to run one of its non-running queued RT tasks.
 *
 * On large CPU boxes, there's the case that several CPUs could schedule
 * a lower priority task at the same time, in which case it will look for
 * any overloaded CPUs that it could pull a task from. To do this, the runqueue
 * lock must be taken from that overloaded CPU. Having 10s of CPUs all fighting
 * for a single overloaded CPU's runqueue lock can produce a large latency.
 * (This has actually been observed on large boxes running cyclictest).
 * Instead of taking the runqueue lock of the overloaded CPU, each of the
 * CPUs that scheduled a lower priority task simply sends an IPI to the
 * overloaded CPU. An IPI is much cheaper than taking a runqueue lock with
 * lots of contention. The overloaded CPU will look to push its non-running
 * RT task off, and if it does, it can then ignore the other IPIs coming
 * in, and just pass those IPIs off to any other overloaded CPU.
 *
 * When a CPU schedules a lower priority task, it only sends an IPI to
 * the "next" CPU that has overloaded RT tasks. This prevents IPI storms,
 * as having 10 CPUs scheduling lower priority tasks and 10 CPUs with
 * RT overloaded tasks would cause 100 IPIs to go out at once.
 *
 * The overloaded RT CPU, when receiving an IPI, will try to push off its
 * overloaded RT tasks and then send an IPI to the next CPU that has
 * overloaded RT tasks. This stops when all CPUs with overloaded RT tasks
 * have completed. Just because a CPU may have pushed off its own overloaded
 * RT task does not mean it should stop sending the IPI around to other
 * overloaded CPUs. There may be another RT task waiting to run on one of
 * those CPUs that is of higher priority than the one that was just
 * pushed.
 *
 * An optimization that could possibly be made is to make a CPU array similar
 * to the cpupri array mask of all running RT tasks, but for the overloaded
 * case, then the IPI could be sent to only the CPU with the highest priority
 * RT task waiting, and that CPU could send off further IPIs to the CPU with
 * the next highest waiting task. Since the overloaded case is much less likely
 * to happen, the complexity of this implementation may not be worth it.
 * Instead, just send an IPI around to all overloaded CPUs.
 *
 * The rq->rt.push_flags holds the status of the IPI that is going around.
 * A run queue can only send out a single IPI at a time. The possible flags
 * for rq->rt.push_flags are:
 *
 *    (None or zero):		No IPI is going around for the current rq
 *    RT_PUSH_IPI_EXECUTING:	An IPI for the rq is being passed around
 *    RT_PUSH_IPI_RESTART:	The priority of the running task for the rq
 *				has changed, and the IPI should restart
 *				circulating the overloaded CPUs again.
 *
 * rq->rt.push_cpu contains the CPU that is being sent the IPI. It is updated
 * before sending to the next CPU.
 *
 * Instead of having all CPUs that schedule a lower priority task send
 * an IPI to the same "first" CPU in the RT overload mask, they send it
 * to the next overloaded CPU after their own CPU. This helps distribute
 * the work when there's more than one overloaded CPU and multiple CPUs
 * scheduling in lower priority tasks.
 *
 * When a rq schedules a lower priority task than what was currently
 * running, the next CPU with overloaded RT tasks is examined first.
 * That is, if CPU 1 and 5 are overloaded, and CPU 3 schedules a lower
 * priority task, it will send an IPI first to CPU 5, then CPU 5 will
 * send to CPU 1 if it is still overloaded. CPU 1 will clear the
 * rq->rt.push_flags if RT_PUSH_IPI_RESTART is not set.
 *
 * The first CPU to notice IPI_RESTART is set will clear that flag and then
 * send an IPI to the next overloaded CPU after the rq->cpu and not the next
 * CPU after push_cpu. That is, if CPU 1, 4 and 5 are overloaded when CPU 3
 * schedules a lower priority task, and the IPI_RESTART gets set while the
 * handling is being done on CPU 5, it will clear the flag and send it back to
 * CPU 4 instead of CPU 1.
 *
 * Note, the above logic can be disabled by turning off the sched_feature
 * RT_PUSH_IPI. Then the rq lock of the overloaded CPU will simply be
 * taken by the CPU requesting a pull and the waiting RT task will be pulled
 * by that CPU. This may be fine for machines with few CPUs.
 */
static void tell_cpu_to_push(struct rq *rq)
{
	int cpu;

	if (rq->rt.push_flags & RT_PUSH_IPI_EXECUTING) {
		raw_spin_lock(&rq->rt.push_lock);
		/* Make sure it's still executing */
		if (rq->rt.push_flags & RT_PUSH_IPI_EXECUTING) {
			/*
			 * Tell the IPI to restart the loop as things have
			 * changed since it started.
			 */
			rq->rt.push_flags |= RT_PUSH_IPI_RESTART;
			raw_spin_unlock(&rq->rt.push_lock);
			return;
		}
		raw_spin_unlock(&rq->rt.push_lock);
	}

	/* When here, there's no IPI going around */

	rq->rt.push_cpu = rq->cpu;
	cpu = find_next_push_cpu(rq);
	if (cpu >= nr_cpu_ids)
		return;

	rq->rt.push_flags = RT_PUSH_IPI_EXECUTING;

	irq_work_queue_on(&rq->rt.push_work, cpu);
}
/* Called from hardirq context */
static void try_to_push_tasks(void *arg)
{
	struct rt_rq *rt_rq = arg;
	struct rq *rq, *src_rq;
	int this_cpu;
	int cpu;

	this_cpu = rt_rq->push_cpu;

	/* Paranoid check */
	BUG_ON(this_cpu != smp_processor_id());

	rq = cpu_rq(this_cpu);
	src_rq = rq_of_rt_rq(rt_rq);

again:
	if (has_pushable_tasks(rq)) {
		raw_spin_lock(&rq->lock);
		push_rt_task(rq);
		raw_spin_unlock(&rq->lock);
	}

	/* Pass the IPI to the next rt overloaded queue */
	raw_spin_lock(&rt_rq->push_lock);
	/*
	 * If the source queue changed since the IPI went out,
	 * we need to restart the search from that CPU again.
	 */
	if (rt_rq->push_flags & RT_PUSH_IPI_RESTART) {
		rt_rq->push_flags &= ~RT_PUSH_IPI_RESTART;
		rt_rq->push_cpu = src_rq->cpu;
	}

	cpu = find_next_push_cpu(src_rq);

	if (cpu >= nr_cpu_ids)
		rt_rq->push_flags &= ~RT_PUSH_IPI_EXECUTING;
	raw_spin_unlock(&rt_rq->push_lock);

	if (cpu >= nr_cpu_ids)
		return;

	/*
	 * It is possible that a restart caused this CPU to be
	 * chosen again. Don't bother with an IPI, just see if we
	 * have more to push.
	 */
	if (unlikely(cpu == rq->cpu))
		goto again;

	/* Try the next RT overloaded CPU */
	irq_work_queue_on(&rt_rq->push_work, cpu);
}

static void push_irq_work_func(struct irq_work *work)
{
	struct rt_rq *rt_rq = container_of(work, struct rt_rq, push_work);

	try_to_push_tasks(rt_rq);
}
#endif /* HAVE_RT_PUSH_IPI */
static void pull_rt_task(struct rq *this_rq)
{
	int this_cpu = this_rq->cpu, cpu;
	bool resched = false;
	struct task_struct *p;
	struct rq *src_rq;

	if (likely(!rt_overloaded(this_rq)))
		return;

	/*
	 * Match the barrier from rt_set_overload(); this guarantees that if we
	 * see overloaded we must also see the rto_mask bit.
	 */
	smp_rmb();

#ifdef HAVE_RT_PUSH_IPI
	if (sched_feat(RT_PUSH_IPI)) {
		tell_cpu_to_push(this_rq);
		return;
	}
#endif

	for_each_cpu(cpu, this_rq->rd->rto_mask) {
		if (this_cpu == cpu)
			continue;

		src_rq = cpu_rq(cpu);

		/*
		 * Don't bother taking the src_rq->lock if the next highest
		 * task is known to be lower-priority than our current task.
		 * This may look racy, but if this value is about to go
		 * logically higher, the src_rq will push this task away.
		 * And if it's going logically lower, we do not care.
		 */
		if (src_rq->rt.highest_prio.next >=
		    this_rq->rt.highest_prio.curr)
			continue;

		/*
		 * We can potentially drop this_rq's lock in
		 * double_lock_balance, and another CPU could
		 * alter this_rq
		 */
		double_lock_balance(this_rq, src_rq);

		/*
		 * We can only pull a task that is pushable
		 * on its rq, and no others.
		 */
		p = pick_highest_pushable_task(src_rq, this_cpu);

		/*
		 * Do we have an RT task that preempts
		 * the to-be-scheduled task?
		 */
		if (p && (p->prio < this_rq->rt.highest_prio.curr)) {
			WARN_ON(p == src_rq->curr);
			WARN_ON(!task_on_rq_queued(p));

			/*
			 * There's a chance that p is higher in priority
			 * than what's currently running on its cpu.
			 * This is just that p is waking up and hasn't
			 * had a chance to schedule. We only pull
			 * p if it is lower in priority than the
			 * current task on the run queue.
			 */
			if (p->prio < src_rq->curr->prio)
				goto skip;

			resched = true;

			deactivate_task(src_rq, p, 0);
			set_task_cpu(p, this_cpu);
			activate_task(this_rq, p, 0);
			/*
			 * We continue with the search, just in
			 * case there's an even higher prio task
			 * in another runqueue. (low likelihood
			 * but possible)
			 */
		}
skip:
		double_unlock_balance(this_rq, src_rq);
	}

	if (resched)
		resched_curr(this_rq);
}
/*
 * If we are not running and we are not going to reschedule soon, we should
 * try to push tasks away now
 */
static void task_woken_rt(struct rq *rq, struct task_struct *p)
{
	if (!task_running(rq, p) &&
	    !test_tsk_need_resched(rq->curr) &&
	    p->nr_cpus_allowed > 1 &&
	    (dl_task(rq->curr) || rt_task(rq->curr)) &&
	    (rq->curr->nr_cpus_allowed < 2 ||
	     rq->curr->prio <= p->prio))
		push_rt_tasks(rq);
}
/* Assumes rq->lock is held */
static void rq_online_rt(struct rq *rq)
{
	if (rq->rt.overloaded)
		rt_set_overload(rq);

	__enable_runtime(rq);

	cpupri_set(&rq->rd->cpupri, rq->cpu, rq->rt.highest_prio.curr);
}

/* Assumes rq->lock is held */
static void rq_offline_rt(struct rq *rq)
{
	if (rq->rt.overloaded)
		rt_clear_overload(rq);

	__disable_runtime(rq);

	cpupri_set(&rq->rd->cpupri, rq->cpu, CPUPRI_INVALID);
}
/*
 * When switching from the rt queue, we bring ourselves to a position
 * that we might want to pull RT tasks from other runqueues.
 */
static void switched_from_rt(struct rq *rq, struct task_struct *p)
{
	/*
	 * If there are other RT tasks then we will reschedule
	 * and the scheduling of the other RT tasks will handle
	 * the balancing. But if we are the last RT task
	 * we may need to handle the pulling of RT tasks
	 * now.
	 */
	if (!task_on_rq_queued(p) || rq->rt.rt_nr_running)
		return;

	queue_pull_task(rq);
}

void __init init_sched_rt_class(void)
{
	unsigned int i;

	for_each_possible_cpu(i) {
		zalloc_cpumask_var_node(&per_cpu(local_cpu_mask, i),
					GFP_KERNEL, cpu_to_node(i));
	}
}
#endif /* CONFIG_SMP */
/*
 * When switching a task to RT, we may overload the runqueue
 * with RT tasks. In this case we try to push them off to
 * other runqueues.
 */
static void switched_to_rt(struct rq *rq, struct task_struct *p)
{
	/*
	 * If we are already running, then there's nothing
	 * that needs to be done. But if we are not running
	 * we may need to preempt the current running task.
	 * If that current running task is also an RT task
	 * then see if we can move to another run queue.
	 */
	if (task_on_rq_queued(p) && rq->curr != p) {
#ifdef CONFIG_SMP
		if (p->nr_cpus_allowed > 1 && rq->rt.overloaded)
			queue_push_tasks(rq);
#endif /* CONFIG_SMP */
		if (p->prio < rq->curr->prio)
			resched_curr(rq);
	}
}
/*
 * Priority of the task has changed. This may cause
 * us to initiate a push or pull.
 */
static void
prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio)
{
	if (!task_on_rq_queued(p))
		return;

	if (rq->curr == p) {
#ifdef CONFIG_SMP
		/*
		 * If our priority decreases while running, we
		 * may need to pull tasks to this runqueue.
		 */
		if (oldprio < p->prio)
			queue_pull_task(rq);

		/*
		 * If there's a higher priority task waiting to run
		 * then reschedule.
		 */
		if (p->prio > rq->rt.highest_prio.curr)
			resched_curr(rq);
#else
		/* For UP simply resched on drop of prio */
		if (oldprio < p->prio)
			resched_curr(rq);
#endif /* CONFIG_SMP */
	} else {
		/*
		 * This task is not running, but if it has a higher
		 * priority than the currently running task
		 * then reschedule.
		 */
		if (p->prio < rq->curr->prio)
			resched_curr(rq);
	}
}
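
/*
 * RLIMIT_RTTIME enforcement: watchdog() counts the ticks an RT task has
 * spent running (p->rt.timeout) and, once the soft limit is exceeded, arms
 * p->cputime_expires.sched_exp so the POSIX CPU-timer code can act on the
 * limit at its next check.
 */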
#ifdef CONFIG_POSIX_TIMERS
static void watchdog(struct rq *rq, struct task_struct *p)
{
	unsigned long soft, hard;

	/* max may change after cur was read, this will be fixed next tick */
	soft = task_rlimit(p, RLIMIT_RTTIME);
	hard = task_rlimit_max(p, RLIMIT_RTTIME);

	if (soft != RLIM_INFINITY) {
		unsigned long next;

		if (p->rt.watchdog_stamp != jiffies) {
			p->rt.timeout++;
			p->rt.watchdog_stamp = jiffies;
		}
		next = DIV_ROUND_UP(min(soft, hard), USEC_PER_SEC/HZ);
		if (p->rt.timeout > next)
			p->cputime_expires.sched_exp = p->se.sum_exec_runtime;
	}
}
#else
static inline void watchdog(struct rq *rq, struct task_struct *p) { }
#endif
static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
{
	struct sched_rt_entity *rt_se = &p->rt;

	update_curr_rt(rq);

	watchdog(rq, p);

	/*
	 * RR tasks need a special form of timeslice management.
	 * FIFO tasks have no timeslices.
	 */
	if (p->policy != SCHED_RR)
		return;

	if (--p->rt.time_slice)
		return;

	p->rt.time_slice = sched_rr_timeslice;

	/*
	 * Requeue to the end of queue if we (and all of our ancestors) are not
	 * the only element on the queue
	 */
	for_each_sched_rt_entity(rt_se) {
		if (rt_se->run_list.prev != rt_se->run_list.next) {
			requeue_task_rt(rq, p, 0);
			resched_curr(rq);
			return;
		}
	}
}
static void set_curr_task_rt(struct rq *rq)
{
	struct task_struct *p = rq->curr;

	p->se.exec_start = rq_clock_task(rq);

	/* The running task is never eligible for pushing */
	dequeue_pushable_task(rq, p);
}

static unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task)
{
	/*
	 * Time slice is 0 for SCHED_FIFO tasks
	 */
	if (task->policy == SCHED_RR)
		return sched_rr_timeslice;
	else
		return 0;
}
const struct sched_class rt_sched_class = {
	.next			= &fair_sched_class,
	.enqueue_task		= enqueue_task_rt,
	.dequeue_task		= dequeue_task_rt,
	.yield_task		= yield_task_rt,

	.check_preempt_curr	= check_preempt_curr_rt,

	.pick_next_task		= pick_next_task_rt,
	.put_prev_task		= put_prev_task_rt,

#ifdef CONFIG_SMP
	.select_task_rq		= select_task_rq_rt,
	.set_cpus_allowed	= set_cpus_allowed_common,
	.rq_online		= rq_online_rt,
	.rq_offline		= rq_offline_rt,
	.task_woken		= task_woken_rt,
	.switched_from		= switched_from_rt,
#endif

	.set_curr_task		= set_curr_task_rt,
	.task_tick		= task_tick_rt,

	.get_rr_interval	= get_rr_interval_rt,

	.prio_changed		= prio_changed_rt,
	.switched_to		= switched_to_rt,

	.update_curr		= update_curr_rt,
};
#ifdef CONFIG_RT_GROUP_SCHED
/*
 * Ensure that the real time constraints are schedulable.
 */
static DEFINE_MUTEX(rt_constraints_mutex);

/* Must be called with tasklist_lock held */
static inline int tg_has_rt_tasks(struct task_group *tg)
{
	struct task_struct *g, *p;

	/*
	 * Autogroups do not have RT tasks; see autogroup_create().
	 */
	if (task_group_is_autogroup(tg))
		return 0;

	for_each_process_thread(g, p) {
		if (rt_task(p) && task_group(p) == tg)
			return 1;
	}

	return 0;
}
struct rt_schedulable_data {
	struct task_group *tg;
	u64 rt_period;
	u64 rt_runtime;
};

static int tg_rt_schedulable(struct task_group *tg, void *data)
{
	struct rt_schedulable_data *d = data;
	struct task_group *child;
	unsigned long total, sum = 0;
	u64 period, runtime;

	period = ktime_to_ns(tg->rt_bandwidth.rt_period);
	runtime = tg->rt_bandwidth.rt_runtime;

	if (tg == d->tg) {
		period = d->rt_period;
		runtime = d->rt_runtime;
	}

	/*
	 * Cannot have more runtime than the period.
	 */
	if (runtime > period && runtime != RUNTIME_INF)
		return -EINVAL;

	/*
	 * Ensure we don't starve existing RT tasks.
	 */
	if (rt_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg))
		return -EBUSY;

	total = to_ratio(period, runtime);

	/*
	 * Nobody can have more than the global setting allows.
	 */
	if (total > to_ratio(global_rt_period(), global_rt_runtime()))
		return -EINVAL;

	/*
	 * The sum of our children's runtime should not exceed our own.
	 */
	list_for_each_entry_rcu(child, &tg->children, siblings) {
		period = ktime_to_ns(child->rt_bandwidth.rt_period);
		runtime = child->rt_bandwidth.rt_runtime;

		if (child == d->tg) {
			period = d->rt_period;
			runtime = d->rt_runtime;
		}

		sum += to_ratio(period, runtime);
	}

	if (sum > total)
		return -EINVAL;

	return 0;
}
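
/*
 * Worked example (illustrative, using the usual defaults): a group with a
 * period of 1 s and a runtime of 0.95 s gets a 95% share from to_ratio().
 * One child asking for 0.5 s / 1 s (50%) fits under that parent, but two
 * such children would sum to 100% > 95% and tg_rt_schedulable() would
 * return -EINVAL.
 */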
static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
{
	int ret;

	struct rt_schedulable_data data = {
		.tg = tg,
		.rt_period = period,
		.rt_runtime = runtime,
	};

	rcu_read_lock();
	ret = walk_tg_tree(tg_rt_schedulable, tg_nop, &data);
	rcu_read_unlock();

	return ret;
}
static int tg_set_rt_bandwidth(struct task_group *tg,
		u64 rt_period, u64 rt_runtime)
{
	int i, err = 0;

	/*
	 * Disallowing the root group RT runtime is BAD, it would disallow the
	 * kernel creating (and/or operating) RT threads.
	 */
	if (tg == &root_task_group && rt_runtime == 0)
		return -EINVAL;

	/* No period doesn't make any sense. */
	if (rt_period == 0)
		return -EINVAL;

	mutex_lock(&rt_constraints_mutex);
	read_lock(&tasklist_lock);
	err = __rt_schedulable(tg, rt_period, rt_runtime);
	if (err)
		goto unlock;

	raw_spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock);
	tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period);
	tg->rt_bandwidth.rt_runtime = rt_runtime;

	for_each_possible_cpu(i) {
		struct rt_rq *rt_rq = tg->rt_rq[i];

		raw_spin_lock(&rt_rq->rt_runtime_lock);
		rt_rq->rt_runtime = rt_runtime;
		raw_spin_unlock(&rt_rq->rt_runtime_lock);
	}
	raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock);
unlock:
	read_unlock(&tasklist_lock);
	mutex_unlock(&rt_constraints_mutex);

	return err;
}
int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
{
	u64 rt_runtime, rt_period;

	rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period);
	rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC;
	if (rt_runtime_us < 0)
		rt_runtime = RUNTIME_INF;

	return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
}

long sched_group_rt_runtime(struct task_group *tg)
{
	u64 rt_runtime_us;

	if (tg->rt_bandwidth.rt_runtime == RUNTIME_INF)
		return -1;

	rt_runtime_us = tg->rt_bandwidth.rt_runtime;
	do_div(rt_runtime_us, NSEC_PER_USEC);
	return rt_runtime_us;
}

int sched_group_set_rt_period(struct task_group *tg, u64 rt_period_us)
{
	u64 rt_runtime, rt_period;

	rt_period = rt_period_us * NSEC_PER_USEC;
	rt_runtime = tg->rt_bandwidth.rt_runtime;

	return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
}

long sched_group_rt_period(struct task_group *tg)
{
	u64 rt_period_us;

	rt_period_us = ktime_to_ns(tg->rt_bandwidth.rt_period);
	do_div(rt_period_us, NSEC_PER_USEC);
	return rt_period_us;
}
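
/*
 * The four helpers above back the cpu cgroup's cpu.rt_runtime_us and
 * cpu.rt_period_us files (values in microseconds; a runtime of -1 means
 * unlimited, i.e. RUNTIME_INF).
 */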
static int sched_rt_global_constraints(void)
{
	int ret = 0;

	mutex_lock(&rt_constraints_mutex);
	read_lock(&tasklist_lock);
	ret = __rt_schedulable(NULL, 0, 0);
	read_unlock(&tasklist_lock);
	mutex_unlock(&rt_constraints_mutex);

	return ret;
}

int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk)
{
	/* Don't accept realtime tasks when there is no way for them to run */
	if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0)
		return 0;

	return 1;
}
#else /* !CONFIG_RT_GROUP_SCHED */
static int sched_rt_global_constraints(void)
{
	unsigned long flags;
	int i;

	raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
	for_each_possible_cpu(i) {
		struct rt_rq *rt_rq = &cpu_rq(i)->rt;

		raw_spin_lock(&rt_rq->rt_runtime_lock);
		rt_rq->rt_runtime = global_rt_runtime();
		raw_spin_unlock(&rt_rq->rt_runtime_lock);
	}
	raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags);

	return 0;
}
#endif /* CONFIG_RT_GROUP_SCHED */
static int sched_rt_global_validate(void)
{
	if (sysctl_sched_rt_period <= 0)
		return -EINVAL;

	if ((sysctl_sched_rt_runtime != RUNTIME_INF) &&
		(sysctl_sched_rt_runtime > sysctl_sched_rt_period))
		return -EINVAL;

	return 0;
}

static void sched_rt_do_global(void)
{
	def_rt_bandwidth.rt_runtime = global_rt_runtime();
	def_rt_bandwidth.rt_period = ns_to_ktime(global_rt_period());
}
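
/*
 * These sysctls are exposed as /proc/sys/kernel/sched_rt_period_us and
 * sched_rt_runtime_us. With the usual defaults of 1000000 and 950000, RT
 * tasks may consume at most 95% of each period; writing -1 to the runtime
 * removes the limit (RUNTIME_INF).
 */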
int sched_rt_handler(struct ctl_table *table, int write,
		void __user *buffer, size_t *lenp,
		loff_t *ppos)
{
	int old_period, old_runtime;
	static DEFINE_MUTEX(mutex);
	int ret;

	mutex_lock(&mutex);
	old_period = sysctl_sched_rt_period;
	old_runtime = sysctl_sched_rt_runtime;

	ret = proc_dointvec(table, write, buffer, lenp, ppos);

	if (!ret && write) {
		ret = sched_rt_global_validate();
		if (ret)
			goto undo;

		ret = sched_dl_global_validate();
		if (ret)
			goto undo;

		ret = sched_rt_global_constraints();
		if (ret)
			goto undo;

		sched_rt_do_global();
		sched_dl_do_global();
	}
	if (0) {
undo:
		sysctl_sched_rt_period = old_period;
		sysctl_sched_rt_runtime = old_runtime;
	}
	mutex_unlock(&mutex);

	return ret;
}
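
/*
 * sysctl_sched_rr_timeslice is configured in milliseconds (via
 * /proc/sys/kernel/sched_rr_timeslice_ms) while sched_rr_timeslice is kept
 * in jiffies. Illustrative example: with HZ=250, writing 100 stores
 * msecs_to_jiffies(100) == 25 jiffies; writing 0 restores the RR_TIMESLICE
 * default.
 */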
int sched_rr_handler(struct ctl_table *table, int write,
		void __user *buffer, size_t *lenp,
		loff_t *ppos)
{
	int ret;
	static DEFINE_MUTEX(mutex);

	mutex_lock(&mutex);
	ret = proc_dointvec(table, write, buffer, lenp, ppos);
	/*
	 * Make sure that internally we keep jiffies.
	 * Also, writing zero resets the timeslice to default:
	 */
	if (!ret && write) {
		sched_rr_timeslice =
			sysctl_sched_rr_timeslice <= 0 ? RR_TIMESLICE :
			msecs_to_jiffies(sysctl_sched_rr_timeslice);
	}
	mutex_unlock(&mutex);

	return ret;
}
#ifdef CONFIG_SCHED_DEBUG
extern void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq);

void print_rt_stats(struct seq_file *m, int cpu)
{
	rt_rq_iter_t iter;
	struct rt_rq *rt_rq;

	rcu_read_lock();
	for_each_rt_rq(rt_rq, iter, cpu_rq(cpu))
		print_rt_rq(m, cpu, rt_rq);
	rcu_read_unlock();
}
#endif /* CONFIG_SCHED_DEBUG */