intel_lrc.c

/*
 * Copyright © 2014 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 * Authors:
 *    Ben Widawsky <ben@bwidawsk.net>
 *    Michel Thierry <michel.thierry@intel.com>
 *    Thomas Daniel <thomas.daniel@intel.com>
 *    Oscar Mateo <oscar.mateo@intel.com>
 *
 */

/**
 * DOC: Logical Rings, Logical Ring Contexts and Execlists
 *
 * Motivation:
 * GEN8 brings an expansion of the HW contexts: "Logical Ring Contexts".
 * These expanded contexts enable a number of new abilities, especially
 * "Execlists" (also implemented in this file).
 *
 * One of the main differences with the legacy HW contexts is that logical
 * ring contexts incorporate many more things to the context's state, like
 * PDPs or ringbuffer control registers:
 *
 * The reason why PDPs are included in the context is straightforward: as
 * PPGTTs (per-process GTTs) are actually per-context, having the PDPs
 * contained there means you don't need to do a ppgtt->switch_mm yourself,
 * instead, the GPU will do it for you on the context switch.
 *
 * But, what about the ringbuffer control registers (head, tail, etc..)?
 * shouldn't we just need a set of those per engine command streamer? This is
 * where the name "Logical Rings" starts to make sense: by virtualizing the
 * rings, the engine cs shifts to a new "ring buffer" with every context
 * switch. When you want to submit a workload to the GPU you: A) choose your
 * context, B) find its appropriate virtualized ring, C) write commands to it
 * and then, finally, D) tell the GPU to switch to that context.
 *
 * Instead of the legacy MI_SET_CONTEXT, the way you tell the GPU to switch
 * to a context is via a context execution list, ergo "Execlists".
 *
 * LRC implementation:
 * Regarding the creation of contexts, we have:
 *
 * - One global default context.
 * - One local default context for each opened fd.
 * - One local extra context for each context create ioctl call.
 *
 * Now that ringbuffers belong per-context (and not per-engine, like before)
 * and that contexts are uniquely tied to a given engine (and not reusable,
 * like before) we need:
 *
 * - One ringbuffer per-engine inside each context.
 * - One backing object per-engine inside each context.
 *
 * The global default context starts its life with these new objects fully
 * allocated and populated. The local default context for each opened fd is
 * more complex, because we don't know at creation time which engine is going
 * to use them. To handle this, we have implemented a deferred creation of LR
 * contexts:
 *
 * The local context starts its life as a hollow or blank holder, that only
 * gets populated for a given engine once we receive an execbuffer. If later
 * on we receive another execbuffer ioctl for the same context but a different
 * engine, we allocate/populate a new ringbuffer and context backing object and
 * so on.
 *
 * Finally, regarding local contexts created using the ioctl call: as they are
 * only allowed with the render ring, we can allocate & populate them right
 * away (no need to defer anything, at least for now).
 *
 * Execlists implementation:
 * Execlists are the new method by which, on gen8+ hardware, workloads are
 * submitted for execution (as opposed to the legacy, ringbuffer-based, method).
 * This method works as follows:
 *
 * When a request is committed, its commands (the BB start and any leading or
 * trailing commands, like the seqno breadcrumbs) are placed in the ringbuffer
 * for the appropriate context. The tail pointer in the hardware context is not
 * updated at this time, but instead, kept by the driver in the ringbuffer
 * structure. A structure representing this request is added to a request queue
 * for the appropriate engine: this structure contains a copy of the context's
 * tail after the request was written to the ring buffer and a pointer to the
 * context itself.
 *
 * If the engine's request queue was empty before the request was added, the
 * queue is processed immediately. Otherwise the queue will be processed during
 * a context switch interrupt. In any case, elements on the queue will get sent
 * (in pairs) to the GPU's ExecLists Submit Port (ELSP, for short) with a
 * globally unique 20-bit submission ID.
 *
 * When execution of a request completes, the GPU updates the context status
 * buffer with a context complete event and generates a context switch interrupt.
 * During the interrupt handling, the driver examines the events in the buffer:
 * for each context complete event, if the announced ID matches that on the head
 * of the request queue, then that request is retired and removed from the queue.
 *
 * After processing, if any requests were retired and the queue is not empty
 * then a new execution list can be submitted. The two requests at the front of
 * the queue are next to be submitted but since a context may not occur twice in
 * an execution list, if subsequent requests have the same ID as the first then
 * the two requests must be combined. This is done simply by discarding requests
 * at the head of the queue until either only one request is left (in which case
 * we use a NULL second context) or the first two requests have unique IDs.
 *
 * By always executing the first two requests in the queue the driver ensures
 * that the GPU is kept as busy as possible. In the case where a single context
 * completes but a second context is still executing, the request for this second
 * context will be at the head of the queue when we remove the first one. This
 * request will then be resubmitted along with a new request for a different context,
 * which will cause the hardware to continue executing the second request and queue
 * the new request (the GPU detects the condition of a context getting preempted
 * with the same context and optimizes the context switch flow by not doing
 * preemption, but just sampling the new tail pointer).
 *
 */
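
/*
 * Illustrative example of the pairing rule above (a sketch; the context and
 * request names are hypothetical): given a queue of [ctxA:req1, ctxA:req2,
 * ctxB:req3], the two ctxA requests are combined into a single element, so
 * the execution list submitted to the ELSP is {ctxA (tail of req2), ctxB};
 * a queue holding only [ctxA:req1, ctxA:req2] would be submitted as
 * {ctxA (tail of req2), NULL}.
 */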

#include <linux/interrupt.h>

#include <drm/drmP.h>
#include <drm/i915_drm.h>
#include "i915_drv.h"
#include "i915_gem_render_state.h"
#include "i915_vgpu.h"
#include "intel_lrc_reg.h"
#include "intel_mocs.h"
#include "intel_workarounds.h"

#define RING_EXECLIST_QFULL		(1 << 0x2)
#define RING_EXECLIST1_VALID		(1 << 0x3)
#define RING_EXECLIST0_VALID		(1 << 0x4)
#define RING_EXECLIST_ACTIVE_STATUS	(3 << 0xE)
#define RING_EXECLIST1_ACTIVE		(1 << 0x11)
#define RING_EXECLIST0_ACTIVE		(1 << 0x12)

#define GEN8_CTX_STATUS_IDLE_ACTIVE	(1 << 0)
#define GEN8_CTX_STATUS_PREEMPTED	(1 << 1)
#define GEN8_CTX_STATUS_ELEMENT_SWITCH	(1 << 2)
#define GEN8_CTX_STATUS_ACTIVE_IDLE	(1 << 3)
#define GEN8_CTX_STATUS_COMPLETE	(1 << 4)
#define GEN8_CTX_STATUS_LITE_RESTORE	(1 << 15)

#define GEN8_CTX_STATUS_COMPLETED_MASK \
	 (GEN8_CTX_STATUS_COMPLETE | GEN8_CTX_STATUS_PREEMPTED)

/* Typical size of the average request (2 pipecontrols and a MI_BB) */
#define EXECLISTS_REQUEST_SIZE 64 /* bytes */

#define WA_TAIL_DWORDS 2
#define WA_TAIL_BYTES (sizeof(u32) * WA_TAIL_DWORDS)

static int execlists_context_deferred_alloc(struct i915_gem_context *ctx,
					    struct intel_engine_cs *engine,
					    struct intel_context *ce);
static void execlists_init_reg_state(u32 *reg_state,
				     struct i915_gem_context *ctx,
				     struct intel_engine_cs *engine,
				     struct intel_ring *ring);

static inline struct i915_priolist *to_priolist(struct rb_node *rb)
{
	return rb_entry(rb, struct i915_priolist, node);
}

static inline int rq_prio(const struct i915_request *rq)
{
	return rq->sched.attr.priority;
}

static inline bool need_preempt(const struct intel_engine_cs *engine,
				const struct i915_request *last,
				int prio)
{
	return (intel_engine_has_preemption(engine) &&
		__execlists_need_preempt(prio, rq_prio(last)) &&
		!i915_request_completed(last));
}

/*
 * The context descriptor encodes various attributes of a context,
 * including its GTT address and some flags. Because it's fairly
 * expensive to calculate, we'll just do it once and cache the result,
 * which remains valid until the context is unpinned.
 *
 * This is what a descriptor looks like, from LSB to MSB::
 *
 *      bits  0-11:    flags, GEN8_CTX_* (cached in ctx->desc_template)
 *      bits 12-31:    LRCA, GTT address of (the HWSP of) this context
 *      bits 32-52:    ctx ID, a globally unique tag (highest bit used by GuC)
 *      bits 53-54:    mbz, reserved for use by hardware
 *      bits 55-63:    group ID, currently unused and set to 0
 *
 * Starting from Gen11, the upper dword of the descriptor has a new format:
 *
 *      bits 32-36:    reserved
 *      bits 37-47:    SW context ID
 *      bits 48-53:    engine instance
 *      bit  54:       mbz, reserved for use by hardware
 *      bits 55-60:    SW counter
 *      bits 61-63:    engine class
 *
 * engine info, SW context ID and SW counter need to form a unique number
 * (Context ID) per lrc.
 */
static void
intel_lr_context_descriptor_update(struct i915_gem_context *ctx,
				   struct intel_engine_cs *engine,
				   struct intel_context *ce)
{
	u64 desc;

	BUILD_BUG_ON(MAX_CONTEXT_HW_ID > (BIT(GEN8_CTX_ID_WIDTH)));
	BUILD_BUG_ON(GEN11_MAX_CONTEXT_HW_ID > (BIT(GEN11_SW_CTX_ID_WIDTH)));

	desc = ctx->desc_template;			/* bits  0-11 */
	GEM_BUG_ON(desc & GENMASK_ULL(63, 12));

	desc |= i915_ggtt_offset(ce->state) + LRC_HEADER_PAGES * PAGE_SIZE;
							/* bits 12-31 */
	GEM_BUG_ON(desc & GENMASK_ULL(63, 32));

	/*
	 * The following 32bits are copied into the OA reports (dword 2).
	 * Consider updating oa_get_render_ctx_id in i915_perf.c when changing
	 * anything below.
	 */
	if (INTEL_GEN(ctx->i915) >= 11) {
		GEM_BUG_ON(ctx->hw_id >= BIT(GEN11_SW_CTX_ID_WIDTH));
		desc |= (u64)ctx->hw_id << GEN11_SW_CTX_ID_SHIFT;
							/* bits 37-47 */

		desc |= (u64)engine->instance << GEN11_ENGINE_INSTANCE_SHIFT;
							/* bits 48-53 */

		/* TODO: decide what to do with SW counter (bits 55-60) */

		desc |= (u64)engine->class << GEN11_ENGINE_CLASS_SHIFT;
							/* bits 61-63 */
	} else {
		GEM_BUG_ON(ctx->hw_id >= BIT(GEN8_CTX_ID_WIDTH));
		desc |= (u64)ctx->hw_id << GEN8_CTX_ID_SHIFT;	/* bits 32-52 */
	}

	ce->lrc_desc = desc;
}
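
/*
 * Worked example for the gen8 path above (a sketch; the numbers are
 * hypothetical, not taken from real hardware state): if the flags cached in
 * ctx->desc_template are 0xa19, the context image starts at GGTT address
 * 0x10000 (i.e. i915_ggtt_offset(ce->state) + LRC_HEADER_PAGES * PAGE_SIZE)
 * and ctx->hw_id is 5, then ce->lrc_desc = 0xa19 | 0x10000 | ((u64)5 << 32):
 * flags in bits 0-11, the LRCA in bits 12-31 and the context ID in
 * bits 32-52, matching the layout documented above.
 */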

static struct i915_priolist *
lookup_priolist(struct intel_engine_cs *engine, int prio)
{
	struct intel_engine_execlists * const execlists = &engine->execlists;
	struct i915_priolist *p;
	struct rb_node **parent, *rb;
	bool first = true;

	if (unlikely(execlists->no_priolist))
		prio = I915_PRIORITY_NORMAL;

find_priolist:
	/* most positive priority is scheduled first, equal priorities fifo */
	rb = NULL;
	parent = &execlists->queue.rb_root.rb_node;
	while (*parent) {
		rb = *parent;
		p = to_priolist(rb);
		if (prio > p->priority) {
			parent = &rb->rb_left;
		} else if (prio < p->priority) {
			parent = &rb->rb_right;
			first = false;
		} else {
			return p;
		}
	}

	if (prio == I915_PRIORITY_NORMAL) {
		p = &execlists->default_priolist;
	} else {
		p = kmem_cache_alloc(engine->i915->priorities, GFP_ATOMIC);
		/* Convert an allocation failure to a priority bump */
		if (unlikely(!p)) {
			prio = I915_PRIORITY_NORMAL; /* recurses just once */

			/* To maintain ordering with all rendering, after an
			 * allocation failure we have to disable all scheduling.
			 * Requests will then be executed in fifo, and schedule
			 * will ensure that dependencies are emitted in fifo.
			 * There will be still some reordering with existing
			 * requests, so if userspace lied about their
			 * dependencies that reordering may be visible.
			 */
			execlists->no_priolist = true;
			goto find_priolist;
		}
	}

	p->priority = prio;
	INIT_LIST_HEAD(&p->requests);
	rb_link_node(&p->node, rb, parent);
	rb_insert_color_cached(&p->node, &execlists->queue, first);

	return p;
}

static void unwind_wa_tail(struct i915_request *rq)
{
	rq->tail = intel_ring_wrap(rq->ring, rq->wa_tail - WA_TAIL_BYTES);
	assert_ring_tail_valid(rq->ring, rq->tail);
}

static void __unwind_incomplete_requests(struct intel_engine_cs *engine)
{
	struct i915_request *rq, *rn;
	struct i915_priolist *uninitialized_var(p);
	int last_prio = I915_PRIORITY_INVALID;

	lockdep_assert_held(&engine->timeline.lock);

	list_for_each_entry_safe_reverse(rq, rn,
					 &engine->timeline.requests,
					 link) {
		if (i915_request_completed(rq))
			return;

		__i915_request_unsubmit(rq);
		unwind_wa_tail(rq);

		GEM_BUG_ON(rq_prio(rq) == I915_PRIORITY_INVALID);
		if (rq_prio(rq) != last_prio) {
			last_prio = rq_prio(rq);
			p = lookup_priolist(engine, last_prio);
		}

		GEM_BUG_ON(p->priority != rq_prio(rq));
		list_add(&rq->sched.link, &p->requests);
	}
}

void
execlists_unwind_incomplete_requests(struct intel_engine_execlists *execlists)
{
	struct intel_engine_cs *engine =
		container_of(execlists, typeof(*engine), execlists);
	unsigned long flags;

	spin_lock_irqsave(&engine->timeline.lock, flags);

	__unwind_incomplete_requests(engine);

	spin_unlock_irqrestore(&engine->timeline.lock, flags);
}

static inline void
execlists_context_status_change(struct i915_request *rq, unsigned long status)
{
	/*
	 * Only used when GVT-g is enabled now. When GVT-g is disabled,
	 * the compiler should eliminate this function as dead-code.
	 */
	if (!IS_ENABLED(CONFIG_DRM_I915_GVT))
		return;

	atomic_notifier_call_chain(&rq->engine->context_status_notifier,
				   status, rq);
}

inline void
execlists_user_begin(struct intel_engine_execlists *execlists,
		     const struct execlist_port *port)
{
	execlists_set_active_once(execlists, EXECLISTS_ACTIVE_USER);
}

inline void
execlists_user_end(struct intel_engine_execlists *execlists)
{
	execlists_clear_active(execlists, EXECLISTS_ACTIVE_USER);
}

static inline void
execlists_context_schedule_in(struct i915_request *rq)
{
	execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_IN);
	intel_engine_context_in(rq->engine);
}

static inline void
execlists_context_schedule_out(struct i915_request *rq, unsigned long status)
{
	intel_engine_context_out(rq->engine);
	execlists_context_status_change(rq, status);
	trace_i915_request_out(rq);
}

static void
execlists_update_context_pdps(struct i915_hw_ppgtt *ppgtt, u32 *reg_state)
{
	ASSIGN_CTX_PDP(ppgtt, reg_state, 3);
	ASSIGN_CTX_PDP(ppgtt, reg_state, 2);
	ASSIGN_CTX_PDP(ppgtt, reg_state, 1);
	ASSIGN_CTX_PDP(ppgtt, reg_state, 0);
}

static u64 execlists_update_context(struct i915_request *rq)
{
	struct intel_context *ce = rq->hw_context;
	struct i915_hw_ppgtt *ppgtt =
		rq->gem_context->ppgtt ?: rq->i915->mm.aliasing_ppgtt;
	u32 *reg_state = ce->lrc_reg_state;

	reg_state[CTX_RING_TAIL+1] = intel_ring_set_tail(rq->ring, rq->tail);

	/*
	 * True 32b PPGTT with dynamic page allocation: update PDP
	 * registers and point the unallocated PDPs to scratch page.
	 * PML4 is allocated during ppgtt init, so this is not needed
	 * in 48-bit mode.
	 */
	if (ppgtt && !i915_vm_is_48bit(&ppgtt->vm))
		execlists_update_context_pdps(ppgtt, reg_state);

	/*
	 * Make sure the context image is complete before we submit it to HW.
	 *
	 * Ostensibly, writes (including the WCB) should be flushed prior to
	 * an uncached write such as our mmio register access, but the
	 * empirical evidence (esp. on Braswell) suggests that the WC write
	 * into memory may not be visible to the HW prior to the completion of
	 * the UC register write and that we may begin execution from the
	 * context before its image is complete leading to invalid PD chasing.
	 *
	 * Furthermore, Braswell, at least, wants a full mb to be sure that
	 * the writes are coherent in memory (visible to the GPU) prior to
	 * execution, and not just visible to other CPUs (as is the result of
	 * wmb).
	 */
	mb();

	return ce->lrc_desc;
}

static inline void write_desc(struct intel_engine_execlists *execlists, u64 desc, u32 port)
{
	if (execlists->ctrl_reg) {
		writel(lower_32_bits(desc), execlists->submit_reg + port * 2);
		writel(upper_32_bits(desc), execlists->submit_reg + port * 2 + 1);
	} else {
		writel(upper_32_bits(desc), execlists->submit_reg);
		writel(lower_32_bits(desc), execlists->submit_reg);
	}
}

static void execlists_submit_ports(struct intel_engine_cs *engine)
{
	struct intel_engine_execlists *execlists = &engine->execlists;
	struct execlist_port *port = execlists->port;
	unsigned int n;

	/*
	 * We can skip acquiring intel_runtime_pm_get() here as it was taken
	 * on our behalf by the request (see i915_gem_mark_busy()) and it will
	 * not be relinquished until the device is idle (see
	 * i915_gem_idle_work_handler()). As a precaution, we make sure
	 * that all ELSP are drained, i.e. we have processed the CSB,
	 * before allowing ourselves to idle and calling intel_runtime_pm_put().
	 */
	GEM_BUG_ON(!engine->i915->gt.awake);

	/*
	 * ELSQ note: the submit queue is not cleared after being submitted
	 * to the HW so we need to make sure we always clean it up. This is
	 * currently ensured by the fact that we always write the same number
	 * of elsq entries, keep this in mind before changing the loop below.
	 */
	for (n = execlists_num_ports(execlists); n--; ) {
		struct i915_request *rq;
		unsigned int count;
		u64 desc;

		rq = port_unpack(&port[n], &count);
		if (rq) {
			GEM_BUG_ON(count > !n);
			if (!count++)
				execlists_context_schedule_in(rq);
			port_set(&port[n], port_pack(rq, count));
			desc = execlists_update_context(rq);
			GEM_DEBUG_EXEC(port[n].context_id = upper_32_bits(desc));

			GEM_TRACE("%s in[%d]: ctx=%d.%d, global=%d (fence %llx:%d) (current %d), prio=%d\n",
				  engine->name, n,
				  port[n].context_id, count,
				  rq->global_seqno,
				  rq->fence.context, rq->fence.seqno,
				  intel_engine_get_seqno(engine),
				  rq_prio(rq));
		} else {
			GEM_BUG_ON(!n);
			desc = 0;
		}

		write_desc(execlists, desc, n);
	}

	/* we need to manually load the submit queue */
	if (execlists->ctrl_reg)
		writel(EL_CTRL_LOAD, execlists->ctrl_reg);

	execlists_clear_active(execlists, EXECLISTS_ACTIVE_HWACK);
}

static bool ctx_single_port_submission(const struct intel_context *ce)
{
	return (IS_ENABLED(CONFIG_DRM_I915_GVT) &&
		i915_gem_context_force_single_submission(ce->gem_context));
}

static bool can_merge_ctx(const struct intel_context *prev,
			  const struct intel_context *next)
{
	if (prev != next)
		return false;

	if (ctx_single_port_submission(prev))
		return false;

	return true;
}

static void port_assign(struct execlist_port *port, struct i915_request *rq)
{
	GEM_BUG_ON(rq == port_request(port));

	if (port_isset(port))
		i915_request_put(port_request(port));

	port_set(port, port_pack(i915_request_get(rq), port_count(port)));
}

static void inject_preempt_context(struct intel_engine_cs *engine)
{
	struct intel_engine_execlists *execlists = &engine->execlists;
	struct intel_context *ce =
		to_intel_context(engine->i915->preempt_context, engine);
	unsigned int n;

	GEM_BUG_ON(execlists->preempt_complete_status !=
		   upper_32_bits(ce->lrc_desc));

	/*
	 * Switch to our empty preempt context so
	 * the state of the GPU is known (idle).
	 */
	GEM_TRACE("%s\n", engine->name);
	for (n = execlists_num_ports(execlists); --n; )
		write_desc(execlists, 0, n);

	write_desc(execlists, ce->lrc_desc, n);

	/* we need to manually load the submit queue */
	if (execlists->ctrl_reg)
		writel(EL_CTRL_LOAD, execlists->ctrl_reg);

	execlists_clear_active(execlists, EXECLISTS_ACTIVE_HWACK);
	execlists_set_active(execlists, EXECLISTS_ACTIVE_PREEMPT);
}

static void complete_preempt_context(struct intel_engine_execlists *execlists)
{
	GEM_BUG_ON(!execlists_is_active(execlists, EXECLISTS_ACTIVE_PREEMPT));

	if (inject_preempt_hang(execlists))
		return;

	execlists_cancel_port_requests(execlists);
	__unwind_incomplete_requests(container_of(execlists,
						  struct intel_engine_cs,
						  execlists));
}

static void execlists_dequeue(struct intel_engine_cs *engine)
{
	struct intel_engine_execlists * const execlists = &engine->execlists;
	struct execlist_port *port = execlists->port;
	const struct execlist_port * const last_port =
		&execlists->port[execlists->port_mask];
	struct i915_request *last = port_request(port);
	struct rb_node *rb;
	bool submit = false;

	/*
	 * Hardware submission is through 2 ports. Conceptually each port
	 * has a (RING_START, RING_HEAD, RING_TAIL) tuple. RING_START is
	 * static for a context, and unique to each, so we only execute
	 * requests belonging to a single context from each ring. RING_HEAD
	 * is maintained by the CS in the context image, it marks the place
	 * where it got up to last time, and through RING_TAIL we tell the CS
	 * where we want to execute up to this time.
	 *
	 * In this list the requests are in order of execution. Consecutive
	 * requests from the same context are adjacent in the ringbuffer. We
	 * can combine these requests into a single RING_TAIL update:
	 *
	 *	RING_HEAD...req1...req2
	 *	                      ^- RING_TAIL
	 * since to execute req2 the CS must first execute req1.
	 *
	 * Our goal then is to point each port to the end of a consecutive
	 * sequence of requests as being the most optimal (fewest wake ups
	 * and context switches) submission.
	 */

	if (last) {
		/*
		 * Don't resubmit or switch until all outstanding
		 * preemptions (lite-restore) are seen. Then we
		 * know the next preemption status we see corresponds
		 * to this ELSP update.
		 */
		GEM_BUG_ON(!execlists_is_active(execlists,
						EXECLISTS_ACTIVE_USER));
		GEM_BUG_ON(!port_count(&port[0]));

		/*
		 * If we write to ELSP a second time before the HW has had
		 * a chance to respond to the previous write, we can confuse
		 * the HW and hit "undefined behaviour". After writing to ELSP,
		 * we must then wait until we see a context-switch event from
		 * the HW to indicate that it has had a chance to respond.
		 */
		if (!execlists_is_active(execlists, EXECLISTS_ACTIVE_HWACK))
			return;

		if (need_preempt(engine, last, execlists->queue_priority)) {
			inject_preempt_context(engine);
			return;
		}

		/*
		 * In theory, we could coalesce more requests onto
		 * the second port (the first port is active, with
		 * no preemptions pending). However, that means we
		 * then have to deal with the possible lite-restore
		 * of the second port (as we submit the ELSP, there
		 * may be a context-switch) but also we may complete
		 * the resubmission before the context-switch. Ergo,
		 * coalescing onto the second port will cause a
		 * preemption event, but we cannot predict whether
		 * that will affect port[0] or port[1].
		 *
		 * If the second port is already active, we can wait
		 * until the next context-switch before contemplating
		 * new requests. The GPU will be busy and we should be
		 * able to resubmit the new ELSP before it idles,
		 * avoiding pipeline bubbles (momentary pauses where
		 * the driver is unable to keep up the supply of new
		 * work). However, we have to double check that the
		 * priorities of the ports haven't been switched.
		 */
		if (port_count(&port[1]))
			return;

		/*
		 * WaIdleLiteRestore:bdw,skl
		 * Apply the wa NOOPs to prevent
		 * ring:HEAD == rq:TAIL as we resubmit the
		 * request. See gen8_emit_breadcrumb() for
		 * where we prepare the padding after the
		 * end of the request.
		 */
		last->tail = last->wa_tail;
	}

	while ((rb = rb_first_cached(&execlists->queue))) {
		struct i915_priolist *p = to_priolist(rb);
		struct i915_request *rq, *rn;

		list_for_each_entry_safe(rq, rn, &p->requests, sched.link) {
			/*
			 * Can we combine this request with the current port?
			 * It has to be the same context/ringbuffer and not
			 * have any exceptions (e.g. GVT saying never to
			 * combine contexts).
			 *
			 * If we can combine the requests, we can execute both
			 * by updating the RING_TAIL to point to the end of the
			 * second request, and so we never need to tell the
			 * hardware about the first.
			 */
			if (last &&
			    !can_merge_ctx(rq->hw_context, last->hw_context)) {
				/*
				 * If we are on the second port and cannot
				 * combine this request with the last, then we
				 * are done.
				 */
				if (port == last_port) {
					__list_del_many(&p->requests,
							&rq->sched.link);
					goto done;
				}

				/*
				 * If GVT overrides us we only ever submit
				 * port[0], leaving port[1] empty. Note that we
				 * also have to be careful that we don't queue
				 * the same context (even though a different
				 * request) to the second port.
				 */
				if (ctx_single_port_submission(last->hw_context) ||
				    ctx_single_port_submission(rq->hw_context)) {
					__list_del_many(&p->requests,
							&rq->sched.link);
					goto done;
				}

				GEM_BUG_ON(last->hw_context == rq->hw_context);

				if (submit)
					port_assign(port, last);
				port++;

				GEM_BUG_ON(port_isset(port));
			}

			INIT_LIST_HEAD(&rq->sched.link);
			__i915_request_submit(rq);
			trace_i915_request_in(rq, port_index(port, execlists));
			last = rq;
			submit = true;
		}

		rb_erase_cached(&p->node, &execlists->queue);
		INIT_LIST_HEAD(&p->requests);
		if (p->priority != I915_PRIORITY_NORMAL)
			kmem_cache_free(engine->i915->priorities, p);
	}

done:
	/*
	 * Here be a bit of magic! Or sleight-of-hand, whichever you prefer.
	 *
	 * We choose queue_priority such that if we add a request of greater
	 * priority than this, we kick the submission tasklet to decide on
	 * the right order of submitting the requests to hardware. We must
	 * also be prepared to reorder requests as they are in-flight on the
	 * HW. We derive the queue_priority then as the first "hole" in
	 * the HW submission ports and if there are no available slots,
	 * the priority of the lowest executing request, i.e. last.
	 *
	 * When we do receive a higher priority request ready to run from the
	 * user, see queue_request(), the queue_priority is bumped to that
	 * request triggering preemption on the next dequeue (or subsequent
	 * interrupt for secondary ports).
	 */
	execlists->queue_priority =
		port != execlists->port ? rq_prio(last) : INT_MIN;

	if (submit) {
		port_assign(port, last);
		execlists_submit_ports(engine);
	}

	/* We must always keep the beast fed if we have work piled up */
	GEM_BUG_ON(rb_first_cached(&execlists->queue) &&
		   !port_isset(execlists->port));

	/* Re-evaluate the executing context setup after each preemptive kick */
	if (last)
		execlists_user_begin(execlists, execlists->port);

	/* If the engine is now idle, so should be the flag; and vice versa. */
	GEM_BUG_ON(execlists_is_active(&engine->execlists,
				       EXECLISTS_ACTIVE_USER) ==
		   !port_isset(engine->execlists.port));
}
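
/*
 * Illustrative note on the queue_priority chosen above (a sketch, assuming
 * the usual two-port configuration): if only port[0] ends up occupied,
 * queue_priority is left at INT_MIN, so any newly ready request will kick
 * the tasklet and can claim the free port[1]; once both ports are occupied,
 * queue_priority becomes rq_prio(last), the priority of the most recently
 * (i.e. lowest priority) submitted request, so only a request that outranks
 * it will trigger another dequeue and possible preemption.
 */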

void
execlists_cancel_port_requests(struct intel_engine_execlists * const execlists)
{
	struct execlist_port *port = execlists->port;
	unsigned int num_ports = execlists_num_ports(execlists);

	while (num_ports-- && port_isset(port)) {
		struct i915_request *rq = port_request(port);

		GEM_TRACE("%s:port%u global=%d (fence %llx:%d), (current %d)\n",
			  rq->engine->name,
			  (unsigned int)(port - execlists->port),
			  rq->global_seqno,
			  rq->fence.context, rq->fence.seqno,
			  intel_engine_get_seqno(rq->engine));

		GEM_BUG_ON(!execlists->active);
		execlists_context_schedule_out(rq,
					       i915_request_completed(rq) ?
					       INTEL_CONTEXT_SCHEDULE_OUT :
					       INTEL_CONTEXT_SCHEDULE_PREEMPTED);

		i915_request_put(rq);

		memset(port, 0, sizeof(*port));
		port++;
	}

	execlists_clear_all_active(execlists);
}

static void reset_csb_pointers(struct intel_engine_execlists *execlists)
{
	/*
	 * After a reset, the HW starts writing into CSB entry [0]. We
	 * therefore have to set our HEAD pointer back one entry so that
	 * the *first* entry we check is entry 0. To complicate this further,
	 * as we don't wait for the first interrupt after reset, we have to
	 * fake the HW write to point back to the last entry so that our
	 * inline comparison of our cached head position against the last HW
	 * write works even before the first interrupt.
	 */
	execlists->csb_head = execlists->csb_write_reset;
	WRITE_ONCE(*execlists->csb_write, execlists->csb_write_reset);
}

static void nop_submission_tasklet(unsigned long data)
{
	/* The driver is wedged; don't process any more events. */
}

static void execlists_cancel_requests(struct intel_engine_cs *engine)
{
	struct intel_engine_execlists * const execlists = &engine->execlists;
	struct i915_request *rq, *rn;
	struct rb_node *rb;
	unsigned long flags;

	GEM_TRACE("%s current %d\n",
		  engine->name, intel_engine_get_seqno(engine));

	/*
	 * Before we call engine->cancel_requests(), we should have exclusive
	 * access to the submission state. This is arranged for us by the
	 * caller disabling the interrupt generation, the tasklet and other
	 * threads that may then access the same state, giving us a free hand
	 * to reset state. However, we still need to let lockdep be aware that
	 * we know this state may be accessed in hardirq context, so we
	 * disable the irq around this manipulation and we want to keep
	 * the spinlock focused on its duties and not accidentally conflate
	 * coverage to the submission's irq state. (Similarly, although we
	 * shouldn't need to disable irq around the manipulation of the
	 * submission's irq state, we also wish to remind ourselves that
	 * it is irq state.)
	 */
	spin_lock_irqsave(&engine->timeline.lock, flags);

	/* Cancel the requests on the HW and clear the ELSP tracker. */
	execlists_cancel_port_requests(execlists);
	execlists_user_end(execlists);

	/* Mark all executing requests as skipped. */
	list_for_each_entry(rq, &engine->timeline.requests, link) {
		GEM_BUG_ON(!rq->global_seqno);
		if (!i915_request_completed(rq))
			dma_fence_set_error(&rq->fence, -EIO);
	}

	/* Flush the queued requests to the timeline list (for retiring). */
	while ((rb = rb_first_cached(&execlists->queue))) {
		struct i915_priolist *p = to_priolist(rb);

		list_for_each_entry_safe(rq, rn, &p->requests, sched.link) {
			INIT_LIST_HEAD(&rq->sched.link);

			dma_fence_set_error(&rq->fence, -EIO);
			__i915_request_submit(rq);
		}

		rb_erase_cached(&p->node, &execlists->queue);
		INIT_LIST_HEAD(&p->requests);
		if (p->priority != I915_PRIORITY_NORMAL)
			kmem_cache_free(engine->i915->priorities, p);
	}

	/* Remaining _unready_ requests will be nop'ed when submitted */

	execlists->queue_priority = INT_MIN;
	execlists->queue = RB_ROOT_CACHED;
	GEM_BUG_ON(port_isset(execlists->port));

	GEM_BUG_ON(__tasklet_is_enabled(&execlists->tasklet));
	execlists->tasklet.func = nop_submission_tasklet;

	spin_unlock_irqrestore(&engine->timeline.lock, flags);
}

static inline bool
reset_in_progress(const struct intel_engine_execlists *execlists)
{
	return unlikely(!__tasklet_is_enabled(&execlists->tasklet));
}

static void process_csb(struct intel_engine_cs *engine)
{
	struct intel_engine_execlists * const execlists = &engine->execlists;
	struct execlist_port *port = execlists->port;
	const u32 * const buf = execlists->csb_status;
	u8 head, tail;

	/*
	 * Note that csb_write, csb_status may be either in HWSP or mmio.
	 * When reading from the csb_write mmio register, we have to be
	 * careful to only use the GEN8_CSB_WRITE_PTR portion, which is
	 * the low 4bits. As it happens we know the next 4bits are always
	 * zero and so we can simply mask off the low u8 of the register
	 * and treat it identically to reading from the HWSP (without having
	 * to use explicit shifting and masking, and probably bifurcating
	 * the code to handle the legacy mmio read).
	 */
	head = execlists->csb_head;
	tail = READ_ONCE(*execlists->csb_write);
	GEM_TRACE("%s cs-irq head=%d, tail=%d\n", engine->name, head, tail);
	if (unlikely(head == tail))
		return;

	/*
	 * Hopefully paired with a wmb() in HW!
	 *
	 * We must complete the read of the write pointer before any reads
	 * from the CSB, so that we do not see stale values. Without an rmb
	 * (lfence) the HW may speculatively perform the CSB[] reads *before*
	 * we perform the READ_ONCE(*csb_write).
	 */
	rmb();

	do {
		struct i915_request *rq;
		unsigned int status;
		unsigned int count;

		if (++head == GEN8_CSB_ENTRIES)
			head = 0;

		/*
		 * We are flying near dragons again.
		 *
		 * We hold a reference to the request in execlist_port[]
		 * but no more than that. We are operating in softirq
		 * context and so cannot hold any mutex or sleep. That
		 * prevents us from stopping the requests we are processing
		 * in port[] from being retired simultaneously (the
		 * breadcrumb will be complete before we see the
		 * context-switch). As we only hold the reference to the
		 * request, any pointer chasing underneath the request
		 * is subject to a potential use-after-free. Thus we
		 * store all of the bookkeeping within port[] as
		 * required, and avoid using unguarded pointers beneath
		 * request itself. The same applies to the atomic
		 * status notifier.
		 */

		GEM_TRACE("%s csb[%d]: status=0x%08x:0x%08x, active=0x%x\n",
			  engine->name, head,
			  buf[2 * head + 0], buf[2 * head + 1],
			  execlists->active);

		status = buf[2 * head];
		if (status & (GEN8_CTX_STATUS_IDLE_ACTIVE |
			      GEN8_CTX_STATUS_PREEMPTED))
			execlists_set_active(execlists,
					     EXECLISTS_ACTIVE_HWACK);
		if (status & GEN8_CTX_STATUS_ACTIVE_IDLE)
			execlists_clear_active(execlists,
					       EXECLISTS_ACTIVE_HWACK);

		if (!(status & GEN8_CTX_STATUS_COMPLETED_MASK))
			continue;

		/* We should never get a COMPLETED | IDLE_ACTIVE! */
		GEM_BUG_ON(status & GEN8_CTX_STATUS_IDLE_ACTIVE);

		if (status & GEN8_CTX_STATUS_COMPLETE &&
		    buf[2*head + 1] == execlists->preempt_complete_status) {
			GEM_TRACE("%s preempt-idle\n", engine->name);
			complete_preempt_context(execlists);
			continue;
		}

		if (status & GEN8_CTX_STATUS_PREEMPTED &&
		    execlists_is_active(execlists,
					EXECLISTS_ACTIVE_PREEMPT))
			continue;

		GEM_BUG_ON(!execlists_is_active(execlists,
						EXECLISTS_ACTIVE_USER));

		rq = port_unpack(port, &count);
		GEM_TRACE("%s out[0]: ctx=%d.%d, global=%d (fence %llx:%d) (current %d), prio=%d\n",
			  engine->name,
			  port->context_id, count,
			  rq ? rq->global_seqno : 0,
			  rq ? rq->fence.context : 0,
			  rq ? rq->fence.seqno : 0,
			  intel_engine_get_seqno(engine),
			  rq ? rq_prio(rq) : 0);

		/* Check the context/desc id for this event matches */
		GEM_DEBUG_BUG_ON(buf[2 * head + 1] != port->context_id);

		GEM_BUG_ON(count == 0);
		if (--count == 0) {
			/*
			 * On the final event corresponding to the
			 * submission of this context, we expect either
			 * an element-switch event or a completion
			 * event (and on completion, the active-idle
			 * marker). No more preemptions, lite-restore
			 * or otherwise.
			 */
			GEM_BUG_ON(status & GEN8_CTX_STATUS_PREEMPTED);
			GEM_BUG_ON(port_isset(&port[1]) &&
				   !(status & GEN8_CTX_STATUS_ELEMENT_SWITCH));
			GEM_BUG_ON(!port_isset(&port[1]) &&
				   !(status & GEN8_CTX_STATUS_ACTIVE_IDLE));

			/*
			 * We rely on the hardware being strongly
			 * ordered, that the breadcrumb write is
			 * coherent (visible from the CPU) before the
			 * user interrupt and CSB is processed.
			 */
			GEM_BUG_ON(!i915_request_completed(rq));

			execlists_context_schedule_out(rq,
						       INTEL_CONTEXT_SCHEDULE_OUT);
			i915_request_put(rq);

			GEM_TRACE("%s completed ctx=%d\n",
				  engine->name, port->context_id);

			port = execlists_port_complete(execlists, port);
			if (port_isset(port))
				execlists_user_begin(execlists, port);
			else
				execlists_user_end(execlists);
		} else {
			port_set(port, port_pack(rq, count));
		}
	} while (head != tail);

	execlists->csb_head = head;
}
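
/*
 * Illustrative note (a sketch of the bookkeeping above): the count packed
 * alongside each request in execlist_port[] records how many times that
 * context has been submitted to the ELSP; execlists_submit_ports() bumps it
 * on each (re)submission, e.g. a lite-restore, and process_csb() decrements
 * it per completion event, only scheduling the request out and advancing
 * the port once it reaches zero, so a context that was resubmitted
 * (count > 1) keeps its port across the first completion event.
 */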

static void __execlists_submission_tasklet(struct intel_engine_cs *const engine)
{
	lockdep_assert_held(&engine->timeline.lock);

	process_csb(engine);
	if (!execlists_is_active(&engine->execlists, EXECLISTS_ACTIVE_PREEMPT))
		execlists_dequeue(engine);
}

/*
 * Check the unread Context Status Buffers and manage the submission of new
 * contexts to the ELSP accordingly.
 */
static void execlists_submission_tasklet(unsigned long data)
{
	struct intel_engine_cs * const engine = (struct intel_engine_cs *)data;
	unsigned long flags;

	GEM_TRACE("%s awake?=%d, active=%x\n",
		  engine->name,
		  engine->i915->gt.awake,
		  engine->execlists.active);

	spin_lock_irqsave(&engine->timeline.lock, flags);
	__execlists_submission_tasklet(engine);
	spin_unlock_irqrestore(&engine->timeline.lock, flags);
}

static void queue_request(struct intel_engine_cs *engine,
			  struct i915_sched_node *node,
			  int prio)
{
	list_add_tail(&node->link,
		      &lookup_priolist(engine, prio)->requests);
}

static void __update_queue(struct intel_engine_cs *engine, int prio)
{
	engine->execlists.queue_priority = prio;
}

static void __submit_queue_imm(struct intel_engine_cs *engine)
{
	struct intel_engine_execlists * const execlists = &engine->execlists;

	if (reset_in_progress(execlists))
		return; /* defer until we restart the engine following reset */

	if (execlists->tasklet.func == execlists_submission_tasklet)
		__execlists_submission_tasklet(engine);
	else
		tasklet_hi_schedule(&execlists->tasklet);
}

static void submit_queue(struct intel_engine_cs *engine, int prio)
{
	if (prio > engine->execlists.queue_priority) {
		__update_queue(engine, prio);
		__submit_queue_imm(engine);
	}
}

static void execlists_submit_request(struct i915_request *request)
{
	struct intel_engine_cs *engine = request->engine;
	unsigned long flags;

	/* Will be called from irq-context when using foreign fences. */
	spin_lock_irqsave(&engine->timeline.lock, flags);

	queue_request(engine, &request->sched, rq_prio(request));

	GEM_BUG_ON(RB_EMPTY_ROOT(&engine->execlists.queue.rb_root));
	GEM_BUG_ON(list_empty(&request->sched.link));

	submit_queue(engine, rq_prio(request));

	spin_unlock_irqrestore(&engine->timeline.lock, flags);
}

static struct i915_request *sched_to_request(struct i915_sched_node *node)
{
	return container_of(node, struct i915_request, sched);
}

static struct intel_engine_cs *
sched_lock_engine(struct i915_sched_node *node, struct intel_engine_cs *locked)
{
	struct intel_engine_cs *engine = sched_to_request(node)->engine;

	GEM_BUG_ON(!locked);

	if (engine != locked) {
		spin_unlock(&locked->timeline.lock);
		spin_lock(&engine->timeline.lock);
	}

	return engine;
}
  1004. static void execlists_schedule(struct i915_request *request,
  1005. const struct i915_sched_attr *attr)
  1006. {
  1007. struct i915_priolist *uninitialized_var(pl);
  1008. struct intel_engine_cs *engine, *last;
  1009. struct i915_dependency *dep, *p;
  1010. struct i915_dependency stack;
  1011. const int prio = attr->priority;
  1012. LIST_HEAD(dfs);
  1013. GEM_BUG_ON(prio == I915_PRIORITY_INVALID);
  1014. if (i915_request_completed(request))
  1015. return;
  1016. if (prio <= READ_ONCE(request->sched.attr.priority))
  1017. return;
  1018. /* Need BKL in order to use the temporary link inside i915_dependency */
  1019. lockdep_assert_held(&request->i915->drm.struct_mutex);
  1020. stack.signaler = &request->sched;
  1021. list_add(&stack.dfs_link, &dfs);
  1022. /*
  1023. * Recursively bump all dependent priorities to match the new request.
  1024. *
  1025. * A naive approach would be to use recursion:
  1026. * static void update_priorities(struct i915_sched_node *node, prio) {
  1027. * list_for_each_entry(dep, &node->signalers_list, signal_link)
  1028. * update_priorities(dep->signal, prio)
  1029. * queue_request(node);
  1030. * }
  1031. * but that may have unlimited recursion depth and so runs a very
  1032. * real risk of overunning the kernel stack. Instead, we build
  1033. * a flat list of all dependencies starting with the current request.
  1034. * As we walk the list of dependencies, we add all of its dependencies
  1035. * to the end of the list (this may include an already visited
  1036. * request) and continue to walk onwards onto the new dependencies. The
  1037. * end result is a topological list of requests in reverse order, the
  1038. * last element in the list is the request we must execute first.
  1039. */
  1040. list_for_each_entry(dep, &dfs, dfs_link) {
  1041. struct i915_sched_node *node = dep->signaler;
  1042. /*
  1043. * Within an engine, there can be no cycle, but we may
  1044. * refer to the same dependency chain multiple times
  1045. * (redundant dependencies are not eliminated) and across
  1046. * engines.
  1047. */
  1048. list_for_each_entry(p, &node->signalers_list, signal_link) {
  1049. GEM_BUG_ON(p == dep); /* no cycles! */
  1050. if (i915_sched_node_signaled(p->signaler))
  1051. continue;
  1052. GEM_BUG_ON(p->signaler->attr.priority < node->attr.priority);
  1053. if (prio > READ_ONCE(p->signaler->attr.priority))
  1054. list_move_tail(&p->dfs_link, &dfs);
  1055. }
  1056. }
  1057. /*
  1058. * If we didn't need to bump any existing priorities, and we haven't
  1059. * yet submitted this request (i.e. there is no potential race with
  1060. * execlists_submit_request()), we can set our own priority and skip
  1061. * acquiring the engine locks.
  1062. */
  1063. if (request->sched.attr.priority == I915_PRIORITY_INVALID) {
  1064. GEM_BUG_ON(!list_empty(&request->sched.link));
  1065. request->sched.attr = *attr;
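/*
 * If our stack entry is the only element on the dfs list
 * (next == prev), no dependency needed its priority raised above, so
 * we can return without taking the engine locks.
 */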
  1066. if (stack.dfs_link.next == stack.dfs_link.prev)
  1067. return;
  1068. __list_del_entry(&stack.dfs_link);
  1069. }
  1070. last = NULL;
  1071. engine = request->engine;
  1072. spin_lock_irq(&engine->timeline.lock);
  1073. /* Fifo and depth-first replacement ensure our deps execute before us */
  1074. list_for_each_entry_safe_reverse(dep, p, &dfs, dfs_link) {
  1075. struct i915_sched_node *node = dep->signaler;
  1076. INIT_LIST_HEAD(&dep->dfs_link);
  1077. engine = sched_lock_engine(node, engine);
  1078. if (prio <= node->attr.priority)
  1079. continue;
  1080. node->attr.priority = prio;
  1081. if (!list_empty(&node->link)) {
  1082. if (last != engine) {
  1083. pl = lookup_priolist(engine, prio);
  1084. last = engine;
  1085. }
  1086. GEM_BUG_ON(pl->priority != prio);
  1087. list_move_tail(&node->link, &pl->requests);
  1088. }
  1089. if (prio > engine->execlists.queue_priority &&
  1090. i915_sw_fence_done(&sched_to_request(node)->submit)) {
  1091. /* defer submission until after all of our updates */
  1092. __update_queue(engine, prio);
  1093. tasklet_hi_schedule(&engine->execlists.tasklet);
  1094. }
  1095. }
  1096. spin_unlock_irq(&engine->timeline.lock);
  1097. }
  1098. static void execlists_context_destroy(struct intel_context *ce)
  1099. {
  1100. GEM_BUG_ON(ce->pin_count);
  1101. if (!ce->state)
  1102. return;
  1103. intel_ring_free(ce->ring);
  1104. GEM_BUG_ON(i915_gem_object_is_active(ce->state->obj));
  1105. i915_gem_object_put(ce->state->obj);
  1106. }
  1107. static void execlists_context_unpin(struct intel_context *ce)
  1108. {
  1109. i915_gem_context_unpin_hw_id(ce->gem_context);
  1110. intel_ring_unpin(ce->ring);
  1111. ce->state->obj->pin_global--;
  1112. i915_gem_object_unpin_map(ce->state->obj);
  1113. i915_vma_unpin(ce->state);
  1114. i915_gem_context_put(ce->gem_context);
  1115. }
  1116. static int __context_pin(struct i915_gem_context *ctx, struct i915_vma *vma)
  1117. {
  1118. unsigned int flags;
  1119. int err;
  1120. /*
  1121. * Clear this page out of any CPU caches for coherent swap-in/out.
  1122. * We only want to do this on the first bind so that we do not stall
  1123. * on an active context (which by nature is already on the GPU).
  1124. */
  1125. if (!(vma->flags & I915_VMA_GLOBAL_BIND)) {
  1126. err = i915_gem_object_set_to_wc_domain(vma->obj, true);
  1127. if (err)
  1128. return err;
  1129. }
  1130. flags = PIN_GLOBAL | PIN_HIGH;
  1131. flags |= PIN_OFFSET_BIAS | i915_ggtt_pin_bias(vma);
  1132. return i915_vma_pin(vma, 0, 0, flags);
  1133. }
  1134. static struct intel_context *
  1135. __execlists_context_pin(struct intel_engine_cs *engine,
  1136. struct i915_gem_context *ctx,
  1137. struct intel_context *ce)
  1138. {
  1139. void *vaddr;
  1140. int ret;
  1141. ret = execlists_context_deferred_alloc(ctx, engine, ce);
  1142. if (ret)
  1143. goto err;
  1144. GEM_BUG_ON(!ce->state);
  1145. ret = __context_pin(ctx, ce->state);
  1146. if (ret)
  1147. goto err;
  1148. vaddr = i915_gem_object_pin_map(ce->state->obj,
  1149. i915_coherent_map_type(ctx->i915) |
  1150. I915_MAP_OVERRIDE);
  1151. if (IS_ERR(vaddr)) {
  1152. ret = PTR_ERR(vaddr);
  1153. goto unpin_vma;
  1154. }
  1155. ret = intel_ring_pin(ce->ring);
  1156. if (ret)
  1157. goto unpin_map;
  1158. ret = i915_gem_context_pin_hw_id(ctx);
  1159. if (ret)
  1160. goto unpin_ring;
  1161. intel_lr_context_descriptor_update(ctx, engine, ce);
  1162. GEM_BUG_ON(!intel_ring_offset_valid(ce->ring, ce->ring->head));
  1163. ce->lrc_reg_state = vaddr + LRC_STATE_PN * PAGE_SIZE;
  1164. ce->lrc_reg_state[CTX_RING_BUFFER_START+1] =
  1165. i915_ggtt_offset(ce->ring->vma);
  1166. ce->lrc_reg_state[CTX_RING_HEAD + 1] = ce->ring->head;
  1167. ce->lrc_reg_state[CTX_RING_TAIL + 1] = ce->ring->tail;
  1168. ce->state->obj->pin_global++;
  1169. i915_gem_context_get(ctx);
  1170. return ce;
  1171. unpin_ring:
  1172. intel_ring_unpin(ce->ring);
  1173. unpin_map:
  1174. i915_gem_object_unpin_map(ce->state->obj);
  1175. unpin_vma:
  1176. __i915_vma_unpin(ce->state);
  1177. err:
  1178. ce->pin_count = 0;
  1179. return ERR_PTR(ret);
  1180. }
  1181. static const struct intel_context_ops execlists_context_ops = {
  1182. .unpin = execlists_context_unpin,
  1183. .destroy = execlists_context_destroy,
  1184. };
  1185. static struct intel_context *
  1186. execlists_context_pin(struct intel_engine_cs *engine,
  1187. struct i915_gem_context *ctx)
  1188. {
  1189. struct intel_context *ce = to_intel_context(ctx, engine);
  1190. lockdep_assert_held(&ctx->i915->drm.struct_mutex);
  1191. if (likely(ce->pin_count++))
  1192. return ce;
  1193. GEM_BUG_ON(!ce->pin_count); /* no overflow please! */
  1194. ce->ops = &execlists_context_ops;
  1195. return __execlists_context_pin(engine, ctx, ce);
  1196. }
  1197. static int execlists_request_alloc(struct i915_request *request)
  1198. {
  1199. int ret;
  1200. GEM_BUG_ON(!request->hw_context->pin_count);
  1201. /* Flush enough space to reduce the likelihood of waiting after
  1202. * we start building the request - in which case we will just
  1203. * have to repeat work.
  1204. */
  1205. request->reserved_space += EXECLISTS_REQUEST_SIZE;
  1206. ret = intel_ring_wait_for_space(request->ring, request->reserved_space);
  1207. if (ret)
  1208. return ret;
  1209. /* Note that after this point, we have committed to using
  1210. * this request as it is being used to both track the
  1211. * state of engine initialisation and liveness of the
  1212. * golden renderstate above. Think twice before you try
  1213. * to cancel/unwind this request now.
  1214. */
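/*
 * The bump above was only a heuristic to make the wait flush extra
 * ring space up front; dropping it here simply returns reserved_space
 * to whatever baseline the request was allocated with.
 */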
  1215. request->reserved_space -= EXECLISTS_REQUEST_SIZE;
  1216. return 0;
  1217. }
  1218. /*
  1219. * In this WA we need to set GEN8_L3SQCREG4[21:21] and reset it after
  1220. * PIPE_CONTROL instruction. This is required for the flush to happen correctly
  1221. * but there is a slight complication as this is applied in WA batch where the
  1222. * values are only initialized once so we cannot take register value at the
  1223. * beginning and reuse it further; hence we save its value to memory, upload a
  1224. * constant value with bit21 set and then we restore it back with the saved value.
  1225. * To simplify the WA, a constant value is formed by using the default value
  1226. * of this register. This shouldn't be a problem because we are only modifying
1227. * it for a short period and this batch is non-preemptible. We can of course
  1228. * use additional instructions that read the actual value of the register
  1229. * at that time and set our bit of interest but it makes the WA complicated.
  1230. *
  1231. * This WA is also required for Gen9 so extracting as a function avoids
  1232. * code duplication.
  1233. */
  1234. static u32 *
  1235. gen8_emit_flush_coherentl3_wa(struct intel_engine_cs *engine, u32 *batch)
  1236. {
  1237. /* NB no one else is allowed to scribble over scratch + 256! */
  1238. *batch++ = MI_STORE_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
  1239. *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
  1240. *batch++ = i915_scratch_offset(engine->i915) + 256;
  1241. *batch++ = 0;
  1242. *batch++ = MI_LOAD_REGISTER_IMM(1);
  1243. *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
  1244. *batch++ = 0x40400000 | GEN8_LQSC_FLUSH_COHERENT_LINES;
  1245. batch = gen8_emit_pipe_control(batch,
  1246. PIPE_CONTROL_CS_STALL |
  1247. PIPE_CONTROL_DC_FLUSH_ENABLE,
  1248. 0);
  1249. *batch++ = MI_LOAD_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
  1250. *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
  1251. *batch++ = i915_scratch_offset(engine->i915) + 256;
  1252. *batch++ = 0;
  1253. return batch;
  1254. }
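/*
 * Rough dword accounting for the sequence above, assuming
 * gen8_emit_pipe_control() advances the batch by its usual 6 dwords:
 * 4 to save GEN8_L3SQCREG4 to scratch + 256 (SRM), 3 for the
 * MI_LOAD_REGISTER_IMM with bit21 set, 6 for the DC flush PIPE_CONTROL
 * and 4 to restore the saved value (LRM), i.e. 17 in total.
 */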
  1255. /*
  1256. * Typically we only have one indirect_ctx and per_ctx batch buffer which are
  1257. * initialized at the beginning and shared across all contexts but this field
  1258. * helps us to have multiple batches at different offsets and select them based
1259. * on a criterion. At the moment this batch always starts at the beginning of the page
  1260. * and at this point we don't have multiple wa_ctx batch buffers.
  1261. *
1262. * The number of WAs applied is not known at the beginning; we use this field
1263. * to return the number of DWORDs written.
  1264. *
  1265. * It is to be noted that this batch does not contain MI_BATCH_BUFFER_END
  1266. * so it adds NOOPs as padding to make it cacheline aligned.
1267. * MI_BATCH_BUFFER_END will be added to the per-ctx batch and both of them together
1268. * make a complete batch buffer.
  1269. */
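/*
 * See intel_init_workaround_bb() below: it invokes these
 * gen*_init_indirectctx_bb() helpers and records, per batch, the byte
 * offset from the start of the wa_ctx object and the number of bytes
 * written (the batch pointer delta).
 */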
  1270. static u32 *gen8_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
  1271. {
  1272. /* WaDisableCtxRestoreArbitration:bdw,chv */
  1273. *batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
  1274. /* WaFlushCoherentL3CacheLinesAtContextSwitch:bdw */
  1275. if (IS_BROADWELL(engine->i915))
  1276. batch = gen8_emit_flush_coherentl3_wa(engine, batch);
  1277. /* WaClearSlmSpaceAtContextSwitch:bdw,chv */
  1278. /* Actual scratch location is at 128 bytes offset */
  1279. batch = gen8_emit_pipe_control(batch,
  1280. PIPE_CONTROL_FLUSH_L3 |
  1281. PIPE_CONTROL_GLOBAL_GTT_IVB |
  1282. PIPE_CONTROL_CS_STALL |
  1283. PIPE_CONTROL_QW_WRITE,
  1284. i915_scratch_offset(engine->i915) +
  1285. 2 * CACHELINE_BYTES);
  1286. *batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
  1287. /* Pad to end of cacheline */
  1288. while ((unsigned long)batch % CACHELINE_BYTES)
  1289. *batch++ = MI_NOOP;
  1290. /*
  1291. * MI_BATCH_BUFFER_END is not required in Indirect ctx BB because
  1292. * execution depends on the length specified in terms of cache lines
  1293. * in the register CTX_RCS_INDIRECT_CTX
  1294. */
  1295. return batch;
  1296. }
  1297. struct lri {
  1298. i915_reg_t reg;
  1299. u32 value;
  1300. };
  1301. static u32 *emit_lri(u32 *batch, const struct lri *lri, unsigned int count)
  1302. {
  1303. GEM_BUG_ON(!count || count > 63);
  1304. *batch++ = MI_LOAD_REGISTER_IMM(count);
  1305. do {
  1306. *batch++ = i915_mmio_reg_offset(lri->reg);
  1307. *batch++ = lri->value;
  1308. } while (lri++, --count);
  1309. *batch++ = MI_NOOP;
  1310. return batch;
  1311. }
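/*
 * For example, emit_lri(batch, tbl, 2) with a hypothetical table
 * tbl[] = { { REG_A, VAL_A }, { REG_B, VAL_B } } expands to:
 * MI_LOAD_REGISTER_IMM(2),
 * i915_mmio_reg_offset(REG_A), VAL_A,
 * i915_mmio_reg_offset(REG_B), VAL_B,
 * MI_NOOP
 */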
  1312. static u32 *gen9_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
  1313. {
  1314. static const struct lri lri[] = {
  1315. /* WaDisableGatherAtSetShaderCommonSlice:skl,bxt,kbl,glk */
  1316. {
  1317. COMMON_SLICE_CHICKEN2,
  1318. __MASKED_FIELD(GEN9_DISABLE_GATHER_AT_SET_SHADER_COMMON_SLICE,
  1319. 0),
  1320. },
  1321. /* BSpec: 11391 */
  1322. {
  1323. FF_SLICE_CHICKEN,
  1324. __MASKED_FIELD(FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX,
  1325. FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX),
  1326. },
  1327. /* BSpec: 11299 */
  1328. {
  1329. _3D_CHICKEN3,
  1330. __MASKED_FIELD(_3D_CHICKEN_SF_PROVOKING_VERTEX_FIX,
  1331. _3D_CHICKEN_SF_PROVOKING_VERTEX_FIX),
  1332. }
  1333. };
  1334. *batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
  1335. /* WaFlushCoherentL3CacheLinesAtContextSwitch:skl,bxt,glk */
  1336. batch = gen8_emit_flush_coherentl3_wa(engine, batch);
  1337. batch = emit_lri(batch, lri, ARRAY_SIZE(lri));
  1338. /* WaClearSlmSpaceAtContextSwitch:kbl */
  1339. /* Actual scratch location is at 128 bytes offset */
  1340. if (IS_KBL_REVID(engine->i915, 0, KBL_REVID_A0)) {
  1341. batch = gen8_emit_pipe_control(batch,
  1342. PIPE_CONTROL_FLUSH_L3 |
  1343. PIPE_CONTROL_GLOBAL_GTT_IVB |
  1344. PIPE_CONTROL_CS_STALL |
  1345. PIPE_CONTROL_QW_WRITE,
  1346. i915_scratch_offset(engine->i915)
  1347. + 2 * CACHELINE_BYTES);
  1348. }
  1349. /* WaMediaPoolStateCmdInWABB:bxt,glk */
  1350. if (HAS_POOLED_EU(engine->i915)) {
  1351. /*
  1352. * EU pool configuration is setup along with golden context
  1353. * during context initialization. This value depends on
  1354. * device type (2x6 or 3x6) and needs to be updated based
  1355. * on which subslice is disabled especially for 2x6
  1356. * devices, however it is safe to load default
  1357. * configuration of 3x6 device instead of masking off
  1358. * corresponding bits because HW ignores bits of a disabled
  1359. * subslice and drops down to appropriate config. Please
  1360. * see render_state_setup() in i915_gem_render_state.c for
  1361. * possible configurations, to avoid duplication they are
  1362. * not shown here again.
  1363. */
  1364. *batch++ = GEN9_MEDIA_POOL_STATE;
  1365. *batch++ = GEN9_MEDIA_POOL_ENABLE;
  1366. *batch++ = 0x00777000;
  1367. *batch++ = 0;
  1368. *batch++ = 0;
  1369. *batch++ = 0;
  1370. }
  1371. *batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
  1372. /* Pad to end of cacheline */
  1373. while ((unsigned long)batch % CACHELINE_BYTES)
  1374. *batch++ = MI_NOOP;
  1375. return batch;
  1376. }
  1377. static u32 *
  1378. gen10_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
  1379. {
  1380. int i;
  1381. /*
  1382. * WaPipeControlBefore3DStateSamplePattern: cnl
  1383. *
  1384. * Ensure the engine is idle prior to programming a
  1385. * 3DSTATE_SAMPLE_PATTERN during a context restore.
  1386. */
  1387. batch = gen8_emit_pipe_control(batch,
  1388. PIPE_CONTROL_CS_STALL,
  1389. 0);
  1390. /*
  1391. * WaPipeControlBefore3DStateSamplePattern says we need 4 dwords for
  1392. * the PIPE_CONTROL followed by 12 dwords of 0x0, so 16 dwords in
  1393. * total. However, a PIPE_CONTROL is 6 dwords long, not 4, which is
  1394. * confusing. Since gen8_emit_pipe_control() already advances the
  1395. * batch by 6 dwords, we advance the other 10 here, completing a
  1396. * cacheline. It's not clear if the workaround requires this padding
  1397. * before other commands, or if it's just the regular padding we would
  1398. * already have for the workaround bb, so leave it here for now.
  1399. */
  1400. for (i = 0; i < 10; i++)
  1401. *batch++ = MI_NOOP;
  1402. /* Pad to end of cacheline */
  1403. while ((unsigned long)batch % CACHELINE_BYTES)
  1404. *batch++ = MI_NOOP;
  1405. return batch;
  1406. }
  1407. #define CTX_WA_BB_OBJ_SIZE (PAGE_SIZE)
  1408. static int lrc_setup_wa_ctx(struct intel_engine_cs *engine)
  1409. {
  1410. struct drm_i915_gem_object *obj;
  1411. struct i915_vma *vma;
  1412. int err;
  1413. obj = i915_gem_object_create(engine->i915, CTX_WA_BB_OBJ_SIZE);
  1414. if (IS_ERR(obj))
  1415. return PTR_ERR(obj);
  1416. vma = i915_vma_instance(obj, &engine->i915->ggtt.vm, NULL);
  1417. if (IS_ERR(vma)) {
  1418. err = PTR_ERR(vma);
  1419. goto err;
  1420. }
  1421. err = i915_vma_pin(vma, 0, 0, PIN_GLOBAL | PIN_HIGH);
  1422. if (err)
  1423. goto err;
  1424. engine->wa_ctx.vma = vma;
  1425. return 0;
  1426. err:
  1427. i915_gem_object_put(obj);
  1428. return err;
  1429. }
  1430. static void lrc_destroy_wa_ctx(struct intel_engine_cs *engine)
  1431. {
  1432. i915_vma_unpin_and_release(&engine->wa_ctx.vma, 0);
  1433. }
  1434. typedef u32 *(*wa_bb_func_t)(struct intel_engine_cs *engine, u32 *batch);
  1435. static int intel_init_workaround_bb(struct intel_engine_cs *engine)
  1436. {
  1437. struct i915_ctx_workarounds *wa_ctx = &engine->wa_ctx;
  1438. struct i915_wa_ctx_bb *wa_bb[2] = { &wa_ctx->indirect_ctx,
  1439. &wa_ctx->per_ctx };
  1440. wa_bb_func_t wa_bb_fn[2];
  1441. struct page *page;
  1442. void *batch, *batch_ptr;
  1443. unsigned int i;
  1444. int ret;
  1445. if (GEM_WARN_ON(engine->id != RCS))
  1446. return -EINVAL;
  1447. switch (INTEL_GEN(engine->i915)) {
  1448. case 11:
  1449. return 0;
  1450. case 10:
  1451. wa_bb_fn[0] = gen10_init_indirectctx_bb;
  1452. wa_bb_fn[1] = NULL;
  1453. break;
  1454. case 9:
  1455. wa_bb_fn[0] = gen9_init_indirectctx_bb;
  1456. wa_bb_fn[1] = NULL;
  1457. break;
  1458. case 8:
  1459. wa_bb_fn[0] = gen8_init_indirectctx_bb;
  1460. wa_bb_fn[1] = NULL;
  1461. break;
  1462. default:
  1463. MISSING_CASE(INTEL_GEN(engine->i915));
  1464. return 0;
  1465. }
  1466. ret = lrc_setup_wa_ctx(engine);
  1467. if (ret) {
  1468. DRM_DEBUG_DRIVER("Failed to setup context WA page: %d\n", ret);
  1469. return ret;
  1470. }
  1471. page = i915_gem_object_get_dirty_page(wa_ctx->vma->obj, 0);
  1472. batch = batch_ptr = kmap_atomic(page);
  1473. /*
  1474. * Emit the two workaround batch buffers, recording the offset from the
  1475. * start of the workaround batch buffer object for each and their
  1476. * respective sizes.
  1477. */
  1478. for (i = 0; i < ARRAY_SIZE(wa_bb_fn); i++) {
  1479. wa_bb[i]->offset = batch_ptr - batch;
  1480. if (GEM_WARN_ON(!IS_ALIGNED(wa_bb[i]->offset,
  1481. CACHELINE_BYTES))) {
  1482. ret = -EINVAL;
  1483. break;
  1484. }
  1485. if (wa_bb_fn[i])
  1486. batch_ptr = wa_bb_fn[i](engine, batch_ptr);
  1487. wa_bb[i]->size = batch_ptr - (batch + wa_bb[i]->offset);
  1488. }
  1489. BUG_ON(batch_ptr - batch > CTX_WA_BB_OBJ_SIZE);
  1490. kunmap_atomic(batch);
  1491. if (ret)
  1492. lrc_destroy_wa_ctx(engine);
  1493. return ret;
  1494. }
  1495. static void enable_execlists(struct intel_engine_cs *engine)
  1496. {
  1497. struct drm_i915_private *dev_priv = engine->i915;
  1498. I915_WRITE(RING_HWSTAM(engine->mmio_base), 0xffffffff);
  1499. /*
  1500. * Make sure we're not enabling the new 12-deep CSB
  1501. * FIFO as that requires a slightly updated handling
1502. * in the ctx switch irq. Since we're currently using
1503. * only 2 elements of the enhanced execlists, the
1504. * deeper FIFO is not needed and it's not worth adding
  1505. * more statements to the irq handler to support it.
  1506. */
  1507. if (INTEL_GEN(dev_priv) >= 11)
  1508. I915_WRITE(RING_MODE_GEN7(engine),
  1509. _MASKED_BIT_DISABLE(GEN11_GFX_DISABLE_LEGACY_MODE));
  1510. else
  1511. I915_WRITE(RING_MODE_GEN7(engine),
  1512. _MASKED_BIT_ENABLE(GFX_RUN_LIST_ENABLE));
  1513. I915_WRITE(RING_MI_MODE(engine->mmio_base),
  1514. _MASKED_BIT_DISABLE(STOP_RING));
  1515. I915_WRITE(RING_HWS_PGA(engine->mmio_base),
  1516. engine->status_page.ggtt_offset);
  1517. POSTING_READ(RING_HWS_PGA(engine->mmio_base));
  1518. }
  1519. static bool unexpected_starting_state(struct intel_engine_cs *engine)
  1520. {
  1521. struct drm_i915_private *dev_priv = engine->i915;
  1522. bool unexpected = false;
  1523. if (I915_READ(RING_MI_MODE(engine->mmio_base)) & STOP_RING) {
  1524. DRM_DEBUG_DRIVER("STOP_RING still set in RING_MI_MODE\n");
  1525. unexpected = true;
  1526. }
  1527. return unexpected;
  1528. }
  1529. static int gen8_init_common_ring(struct intel_engine_cs *engine)
  1530. {
  1531. intel_engine_apply_workarounds(engine);
  1532. intel_mocs_init_engine(engine);
  1533. intel_engine_reset_breadcrumbs(engine);
  1534. if (GEM_SHOW_DEBUG() && unexpected_starting_state(engine)) {
  1535. struct drm_printer p = drm_debug_printer(__func__);
  1536. intel_engine_dump(engine, &p, NULL);
  1537. }
  1538. enable_execlists(engine);
  1539. return 0;
  1540. }
  1541. static int gen8_init_render_ring(struct intel_engine_cs *engine)
  1542. {
  1543. struct drm_i915_private *dev_priv = engine->i915;
  1544. int ret;
  1545. ret = gen8_init_common_ring(engine);
  1546. if (ret)
  1547. return ret;
  1548. intel_whitelist_workarounds_apply(engine);
  1549. /* We need to disable the AsyncFlip performance optimisations in order
  1550. * to use MI_WAIT_FOR_EVENT within the CS. It should already be
  1551. * programmed to '1' on all products.
  1552. *
  1553. * WaDisableAsyncFlipPerfMode:snb,ivb,hsw,vlv,bdw,chv
  1554. */
  1555. I915_WRITE(MI_MODE, _MASKED_BIT_ENABLE(ASYNC_FLIP_PERF_DISABLE));
  1556. I915_WRITE(INSTPM, _MASKED_BIT_ENABLE(INSTPM_FORCE_ORDERING));
  1557. return 0;
  1558. }
  1559. static int gen9_init_render_ring(struct intel_engine_cs *engine)
  1560. {
  1561. int ret;
  1562. ret = gen8_init_common_ring(engine);
  1563. if (ret)
  1564. return ret;
  1565. intel_whitelist_workarounds_apply(engine);
  1566. return 0;
  1567. }
  1568. static struct i915_request *
  1569. execlists_reset_prepare(struct intel_engine_cs *engine)
  1570. {
  1571. struct intel_engine_execlists * const execlists = &engine->execlists;
  1572. struct i915_request *request, *active;
  1573. unsigned long flags;
  1574. GEM_TRACE("%s: depth<-%d\n", engine->name,
  1575. atomic_read(&execlists->tasklet.count));
  1576. /*
  1577. * Prevent request submission to the hardware until we have
  1578. * completed the reset in i915_gem_reset_finish(). If a request
  1579. * is completed by one engine, it may then queue a request
  1580. * to a second via its execlists->tasklet *just* as we are
  1581. * calling engine->init_hw() and also writing the ELSP.
  1582. * Turning off the execlists->tasklet until the reset is over
  1583. * prevents the race.
  1584. */
  1585. __tasklet_disable_sync_once(&execlists->tasklet);
  1586. spin_lock_irqsave(&engine->timeline.lock, flags);
  1587. /*
1588. * We want to flush the pending context switches; having disabled
1589. * the tasklet above, we can assume exclusive access to the execlists.
1590. * This allows us to catch up with an in-flight preemption event,
  1591. * and avoid blaming an innocent request if the stall was due to the
  1592. * preemption itself.
  1593. */
  1594. process_csb(engine);
  1595. /*
  1596. * The last active request can then be no later than the last request
  1597. * now in ELSP[0]. So search backwards from there, so that if the GPU
  1598. * has advanced beyond the last CSB update, it will be pardoned.
  1599. */
  1600. active = NULL;
  1601. request = port_request(execlists->port);
  1602. if (request) {
  1603. /*
  1604. * Prevent the breadcrumb from advancing before we decide
  1605. * which request is currently active.
  1606. */
  1607. intel_engine_stop_cs(engine);
  1608. list_for_each_entry_from_reverse(request,
  1609. &engine->timeline.requests,
  1610. link) {
  1611. if (__i915_request_completed(request,
  1612. request->global_seqno))
  1613. break;
  1614. active = request;
  1615. }
  1616. }
  1617. spin_unlock_irqrestore(&engine->timeline.lock, flags);
  1618. return active;
  1619. }
  1620. static void execlists_reset(struct intel_engine_cs *engine,
  1621. struct i915_request *request)
  1622. {
  1623. struct intel_engine_execlists * const execlists = &engine->execlists;
  1624. unsigned long flags;
  1625. u32 *regs;
  1626. GEM_TRACE("%s request global=%x, current=%d\n",
  1627. engine->name, request ? request->global_seqno : 0,
  1628. intel_engine_get_seqno(engine));
  1629. spin_lock_irqsave(&engine->timeline.lock, flags);
  1630. /*
  1631. * Catch up with any missed context-switch interrupts.
  1632. *
  1633. * Ideally we would just read the remaining CSB entries now that we
  1634. * know the gpu is idle. However, the CSB registers are sometimes^W
  1635. * often trashed across a GPU reset! Instead we have to rely on
  1636. * guessing the missed context-switch events by looking at what
  1637. * requests were completed.
  1638. */
  1639. execlists_cancel_port_requests(execlists);
  1640. /* Push back any incomplete requests for replay after the reset. */
  1641. __unwind_incomplete_requests(engine);
  1642. /* Following the reset, we need to reload the CSB read/write pointers */
  1643. reset_csb_pointers(&engine->execlists);
  1644. spin_unlock_irqrestore(&engine->timeline.lock, flags);
  1645. /*
  1646. * If the request was innocent, we leave the request in the ELSP
  1647. * and will try to replay it on restarting. The context image may
  1648. * have been corrupted by the reset, in which case we may have
  1649. * to service a new GPU hang, but more likely we can continue on
  1650. * without impact.
  1651. *
  1652. * If the request was guilty, we presume the context is corrupt
  1653. * and have to at least restore the RING register in the context
  1654. * image back to the expected values to skip over the guilty request.
  1655. */
  1656. if (!request || request->fence.error != -EIO)
  1657. return;
  1658. /*
  1659. * We want a simple context + ring to execute the breadcrumb update.
  1660. * We cannot rely on the context being intact across the GPU hang,
  1661. * so clear it and rebuild just what we need for the breadcrumb.
  1662. * All pending requests for this context will be zapped, and any
  1663. * future request will be after userspace has had the opportunity
  1664. * to recreate its own state.
  1665. */
  1666. regs = request->hw_context->lrc_reg_state;
  1667. if (engine->pinned_default_state) {
  1668. memcpy(regs, /* skip restoring the vanilla PPHWSP */
  1669. engine->pinned_default_state + LRC_STATE_PN * PAGE_SIZE,
  1670. engine->context_size - PAGE_SIZE);
  1671. }
  1672. execlists_init_reg_state(regs,
  1673. request->gem_context, engine, request->ring);
  1674. /* Move the RING_HEAD onto the breadcrumb, past the hanging batch */
  1675. regs[CTX_RING_BUFFER_START + 1] = i915_ggtt_offset(request->ring->vma);
  1676. request->ring->head = intel_ring_wrap(request->ring, request->postfix);
  1677. regs[CTX_RING_HEAD + 1] = request->ring->head;
  1678. intel_ring_update_space(request->ring);
  1679. /* Reset WaIdleLiteRestore:bdw,skl as well */
  1680. unwind_wa_tail(request);
  1681. }
  1682. static void execlists_reset_finish(struct intel_engine_cs *engine)
  1683. {
  1684. struct intel_engine_execlists * const execlists = &engine->execlists;
  1685. /*
  1686. * After a GPU reset, we may have requests to replay. Do so now while
  1687. * we still have the forcewake to be sure that the GPU is not allowed
  1688. * to sleep before we restart and reload a context.
  1689. *
  1690. */
  1691. if (!RB_EMPTY_ROOT(&execlists->queue.rb_root))
  1692. execlists->tasklet.func(execlists->tasklet.data);
  1693. tasklet_enable(&execlists->tasklet);
  1694. GEM_TRACE("%s: depth->%d\n", engine->name,
  1695. atomic_read(&execlists->tasklet.count));
  1696. }
  1697. static int intel_logical_ring_emit_pdps(struct i915_request *rq)
  1698. {
  1699. struct i915_hw_ppgtt *ppgtt = rq->gem_context->ppgtt;
  1700. struct intel_engine_cs *engine = rq->engine;
  1701. const int num_lri_cmds = GEN8_3LVL_PDPES * 2;
  1702. u32 *cs;
  1703. int i;
  1704. cs = intel_ring_begin(rq, num_lri_cmds * 2 + 2);
  1705. if (IS_ERR(cs))
  1706. return PTR_ERR(cs);
  1707. *cs++ = MI_LOAD_REGISTER_IMM(num_lri_cmds);
  1708. for (i = GEN8_3LVL_PDPES - 1; i >= 0; i--) {
  1709. const dma_addr_t pd_daddr = i915_page_dir_dma_addr(ppgtt, i);
  1710. *cs++ = i915_mmio_reg_offset(GEN8_RING_PDP_UDW(engine, i));
  1711. *cs++ = upper_32_bits(pd_daddr);
  1712. *cs++ = i915_mmio_reg_offset(GEN8_RING_PDP_LDW(engine, i));
  1713. *cs++ = lower_32_bits(pd_daddr);
  1714. }
  1715. *cs++ = MI_NOOP;
  1716. intel_ring_advance(rq, cs);
  1717. return 0;
  1718. }
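/*
 * With GEN8_3LVL_PDPES == 4, the stream emitted above is one
 * MI_LOAD_REGISTER_IMM(8) header, then for each PDP (3..0) the UDW
 * register offset and upper address bits followed by the LDW register
 * offset and lower address bits, and a trailing MI_NOOP:
 * 1 + 4 * 4 + 1 = 18 dwords, matching num_lri_cmds * 2 + 2.
 */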
  1719. static int gen8_emit_bb_start(struct i915_request *rq,
  1720. u64 offset, u32 len,
  1721. const unsigned int flags)
  1722. {
  1723. u32 *cs;
  1724. int ret;
1725. /* Don't rely on hw updating PDPs, especially in lite-restore.
  1726. * Ideally, we should set Force PD Restore in ctx descriptor,
  1727. * but we can't. Force Restore would be a second option, but
  1728. * it is unsafe in case of lite-restore (because the ctx is
  1729. * not idle). PML4 is allocated during ppgtt init so this is
  1730. * not needed in 48-bit.*/
  1731. if (rq->gem_context->ppgtt &&
  1732. (intel_engine_flag(rq->engine) & rq->gem_context->ppgtt->pd_dirty_rings) &&
  1733. !i915_vm_is_48bit(&rq->gem_context->ppgtt->vm) &&
  1734. !intel_vgpu_active(rq->i915)) {
  1735. ret = intel_logical_ring_emit_pdps(rq);
  1736. if (ret)
  1737. return ret;
  1738. rq->gem_context->ppgtt->pd_dirty_rings &= ~intel_engine_flag(rq->engine);
  1739. }
  1740. cs = intel_ring_begin(rq, 6);
  1741. if (IS_ERR(cs))
  1742. return PTR_ERR(cs);
  1743. /*
  1744. * WaDisableCtxRestoreArbitration:bdw,chv
  1745. *
  1746. * We don't need to perform MI_ARB_ENABLE as often as we do (in
  1747. * particular all the gen that do not need the w/a at all!), if we
  1748. * took care to make sure that on every switch into this context
1749. * (both ordinary and for preemption) that arbitration was enabled
  1750. * we would be fine. However, there doesn't seem to be a downside to
  1751. * being paranoid and making sure it is set before each batch and
  1752. * every context-switch.
  1753. *
  1754. * Note that if we fail to enable arbitration before the request
  1755. * is complete, then we do not see the context-switch interrupt and
  1756. * the engine hangs (with RING_HEAD == RING_TAIL).
  1757. *
  1758. * That satisfies both the GPGPU w/a and our heavy-handed paranoia.
  1759. */
  1760. *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
  1761. /* FIXME(BDW): Address space and security selectors. */
  1762. *cs++ = MI_BATCH_BUFFER_START_GEN8 |
  1763. (flags & I915_DISPATCH_SECURE ? 0 : BIT(8));
  1764. *cs++ = lower_32_bits(offset);
  1765. *cs++ = upper_32_bits(offset);
  1766. *cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
  1767. *cs++ = MI_NOOP;
  1768. intel_ring_advance(rq, cs);
  1769. return 0;
  1770. }
  1771. static void gen8_logical_ring_enable_irq(struct intel_engine_cs *engine)
  1772. {
  1773. struct drm_i915_private *dev_priv = engine->i915;
  1774. I915_WRITE_IMR(engine,
  1775. ~(engine->irq_enable_mask | engine->irq_keep_mask));
  1776. POSTING_READ_FW(RING_IMR(engine->mmio_base));
  1777. }
  1778. static void gen8_logical_ring_disable_irq(struct intel_engine_cs *engine)
  1779. {
  1780. struct drm_i915_private *dev_priv = engine->i915;
  1781. I915_WRITE_IMR(engine, ~engine->irq_keep_mask);
  1782. }
  1783. static int gen8_emit_flush(struct i915_request *request, u32 mode)
  1784. {
  1785. u32 cmd, *cs;
  1786. cs = intel_ring_begin(request, 4);
  1787. if (IS_ERR(cs))
  1788. return PTR_ERR(cs);
  1789. cmd = MI_FLUSH_DW + 1;
  1790. /* We always require a command barrier so that subsequent
  1791. * commands, such as breadcrumb interrupts, are strictly ordered
  1792. * wrt the contents of the write cache being flushed to memory
  1793. * (and thus being coherent from the CPU).
  1794. */
  1795. cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW;
  1796. if (mode & EMIT_INVALIDATE) {
  1797. cmd |= MI_INVALIDATE_TLB;
  1798. if (request->engine->id == VCS)
  1799. cmd |= MI_INVALIDATE_BSD;
  1800. }
  1801. *cs++ = cmd;
  1802. *cs++ = I915_GEM_HWS_SCRATCH_ADDR | MI_FLUSH_DW_USE_GTT;
  1803. *cs++ = 0; /* upper addr */
  1804. *cs++ = 0; /* value */
  1805. intel_ring_advance(request, cs);
  1806. return 0;
  1807. }
  1808. static int gen8_emit_flush_render(struct i915_request *request,
  1809. u32 mode)
  1810. {
  1811. struct intel_engine_cs *engine = request->engine;
  1812. u32 scratch_addr =
  1813. i915_scratch_offset(engine->i915) + 2 * CACHELINE_BYTES;
  1814. bool vf_flush_wa = false, dc_flush_wa = false;
  1815. u32 *cs, flags = 0;
  1816. int len;
  1817. flags |= PIPE_CONTROL_CS_STALL;
  1818. if (mode & EMIT_FLUSH) {
  1819. flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
  1820. flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
  1821. flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
  1822. flags |= PIPE_CONTROL_FLUSH_ENABLE;
  1823. }
  1824. if (mode & EMIT_INVALIDATE) {
  1825. flags |= PIPE_CONTROL_TLB_INVALIDATE;
  1826. flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
  1827. flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
  1828. flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
  1829. flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
  1830. flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
  1831. flags |= PIPE_CONTROL_QW_WRITE;
  1832. flags |= PIPE_CONTROL_GLOBAL_GTT_IVB;
  1833. /*
  1834. * On GEN9: before VF_CACHE_INVALIDATE we need to emit a NULL
  1835. * pipe control.
  1836. */
  1837. if (IS_GEN9(request->i915))
  1838. vf_flush_wa = true;
  1839. /* WaForGAMHang:kbl */
  1840. if (IS_KBL_REVID(request->i915, 0, KBL_REVID_B0))
  1841. dc_flush_wa = true;
  1842. }
  1843. len = 6;
  1844. if (vf_flush_wa)
  1845. len += 6;
  1846. if (dc_flush_wa)
  1847. len += 12;
  1848. cs = intel_ring_begin(request, len);
  1849. if (IS_ERR(cs))
  1850. return PTR_ERR(cs);
  1851. if (vf_flush_wa)
  1852. cs = gen8_emit_pipe_control(cs, 0, 0);
  1853. if (dc_flush_wa)
  1854. cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_DC_FLUSH_ENABLE,
  1855. 0);
  1856. cs = gen8_emit_pipe_control(cs, flags, scratch_addr);
  1857. if (dc_flush_wa)
  1858. cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_CS_STALL, 0);
  1859. intel_ring_advance(request, cs);
  1860. return 0;
  1861. }
  1862. /*
  1863. * Reserve space for 2 NOOPs at the end of each request to be
  1864. * used as a workaround for not being allowed to do lite
  1865. * restore with HEAD==TAIL (WaIdleLiteRestore).
  1866. */
  1867. static void gen8_emit_wa_tail(struct i915_request *request, u32 *cs)
  1868. {
  1869. /* Ensure there's always at least one preemption point per-request. */
  1870. *cs++ = MI_ARB_CHECK;
  1871. *cs++ = MI_NOOP;
  1872. request->wa_tail = intel_ring_offset(request, cs);
  1873. }
  1874. static void gen8_emit_breadcrumb(struct i915_request *request, u32 *cs)
  1875. {
  1876. /* w/a: bit 5 needs to be zero for MI_FLUSH_DW address. */
  1877. BUILD_BUG_ON(I915_GEM_HWS_INDEX_ADDR & (1 << 5));
  1878. cs = gen8_emit_ggtt_write(cs, request->global_seqno,
  1879. intel_hws_seqno_address(request->engine));
  1880. *cs++ = MI_USER_INTERRUPT;
  1881. *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
  1882. request->tail = intel_ring_offset(request, cs);
  1883. assert_ring_tail_valid(request->ring, request->tail);
  1884. gen8_emit_wa_tail(request, cs);
  1885. }
  1886. static const int gen8_emit_breadcrumb_sz = 6 + WA_TAIL_DWORDS;
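/*
 * Sketch of the accounting, assuming gen8_emit_ggtt_write() emits
 * 4 dwords and gen8_emit_ggtt_write_rcs() 6: the 6 above is the seqno
 * write plus MI_USER_INTERRUPT and MI_ARB_ENABLE, with WA_TAIL_DWORDS
 * covering the MI_ARB_CHECK/MI_NOOP pair from gen8_emit_wa_tail();
 * the RCS variant below is likewise 6 + 2, hence
 * gen8_emit_breadcrumb_rcs_sz = 8 + WA_TAIL_DWORDS.
 */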
  1887. static void gen8_emit_breadcrumb_rcs(struct i915_request *request, u32 *cs)
  1888. {
  1889. /* We're using qword write, seqno should be aligned to 8 bytes. */
  1890. BUILD_BUG_ON(I915_GEM_HWS_INDEX & 1);
  1891. cs = gen8_emit_ggtt_write_rcs(cs, request->global_seqno,
  1892. intel_hws_seqno_address(request->engine));
  1893. *cs++ = MI_USER_INTERRUPT;
  1894. *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
  1895. request->tail = intel_ring_offset(request, cs);
  1896. assert_ring_tail_valid(request->ring, request->tail);
  1897. gen8_emit_wa_tail(request, cs);
  1898. }
  1899. static const int gen8_emit_breadcrumb_rcs_sz = 8 + WA_TAIL_DWORDS;
  1900. static int gen8_init_rcs_context(struct i915_request *rq)
  1901. {
  1902. int ret;
  1903. ret = intel_ctx_workarounds_emit(rq);
  1904. if (ret)
  1905. return ret;
  1906. ret = intel_rcs_context_init_mocs(rq);
  1907. /*
1908. * Failing to program the MOCS is non-fatal. The system will not
  1909. * run at peak performance. So generate an error and carry on.
  1910. */
  1911. if (ret)
  1912. DRM_ERROR("MOCS failed to program: expect performance issues.\n");
  1913. return i915_gem_render_state_emit(rq);
  1914. }
  1915. /**
  1916. * intel_logical_ring_cleanup() - deallocate the Engine Command Streamer
  1917. * @engine: Engine Command Streamer.
  1918. */
  1919. void intel_logical_ring_cleanup(struct intel_engine_cs *engine)
  1920. {
  1921. struct drm_i915_private *dev_priv;
  1922. /*
1923. * Tasklet cannot be active at this point due to intel_mark_active/idle
  1924. * so this is just for documentation.
  1925. */
  1926. if (WARN_ON(test_bit(TASKLET_STATE_SCHED,
  1927. &engine->execlists.tasklet.state)))
  1928. tasklet_kill(&engine->execlists.tasklet);
  1929. dev_priv = engine->i915;
  1930. if (engine->buffer) {
  1931. WARN_ON((I915_READ_MODE(engine) & MODE_IDLE) == 0);
  1932. }
  1933. if (engine->cleanup)
  1934. engine->cleanup(engine);
  1935. intel_engine_cleanup_common(engine);
  1936. lrc_destroy_wa_ctx(engine);
  1937. engine->i915 = NULL;
  1938. dev_priv->engine[engine->id] = NULL;
  1939. kfree(engine);
  1940. }
  1941. void intel_execlists_set_default_submission(struct intel_engine_cs *engine)
  1942. {
  1943. engine->submit_request = execlists_submit_request;
  1944. engine->cancel_requests = execlists_cancel_requests;
  1945. engine->schedule = execlists_schedule;
  1946. engine->execlists.tasklet.func = execlists_submission_tasklet;
  1947. engine->reset.prepare = execlists_reset_prepare;
  1948. engine->park = NULL;
  1949. engine->unpark = NULL;
  1950. engine->flags |= I915_ENGINE_SUPPORTS_STATS;
  1951. if (engine->i915->preempt_context)
  1952. engine->flags |= I915_ENGINE_HAS_PREEMPTION;
  1953. engine->i915->caps.scheduler =
  1954. I915_SCHEDULER_CAP_ENABLED |
  1955. I915_SCHEDULER_CAP_PRIORITY;
  1956. if (intel_engine_has_preemption(engine))
  1957. engine->i915->caps.scheduler |= I915_SCHEDULER_CAP_PREEMPTION;
  1958. }
  1959. static void
  1960. logical_ring_default_vfuncs(struct intel_engine_cs *engine)
  1961. {
1962. /* Default vfuncs which can be overridden by each engine. */
  1963. engine->init_hw = gen8_init_common_ring;
  1964. engine->reset.prepare = execlists_reset_prepare;
  1965. engine->reset.reset = execlists_reset;
  1966. engine->reset.finish = execlists_reset_finish;
  1967. engine->context_pin = execlists_context_pin;
  1968. engine->request_alloc = execlists_request_alloc;
  1969. engine->emit_flush = gen8_emit_flush;
  1970. engine->emit_breadcrumb = gen8_emit_breadcrumb;
  1971. engine->emit_breadcrumb_sz = gen8_emit_breadcrumb_sz;
  1972. engine->set_default_submission = intel_execlists_set_default_submission;
  1973. if (INTEL_GEN(engine->i915) < 11) {
  1974. engine->irq_enable = gen8_logical_ring_enable_irq;
  1975. engine->irq_disable = gen8_logical_ring_disable_irq;
  1976. } else {
  1977. /*
  1978. * TODO: On Gen11 interrupt masks need to be clear
1979. * to allow C6 entry. Keep interrupts enabled at all times
  1980. * and take the hit of generating extra interrupts
  1981. * until a more refined solution exists.
  1982. */
  1983. }
  1984. engine->emit_bb_start = gen8_emit_bb_start;
  1985. }
  1986. static inline void
  1987. logical_ring_default_irqs(struct intel_engine_cs *engine)
  1988. {
  1989. unsigned int shift = 0;
  1990. if (INTEL_GEN(engine->i915) < 11) {
  1991. const u8 irq_shifts[] = {
  1992. [RCS] = GEN8_RCS_IRQ_SHIFT,
  1993. [BCS] = GEN8_BCS_IRQ_SHIFT,
  1994. [VCS] = GEN8_VCS1_IRQ_SHIFT,
  1995. [VCS2] = GEN8_VCS2_IRQ_SHIFT,
  1996. [VECS] = GEN8_VECS_IRQ_SHIFT,
  1997. };
  1998. shift = irq_shifts[engine->id];
  1999. }
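/*
 * e.g. on Gen8 the render engine uses GEN8_RCS_IRQ_SHIFT, so the
 * user-interrupt and context-switch bits below land in the RCS field
 * of the shared GT interrupt registers; on Gen11+ the shift is left
 * at zero and the unshifted bits are used as-is.
 */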
  2000. engine->irq_enable_mask = GT_RENDER_USER_INTERRUPT << shift;
  2001. engine->irq_keep_mask = GT_CONTEXT_SWITCH_INTERRUPT << shift;
  2002. }
  2003. static void
  2004. logical_ring_setup(struct intel_engine_cs *engine)
  2005. {
  2006. intel_engine_setup_common(engine);
  2007. /* Intentionally left blank. */
  2008. engine->buffer = NULL;
  2009. tasklet_init(&engine->execlists.tasklet,
  2010. execlists_submission_tasklet, (unsigned long)engine);
  2011. logical_ring_default_vfuncs(engine);
  2012. logical_ring_default_irqs(engine);
  2013. }
  2014. static bool csb_force_mmio(struct drm_i915_private *i915)
  2015. {
  2016. /* Older GVT emulation depends upon intercepting CSB mmio */
  2017. return intel_vgpu_active(i915) && !intel_vgpu_has_hwsp_emulation(i915);
  2018. }
  2019. static int logical_ring_init(struct intel_engine_cs *engine)
  2020. {
  2021. struct drm_i915_private *i915 = engine->i915;
  2022. struct intel_engine_execlists * const execlists = &engine->execlists;
  2023. int ret;
  2024. ret = intel_engine_init_common(engine);
  2025. if (ret)
  2026. return ret;
  2027. if (HAS_LOGICAL_RING_ELSQ(i915)) {
  2028. execlists->submit_reg = i915->regs +
  2029. i915_mmio_reg_offset(RING_EXECLIST_SQ_CONTENTS(engine));
  2030. execlists->ctrl_reg = i915->regs +
  2031. i915_mmio_reg_offset(RING_EXECLIST_CONTROL(engine));
  2032. } else {
  2033. execlists->submit_reg = i915->regs +
  2034. i915_mmio_reg_offset(RING_ELSP(engine));
  2035. }
  2036. execlists->preempt_complete_status = ~0u;
  2037. if (i915->preempt_context) {
  2038. struct intel_context *ce =
  2039. to_intel_context(i915->preempt_context, engine);
  2040. execlists->preempt_complete_status =
  2041. upper_32_bits(ce->lrc_desc);
  2042. }
  2043. execlists->csb_read =
  2044. i915->regs + i915_mmio_reg_offset(RING_CONTEXT_STATUS_PTR(engine));
  2045. if (csb_force_mmio(i915)) {
  2046. execlists->csb_status = (u32 __force *)
  2047. (i915->regs + i915_mmio_reg_offset(RING_CONTEXT_STATUS_BUF_LO(engine, 0)));
  2048. execlists->csb_write = (u32 __force *)execlists->csb_read;
  2049. execlists->csb_write_reset =
  2050. _MASKED_FIELD(GEN8_CSB_WRITE_PTR_MASK,
  2051. GEN8_CSB_ENTRIES - 1);
  2052. } else {
  2053. execlists->csb_status =
  2054. &engine->status_page.page_addr[I915_HWS_CSB_BUF0_INDEX];
  2055. execlists->csb_write =
  2056. &engine->status_page.page_addr[intel_hws_csb_write_index(i915)];
  2057. execlists->csb_write_reset = GEN8_CSB_ENTRIES - 1;
  2058. }
  2059. reset_csb_pointers(execlists);
  2060. return 0;
  2061. }
  2062. int logical_render_ring_init(struct intel_engine_cs *engine)
  2063. {
  2064. struct drm_i915_private *dev_priv = engine->i915;
  2065. int ret;
  2066. logical_ring_setup(engine);
  2067. if (HAS_L3_DPF(dev_priv))
  2068. engine->irq_keep_mask |= GT_RENDER_L3_PARITY_ERROR_INTERRUPT;
  2069. /* Override some for render ring. */
  2070. if (INTEL_GEN(dev_priv) >= 9)
  2071. engine->init_hw = gen9_init_render_ring;
  2072. else
  2073. engine->init_hw = gen8_init_render_ring;
  2074. engine->init_context = gen8_init_rcs_context;
  2075. engine->emit_flush = gen8_emit_flush_render;
  2076. engine->emit_breadcrumb = gen8_emit_breadcrumb_rcs;
  2077. engine->emit_breadcrumb_sz = gen8_emit_breadcrumb_rcs_sz;
  2078. ret = logical_ring_init(engine);
  2079. if (ret)
  2080. return ret;
  2081. ret = intel_init_workaround_bb(engine);
  2082. if (ret) {
  2083. /*
  2084. * We continue even if we fail to initialize WA batch
  2085. * because we only expect rare glitches but nothing
  2086. * critical to prevent us from using GPU
  2087. */
  2088. DRM_ERROR("WA batch buffer initialization failed: %d\n",
  2089. ret);
  2090. }
  2091. intel_engine_init_workarounds(engine);
  2092. return 0;
  2093. }
  2094. int logical_xcs_ring_init(struct intel_engine_cs *engine)
  2095. {
  2096. logical_ring_setup(engine);
  2097. return logical_ring_init(engine);
  2098. }
  2099. static u32
  2100. make_rpcs(struct drm_i915_private *dev_priv)
  2101. {
  2102. bool subslice_pg = INTEL_INFO(dev_priv)->sseu.has_subslice_pg;
  2103. u8 slices = hweight8(INTEL_INFO(dev_priv)->sseu.slice_mask);
  2104. u8 subslices = hweight8(INTEL_INFO(dev_priv)->sseu.subslice_mask[0]);
  2105. u32 rpcs = 0;
  2106. /*
  2107. * No explicit RPCS request is needed to ensure full
  2108. * slice/subslice/EU enablement prior to Gen9.
  2109. */
  2110. if (INTEL_GEN(dev_priv) < 9)
  2111. return 0;
  2112. /*
  2113. * Since the SScount bitfield in GEN8_R_PWR_CLK_STATE is only three bits
2114. * wide and Icelake has up to eight subslices, special programming is
  2115. * needed in order to correctly enable all subslices.
  2116. *
  2117. * According to documentation software must consider the configuration
  2118. * as 2x4x8 and hardware will translate this to 1x8x8.
  2119. *
2120. * Furthermore, even though SScount is three bits, maximum documented
  2121. * value for it is four. From this some rules/restrictions follow:
  2122. *
  2123. * 1.
  2124. * If enabled subslice count is greater than four, two whole slices must
  2125. * be enabled instead.
  2126. *
  2127. * 2.
  2128. * When more than one slice is enabled, hardware ignores the subslice
  2129. * count altogether.
  2130. *
  2131. * From these restrictions it follows that it is not possible to enable
2132. * a subslice count between the SScount maximum of four and the maximum
2133. * number available on a particular SKU. Either all
  2134. * subslices are enabled, or a count between one and four on the first
  2135. * slice.
  2136. */
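/*
 * Worked example of the rules above: a native Gen11 1x8x8 part
 * (slices == 1, subslices == 8) takes the branch below, disabling
 * subslice powergating and requesting two slices instead, and the
 * hardware translates the resulting 2x4x8 request back to 1x8x8.
 */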
  2137. if (IS_GEN11(dev_priv) && slices == 1 && subslices >= 4) {
  2138. GEM_BUG_ON(subslices & 1);
  2139. subslice_pg = false;
  2140. slices *= 2;
  2141. }
  2142. /*
  2143. * Starting in Gen9, render power gating can leave
  2144. * slice/subslice/EU in a partially enabled state. We
  2145. * must make an explicit request through RPCS for full
  2146. * enablement.
  2147. */
  2148. if (INTEL_INFO(dev_priv)->sseu.has_slice_pg) {
  2149. u32 mask, val = slices;
  2150. if (INTEL_GEN(dev_priv) >= 11) {
  2151. mask = GEN11_RPCS_S_CNT_MASK;
  2152. val <<= GEN11_RPCS_S_CNT_SHIFT;
  2153. } else {
  2154. mask = GEN8_RPCS_S_CNT_MASK;
  2155. val <<= GEN8_RPCS_S_CNT_SHIFT;
  2156. }
  2157. GEM_BUG_ON(val & ~mask);
  2158. val &= mask;
  2159. rpcs |= GEN8_RPCS_ENABLE | GEN8_RPCS_S_CNT_ENABLE | val;
  2160. }
  2161. if (subslice_pg) {
  2162. u32 val = subslices;
  2163. val <<= GEN8_RPCS_SS_CNT_SHIFT;
  2164. GEM_BUG_ON(val & ~GEN8_RPCS_SS_CNT_MASK);
  2165. val &= GEN8_RPCS_SS_CNT_MASK;
  2166. rpcs |= GEN8_RPCS_ENABLE | GEN8_RPCS_SS_CNT_ENABLE | val;
  2167. }
  2168. if (INTEL_INFO(dev_priv)->sseu.has_eu_pg) {
  2169. u32 val;
  2170. val = INTEL_INFO(dev_priv)->sseu.eu_per_subslice <<
  2171. GEN8_RPCS_EU_MIN_SHIFT;
  2172. GEM_BUG_ON(val & ~GEN8_RPCS_EU_MIN_MASK);
  2173. val &= GEN8_RPCS_EU_MIN_MASK;
  2174. rpcs |= val;
  2175. val = INTEL_INFO(dev_priv)->sseu.eu_per_subslice <<
  2176. GEN8_RPCS_EU_MAX_SHIFT;
  2177. GEM_BUG_ON(val & ~GEN8_RPCS_EU_MAX_MASK);
  2178. val &= GEN8_RPCS_EU_MAX_MASK;
  2179. rpcs |= val;
  2180. rpcs |= GEN8_RPCS_ENABLE;
  2181. }
  2182. return rpcs;
  2183. }
  2184. static u32 intel_lr_indirect_ctx_offset(struct intel_engine_cs *engine)
  2185. {
  2186. u32 indirect_ctx_offset;
  2187. switch (INTEL_GEN(engine->i915)) {
  2188. default:
  2189. MISSING_CASE(INTEL_GEN(engine->i915));
  2190. /* fall through */
  2191. case 11:
  2192. indirect_ctx_offset =
  2193. GEN11_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
  2194. break;
  2195. case 10:
  2196. indirect_ctx_offset =
  2197. GEN10_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
  2198. break;
  2199. case 9:
  2200. indirect_ctx_offset =
  2201. GEN9_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
  2202. break;
  2203. case 8:
  2204. indirect_ctx_offset =
  2205. GEN8_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
  2206. break;
  2207. }
  2208. return indirect_ctx_offset;
  2209. }
  2210. static void execlists_init_reg_state(u32 *regs,
  2211. struct i915_gem_context *ctx,
  2212. struct intel_engine_cs *engine,
  2213. struct intel_ring *ring)
  2214. {
  2215. struct drm_i915_private *dev_priv = engine->i915;
  2216. struct i915_hw_ppgtt *ppgtt = ctx->ppgtt ?: dev_priv->mm.aliasing_ppgtt;
  2217. u32 base = engine->mmio_base;
  2218. bool rcs = engine->class == RENDER_CLASS;
  2219. /* A context is actually a big batch buffer with several
  2220. * MI_LOAD_REGISTER_IMM commands followed by (reg, value) pairs. The
  2221. * values we are setting here are only for the first context restore:
  2222. * on a subsequent save, the GPU will recreate this batchbuffer with new
  2223. * values (including all the missing MI_LOAD_REGISTER_IMM commands that
  2224. * we are not initializing here).
  2225. */
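/*
 * Layout note, based on how CTX_REG() is used here and fixed up
 * elsewhere in this file: regs[IDX] holds the mmio offset of the
 * register and regs[IDX + 1] holds its value, which is why later
 * updates write to e.g. regs[CTX_RING_HEAD + 1] or
 * regs[CTX_RCS_INDIRECT_CTX + 1].
 */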
  2226. regs[CTX_LRI_HEADER_0] = MI_LOAD_REGISTER_IMM(rcs ? 14 : 11) |
  2227. MI_LRI_FORCE_POSTED;
  2228. CTX_REG(regs, CTX_CONTEXT_CONTROL, RING_CONTEXT_CONTROL(engine),
  2229. _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT) |
  2230. _MASKED_BIT_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH));
  2231. if (INTEL_GEN(dev_priv) < 11) {
  2232. regs[CTX_CONTEXT_CONTROL + 1] |=
  2233. _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_SAVE_INHIBIT |
  2234. CTX_CTRL_RS_CTX_ENABLE);
  2235. }
  2236. CTX_REG(regs, CTX_RING_HEAD, RING_HEAD(base), 0);
  2237. CTX_REG(regs, CTX_RING_TAIL, RING_TAIL(base), 0);
  2238. CTX_REG(regs, CTX_RING_BUFFER_START, RING_START(base), 0);
  2239. CTX_REG(regs, CTX_RING_BUFFER_CONTROL, RING_CTL(base),
  2240. RING_CTL_SIZE(ring->size) | RING_VALID);
  2241. CTX_REG(regs, CTX_BB_HEAD_U, RING_BBADDR_UDW(base), 0);
  2242. CTX_REG(regs, CTX_BB_HEAD_L, RING_BBADDR(base), 0);
  2243. CTX_REG(regs, CTX_BB_STATE, RING_BBSTATE(base), RING_BB_PPGTT);
  2244. CTX_REG(regs, CTX_SECOND_BB_HEAD_U, RING_SBBADDR_UDW(base), 0);
  2245. CTX_REG(regs, CTX_SECOND_BB_HEAD_L, RING_SBBADDR(base), 0);
  2246. CTX_REG(regs, CTX_SECOND_BB_STATE, RING_SBBSTATE(base), 0);
  2247. if (rcs) {
  2248. struct i915_ctx_workarounds *wa_ctx = &engine->wa_ctx;
  2249. CTX_REG(regs, CTX_RCS_INDIRECT_CTX, RING_INDIRECT_CTX(base), 0);
  2250. CTX_REG(regs, CTX_RCS_INDIRECT_CTX_OFFSET,
  2251. RING_INDIRECT_CTX_OFFSET(base), 0);
  2252. if (wa_ctx->indirect_ctx.size) {
  2253. u32 ggtt_offset = i915_ggtt_offset(wa_ctx->vma);
  2254. regs[CTX_RCS_INDIRECT_CTX + 1] =
  2255. (ggtt_offset + wa_ctx->indirect_ctx.offset) |
  2256. (wa_ctx->indirect_ctx.size / CACHELINE_BYTES);
  2257. regs[CTX_RCS_INDIRECT_CTX_OFFSET + 1] =
  2258. intel_lr_indirect_ctx_offset(engine) << 6;
  2259. }
  2260. CTX_REG(regs, CTX_BB_PER_CTX_PTR, RING_BB_PER_CTX_PTR(base), 0);
  2261. if (wa_ctx->per_ctx.size) {
  2262. u32 ggtt_offset = i915_ggtt_offset(wa_ctx->vma);
  2263. regs[CTX_BB_PER_CTX_PTR + 1] =
  2264. (ggtt_offset + wa_ctx->per_ctx.offset) | 0x01;
  2265. }
  2266. }
  2267. regs[CTX_LRI_HEADER_1] = MI_LOAD_REGISTER_IMM(9) | MI_LRI_FORCE_POSTED;
  2268. CTX_REG(regs, CTX_CTX_TIMESTAMP, RING_CTX_TIMESTAMP(base), 0);
2269. /* PDP values will be assigned later if needed */
  2270. CTX_REG(regs, CTX_PDP3_UDW, GEN8_RING_PDP_UDW(engine, 3), 0);
  2271. CTX_REG(regs, CTX_PDP3_LDW, GEN8_RING_PDP_LDW(engine, 3), 0);
  2272. CTX_REG(regs, CTX_PDP2_UDW, GEN8_RING_PDP_UDW(engine, 2), 0);
  2273. CTX_REG(regs, CTX_PDP2_LDW, GEN8_RING_PDP_LDW(engine, 2), 0);
  2274. CTX_REG(regs, CTX_PDP1_UDW, GEN8_RING_PDP_UDW(engine, 1), 0);
  2275. CTX_REG(regs, CTX_PDP1_LDW, GEN8_RING_PDP_LDW(engine, 1), 0);
  2276. CTX_REG(regs, CTX_PDP0_UDW, GEN8_RING_PDP_UDW(engine, 0), 0);
  2277. CTX_REG(regs, CTX_PDP0_LDW, GEN8_RING_PDP_LDW(engine, 0), 0);
  2278. if (ppgtt && i915_vm_is_48bit(&ppgtt->vm)) {
  2279. /* 64b PPGTT (48bit canonical)
  2280. * PDP0_DESCRIPTOR contains the base address to PML4 and
  2281. * other PDP Descriptors are ignored.
  2282. */
  2283. ASSIGN_CTX_PML4(ppgtt, regs);
  2284. }
  2285. if (rcs) {
  2286. regs[CTX_LRI_HEADER_2] = MI_LOAD_REGISTER_IMM(1);
  2287. CTX_REG(regs, CTX_R_PWR_CLK_STATE, GEN8_R_PWR_CLK_STATE,
  2288. make_rpcs(dev_priv));
  2289. i915_oa_init_reg_state(engine, ctx, regs);
  2290. }
  2291. regs[CTX_END] = MI_BATCH_BUFFER_END;
  2292. if (INTEL_GEN(dev_priv) >= 10)
  2293. regs[CTX_END] |= BIT(0);
  2294. }
  2295. static int
  2296. populate_lr_context(struct i915_gem_context *ctx,
  2297. struct drm_i915_gem_object *ctx_obj,
  2298. struct intel_engine_cs *engine,
  2299. struct intel_ring *ring)
  2300. {
  2301. void *vaddr;
  2302. u32 *regs;
  2303. int ret;
  2304. ret = i915_gem_object_set_to_cpu_domain(ctx_obj, true);
  2305. if (ret) {
  2306. DRM_DEBUG_DRIVER("Could not set to CPU domain\n");
  2307. return ret;
  2308. }
  2309. vaddr = i915_gem_object_pin_map(ctx_obj, I915_MAP_WB);
  2310. if (IS_ERR(vaddr)) {
  2311. ret = PTR_ERR(vaddr);
  2312. DRM_DEBUG_DRIVER("Could not map object pages! (%d)\n", ret);
  2313. return ret;
  2314. }
  2315. ctx_obj->mm.dirty = true;
  2316. if (engine->default_state) {
  2317. /*
  2318. * We only want to copy over the template context state;
  2319. * skipping over the headers reserved for GuC communication,
  2320. * leaving those as zero.
  2321. */
  2322. const unsigned long start = LRC_HEADER_PAGES * PAGE_SIZE;
  2323. void *defaults;
  2324. defaults = i915_gem_object_pin_map(engine->default_state,
  2325. I915_MAP_WB);
  2326. if (IS_ERR(defaults)) {
  2327. ret = PTR_ERR(defaults);
  2328. goto err_unpin_ctx;
  2329. }
  2330. memcpy(vaddr + start, defaults + start, engine->context_size);
  2331. i915_gem_object_unpin_map(engine->default_state);
  2332. }
  2333. /* The second page of the context object contains some fields which must
  2334. * be set up prior to the first execution. */
  2335. regs = vaddr + LRC_STATE_PN * PAGE_SIZE;
  2336. execlists_init_reg_state(regs, ctx, engine, ring);
  2337. if (!engine->default_state)
  2338. regs[CTX_CONTEXT_CONTROL + 1] |=
  2339. _MASKED_BIT_ENABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT);
  2340. if (ctx == ctx->i915->preempt_context && INTEL_GEN(engine->i915) < 11)
  2341. regs[CTX_CONTEXT_CONTROL + 1] |=
  2342. _MASKED_BIT_ENABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT |
  2343. CTX_CTRL_ENGINE_CTX_SAVE_INHIBIT);
  2344. err_unpin_ctx:
  2345. i915_gem_object_unpin_map(ctx_obj);
  2346. return ret;
  2347. }
  2348. static int execlists_context_deferred_alloc(struct i915_gem_context *ctx,
  2349. struct intel_engine_cs *engine,
  2350. struct intel_context *ce)
  2351. {
  2352. struct drm_i915_gem_object *ctx_obj;
  2353. struct i915_vma *vma;
  2354. uint32_t context_size;
  2355. struct intel_ring *ring;
  2356. struct i915_timeline *timeline;
  2357. int ret;
  2358. if (ce->state)
  2359. return 0;
  2360. context_size = round_up(engine->context_size, I915_GTT_PAGE_SIZE);
  2361. /*
  2362. * Before the actual start of the context image, we insert a few pages
  2363. * for our own use and for sharing with the GuC.
  2364. */
  2365. context_size += LRC_HEADER_PAGES * PAGE_SIZE;
  2366. ctx_obj = i915_gem_object_create(ctx->i915, context_size);
  2367. if (IS_ERR(ctx_obj))
  2368. return PTR_ERR(ctx_obj);
  2369. vma = i915_vma_instance(ctx_obj, &ctx->i915->ggtt.vm, NULL);
  2370. if (IS_ERR(vma)) {
  2371. ret = PTR_ERR(vma);
  2372. goto error_deref_obj;
  2373. }
  2374. timeline = i915_timeline_create(ctx->i915, ctx->name);
  2375. if (IS_ERR(timeline)) {
  2376. ret = PTR_ERR(timeline);
  2377. goto error_deref_obj;
  2378. }
  2379. ring = intel_engine_create_ring(engine, timeline, ctx->ring_size);
  2380. i915_timeline_put(timeline);
  2381. if (IS_ERR(ring)) {
  2382. ret = PTR_ERR(ring);
  2383. goto error_deref_obj;
  2384. }
  2385. ret = populate_lr_context(ctx, ctx_obj, engine, ring);
  2386. if (ret) {
  2387. DRM_DEBUG_DRIVER("Failed to populate LRC: %d\n", ret);
  2388. goto error_ring_free;
  2389. }
  2390. ce->ring = ring;
  2391. ce->state = vma;
  2392. return 0;
  2393. error_ring_free:
  2394. intel_ring_free(ring);
  2395. error_deref_obj:
  2396. i915_gem_object_put(ctx_obj);
  2397. return ret;
  2398. }
  2399. void intel_lr_context_resume(struct drm_i915_private *i915)
  2400. {
  2401. struct intel_engine_cs *engine;
  2402. struct i915_gem_context *ctx;
  2403. enum intel_engine_id id;
  2404. /*
  2405. * Because we emit WA_TAIL_DWORDS there may be a disparity
  2406. * between our bookkeeping in ce->ring->head and ce->ring->tail and
  2407. * that stored in context. As we only write new commands from
  2408. * ce->ring->tail onwards, everything before that is junk. If the GPU
  2409. * starts reading from its RING_HEAD from the context, it may try to
  2410. * execute that junk and die.
  2411. *
  2412. * So to avoid that we reset the context images upon resume. For
  2413. * simplicity, we just zero everything out.
  2414. */
  2415. list_for_each_entry(ctx, &i915->contexts.list, link) {
  2416. for_each_engine(engine, i915, id) {
  2417. struct intel_context *ce =
  2418. to_intel_context(ctx, engine);
  2419. if (!ce->state)
  2420. continue;
  2421. intel_ring_reset(ce->ring, 0);
  2422. if (ce->pin_count) { /* otherwise done in context_pin */
  2423. u32 *regs = ce->lrc_reg_state;
  2424. regs[CTX_RING_HEAD + 1] = ce->ring->head;
  2425. regs[CTX_RING_TAIL + 1] = ce->ring->tail;
  2426. }
  2427. }
  2428. }
  2429. }
  2430. #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
  2431. #include "selftests/intel_lrc.c"
  2432. #endif