intel_hangcheck.c

/*
 * Copyright © 2016 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 */

#include <linux/kthread.h>

#include "../i915_selftest.h"
#include "i915_random.h"
#include "igt_flush_test.h"

#include "mock_context.h"
#include "mock_drm.h"

#define IGT_IDLE_TIMEOUT 50 /* ms; time to wait after flushing between tests */

struct hang {
        struct drm_i915_private *i915;
        struct drm_i915_gem_object *hws;
        struct drm_i915_gem_object *obj;
        struct i915_gem_context *ctx;
        u32 *seqno;
        u32 *batch;
};
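
/*
 * The hang fixture: a kernel context plus two internal objects. h->obj holds
 * a self-referencing batch that spins until its first dword is overwritten
 * with MI_BATCH_BUFFER_END, and h->hws is a CPU-visible page the batch writes
 * its seqno into so the test can tell when the spinner has actually started
 * executing on the GPU.
 */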
static int hang_init(struct hang *h, struct drm_i915_private *i915)
{
        void *vaddr;
        int err;

        memset(h, 0, sizeof(*h));
        h->i915 = i915;

        h->ctx = kernel_context(i915);
        if (IS_ERR(h->ctx))
                return PTR_ERR(h->ctx);

        h->hws = i915_gem_object_create_internal(i915, PAGE_SIZE);
        if (IS_ERR(h->hws)) {
                err = PTR_ERR(h->hws);
                goto err_ctx;
        }

        h->obj = i915_gem_object_create_internal(i915, PAGE_SIZE);
        if (IS_ERR(h->obj)) {
                err = PTR_ERR(h->obj);
                goto err_hws;
        }

        i915_gem_object_set_cache_level(h->hws, I915_CACHE_LLC);
        vaddr = i915_gem_object_pin_map(h->hws, I915_MAP_WB);
        if (IS_ERR(vaddr)) {
                err = PTR_ERR(vaddr);
                goto err_obj;
        }
        h->seqno = memset(vaddr, 0xff, PAGE_SIZE);

        vaddr = i915_gem_object_pin_map(h->obj,
                                        HAS_LLC(i915) ? I915_MAP_WB : I915_MAP_WC);
        if (IS_ERR(vaddr)) {
                err = PTR_ERR(vaddr);
                goto err_unpin_hws;
        }
        h->batch = vaddr;

        return 0;

err_unpin_hws:
        i915_gem_object_unpin_map(h->hws);
err_obj:
        i915_gem_object_put(h->obj);
err_hws:
        i915_gem_object_put(h->hws);
err_ctx:
        kernel_context_close(h->ctx);
        return err;
}
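
/*
 * Each request gets its own dword slot in the HWS page, indexed by
 * fence.context (modulo the page), so concurrent spinners on different
 * contexts do not clobber each other's "I have started" marker.
 */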
static u64 hws_address(const struct i915_vma *hws,
                       const struct i915_request *rq)
{
        return hws->node.start + offset_in_page(sizeof(u32)*rq->fence.context);
}
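
/*
 * Build the hanging batch for @rq: store the request's seqno into its HWS
 * slot (so wait_until_running() can observe it), pad with noops, then emit
 * MI_BATCH_BUFFER_START pointing back at the start of the batch itself. The
 * trailing MI_BATCH_BUFFER_END is only reached once the first dword of the
 * batch is rewritten (see hang_fini()). The exact opcodes vary per gen.
 */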
static int emit_recurse_batch(struct hang *h,
                              struct i915_request *rq)
{
        struct drm_i915_private *i915 = h->i915;
        struct i915_address_space *vm =
                rq->ctx->ppgtt ? &rq->ctx->ppgtt->base : &i915->ggtt.base;
        struct i915_vma *hws, *vma;
        unsigned int flags;
        u32 *batch;
        int err;

        vma = i915_vma_instance(h->obj, vm, NULL);
        if (IS_ERR(vma))
                return PTR_ERR(vma);

        hws = i915_vma_instance(h->hws, vm, NULL);
        if (IS_ERR(hws))
                return PTR_ERR(hws);

        err = i915_vma_pin(vma, 0, 0, PIN_USER);
        if (err)
                return err;

        err = i915_vma_pin(hws, 0, 0, PIN_USER);
        if (err)
                goto unpin_vma;

        i915_vma_move_to_active(vma, rq, 0);
        if (!i915_gem_object_has_active_reference(vma->obj)) {
                i915_gem_object_get(vma->obj);
                i915_gem_object_set_active_reference(vma->obj);
        }

        i915_vma_move_to_active(hws, rq, 0);
        if (!i915_gem_object_has_active_reference(hws->obj)) {
                i915_gem_object_get(hws->obj);
                i915_gem_object_set_active_reference(hws->obj);
        }

        batch = h->batch;
        if (INTEL_GEN(i915) >= 8) {
                *batch++ = MI_STORE_DWORD_IMM_GEN4;
                *batch++ = lower_32_bits(hws_address(hws, rq));
                *batch++ = upper_32_bits(hws_address(hws, rq));
                *batch++ = rq->fence.seqno;
                *batch++ = MI_ARB_CHECK;

                memset(batch, 0, 1024);
                batch += 1024 / sizeof(*batch);

                *batch++ = MI_ARB_CHECK;
                *batch++ = MI_BATCH_BUFFER_START | 1 << 8 | 1;
                *batch++ = lower_32_bits(vma->node.start);
                *batch++ = upper_32_bits(vma->node.start);
        } else if (INTEL_GEN(i915) >= 6) {
                *batch++ = MI_STORE_DWORD_IMM_GEN4;
                *batch++ = 0;
                *batch++ = lower_32_bits(hws_address(hws, rq));
                *batch++ = rq->fence.seqno;
                *batch++ = MI_ARB_CHECK;

                memset(batch, 0, 1024);
                batch += 1024 / sizeof(*batch);

                *batch++ = MI_ARB_CHECK;
                *batch++ = MI_BATCH_BUFFER_START | 1 << 8;
                *batch++ = lower_32_bits(vma->node.start);
        } else if (INTEL_GEN(i915) >= 4) {
                *batch++ = MI_STORE_DWORD_IMM_GEN4 | 1 << 22;
                *batch++ = 0;
                *batch++ = lower_32_bits(hws_address(hws, rq));
                *batch++ = rq->fence.seqno;
                *batch++ = MI_ARB_CHECK;

                memset(batch, 0, 1024);
                batch += 1024 / sizeof(*batch);

                *batch++ = MI_ARB_CHECK;
                *batch++ = MI_BATCH_BUFFER_START | 2 << 6;
                *batch++ = lower_32_bits(vma->node.start);
        } else {
                *batch++ = MI_STORE_DWORD_IMM;
                *batch++ = lower_32_bits(hws_address(hws, rq));
                *batch++ = rq->fence.seqno;
                *batch++ = MI_ARB_CHECK;

                memset(batch, 0, 1024);
                batch += 1024 / sizeof(*batch);

                *batch++ = MI_ARB_CHECK;
                *batch++ = MI_BATCH_BUFFER_START | 2 << 6 | 1;
                *batch++ = lower_32_bits(vma->node.start);
        }
        *batch++ = MI_BATCH_BUFFER_END; /* not reached */
        i915_gem_chipset_flush(h->i915);

        flags = 0;
        if (INTEL_GEN(vm->i915) <= 5)
                flags |= I915_DISPATCH_SECURE;

        err = rq->engine->emit_bb_start(rq, vma->node.start, PAGE_SIZE, flags);

        i915_vma_unpin(hws);
unpin_vma:
        i915_vma_unpin(vma);
        return err;
}
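
/*
 * Allocate a request carrying the hanging batch. If the previous batch
 * object is still active (a prior spinner has not been retired yet), swap in
 * a fresh internal object so we never rewrite a batch the GPU may still be
 * reading.
 */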
static struct i915_request *
hang_create_request(struct hang *h, struct intel_engine_cs *engine)
{
        struct i915_request *rq;
        int err;

        if (i915_gem_object_is_active(h->obj)) {
                struct drm_i915_gem_object *obj;
                void *vaddr;

                obj = i915_gem_object_create_internal(h->i915, PAGE_SIZE);
                if (IS_ERR(obj))
                        return ERR_CAST(obj);

                vaddr = i915_gem_object_pin_map(obj,
                                                HAS_LLC(h->i915) ? I915_MAP_WB : I915_MAP_WC);
                if (IS_ERR(vaddr)) {
                        i915_gem_object_put(obj);
                        return ERR_CAST(vaddr);
                }

                i915_gem_object_unpin_map(h->obj);
                i915_gem_object_put(h->obj);

                h->obj = obj;
                h->batch = vaddr;
        }

        rq = i915_request_alloc(engine, h->ctx);
        if (IS_ERR(rq))
                return rq;

        err = emit_recurse_batch(h, rq);
        if (err) {
                __i915_request_add(rq, false);
                return ERR_PTR(err);
        }

        return rq;
}

static u32 hws_seqno(const struct hang *h, const struct i915_request *rq)
{
        return READ_ONCE(h->seqno[rq->fence.context % (PAGE_SIZE/sizeof(u32))]);
}

static void hang_fini(struct hang *h)
{
        *h->batch = MI_BATCH_BUFFER_END;
        i915_gem_chipset_flush(h->i915);

        i915_gem_object_unpin_map(h->obj);
        i915_gem_object_put(h->obj);

        i915_gem_object_unpin_map(h->hws);
        i915_gem_object_put(h->hws);

        kernel_context_close(h->ctx);

        igt_flush_test(h->i915, I915_WAIT_LOCKED);
}
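
/*
 * The spinner writes the request's seqno to the HWS page as its very first
 * instruction, so "running" here means the batch has begun executing: poll
 * quickly (10us) for the common case, then fall back to waiting up to 1s.
 */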
static bool wait_until_running(struct hang *h, struct i915_request *rq)
{
        return !(wait_for_us(i915_seqno_passed(hws_seqno(h, rq),
                                               rq->fence.seqno),
                             10) &&
                 wait_for(i915_seqno_passed(hws_seqno(h, rq),
                                            rq->fence.seqno),
                          1000));
}

static int igt_hang_sanitycheck(void *arg)
{
        struct drm_i915_private *i915 = arg;
        struct i915_request *rq;
        struct intel_engine_cs *engine;
        enum intel_engine_id id;
        struct hang h;
        int err;

        /* Basic check that we can execute our hanging batch */

        mutex_lock(&i915->drm.struct_mutex);
        err = hang_init(&h, i915);
        if (err)
                goto unlock;

        for_each_engine(engine, i915, id) {
                long timeout;

                if (!intel_engine_can_store_dword(engine))
                        continue;

                rq = hang_create_request(&h, engine);
                if (IS_ERR(rq)) {
                        err = PTR_ERR(rq);
                        pr_err("Failed to create request for %s, err=%d\n",
                               engine->name, err);
                        goto fini;
                }

                i915_request_get(rq);

                *h.batch = MI_BATCH_BUFFER_END;
                i915_gem_chipset_flush(i915);

                __i915_request_add(rq, true);

                timeout = i915_request_wait(rq,
                                            I915_WAIT_LOCKED,
                                            MAX_SCHEDULE_TIMEOUT);
                i915_request_put(rq);

                if (timeout < 0) {
                        err = timeout;
                        pr_err("Wait for request failed on %s, err=%d\n",
                               engine->name, err);
                        goto fini;
                }
        }

fini:
        hang_fini(&h);
unlock:
        mutex_unlock(&i915->drm.struct_mutex);
        return err;
}
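
/*
 * Serialise the selftest against the driver's own reset handling: take
 * I915_RESET_BACKOFF plus every per-engine reset bit so that neither a
 * concurrent reset nor hangcheck can race with the test's fake hangs.
 */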
static void global_reset_lock(struct drm_i915_private *i915)
{
        struct intel_engine_cs *engine;
        enum intel_engine_id id;

        pr_debug("%s: current gpu_error=%08lx\n",
                 __func__, i915->gpu_error.flags);

        while (test_and_set_bit(I915_RESET_BACKOFF, &i915->gpu_error.flags))
                wait_event(i915->gpu_error.reset_queue,
                           !test_bit(I915_RESET_BACKOFF,
                                     &i915->gpu_error.flags));

        for_each_engine(engine, i915, id) {
                while (test_and_set_bit(I915_RESET_ENGINE + id,
                                        &i915->gpu_error.flags))
                        wait_on_bit(&i915->gpu_error.flags,
                                    I915_RESET_ENGINE + id,
                                    TASK_UNINTERRUPTIBLE);
        }
}

static void global_reset_unlock(struct drm_i915_private *i915)
{
        struct intel_engine_cs *engine;
        enum intel_engine_id id;

        for_each_engine(engine, i915, id)
                clear_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags);

        clear_bit(I915_RESET_BACKOFF, &i915->gpu_error.flags);
        wake_up_all(&i915->gpu_error.reset_queue);
}

static int igt_global_reset(void *arg)
{
        struct drm_i915_private *i915 = arg;
        unsigned int reset_count;
        int err = 0;

        /* Check that we can issue a global GPU reset */

        global_reset_lock(i915);
        set_bit(I915_RESET_HANDOFF, &i915->gpu_error.flags);

        mutex_lock(&i915->drm.struct_mutex);
        reset_count = i915_reset_count(&i915->gpu_error);

        i915_reset(i915, ALL_ENGINES, NULL);

        if (i915_reset_count(&i915->gpu_error) == reset_count) {
                pr_err("No GPU reset recorded!\n");
                err = -EINVAL;
        }
        mutex_unlock(&i915->drm.struct_mutex);

        GEM_BUG_ON(test_bit(I915_RESET_HANDOFF, &i915->gpu_error.flags));
        global_reset_unlock(i915);

        if (i915_terminally_wedged(&i915->gpu_error))
                err = -EIO;

        return err;
}

static bool wait_for_idle(struct intel_engine_cs *engine)
{
        return wait_for(intel_engine_is_idle(engine), IGT_IDLE_TIMEOUT) == 0;
}
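
/*
 * Exercise i915_reset_engine() on each engine in turn. With @active unset
 * the engine is idle, so the per-engine reset counter should not move; with
 * @active set a fresh spinner is submitted before every reset and we expect
 * exactly one engine-reset increment per iteration, and never a full GPU
 * reset.
 */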
static int __igt_reset_engine(struct drm_i915_private *i915, bool active)
{
        struct intel_engine_cs *engine;
        enum intel_engine_id id;
        struct hang h;
        int err = 0;

        /* Check that we can issue an engine reset on an idle engine (no-op) */

        if (!intel_has_reset_engine(i915))
                return 0;

        if (active) {
                mutex_lock(&i915->drm.struct_mutex);
                err = hang_init(&h, i915);
                mutex_unlock(&i915->drm.struct_mutex);
                if (err)
                        return err;
        }

        for_each_engine(engine, i915, id) {
                unsigned int reset_count, reset_engine_count;
                IGT_TIMEOUT(end_time);

                if (active && !intel_engine_can_store_dword(engine))
                        continue;

                if (!wait_for_idle(engine)) {
                        pr_err("%s failed to idle before reset\n",
                               engine->name);
                        err = -EIO;
                        break;
                }

                reset_count = i915_reset_count(&i915->gpu_error);
                reset_engine_count = i915_reset_engine_count(&i915->gpu_error,
                                                             engine);

                set_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags);
                do {
                        u32 seqno = intel_engine_get_seqno(engine);

                        if (active) {
                                struct i915_request *rq;

                                mutex_lock(&i915->drm.struct_mutex);
                                rq = hang_create_request(&h, engine);
                                if (IS_ERR(rq)) {
                                        err = PTR_ERR(rq);
                                        mutex_unlock(&i915->drm.struct_mutex);
                                        break;
                                }

                                i915_request_get(rq);
                                __i915_request_add(rq, true);
                                mutex_unlock(&i915->drm.struct_mutex);

                                if (!wait_until_running(&h, rq)) {
                                        struct drm_printer p = drm_info_printer(i915->drm.dev);

                                        pr_err("%s: Failed to start request %x, at %x\n",
                                               __func__, rq->fence.seqno, hws_seqno(&h, rq));
                                        intel_engine_dump(engine, &p,
                                                          "%s\n", engine->name);

                                        i915_request_put(rq);
                                        err = -EIO;
                                        break;
                                }

                                GEM_BUG_ON(!rq->global_seqno);
                                seqno = rq->global_seqno - 1;
                                i915_request_put(rq);
                        }

                        err = i915_reset_engine(engine, NULL);
                        if (err) {
                                pr_err("i915_reset_engine failed\n");
                                break;
                        }

                        if (i915_reset_count(&i915->gpu_error) != reset_count) {
                                pr_err("Full GPU reset recorded! (engine reset expected)\n");
                                err = -EINVAL;
                                break;
                        }

                        reset_engine_count += active;
                        if (i915_reset_engine_count(&i915->gpu_error, engine) !=
                            reset_engine_count) {
                                pr_err("%s engine reset %srecorded!\n",
                                       engine->name, active ? "not " : "");
                                err = -EINVAL;
                                break;
                        }

                        if (!wait_for_idle(engine)) {
                                struct drm_printer p =
                                        drm_info_printer(i915->drm.dev);

                                pr_err("%s failed to idle after reset\n",
                                       engine->name);
                                intel_engine_dump(engine, &p,
                                                  "%s\n", engine->name);

                                err = -EIO;
                                break;
                        }
                } while (time_before(jiffies, end_time));
                clear_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags);

                if (err)
                        break;

                err = igt_flush_test(i915, 0);
                if (err)
                        break;
        }

        if (i915_terminally_wedged(&i915->gpu_error))
                err = -EIO;

        if (active) {
                mutex_lock(&i915->drm.struct_mutex);
                hang_fini(&h);
                mutex_unlock(&i915->drm.struct_mutex);
        }

        return err;
}

static int igt_reset_idle_engine(void *arg)
{
        return __igt_reset_engine(arg, false);
}

static int igt_reset_active_engine(void *arg)
{
        return __igt_reset_engine(arg, true);
}

struct active_engine {
        struct task_struct *task;
        struct intel_engine_cs *engine;
        unsigned long resets;
        unsigned int flags;
};

#define TEST_ACTIVE     BIT(0)
#define TEST_OTHERS     BIT(1)
#define TEST_SELF       BIT(2)
#define TEST_PRIORITY   BIT(3)
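
/*
 * Background load for the "others" phases: a kthread that keeps up to 8
 * requests in flight on its engine (optionally with randomised priorities)
 * and bails out with -EIO if any of them fails to complete within a second,
 * i.e. if a reset on another engine disturbed it.
 */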
static int active_engine(void *data)
{
        I915_RND_STATE(prng);
        struct active_engine *arg = data;
        struct intel_engine_cs *engine = arg->engine;
        struct i915_request *rq[8] = {};
        struct i915_gem_context *ctx[ARRAY_SIZE(rq)];
        struct drm_file *file;
        unsigned long count = 0;
        int err = 0;

        file = mock_file(engine->i915);
        if (IS_ERR(file))
                return PTR_ERR(file);

        for (count = 0; count < ARRAY_SIZE(ctx); count++) {
                mutex_lock(&engine->i915->drm.struct_mutex);
                ctx[count] = live_context(engine->i915, file);
                mutex_unlock(&engine->i915->drm.struct_mutex);
                if (IS_ERR(ctx[count])) {
                        err = PTR_ERR(ctx[count]);
                        while (--count)
                                i915_gem_context_put(ctx[count]);
                        goto err_file;
                }
        }

        while (!kthread_should_stop()) {
                unsigned int idx = count++ & (ARRAY_SIZE(rq) - 1);
                struct i915_request *old = rq[idx];
                struct i915_request *new;

                mutex_lock(&engine->i915->drm.struct_mutex);
                new = i915_request_alloc(engine, ctx[idx]);
                if (IS_ERR(new)) {
                        mutex_unlock(&engine->i915->drm.struct_mutex);
                        err = PTR_ERR(new);
                        break;
                }

                if (arg->flags & TEST_PRIORITY)
                        ctx[idx]->sched.priority =
                                i915_prandom_u32_max_state(512, &prng);

                rq[idx] = i915_request_get(new);
                i915_request_add(new);
                mutex_unlock(&engine->i915->drm.struct_mutex);

                if (old) {
                        if (i915_request_wait(old, 0, HZ) < 0) {
                                GEM_TRACE("%s timed out.\n", engine->name);
                                GEM_TRACE_DUMP();

                                i915_gem_set_wedged(engine->i915);
                                i915_request_put(old);
                                err = -EIO;
                                break;
                        }
                        i915_request_put(old);
                }

                cond_resched();
        }

        for (count = 0; count < ARRAY_SIZE(rq); count++)
                i915_request_put(rq[count]);

err_file:
        mock_file_free(engine->i915, file);
        return err;
}
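
/*
 * As __igt_reset_engine(), but while the target engine is being reset the
 * other engines may be kept busy by active_engine() threads. Afterwards we
 * check that only the target engine's reset count moved, that no innocent
 * engine was reset, and that no full GPU reset was recorded.
 */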
static int __igt_reset_engines(struct drm_i915_private *i915,
                               const char *test_name,
                               unsigned int flags)
{
        struct intel_engine_cs *engine, *other;
        enum intel_engine_id id, tmp;
        struct hang h;
        int err = 0;

        /* Check that issuing a reset on one engine does not interfere
         * with any other engine.
         */

        if (!intel_has_reset_engine(i915))
                return 0;

        if (flags & TEST_ACTIVE) {
                mutex_lock(&i915->drm.struct_mutex);
                err = hang_init(&h, i915);
                mutex_unlock(&i915->drm.struct_mutex);
                if (err)
                        return err;

                if (flags & TEST_PRIORITY)
                        h.ctx->sched.priority = 1024;
        }

        for_each_engine(engine, i915, id) {
                struct active_engine threads[I915_NUM_ENGINES] = {};
                unsigned long global = i915_reset_count(&i915->gpu_error);
                unsigned long count = 0, reported;
                IGT_TIMEOUT(end_time);

                if (flags & TEST_ACTIVE &&
                    !intel_engine_can_store_dword(engine))
                        continue;

                if (!wait_for_idle(engine)) {
                        pr_err("i915_reset_engine(%s:%s): failed to idle before reset\n",
                               engine->name, test_name);
                        err = -EIO;
                        break;
                }

                memset(threads, 0, sizeof(threads));
                for_each_engine(other, i915, tmp) {
                        struct task_struct *tsk;

                        threads[tmp].resets =
                                i915_reset_engine_count(&i915->gpu_error,
                                                        other);

                        if (!(flags & TEST_OTHERS))
                                continue;

                        if (other == engine && !(flags & TEST_SELF))
                                continue;

                        threads[tmp].engine = other;
                        threads[tmp].flags = flags;

                        tsk = kthread_run(active_engine, &threads[tmp],
                                          "igt/%s", other->name);
                        if (IS_ERR(tsk)) {
                                err = PTR_ERR(tsk);
                                goto unwind;
                        }

                        threads[tmp].task = tsk;
                        get_task_struct(tsk);
                }

                set_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags);
                do {
                        u32 seqno = intel_engine_get_seqno(engine);
                        struct i915_request *rq = NULL;

                        if (flags & TEST_ACTIVE) {
                                mutex_lock(&i915->drm.struct_mutex);
                                rq = hang_create_request(&h, engine);
                                if (IS_ERR(rq)) {
                                        err = PTR_ERR(rq);
                                        mutex_unlock(&i915->drm.struct_mutex);
                                        break;
                                }

                                i915_request_get(rq);
                                __i915_request_add(rq, true);
                                mutex_unlock(&i915->drm.struct_mutex);

                                if (!wait_until_running(&h, rq)) {
                                        struct drm_printer p = drm_info_printer(i915->drm.dev);

                                        pr_err("%s: Failed to start request %x, at %x\n",
                                               __func__, rq->fence.seqno, hws_seqno(&h, rq));
                                        intel_engine_dump(engine, &p,
                                                          "%s\n", engine->name);

                                        i915_request_put(rq);
                                        err = -EIO;
                                        break;
                                }

                                GEM_BUG_ON(!rq->global_seqno);
                                seqno = rq->global_seqno - 1;
                        }

                        err = i915_reset_engine(engine, NULL);
                        if (err) {
                                pr_err("i915_reset_engine(%s:%s): failed, err=%d\n",
                                       engine->name, test_name, err);
                                break;
                        }

                        count++;

                        if (rq) {
                                i915_request_wait(rq, 0, MAX_SCHEDULE_TIMEOUT);
                                i915_request_put(rq);
                        }

                        if (!(flags & TEST_SELF) && !wait_for_idle(engine)) {
                                struct drm_printer p =
                                        drm_info_printer(i915->drm.dev);

                                pr_err("i915_reset_engine(%s:%s):"
                                       " failed to idle after reset\n",
                                       engine->name, test_name);
                                intel_engine_dump(engine, &p,
                                                  "%s\n", engine->name);

                                err = -EIO;
                                break;
                        }
                } while (time_before(jiffies, end_time));
                clear_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags);
                pr_info("i915_reset_engine(%s:%s): %lu resets\n",
                        engine->name, test_name, count);

                reported = i915_reset_engine_count(&i915->gpu_error, engine);
                reported -= threads[engine->id].resets;
                if (reported != (flags & TEST_ACTIVE ? count : 0)) {
                        pr_err("i915_reset_engine(%s:%s): reset %lu times, but reported %lu, expected %lu reported\n",
                               engine->name, test_name, count, reported,
                               (flags & TEST_ACTIVE ? count : 0));
                        if (!err)
                                err = -EINVAL;
                }

unwind:
                for_each_engine(other, i915, tmp) {
                        int ret;

                        if (!threads[tmp].task)
                                continue;

                        ret = kthread_stop(threads[tmp].task);
                        if (ret) {
                                pr_err("kthread for other engine %s failed, err=%d\n",
                                       other->name, ret);
                                if (!err)
                                        err = ret;
                        }
                        put_task_struct(threads[tmp].task);

                        if (other != engine &&
                            threads[tmp].resets !=
                            i915_reset_engine_count(&i915->gpu_error, other)) {
                                pr_err("Innocent engine %s was reset (count=%ld)\n",
                                       other->name,
                                       i915_reset_engine_count(&i915->gpu_error,
                                                               other) -
                                       threads[tmp].resets);
                                if (!err)
                                        err = -EINVAL;
                        }
                }

                if (global != i915_reset_count(&i915->gpu_error)) {
                        pr_err("Global reset (count=%ld)!\n",
                               i915_reset_count(&i915->gpu_error) - global);
                        if (!err)
                                err = -EINVAL;
                }

                if (err)
                        break;

                err = igt_flush_test(i915, 0);
                if (err)
                        break;
        }

        if (i915_terminally_wedged(&i915->gpu_error))
                err = -EIO;

        if (flags & TEST_ACTIVE) {
                mutex_lock(&i915->drm.struct_mutex);
                hang_fini(&h);
                mutex_unlock(&i915->drm.struct_mutex);
        }

        return err;
}

static int igt_reset_engines(void *arg)
{
        static const struct {
                const char *name;
                unsigned int flags;
        } phases[] = {
                { "idle", 0 },
                { "active", TEST_ACTIVE },
                { "others-idle", TEST_OTHERS },
                { "others-active", TEST_OTHERS | TEST_ACTIVE },
                {
                        "others-priority",
                        TEST_OTHERS | TEST_ACTIVE | TEST_PRIORITY
                },
                {
                        "self-priority",
                        TEST_OTHERS | TEST_ACTIVE | TEST_PRIORITY | TEST_SELF,
                },
                { }
        };
        struct drm_i915_private *i915 = arg;
        typeof(*phases) *p;
        int err;

        for (p = phases; p->name; p++) {
                if (p->flags & TEST_PRIORITY) {
                        if (!(i915->caps.scheduler & I915_SCHEDULER_CAP_PRIORITY))
                                continue;
                }

                err = __igt_reset_engines(arg, p->name, p->flags);
                if (err)
                        return err;
        }

        return 0;
}
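
/*
 * Pretend hangcheck fired: record which engines are "stalled" and set
 * I915_RESET_HANDOFF so that a waiter (or i915_reset()) performs the actual
 * reset, mimicking the handoff the real hangcheck worker makes.
 */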
static u32 fake_hangcheck(struct i915_request *rq, u32 mask)
{
        struct i915_gpu_error *error = &rq->i915->gpu_error;
        u32 reset_count = i915_reset_count(error);

        error->stalled_mask = mask;

        /* set_bit() must be after we have setup the backchannel (mask) */
        smp_mb__before_atomic();
        set_bit(I915_RESET_HANDOFF, &error->flags);

        wake_up_all(&error->wait_queue);

        return reset_count;
}
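
/*
 * igt_wait_reset: submit a spinner on RCS, fake a hangcheck handoff and then
 * block in i915_request_wait(); the waiter is expected to notice the handoff,
 * perform the reset and thereby unblock itself.
 */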
static int igt_wait_reset(void *arg)
{
        struct drm_i915_private *i915 = arg;
        struct i915_request *rq;
        unsigned int reset_count;
        struct hang h;
        long timeout;
        int err;

        if (!intel_engine_can_store_dword(i915->engine[RCS]))
                return 0;

        /* Check that we detect a stuck waiter and issue a reset */

        global_reset_lock(i915);

        mutex_lock(&i915->drm.struct_mutex);
        err = hang_init(&h, i915);
        if (err)
                goto unlock;

        rq = hang_create_request(&h, i915->engine[RCS]);
        if (IS_ERR(rq)) {
                err = PTR_ERR(rq);
                goto fini;
        }

        i915_request_get(rq);
        __i915_request_add(rq, true);

        if (!wait_until_running(&h, rq)) {
                struct drm_printer p = drm_info_printer(i915->drm.dev);

                pr_err("%s: Failed to start request %x, at %x\n",
                       __func__, rq->fence.seqno, hws_seqno(&h, rq));
                intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);

                i915_gem_set_wedged(i915);

                err = -EIO;
                goto out_rq;
        }

        reset_count = fake_hangcheck(rq, ALL_ENGINES);

        timeout = i915_request_wait(rq, I915_WAIT_LOCKED, 10);
        if (timeout < 0) {
                pr_err("i915_request_wait failed on a stuck request: err=%ld\n",
                       timeout);
                err = timeout;
                goto out_rq;
        }

        GEM_BUG_ON(test_bit(I915_RESET_HANDOFF, &i915->gpu_error.flags));
        if (i915_reset_count(&i915->gpu_error) == reset_count) {
                pr_err("No GPU reset recorded!\n");
                err = -EINVAL;
                goto out_rq;
        }

out_rq:
        i915_request_put(rq);
fini:
        hang_fini(&h);
unlock:
        mutex_unlock(&i915->drm.struct_mutex);
        global_reset_unlock(i915);

        if (i915_terminally_wedged(&i915->gpu_error))
                return -EIO;

        return err;
}
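
/* Wait for every engine other than @exclude to settle back to idle. */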
static int wait_for_others(struct drm_i915_private *i915,
                           struct intel_engine_cs *exclude)
{
        struct intel_engine_cs *engine;
        enum intel_engine_id id;

        for_each_engine(engine, i915, id) {
                if (engine == exclude)
                        continue;

                if (!wait_for_idle(engine))
                        return -EIO;
        }

        return 0;
}

static int igt_reset_queue(void *arg)
{
        struct drm_i915_private *i915 = arg;
        struct intel_engine_cs *engine;
        enum intel_engine_id id;
        struct hang h;
        int err;

        /* Check that we replay pending requests following a hang */

        global_reset_lock(i915);

        mutex_lock(&i915->drm.struct_mutex);
        err = hang_init(&h, i915);
        if (err)
                goto unlock;

        for_each_engine(engine, i915, id) {
                struct i915_request *prev;
                IGT_TIMEOUT(end_time);
                unsigned int count;

                if (!intel_engine_can_store_dword(engine))
                        continue;

                prev = hang_create_request(&h, engine);
                if (IS_ERR(prev)) {
                        err = PTR_ERR(prev);
                        goto fini;
                }

                i915_request_get(prev);
                __i915_request_add(prev, true);

                count = 0;
                do {
                        struct i915_request *rq;
                        unsigned int reset_count;

                        rq = hang_create_request(&h, engine);
                        if (IS_ERR(rq)) {
                                err = PTR_ERR(rq);
                                goto fini;
                        }

                        i915_request_get(rq);
                        __i915_request_add(rq, true);

                        /*
                         * XXX We don't handle resetting the kernel context
                         * very well. If we trigger a device reset twice in
                         * quick succession while the kernel context is
                         * executing, we may end up skipping the breadcrumb.
                         * This is really only a problem for the selftest as
                         * normally there is a large interlude between resets
                         * (hangcheck), or we focus on resetting just one
                         * engine and so avoid repeatedly resetting innocents.
                         */
                        err = wait_for_others(i915, engine);
                        if (err) {
                                pr_err("%s(%s): Failed to idle other inactive engines after device reset\n",
                                       __func__, engine->name);
                                i915_request_put(rq);
                                i915_request_put(prev);

                                GEM_TRACE_DUMP();
                                i915_gem_set_wedged(i915);
                                goto fini;
                        }

                        if (!wait_until_running(&h, prev)) {
                                struct drm_printer p = drm_info_printer(i915->drm.dev);

                                pr_err("%s(%s): Failed to start request %x, at %x\n",
                                       __func__, engine->name,
                                       prev->fence.seqno, hws_seqno(&h, prev));
                                intel_engine_dump(engine, &p,
                                                  "%s\n", engine->name);

                                i915_request_put(rq);
                                i915_request_put(prev);

                                i915_gem_set_wedged(i915);

                                err = -EIO;
                                goto fini;
                        }

                        reset_count = fake_hangcheck(prev, ENGINE_MASK(id));

                        i915_reset(i915, ENGINE_MASK(id), NULL);

                        GEM_BUG_ON(test_bit(I915_RESET_HANDOFF,
                                            &i915->gpu_error.flags));

                        if (prev->fence.error != -EIO) {
                                pr_err("GPU reset not recorded on hanging request [fence.error=%d]!\n",
                                       prev->fence.error);
                                i915_request_put(rq);
                                i915_request_put(prev);
                                err = -EINVAL;
                                goto fini;
                        }

                        if (rq->fence.error) {
                                pr_err("Fence error status not zero [%d] after unrelated reset\n",
                                       rq->fence.error);
                                i915_request_put(rq);
                                i915_request_put(prev);
                                err = -EINVAL;
                                goto fini;
                        }

                        if (i915_reset_count(&i915->gpu_error) == reset_count) {
                                pr_err("No GPU reset recorded!\n");
                                i915_request_put(rq);
                                i915_request_put(prev);
                                err = -EINVAL;
                                goto fini;
                        }

                        i915_request_put(prev);
                        prev = rq;
                        count++;
                } while (time_before(jiffies, end_time));
                pr_info("%s: Completed %d resets\n", engine->name, count);

                *h.batch = MI_BATCH_BUFFER_END;
                i915_gem_chipset_flush(i915);

                i915_request_put(prev);

                err = igt_flush_test(i915, I915_WAIT_LOCKED);
                if (err)
                        break;
        }

fini:
        hang_fini(&h);
unlock:
        mutex_unlock(&i915->drm.struct_mutex);
        global_reset_unlock(i915);

        if (i915_terminally_wedged(&i915->gpu_error))
                return -EIO;

        return err;
}
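
/*
 * Drive the full i915_handle_error() path for a hanging request, with error
 * capture temporarily disabled, and check that the guilty request is marked
 * with -EIO.
 */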
static int igt_handle_error(void *arg)
{
        struct drm_i915_private *i915 = arg;
        struct intel_engine_cs *engine = i915->engine[RCS];
        struct hang h;
        struct i915_request *rq;
        struct i915_gpu_state *error;
        int err;

        /* Check that we can issue a global GPU and engine reset */

        if (!intel_has_reset_engine(i915))
                return 0;

        if (!engine || !intel_engine_can_store_dword(engine))
                return 0;

        mutex_lock(&i915->drm.struct_mutex);

        err = hang_init(&h, i915);
        if (err)
                goto err_unlock;

        rq = hang_create_request(&h, engine);
        if (IS_ERR(rq)) {
                err = PTR_ERR(rq);
                goto err_fini;
        }

        i915_request_get(rq);
        __i915_request_add(rq, true);

        if (!wait_until_running(&h, rq)) {
                struct drm_printer p = drm_info_printer(i915->drm.dev);

                pr_err("%s: Failed to start request %x, at %x\n",
                       __func__, rq->fence.seqno, hws_seqno(&h, rq));
                intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);

                i915_gem_set_wedged(i915);

                err = -EIO;
                goto err_request;
        }

        mutex_unlock(&i915->drm.struct_mutex);

        /* Temporarily disable error capture */
        error = xchg(&i915->gpu_error.first_error, (void *)-1);

        i915_handle_error(i915, ENGINE_MASK(engine->id), 0, NULL);

        xchg(&i915->gpu_error.first_error, error);

        mutex_lock(&i915->drm.struct_mutex);

        if (rq->fence.error != -EIO) {
                pr_err("Guilty request not identified!\n");
                err = -EINVAL;
                goto err_request;
        }

err_request:
        i915_request_put(rq);
err_fini:
        hang_fini(&h);
err_unlock:
        mutex_unlock(&i915->drm.struct_mutex);
        return err;
}
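
/*
 * Entry point for the live hangcheck/reset selftests. The hangcheck modparam
 * is zeroed for the duration so only the tests' explicit (or faked) resets
 * are in play; igt_global_reset runs first to recover the GPU in case an
 * earlier test left it wedged.
 */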
int intel_hangcheck_live_selftests(struct drm_i915_private *i915)
{
        static const struct i915_subtest tests[] = {
                SUBTEST(igt_global_reset), /* attempt to recover GPU first */
                SUBTEST(igt_hang_sanitycheck),
                SUBTEST(igt_reset_idle_engine),
                SUBTEST(igt_reset_active_engine),
                SUBTEST(igt_reset_engines),
                SUBTEST(igt_wait_reset),
                SUBTEST(igt_reset_queue),
                SUBTEST(igt_handle_error),
        };
        bool saved_hangcheck;
        int err;

        if (!intel_has_gpu_reset(i915))
                return 0;

        intel_runtime_pm_get(i915);
        saved_hangcheck = fetch_and_zero(&i915_modparams.enable_hangcheck);

        err = i915_subtests(tests, i915);

        mutex_lock(&i915->drm.struct_mutex);
        igt_flush_test(i915, I915_WAIT_LOCKED);
        mutex_unlock(&i915->drm.struct_mutex);

        i915_modparams.enable_hangcheck = saved_hangcheck;
        intel_runtime_pm_put(i915);

        return err;
}