intel_hangcheck.c

/*
 * Copyright © 2016 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 */

#include <linux/kthread.h>

#include "../i915_selftest.h"

#include "mock_context.h"
#include "mock_drm.h"
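
/*
 * Test fixture for the hangcheck/reset selftests: a self-referencing batch
 * in @obj that spins the engine indefinitely, plus a fake hardware status
 * page in @hws into which the batch writes its request's seqno so the tests
 * can tell when it has actually started executing.
 */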

struct hang {
	struct drm_i915_private *i915;
	struct drm_i915_gem_object *hws;
	struct drm_i915_gem_object *obj;
	u32 *seqno;
	u32 *batch;
};

static int hang_init(struct hang *h, struct drm_i915_private *i915)
{
	void *vaddr;
	int err;

	memset(h, 0, sizeof(*h));
	h->i915 = i915;

	h->hws = i915_gem_object_create_internal(i915, PAGE_SIZE);
	if (IS_ERR(h->hws))
		return PTR_ERR(h->hws);

	h->obj = i915_gem_object_create_internal(i915, PAGE_SIZE);
	if (IS_ERR(h->obj)) {
		err = PTR_ERR(h->obj);
		goto err_hws;
	}

	i915_gem_object_set_cache_level(h->hws, I915_CACHE_LLC);
	vaddr = i915_gem_object_pin_map(h->hws, I915_MAP_WB);
	if (IS_ERR(vaddr)) {
		err = PTR_ERR(vaddr);
		goto err_obj;
	}
	h->seqno = memset(vaddr, 0xff, PAGE_SIZE);

	vaddr = i915_gem_object_pin_map(h->obj,
					HAS_LLC(i915) ? I915_MAP_WB : I915_MAP_WC);
	if (IS_ERR(vaddr)) {
		err = PTR_ERR(vaddr);
		goto err_unpin_hws;
	}
	h->batch = vaddr;

	return 0;

err_unpin_hws:
	i915_gem_object_unpin_map(h->hws);
err_obj:
	i915_gem_object_put(h->obj);
err_hws:
	i915_gem_object_put(h->hws);
	return err;
}
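
/*
 * Each request is given its own dword in the fake hardware status page,
 * indexed by its fence context (modulo the page size), so hangs created on
 * different contexts and engines report their breadcrumbs independently.
 */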

static u64 hws_address(const struct i915_vma *hws,
		       const struct drm_i915_gem_request *rq)
{
	return hws->node.start + offset_in_page(sizeof(u32)*rq->fence.context);
}
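
/*
 * Emit a batch that records the request's seqno in the fake HWS and then
 * branches back to its own start, spinning forever. The loop is only broken
 * by overwriting the head of the batch with MI_BATCH_BUFFER_END (as done by
 * hang_fini() and the sanitycheck) or by resetting the engine.
 */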

static int emit_recurse_batch(struct hang *h,
			      struct drm_i915_gem_request *rq)
{
	struct drm_i915_private *i915 = h->i915;
	struct i915_address_space *vm = rq->ctx->ppgtt ? &rq->ctx->ppgtt->base : &i915->ggtt.base;
	struct i915_vma *hws, *vma;
	unsigned int flags;
	u32 *batch;
	int err;

	vma = i915_vma_instance(h->obj, vm, NULL);
	if (IS_ERR(vma))
		return PTR_ERR(vma);

	hws = i915_vma_instance(h->hws, vm, NULL);
	if (IS_ERR(hws))
		return PTR_ERR(hws);

	err = i915_vma_pin(vma, 0, 0, PIN_USER);
	if (err)
		return err;

	err = i915_vma_pin(hws, 0, 0, PIN_USER);
	if (err)
		goto unpin_vma;

	i915_vma_move_to_active(vma, rq, 0);
	if (!i915_gem_object_has_active_reference(vma->obj)) {
		i915_gem_object_get(vma->obj);
		i915_gem_object_set_active_reference(vma->obj);
	}

	i915_vma_move_to_active(hws, rq, 0);
	if (!i915_gem_object_has_active_reference(hws->obj)) {
		i915_gem_object_get(hws->obj);
		i915_gem_object_set_active_reference(hws->obj);
	}

	batch = h->batch;
	if (INTEL_GEN(i915) >= 8) {
		*batch++ = MI_STORE_DWORD_IMM_GEN4;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = upper_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_ARB_CHECK;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_ARB_CHECK;
		*batch++ = MI_BATCH_BUFFER_START | 1 << 8 | 1;
		*batch++ = lower_32_bits(vma->node.start);
		*batch++ = upper_32_bits(vma->node.start);
	} else if (INTEL_GEN(i915) >= 6) {
		*batch++ = MI_STORE_DWORD_IMM_GEN4;
		*batch++ = 0;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_ARB_CHECK;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_ARB_CHECK;
		*batch++ = MI_BATCH_BUFFER_START | 1 << 8;
		*batch++ = lower_32_bits(vma->node.start);
	} else if (INTEL_GEN(i915) >= 4) {
		*batch++ = MI_STORE_DWORD_IMM_GEN4 | 1 << 22;
		*batch++ = 0;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_ARB_CHECK;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_ARB_CHECK;
		*batch++ = MI_BATCH_BUFFER_START | 2 << 6;
		*batch++ = lower_32_bits(vma->node.start);
	} else {
		*batch++ = MI_STORE_DWORD_IMM;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_ARB_CHECK;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_ARB_CHECK;
		*batch++ = MI_BATCH_BUFFER_START | 2 << 6 | 1;
		*batch++ = lower_32_bits(vma->node.start);
	}
	*batch++ = MI_BATCH_BUFFER_END; /* not reached */
	i915_gem_chipset_flush(h->i915);

	flags = 0;
	if (INTEL_GEN(vm->i915) <= 5)
		flags |= I915_DISPATCH_SECURE;

	err = rq->engine->emit_bb_start(rq, vma->node.start, PAGE_SIZE, flags);

	i915_vma_unpin(hws);
unpin_vma:
	i915_vma_unpin(vma);
	return err;
}

static struct drm_i915_gem_request *
hang_create_request(struct hang *h,
		    struct intel_engine_cs *engine,
		    struct i915_gem_context *ctx)
{
	struct drm_i915_gem_request *rq;
	int err;

	if (i915_gem_object_is_active(h->obj)) {
		struct drm_i915_gem_object *obj;
		void *vaddr;

		obj = i915_gem_object_create_internal(h->i915, PAGE_SIZE);
		if (IS_ERR(obj))
			return ERR_CAST(obj);

		vaddr = i915_gem_object_pin_map(obj,
						HAS_LLC(h->i915) ? I915_MAP_WB : I915_MAP_WC);
		if (IS_ERR(vaddr)) {
			i915_gem_object_put(obj);
			return ERR_CAST(vaddr);
		}

		i915_gem_object_unpin_map(h->obj);
		i915_gem_object_put(h->obj);

		h->obj = obj;
		h->batch = vaddr;
	}

	rq = i915_gem_request_alloc(engine, ctx);
	if (IS_ERR(rq))
		return rq;

	err = emit_recurse_batch(h, rq);
	if (err) {
		__i915_add_request(rq, false);
		return ERR_PTR(err);
	}

	return rq;
}

static u32 hws_seqno(const struct hang *h,
		     const struct drm_i915_gem_request *rq)
{
	return READ_ONCE(h->seqno[rq->fence.context % (PAGE_SIZE/sizeof(u32))]);
}

static void hang_fini(struct hang *h)
{
	*h->batch = MI_BATCH_BUFFER_END;
	i915_gem_chipset_flush(h->i915);

	i915_gem_object_unpin_map(h->obj);
	i915_gem_object_put(h->obj);

	i915_gem_object_unpin_map(h->hws);
	i915_gem_object_put(h->hws);

	i915_gem_wait_for_idle(h->i915, I915_WAIT_LOCKED);
}
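
/*
 * Returns true once the hanging batch has started executing, i.e. once its
 * seqno write has landed in the fake HWS: first a short busy-wait (10us),
 * then a longer sleeping wait of up to 1s. Returns false if both time out.
 */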

static bool wait_for_hang(struct hang *h, struct drm_i915_gem_request *rq)
{
	return !(wait_for_us(i915_seqno_passed(hws_seqno(h, rq),
					       rq->fence.seqno),
			     10) &&
		 wait_for(i915_seqno_passed(hws_seqno(h, rq),
					    rq->fence.seqno),
			  1000));
}

static int igt_hang_sanitycheck(void *arg)
{
	struct drm_i915_private *i915 = arg;
	struct drm_i915_gem_request *rq;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	struct hang h;
	int err;

	/* Basic check that we can execute our hanging batch */

	mutex_lock(&i915->drm.struct_mutex);
	err = hang_init(&h, i915);
	if (err)
		goto unlock;

	for_each_engine(engine, i915, id) {
		long timeout;

		if (!intel_engine_can_store_dword(engine))
			continue;

		rq = hang_create_request(&h, engine, i915->kernel_context);
		if (IS_ERR(rq)) {
			err = PTR_ERR(rq);
			pr_err("Failed to create request for %s, err=%d\n",
			       engine->name, err);
			goto fini;
		}

		i915_gem_request_get(rq);

		*h.batch = MI_BATCH_BUFFER_END;
		i915_gem_chipset_flush(i915);

		__i915_add_request(rq, true);

		timeout = i915_wait_request(rq,
					    I915_WAIT_LOCKED,
					    MAX_SCHEDULE_TIMEOUT);
		i915_gem_request_put(rq);

		if (timeout < 0) {
			err = timeout;
			pr_err("Wait for request failed on %s, err=%d\n",
			       engine->name, err);
			goto fini;
		}
	}

fini:
	hang_fini(&h);
unlock:
	mutex_unlock(&i915->drm.struct_mutex);
	return err;
}
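
/*
 * Take the global I915_RESET_BACKOFF bit and every per-engine reset bit,
 * waiting out any reset already in progress, so the selftest has exclusive
 * control over when resets occur. Released by global_reset_unlock().
 */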

static void global_reset_lock(struct drm_i915_private *i915)
{
	struct intel_engine_cs *engine;
	enum intel_engine_id id;

	pr_debug("%s: current gpu_error=%08lx\n",
		 __func__, i915->gpu_error.flags);

	while (test_and_set_bit(I915_RESET_BACKOFF, &i915->gpu_error.flags))
		wait_event(i915->gpu_error.reset_queue,
			   !test_bit(I915_RESET_BACKOFF,
				     &i915->gpu_error.flags));

	for_each_engine(engine, i915, id) {
		while (test_and_set_bit(I915_RESET_ENGINE + id,
					&i915->gpu_error.flags))
			wait_on_bit(&i915->gpu_error.flags,
				    I915_RESET_ENGINE + id,
				    TASK_UNINTERRUPTIBLE);
	}
}

static void global_reset_unlock(struct drm_i915_private *i915)
{
	struct intel_engine_cs *engine;
	enum intel_engine_id id;

	for_each_engine(engine, i915, id)
		clear_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags);

	clear_bit(I915_RESET_BACKOFF, &i915->gpu_error.flags);
	wake_up_all(&i915->gpu_error.reset_queue);
}

static int igt_global_reset(void *arg)
{
	struct drm_i915_private *i915 = arg;
	unsigned int reset_count;
	int err = 0;

	/* Check that we can issue a global GPU reset */

	global_reset_lock(i915);
	set_bit(I915_RESET_HANDOFF, &i915->gpu_error.flags);

	mutex_lock(&i915->drm.struct_mutex);
	reset_count = i915_reset_count(&i915->gpu_error);

	i915_reset(i915, I915_RESET_QUIET);

	if (i915_reset_count(&i915->gpu_error) == reset_count) {
		pr_err("No GPU reset recorded!\n");
		err = -EINVAL;
	}
	mutex_unlock(&i915->drm.struct_mutex);

	GEM_BUG_ON(test_bit(I915_RESET_HANDOFF, &i915->gpu_error.flags));
	global_reset_unlock(i915);

	if (i915_terminally_wedged(&i915->gpu_error))
		err = -EIO;

	return err;
}

static int __igt_reset_engine(struct drm_i915_private *i915, bool active)
{
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	struct hang h;
	int err = 0;

	/* Check that we can issue an engine reset on an idle engine (no-op) */

	if (!intel_has_reset_engine(i915))
		return 0;

	if (active) {
		mutex_lock(&i915->drm.struct_mutex);
		err = hang_init(&h, i915);
		mutex_unlock(&i915->drm.struct_mutex);
		if (err)
			return err;
	}

	for_each_engine(engine, i915, id) {
		unsigned int reset_count, reset_engine_count;
		IGT_TIMEOUT(end_time);

		if (active && !intel_engine_can_store_dword(engine))
			continue;

		reset_count = i915_reset_count(&i915->gpu_error);
		reset_engine_count = i915_reset_engine_count(&i915->gpu_error,
							     engine);

		set_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags);
		do {
			if (active) {
				struct drm_i915_gem_request *rq;

				mutex_lock(&i915->drm.struct_mutex);
				rq = hang_create_request(&h, engine,
							 i915->kernel_context);
				if (IS_ERR(rq)) {
					err = PTR_ERR(rq);
					mutex_unlock(&i915->drm.struct_mutex);
					break;
				}

				i915_gem_request_get(rq);
				__i915_add_request(rq, true);
				mutex_unlock(&i915->drm.struct_mutex);

				if (!wait_for_hang(&h, rq)) {
					struct drm_printer p = drm_info_printer(i915->drm.dev);

					pr_err("%s: Failed to start request %x, at %x\n",
					       __func__, rq->fence.seqno, hws_seqno(&h, rq));
					intel_engine_dump(engine, &p,
							  "%s\n", engine->name);

					i915_gem_request_put(rq);
					err = -EIO;
					break;
				}

				i915_gem_request_put(rq);
			}

			engine->hangcheck.stalled = true;
			engine->hangcheck.seqno =
				intel_engine_get_seqno(engine);

			err = i915_reset_engine(engine, I915_RESET_QUIET);
			if (err) {
				pr_err("i915_reset_engine failed\n");
				break;
			}

			if (i915_reset_count(&i915->gpu_error) != reset_count) {
				pr_err("Full GPU reset recorded! (engine reset expected)\n");
				err = -EINVAL;
				break;
			}

			reset_engine_count += active;
			if (i915_reset_engine_count(&i915->gpu_error, engine) !=
			    reset_engine_count) {
				pr_err("%s engine reset %srecorded!\n",
				       engine->name, active ? "not " : "");
				err = -EINVAL;
				break;
			}

			engine->hangcheck.stalled = false;
		} while (time_before(jiffies, end_time));
		clear_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags);

		if (err)
			break;

		cond_resched();
	}

	if (i915_terminally_wedged(&i915->gpu_error))
		err = -EIO;

	if (active) {
		mutex_lock(&i915->drm.struct_mutex);
		hang_fini(&h);
		mutex_unlock(&i915->drm.struct_mutex);
	}

	return err;
}

static int igt_reset_idle_engine(void *arg)
{
	return __igt_reset_engine(arg, false);
}

static int igt_reset_active_engine(void *arg)
{
	return __igt_reset_engine(arg, true);
}
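
/*
 * kthread body used by __igt_reset_engine_others(): keep @data's engine busy
 * by continually submitting requests on two private contexts, keeping up to
 * two requests in flight and waiting on the oldest before replacing it, until
 * asked to stop. Innocent engines run this while a neighbouring engine is
 * repeatedly reset.
 */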

static int active_engine(void *data)
{
	struct intel_engine_cs *engine = data;
	struct drm_i915_gem_request *rq[2] = {};
	struct i915_gem_context *ctx[2];
	struct drm_file *file;
	unsigned long count = 0;
	int err = 0;

	file = mock_file(engine->i915);
	if (IS_ERR(file))
		return PTR_ERR(file);

	mutex_lock(&engine->i915->drm.struct_mutex);
	ctx[0] = live_context(engine->i915, file);
	mutex_unlock(&engine->i915->drm.struct_mutex);
	if (IS_ERR(ctx[0])) {
		err = PTR_ERR(ctx[0]);
		goto err_file;
	}

	mutex_lock(&engine->i915->drm.struct_mutex);
	ctx[1] = live_context(engine->i915, file);
	mutex_unlock(&engine->i915->drm.struct_mutex);
	if (IS_ERR(ctx[1])) {
		err = PTR_ERR(ctx[1]);
		i915_gem_context_put(ctx[0]);
		goto err_file;
	}

	while (!kthread_should_stop()) {
		unsigned int idx = count++ & 1;
		struct drm_i915_gem_request *old = rq[idx];
		struct drm_i915_gem_request *new;

		mutex_lock(&engine->i915->drm.struct_mutex);
		new = i915_gem_request_alloc(engine, ctx[idx]);
		if (IS_ERR(new)) {
			mutex_unlock(&engine->i915->drm.struct_mutex);
			err = PTR_ERR(new);
			break;
		}

		rq[idx] = i915_gem_request_get(new);
		i915_add_request(new);
		mutex_unlock(&engine->i915->drm.struct_mutex);

		if (old) {
			i915_wait_request(old, 0, MAX_SCHEDULE_TIMEOUT);
			i915_gem_request_put(old);
		}
	}

	for (count = 0; count < ARRAY_SIZE(rq); count++)
		i915_gem_request_put(rq[count]);

err_file:
	mock_file_free(engine->i915, file);
	return err;
}

static int __igt_reset_engine_others(struct drm_i915_private *i915,
				     bool active)
{
	struct intel_engine_cs *engine, *other;
	enum intel_engine_id id, tmp;
	struct hang h;
	int err = 0;

	/* Check that issuing a reset on one engine does not interfere
	 * with any other engine.
	 */

	if (!intel_has_reset_engine(i915))
		return 0;

	if (active) {
		mutex_lock(&i915->drm.struct_mutex);
		err = hang_init(&h, i915);
		mutex_unlock(&i915->drm.struct_mutex);
		if (err)
			return err;
	}

	for_each_engine(engine, i915, id) {
		struct task_struct *threads[I915_NUM_ENGINES] = {};
		unsigned long resets[I915_NUM_ENGINES];
		unsigned long global = i915_reset_count(&i915->gpu_error);
		unsigned long count = 0;
		IGT_TIMEOUT(end_time);

		if (active && !intel_engine_can_store_dword(engine))
			continue;

		memset(threads, 0, sizeof(threads));
		for_each_engine(other, i915, tmp) {
			struct task_struct *tsk;

			resets[tmp] = i915_reset_engine_count(&i915->gpu_error,
							      other);

			if (other == engine)
				continue;

			tsk = kthread_run(active_engine, other,
					  "igt/%s", other->name);
			if (IS_ERR(tsk)) {
				err = PTR_ERR(tsk);
				goto unwind;
			}

			threads[tmp] = tsk;
			get_task_struct(tsk);
		}

		set_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags);
		do {
			if (active) {
				struct drm_i915_gem_request *rq;

				mutex_lock(&i915->drm.struct_mutex);
				rq = hang_create_request(&h, engine,
							 i915->kernel_context);
				if (IS_ERR(rq)) {
					err = PTR_ERR(rq);
					mutex_unlock(&i915->drm.struct_mutex);
					break;
				}

				i915_gem_request_get(rq);
				__i915_add_request(rq, true);
				mutex_unlock(&i915->drm.struct_mutex);

				if (!wait_for_hang(&h, rq)) {
					struct drm_printer p = drm_info_printer(i915->drm.dev);

					pr_err("%s: Failed to start request %x, at %x\n",
					       __func__, rq->fence.seqno, hws_seqno(&h, rq));
					intel_engine_dump(engine, &p,
							  "%s\n", engine->name);

					i915_gem_request_put(rq);
					err = -EIO;
					break;
				}

				i915_gem_request_put(rq);
			}

			engine->hangcheck.stalled = true;
			engine->hangcheck.seqno =
				intel_engine_get_seqno(engine);

			err = i915_reset_engine(engine, I915_RESET_QUIET);
			if (err) {
				pr_err("i915_reset_engine(%s:%s) failed, err=%d\n",
				       engine->name, active ? "active" : "idle", err);
				break;
			}

			engine->hangcheck.stalled = false;
			count++;
		} while (time_before(jiffies, end_time));
		clear_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags);
		pr_info("i915_reset_engine(%s:%s): %lu resets\n",
			engine->name, active ? "active" : "idle", count);

		if (i915_reset_engine_count(&i915->gpu_error, engine) -
		    resets[engine->id] != (active ? count : 0)) {
			pr_err("i915_reset_engine(%s:%s): reset %lu times, but reported %lu\n",
			       engine->name, active ? "active" : "idle", count,
			       i915_reset_engine_count(&i915->gpu_error,
						       engine) - resets[engine->id]);
			if (!err)
				err = -EINVAL;
		}

unwind:
		for_each_engine(other, i915, tmp) {
			int ret;

			if (!threads[tmp])
				continue;

			ret = kthread_stop(threads[tmp]);
			if (ret) {
				pr_err("kthread for other engine %s failed, err=%d\n",
				       other->name, ret);
				if (!err)
					err = ret;
			}
			put_task_struct(threads[tmp]);

			if (resets[tmp] != i915_reset_engine_count(&i915->gpu_error,
								   other)) {
				pr_err("Innocent engine %s was reset (count=%ld)\n",
				       other->name,
				       i915_reset_engine_count(&i915->gpu_error,
							       other) - resets[tmp]);
				if (!err)
					err = -EINVAL;
			}
		}

		if (global != i915_reset_count(&i915->gpu_error)) {
			pr_err("Global reset (count=%ld)!\n",
			       i915_reset_count(&i915->gpu_error) - global);
			if (!err)
				err = -EINVAL;
		}

		if (err)
			break;

		cond_resched();
	}

	if (i915_terminally_wedged(&i915->gpu_error))
		err = -EIO;

	if (active) {
		mutex_lock(&i915->drm.struct_mutex);
		hang_fini(&h);
		mutex_unlock(&i915->drm.struct_mutex);
	}

	return err;
}

static int igt_reset_idle_engine_others(void *arg)
{
	return __igt_reset_engine_others(arg, false);
}

static int igt_reset_active_engine_others(void *arg)
{
	return __igt_reset_engine_others(arg, true);
}
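
/*
 * Pretend that hangcheck has declared the request's engine hung: mark it as
 * stalled at its current seqno and raise I915_RESET_HANDOFF so that a waiter
 * performs the actual reset. Returns the global reset count sampled
 * beforehand so callers can verify a reset really happened.
 */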

static u32 fake_hangcheck(struct drm_i915_gem_request *rq)
{
	u32 reset_count;

	rq->engine->hangcheck.stalled = true;
	rq->engine->hangcheck.seqno = intel_engine_get_seqno(rq->engine);

	reset_count = i915_reset_count(&rq->i915->gpu_error);

	set_bit(I915_RESET_HANDOFF, &rq->i915->gpu_error.flags);
	wake_up_all(&rq->i915->gpu_error.wait_queue);

	return reset_count;
}

static int igt_wait_reset(void *arg)
{
	struct drm_i915_private *i915 = arg;
	struct drm_i915_gem_request *rq;
	unsigned int reset_count;
	struct hang h;
	long timeout;
	int err;

	if (!intel_engine_can_store_dword(i915->engine[RCS]))
		return 0;

	/* Check that we detect a stuck waiter and issue a reset */

	global_reset_lock(i915);

	mutex_lock(&i915->drm.struct_mutex);
	err = hang_init(&h, i915);
	if (err)
		goto unlock;

	rq = hang_create_request(&h, i915->engine[RCS], i915->kernel_context);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto fini;
	}

	i915_gem_request_get(rq);
	__i915_add_request(rq, true);

	if (!wait_for_hang(&h, rq)) {
		struct drm_printer p = drm_info_printer(i915->drm.dev);

		pr_err("%s: Failed to start request %x, at %x\n",
		       __func__, rq->fence.seqno, hws_seqno(&h, rq));
		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);

		i915_reset(i915, 0);
		i915_gem_set_wedged(i915);

		err = -EIO;
		goto out_rq;
	}

	reset_count = fake_hangcheck(rq);

	timeout = i915_wait_request(rq, I915_WAIT_LOCKED, 10);
	if (timeout < 0) {
		pr_err("i915_wait_request failed on a stuck request: err=%ld\n",
		       timeout);
		err = timeout;
		goto out_rq;
	}

	GEM_BUG_ON(test_bit(I915_RESET_HANDOFF, &i915->gpu_error.flags));
	if (i915_reset_count(&i915->gpu_error) == reset_count) {
		pr_err("No GPU reset recorded!\n");
		err = -EINVAL;
		goto out_rq;
	}

out_rq:
	i915_gem_request_put(rq);
fini:
	hang_fini(&h);
unlock:
	mutex_unlock(&i915->drm.struct_mutex);
	global_reset_unlock(i915);

	if (i915_terminally_wedged(&i915->gpu_error))
		return -EIO;

	return err;
}

static int igt_reset_queue(void *arg)
{
	struct drm_i915_private *i915 = arg;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	struct hang h;
	int err;

	/* Check that we replay pending requests following a hang */

	global_reset_lock(i915);

	mutex_lock(&i915->drm.struct_mutex);
	err = hang_init(&h, i915);
	if (err)
		goto unlock;

	for_each_engine(engine, i915, id) {
		struct drm_i915_gem_request *prev;
		IGT_TIMEOUT(end_time);
		unsigned int count;

		if (!intel_engine_can_store_dword(engine))
			continue;

		prev = hang_create_request(&h, engine, i915->kernel_context);
		if (IS_ERR(prev)) {
			err = PTR_ERR(prev);
			goto fini;
		}

		i915_gem_request_get(prev);
		__i915_add_request(prev, true);

		count = 0;
		do {
			struct drm_i915_gem_request *rq;
			unsigned int reset_count;

			rq = hang_create_request(&h,
						 engine,
						 i915->kernel_context);
			if (IS_ERR(rq)) {
				err = PTR_ERR(rq);
				goto fini;
			}

			i915_gem_request_get(rq);
			__i915_add_request(rq, true);

			if (!wait_for_hang(&h, prev)) {
				struct drm_printer p = drm_info_printer(i915->drm.dev);

				pr_err("%s: Failed to start request %x, at %x\n",
				       __func__, prev->fence.seqno, hws_seqno(&h, prev));
				intel_engine_dump(prev->engine, &p,
						  "%s\n", prev->engine->name);

				i915_gem_request_put(rq);
				i915_gem_request_put(prev);

				i915_reset(i915, 0);
				i915_gem_set_wedged(i915);

				err = -EIO;
				goto fini;
			}

			reset_count = fake_hangcheck(prev);

			i915_reset(i915, I915_RESET_QUIET);

			GEM_BUG_ON(test_bit(I915_RESET_HANDOFF,
					    &i915->gpu_error.flags));

			if (prev->fence.error != -EIO) {
				pr_err("GPU reset not recorded on hanging request [fence.error=%d]!\n",
				       prev->fence.error);
				i915_gem_request_put(rq);
				i915_gem_request_put(prev);
				err = -EINVAL;
				goto fini;
			}

			if (rq->fence.error) {
				pr_err("Fence error status not zero [%d] after unrelated reset\n",
				       rq->fence.error);
				i915_gem_request_put(rq);
				i915_gem_request_put(prev);
				err = -EINVAL;
				goto fini;
			}

			if (i915_reset_count(&i915->gpu_error) == reset_count) {
				pr_err("No GPU reset recorded!\n");
				i915_gem_request_put(rq);
				i915_gem_request_put(prev);
				err = -EINVAL;
				goto fini;
			}

			i915_gem_request_put(prev);
			prev = rq;
			count++;
		} while (time_before(jiffies, end_time));
		pr_info("%s: Completed %d resets\n", engine->name, count);

		*h.batch = MI_BATCH_BUFFER_END;
		i915_gem_chipset_flush(i915);

		i915_gem_request_put(prev);
	}

fini:
	hang_fini(&h);
unlock:
	mutex_unlock(&i915->drm.struct_mutex);
	global_reset_unlock(i915);

	if (i915_terminally_wedged(&i915->gpu_error))
		return -EIO;

	return err;
}

static int igt_handle_error(void *arg)
{
	struct drm_i915_private *i915 = arg;
	struct intel_engine_cs *engine = i915->engine[RCS];
	struct hang h;
	struct drm_i915_gem_request *rq;
	struct i915_gpu_state *error;
	int err;

	/* Check that we can issue a global GPU and engine reset */

	if (!intel_has_reset_engine(i915))
		return 0;

	if (!intel_engine_can_store_dword(i915->engine[RCS]))
		return 0;

	mutex_lock(&i915->drm.struct_mutex);

	err = hang_init(&h, i915);
	if (err)
		goto err_unlock;

	rq = hang_create_request(&h, engine, i915->kernel_context);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto err_fini;
	}

	i915_gem_request_get(rq);
	__i915_add_request(rq, true);

	if (!wait_for_hang(&h, rq)) {
		struct drm_printer p = drm_info_printer(i915->drm.dev);

		pr_err("%s: Failed to start request %x, at %x\n",
		       __func__, rq->fence.seqno, hws_seqno(&h, rq));
		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);

		i915_reset(i915, 0);
		i915_gem_set_wedged(i915);

		err = -EIO;
		goto err_request;
	}

	mutex_unlock(&i915->drm.struct_mutex);

	/* Temporarily disable error capture */
	error = xchg(&i915->gpu_error.first_error, (void *)-1);

	engine->hangcheck.stalled = true;
	engine->hangcheck.seqno = intel_engine_get_seqno(engine);

	i915_handle_error(i915, intel_engine_flag(engine), "%s", __func__);

	xchg(&i915->gpu_error.first_error, error);

	mutex_lock(&i915->drm.struct_mutex);

	if (rq->fence.error != -EIO) {
		pr_err("Guilty request not identified!\n");
		err = -EINVAL;
		goto err_request;
	}

err_request:
	i915_gem_request_put(rq);
err_fini:
	hang_fini(&h);
err_unlock:
	mutex_unlock(&i915->drm.struct_mutex);
	return err;
}
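
/*
 * Entry point for the live hangcheck/reset selftests. The enable_hangcheck
 * module parameter is temporarily cleared so that only the tests themselves
 * decide when a reset is triggered, and runtime pm is held for the duration.
 */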

int intel_hangcheck_live_selftests(struct drm_i915_private *i915)
{
	static const struct i915_subtest tests[] = {
		SUBTEST(igt_global_reset), /* attempt to recover GPU first */
		SUBTEST(igt_hang_sanitycheck),
		SUBTEST(igt_reset_idle_engine),
		SUBTEST(igt_reset_active_engine),
		SUBTEST(igt_reset_idle_engine_others),
		SUBTEST(igt_reset_active_engine_others),
		SUBTEST(igt_wait_reset),
		SUBTEST(igt_reset_queue),
		SUBTEST(igt_handle_error),
	};
	bool saved_hangcheck;
	int err;

	if (!intel_has_gpu_reset(i915))
		return 0;

	intel_runtime_pm_get(i915);
	saved_hangcheck = fetch_and_zero(&i915_modparams.enable_hangcheck);

	err = i915_subtests(tests, i915);

	i915_modparams.enable_hangcheck = saved_hangcheck;
	intel_runtime_pm_put(i915);

	return err;
}