intel_hangcheck.c

/*
 * Copyright © 2016 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 */

#include <linux/kthread.h>

#include "../i915_selftest.h"

#include "mock_context.h"
#include "mock_drm.h"
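
/*
 * struct hang bundles everything needed to submit a spinning ("hanging")
 * batch: a scratch kernel context, the batch buffer object and a seqno
 * page (hws) that the batch writes into so the test can tell when the
 * spinner has actually started executing on the GPU.
 */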
struct hang {
	struct drm_i915_private *i915;
	struct drm_i915_gem_object *hws;
	struct drm_i915_gem_object *obj;
	struct i915_gem_context *ctx;
	u32 *seqno;
	u32 *batch;
};
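
/*
 * Allocate and map the objects used by the hanging batch: a kernel context,
 * one page for the seqno writes (mapped write-back) and one page for the
 * batch itself (write-back on LLC platforms, write-combining otherwise).
 */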
static int hang_init(struct hang *h, struct drm_i915_private *i915)
{
	void *vaddr;
	int err;

	memset(h, 0, sizeof(*h));
	h->i915 = i915;

	h->ctx = kernel_context(i915);
	if (IS_ERR(h->ctx))
		return PTR_ERR(h->ctx);

	h->hws = i915_gem_object_create_internal(i915, PAGE_SIZE);
	if (IS_ERR(h->hws)) {
		err = PTR_ERR(h->hws);
		goto err_ctx;
	}

	h->obj = i915_gem_object_create_internal(i915, PAGE_SIZE);
	if (IS_ERR(h->obj)) {
		err = PTR_ERR(h->obj);
		goto err_hws;
	}

	i915_gem_object_set_cache_level(h->hws, I915_CACHE_LLC);
	vaddr = i915_gem_object_pin_map(h->hws, I915_MAP_WB);
	if (IS_ERR(vaddr)) {
		err = PTR_ERR(vaddr);
		goto err_obj;
	}
	h->seqno = memset(vaddr, 0xff, PAGE_SIZE);

	vaddr = i915_gem_object_pin_map(h->obj,
					HAS_LLC(i915) ? I915_MAP_WB : I915_MAP_WC);
	if (IS_ERR(vaddr)) {
		err = PTR_ERR(vaddr);
		goto err_unpin_hws;
	}
	h->batch = vaddr;

	return 0;

err_unpin_hws:
	i915_gem_object_unpin_map(h->hws);
err_obj:
	i915_gem_object_put(h->obj);
err_hws:
	i915_gem_object_put(h->hws);
err_ctx:
	kernel_context_close(h->ctx);
	return err;
}
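
/*
 * Each request writes its seqno to a slot in the hws page indexed by its
 * fence context, so spinners from different contexts do not overwrite each
 * other's progress marker.
 */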
static u64 hws_address(const struct i915_vma *hws,
		       const struct i915_request *rq)
{
	return hws->node.start + offset_in_page(sizeof(u32)*rq->fence.context);
}
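
/*
 * Build the hanging batch for the request: store the request's seqno into
 * the hws page, then loop by branching back to the start of the batch with
 * MI_BATCH_BUFFER_START, using the gen-appropriate command encodings.
 * MI_ARB_CHECK provides arbitration points around the loop, and the trailing
 * MI_BATCH_BUFFER_END is never reached while the batch is left spinning.
 */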
static int emit_recurse_batch(struct hang *h,
			      struct i915_request *rq)
{
	struct drm_i915_private *i915 = h->i915;
	struct i915_address_space *vm =
		rq->ctx->ppgtt ? &rq->ctx->ppgtt->base : &i915->ggtt.base;
	struct i915_vma *hws, *vma;
	unsigned int flags;
	u32 *batch;
	int err;

	vma = i915_vma_instance(h->obj, vm, NULL);
	if (IS_ERR(vma))
		return PTR_ERR(vma);

	hws = i915_vma_instance(h->hws, vm, NULL);
	if (IS_ERR(hws))
		return PTR_ERR(hws);

	err = i915_vma_pin(vma, 0, 0, PIN_USER);
	if (err)
		return err;

	err = i915_vma_pin(hws, 0, 0, PIN_USER);
	if (err)
		goto unpin_vma;

	i915_vma_move_to_active(vma, rq, 0);
	if (!i915_gem_object_has_active_reference(vma->obj)) {
		i915_gem_object_get(vma->obj);
		i915_gem_object_set_active_reference(vma->obj);
	}

	i915_vma_move_to_active(hws, rq, 0);
	if (!i915_gem_object_has_active_reference(hws->obj)) {
		i915_gem_object_get(hws->obj);
		i915_gem_object_set_active_reference(hws->obj);
	}

	batch = h->batch;
	if (INTEL_GEN(i915) >= 8) {
		*batch++ = MI_STORE_DWORD_IMM_GEN4;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = upper_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_ARB_CHECK;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_ARB_CHECK;
		*batch++ = MI_BATCH_BUFFER_START | 1 << 8 | 1;
		*batch++ = lower_32_bits(vma->node.start);
		*batch++ = upper_32_bits(vma->node.start);
	} else if (INTEL_GEN(i915) >= 6) {
		*batch++ = MI_STORE_DWORD_IMM_GEN4;
		*batch++ = 0;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_ARB_CHECK;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_ARB_CHECK;
		*batch++ = MI_BATCH_BUFFER_START | 1 << 8;
		*batch++ = lower_32_bits(vma->node.start);
	} else if (INTEL_GEN(i915) >= 4) {
		*batch++ = MI_STORE_DWORD_IMM_GEN4 | 1 << 22;
		*batch++ = 0;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_ARB_CHECK;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_ARB_CHECK;
		*batch++ = MI_BATCH_BUFFER_START | 2 << 6;
		*batch++ = lower_32_bits(vma->node.start);
	} else {
		*batch++ = MI_STORE_DWORD_IMM;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_ARB_CHECK;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_ARB_CHECK;
		*batch++ = MI_BATCH_BUFFER_START | 2 << 6 | 1;
		*batch++ = lower_32_bits(vma->node.start);
	}
	*batch++ = MI_BATCH_BUFFER_END; /* not reached */
	i915_gem_chipset_flush(h->i915);

	flags = 0;
	if (INTEL_GEN(vm->i915) <= 5)
		flags |= I915_DISPATCH_SECURE;

	err = rq->engine->emit_bb_start(rq, vma->node.start, PAGE_SIZE, flags);

	i915_vma_unpin(hws);
unpin_vma:
	i915_vma_unpin(vma);
	return err;
}
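
/*
 * Allocate a request on @engine carrying the recursive batch. If the batch
 * object is still active from a previous hang that has not been cleaned up,
 * a fresh object is allocated so the old spinner is left undisturbed.
 */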
static struct i915_request *
hang_create_request(struct hang *h, struct intel_engine_cs *engine)
{
	struct i915_request *rq;
	int err;

	if (i915_gem_object_is_active(h->obj)) {
		struct drm_i915_gem_object *obj;
		void *vaddr;

		obj = i915_gem_object_create_internal(h->i915, PAGE_SIZE);
		if (IS_ERR(obj))
			return ERR_CAST(obj);

		vaddr = i915_gem_object_pin_map(obj,
						HAS_LLC(h->i915) ? I915_MAP_WB : I915_MAP_WC);
		if (IS_ERR(vaddr)) {
			i915_gem_object_put(obj);
			return ERR_CAST(vaddr);
		}

		i915_gem_object_unpin_map(h->obj);
		i915_gem_object_put(h->obj);

		h->obj = obj;
		h->batch = vaddr;
	}

	rq = i915_request_alloc(engine, h->ctx);
	if (IS_ERR(rq))
		return rq;

	err = emit_recurse_batch(h, rq);
	if (err) {
		__i915_request_add(rq, false);
		return ERR_PTR(err);
	}

	return rq;
}
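
/* Read back the seqno the spinner wrote for this request's fence context. */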
static u32 hws_seqno(const struct hang *h, const struct i915_request *rq)
{
	return READ_ONCE(h->seqno[rq->fence.context % (PAGE_SIZE/sizeof(u32))]);
}
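
/*
 * wedge_on_timeout() arms a delayed work that declares the GPU wedged if the
 * loop body does not complete within the given timeout, so a stuck wait
 * cannot hang the whole selftest run.
 */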
struct wedge_me {
	struct delayed_work work;
	struct drm_i915_private *i915;
	const void *symbol;
};

static void wedge_me(struct work_struct *work)
{
	struct wedge_me *w = container_of(work, typeof(*w), work.work);

	pr_err("%pS timed out, cancelling all further testing.\n",
	       w->symbol);
	i915_gem_set_wedged(w->i915);
}

static void __init_wedge(struct wedge_me *w,
			 struct drm_i915_private *i915,
			 long timeout,
			 const void *symbol)
{
	w->i915 = i915;
	w->symbol = symbol;

	INIT_DELAYED_WORK_ONSTACK(&w->work, wedge_me);
	schedule_delayed_work(&w->work, timeout);
}

static void __fini_wedge(struct wedge_me *w)
{
	cancel_delayed_work_sync(&w->work);
	destroy_delayed_work_on_stack(&w->work);
	w->i915 = NULL;
}

#define wedge_on_timeout(W, DEV, TIMEOUT)				\
	for (__init_wedge((W), (DEV), (TIMEOUT), __builtin_return_address(0)); \
	     (W)->i915;							\
	     __fini_wedge((W)))
static noinline int
flush_test(struct drm_i915_private *i915, unsigned int flags)
{
	struct wedge_me w;

	cond_resched();

	wedge_on_timeout(&w, i915, HZ)
		i915_gem_wait_for_idle(i915, flags);

	return i915_terminally_wedged(&i915->gpu_error) ? -EIO : 0;
}

static void hang_fini(struct hang *h)
{
	*h->batch = MI_BATCH_BUFFER_END;
	i915_gem_chipset_flush(h->i915);

	i915_gem_object_unpin_map(h->obj);
	i915_gem_object_put(h->obj);

	i915_gem_object_unpin_map(h->hws);
	i915_gem_object_put(h->hws);

	kernel_context_close(h->ctx);

	flush_test(h->i915, I915_WAIT_LOCKED);
}
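
/*
 * Poll the hws page until the spinner has reported its seqno, i.e. until the
 * hanging batch is confirmed to be executing on the GPU: first a short 10us
 * busy-wait, then a slower wait of up to a second.
 */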
static bool wait_for_hang(struct hang *h, struct i915_request *rq)
{
	return !(wait_for_us(i915_seqno_passed(hws_seqno(h, rq),
					       rq->fence.seqno),
			     10) &&
		 wait_for(i915_seqno_passed(hws_seqno(h, rq),
					    rq->fence.seqno),
			  1000));
}
static int igt_hang_sanitycheck(void *arg)
{
	struct drm_i915_private *i915 = arg;
	struct i915_request *rq;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	struct hang h;
	int err;

	/* Basic check that we can execute our hanging batch */

	mutex_lock(&i915->drm.struct_mutex);
	err = hang_init(&h, i915);
	if (err)
		goto unlock;

	for_each_engine(engine, i915, id) {
		long timeout;

		if (!intel_engine_can_store_dword(engine))
			continue;

		rq = hang_create_request(&h, engine);
		if (IS_ERR(rq)) {
			err = PTR_ERR(rq);
			pr_err("Failed to create request for %s, err=%d\n",
			       engine->name, err);
			goto fini;
		}

		i915_request_get(rq);

		*h.batch = MI_BATCH_BUFFER_END;
		i915_gem_chipset_flush(i915);

		__i915_request_add(rq, true);

		timeout = i915_request_wait(rq,
					    I915_WAIT_LOCKED,
					    MAX_SCHEDULE_TIMEOUT);
		i915_request_put(rq);

		if (timeout < 0) {
			err = timeout;
			pr_err("Wait for request failed on %s, err=%d\n",
			       engine->name, err);
			goto fini;
		}
	}

fini:
	hang_fini(&h);
unlock:
	mutex_unlock(&i915->drm.struct_mutex);
	return err;
}
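
/*
 * Exclude the driver's own reset handling while a test drives resets by
 * hand: take I915_RESET_BACKOFF and every per-engine reset bit, waiting for
 * any reset already in flight to finish first.
 */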
static void global_reset_lock(struct drm_i915_private *i915)
{
	struct intel_engine_cs *engine;
	enum intel_engine_id id;

	pr_debug("%s: current gpu_error=%08lx\n",
		 __func__, i915->gpu_error.flags);

	while (test_and_set_bit(I915_RESET_BACKOFF, &i915->gpu_error.flags))
		wait_event(i915->gpu_error.reset_queue,
			   !test_bit(I915_RESET_BACKOFF,
				     &i915->gpu_error.flags));

	for_each_engine(engine, i915, id) {
		while (test_and_set_bit(I915_RESET_ENGINE + id,
					&i915->gpu_error.flags))
			wait_on_bit(&i915->gpu_error.flags,
				    I915_RESET_ENGINE + id,
				    TASK_UNINTERRUPTIBLE);
	}
}

static void global_reset_unlock(struct drm_i915_private *i915)
{
	struct intel_engine_cs *engine;
	enum intel_engine_id id;

	for_each_engine(engine, i915, id)
		clear_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags);

	clear_bit(I915_RESET_BACKOFF, &i915->gpu_error.flags);
	wake_up_all(&i915->gpu_error.reset_queue);
}
static int igt_global_reset(void *arg)
{
	struct drm_i915_private *i915 = arg;
	unsigned int reset_count;
	int err = 0;

	/* Check that we can issue a global GPU reset */

	global_reset_lock(i915);
	set_bit(I915_RESET_HANDOFF, &i915->gpu_error.flags);

	mutex_lock(&i915->drm.struct_mutex);
	reset_count = i915_reset_count(&i915->gpu_error);

	i915_reset(i915, I915_RESET_QUIET);

	if (i915_reset_count(&i915->gpu_error) == reset_count) {
		pr_err("No GPU reset recorded!\n");
		err = -EINVAL;
	}
	mutex_unlock(&i915->drm.struct_mutex);

	GEM_BUG_ON(test_bit(I915_RESET_HANDOFF, &i915->gpu_error.flags));
	global_reset_unlock(i915);

	if (i915_terminally_wedged(&i915->gpu_error))
		err = -EIO;

	return err;
}
static int __igt_reset_engine(struct drm_i915_private *i915, bool active)
{
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	struct hang h;
	int err = 0;

	/* Check that we can issue an engine reset on an idle engine (no-op) */

	if (!intel_has_reset_engine(i915))
		return 0;

	if (active) {
		mutex_lock(&i915->drm.struct_mutex);
		err = hang_init(&h, i915);
		mutex_unlock(&i915->drm.struct_mutex);
		if (err)
			return err;
	}

	for_each_engine(engine, i915, id) {
		unsigned int reset_count, reset_engine_count;
		IGT_TIMEOUT(end_time);

		if (active && !intel_engine_can_store_dword(engine))
			continue;

		reset_count = i915_reset_count(&i915->gpu_error);
		reset_engine_count = i915_reset_engine_count(&i915->gpu_error,
							     engine);

		set_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags);
		do {
			if (active) {
				struct i915_request *rq;

				mutex_lock(&i915->drm.struct_mutex);
				rq = hang_create_request(&h, engine);
				if (IS_ERR(rq)) {
					err = PTR_ERR(rq);
					mutex_unlock(&i915->drm.struct_mutex);
					break;
				}

				i915_request_get(rq);
				__i915_request_add(rq, true);
				mutex_unlock(&i915->drm.struct_mutex);

				if (!wait_for_hang(&h, rq)) {
					struct drm_printer p = drm_info_printer(i915->drm.dev);

					pr_err("%s: Failed to start request %x, at %x\n",
					       __func__, rq->fence.seqno, hws_seqno(&h, rq));
					intel_engine_dump(engine, &p,
							  "%s\n", engine->name);

					i915_request_put(rq);
					err = -EIO;
					break;
				}

				i915_request_put(rq);
			}

			engine->hangcheck.stalled = true;
			engine->hangcheck.seqno =
				intel_engine_get_seqno(engine);

			err = i915_reset_engine(engine, I915_RESET_QUIET);
			if (err) {
				pr_err("i915_reset_engine failed\n");
				break;
			}

			if (i915_reset_count(&i915->gpu_error) != reset_count) {
				pr_err("Full GPU reset recorded! (engine reset expected)\n");
				err = -EINVAL;
				break;
			}

			reset_engine_count += active;
			if (i915_reset_engine_count(&i915->gpu_error, engine) !=
			    reset_engine_count) {
				pr_err("%s engine reset %srecorded!\n",
				       engine->name, active ? "not " : "");
				err = -EINVAL;
				break;
			}

			engine->hangcheck.stalled = false;
		} while (time_before(jiffies, end_time));
		clear_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags);

		if (err)
			break;

		err = flush_test(i915, 0);
		if (err)
			break;
	}

	if (i915_terminally_wedged(&i915->gpu_error))
		err = -EIO;

	if (active) {
		mutex_lock(&i915->drm.struct_mutex);
		hang_fini(&h);
		mutex_unlock(&i915->drm.struct_mutex);
	}

	return err;
}

static int igt_reset_idle_engine(void *arg)
{
	return __igt_reset_engine(arg, false);
}

static int igt_reset_active_engine(void *arg)
{
	return __igt_reset_engine(arg, true);
}
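
/*
 * Thread body run on the other (innocent) engines while one engine is being
 * reset: keep two requests in flight in a ping-pong, waiting for the older
 * one before replacing it, until asked to stop.
 */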
static int active_engine(void *data)
{
	struct intel_engine_cs *engine = data;
	struct i915_request *rq[2] = {};
	struct i915_gem_context *ctx[2];
	struct drm_file *file;
	unsigned long count = 0;
	int err = 0;

	file = mock_file(engine->i915);
	if (IS_ERR(file))
		return PTR_ERR(file);

	mutex_lock(&engine->i915->drm.struct_mutex);
	ctx[0] = live_context(engine->i915, file);
	mutex_unlock(&engine->i915->drm.struct_mutex);
	if (IS_ERR(ctx[0])) {
		err = PTR_ERR(ctx[0]);
		goto err_file;
	}

	mutex_lock(&engine->i915->drm.struct_mutex);
	ctx[1] = live_context(engine->i915, file);
	mutex_unlock(&engine->i915->drm.struct_mutex);
	if (IS_ERR(ctx[1])) {
		err = PTR_ERR(ctx[1]);
		i915_gem_context_put(ctx[0]);
		goto err_file;
	}

	while (!kthread_should_stop()) {
		unsigned int idx = count++ & 1;
		struct i915_request *old = rq[idx];
		struct i915_request *new;

		mutex_lock(&engine->i915->drm.struct_mutex);
		new = i915_request_alloc(engine, ctx[idx]);
		if (IS_ERR(new)) {
			mutex_unlock(&engine->i915->drm.struct_mutex);
			err = PTR_ERR(new);
			break;
		}

		rq[idx] = i915_request_get(new);
		i915_request_add(new);
		mutex_unlock(&engine->i915->drm.struct_mutex);

		if (old) {
			i915_request_wait(old, 0, MAX_SCHEDULE_TIMEOUT);
			i915_request_put(old);
		}
	}

	for (count = 0; count < ARRAY_SIZE(rq); count++)
		i915_request_put(rq[count]);

err_file:
	mock_file_free(engine->i915, file);
	return err;
}
static int __igt_reset_engine_others(struct drm_i915_private *i915,
				     bool active)
{
	struct intel_engine_cs *engine, *other;
	enum intel_engine_id id, tmp;
	struct hang h;
	int err = 0;

	/* Check that issuing a reset on one engine does not interfere
	 * with any other engine.
	 */

	if (!intel_has_reset_engine(i915))
		return 0;

	if (active) {
		mutex_lock(&i915->drm.struct_mutex);
		err = hang_init(&h, i915);
		mutex_unlock(&i915->drm.struct_mutex);
		if (err)
			return err;
	}

	for_each_engine(engine, i915, id) {
		struct task_struct *threads[I915_NUM_ENGINES] = {};
		unsigned long resets[I915_NUM_ENGINES];
		unsigned long global = i915_reset_count(&i915->gpu_error);
		unsigned long count = 0;
		IGT_TIMEOUT(end_time);

		if (active && !intel_engine_can_store_dword(engine))
			continue;

		memset(threads, 0, sizeof(threads));
		for_each_engine(other, i915, tmp) {
			struct task_struct *tsk;

			resets[tmp] = i915_reset_engine_count(&i915->gpu_error,
							      other);

			if (other == engine)
				continue;

			tsk = kthread_run(active_engine, other,
					  "igt/%s", other->name);
			if (IS_ERR(tsk)) {
				err = PTR_ERR(tsk);
				goto unwind;
			}

			threads[tmp] = tsk;
			get_task_struct(tsk);
		}

		set_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags);
		do {
			if (active) {
				struct i915_request *rq;

				mutex_lock(&i915->drm.struct_mutex);
				rq = hang_create_request(&h, engine);
				if (IS_ERR(rq)) {
					err = PTR_ERR(rq);
					mutex_unlock(&i915->drm.struct_mutex);
					break;
				}

				i915_request_get(rq);
				__i915_request_add(rq, true);
				mutex_unlock(&i915->drm.struct_mutex);

				if (!wait_for_hang(&h, rq)) {
					struct drm_printer p = drm_info_printer(i915->drm.dev);

					pr_err("%s: Failed to start request %x, at %x\n",
					       __func__, rq->fence.seqno, hws_seqno(&h, rq));
					intel_engine_dump(engine, &p,
							  "%s\n", engine->name);

					i915_request_put(rq);
					err = -EIO;
					break;
				}

				i915_request_put(rq);
			}

			engine->hangcheck.stalled = true;
			engine->hangcheck.seqno =
				intel_engine_get_seqno(engine);

			err = i915_reset_engine(engine, I915_RESET_QUIET);
			if (err) {
				pr_err("i915_reset_engine(%s:%s) failed, err=%d\n",
				       engine->name, active ? "active" : "idle", err);
				break;
			}

			engine->hangcheck.stalled = false;
			count++;
		} while (time_before(jiffies, end_time));
		clear_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags);
		pr_info("i915_reset_engine(%s:%s): %lu resets\n",
			engine->name, active ? "active" : "idle", count);

		if (i915_reset_engine_count(&i915->gpu_error, engine) -
		    resets[engine->id] != (active ? count : 0)) {
			pr_err("i915_reset_engine(%s:%s): reset %lu times, but reported %lu\n",
			       engine->name, active ? "active" : "idle", count,
			       i915_reset_engine_count(&i915->gpu_error,
						       engine) - resets[engine->id]);
			if (!err)
				err = -EINVAL;
		}

unwind:
		for_each_engine(other, i915, tmp) {
			int ret;

			if (!threads[tmp])
				continue;

			ret = kthread_stop(threads[tmp]);
			if (ret) {
				pr_err("kthread for other engine %s failed, err=%d\n",
				       other->name, ret);
				if (!err)
					err = ret;
			}
			put_task_struct(threads[tmp]);

			if (resets[tmp] != i915_reset_engine_count(&i915->gpu_error,
								   other)) {
				pr_err("Innocent engine %s was reset (count=%ld)\n",
				       other->name,
				       i915_reset_engine_count(&i915->gpu_error,
							       other) - resets[tmp]);
				if (!err)
					err = -EINVAL;
			}
		}

		if (global != i915_reset_count(&i915->gpu_error)) {
			pr_err("Global reset (count=%ld)!\n",
			       i915_reset_count(&i915->gpu_error) - global);
			if (!err)
				err = -EINVAL;
		}

		if (err)
			break;

		err = flush_test(i915, 0);
		if (err)
			break;
	}

	if (i915_terminally_wedged(&i915->gpu_error))
		err = -EIO;

	if (active) {
		mutex_lock(&i915->drm.struct_mutex);
		hang_fini(&h);
		mutex_unlock(&i915->drm.struct_mutex);
	}

	return err;
}

static int igt_reset_idle_engine_others(void *arg)
{
	return __igt_reset_engine_others(arg, false);
}

static int igt_reset_active_engine_others(void *arg)
{
	return __igt_reset_engine_others(arg, true);
}
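
/*
 * Mimic the hangcheck worker declaring the engine hung: mark the engine as
 * stalled on its current seqno, hand the reset off via I915_RESET_HANDOFF
 * and return the prior reset count so callers can verify that a reset was
 * actually performed.
 */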
static u32 fake_hangcheck(struct i915_request *rq)
{
	u32 reset_count;

	rq->engine->hangcheck.stalled = true;
	rq->engine->hangcheck.seqno = intel_engine_get_seqno(rq->engine);

	reset_count = i915_reset_count(&rq->i915->gpu_error);

	set_bit(I915_RESET_HANDOFF, &rq->i915->gpu_error.flags);
	wake_up_all(&rq->i915->gpu_error.wait_queue);

	return reset_count;
}
static int igt_wait_reset(void *arg)
{
	struct drm_i915_private *i915 = arg;
	struct i915_request *rq;
	unsigned int reset_count;
	struct hang h;
	long timeout;
	int err;

	if (!intel_engine_can_store_dword(i915->engine[RCS]))
		return 0;

	/* Check that we detect a stuck waiter and issue a reset */

	global_reset_lock(i915);

	mutex_lock(&i915->drm.struct_mutex);
	err = hang_init(&h, i915);
	if (err)
		goto unlock;

	rq = hang_create_request(&h, i915->engine[RCS]);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto fini;
	}

	i915_request_get(rq);
	__i915_request_add(rq, true);

	if (!wait_for_hang(&h, rq)) {
		struct drm_printer p = drm_info_printer(i915->drm.dev);

		pr_err("%s: Failed to start request %x, at %x\n",
		       __func__, rq->fence.seqno, hws_seqno(&h, rq));
		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);

		i915_reset(i915, 0);
		i915_gem_set_wedged(i915);

		err = -EIO;
		goto out_rq;
	}

	reset_count = fake_hangcheck(rq);

	timeout = i915_request_wait(rq, I915_WAIT_LOCKED, 10);
	if (timeout < 0) {
		pr_err("i915_request_wait failed on a stuck request: err=%ld\n",
		       timeout);
		err = timeout;
		goto out_rq;
	}

	GEM_BUG_ON(test_bit(I915_RESET_HANDOFF, &i915->gpu_error.flags));
	if (i915_reset_count(&i915->gpu_error) == reset_count) {
		pr_err("No GPU reset recorded!\n");
		err = -EINVAL;
		goto out_rq;
	}

out_rq:
	i915_request_put(rq);
fini:
	hang_fini(&h);
unlock:
	mutex_unlock(&i915->drm.struct_mutex);
	global_reset_unlock(i915);

	if (i915_terminally_wedged(&i915->gpu_error))
		return -EIO;

	return err;
}
static int igt_reset_queue(void *arg)
{
	struct drm_i915_private *i915 = arg;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	struct hang h;
	int err;

	/* Check that we replay pending requests following a hang */

	global_reset_lock(i915);

	mutex_lock(&i915->drm.struct_mutex);
	err = hang_init(&h, i915);
	if (err)
		goto unlock;

	for_each_engine(engine, i915, id) {
		struct i915_request *prev;
		IGT_TIMEOUT(end_time);
		unsigned int count;

		if (!intel_engine_can_store_dword(engine))
			continue;

		prev = hang_create_request(&h, engine);
		if (IS_ERR(prev)) {
			err = PTR_ERR(prev);
			goto fini;
		}

		i915_request_get(prev);
		__i915_request_add(prev, true);

		count = 0;
		do {
			struct i915_request *rq;
			unsigned int reset_count;

			rq = hang_create_request(&h, engine);
			if (IS_ERR(rq)) {
				err = PTR_ERR(rq);
				goto fini;
			}

			i915_request_get(rq);
			__i915_request_add(rq, true);

			if (!wait_for_hang(&h, prev)) {
				struct drm_printer p = drm_info_printer(i915->drm.dev);

				pr_err("%s: Failed to start request %x, at %x\n",
				       __func__, prev->fence.seqno, hws_seqno(&h, prev));
				intel_engine_dump(prev->engine, &p,
						  "%s\n", prev->engine->name);

				i915_request_put(rq);
				i915_request_put(prev);

				i915_reset(i915, 0);
				i915_gem_set_wedged(i915);

				err = -EIO;
				goto fini;
			}

			reset_count = fake_hangcheck(prev);

			i915_reset(i915, I915_RESET_QUIET);

			GEM_BUG_ON(test_bit(I915_RESET_HANDOFF,
					    &i915->gpu_error.flags));

			if (prev->fence.error != -EIO) {
				pr_err("GPU reset not recorded on hanging request [fence.error=%d]!\n",
				       prev->fence.error);
				i915_request_put(rq);
				i915_request_put(prev);
				err = -EINVAL;
				goto fini;
			}

			if (rq->fence.error) {
				pr_err("Fence error status not zero [%d] after unrelated reset\n",
				       rq->fence.error);
				i915_request_put(rq);
				i915_request_put(prev);
				err = -EINVAL;
				goto fini;
			}

			if (i915_reset_count(&i915->gpu_error) == reset_count) {
				pr_err("No GPU reset recorded!\n");
				i915_request_put(rq);
				i915_request_put(prev);
				err = -EINVAL;
				goto fini;
			}

			i915_request_put(prev);
			prev = rq;
			count++;
		} while (time_before(jiffies, end_time));
		pr_info("%s: Completed %d resets\n", engine->name, count);

		*h.batch = MI_BATCH_BUFFER_END;
		i915_gem_chipset_flush(i915);

		i915_request_put(prev);

		err = flush_test(i915, I915_WAIT_LOCKED);
		if (err)
			break;
	}

fini:
	hang_fini(&h);
unlock:
	mutex_unlock(&i915->drm.struct_mutex);
	global_reset_unlock(i915);

	if (i915_terminally_wedged(&i915->gpu_error))
		return -EIO;

	return err;
}
static int igt_handle_error(void *arg)
{
	struct drm_i915_private *i915 = arg;
	struct intel_engine_cs *engine = i915->engine[RCS];
	struct hang h;
	struct i915_request *rq;
	struct i915_gpu_state *error;
	int err;

	/* Check that we can issue a global GPU and engine reset */

	if (!intel_has_reset_engine(i915))
		return 0;

	if (!intel_engine_can_store_dword(i915->engine[RCS]))
		return 0;

	mutex_lock(&i915->drm.struct_mutex);

	err = hang_init(&h, i915);
	if (err)
		goto err_unlock;

	rq = hang_create_request(&h, engine);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto err_fini;
	}

	i915_request_get(rq);
	__i915_request_add(rq, true);

	if (!wait_for_hang(&h, rq)) {
		struct drm_printer p = drm_info_printer(i915->drm.dev);

		pr_err("%s: Failed to start request %x, at %x\n",
		       __func__, rq->fence.seqno, hws_seqno(&h, rq));
		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);

		i915_reset(i915, 0);
		i915_gem_set_wedged(i915);

		err = -EIO;
		goto err_request;
	}

	mutex_unlock(&i915->drm.struct_mutex);

	/* Temporarily disable error capture */
	error = xchg(&i915->gpu_error.first_error, (void *)-1);

	engine->hangcheck.stalled = true;
	engine->hangcheck.seqno = intel_engine_get_seqno(engine);

	i915_handle_error(i915, intel_engine_flag(engine), "%s", __func__);

	xchg(&i915->gpu_error.first_error, error);

	mutex_lock(&i915->drm.struct_mutex);

	if (rq->fence.error != -EIO) {
		pr_err("Guilty request not identified!\n");
		err = -EINVAL;
		goto err_request;
	}

err_request:
	i915_request_put(rq);
err_fini:
	hang_fini(&h);
err_unlock:
	mutex_unlock(&i915->drm.struct_mutex);
	return err;
}
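
/*
 * Entry point for the live hangcheck/reset selftests. The enable_hangcheck
 * modparam is cleared for the duration of the run, presumably so the
 * periodic hangcheck worker does not race with the resets the tests trigger
 * themselves, and a runtime PM reference is held to keep the device awake.
 */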
int intel_hangcheck_live_selftests(struct drm_i915_private *i915)
{
	static const struct i915_subtest tests[] = {
		SUBTEST(igt_global_reset), /* attempt to recover GPU first */
		SUBTEST(igt_hang_sanitycheck),
		SUBTEST(igt_reset_idle_engine),
		SUBTEST(igt_reset_active_engine),
		SUBTEST(igt_reset_idle_engine_others),
		SUBTEST(igt_reset_active_engine_others),
		SUBTEST(igt_wait_reset),
		SUBTEST(igt_reset_queue),
		SUBTEST(igt_handle_error),
	};
	bool saved_hangcheck;
	int err;

	if (!intel_has_gpu_reset(i915))
		return 0;

	intel_runtime_pm_get(i915);
	saved_hangcheck = fetch_and_zero(&i915_modparams.enable_hangcheck);

	err = i915_subtests(tests, i915);

	i915_modparams.enable_hangcheck = saved_hangcheck;
	intel_runtime_pm_put(i915);

	return err;
}