i915_gpu_error.h

/*
 * SPDX-License-Identifier: MIT
 *
 * Copyright © 2008-2018 Intel Corporation
 */

#ifndef _I915_GPU_ERROR_H_
#define _I915_GPU_ERROR_H_

#include <linux/kref.h>
#include <linux/ktime.h>
#include <linux/sched.h>

#include <drm/drm_mm.h>

#include "intel_device_info.h"
#include "intel_ringbuffer.h"
#include "intel_uc_fw.h"

#include "i915_gem.h"
#include "i915_gem_gtt.h"
#include "i915_params.h"
#include "i915_scheduler.h"

struct drm_i915_private;
struct intel_overlay_error_state;
struct intel_display_error_state;
struct i915_gpu_state {
	struct kref ref;
	ktime_t time;
	ktime_t boottime;
	ktime_t uptime;
	unsigned long capture;
	unsigned long epoch;

	struct drm_i915_private *i915;

	char error_msg[128];
	bool simulated;
	bool awake;
	bool wakelock;
	bool suspended;
	int iommu;
	u32 reset_count;
	u32 suspend_count;
	struct intel_device_info device_info;
	struct intel_driver_caps driver_caps;
	struct i915_params params;

	struct i915_error_uc {
		struct intel_uc_fw guc_fw;
		struct intel_uc_fw huc_fw;
		struct drm_i915_error_object *guc_log;
	} uc;

	/* Generic register state */
	u32 eir;
	u32 pgtbl_er;
	u32 ier;
	u32 gtier[6], ngtier;
	u32 ccid;
	u32 derrmr;
	u32 forcewake;
	u32 error; /* gen6+ */
	u32 err_int; /* gen7 */
	u32 fault_data0; /* gen8, gen9 */
	u32 fault_data1; /* gen8, gen9 */
	u32 done_reg;
	u32 gac_eco;
	u32 gam_ecochk;
	u32 gab_ctl;
	u32 gfx_mode;

	u32 nfence;
	u64 fence[I915_MAX_NUM_FENCES];
	struct intel_overlay_error_state *overlay;
	struct intel_display_error_state *display;

	struct drm_i915_error_engine {
		int engine_id;
		/* Software tracked state */
		bool idle;
		bool waiting;
		int num_waiters;
		unsigned long hangcheck_timestamp;
		bool hangcheck_stalled;
		enum intel_engine_hangcheck_action hangcheck_action;
		struct i915_address_space *vm;
		int num_requests;
		u32 reset_count;

		/* position of active request inside the ring */
		u32 rq_head, rq_post, rq_tail;

		/* our own tracking of ring head and tail */
		u32 cpu_ring_head;
		u32 cpu_ring_tail;

		u32 last_seqno;

		/* Register state */
		u32 start;
		u32 tail;
		u32 head;
		u32 ctl;
		u32 mode;
		u32 hws;
		u32 ipeir;
		u32 ipehr;
		u32 bbstate;
		u32 instpm;
		u32 instps;
		u32 seqno;
		u64 bbaddr;
		u64 acthd;
		u32 fault_reg;
		u64 faddr;
		u32 rc_psmi; /* sleep state */
		u32 semaphore_mboxes[I915_NUM_ENGINES - 1];
		struct intel_instdone instdone;

		struct drm_i915_error_context {
			char comm[TASK_COMM_LEN];
			pid_t pid;
			u32 handle;
			u32 hw_id;
			int ban_score;
			int active;
			int guilty;
			bool bannable;
			struct i915_sched_attr sched_attr;
		} context;

		struct drm_i915_error_object {
			u64 gtt_offset;
			u64 gtt_size;
			int num_pages;
			int page_count;
			int unused;
			u32 *pages[0];
		} *ringbuffer, *batchbuffer, *wa_batchbuffer, *ctx, *hws_page;

		struct drm_i915_error_object **user_bo;
		long user_bo_count;

		struct drm_i915_error_object *wa_ctx;
		struct drm_i915_error_object *default_state;

		struct drm_i915_error_request {
			long jiffies;
			pid_t pid;
			u32 context;
			int ban_score;
			u32 seqno;
			u32 start;
			u32 head;
			u32 tail;
			struct i915_sched_attr sched_attr;
		} *requests, execlist[EXECLIST_MAX_PORTS];
		unsigned int num_ports;

		struct drm_i915_error_waiter {
			char comm[TASK_COMM_LEN];
			pid_t pid;
			u32 seqno;
		} *waiters;

		struct {
			u32 gfx_mode;
			union {
				u64 pdp[4];
				u32 pp_dir_base;
			};
		} vm_info;
	} engine[I915_NUM_ENGINES];

	struct drm_i915_error_buffer {
		u32 size;
		u32 name;
		u32 wseqno;
		u64 gtt_offset;
		u32 read_domains;
		u32 write_domain;
		s32 fence_reg:I915_MAX_NUM_FENCE_BITS;
		u32 tiling:2;
		u32 dirty:1;
		u32 purgeable:1;
		u32 userptr:1;
		s32 engine:4;
		u32 cache_level:3;
	} *active_bo[I915_NUM_ENGINES], *pinned_bo;
	u32 active_bo_count[I915_NUM_ENGINES], pinned_bo_count;
	struct i915_address_space *active_vm[I915_NUM_ENGINES];
};
struct i915_gpu_error {
	/* For hangcheck timer */
#define DRM_I915_HANGCHECK_PERIOD 1500 /* in ms */
#define DRM_I915_HANGCHECK_JIFFIES msecs_to_jiffies(DRM_I915_HANGCHECK_PERIOD)

	struct delayed_work hangcheck_work;

	/* For reset and error_state handling. */
	spinlock_t lock;
	/* Protected by the above dev->gpu_error.lock. */
	struct i915_gpu_state *first_error;

	atomic_t pending_fb_pin;

	unsigned long missed_irq_rings;
	/**
	 * State variable controlling the reset flow and count.
	 *
	 * This is a counter which gets incremented every time a reset is
	 * triggered.
	 *
	 * Before the reset commences, the I915_RESET_BACKOFF bit is set
	 * meaning that any waiters holding onto the struct_mutex should
	 * relinquish the lock immediately in order for the reset to start.
	 *
	 * If the reset is not completed successfully, the I915_WEDGED bit is
	 * set meaning that the hardware is terminally sour and there is no
	 * recovery. All waiters on the reset_queue will be woken when
	 * that happens.
	 *
	 * This counter is used by the wait_seqno code to notice that a reset
	 * event happened and that it needs to restart the entire ioctl (since
	 * most likely the seqno it waited for won't ever signal anytime soon).
	 *
	 * This is important for lock-free wait paths, where no contended lock
	 * naturally enforces the correct ordering between the bail-out of the
	 * waiter and the gpu reset work code.
	 */
	unsigned long reset_count;
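	/*
	 * Illustrative sketch (not part of the original header): lock-free
	 * waiters typically sample reset_count before sleeping and compare it
	 * again afterwards; a changed value means a reset fired while they
	 * slept and the ioctl has to be restarted, e.g.
	 *
	 *	unsigned long pre = READ_ONCE(error->reset_count);
	 *	(wait for the seqno to signal)
	 *	if (READ_ONCE(error->reset_count) != pre)
	 *		return -EAGAIN;	(restart the ioctl)
	 */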
	/**
	 * flags: Control various stages of the GPU reset
	 *
	 * #I915_RESET_BACKOFF - When we start a reset, we want to stop any
	 * other users acquiring the struct_mutex. To do this we set the
	 * #I915_RESET_BACKOFF bit in the error flags when we detect a reset
	 * and then check for that bit before acquiring the struct_mutex (in
	 * i915_mutex_lock_interruptible()?). I915_RESET_BACKOFF serves a
	 * secondary role in preventing two concurrent global reset attempts.
	 *
	 * #I915_RESET_HANDOFF - To perform the actual GPU reset, we need the
	 * struct_mutex. We try to acquire the struct_mutex in the reset worker,
	 * but it may be held by some long running waiter (that we cannot
	 * interrupt without causing trouble). Once we are ready to do the GPU
	 * reset, we set the I915_RESET_HANDOFF bit and wake up any waiters. If
	 * they already hold the struct_mutex and want to participate they can
	 * inspect the bit and do the reset directly, otherwise the worker
	 * waits for the struct_mutex.
	 *
	 * #I915_RESET_ENGINE[num_engines] - Since the driver doesn't need to
	 * acquire the struct_mutex to reset an engine, we need an explicit
	 * flag to prevent two concurrent reset attempts in the same engine.
	 * As the number of engines continues to grow, allocate the flags from
	 * the most significant bits.
	 *
	 * #I915_WEDGED - If reset fails and we can no longer use the GPU,
	 * we set the #I915_WEDGED bit. Prior to command submission, e.g.
	 * i915_request_alloc(), this bit is checked and the sequence
	 * aborted (with -EIO reported to userspace) if set.
	 */
	unsigned long flags;
#define I915_RESET_BACKOFF	0
#define I915_RESET_HANDOFF	1
#define I915_RESET_MODESET	2
#define I915_WEDGED		(BITS_PER_LONG - 1)
#define I915_RESET_ENGINE	(I915_WEDGED - I915_NUM_ENGINES)
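	/*
	 * Illustrative sketch (not part of the original header): the values
	 * above are bit positions within 'flags' and are intended for the
	 * generic bitops. For example, a submission path can bail out early
	 * when the device is wedged:
	 *
	 *	if (test_bit(I915_WEDGED, &i915->gpu_error.flags))
	 *		return -EIO;
	 *
	 * and a per-engine reset can guard against a concurrent attempt on
	 * the same engine with
	 * test_and_set_bit(I915_RESET_ENGINE + engine->id, &error->flags).
	 */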
	/** Number of times an engine has been reset */
	u32 reset_engine_count[I915_NUM_ENGINES];

	/** Set of stalled engines with guilty requests, in the current reset */
	u32 stalled_mask;

	/** Reason for the current *global* reset */
	const char *reason;

	/**
	 * Waitqueue to signal when a hang is detected. Used for waiters
	 * to release the struct_mutex so that the reset can proceed.
	 */
	wait_queue_head_t wait_queue;

	/**
	 * Waitqueue to signal when the reset has completed. Used by clients
	 * that wait for dev_priv->mm.wedged to settle.
	 */
	wait_queue_head_t reset_queue;

	/* For missed irq/seqno simulation. */
	unsigned long test_irq_rings;
};
struct drm_i915_error_state_buf {
	struct drm_i915_private *i915;
	unsigned int bytes;
	unsigned int size;
	int err;
	u8 *buf;
	loff_t start;
	loff_t pos;
};
#if IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR)

__printf(2, 3)
void i915_error_printf(struct drm_i915_error_state_buf *e, const char *f, ...);
int i915_error_state_to_str(struct drm_i915_error_state_buf *estr,
			    const struct i915_gpu_state *gpu);
int i915_error_state_buf_init(struct drm_i915_error_state_buf *eb,
			      struct drm_i915_private *i915,
			      size_t count, loff_t pos);

static inline void
i915_error_state_buf_release(struct drm_i915_error_state_buf *eb)
{
	kfree(eb->buf);
}
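
/*
 * Illustrative sketch (not part of the original header): a consumer such as a
 * debugfs or sysfs reader typically pairs these helpers as
 *
 *	struct drm_i915_error_state_buf str;
 *	int ret;
 *
 *	ret = i915_error_state_buf_init(&str, i915, count, pos);
 *	if (ret)
 *		return ret;
 *	ret = i915_error_state_to_str(&str, gpu);
 *	(copy str.buf, str.bytes back to the caller)
 *	i915_error_state_buf_release(&str);
 */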
struct i915_gpu_state *i915_capture_gpu_state(struct drm_i915_private *i915);
void i915_capture_error_state(struct drm_i915_private *dev_priv,
			      u32 engine_mask,
			      const char *error_msg);

static inline struct i915_gpu_state *
i915_gpu_state_get(struct i915_gpu_state *gpu)
{
	kref_get(&gpu->ref);
	return gpu;
}

void __i915_gpu_state_free(struct kref *kref);
static inline void i915_gpu_state_put(struct i915_gpu_state *gpu)
{
	if (gpu)
		kref_put(&gpu->ref, __i915_gpu_state_free);
}

struct i915_gpu_state *i915_first_error_state(struct drm_i915_private *i915);
void i915_reset_error_state(struct drm_i915_private *i915);
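
/*
 * Illustrative sketch (not part of the original header): the captured state
 * is reference counted through the embedded kref, so a reader takes and
 * drops its own reference around use, e.g.
 *
 *	struct i915_gpu_state *gpu;
 *
 *	gpu = i915_first_error_state(i915);
 *	if (gpu) {
 *		(format it with i915_error_state_to_str())
 *		i915_gpu_state_put(gpu);
 *	}
 */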
#else

static inline void i915_capture_error_state(struct drm_i915_private *dev_priv,
					    u32 engine_mask,
					    const char *error_msg)
{
}

static inline struct i915_gpu_state *
i915_first_error_state(struct drm_i915_private *i915)
{
	return NULL;
}

static inline void i915_reset_error_state(struct drm_i915_private *i915)
{
}

#endif /* IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR) */

#endif /* _I915_GPU_ERROR_H_ */