i915_gpu_error.h

/*
 * SPDX-License-Identifier: MIT
 *
 * Copyright © 2008-2018 Intel Corporation
 */

#ifndef _I915_GPU_ERROR_H_
#define _I915_GPU_ERROR_H_

#include <linux/kref.h>
#include <linux/ktime.h>
#include <linux/sched.h>

#include <drm/drm_mm.h>

#include "intel_device_info.h"
#include "intel_ringbuffer.h"
#include "intel_uc_fw.h"

#include "i915_gem.h"
#include "i915_gem_gtt.h"
#include "i915_params.h"
#include "i915_scheduler.h"

struct drm_i915_private;
struct intel_overlay_error_state;
struct intel_display_error_state;

struct i915_gpu_state {
	struct kref ref;
	ktime_t time;
	ktime_t boottime;
	ktime_t uptime;
	unsigned long capture;
	unsigned long epoch;

	struct drm_i915_private *i915;

	char error_msg[128];
	bool simulated;
	bool awake;
	bool wakelock;
	bool suspended;
	int iommu;
	u32 reset_count;
	u32 suspend_count;
	struct intel_device_info device_info;
	struct intel_driver_caps driver_caps;
	struct i915_params params;

	struct i915_error_uc {
		struct intel_uc_fw guc_fw;
		struct intel_uc_fw huc_fw;
		struct drm_i915_error_object *guc_log;
	} uc;

	/* Generic register state */
	u32 eir;
	u32 pgtbl_er;
	u32 ier;
	u32 gtier[4], ngtier;
	u32 ccid;
	u32 derrmr;
	u32 forcewake;
	u32 error; /* gen6+ */
	u32 err_int; /* gen7 */
	u32 fault_data0; /* gen8, gen9 */
	u32 fault_data1; /* gen8, gen9 */
	u32 done_reg;
	u32 gac_eco;
	u32 gam_ecochk;
	u32 gab_ctl;
	u32 gfx_mode;

	u32 nfence;
	u64 fence[I915_MAX_NUM_FENCES];
	struct intel_overlay_error_state *overlay;
	struct intel_display_error_state *display;

	struct drm_i915_error_engine {
		int engine_id;
		/* Software tracked state */
		bool idle;
		bool waiting;
		int num_waiters;
		unsigned long hangcheck_timestamp;
		bool hangcheck_stalled;
		enum intel_engine_hangcheck_action hangcheck_action;
		struct i915_address_space *vm;
		int num_requests;
		u32 reset_count;

		/* position of active request inside the ring */
		u32 rq_head, rq_post, rq_tail;

		/* our own tracking of ring head and tail */
		u32 cpu_ring_head;
		u32 cpu_ring_tail;

		u32 last_seqno;

		/* Register state */
		u32 start;
		u32 tail;
		u32 head;
		u32 ctl;
		u32 mode;
		u32 hws;
		u32 ipeir;
		u32 ipehr;
		u32 bbstate;
		u32 instpm;
		u32 instps;
		u32 seqno;
		u64 bbaddr;
		u64 acthd;
		u32 fault_reg;
		u64 faddr;
		u32 rc_psmi; /* sleep state */
		u32 semaphore_mboxes[I915_NUM_ENGINES - 1];
		struct intel_instdone instdone;

		struct drm_i915_error_context {
			char comm[TASK_COMM_LEN];
			pid_t pid;
			u32 handle;
			u32 hw_id;
			int ban_score;
			int active;
			int guilty;
			bool bannable;
			struct i915_sched_attr sched_attr;
		} context;

		struct drm_i915_error_object {
			u64 gtt_offset;
			u64 gtt_size;
			int page_count;
			int unused;
			u32 *pages[0];
		} *ringbuffer, *batchbuffer, *wa_batchbuffer, *ctx, *hws_page;

		struct drm_i915_error_object **user_bo;
		long user_bo_count;

		struct drm_i915_error_object *wa_ctx;
		struct drm_i915_error_object *default_state;

		struct drm_i915_error_request {
			long jiffies;
			pid_t pid;
			u32 context;
			int ban_score;
			u32 seqno;
			u32 start;
			u32 head;
			u32 tail;
			struct i915_sched_attr sched_attr;
		} *requests, execlist[EXECLIST_MAX_PORTS];
		unsigned int num_ports;

		struct drm_i915_error_waiter {
			char comm[TASK_COMM_LEN];
			pid_t pid;
			u32 seqno;
		} *waiters;

		struct {
			u32 gfx_mode;
			union {
				u64 pdp[4];
				u32 pp_dir_base;
			};
		} vm_info;
	} engine[I915_NUM_ENGINES];

	struct drm_i915_error_buffer {
		u32 size;
		u32 name;
		u32 rseqno[I915_NUM_ENGINES], wseqno;
		u64 gtt_offset;
		u32 read_domains;
		u32 write_domain;
		s32 fence_reg:I915_MAX_NUM_FENCE_BITS;
		u32 tiling:2;
		u32 dirty:1;
		u32 purgeable:1;
		u32 userptr:1;
		s32 engine:4;
		u32 cache_level:3;
	} *active_bo[I915_NUM_ENGINES], *pinned_bo;
	u32 active_bo_count[I915_NUM_ENGINES], pinned_bo_count;
	struct i915_address_space *active_vm[I915_NUM_ENGINES];
};

struct i915_gpu_error {
	/* For hangcheck timer */
#define DRM_I915_HANGCHECK_PERIOD 1500 /* in ms */
#define DRM_I915_HANGCHECK_JIFFIES msecs_to_jiffies(DRM_I915_HANGCHECK_PERIOD)

	struct delayed_work hangcheck_work;

	/* For reset and error_state handling. */
	spinlock_t lock;
	/* Protected by the above dev->gpu_error.lock. */
	struct i915_gpu_state *first_error;

	atomic_t pending_fb_pin;

	unsigned long missed_irq_rings;

	/**
	 * State variable controlling the reset flow and count
	 *
	 * This is a counter which gets incremented when a reset is triggered.
	 *
	 * Before the reset commences, the I915_RESET_BACKOFF bit is set
	 * meaning that any waiters holding onto the struct_mutex should
	 * relinquish the lock immediately in order for the reset to start.
	 *
	 * If the reset is not completed successfully, the I915_WEDGED bit is
	 * set meaning that the hardware is terminally sour and there is no
	 * recovery. All waiters on the reset_queue will be woken when
	 * that happens.
	 *
	 * This counter is used by the wait_seqno code to notice that a reset
	 * event happened and that it needs to restart the entire ioctl (since
	 * most likely the seqno it waited for won't ever signal anytime soon).
	 *
	 * This is important for lock-free wait paths, where no contended lock
	 * naturally enforces the correct ordering between the bail-out of the
	 * waiter and the gpu reset work code.
	 */
	unsigned long reset_count;

	/**
	 * flags: Control various stages of the GPU reset
	 *
	 * #I915_RESET_BACKOFF - When we start a reset, we want to stop any
	 * other users acquiring the struct_mutex. To do this we set the
	 * #I915_RESET_BACKOFF bit in the error flags when we detect a reset
	 * and then check for that bit before acquiring the struct_mutex (in
	 * i915_mutex_lock_interruptible()?). I915_RESET_BACKOFF serves a
	 * secondary role in preventing two concurrent global reset attempts.
	 *
	 * #I915_RESET_HANDOFF - To perform the actual GPU reset, we need the
	 * struct_mutex. We try to acquire the struct_mutex in the reset worker,
	 * but it may be held by some long running waiter (that we cannot
	 * interrupt without causing trouble). Once we are ready to do the GPU
	 * reset, we set the I915_RESET_HANDOFF bit and wakeup any waiters. If
	 * they already hold the struct_mutex and want to participate they can
	 * inspect the bit and do the reset directly, otherwise the worker
	 * waits for the struct_mutex.
	 *
	 * #I915_RESET_ENGINE[num_engines] - Since the driver doesn't need to
	 * acquire the struct_mutex to reset an engine, we need an explicit
	 * flag to prevent two concurrent reset attempts on the same engine.
	 * As the number of engines continues to grow, allocate the flags from
	 * the most significant bits.
	 *
	 * #I915_WEDGED - If reset fails and we can no longer use the GPU,
	 * we set the #I915_WEDGED bit. Prior to command submission, e.g.
	 * i915_request_alloc(), this bit is checked and the sequence
	 * aborted (with -EIO reported to userspace) if set.
	 */
	unsigned long flags;
#define I915_RESET_BACKOFF	0
#define I915_RESET_HANDOFF	1
#define I915_RESET_MODESET	2
#define I915_WEDGED		(BITS_PER_LONG - 1)
#define I915_RESET_ENGINE	(I915_WEDGED - I915_NUM_ENGINES)

	/** Number of times an engine has been reset */
	u32 reset_engine_count[I915_NUM_ENGINES];

	/** Set of stalled engines with guilty requests, in the current reset */
	u32 stalled_mask;

	/** Reason for the current *global* reset */
	const char *reason;

	/**
	 * Waitqueue to signal when a hang is detected. Used for waiters
	 * to release the struct_mutex so the reset can proceed.
	 */
	wait_queue_head_t wait_queue;

	/**
	 * Waitqueue to signal when the reset has completed. Used by clients
	 * that wait for dev_priv->mm.wedged to settle.
	 */
	wait_queue_head_t reset_queue;

	/* For missed irq/seqno simulation. */
	unsigned long test_irq_rings;
};
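
/*
 * Illustrative only: a minimal sketch, not part of the original header, of
 * how the reset bits documented above are typically consumed, assuming
 * test_bit() from <linux/bitops.h> is visible here. The driver's real
 * helpers live elsewhere (e.g. in i915_drv.h); the names below are
 * placeholders.
 */
static inline bool __example_reset_backoff(const struct i915_gpu_error *error)
{
	/* A global reset has been requested; struct_mutex waiters should back off. */
	return test_bit(I915_RESET_BACKOFF, &error->flags);
}

static inline bool __example_terminally_wedged(const struct i915_gpu_error *error)
{
	/* Reset failed; new submissions are expected to be refused with -EIO. */
	return test_bit(I915_WEDGED, &error->flags);
}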

struct drm_i915_error_state_buf {
	struct drm_i915_private *i915;
	unsigned int bytes;
	unsigned int size;
	int err;
	u8 *buf;
	loff_t start;
	loff_t pos;
};

#if IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR)

__printf(2, 3)
void i915_error_printf(struct drm_i915_error_state_buf *e, const char *f, ...);
int i915_error_state_to_str(struct drm_i915_error_state_buf *estr,
			    const struct i915_gpu_state *gpu);
int i915_error_state_buf_init(struct drm_i915_error_state_buf *eb,
			      struct drm_i915_private *i915,
			      size_t count, loff_t pos);

static inline void
i915_error_state_buf_release(struct drm_i915_error_state_buf *eb)
{
	kfree(eb->buf);
}
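
/*
 * Illustrative only: a hypothetical helper sketching how the buffer API
 * above fits together - initialise a buffer over the caller's window,
 * format a captured state into it, then release the storage. The function
 * name, the kernel-buffer copy-out step and the availability of memcpy()
 * here are assumptions, not part of this header.
 */
static inline ssize_t
__example_format_error_state(struct drm_i915_private *i915,
			     const struct i915_gpu_state *gpu,
			     char *out, size_t count, loff_t pos)
{
	struct drm_i915_error_state_buf str;
	ssize_t ret;

	/* Format at most @count bytes, starting @pos bytes into the report. */
	ret = i915_error_state_buf_init(&str, i915, count, pos);
	if (ret)
		return ret;

	ret = i915_error_state_to_str(&str, gpu);
	if (ret == 0) {
		memcpy(out, str.buf, str.bytes); /* assumed kernel buffer */
		ret = str.bytes;
	}

	i915_error_state_buf_release(&str);
	return ret;
}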

struct i915_gpu_state *i915_capture_gpu_state(struct drm_i915_private *i915);
void i915_capture_error_state(struct drm_i915_private *dev_priv,
			      u32 engine_mask,
			      const char *error_msg);

static inline struct i915_gpu_state *
i915_gpu_state_get(struct i915_gpu_state *gpu)
{
	kref_get(&gpu->ref);
	return gpu;
}

void __i915_gpu_state_free(struct kref *kref);
static inline void i915_gpu_state_put(struct i915_gpu_state *gpu)
{
	if (gpu)
		kref_put(&gpu->ref, __i915_gpu_state_free);
}
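
/*
 * Illustrative only: a minimal sketch of the capture/reference lifecycle,
 * assuming i915_capture_gpu_state() hands back a snapshot holding its own
 * reference (or NULL on failure) that the caller drops with
 * i915_gpu_state_put(). The helper name and the "inspect" step are
 * placeholders, not part of this header.
 */
static inline bool __example_snapshot_was_simulated(struct drm_i915_private *i915)
{
	struct i915_gpu_state *gpu;
	bool simulated = false;

	gpu = i915_capture_gpu_state(i915);
	if (gpu) {
		simulated = gpu->simulated;	/* inspect the snapshot */
		i915_gpu_state_put(gpu);	/* drop our reference */
	}

	return simulated;
}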

struct i915_gpu_state *i915_first_error_state(struct drm_i915_private *i915);
void i915_reset_error_state(struct drm_i915_private *i915);

#else

static inline void i915_capture_error_state(struct drm_i915_private *dev_priv,
					    u32 engine_mask,
					    const char *error_msg)
{
}

static inline struct i915_gpu_state *
i915_first_error_state(struct drm_i915_private *i915)
{
	return NULL;
}

static inline void i915_reset_error_state(struct drm_i915_private *i915)
{
}

#endif /* IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR) */

#endif /* _I915_GPU_ERROR_H_ */