Эх сурвалжийг харах

drm/amdgpu: add option to stop on VM fault

Signed-off-by: Christian König <christian.koenig@amd.com>
Reviewed-by: Alex Deucher <alexander.deucher@amd.com>
Christian König 10 жил өмнө
parent
commit
d9c13156a6

+ 6 - 0
drivers/gpu/drm/amd/amdgpu/amdgpu.h

@@ -79,6 +79,7 @@ extern int amdgpu_bapm;
 extern int amdgpu_deep_color;
 extern int amdgpu_deep_color;
 extern int amdgpu_vm_size;
 extern int amdgpu_vm_size;
 extern int amdgpu_vm_block_size;
 extern int amdgpu_vm_block_size;
+extern int amdgpu_vm_fault_stop;
 extern int amdgpu_enable_scheduler;
 extern int amdgpu_enable_scheduler;
 extern int amdgpu_sched_jobs;
 extern int amdgpu_sched_jobs;
 extern int amdgpu_sched_hw_submission;
 extern int amdgpu_sched_hw_submission;
@@ -960,6 +961,11 @@ struct amdgpu_ring {
 #define AMDGPU_PTE_FRAG_64KB	(4 << 7)
 #define AMDGPU_PTE_FRAG_64KB	(4 << 7)
 #define AMDGPU_LOG2_PAGES_PER_FRAG 4
 #define AMDGPU_LOG2_PAGES_PER_FRAG 4
 
 
+/* How to programm VM fault handling */
+#define AMDGPU_VM_FAULT_STOP_NEVER	0
+#define AMDGPU_VM_FAULT_STOP_FIRST	1
+#define AMDGPU_VM_FAULT_STOP_ALWAYS	2
+
 struct amdgpu_vm_pt {
 struct amdgpu_vm_pt {
 	struct amdgpu_bo		*bo;
 	struct amdgpu_bo		*bo;
 	uint64_t			addr;
 	uint64_t			addr;

+ 4 - 0
drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c

@@ -75,6 +75,7 @@ int amdgpu_bapm = -1;
 int amdgpu_deep_color = 0;
 int amdgpu_deep_color = 0;
 int amdgpu_vm_size = 8;
 int amdgpu_vm_size = 8;
 int amdgpu_vm_block_size = -1;
 int amdgpu_vm_block_size = -1;
+int amdgpu_vm_fault_stop = 0;
 int amdgpu_exp_hw_support = 0;
 int amdgpu_exp_hw_support = 0;
 int amdgpu_enable_scheduler = 1;
 int amdgpu_enable_scheduler = 1;
 int amdgpu_sched_jobs = 16;
 int amdgpu_sched_jobs = 16;
@@ -141,6 +142,9 @@ module_param_named(vm_size, amdgpu_vm_size, int, 0444);
 MODULE_PARM_DESC(vm_block_size, "VM page table size in bits (default depending on vm_size)");
 MODULE_PARM_DESC(vm_block_size, "VM page table size in bits (default depending on vm_size)");
 module_param_named(vm_block_size, amdgpu_vm_block_size, int, 0444);
 module_param_named(vm_block_size, amdgpu_vm_block_size, int, 0444);
 
 
+MODULE_PARM_DESC(vm_fault_stop, "Stop on VM fault (0 = never (default), 1 = print first, 2 = always)");
+module_param_named(vm_fault_stop, amdgpu_vm_fault_stop, int, 0444);
+
 MODULE_PARM_DESC(exp_hw_support, "experimental hw support (1 = enable, 0 = disable (default))");
 MODULE_PARM_DESC(exp_hw_support, "experimental hw support (1 = enable, 0 = disable (default))");
 module_param_named(exp_hw_support, amdgpu_exp_hw_support, int, 0444);
 module_param_named(exp_hw_support, amdgpu_exp_hw_support, int, 0444);
 
 

+ 34 - 6
drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c

@@ -435,6 +435,33 @@ static int gmc_v7_0_gart_set_pte_pde(struct amdgpu_device *adev,
 	return 0;
 	return 0;
 }
 }
 
 
+/**
+ * gmc_v8_0_set_fault_enable_default - update VM fault handling
+ *
+ * @adev: amdgpu_device pointer
+ * @value: true redirects VM faults to the default page
+ */
+static void gmc_v7_0_set_fault_enable_default(struct amdgpu_device *adev,
+					      bool value)
+{
+	u32 tmp;
+
+	tmp = RREG32(mmVM_CONTEXT1_CNTL);
+	tmp = REG_SET_FIELD(tmp, VM_CONTEXT1_CNTL,
+			    RANGE_PROTECTION_FAULT_ENABLE_DEFAULT, value);
+	tmp = REG_SET_FIELD(tmp, VM_CONTEXT1_CNTL,
+			    DUMMY_PAGE_PROTECTION_FAULT_ENABLE_DEFAULT, value);
+	tmp = REG_SET_FIELD(tmp, VM_CONTEXT1_CNTL,
+			    PDE0_PROTECTION_FAULT_ENABLE_DEFAULT, value);
+	tmp = REG_SET_FIELD(tmp, VM_CONTEXT1_CNTL,
+			    VALID_PROTECTION_FAULT_ENABLE_DEFAULT, value);
+	tmp = REG_SET_FIELD(tmp, VM_CONTEXT1_CNTL,
+			    READ_PROTECTION_FAULT_ENABLE_DEFAULT, value);
+	tmp = REG_SET_FIELD(tmp, VM_CONTEXT1_CNTL,
+			    WRITE_PROTECTION_FAULT_ENABLE_DEFAULT, value);
+	WREG32(mmVM_CONTEXT1_CNTL, tmp);
+}
+
 /**
 /**
  * gmc_v7_0_gart_enable - gart enable
  * gmc_v7_0_gart_enable - gart enable
  *
  *
@@ -523,15 +550,13 @@ static int gmc_v7_0_gart_enable(struct amdgpu_device *adev)
 	tmp = RREG32(mmVM_CONTEXT1_CNTL);
 	tmp = RREG32(mmVM_CONTEXT1_CNTL);
 	tmp = REG_SET_FIELD(tmp, VM_CONTEXT1_CNTL, ENABLE_CONTEXT, 1);
 	tmp = REG_SET_FIELD(tmp, VM_CONTEXT1_CNTL, ENABLE_CONTEXT, 1);
 	tmp = REG_SET_FIELD(tmp, VM_CONTEXT1_CNTL, PAGE_TABLE_DEPTH, 1);
 	tmp = REG_SET_FIELD(tmp, VM_CONTEXT1_CNTL, PAGE_TABLE_DEPTH, 1);
-	tmp = REG_SET_FIELD(tmp, VM_CONTEXT1_CNTL, RANGE_PROTECTION_FAULT_ENABLE_DEFAULT, 1);
-	tmp = REG_SET_FIELD(tmp, VM_CONTEXT1_CNTL, DUMMY_PAGE_PROTECTION_FAULT_ENABLE_DEFAULT, 1);
-	tmp = REG_SET_FIELD(tmp, VM_CONTEXT1_CNTL, PDE0_PROTECTION_FAULT_ENABLE_DEFAULT, 1);
-	tmp = REG_SET_FIELD(tmp, VM_CONTEXT1_CNTL, VALID_PROTECTION_FAULT_ENABLE_DEFAULT, 1);
-	tmp = REG_SET_FIELD(tmp, VM_CONTEXT1_CNTL, READ_PROTECTION_FAULT_ENABLE_DEFAULT, 1);
-	tmp = REG_SET_FIELD(tmp, VM_CONTEXT1_CNTL, WRITE_PROTECTION_FAULT_ENABLE_DEFAULT, 1);
 	tmp = REG_SET_FIELD(tmp, VM_CONTEXT1_CNTL, PAGE_TABLE_BLOCK_SIZE,
 	tmp = REG_SET_FIELD(tmp, VM_CONTEXT1_CNTL, PAGE_TABLE_BLOCK_SIZE,
 			    amdgpu_vm_block_size - 9);
 			    amdgpu_vm_block_size - 9);
 	WREG32(mmVM_CONTEXT1_CNTL, tmp);
 	WREG32(mmVM_CONTEXT1_CNTL, tmp);
+	if (amdgpu_vm_fault_stop == AMDGPU_VM_FAULT_STOP_ALWAYS)
+		gmc_v7_0_set_fault_enable_default(adev, false);
+	else
+		gmc_v7_0_set_fault_enable_default(adev, true);
 
 
 	if (adev->asic_type == CHIP_KAVERI) {
 	if (adev->asic_type == CHIP_KAVERI) {
 		tmp = RREG32(mmCHUB_CONTROL);
 		tmp = RREG32(mmCHUB_CONTROL);
@@ -1268,6 +1293,9 @@ static int gmc_v7_0_process_interrupt(struct amdgpu_device *adev,
 	if (!addr && !status)
 	if (!addr && !status)
 		return 0;
 		return 0;
 
 
+	if (amdgpu_vm_fault_stop == AMDGPU_VM_FAULT_STOP_FIRST)
+		gmc_v7_0_set_fault_enable_default(adev, false);
+
 	dev_err(adev->dev, "GPU fault detected: %d 0x%08x\n",
 	dev_err(adev->dev, "GPU fault detected: %d 0x%08x\n",
 		entry->src_id, entry->src_data);
 		entry->src_id, entry->src_data);
 	dev_err(adev->dev, "  VM_CONTEXT1_PROTECTION_FAULT_ADDR   0x%08X\n",
 	dev_err(adev->dev, "  VM_CONTEXT1_PROTECTION_FAULT_ADDR   0x%08X\n",

+ 36 - 0
drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c

@@ -549,6 +549,35 @@ static int gmc_v8_0_gart_set_pte_pde(struct amdgpu_device *adev,
 	return 0;
 	return 0;
 }
 }
 
 
+/**
+ * gmc_v8_0_set_fault_enable_default - update VM fault handling
+ *
+ * @adev: amdgpu_device pointer
+ * @value: true redirects VM faults to the default page
+ */
+static void gmc_v8_0_set_fault_enable_default(struct amdgpu_device *adev,
+					      bool value)
+{
+	u32 tmp;
+
+	tmp = RREG32(mmVM_CONTEXT1_CNTL);
+	tmp = REG_SET_FIELD(tmp, VM_CONTEXT1_CNTL,
+			    RANGE_PROTECTION_FAULT_ENABLE_DEFAULT, value);
+	tmp = REG_SET_FIELD(tmp, VM_CONTEXT1_CNTL,
+			    DUMMY_PAGE_PROTECTION_FAULT_ENABLE_DEFAULT, value);
+	tmp = REG_SET_FIELD(tmp, VM_CONTEXT1_CNTL,
+			    PDE0_PROTECTION_FAULT_ENABLE_DEFAULT, value);
+	tmp = REG_SET_FIELD(tmp, VM_CONTEXT1_CNTL,
+			    VALID_PROTECTION_FAULT_ENABLE_DEFAULT, value);
+	tmp = REG_SET_FIELD(tmp, VM_CONTEXT1_CNTL,
+			    READ_PROTECTION_FAULT_ENABLE_DEFAULT, value);
+	tmp = REG_SET_FIELD(tmp, VM_CONTEXT1_CNTL,
+			    WRITE_PROTECTION_FAULT_ENABLE_DEFAULT, value);
+	tmp = REG_SET_FIELD(tmp, VM_CONTEXT1_CNTL,
+			    EXECUTE_PROTECTION_FAULT_ENABLE_DEFAULT, value);
+	WREG32(mmVM_CONTEXT1_CNTL, tmp);
+}
+
 /**
 /**
  * gmc_v8_0_gart_enable - gart enable
  * gmc_v8_0_gart_enable - gart enable
  *
  *
@@ -663,6 +692,10 @@ static int gmc_v8_0_gart_enable(struct amdgpu_device *adev)
 	tmp = REG_SET_FIELD(tmp, VM_CONTEXT1_CNTL, PAGE_TABLE_BLOCK_SIZE,
 	tmp = REG_SET_FIELD(tmp, VM_CONTEXT1_CNTL, PAGE_TABLE_BLOCK_SIZE,
 			    amdgpu_vm_block_size - 9);
 			    amdgpu_vm_block_size - 9);
 	WREG32(mmVM_CONTEXT1_CNTL, tmp);
 	WREG32(mmVM_CONTEXT1_CNTL, tmp);
+	if (amdgpu_vm_fault_stop == AMDGPU_VM_FAULT_STOP_ALWAYS)
+		gmc_v8_0_set_fault_enable_default(adev, false);
+	else
+		gmc_v8_0_set_fault_enable_default(adev, true);
 
 
 	gmc_v8_0_gart_flush_gpu_tlb(adev, 0);
 	gmc_v8_0_gart_flush_gpu_tlb(adev, 0);
 	DRM_INFO("PCIE GART of %uM enabled (table at 0x%016llX).\n",
 	DRM_INFO("PCIE GART of %uM enabled (table at 0x%016llX).\n",
@@ -1268,6 +1301,9 @@ static int gmc_v8_0_process_interrupt(struct amdgpu_device *adev,
 	if (!addr && !status)
 	if (!addr && !status)
 		return 0;
 		return 0;
 
 
+	if (amdgpu_vm_fault_stop == AMDGPU_VM_FAULT_STOP_FIRST)
+		gmc_v8_0_set_fault_enable_default(adev, false);
+
 	dev_err(adev->dev, "GPU fault detected: %d 0x%08x\n",
 	dev_err(adev->dev, "GPU fault detected: %d 0x%08x\n",
 		entry->src_id, entry->src_data);
 		entry->src_id, entry->src_data);
 	dev_err(adev->dev, "  VM_CONTEXT1_PROTECTION_FAULT_ADDR   0x%08X\n",
 	dev_err(adev->dev, "  VM_CONTEXT1_PROTECTION_FAULT_ADDR   0x%08X\n",