
Merge tag 'drm-amdkfd-next-2017-12-24' of git://people.freedesktop.org/~gabbayo/linux into drm-next

- Add CWSR (compute wave save restore) support for GFX8 (Carrizo)
- Fix SDMA user-mode queues support for GFX7 (Kaveri)
- Add SDMA user-mode queues support for GFX8 (Carrizo)
- Allow HWS (hardware scheduling) to schedule multiple processes concurrently
- Add debugfs support
- Simplify process locking and lock dependencies
- Refactor topology code to prepare for dGPU support, plus fixes to that code
  - Add option to generate a dummy/virtual CRAT table when it is missing or malformed
  - Recognize CPUs other than APUs as compute entities
- Various clean ups and bug fixes

I have not yet sent the dGPU topology code because it depends on a patch
for the PCI subsystem that adds PCIe atomics support. Once that patch is
upstreamed we can continue with the rest of the dGPU code.

* tag 'drm-amdkfd-next-2017-12-24' of git://people.freedesktop.org/~gabbayo/linux: (53 commits)
  drm/amdgpu: Add support for reporting VRAM usage
  drm/amdkfd: Ignore ACPI CRAT for non-APU systems
  drm/amdkfd: Module option to disable CRAT table
  drm/amdkfd: Add AQL Queue Memory flag on topology
  drm/amdkfd: Fixup incorrect info in the CZ CRAT table
  drm/amdkfd: Add perf counters to topology
  drm/amdkfd: Add topology support for dGPUs
  drm/amdkfd: Add topology support for CPUs
  drm/amdkfd: Fix sibling_map[] size
  drm/amdkfd: Simplify counting of memory banks
  drm/amdkfd: Turn verbose topology messages into pr_debug
  drm/amdkfd: sync IOLINK defines to thunk spec
  drm/amdkfd: Support enumerating non-GPU devices
  drm/amdkfd: Decouple CRAT parsing from device list update
  drm/amdkfd: Reorganize CRAT fetching from ACPI
  drm/amdkfd: Group up CRAT related functions
  drm/amdkfd: Fix memory leaks in kfd topology
  drm/amdkfd: Topology: Fix location_id
  drm/amdkfd: Update number of compute unit from KGD
  drm/amd: Remove get_vmem_size from KGD-KFD interface
  ...
Dave Airlie, 7 years ago
Commit a9742b794a
37 changed files with 4632 additions and 643 deletions
  1. +1    -0    drivers/gpu/drm/amd/amdgpu/amdgpu.h
  2. +60   -7    drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
  3. +4    -1    drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
  4. +98   -13   drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c
  5. +166  -18   drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c
  6. +1    -1    drivers/gpu/drm/amd/amdgpu/cikd.h
  7. +7    -0    drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
  8. +5    -0    drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
  9. +2    -0    drivers/gpu/drm/amd/amdgpu/vid.h
 10. +3    -1    drivers/gpu/drm/amd/amdkfd/Makefile
 11. +1384 -0    drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx8.asm
 12. +43   -3    drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
 13. +1267 -0    drivers/gpu/drm/amd/amdkfd/kfd_crat.c
 14. +34   -8    drivers/gpu/drm/amd/amdkfd/kfd_crat.h
 15. +5    -9    drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.c
 16. +75   -0    drivers/gpu/drm/amd/amdkfd/kfd_debugfs.c
 17. +29   -2    drivers/gpu/drm/amd/amdkfd/kfd_device.c
 18. +102  -10   drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
 19. +6    -2    drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
 20. +4    -5    drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c
 21. +7    -7    drivers/gpu/drm/amd/amdkfd/kfd_events.c
 22. +6    -1    drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c
 23. +1    -1    drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
 24. +17   -0    drivers/gpu/drm/amd/amdkfd/kfd_module.c
 25. +4    -0    drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h
 26. +36   -12   drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c
 27. +162  -5    drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c
 28. +55   -4    drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
 29. +1    -1    drivers/gpu/drm/amd/amdkfd/kfd_pasid.c
 30. +75   -4    drivers/gpu/drm/amd/amdkfd/kfd_priv.h
 31. +190  -69   drivers/gpu/drm/amd/amdkfd/kfd_process.c
 32. +71   -7    drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
 33. +620  -441  drivers/gpu/drm/amd/amdkfd/kfd_topology.c
 34. +27   -6    drivers/gpu/drm/amd/amdkfd/kfd_topology.h
 35. +49   -3    drivers/gpu/drm/amd/include/kgd_kfd_interface.h
 36. +2    -0    drivers/gpu/drm/amd/include/vi_structs.h
 37. +13   -2    include/uapi/linux/kfd_ioctl.h

+ 1 - 0
drivers/gpu/drm/amd/amdgpu/amdgpu.h

@@ -959,6 +959,7 @@ struct amdgpu_gfx_config {
 };
 
 struct amdgpu_cu_info {
+	uint32_t simd_per_cu;
 	uint32_t max_waves_per_simd;
 	uint32_t wave_front_size;
 	uint32_t max_scratch_slots_per_cu;
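
The new simd_per_cu field lets KFD derive per-device SIMD counts instead of hardcoding them. A minimal sketch of how a consumer could combine it with the active-CU count; kfd_cu_info_example and kfd_fill_simd_count are hypothetical names for illustration, not part of this series:

#include <linux/types.h>

struct kfd_cu_info_example {
	uint32_t cu_active_number;	/* active CUs reported by KGD */
	uint32_t simd_per_cu;		/* new field: SIMDs per CU */
};

static uint32_t kfd_fill_simd_count(const struct kfd_cu_info_example *cu)
{
	/* total SIMDs = active CUs * SIMDs per CU (4 on GFX7/GFX8) */
	return cu->cu_active_number * cu->simd_per_cu;
}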

+ 60 - 7
drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c

@@ -275,14 +275,34 @@ void free_gtt_mem(struct kgd_dev *kgd, void *mem_obj)
 	kfree(mem);
 }
 
-uint64_t get_vmem_size(struct kgd_dev *kgd)
+void get_local_mem_info(struct kgd_dev *kgd,
+			struct kfd_local_mem_info *mem_info)
 {
-	struct amdgpu_device *adev =
-		(struct amdgpu_device *)kgd;
+	struct amdgpu_device *adev = (struct amdgpu_device *)kgd;
+	uint64_t address_mask = adev->dev->dma_mask ? ~*adev->dev->dma_mask :
+					     ~((1ULL << 32) - 1);
+	resource_size_t aper_limit = adev->mc.aper_base + adev->mc.aper_size;
+
+	memset(mem_info, 0, sizeof(*mem_info));
+	if (!(adev->mc.aper_base & address_mask || aper_limit & address_mask)) {
+		mem_info->local_mem_size_public = adev->mc.visible_vram_size;
+		mem_info->local_mem_size_private = adev->mc.real_vram_size -
+				adev->mc.visible_vram_size;
+	} else {
+		mem_info->local_mem_size_public = 0;
+		mem_info->local_mem_size_private = adev->mc.real_vram_size;
+	}
+	mem_info->vram_width = adev->mc.vram_width;
 
-	BUG_ON(kgd == NULL);
+	pr_debug("Address base: 0x%llx limit 0x%llx public 0x%llx private 0x%llx\n",
+			adev->mc.aper_base, aper_limit,
+			mem_info->local_mem_size_public,
+			mem_info->local_mem_size_private);
 
-	return adev->mc.real_vram_size;
+	if (amdgpu_sriov_vf(adev))
+		mem_info->mem_clk_max = adev->clock.default_mclk / 100;
+	else
+		mem_info->mem_clk_max = amdgpu_dpm_get_mclk(adev, false) / 100;
 }
 
 uint64_t get_gpu_clock_counter(struct kgd_dev *kgd)
@@ -298,6 +318,39 @@ uint32_t get_max_engine_clock_in_mhz(struct kgd_dev *kgd)
 {
 	struct amdgpu_device *adev = (struct amdgpu_device *)kgd;
 
-	/* The sclk is in quantas of 10kHz */
-	return adev->pm.dpm.dyn_state.max_clock_voltage_on_ac.sclk / 100;
+	/* the sclk is in quantas of 10kHz */
+	if (amdgpu_sriov_vf(adev))
+		return adev->clock.default_sclk / 100;
+
+	return amdgpu_dpm_get_sclk(adev, false) / 100;
+}
+
+void get_cu_info(struct kgd_dev *kgd, struct kfd_cu_info *cu_info)
+{
+	struct amdgpu_device *adev = (struct amdgpu_device *)kgd;
+	struct amdgpu_cu_info acu_info = adev->gfx.cu_info;
+
+	memset(cu_info, 0, sizeof(*cu_info));
+	if (sizeof(cu_info->cu_bitmap) != sizeof(acu_info.bitmap))
+		return;
+
+	cu_info->cu_active_number = acu_info.number;
+	cu_info->cu_ao_mask = acu_info.ao_cu_mask;
+	memcpy(&cu_info->cu_bitmap[0], &acu_info.bitmap[0],
+	       sizeof(acu_info.bitmap));
+	cu_info->num_shader_engines = adev->gfx.config.max_shader_engines;
+	cu_info->num_shader_arrays_per_engine = adev->gfx.config.max_sh_per_se;
+	cu_info->num_cu_per_sh = adev->gfx.config.max_cu_per_sh;
+	cu_info->simd_per_cu = acu_info.simd_per_cu;
+	cu_info->max_waves_per_simd = acu_info.max_waves_per_simd;
+	cu_info->wave_front_size = acu_info.wave_front_size;
+	cu_info->max_scratch_slots_per_cu = acu_info.max_scratch_slots_per_cu;
+	cu_info->lds_size = acu_info.lds_size;
+}
+
+uint64_t amdgpu_amdkfd_get_vram_usage(struct kgd_dev *kgd)
+{
+	struct amdgpu_device *adev = (struct amdgpu_device *)kgd;
+
+	return amdgpu_vram_mgr_usage(&adev->mman.bdev.man[TTM_PL_VRAM]);
 }
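
The aperture check in get_local_mem_info() counts CPU-visible VRAM as "public" only when the whole aperture is addressable under the device's DMA mask; otherwise all VRAM is reported as private. A standalone sketch of that mask logic with assumed example values (40-bit DMA mask, 256 MiB aperture, 4 GiB VRAM), using plain C in place of the kernel types:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t dma_mask = (1ULL << 40) - 1;	/* assumed 40-bit DMA mask */
	uint64_t address_mask = ~dma_mask;	/* bits the device cannot address */
	uint64_t aper_base = 0xe0000000ULL;	/* example aperture base */
	uint64_t aper_size = 256ULL << 20;	/* example visible VRAM size */
	uint64_t aper_limit = aper_base + aper_size;
	uint64_t real_vram = 4096ULL << 20;	/* example total VRAM size */

	/* same condition as the diff: any aperture bit above the mask? */
	if (!((aper_base & address_mask) || (aper_limit & address_mask)))
		printf("public %llu MiB, private %llu MiB\n",
		       (unsigned long long)(aper_size >> 20),
		       (unsigned long long)((real_vram - aper_size) >> 20));
	else
		printf("public 0 MiB, private %llu MiB\n",
		       (unsigned long long)(real_vram >> 20));
	return 0;
}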

+ 4 - 1
drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h

@@ -56,10 +56,13 @@ int alloc_gtt_mem(struct kgd_dev *kgd, size_t size,
 			void **mem_obj, uint64_t *gpu_addr,
 			void **cpu_ptr);
 void free_gtt_mem(struct kgd_dev *kgd, void *mem_obj);
-uint64_t get_vmem_size(struct kgd_dev *kgd);
+void get_local_mem_info(struct kgd_dev *kgd,
+			struct kfd_local_mem_info *mem_info);
 uint64_t get_gpu_clock_counter(struct kgd_dev *kgd);
 
 uint32_t get_max_engine_clock_in_mhz(struct kgd_dev *kgd);
+void get_cu_info(struct kgd_dev *kgd, struct kfd_cu_info *cu_info);
+uint64_t amdgpu_amdkfd_get_vram_usage(struct kgd_dev *kgd);
 
 #define read_user_wptr(mmptr, wptr, dst)				\
 	({								\

+ 98 - 13
drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c

@@ -105,7 +105,14 @@ static int kgd_hqd_load(struct kgd_dev *kgd, void *mqd, uint32_t pipe_id,
 			uint32_t queue_id, uint32_t __user *wptr,
 			uint32_t wptr_shift, uint32_t wptr_mask,
 			struct mm_struct *mm);
-static int kgd_hqd_sdma_load(struct kgd_dev *kgd, void *mqd);
+static int kgd_hqd_dump(struct kgd_dev *kgd,
+			uint32_t pipe_id, uint32_t queue_id,
+			uint32_t (**dump)[2], uint32_t *n_regs);
+static int kgd_hqd_sdma_load(struct kgd_dev *kgd, void *mqd,
+			     uint32_t __user *wptr, struct mm_struct *mm);
+static int kgd_hqd_sdma_dump(struct kgd_dev *kgd,
+			     uint32_t engine_id, uint32_t queue_id,
+			     uint32_t (**dump)[2], uint32_t *n_regs);
 static bool kgd_hqd_is_occupied(struct kgd_dev *kgd, uint64_t queue_address,
 				uint32_t pipe_id, uint32_t queue_id);
 
@@ -166,7 +173,7 @@ static int get_tile_config(struct kgd_dev *kgd,
 static const struct kfd2kgd_calls kfd2kgd = {
 	.init_gtt_mem_allocation = alloc_gtt_mem,
 	.free_gtt_mem = free_gtt_mem,
-	.get_vmem_size = get_vmem_size,
+	.get_local_mem_info = get_local_mem_info,
 	.get_gpu_clock_counter = get_gpu_clock_counter,
 	.get_max_engine_clock_in_mhz = get_max_engine_clock_in_mhz,
 	.alloc_pasid = amdgpu_vm_alloc_pasid,
@@ -177,6 +184,8 @@ static const struct kfd2kgd_calls kfd2kgd = {
 	.init_interrupts = kgd_init_interrupts,
 	.hqd_load = kgd_hqd_load,
 	.hqd_sdma_load = kgd_hqd_sdma_load,
+	.hqd_dump = kgd_hqd_dump,
+	.hqd_sdma_dump = kgd_hqd_sdma_dump,
 	.hqd_is_occupied = kgd_hqd_is_occupied,
 	.hqd_sdma_is_occupied = kgd_hqd_sdma_is_occupied,
 	.hqd_destroy = kgd_hqd_destroy,
@@ -191,6 +200,8 @@ static const struct kfd2kgd_calls kfd2kgd = {
 	.get_fw_version = get_fw_version,
 	.set_scratch_backing_va = set_scratch_backing_va,
 	.get_tile_config = get_tile_config,
+	.get_cu_info = get_cu_info,
+	.get_vram_usage = amdgpu_amdkfd_get_vram_usage
 };
 
 struct kfd2kgd_calls *amdgpu_amdkfd_gfx_7_get_functions(void)
@@ -375,7 +386,44 @@ static int kgd_hqd_load(struct kgd_dev *kgd, void *mqd, uint32_t pipe_id,
 	return 0;
 }
 
-static int kgd_hqd_sdma_load(struct kgd_dev *kgd, void *mqd)
+static int kgd_hqd_dump(struct kgd_dev *kgd,
+			uint32_t pipe_id, uint32_t queue_id,
+			uint32_t (**dump)[2], uint32_t *n_regs)
+{
+	struct amdgpu_device *adev = get_amdgpu_device(kgd);
+	uint32_t i = 0, reg;
+#define HQD_N_REGS (35+4)
+#define DUMP_REG(addr) do {				\
+		if (WARN_ON_ONCE(i >= HQD_N_REGS))	\
+			break;				\
+		(*dump)[i][0] = (addr) << 2;		\
+		(*dump)[i++][1] = RREG32(addr);		\
+	} while (0)
+
+	*dump = kmalloc(HQD_N_REGS*2*sizeof(uint32_t), GFP_KERNEL);
+	if (*dump == NULL)
+		return -ENOMEM;
+
+	acquire_queue(kgd, pipe_id, queue_id);
+
+	DUMP_REG(mmCOMPUTE_STATIC_THREAD_MGMT_SE0);
+	DUMP_REG(mmCOMPUTE_STATIC_THREAD_MGMT_SE1);
+	DUMP_REG(mmCOMPUTE_STATIC_THREAD_MGMT_SE2);
+	DUMP_REG(mmCOMPUTE_STATIC_THREAD_MGMT_SE3);
+
+	for (reg = mmCP_MQD_BASE_ADDR; reg <= mmCP_MQD_CONTROL; reg++)
+		DUMP_REG(reg);
+
+	release_queue(kgd);
+
+	WARN_ON_ONCE(i != HQD_N_REGS);
+	*n_regs = i;
+
+	return 0;
+}
+
+static int kgd_hqd_sdma_load(struct kgd_dev *kgd, void *mqd,
+			     uint32_t __user *wptr, struct mm_struct *mm)
 {
 	struct amdgpu_device *adev = get_amdgpu_device(kgd);
 	struct cik_sdma_rlc_registers *m;
@@ -410,10 +458,17 @@ static int kgd_hqd_sdma_load(struct kgd_dev *kgd, void *mqd)
 		WREG32(mmSDMA0_GFX_CONTEXT_CNTL, data);
 	}
 
-	WREG32(sdma_base_addr + mmSDMA0_RLC0_DOORBELL,
-				m->sdma_rlc_doorbell);
-	WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR, 0);
-	WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_WPTR, 0);
+	data = REG_SET_FIELD(m->sdma_rlc_doorbell, SDMA0_RLC0_DOORBELL,
+			     ENABLE, 1);
+	WREG32(sdma_base_addr + mmSDMA0_RLC0_DOORBELL, data);
+	WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR, m->sdma_rlc_rb_rptr);
+
+	if (read_user_wptr(mm, wptr, data))
+		WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_WPTR, data);
+	else
+		WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_WPTR,
+		       m->sdma_rlc_rb_rptr);
+
 	WREG32(sdma_base_addr + mmSDMA0_RLC0_VIRTUAL_ADDR,
 				m->sdma_rlc_virtual_addr);
 	WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_BASE, m->sdma_rlc_rb_base);
@@ -423,8 +478,37 @@ static int kgd_hqd_sdma_load(struct kgd_dev *kgd, void *mqd)
 			m->sdma_rlc_rb_rptr_addr_lo);
 	WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR_ADDR_HI,
 			m->sdma_rlc_rb_rptr_addr_hi);
-	WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL,
-			m->sdma_rlc_rb_cntl);
+
+	data = REG_SET_FIELD(m->sdma_rlc_rb_cntl, SDMA0_RLC0_RB_CNTL,
+			     RB_ENABLE, 1);
+	WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL, data);
+
+	return 0;
+}
+
+static int kgd_hqd_sdma_dump(struct kgd_dev *kgd,
+			     uint32_t engine_id, uint32_t queue_id,
+			     uint32_t (**dump)[2], uint32_t *n_regs)
+{
+	struct amdgpu_device *adev = get_amdgpu_device(kgd);
+	uint32_t sdma_offset = engine_id * SDMA1_REGISTER_OFFSET +
+		queue_id * KFD_CIK_SDMA_QUEUE_OFFSET;
+	uint32_t i = 0, reg;
+#undef HQD_N_REGS
+#define HQD_N_REGS (19+4)
+
+	*dump = kmalloc(HQD_N_REGS*2*sizeof(uint32_t), GFP_KERNEL);
+	if (*dump == NULL)
+		return -ENOMEM;
+
+	for (reg = mmSDMA0_RLC0_RB_CNTL; reg <= mmSDMA0_RLC0_DOORBELL; reg++)
+		DUMP_REG(sdma_offset + reg);
+	for (reg = mmSDMA0_RLC0_VIRTUAL_ADDR; reg <= mmSDMA0_RLC0_WATERMARK;
+	     reg++)
+		DUMP_REG(sdma_offset + reg);
+
+	WARN_ON_ONCE(i != HQD_N_REGS);
+	*n_regs = i;
 
 	return 0;
 }
@@ -575,7 +659,7 @@ static int kgd_hqd_sdma_destroy(struct kgd_dev *kgd, void *mqd,
 	struct cik_sdma_rlc_registers *m;
 	uint32_t sdma_base_addr;
 	uint32_t temp;
-	int timeout = utimeout;
+	unsigned long end_jiffies = (utimeout * HZ / 1000) + jiffies;
 
 	m = get_sdma_mqd(mqd);
 	sdma_base_addr = get_sdma_base_addr(m);
@@ -588,10 +672,9 @@ static int kgd_hqd_sdma_destroy(struct kgd_dev *kgd, void *mqd,
 		temp = RREG32(sdma_base_addr + mmSDMA0_RLC0_CONTEXT_STATUS);
 		if (temp & SDMA0_STATUS_REG__RB_CMD_IDLE__SHIFT)
 			break;
-		if (timeout <= 0)
+		if (time_after(jiffies, end_jiffies))
 			return -ETIME;
-		msleep(20);
-		timeout -= 20;
+		usleep_range(500, 1000);
 	}
 
 	WREG32(sdma_base_addr + mmSDMA0_RLC0_DOORBELL, 0);
@@ -599,6 +682,8 @@ static int kgd_hqd_sdma_destroy(struct kgd_dev *kgd, void *mqd,
 		RREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL) |
 		SDMA0_RLC0_RB_CNTL__RB_ENABLE_MASK);
 
+	m->sdma_rlc_rb_rptr = RREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR);
+
 	return 0;
 }
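
The destroy path above replaces the decrement-a-counter-of-msleeps loop with a jiffies deadline. A sketch of that pattern in isolation, assuming a hypothetical read_status() callback standing in for the RREG32 of the SDMA context status register:

#include <linux/types.h>
#include <linux/jiffies.h>
#include <linux/delay.h>
#include <linux/errno.h>

static int poll_until_idle(unsigned int timeout_ms,
			   u32 (*read_status)(void), u32 idle_mask)
{
	/* compute the deadline once, up front */
	unsigned long end_jiffies = jiffies + msecs_to_jiffies(timeout_ms);

	while (true) {
		if (read_status() & idle_mask)
			return 0;
		if (time_after(jiffies, end_jiffies))
			return -ETIME;		/* deadline passed */
		usleep_range(500, 1000);	/* poll with short sleeps */
	}
}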
 

+ 166 - 18
drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c

@@ -45,7 +45,7 @@ enum hqd_dequeue_request_type {
 	RESET_WAVES
 };
 
-struct cik_sdma_rlc_registers;
+struct vi_sdma_mqd;
 
 /*
  * Register access functions
@@ -64,7 +64,14 @@ static int kgd_hqd_load(struct kgd_dev *kgd, void *mqd, uint32_t pipe_id,
 			uint32_t queue_id, uint32_t __user *wptr,
 			uint32_t wptr_shift, uint32_t wptr_mask,
 			struct mm_struct *mm);
-static int kgd_hqd_sdma_load(struct kgd_dev *kgd, void *mqd);
+static int kgd_hqd_dump(struct kgd_dev *kgd,
+			uint32_t pipe_id, uint32_t queue_id,
+			uint32_t (**dump)[2], uint32_t *n_regs);
+static int kgd_hqd_sdma_load(struct kgd_dev *kgd, void *mqd,
+			     uint32_t __user *wptr, struct mm_struct *mm);
+static int kgd_hqd_sdma_dump(struct kgd_dev *kgd,
+			     uint32_t engine_id, uint32_t queue_id,
+			     uint32_t (**dump)[2], uint32_t *n_regs);
 static bool kgd_hqd_is_occupied(struct kgd_dev *kgd, uint64_t queue_address,
 		uint32_t pipe_id, uint32_t queue_id);
 static bool kgd_hqd_sdma_is_occupied(struct kgd_dev *kgd, void *mqd);
@@ -125,7 +132,7 @@ static int get_tile_config(struct kgd_dev *kgd,
 static const struct kfd2kgd_calls kfd2kgd = {
 	.init_gtt_mem_allocation = alloc_gtt_mem,
 	.free_gtt_mem = free_gtt_mem,
-	.get_vmem_size = get_vmem_size,
+	.get_local_mem_info = get_local_mem_info,
 	.get_gpu_clock_counter = get_gpu_clock_counter,
 	.get_max_engine_clock_in_mhz = get_max_engine_clock_in_mhz,
 	.alloc_pasid = amdgpu_vm_alloc_pasid,
@@ -136,6 +143,8 @@ static const struct kfd2kgd_calls kfd2kgd = {
 	.init_interrupts = kgd_init_interrupts,
 	.hqd_load = kgd_hqd_load,
 	.hqd_sdma_load = kgd_hqd_sdma_load,
+	.hqd_dump = kgd_hqd_dump,
+	.hqd_sdma_dump = kgd_hqd_sdma_dump,
 	.hqd_is_occupied = kgd_hqd_is_occupied,
 	.hqd_sdma_is_occupied = kgd_hqd_sdma_is_occupied,
 	.hqd_destroy = kgd_hqd_destroy,
@@ -152,6 +161,8 @@ static const struct kfd2kgd_calls kfd2kgd = {
 	.get_fw_version = get_fw_version,
 	.set_scratch_backing_va = set_scratch_backing_va,
 	.get_tile_config = get_tile_config,
+	.get_cu_info = get_cu_info,
+	.get_vram_usage = amdgpu_amdkfd_get_vram_usage
 };
 
 struct kfd2kgd_calls *amdgpu_amdkfd_gfx_8_0_get_functions(void)
@@ -268,9 +279,15 @@ static int kgd_init_interrupts(struct kgd_dev *kgd, uint32_t pipe_id)
 	return 0;
 }
 
-static inline uint32_t get_sdma_base_addr(struct cik_sdma_rlc_registers *m)
+static inline uint32_t get_sdma_base_addr(struct vi_sdma_mqd *m)
 {
-	return 0;
+	uint32_t retval;
+
+	retval = m->sdma_engine_id * SDMA1_REGISTER_OFFSET +
+		m->sdma_queue_id * KFD_VI_SDMA_QUEUE_OFFSET;
+	pr_debug("kfd: sdma base address: 0x%x\n", retval);
+
+	return retval;
 }
 
 static inline struct vi_mqd *get_mqd(void *mqd)
@@ -278,9 +295,9 @@ static inline struct vi_mqd *get_mqd(void *mqd)
 	return (struct vi_mqd *)mqd;
 }
 
-static inline struct cik_sdma_rlc_registers *get_sdma_mqd(void *mqd)
+static inline struct vi_sdma_mqd *get_sdma_mqd(void *mqd)
 {
-	return (struct cik_sdma_rlc_registers *)mqd;
+	return (struct vi_sdma_mqd *)mqd;
 }
 
 static int kgd_hqd_load(struct kgd_dev *kgd, void *mqd, uint32_t pipe_id,
@@ -358,8 +375,138 @@ static int kgd_hqd_load(struct kgd_dev *kgd, void *mqd, uint32_t pipe_id,
 	return 0;
 }
 
-static int kgd_hqd_sdma_load(struct kgd_dev *kgd, void *mqd)
+static int kgd_hqd_dump(struct kgd_dev *kgd,
+			uint32_t pipe_id, uint32_t queue_id,
+			uint32_t (**dump)[2], uint32_t *n_regs)
 {
+	struct amdgpu_device *adev = get_amdgpu_device(kgd);
+	uint32_t i = 0, reg;
+#define HQD_N_REGS (54+4)
+#define DUMP_REG(addr) do {				\
+		if (WARN_ON_ONCE(i >= HQD_N_REGS))	\
+			break;				\
+		(*dump)[i][0] = (addr) << 2;		\
+		(*dump)[i++][1] = RREG32(addr);		\
+	} while (0)
+
+	*dump = kmalloc(HQD_N_REGS*2*sizeof(uint32_t), GFP_KERNEL);
+	if (*dump == NULL)
+		return -ENOMEM;
+
+	acquire_queue(kgd, pipe_id, queue_id);
+
+	DUMP_REG(mmCOMPUTE_STATIC_THREAD_MGMT_SE0);
+	DUMP_REG(mmCOMPUTE_STATIC_THREAD_MGMT_SE1);
+	DUMP_REG(mmCOMPUTE_STATIC_THREAD_MGMT_SE2);
+	DUMP_REG(mmCOMPUTE_STATIC_THREAD_MGMT_SE3);
+
+	for (reg = mmCP_MQD_BASE_ADDR; reg <= mmCP_HQD_EOP_DONES; reg++)
+		DUMP_REG(reg);
+
+	release_queue(kgd);
+
+	WARN_ON_ONCE(i != HQD_N_REGS);
+	*n_regs = i;
+
+	return 0;
+}
+
+static int kgd_hqd_sdma_load(struct kgd_dev *kgd, void *mqd,
+			     uint32_t __user *wptr, struct mm_struct *mm)
+{
+	struct amdgpu_device *adev = get_amdgpu_device(kgd);
+	struct vi_sdma_mqd *m;
+	unsigned long end_jiffies;
+	uint32_t sdma_base_addr;
+	uint32_t data;
+
+	m = get_sdma_mqd(mqd);
+	sdma_base_addr = get_sdma_base_addr(m);
+	WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL,
+		m->sdmax_rlcx_rb_cntl & (~SDMA0_RLC0_RB_CNTL__RB_ENABLE_MASK));
+
+	end_jiffies = msecs_to_jiffies(2000) + jiffies;
+	while (true) {
+		data = RREG32(sdma_base_addr + mmSDMA0_RLC0_CONTEXT_STATUS);
+		if (data & SDMA0_RLC0_CONTEXT_STATUS__IDLE_MASK)
+			break;
+		if (time_after(jiffies, end_jiffies))
+			return -ETIME;
+		usleep_range(500, 1000);
+	}
+	if (m->sdma_engine_id) {
+		data = RREG32(mmSDMA1_GFX_CONTEXT_CNTL);
+		data = REG_SET_FIELD(data, SDMA1_GFX_CONTEXT_CNTL,
+				RESUME_CTX, 0);
+		WREG32(mmSDMA1_GFX_CONTEXT_CNTL, data);
+	} else {
+		data = RREG32(mmSDMA0_GFX_CONTEXT_CNTL);
+		data = REG_SET_FIELD(data, SDMA0_GFX_CONTEXT_CNTL,
+				RESUME_CTX, 0);
+		WREG32(mmSDMA0_GFX_CONTEXT_CNTL, data);
+	}
+
+	data = REG_SET_FIELD(m->sdmax_rlcx_doorbell, SDMA0_RLC0_DOORBELL,
+			     ENABLE, 1);
+	WREG32(sdma_base_addr + mmSDMA0_RLC0_DOORBELL, data);
+	WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR, m->sdmax_rlcx_rb_rptr);
+
+	if (read_user_wptr(mm, wptr, data))
+		WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_WPTR, data);
+	else
+		WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_WPTR,
+		       m->sdmax_rlcx_rb_rptr);
+
+	WREG32(sdma_base_addr + mmSDMA0_RLC0_VIRTUAL_ADDR,
+				m->sdmax_rlcx_virtual_addr);
+	WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_BASE, m->sdmax_rlcx_rb_base);
+	WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_BASE_HI,
+			m->sdmax_rlcx_rb_base_hi);
+	WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR_ADDR_LO,
+			m->sdmax_rlcx_rb_rptr_addr_lo);
+	WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR_ADDR_HI,
+			m->sdmax_rlcx_rb_rptr_addr_hi);
+
+	data = REG_SET_FIELD(m->sdmax_rlcx_rb_cntl, SDMA0_RLC0_RB_CNTL,
+			     RB_ENABLE, 1);
+	WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL, data);
+
+	return 0;
+}
+
+static int kgd_hqd_sdma_dump(struct kgd_dev *kgd,
+			     uint32_t engine_id, uint32_t queue_id,
+			     uint32_t (**dump)[2], uint32_t *n_regs)
+{
+	struct amdgpu_device *adev = get_amdgpu_device(kgd);
+	uint32_t sdma_offset = engine_id * SDMA1_REGISTER_OFFSET +
+		queue_id * KFD_VI_SDMA_QUEUE_OFFSET;
+	uint32_t i = 0, reg;
+#undef HQD_N_REGS
+#define HQD_N_REGS (19+4+2+3+7)
+
+	*dump = kmalloc(HQD_N_REGS*2*sizeof(uint32_t), GFP_KERNEL);
+	if (*dump == NULL)
+		return -ENOMEM;
+
+	for (reg = mmSDMA0_RLC0_RB_CNTL; reg <= mmSDMA0_RLC0_DOORBELL; reg++)
+		DUMP_REG(sdma_offset + reg);
+	for (reg = mmSDMA0_RLC0_VIRTUAL_ADDR; reg <= mmSDMA0_RLC0_WATERMARK;
+	     reg++)
+		DUMP_REG(sdma_offset + reg);
+	for (reg = mmSDMA0_RLC0_CSA_ADDR_LO; reg <= mmSDMA0_RLC0_CSA_ADDR_HI;
+	     reg++)
+		DUMP_REG(sdma_offset + reg);
+	for (reg = mmSDMA0_RLC0_IB_SUB_REMAIN; reg <= mmSDMA0_RLC0_DUMMY_REG;
+	     reg++)
+		DUMP_REG(sdma_offset + reg);
+	for (reg = mmSDMA0_RLC0_MIDCMD_DATA0; reg <= mmSDMA0_RLC0_MIDCMD_CNTL;
+	     reg++)
+		DUMP_REG(sdma_offset + reg);
+
+	WARN_ON_ONCE(i != HQD_N_REGS);
+	*n_regs = i;
+
 	return 0;
 }
 
@@ -388,7 +535,7 @@ static bool kgd_hqd_is_occupied(struct kgd_dev *kgd, uint64_t queue_address,
 static bool kgd_hqd_sdma_is_occupied(struct kgd_dev *kgd, void *mqd)
 {
 	struct amdgpu_device *adev = get_amdgpu_device(kgd);
-	struct cik_sdma_rlc_registers *m;
+	struct vi_sdma_mqd *m;
 	uint32_t sdma_base_addr;
 	uint32_t sdma_rlc_rb_cntl;
 
@@ -509,10 +656,10 @@ static int kgd_hqd_sdma_destroy(struct kgd_dev *kgd, void *mqd,
 				unsigned int utimeout)
 {
 	struct amdgpu_device *adev = get_amdgpu_device(kgd);
-	struct cik_sdma_rlc_registers *m;
+	struct vi_sdma_mqd *m;
 	uint32_t sdma_base_addr;
 	uint32_t temp;
-	int timeout = utimeout;
+	unsigned long end_jiffies = (utimeout * HZ / 1000) + jiffies;
 
 	m = get_sdma_mqd(mqd);
 	sdma_base_addr = get_sdma_base_addr(m);
@@ -523,18 +670,19 @@ static int kgd_hqd_sdma_destroy(struct kgd_dev *kgd, void *mqd,
 
 	while (true) {
 		temp = RREG32(sdma_base_addr + mmSDMA0_RLC0_CONTEXT_STATUS);
-		if (temp & SDMA0_STATUS_REG__RB_CMD_IDLE__SHIFT)
+		if (temp & SDMA0_RLC0_CONTEXT_STATUS__IDLE_MASK)
 			break;
-		if (timeout <= 0)
+		if (time_after(jiffies, end_jiffies))
 			return -ETIME;
-		msleep(20);
-		timeout -= 20;
+		usleep_range(500, 1000);
 	}
 
 	WREG32(sdma_base_addr + mmSDMA0_RLC0_DOORBELL, 0);
-	WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR, 0);
-	WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_WPTR, 0);
-	WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_BASE, 0);
+	WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL,
+		RREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL) |
+		SDMA0_RLC0_RB_CNTL__RB_ENABLE_MASK);
+
+	m->sdmax_rlcx_rb_rptr = RREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR);
 
 	return 0;
 }
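
The hqd_dump()/hqd_sdma_dump() callbacks hand back a kmalloc'ed array of (byte address, value) pairs; note the DUMP_REG macro shifts the dword register offset left by 2 to form a byte address. A sketch of how a caller such as the new debugfs code might print and free that buffer; the exact formatting in this series may differ, and show_hqd_regs is a hypothetical name:

#include <linux/seq_file.h>
#include <linux/slab.h>

static int show_hqd_regs(struct seq_file *m, uint32_t (*dump)[2],
			 uint32_t n_regs)
{
	uint32_t i;

	for (i = 0; i < n_regs; i++)	/* dump[i][0] is the byte address */
		seq_printf(m, " %08x: %08x\n", dump[i][0], dump[i][1]);
	kfree(dump);	/* buffer was kmalloc'ed by the dump callback */
	return 0;
}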

+ 1 - 1
drivers/gpu/drm/amd/amdgpu/cikd.h

@@ -562,7 +562,7 @@
 #define	PRIVATE_BASE(x)	((x) << 0) /* scratch */
 #define	SHARED_BASE(x)	((x) << 16) /* LDS */
 
-#define KFD_CIK_SDMA_QUEUE_OFFSET	0x200
+#define KFD_CIK_SDMA_QUEUE_OFFSET (mmSDMA0_RLC1_RB_CNTL - mmSDMA0_RLC0_RB_CNTL)
 
 /* valid for both DEFAULT_MTYPE and APE1_MTYPE */
 enum {

+ 7 - 0
drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c

@@ -48,6 +48,8 @@
 #include "oss/oss_2_0_d.h"
 #include "oss/oss_2_0_sh_mask.h"
 
+#define NUM_SIMD_PER_CU 0x4 /* missing from the gfx_7 IP headers */
+
 #define GFX7_NUM_GFX_RINGS     1
 #define GFX7_MEC_HPD_SIZE      2048
 
@@ -5277,6 +5279,11 @@ static void gfx_v7_0_get_cu_info(struct amdgpu_device *adev)
 
 	cu_info->number = active_cu_number;
 	cu_info->ao_cu_mask = ao_cu_mask;
+	cu_info->simd_per_cu = NUM_SIMD_PER_CU;
+	cu_info->max_waves_per_simd = 10;
+	cu_info->max_scratch_slots_per_cu = 32;
+	cu_info->wave_front_size = 64;
+	cu_info->lds_size = 64;
 }
 
 const struct amdgpu_ip_block_version gfx_v7_0_ip_block =

+ 5 - 0
drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c

@@ -7116,6 +7116,11 @@ static void gfx_v8_0_get_cu_info(struct amdgpu_device *adev)
 
 	cu_info->number = active_cu_number;
 	cu_info->ao_cu_mask = ao_cu_mask;
+	cu_info->simd_per_cu = NUM_SIMD_PER_CU;
+	cu_info->max_waves_per_simd = 10;
+	cu_info->max_scratch_slots_per_cu = 32;
+	cu_info->wave_front_size = 64;
+	cu_info->lds_size = 64;
 }
 
 const struct amdgpu_ip_block_version gfx_v8_0_ip_block =

+ 2 - 0
drivers/gpu/drm/amd/amdgpu/vid.h

@@ -27,6 +27,8 @@
 #define SDMA1_REGISTER_OFFSET                             0x200 /* not a register */
 #define SDMA_MAX_INSTANCE 2
 
+#define KFD_VI_SDMA_QUEUE_OFFSET                      0x80 /* not a register */
+
 /* crtc instance offsets */
 #define CRTC0_REGISTER_OFFSET                 (0x1b9c - 0x1b9c)
 #define CRTC1_REGISTER_OFFSET                 (0x1d9c - 0x1b9c)
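
These constants feed the per-queue MMIO offset arithmetic used throughout the GFX8 SDMA code in this series: each SDMA engine is a register block SDMA1_REGISTER_OFFSET apart, and each RLC queue within an engine sits KFD_VI_SDMA_QUEUE_OFFSET further in. A small sketch mirroring the values in these headers (vi_sdma_queue_offset is an illustrative helper name):

#define SDMA1_REGISTER_OFFSET		0x200
#define KFD_VI_SDMA_QUEUE_OFFSET	0x80

static inline unsigned int vi_sdma_queue_offset(unsigned int engine_id,
						unsigned int queue_id)
{
	/* matches the computation in kgd_hqd_sdma_dump() above */
	return engine_id * SDMA1_REGISTER_OFFSET +
	       queue_id * KFD_VI_SDMA_QUEUE_OFFSET;
}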

+ 3 - 1
drivers/gpu/drm/amd/amdkfd/Makefile

@@ -35,6 +35,8 @@ amdkfd-y	:= kfd_module.o kfd_device.o kfd_chardev.o kfd_topology.o \
 		kfd_process_queue_manager.o kfd_device_queue_manager.o \
 		kfd_device_queue_manager_cik.o kfd_device_queue_manager_vi.o \
 		kfd_interrupt.o kfd_events.o cik_event_interrupt.o \
-		kfd_dbgdev.o kfd_dbgmgr.o
+		kfd_dbgdev.o kfd_dbgmgr.o kfd_crat.o
+
+amdkfd-$(CONFIG_DEBUG_FS) += kfd_debugfs.o
 
 obj-$(CONFIG_HSA_AMD)	+= amdkfd.o

+ 1384 - 0
drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx8.asm

@@ -0,0 +1,1384 @@
+/*
+ * Copyright 2015-2017 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#if 0
+HW (VI) source code for CWSR trap handler
+#Version 18 + multiple trap handler
+
+// this performance-optimal version was originally from Seven Xu at SRDC
+
+// Revision #18   --...
+/* Rev History
+** #1. Branch from gc dv.   //gfxip/gfx8/main/src/test/suites/block/cs/sr/cs_trap_handler.sp3#1,#50, #51, #52-53(Skip, Already Fixed by PV), #54-56(merged),#57-58(merged, skipped-already fixed by PV)
+** #4. SR Memory Layout:
+**             1. VGPR-SGPR-HWREG-{LDS}
+**             2. tba_hi.bits.26 - reconfigured as the first wave in tg bits, for defer Save LDS for a threadgroup.. performance concern..
+** #5. Update: 1. Accurate g8sr_ts_save_d timestamp
+** #6. Update: 1. Fix s_barrier usage; 2. VGPR s/r using swizzle buffer?(NoNeed, already matched the swizzle pattern, more investigation)
+** #7. Update: 1. don't barrier if noLDS
+** #8. Branch: 1. Branch to ver#0, which is very similar to gc dv version
+**             2. Fix SQ issue by s_sleep 2
+** #9. Update: 1. Fix scc restore failed issue, restore wave_status at last
+**             2. optimize s_buffer save by burst 16sgprs...
+** #10. Update 1. Optimize restore sgpr by burst 16 sgprs.
+** #11. Update 1. Add 2 more timestamp for debug version
+** #12. Update 1. Add VGPR SR using DWx4, some case improve and some case drop performance
+** #13. Integ  1. Always use MUBUF for PV trap shader...
+** #14. Update 1. s_buffer_store soft clause...
+** #15. Update 1. PERF - sclar write with glc:0/mtype0 to allow L2 combine. perf improvement a lot.
+** #16. Update 1. PERF - UNROLL LDS_DMA got 2500cycle save in IP tree
+** #17. Update 1. FUNC - LDS_DMA has issues while ATC, replace with ds_read/buffer_store for save part[TODO restore part]
+**             2. PERF - Save LDS before save VGPR to cover LDS save long latency...
+** #18. Update 1. FUNC - Implicitly restore STATUS.VCCZ, which is not writable by s_setreg_b32
+**             2. FUNC - Handle non-CWSR traps
+*/
+
+var G8SR_WDMEM_HWREG_OFFSET = 0
+var G8SR_WDMEM_SGPR_OFFSET  = 128  // in bytes
+
+// Keep definitions the same as the app shader; these 2 timestamps are part of the app shader... Should be before any save and after restore.
+
+var G8SR_DEBUG_TIMESTAMP = 0
+var G8SR_DEBUG_TS_SAVE_D_OFFSET = 40*4  // ts_save_d timestamp offset relative to SGPR_SR_memory_offset
+var s_g8sr_ts_save_s    = s[34:35]   // save start
+var s_g8sr_ts_sq_save_msg  = s[36:37]   // The save shader send SAVEWAVE msg to spi
+var s_g8sr_ts_spi_wrexec   = s[38:39]   // the SPI write the sr address to SQ
+var s_g8sr_ts_save_d    = s[40:41]   // save end
+var s_g8sr_ts_restore_s = s[42:43]   // restore start
+var s_g8sr_ts_restore_d = s[44:45]   // restore end
+
+var G8SR_VGPR_SR_IN_DWX4 = 0
+var G8SR_SAVE_BUF_RSRC_WORD1_STRIDE_DWx4 = 0x00100000    // DWx4 stride is 4*4Bytes
+var G8SR_RESTORE_BUF_RSRC_WORD1_STRIDE_DWx4  = G8SR_SAVE_BUF_RSRC_WORD1_STRIDE_DWx4
+
+
+/*************************************************************************/
+/*                  control on how to run the shader                     */
+/*************************************************************************/
+//any hack that needs to be made to run this code in EMU (either because various EMU code are not ready or no compute save & restore in EMU run)
+var EMU_RUN_HACK                    =   0
+var EMU_RUN_HACK_RESTORE_NORMAL     =   0
+var EMU_RUN_HACK_SAVE_NORMAL_EXIT   =   0
+var EMU_RUN_HACK_SAVE_SINGLE_WAVE   =   0
+var EMU_RUN_HACK_SAVE_FIRST_TIME    =   0                   //for interrupted restore in which the first save is through EMU_RUN_HACK
+var EMU_RUN_HACK_SAVE_FIRST_TIME_TBA_LO =   0                   //for interrupted restore in which the first save is through EMU_RUN_HACK
+var EMU_RUN_HACK_SAVE_FIRST_TIME_TBA_HI =   0                   //for interrupted restore in which the first save is through EMU_RUN_HACK
+var SAVE_LDS                        =   1
+var WG_BASE_ADDR_LO                 =   0x9000a000
+var WG_BASE_ADDR_HI                 =   0x0
+var WAVE_SPACE                      =   0x5000              //memory size that each wave occupies in workgroup state mem
+var CTX_SAVE_CONTROL                =   0x0
+var CTX_RESTORE_CONTROL             =   CTX_SAVE_CONTROL
+var SIM_RUN_HACK                    =   0                   //any hack that needs to be made to run this code in SIM (either because various RTL code are not ready or no compute save & restore in RTL run)
+var SGPR_SAVE_USE_SQC               =   1                   //use SQC D$ to do the write
+var USE_MTBUF_INSTEAD_OF_MUBUF      =   0                   //because TC EMU currently asserts on 0 of // overload DFMT field to carry 4 more bits of stride for MUBUF opcodes
+var SWIZZLE_EN                      =   0                   //whether we use swizzled buffer addressing
+
+/**************************************************************************/
+/*                      variables                                         */
+/**************************************************************************/
+var SQ_WAVE_STATUS_INST_ATC_SHIFT  = 23
+var SQ_WAVE_STATUS_INST_ATC_MASK   = 0x00800000
+var SQ_WAVE_STATUS_SPI_PRIO_MASK   = 0x00000006
+
+var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT    = 12
+var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE     = 9
+var SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT   = 8
+var SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE    = 6
+var SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT   = 24
+var SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE    = 3                     //FIXME  sq.blk still has 4 bits at this time while SQ programming guide has 3 bits
+
+var SQ_WAVE_TRAPSTS_SAVECTX_MASK    =   0x400
+var SQ_WAVE_TRAPSTS_EXCE_MASK       =   0x1FF                   // Exception mask
+var SQ_WAVE_TRAPSTS_SAVECTX_SHIFT   =   10
+var SQ_WAVE_TRAPSTS_MEM_VIOL_MASK   =   0x100
+var SQ_WAVE_TRAPSTS_MEM_VIOL_SHIFT  =   8
+var SQ_WAVE_TRAPSTS_PRE_SAVECTX_MASK    =   0x3FF
+var SQ_WAVE_TRAPSTS_PRE_SAVECTX_SHIFT   =   0x0
+var SQ_WAVE_TRAPSTS_PRE_SAVECTX_SIZE    =   10
+var SQ_WAVE_TRAPSTS_POST_SAVECTX_MASK   =   0xFFFFF800
+var SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT  =   11
+var SQ_WAVE_TRAPSTS_POST_SAVECTX_SIZE   =   21
+
+var SQ_WAVE_IB_STS_RCNT_SHIFT           =   16                  //FIXME
+var SQ_WAVE_IB_STS_RCNT_SIZE            =   4                   //FIXME
+var SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT   =   15                  //FIXME
+var SQ_WAVE_IB_STS_FIRST_REPLAY_SIZE    =   1                   //FIXME
+var SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK_NEG   = 0x00007FFF    //FIXME
+
+var SQ_BUF_RSRC_WORD1_ATC_SHIFT     =   24
+var SQ_BUF_RSRC_WORD3_MTYPE_SHIFT   =   27
+
+
+/*      Save        */
+var S_SAVE_BUF_RSRC_WORD1_STRIDE        =   0x00040000          //stride is 4 bytes
+var S_SAVE_BUF_RSRC_WORD3_MISC          =   0x00807FAC          //SQ_SEL_X/Y/Z/W, BUF_NUM_FORMAT_FLOAT, (0 for MUBUF stride[17:14] when ADD_TID_ENABLE and BUF_DATA_FORMAT_32 for MTBUF), ADD_TID_ENABLE
+
+var S_SAVE_SPI_INIT_ATC_MASK            =   0x08000000          //bit[27]: ATC bit
+var S_SAVE_SPI_INIT_ATC_SHIFT           =   27
+var S_SAVE_SPI_INIT_MTYPE_MASK          =   0x70000000          //bit[30:28]: Mtype
+var S_SAVE_SPI_INIT_MTYPE_SHIFT         =   28
+var S_SAVE_SPI_INIT_FIRST_WAVE_MASK     =   0x04000000          //bit[26]: FirstWaveInTG
+var S_SAVE_SPI_INIT_FIRST_WAVE_SHIFT    =   26
+
+var S_SAVE_PC_HI_RCNT_SHIFT             =   28                  //FIXME  check with Brian to ensure all fields other than PC[47:0] can be used
+var S_SAVE_PC_HI_RCNT_MASK              =   0xF0000000          //FIXME
+var S_SAVE_PC_HI_FIRST_REPLAY_SHIFT     =   27                  //FIXME
+var S_SAVE_PC_HI_FIRST_REPLAY_MASK      =   0x08000000          //FIXME
+
+var s_save_spi_init_lo              =   exec_lo
+var s_save_spi_init_hi              =   exec_hi
+
+                                                //tba_lo and tba_hi need to be saved/restored
+var s_save_pc_lo            =   ttmp0           //{TTMP1, TTMP0} = {3??h0,pc_rewind[3:0], HT[0],trapID[7:0], PC[47:0]}
+var s_save_pc_hi            =   ttmp1
+var s_save_exec_lo          =   ttmp2
+var s_save_exec_hi          =   ttmp3
+var s_save_status           =   ttmp4
+var s_save_trapsts          =   ttmp5           //not really used until the end of the SAVE routine
+var s_save_xnack_mask_lo    =   ttmp6
+var s_save_xnack_mask_hi    =   ttmp7
+var s_save_buf_rsrc0        =   ttmp8
+var s_save_buf_rsrc1        =   ttmp9
+var s_save_buf_rsrc2        =   ttmp10
+var s_save_buf_rsrc3        =   ttmp11
+
+var s_save_mem_offset       =   tma_lo
+var s_save_alloc_size       =   s_save_trapsts          //conflict
+var s_save_tmp              =   s_save_buf_rsrc2        //shared with s_save_buf_rsrc2  (conflict: should not use mem access with s_save_tmp at the same time)
+var s_save_m0               =   tma_hi
+
+/*      Restore     */
+var S_RESTORE_BUF_RSRC_WORD1_STRIDE         =   S_SAVE_BUF_RSRC_WORD1_STRIDE
+var S_RESTORE_BUF_RSRC_WORD3_MISC           =   S_SAVE_BUF_RSRC_WORD3_MISC
+
+var S_RESTORE_SPI_INIT_ATC_MASK             =   0x08000000          //bit[27]: ATC bit
+var S_RESTORE_SPI_INIT_ATC_SHIFT            =   27
+var S_RESTORE_SPI_INIT_MTYPE_MASK           =   0x70000000          //bit[30:28]: Mtype
+var S_RESTORE_SPI_INIT_MTYPE_SHIFT          =   28
+var S_RESTORE_SPI_INIT_FIRST_WAVE_MASK      =   0x04000000          //bit[26]: FirstWaveInTG
+var S_RESTORE_SPI_INIT_FIRST_WAVE_SHIFT     =   26
+
+var S_RESTORE_PC_HI_RCNT_SHIFT              =   S_SAVE_PC_HI_RCNT_SHIFT
+var S_RESTORE_PC_HI_RCNT_MASK               =   S_SAVE_PC_HI_RCNT_MASK
+var S_RESTORE_PC_HI_FIRST_REPLAY_SHIFT      =   S_SAVE_PC_HI_FIRST_REPLAY_SHIFT
+var S_RESTORE_PC_HI_FIRST_REPLAY_MASK       =   S_SAVE_PC_HI_FIRST_REPLAY_MASK
+
+var s_restore_spi_init_lo                   =   exec_lo
+var s_restore_spi_init_hi                   =   exec_hi
+
+var s_restore_mem_offset        =   ttmp2
+var s_restore_alloc_size        =   ttmp3
+var s_restore_tmp               =   ttmp6               //tba_lo/hi need to be restored
+var s_restore_mem_offset_save   =   s_restore_tmp       //no conflict
+
+var s_restore_m0            =   s_restore_alloc_size    //no conflict
+
+var s_restore_mode          =   ttmp7
+
+var s_restore_pc_lo         =   ttmp0
+var s_restore_pc_hi         =   ttmp1
+var s_restore_exec_lo       =   tma_lo                  //no conflict
+var s_restore_exec_hi       =   tma_hi                  //no conflict
+var s_restore_status        =   ttmp4
+var s_restore_trapsts       =   ttmp5
+var s_restore_xnack_mask_lo =   xnack_mask_lo
+var s_restore_xnack_mask_hi =   xnack_mask_hi
+var s_restore_buf_rsrc0     =   ttmp8
+var s_restore_buf_rsrc1     =   ttmp9
+var s_restore_buf_rsrc2     =   ttmp10
+var s_restore_buf_rsrc3     =   ttmp11
+
+/**************************************************************************/
+/*                      trap handler entry points                         */
+/**************************************************************************/
+/* Shader Main*/
+
+shader main
+  asic(VI)
+  type(CS)
+
+
+    if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_RESTORE_NORMAL))                   //hack to use trap_id for determining save/restore
+        //FIXME VCCZ un-init assertion s_getreg_b32     s_save_status, hwreg(HW_REG_STATUS)         //save STATUS since we will change SCC
+        s_and_b32 s_save_tmp, s_save_pc_hi, 0xffff0000              //change SCC
+        s_cmp_eq_u32 s_save_tmp, 0x007e0000                         //Save: trap_id = 0x7e. Restore: trap_id = 0x7f.
+        s_cbranch_scc0 L_JUMP_TO_RESTORE                            //do not need to recover STATUS here  since we are going to RESTORE
+        //FIXME  s_setreg_b32   hwreg(HW_REG_STATUS),   s_save_status       //need to recover STATUS since we are going to SAVE
+        s_branch L_SKIP_RESTORE                                     //NOT restore, SAVE actually
+    else
+        s_branch L_SKIP_RESTORE                                     //NOT restore. might be a regular trap or save
+    end
+
+L_JUMP_TO_RESTORE:
+    s_branch L_RESTORE                                              //restore
+
+L_SKIP_RESTORE:
+
+    s_getreg_b32    s_save_status, hwreg(HW_REG_STATUS)                             //save STATUS since we will change SCC
+    s_andn2_b32     s_save_status, s_save_status, SQ_WAVE_STATUS_SPI_PRIO_MASK      //check whether this is for save
+    s_getreg_b32    s_save_trapsts, hwreg(HW_REG_TRAPSTS)
+    s_and_b32       s_save_trapsts, s_save_trapsts, SQ_WAVE_TRAPSTS_SAVECTX_MASK    //check whether this is for save
+    s_cbranch_scc1  L_SAVE                                      //this is the operation for save
+
+    // *********    Handle non-CWSR traps       *******************
+if (!EMU_RUN_HACK)
+    /* read tba and tma for next level trap handler, ttmp4 is used as s_save_status */
+    s_load_dwordx4  [ttmp8,ttmp9,ttmp10, ttmp11], [tma_lo,tma_hi], 0
+    s_waitcnt lgkmcnt(0)
+    s_or_b32        ttmp7, ttmp8, ttmp9
+    s_cbranch_scc0  L_NO_NEXT_TRAP //next level trap handler has not been set
+    s_setreg_b32    hwreg(HW_REG_STATUS), s_save_status //restore HW status(SCC)
+    s_setpc_b64     [ttmp8,ttmp9] //jump to next level trap handler
+
+L_NO_NEXT_TRAP:
+    s_getreg_b32    s_save_trapsts, hwreg(HW_REG_TRAPSTS)
+    s_and_b32       s_save_trapsts, s_save_trapsts, SQ_WAVE_TRAPSTS_EXCE_MASK // Check whether it is an exception
+    s_cbranch_scc1  L_EXCP_CASE   // Exception, jump back to the shader program directly.
+    s_add_u32       ttmp0, ttmp0, 4   // S_TRAP case, add 4 to ttmp0
+    s_addc_u32  ttmp1, ttmp1, 0
+L_EXCP_CASE:
+    s_and_b32   ttmp1, ttmp1, 0xFFFF
+    s_setreg_b32    hwreg(HW_REG_STATUS), s_save_status //restore HW status(SCC)
+    s_rfe_b64       [ttmp0, ttmp1]
+end
+    // *********        End handling of non-CWSR traps   *******************
+
+/**************************************************************************/
+/*                      save routine                                      */
+/**************************************************************************/
+
+L_SAVE:
+
+if G8SR_DEBUG_TIMESTAMP
+        s_memrealtime   s_g8sr_ts_save_s
+        s_waitcnt lgkmcnt(0)         //FIXME, will cause xnack??
+end
+
+    //check whether there is mem_viol
+    s_getreg_b32    s_save_trapsts, hwreg(HW_REG_TRAPSTS)
+    s_and_b32   s_save_trapsts, s_save_trapsts, SQ_WAVE_TRAPSTS_MEM_VIOL_MASK
+    s_cbranch_scc0  L_NO_PC_REWIND
+
+    //if so, need rewind PC assuming GDS operation gets NACKed
+    s_mov_b32       s_save_tmp, 0                                                           //clear mem_viol bit
+    s_setreg_b32    hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_MEM_VIOL_SHIFT, 1), s_save_tmp    //clear mem_viol bit
+    s_and_b32       s_save_pc_hi, s_save_pc_hi, 0x0000ffff    //pc[47:32]
+    s_sub_u32       s_save_pc_lo, s_save_pc_lo, 8             //pc[31:0]-8
+    s_subb_u32      s_save_pc_hi, s_save_pc_hi, 0x0           // -scc
+
+L_NO_PC_REWIND:
+    s_mov_b32       s_save_tmp, 0                                                           //clear saveCtx bit
+    s_setreg_b32    hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_SAVECTX_SHIFT, 1), s_save_tmp     //clear saveCtx bit
+
+    s_mov_b32       s_save_xnack_mask_lo,   xnack_mask_lo                                   //save XNACK_MASK
+    s_mov_b32       s_save_xnack_mask_hi,   xnack_mask_hi    //save XNACK must before any memory operation
+    s_getreg_b32    s_save_tmp, hwreg(HW_REG_IB_STS, SQ_WAVE_IB_STS_RCNT_SHIFT, SQ_WAVE_IB_STS_RCNT_SIZE)                   //save RCNT
+    s_lshl_b32      s_save_tmp, s_save_tmp, S_SAVE_PC_HI_RCNT_SHIFT
+    s_or_b32        s_save_pc_hi, s_save_pc_hi, s_save_tmp
+    s_getreg_b32    s_save_tmp, hwreg(HW_REG_IB_STS, SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT, SQ_WAVE_IB_STS_FIRST_REPLAY_SIZE)   //save FIRST_REPLAY
+    s_lshl_b32      s_save_tmp, s_save_tmp, S_SAVE_PC_HI_FIRST_REPLAY_SHIFT
+    s_or_b32        s_save_pc_hi, s_save_pc_hi, s_save_tmp
+    s_getreg_b32    s_save_tmp, hwreg(HW_REG_IB_STS)                                        //clear RCNT and FIRST_REPLAY in IB_STS
+    s_and_b32       s_save_tmp, s_save_tmp, SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK_NEG
+
+    s_setreg_b32    hwreg(HW_REG_IB_STS), s_save_tmp
+
+    /*      inform SPI the readiness and wait for SPI's go signal */
+    s_mov_b32       s_save_exec_lo, exec_lo                                                 //save EXEC and use EXEC for the go signal from SPI
+    s_mov_b32       s_save_exec_hi, exec_hi
+    s_mov_b64       exec,   0x0                                                             //clear EXEC to get ready to receive
+
+if G8SR_DEBUG_TIMESTAMP
+        s_memrealtime  s_g8sr_ts_sq_save_msg
+        s_waitcnt lgkmcnt(0)
+end
+
+    if (EMU_RUN_HACK)
+
+    else
+        s_sendmsg   sendmsg(MSG_SAVEWAVE)  //send SPI a message and wait for SPI's write to EXEC
+    end
+
+  L_SLEEP:
+    s_sleep 0x2                // sleep 1 (64clk) is not enough for 8 waves per SIMD, which will cause SQ hang, since the 7,8th wave could not get arbit to exec inst, while other waves are stuck into the sleep-loop and waiting for wrexec!=0
+
+    if (EMU_RUN_HACK)
+
+    else
+        s_cbranch_execz L_SLEEP
+    end
+
+if G8SR_DEBUG_TIMESTAMP
+        s_memrealtime  s_g8sr_ts_spi_wrexec
+        s_waitcnt lgkmcnt(0)
+end
+
+    /*      setup Resource Constants    */
+    if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_SAVE_SINGLE_WAVE))
+        //calculate wd_addr using absolute thread id
+        v_readlane_b32 s_save_tmp, v9, 0
+        s_lshr_b32 s_save_tmp, s_save_tmp, 6
+        s_mul_i32 s_save_tmp, s_save_tmp, WAVE_SPACE
+        s_add_i32 s_save_spi_init_lo, s_save_tmp, WG_BASE_ADDR_LO
+        s_mov_b32 s_save_spi_init_hi, WG_BASE_ADDR_HI
+        s_and_b32 s_save_spi_init_hi, s_save_spi_init_hi, CTX_SAVE_CONTROL
+    else
+    end
+    if ((EMU_RUN_HACK) && (EMU_RUN_HACK_SAVE_SINGLE_WAVE))
+        s_add_i32 s_save_spi_init_lo, s_save_tmp, WG_BASE_ADDR_LO
+        s_mov_b32 s_save_spi_init_hi, WG_BASE_ADDR_HI
+        s_and_b32 s_save_spi_init_hi, s_save_spi_init_hi, CTX_SAVE_CONTROL
+    else
+    end
+
+
+    s_mov_b32       s_save_buf_rsrc0,   s_save_spi_init_lo                                                      //base_addr_lo
+    s_and_b32       s_save_buf_rsrc1,   s_save_spi_init_hi, 0x0000FFFF                                          //base_addr_hi
+    s_or_b32        s_save_buf_rsrc1,   s_save_buf_rsrc1,  S_SAVE_BUF_RSRC_WORD1_STRIDE
+    s_mov_b32       s_save_buf_rsrc2,   0                                                                       //NUM_RECORDS initial value = 0 (in bytes) although not necessarily inited
+    s_mov_b32       s_save_buf_rsrc3,   S_SAVE_BUF_RSRC_WORD3_MISC
+    s_and_b32       s_save_tmp,         s_save_spi_init_hi, S_SAVE_SPI_INIT_ATC_MASK
+    s_lshr_b32      s_save_tmp,         s_save_tmp, (S_SAVE_SPI_INIT_ATC_SHIFT-SQ_BUF_RSRC_WORD1_ATC_SHIFT)         //get ATC bit into position
+    s_or_b32        s_save_buf_rsrc3,   s_save_buf_rsrc3,  s_save_tmp                                           //or ATC
+    s_and_b32       s_save_tmp,         s_save_spi_init_hi, S_SAVE_SPI_INIT_MTYPE_MASK
+    s_lshr_b32      s_save_tmp,         s_save_tmp, (S_SAVE_SPI_INIT_MTYPE_SHIFT-SQ_BUF_RSRC_WORD3_MTYPE_SHIFT)     //get MTYPE bits into position
+    s_or_b32        s_save_buf_rsrc3,   s_save_buf_rsrc3,  s_save_tmp                                           //or MTYPE
+
+    //FIXME  right now s_save_m0/s_save_mem_offset use tma_lo/tma_hi  (might need to save them before using them?)
+    s_mov_b32       s_save_m0,          m0                                                                  //save M0
+
+    /*      global mem offset           */
+    s_mov_b32       s_save_mem_offset,  0x0                                                                     //mem offset initial value = 0
+
+
+
+
+    /*      save HW registers   */
+    //////////////////////////////
+
+  L_SAVE_HWREG:
+        // HWREG SR memory offset : size(VGPR)+size(SGPR)
+       get_vgpr_size_bytes(s_save_mem_offset)
+       get_sgpr_size_bytes(s_save_tmp)
+       s_add_u32 s_save_mem_offset, s_save_mem_offset, s_save_tmp
+
+
+    s_mov_b32       s_save_buf_rsrc2, 0x4                               //NUM_RECORDS   in bytes
+    if (SWIZZLE_EN)
+        s_add_u32       s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0                     //FIXME need to use swizzle to enable bounds checking?
+    else
+        s_mov_b32       s_save_buf_rsrc2,  0x1000000                                //NUM_RECORDS in bytes
+    end
+
+
+    write_hwreg_to_mem(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset)                  //M0
+
+    if ((EMU_RUN_HACK) && (EMU_RUN_HACK_SAVE_FIRST_TIME))
+        s_add_u32 s_save_pc_lo, s_save_pc_lo, 4             //pc[31:0]+4
+        s_addc_u32 s_save_pc_hi, s_save_pc_hi, 0x0          //carry bit over
+        s_mov_b32   tba_lo, EMU_RUN_HACK_SAVE_FIRST_TIME_TBA_LO
+        s_mov_b32   tba_hi, EMU_RUN_HACK_SAVE_FIRST_TIME_TBA_HI
+    end
+
+    write_hwreg_to_mem(s_save_pc_lo, s_save_buf_rsrc0, s_save_mem_offset)                   //PC
+    write_hwreg_to_mem(s_save_pc_hi, s_save_buf_rsrc0, s_save_mem_offset)
+    write_hwreg_to_mem(s_save_exec_lo, s_save_buf_rsrc0, s_save_mem_offset)             //EXEC
+    write_hwreg_to_mem(s_save_exec_hi, s_save_buf_rsrc0, s_save_mem_offset)
+    write_hwreg_to_mem(s_save_status, s_save_buf_rsrc0, s_save_mem_offset)              //STATUS
+
+    //s_save_trapsts conflicts with s_save_alloc_size
+    s_getreg_b32    s_save_trapsts, hwreg(HW_REG_TRAPSTS)
+    write_hwreg_to_mem(s_save_trapsts, s_save_buf_rsrc0, s_save_mem_offset)             //TRAPSTS
+
+    write_hwreg_to_mem(s_save_xnack_mask_lo, s_save_buf_rsrc0, s_save_mem_offset)           //XNACK_MASK_LO
+    write_hwreg_to_mem(s_save_xnack_mask_hi, s_save_buf_rsrc0, s_save_mem_offset)           //XNACK_MASK_HI
+
+    //use s_save_tmp would introduce conflict here between s_save_tmp and s_save_buf_rsrc2
+    s_getreg_b32    s_save_m0, hwreg(HW_REG_MODE)                                                   //MODE
+    write_hwreg_to_mem(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset)
+    write_hwreg_to_mem(tba_lo, s_save_buf_rsrc0, s_save_mem_offset)                     //TBA_LO
+    write_hwreg_to_mem(tba_hi, s_save_buf_rsrc0, s_save_mem_offset)                     //TBA_HI
+
+
+
+    /*      the first wave in the threadgroup    */
+        // save first_wave bits in tba_hi unused bit.26
+    s_and_b32       s_save_tmp, s_save_spi_init_hi, S_SAVE_SPI_INIT_FIRST_WAVE_MASK     // extract first wave bit
+    //s_or_b32        tba_hi, s_save_tmp, tba_hi                                        // save first wave bit to tba_hi.bits[26]
+    s_mov_b32        s_save_exec_hi, 0x0
+    s_or_b32         s_save_exec_hi, s_save_tmp, s_save_exec_hi                          // save first wave bit to s_save_exec_hi.bits[26]
+
+
+    /*          save SGPRs      */
+        // Save SGPR before LDS save, then the s0 to s4 can be used during LDS save...
+    //////////////////////////////
+
+    // SGPR SR memory offset : size(VGPR)
+    get_vgpr_size_bytes(s_save_mem_offset)
+    // TODO, change RSRC word to rearrange memory layout for SGPRS
+
+    s_getreg_b32    s_save_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE)               //sgpr_size
+    s_add_u32       s_save_alloc_size, s_save_alloc_size, 1
+    s_lshl_b32      s_save_alloc_size, s_save_alloc_size, 4                         //Number of SGPRs = (sgpr_size + 1) * 16   (non-zero value)
+
+    if (SGPR_SAVE_USE_SQC)
+        s_lshl_b32      s_save_buf_rsrc2,   s_save_alloc_size, 2                    //NUM_RECORDS in bytes
+    else
+        s_lshl_b32      s_save_buf_rsrc2,   s_save_alloc_size, 8                    //NUM_RECORDS in bytes (64 threads)
+    end
+
+    if (SWIZZLE_EN)
+        s_add_u32       s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0                     //FIXME need to use swizzle to enable bounds checking?
+    else
+        s_mov_b32       s_save_buf_rsrc2,  0x1000000                                //NUM_RECORDS in bytes
+    end
+
+
+    // backup s_save_buf_rsrc0,1 to s_save_pc_lo/hi, since write_16sgpr_to_mem function will change the rsrc0
+    //s_mov_b64 s_save_pc_lo, s_save_buf_rsrc0
+    s_mov_b64 s_save_xnack_mask_lo, s_save_buf_rsrc0
+    s_add_u32 s_save_buf_rsrc0, s_save_buf_rsrc0, s_save_mem_offset
+    s_addc_u32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0
+
+    s_mov_b32       m0, 0x0                         //SGPR initial index value =0
+  L_SAVE_SGPR_LOOP:
+    // SGPR is allocated in 16 SGPR granularity
+    s_movrels_b64   s0, s0     //s0 = s[0+m0], s1 = s[1+m0]
+    s_movrels_b64   s2, s2     //s2 = s[2+m0], s3 = s[3+m0]
+    s_movrels_b64   s4, s4     //s4 = s[4+m0], s5 = s[5+m0]
+    s_movrels_b64   s6, s6     //s6 = s[6+m0], s7 = s[7+m0]
+    s_movrels_b64   s8, s8     //s8 = s[8+m0], s9 = s[9+m0]
+    s_movrels_b64   s10, s10   //s10 = s[10+m0], s11 = s[11+m0]
+    s_movrels_b64   s12, s12   //s12 = s[12+m0], s13 = s[13+m0]
+    s_movrels_b64   s14, s14   //s14 = s[14+m0], s15 = s[15+m0]
+
+    write_16sgpr_to_mem(s0, s_save_buf_rsrc0, s_save_mem_offset) //PV: the best performance should be using s_buffer_store_dwordx4
+    s_add_u32       m0, m0, 16                                                      //next sgpr index
+    s_cmp_lt_u32    m0, s_save_alloc_size                                           //scc = (m0 < s_save_alloc_size) ? 1 : 0
+    s_cbranch_scc1  L_SAVE_SGPR_LOOP                                    //SGPR save is complete?
+    // restore s_save_buf_rsrc0,1
+    //s_mov_b64 s_save_buf_rsrc0, s_save_pc_lo
+    s_mov_b64 s_save_buf_rsrc0, s_save_xnack_mask_lo
+
+
+
+
+    /*          save first 4 VGPR, then LDS save could use   */
+        // each wave will alloc 4 vgprs at least...
+    /////////////////////////////////////////////////////////////////////////////////////
+
+    s_mov_b32       s_save_mem_offset, 0
+    s_mov_b32       exec_lo, 0xFFFFFFFF                                             //need every thread from now on
+    s_mov_b32       exec_hi, 0xFFFFFFFF
+
+    if (SWIZZLE_EN)
+        s_add_u32       s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0                     //FIXME need to use swizzle to enable bounds checking?
+    else
+        s_mov_b32       s_save_buf_rsrc2,  0x1000000                                //NUM_RECORDS in bytes
+    end
+
+
+    // VGPR Allocated in 4-GPR granularity
+
+if G8SR_VGPR_SR_IN_DWX4
+        // the const stride for DWx4 is 4*4 bytes
+        s_and_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0x0000FFFF   // reset const stride to 0
+        s_or_b32  s_save_buf_rsrc1, s_save_buf_rsrc1, G8SR_SAVE_BUF_RSRC_WORD1_STRIDE_DWx4  // const stride to 4*4 bytes
+
+        buffer_store_dwordx4 v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1
+
+        s_and_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0x0000FFFF   // reset const stride to 0
+        s_or_b32  s_save_buf_rsrc1, s_save_buf_rsrc1, S_SAVE_BUF_RSRC_WORD1_STRIDE  // reset const stride to 4 bytes
+else
+        buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1
+        buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1  offset:256
+        buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1  offset:256*2
+        buffer_store_dword v3, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1  offset:256*3
+end
+
+
+
+    /*          save LDS        */
+    //////////////////////////////
+
+  L_SAVE_LDS:
+
+        // Change EXEC to all threads...
+    s_mov_b32       exec_lo, 0xFFFFFFFF   //need every thread from now on
+    s_mov_b32       exec_hi, 0xFFFFFFFF
+
+    s_getreg_b32    s_save_alloc_size, hwreg(HW_REG_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE)             //lds_size
+    s_and_b32       s_save_alloc_size, s_save_alloc_size, 0xFFFFFFFF                //lds_size is zero?
+    s_cbranch_scc0  L_SAVE_LDS_DONE                                                                            //no lds used? jump to L_SAVE_LDS_DONE
+
+    s_barrier               //LDS is used? wait for other waves in the same TG
+    //s_and_b32     s_save_tmp, tba_hi, S_SAVE_SPI_INIT_FIRST_WAVE_MASK                //exec is still used here
+    s_and_b32       s_save_tmp, s_save_exec_hi, S_SAVE_SPI_INIT_FIRST_WAVE_MASK                //exec is still used here
+    s_cbranch_scc0  L_SAVE_LDS_DONE
+
+        // first wave do LDS save;
+
+    s_lshl_b32      s_save_alloc_size, s_save_alloc_size, 6                         //LDS size in dwords = lds_size * 64dw
+    s_lshl_b32      s_save_alloc_size, s_save_alloc_size, 2                         //LDS size in bytes
+    s_mov_b32       s_save_buf_rsrc2,  s_save_alloc_size                            //NUM_RECORDS in bytes
+
+    // LDS at offset: size(VGPR)+SIZE(SGPR)+SIZE(HWREG)
+    //
+    get_vgpr_size_bytes(s_save_mem_offset)
+    get_sgpr_size_bytes(s_save_tmp)
+    s_add_u32  s_save_mem_offset, s_save_mem_offset, s_save_tmp
+    s_add_u32 s_save_mem_offset, s_save_mem_offset, get_hwreg_size_bytes()
+
+
+    if (SWIZZLE_EN)
+        s_add_u32       s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0       //FIXME need to use swizzle to enable bounds checking?
+    else
+        s_mov_b32       s_save_buf_rsrc2,  0x1000000                  //NUM_RECORDS in bytes
+    end
+
+    s_mov_b32       m0, 0x0                                               //lds_offset initial value = 0
+
+
+var LDS_DMA_ENABLE = 0
+var UNROLL = 0
+if UNROLL==0 && LDS_DMA_ENABLE==1
+        s_mov_b32  s3, 256*2
+        s_nop 0
+        s_nop 0
+        s_nop 0
+  L_SAVE_LDS_LOOP:
+        //TODO: it looks like the 2-instruction buffer_store/load clause for save/restore may hurt performance???
+    if (SAVE_LDS)     //SPI always alloc LDS space in 128DW granularity
+            buffer_store_lds_dword s_save_buf_rsrc0, s_save_mem_offset lds:1            // first 64DW
+            buffer_store_lds_dword s_save_buf_rsrc0, s_save_mem_offset lds:1 offset:256 // second 64DW
+    end
+
+    s_add_u32       m0, m0, s3                                          //every buffer_store_lds does 256 bytes
+    s_add_u32       s_save_mem_offset, s_save_mem_offset, s3                            //mem offset increased by 256 bytes
+    s_cmp_lt_u32    m0, s_save_alloc_size                                               //scc=(m0 < s_save_alloc_size) ? 1 : 0
+    s_cbranch_scc1  L_SAVE_LDS_LOOP                                                     //LDS save is complete?
+
+elsif LDS_DMA_ENABLE==1 && UNROLL==1 // UNROLL, causes icache misses
+      // store from highest LDS address to lowest
+      s_mov_b32  s3, 256*2
+      s_sub_u32  m0, s_save_alloc_size, s3
+      s_add_u32 s_save_mem_offset, s_save_mem_offset, m0
+      s_lshr_b32 s_save_alloc_size, s_save_alloc_size, 9   // how many 128-DW chunks...
+      s_sub_u32 s_save_alloc_size, 128, s_save_alloc_size   // store from highest addr to lowest
+      s_mul_i32 s_save_alloc_size, s_save_alloc_size, 6*4   // PC offset increment, each LDS save block costs 6*4 bytes of instructions
+      s_add_u32 s_save_alloc_size, s_save_alloc_size, 3*4   //3*4 bytes for the 3 instructions below (s_add, s_addc, s_setpc)
+      s_nop 0
+      s_nop 0
+      s_nop 0   //pad 3 dw to let LDS_DMA align with 64Bytes
+      s_getpc_b64 s[0:1]                              // reuse s[0:1], since s[0:1] already saved
+      s_add_u32   s0, s0,s_save_alloc_size
+      s_addc_u32  s1, s1, 0
+      s_setpc_b64 s[0:1]
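+
+      // Computed goto: s_save_alloc_size now holds the byte distance from the
+      // s_getpc_b64 result to the first store block that is still needed, so
+      // execution lands partway into the 128 unrolled blocks below and only
+      // the required number of 128-DW chunks (highest address first) is run.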
+
+
+       for var i = 0; i < 128; i++
+            // be careful to make this a 64-byte aligned address, which could improve performance...
+            buffer_store_lds_dword s_save_buf_rsrc0, s_save_mem_offset lds:1 offset:0             // first 64DW
+            buffer_store_lds_dword s_save_buf_rsrc0, s_save_mem_offset lds:1 offset:256           // second 64DW
+
+            if i!=127
+                s_sub_u32  m0, m0, s3                      // use an SGPR to shrink the 2-DW inst to a 1-DW inst, i.e. pack more LDS_DMA insts into one cache line
+                s_sub_u32  s_save_mem_offset, s_save_mem_offset, s3
+            end
+       end
+
+else   // BUFFER_STORE
+      v_mbcnt_lo_u32_b32 v2, 0xffffffff, 0x0
+      v_mbcnt_hi_u32_b32 v3, 0xffffffff, v2     // tid
+      v_mul_i32_i24 v2, v3, 8   // tid*8
+      v_mov_b32 v3, 256*2
+      s_mov_b32 m0, 0x10000
+      s_mov_b32 s0, s_save_buf_rsrc3
+      s_and_b32 s_save_buf_rsrc3, s_save_buf_rsrc3, 0xFF7FFFFF    // disable add_tid
+      s_or_b32 s_save_buf_rsrc3, s_save_buf_rsrc3, 0x58000   //DFMT
+
+L_SAVE_LDS_LOOP_VECTOR:
+      ds_read_b64 v[0:1], v2    // read 2 DW from LDS at the byte address in v2
+      s_waitcnt lgkmcnt(0)
+      buffer_store_dwordx2  v[0:1], v2, s_save_buf_rsrc0, s_save_mem_offset offen:1  glc:1  slc:1
+//      s_waitcnt vmcnt(0)
+      v_add_u32 v2, vcc[0:1], v2, v3
+      v_cmp_lt_u32 vcc[0:1], v2, s_save_alloc_size
+      s_cbranch_vccnz L_SAVE_LDS_LOOP_VECTOR
+
+      // restore rsrc3
+      s_mov_b32 s_save_buf_rsrc3, s0
+
+end
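+
+// Note: with LDS_DMA_ENABLE=0 (set above) only the BUFFER_STORE path is
+// assembled: each lane derives its own LDS byte address (lane_id*8 via the
+// v_mbcnt pair), reads 2 DW with ds_read_b64 and writes them out with
+// buffer_store_dwordx2, so each iteration moves 64 lanes * 8 bytes = 512
+// bytes until v2 reaches the LDS allocation size.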
+
+L_SAVE_LDS_DONE:
+
+
+    /*          save VGPRs - save the rest of the VGPRs        */
+    //////////////////////////////////////////////////////////////////////////////////////
+  L_SAVE_VGPR:
+    // VGPR SR memory offset: 0
+    // TODO rearrange the RSRC words to use swizzle for VGPR save...
+
+    s_mov_b32       s_save_mem_offset, (0+256*4)                                    // for the rest of the VGPRs; v[0:3] were saved earlier
+    s_mov_b32       exec_lo, 0xFFFFFFFF                                             //need every thread from now on
+    s_mov_b32       exec_hi, 0xFFFFFFFF
+
+    s_getreg_b32    s_save_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE)                   //vgpr_size
+    s_add_u32       s_save_alloc_size, s_save_alloc_size, 1
+    s_lshl_b32      s_save_alloc_size, s_save_alloc_size, 2                         //Number of VGPRs = (vgpr_size + 1) * 4    (non-zero value)   //FIXME for GFX, zero is possible
+    s_lshl_b32      s_save_buf_rsrc2,  s_save_alloc_size, 8                         //NUM_RECORDS in bytes (64 threads*4)
+    if (SWIZZLE_EN)
+        s_add_u32       s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0                     //FIXME need to use swizzle to enable bounds checking?
+    else
+        s_mov_b32       s_save_buf_rsrc2,  0x1000000                                //NUM_RECORDS in bytes
+    end
+
+
+    // VGPR Allocated in 4-GPR granularity
+
+if G8SR_VGPR_SR_IN_DWX4
+        // the const stride for DWx4 is 4*4 bytes
+        s_and_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0x0000FFFF   // reset const stride to 0
+        s_or_b32  s_save_buf_rsrc1, s_save_buf_rsrc1, G8SR_SAVE_BUF_RSRC_WORD1_STRIDE_DWx4  // const stride to 4*4 bytes
+
+        s_mov_b32         m0, 4     // skip first 4 VGPRs
+        s_cmp_lt_u32      m0, s_save_alloc_size
+        s_cbranch_scc0    L_SAVE_VGPR_LOOP_END      // no more vgprs
+
+        s_set_gpr_idx_on  m0, 0x1   // This will change M0
+        s_add_u32         s_save_alloc_size, s_save_alloc_size, 0x1000  // s_set_gpr_idx_on sets M0[15:12]=0x1, so bias the loop bound by 0x1000
+L_SAVE_VGPR_LOOP:
+        v_mov_b32         v0, v0   // v0 = v[0+m0]
+        v_mov_b32         v1, v1
+        v_mov_b32         v2, v2
+        v_mov_b32         v3, v3
+
+
+        buffer_store_dwordx4 v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1
+        s_add_u32         m0, m0, 4
+        s_add_u32         s_save_mem_offset, s_save_mem_offset, 256*4
+        s_cmp_lt_u32      m0, s_save_alloc_size
+    s_cbranch_scc1  L_SAVE_VGPR_LOOP                                                //VGPR save is complete?
+    s_set_gpr_idx_off
+L_SAVE_VGPR_LOOP_END:
+
+        s_and_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0x0000FFFF   // reset const stride to 0
+        s_or_b32  s_save_buf_rsrc1, s_save_buf_rsrc1, S_SAVE_BUF_RSRC_WORD1_STRIDE  // reset const stride to 4 bytes
+else
+    // VGPR store using dw burst
+    s_mov_b32         m0, 0x4   //VGPR initial index value = 4 (v[0:3] already saved)
+    s_cmp_lt_u32      m0, s_save_alloc_size
+    s_cbranch_scc0    L_SAVE_VGPR_END
+
+
+    s_set_gpr_idx_on    m0, 0x1 //M0[7:0] = M0[7:0] and M0[15:12] = 0x1
+    s_add_u32       s_save_alloc_size, s_save_alloc_size, 0x1000                    //add 0x1000 since we compare m0 against it later
+
+  L_SAVE_VGPR_LOOP:
+    v_mov_b32       v0, v0              //v0 = v[0+m0]
+    v_mov_b32       v1, v1              //v1 = v[1+m0]
+    v_mov_b32       v2, v2              //v2 = v[2+m0]
+    v_mov_b32       v3, v3              //v3 = v[3+m0]
+
+    if(USE_MTBUF_INSTEAD_OF_MUBUF)
+        tbuffer_store_format_x v0, v0, s_save_buf_rsrc0, s_save_mem_offset format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1
+    else
+        buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1
+        buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1  offset:256
+        buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1  offset:256*2
+        buffer_store_dword v3, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1  offset:256*3
+    end
+
+    s_add_u32       m0, m0, 4                                                       //next vgpr index
+    s_add_u32       s_save_mem_offset, s_save_mem_offset, 256*4                     //every buffer_store_dword does 256 bytes
+    s_cmp_lt_u32    m0, s_save_alloc_size                                           //scc = (m0 < s_save_alloc_size) ? 1 : 0
+    s_cbranch_scc1  L_SAVE_VGPR_LOOP                                                //VGPR save is complete?
+    s_set_gpr_idx_off
+end
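+
+// Note: both save paths rely on s_set_gpr_idx_on mode 0x1 (SRC0 indexing):
+// "v_mov_b32 vN, vN" then reads v[N+m0], so v[0:3] act as a sliding
+// 4-register staging window that the buffer stores flush, 4 VGPRs (1KB)
+// per iteration.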
+
+L_SAVE_VGPR_END:
+
+
+
+
+
+
+    /*     S_PGM_END_SAVED  */                              //FIXME  graphics ONLY
+    if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_SAVE_NORMAL_EXIT))
+        s_and_b32 s_save_pc_hi, s_save_pc_hi, 0x0000ffff    //pc[47:32]
+        s_add_u32 s_save_pc_lo, s_save_pc_lo, 4             //pc[31:0]+4
+        s_addc_u32 s_save_pc_hi, s_save_pc_hi, 0x0          //carry bit over
+        s_rfe_b64 s_save_pc_lo                              //Return to the main shader program
+    else
+    end
+
+// Save Done timestamp
+if G8SR_DEBUG_TIMESTAMP
+        s_memrealtime   s_g8sr_ts_save_d
+        // SGPR SR memory offset : size(VGPR)
+        get_vgpr_size_bytes(s_save_mem_offset)
+        s_add_u32 s_save_mem_offset, s_save_mem_offset, G8SR_DEBUG_TS_SAVE_D_OFFSET
+        s_waitcnt lgkmcnt(0)         //FIXME, will cause xnack??
+        // Need reset rsrc2??
+        s_mov_b32 m0, s_save_mem_offset
+        s_mov_b32 s_save_buf_rsrc2,  0x1000000                                  //NUM_RECORDS in bytes
+        s_buffer_store_dwordx2 s_g8sr_ts_save_d, s_save_buf_rsrc0, m0       glc:1
+end
+
+
+    s_branch    L_END_PGM
+
+
+
+/**************************************************************************/
+/*                      restore routine                                   */
+/**************************************************************************/
+
+L_RESTORE:
+    /*      Setup Resource Constants    */
+    if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_RESTORE_NORMAL))
+        //calculate wd_addr using absolute thread id
+        v_readlane_b32 s_restore_tmp, v9, 0
+        s_lshr_b32 s_restore_tmp, s_restore_tmp, 6
+        s_mul_i32 s_restore_tmp, s_restore_tmp, WAVE_SPACE
+        s_add_i32 s_restore_spi_init_lo, s_restore_tmp, WG_BASE_ADDR_LO
+        s_mov_b32 s_restore_spi_init_hi, WG_BASE_ADDR_HI
+        s_and_b32 s_restore_spi_init_hi, s_restore_spi_init_hi, CTX_RESTORE_CONTROL
+    else
+    end
+
+if G8SR_DEBUG_TIMESTAMP
+        s_memrealtime   s_g8sr_ts_restore_s
+        s_waitcnt lgkmcnt(0)         //FIXME, will cause xnack??
+        // tma_lo/hi are sgpr 110, 111, which will not be used in the 112-SGPR allocation case...
+        s_mov_b32 s_restore_pc_lo, s_g8sr_ts_restore_s[0]
+        s_mov_b32 s_restore_pc_hi, s_g8sr_ts_restore_s[1]   //back up ts to ttmp0/1, since exec will be restored at the end..
+end
+
+
+
+    s_mov_b32       s_restore_buf_rsrc0,    s_restore_spi_init_lo                                                           //base_addr_lo
+    s_and_b32       s_restore_buf_rsrc1,    s_restore_spi_init_hi, 0x0000FFFF                                               //base_addr_hi
+    s_or_b32        s_restore_buf_rsrc1,    s_restore_buf_rsrc1,  S_RESTORE_BUF_RSRC_WORD1_STRIDE
+    s_mov_b32       s_restore_buf_rsrc2,    0                                                                               //NUM_RECORDS initial value = 0 (in bytes)
+    s_mov_b32       s_restore_buf_rsrc3,    S_RESTORE_BUF_RSRC_WORD3_MISC
+    s_and_b32       s_restore_tmp,          s_restore_spi_init_hi, S_RESTORE_SPI_INIT_ATC_MASK
+    s_lshr_b32      s_restore_tmp,          s_restore_tmp, (S_RESTORE_SPI_INIT_ATC_SHIFT-SQ_BUF_RSRC_WORD1_ATC_SHIFT)       //get ATC bit into position
+    s_or_b32        s_restore_buf_rsrc3,    s_restore_buf_rsrc3,  s_restore_tmp                                             //or ATC
+    s_and_b32       s_restore_tmp,          s_restore_spi_init_hi, S_RESTORE_SPI_INIT_MTYPE_MASK
+    s_lshr_b32      s_restore_tmp,          s_restore_tmp, (S_RESTORE_SPI_INIT_MTYPE_SHIFT-SQ_BUF_RSRC_WORD3_MTYPE_SHIFT)   //get MTYPE bits into position
+    s_or_b32        s_restore_buf_rsrc3,    s_restore_buf_rsrc3,  s_restore_tmp                                             //or MTYPE
+
+    /*      global mem offset           */
+//  s_mov_b32       s_restore_mem_offset, 0x0                               //mem offset initial value = 0
+
+    /*      the first wave in the threadgroup    */
+    s_and_b32       s_restore_tmp, s_restore_spi_init_hi, S_RESTORE_SPI_INIT_FIRST_WAVE_MASK
+    s_cbranch_scc0  L_RESTORE_VGPR
+
+    /*          restore LDS     */
+    //////////////////////////////
+  L_RESTORE_LDS:
+
+    s_mov_b32       exec_lo, 0xFFFFFFFF                                                     //need every thread from now on   //be consistent with SAVE, although this could be moved earlier
+    s_mov_b32       exec_hi, 0xFFFFFFFF
+
+    s_getreg_b32    s_restore_alloc_size, hwreg(HW_REG_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE)              //lds_size
+    s_and_b32       s_restore_alloc_size, s_restore_alloc_size, 0xFFFFFFFF                  //lds_size is zero?
+    s_cbranch_scc0  L_RESTORE_VGPR                                                          //no lds used? jump to L_RESTORE_VGPR
+    s_lshl_b32      s_restore_alloc_size, s_restore_alloc_size, 6                           //LDS size in dwords = lds_size * 64dw
+    s_lshl_b32      s_restore_alloc_size, s_restore_alloc_size, 2                           //LDS size in bytes
+    s_mov_b32       s_restore_buf_rsrc2,    s_restore_alloc_size                            //NUM_RECORDS in bytes
+
+    // LDS at offset: size(VGPR)+SIZE(SGPR)+SIZE(HWREG)
+    //
+    get_vgpr_size_bytes(s_restore_mem_offset)
+    get_sgpr_size_bytes(s_restore_tmp)
+    s_add_u32  s_restore_mem_offset, s_restore_mem_offset, s_restore_tmp
+    s_add_u32  s_restore_mem_offset, s_restore_mem_offset, get_hwreg_size_bytes()            //FIXME, Check if offset overflow???
+
+
+    if (SWIZZLE_EN)
+        s_add_u32       s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0                       //FIXME need to use swizzle to enable bounds checking?
+    else
+        s_mov_b32       s_restore_buf_rsrc2,  0x1000000                                     //NUM_RECORDS in bytes
+    end
+    s_mov_b32       m0, 0x0                                                                 //lds_offset initial value = 0
+
+  L_RESTORE_LDS_LOOP:
+    if (SAVE_LDS)
+        buffer_load_dword   v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset lds:1                    // first 64DW
+        buffer_load_dword   v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset lds:1 offset:256         // second 64DW
+    end
+    s_add_u32       m0, m0, 256*2                                               // 128 DW
+    s_add_u32       s_restore_mem_offset, s_restore_mem_offset, 256*2           //mem offset increased by 128DW
+    s_cmp_lt_u32    m0, s_restore_alloc_size                                    //scc=(m0 < s_restore_alloc_size) ? 1 : 0
+    s_cbranch_scc1  L_RESTORE_LDS_LOOP                                                      //LDS restore is complete?
+
+
+    /*          restore VGPRs       */
+    //////////////////////////////
+  L_RESTORE_VGPR:
+        // VGPR SR memory offset : 0
+    s_mov_b32       s_restore_mem_offset, 0x0
+    s_mov_b32       exec_lo, 0xFFFFFFFF                                                     //need every thread from now on   //be consistent with SAVE, although this could be moved earlier
+    s_mov_b32       exec_hi, 0xFFFFFFFF
+
+    s_getreg_b32    s_restore_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE)    //vgpr_size
+    s_add_u32       s_restore_alloc_size, s_restore_alloc_size, 1
+    s_lshl_b32      s_restore_alloc_size, s_restore_alloc_size, 2                           //Number of VGPRs = (vgpr_size + 1) * 4    (non-zero value)
+    s_lshl_b32      s_restore_buf_rsrc2,  s_restore_alloc_size, 8                           //NUM_RECORDS in bytes (64 threads*4)
+    if (SWIZZLE_EN)
+        s_add_u32       s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0                       //FIXME need to use swizzle to enable bounds checking?
+    else
+        s_mov_b32       s_restore_buf_rsrc2,  0x1000000                                     //NUM_RECORDS in bytes
+    end
+
+if G8SR_VGPR_SR_IN_DWX4
+     get_vgpr_size_bytes(s_restore_mem_offset)
+     s_sub_u32         s_restore_mem_offset, s_restore_mem_offset, 256*4
+
+     // the const stride for DWx4 is 4*4 bytes
+     s_and_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, 0x0000FFFF   // reset const stride to 0
+     s_or_b32  s_restore_buf_rsrc1, s_restore_buf_rsrc1, G8SR_RESTORE_BUF_RSRC_WORD1_STRIDE_DWx4  // const stride to 4*4 bytes
+
+     s_mov_b32         m0, s_restore_alloc_size
+     s_set_gpr_idx_on  m0, 0x8    // Note.. This will change m0
+
+L_RESTORE_VGPR_LOOP:
+     buffer_load_dwordx4 v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1
+     s_waitcnt vmcnt(0)
+     s_sub_u32         m0, m0, 4
+     v_mov_b32         v0, v0   // v[0+m0] = v0
+     v_mov_b32         v1, v1
+     v_mov_b32         v2, v2
+     v_mov_b32         v3, v3
+     s_sub_u32         s_restore_mem_offset, s_restore_mem_offset, 256*4
+     s_cmp_eq_u32      m0, 0x8000
+     s_cbranch_scc0    L_RESTORE_VGPR_LOOP
+     s_set_gpr_idx_off
+
+     s_and_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, 0x0000FFFF   // reset const stride to 0
+     s_or_b32  s_restore_buf_rsrc1, s_restore_buf_rsrc1, S_RESTORE_BUF_RSRC_WORD1_STRIDE  // reset const stride to 4 bytes
+
+else
+    // VGPR load using dw burst
+    s_mov_b32       s_restore_mem_offset_save, s_restore_mem_offset     // restore starts with v[4:7]; v[0:3] will be restored last
+    s_add_u32       s_restore_mem_offset, s_restore_mem_offset, 256*4
+    s_mov_b32       m0, 4                               //VGPR initial index value = 4
+    s_set_gpr_idx_on  m0, 0x8                       //M0[7:0] = M0[7:0] and M0[15:12] = 0x8
+    s_add_u32       s_restore_alloc_size, s_restore_alloc_size, 0x8000                      //add 0x8000 since we compare m0 against it later
+
+  L_RESTORE_VGPR_LOOP:
+    if(USE_MTBUF_INSTEAD_OF_MUBUF)
+        tbuffer_load_format_x v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1
+    else
+        buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1
+        buffer_load_dword v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:256
+        buffer_load_dword v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:256*2
+        buffer_load_dword v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:256*3
+    end
+    s_waitcnt       vmcnt(0)                                                                //ensure data ready
+    v_mov_b32       v0, v0                                                                  //v[0+m0] = v0
+    v_mov_b32       v1, v1
+    v_mov_b32       v2, v2
+    v_mov_b32       v3, v3
+    s_add_u32       m0, m0, 4                                                               //next vgpr index
+    s_add_u32       s_restore_mem_offset, s_restore_mem_offset, 256*4                           //every buffer_load_dword does 256 bytes
+    s_cmp_lt_u32    m0, s_restore_alloc_size                                                //scc = (m0 < s_restore_alloc_size) ? 1 : 0
+    s_cbranch_scc1  L_RESTORE_VGPR_LOOP                                                     //VGPR restore (except v[0:3]) is complete?
+    s_set_gpr_idx_off
+                                                                                            /* VGPR restore on v[0:3] */
+    if(USE_MTBUF_INSTEAD_OF_MUBUF)
+        tbuffer_load_format_x v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1
+    else
+        buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save    slc:1 glc:1
+        buffer_load_dword v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save    slc:1 glc:1 offset:256
+        buffer_load_dword v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save    slc:1 glc:1 offset:256*2
+        buffer_load_dword v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save    slc:1 glc:1 offset:256*3
+    end
+
+end
+
+    /*          restore SGPRs       */
+    //////////////////////////////
+
+    // SGPR SR memory offset : size(VGPR)
+    get_vgpr_size_bytes(s_restore_mem_offset)
+    get_sgpr_size_bytes(s_restore_tmp)
+    s_add_u32 s_restore_mem_offset, s_restore_mem_offset, s_restore_tmp
+    s_sub_u32 s_restore_mem_offset, s_restore_mem_offset, 16*4     // restore SGPRs from S[n] down to S[0], in groups of 16
+    // TODO, change RSRC word to rearrange memory layout for SGPRS
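+
+    // The SGPR area is walked in reverse: the 16*4-byte rewind above points
+    // s_restore_mem_offset at the last 16-SGPR group, and
+    // read_16sgpr_from_mem() decrements the offset by 64 bytes after each
+    // dwordx16 load, moving from s[n-16:n-1] down to s[0:15].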
+
+    s_getreg_b32    s_restore_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE)                //sgpr_size
+    s_add_u32       s_restore_alloc_size, s_restore_alloc_size, 1
+    s_lshl_b32      s_restore_alloc_size, s_restore_alloc_size, 4                           //Number of SGPRs = (sgpr_size + 1) * 16   (non-zero value)
+
+    if (SGPR_SAVE_USE_SQC)
+        s_lshl_b32      s_restore_buf_rsrc2,    s_restore_alloc_size, 2                     //NUM_RECORDS in bytes
+    else
+        s_lshl_b32      s_restore_buf_rsrc2,    s_restore_alloc_size, 8                     //NUM_RECORDS in bytes (64 threads)
+    end
+    if (SWIZZLE_EN)
+        s_add_u32       s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0                       //FIXME need to use swizzle to enable bounds checking?
+    else
+        s_mov_b32       s_restore_buf_rsrc2,  0x1000000                                     //NUM_RECORDS in bytes
+    end
+
+    /* If 112 SGPRs are allocated, 4 SGPRs are not used: TBA(108,109) and TMA(110,111).
+       However, it is safe to restore these 4 SGPRs anyway, since TBA/TMA will later be restored by HWREG.
+    */
+    s_mov_b32 m0, s_restore_alloc_size
+
+ L_RESTORE_SGPR_LOOP:
+    read_16sgpr_from_mem(s0, s_restore_buf_rsrc0, s_restore_mem_offset)  //PV: further performance improvement can be made
+    s_waitcnt       lgkmcnt(0)                                                              //ensure data ready
+
+    s_sub_u32 m0, m0, 16    // Restore from S[n] to S[0]
+
+    s_movreld_b64   s0, s0      //s[0+m0] = s0
+    s_movreld_b64   s2, s2
+    s_movreld_b64   s4, s4
+    s_movreld_b64   s6, s6
+    s_movreld_b64   s8, s8
+    s_movreld_b64   s10, s10
+    s_movreld_b64   s12, s12
+    s_movreld_b64   s14, s14
+
+    s_cmp_eq_u32    m0, 0               //scc = (m0 == 0) ? 1 : 0
+    s_cbranch_scc0  L_RESTORE_SGPR_LOOP             //SGPR restore is complete?
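+
+    // s_movreld_b64 writes relative to m0: "s_movreld_b64 s0, s0" performs
+    // s[m0:m0+1] = s[0:1], so each pass scatters the freshly loaded s[0:15]
+    // group to its final home at s[m0:m0+15]; the final pass (m0 == 0)
+    // restores s[0:15] in place.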
+
+    /*      restore HW registers    */
+    //////////////////////////////
+  L_RESTORE_HWREG:
+
+
+if G8SR_DEBUG_TIMESTAMP
+      s_mov_b32 s_g8sr_ts_restore_s[0], s_restore_pc_lo
+      s_mov_b32 s_g8sr_ts_restore_s[1], s_restore_pc_hi
+end
+
+    // HWREG SR memory offset : size(VGPR)+size(SGPR)
+    get_vgpr_size_bytes(s_restore_mem_offset)
+    get_sgpr_size_bytes(s_restore_tmp)
+    s_add_u32 s_restore_mem_offset, s_restore_mem_offset, s_restore_tmp
+
+
+    s_mov_b32       s_restore_buf_rsrc2, 0x4                                                //NUM_RECORDS   in bytes
+    if (SWIZZLE_EN)
+        s_add_u32       s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0                       //FIXME need to use swizzle to enable bounds checking?
+    else
+        s_mov_b32       s_restore_buf_rsrc2,  0x1000000                                     //NUM_RECORDS in bytes
+    end
+
+    read_hwreg_from_mem(s_restore_m0, s_restore_buf_rsrc0, s_restore_mem_offset)                    //M0
+    read_hwreg_from_mem(s_restore_pc_lo, s_restore_buf_rsrc0, s_restore_mem_offset)             //PC
+    read_hwreg_from_mem(s_restore_pc_hi, s_restore_buf_rsrc0, s_restore_mem_offset)
+    read_hwreg_from_mem(s_restore_exec_lo, s_restore_buf_rsrc0, s_restore_mem_offset)               //EXEC
+    read_hwreg_from_mem(s_restore_exec_hi, s_restore_buf_rsrc0, s_restore_mem_offset)
+    read_hwreg_from_mem(s_restore_status, s_restore_buf_rsrc0, s_restore_mem_offset)                //STATUS
+    read_hwreg_from_mem(s_restore_trapsts, s_restore_buf_rsrc0, s_restore_mem_offset)               //TRAPSTS
+    read_hwreg_from_mem(xnack_mask_lo, s_restore_buf_rsrc0, s_restore_mem_offset)                   //XNACK_MASK_LO
+    read_hwreg_from_mem(xnack_mask_hi, s_restore_buf_rsrc0, s_restore_mem_offset)                   //XNACK_MASK_HI
+    read_hwreg_from_mem(s_restore_mode, s_restore_buf_rsrc0, s_restore_mem_offset)              //MODE
+    read_hwreg_from_mem(tba_lo, s_restore_buf_rsrc0, s_restore_mem_offset)                      //TBA_LO
+    read_hwreg_from_mem(tba_hi, s_restore_buf_rsrc0, s_restore_mem_offset)                      //TBA_HI
+
+    s_waitcnt       lgkmcnt(0)                                                                                      //from now on, it is safe to restore STATUS and IB_STS
+
+    s_and_b32 s_restore_pc_hi, s_restore_pc_hi, 0x0000ffff      //pc[47:32]        //Do it here in order not to affect STATUS
+
+    //for normal save & restore, the saved PC points to the next inst to execute, no adjustment needs to be made, otherwise:
+    if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_RESTORE_NORMAL))
+        s_add_u32 s_restore_pc_lo, s_restore_pc_lo, 8            //pc[31:0]+8     //two back-to-back s_trap are used (first for save and second for restore)
+        s_addc_u32  s_restore_pc_hi, s_restore_pc_hi, 0x0        //carry bit over
+    end
+    if ((EMU_RUN_HACK) && (EMU_RUN_HACK_RESTORE_NORMAL))
+        s_add_u32 s_restore_pc_lo, s_restore_pc_lo, 4            //pc[31:0]+4     // save is hack through s_trap but restore is normal
+        s_addc_u32  s_restore_pc_hi, s_restore_pc_hi, 0x0        //carry bit over
+    end
+
+    s_mov_b32       m0,         s_restore_m0
+    s_mov_b32       exec_lo,    s_restore_exec_lo
+    s_mov_b32       exec_hi,    s_restore_exec_hi
+
+    s_and_b32       s_restore_m0, SQ_WAVE_TRAPSTS_PRE_SAVECTX_MASK, s_restore_trapsts
+    s_setreg_b32    hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_PRE_SAVECTX_SHIFT, SQ_WAVE_TRAPSTS_PRE_SAVECTX_SIZE), s_restore_m0
+    s_and_b32       s_restore_m0, SQ_WAVE_TRAPSTS_POST_SAVECTX_MASK, s_restore_trapsts
+    s_lshr_b32      s_restore_m0, s_restore_m0, SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT
+    s_setreg_b32    hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT, SQ_WAVE_TRAPSTS_POST_SAVECTX_SIZE), s_restore_m0
+    //s_setreg_b32  hwreg(HW_REG_TRAPSTS),  s_restore_trapsts      //don't overwrite SAVECTX bit as it may be set through external SAVECTX during restore
+    s_setreg_b32    hwreg(HW_REG_MODE),     s_restore_mode
+    //reuse s_restore_m0 as a temp register
+    s_and_b32       s_restore_m0, s_restore_pc_hi, S_SAVE_PC_HI_RCNT_MASK
+    s_lshr_b32      s_restore_m0, s_restore_m0, S_SAVE_PC_HI_RCNT_SHIFT
+    s_lshl_b32      s_restore_m0, s_restore_m0, SQ_WAVE_IB_STS_RCNT_SHIFT
+    s_mov_b32       s_restore_tmp, 0x0                                                                              //IB_STS is zero
+    s_or_b32        s_restore_tmp, s_restore_tmp, s_restore_m0
+    s_and_b32       s_restore_m0, s_restore_pc_hi, S_SAVE_PC_HI_FIRST_REPLAY_MASK
+    s_lshr_b32      s_restore_m0, s_restore_m0, S_SAVE_PC_HI_FIRST_REPLAY_SHIFT
+    s_lshl_b32      s_restore_m0, s_restore_m0, SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT
+    s_or_b32        s_restore_tmp, s_restore_tmp, s_restore_m0
+    s_and_b32       s_restore_m0, s_restore_status, SQ_WAVE_STATUS_INST_ATC_MASK
+    s_lshr_b32      s_restore_m0, s_restore_m0, SQ_WAVE_STATUS_INST_ATC_SHIFT
+    s_setreg_b32    hwreg(HW_REG_IB_STS),   s_restore_tmp
+
+    s_and_b64    exec, exec, exec  // Restore STATUS.EXECZ, not writable by s_setreg_b32
+    s_and_b64    vcc, vcc, vcc  // Restore STATUS.VCCZ, not writable by s_setreg_b32
+    s_setreg_b32    hwreg(HW_REG_STATUS),   s_restore_status     // SCC is included, which is changed by previous salu
+
+    s_barrier                                                   //barrier to ensure the readiness of LDS before access attempts from any other wave in the same TG //FIXME not performance-optimal at this time
+
+if G8SR_DEBUG_TIMESTAMP
+    s_memrealtime s_g8sr_ts_restore_d
+    s_waitcnt lgkmcnt(0)
+end
+
+//  s_rfe_b64 s_restore_pc_lo                                   //Return to the main shader program and resume execution
+    s_rfe_restore_b64  s_restore_pc_lo, s_restore_m0            // s_restore_m0[0] is used to set STATUS.inst_atc
+
+
+/**************************************************************************/
+/*                      the END                                           */
+/**************************************************************************/
+L_END_PGM:
+    s_endpgm
+
+end
+
+
+/**************************************************************************/
+/*                      the helper functions                              */
+/**************************************************************************/
+
+// Only used for saving hwregs to memory
+function write_hwreg_to_mem(s, s_rsrc, s_mem_offset)
+        s_mov_b32 exec_lo, m0                   //assuming exec_lo is not needed anymore from this point on
+        s_mov_b32 m0, s_mem_offset
+        s_buffer_store_dword s, s_rsrc, m0      glc:1
+        s_add_u32       s_mem_offset, s_mem_offset, 4
+        s_mov_b32   m0, exec_lo
+end
+
+
+// HWREGs are saved before SGPRs, so all HWREGs can be used.
+function write_16sgpr_to_mem(s, s_rsrc, s_mem_offset)
+
+        s_buffer_store_dwordx4 s[0], s_rsrc, 0  glc:1
+        s_buffer_store_dwordx4 s[4], s_rsrc, 16  glc:1
+        s_buffer_store_dwordx4 s[8], s_rsrc, 32  glc:1
+        s_buffer_store_dwordx4 s[12], s_rsrc, 48 glc:1
+        s_add_u32       s_rsrc[0], s_rsrc[0], 4*16
+        s_addc_u32      s_rsrc[1], s_rsrc[1], 0x0             // +scc
+end
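+
+// Note: unlike write_hwreg_to_mem(), this helper advances the rsrc base
+// address itself (with carry into the high word) instead of an offset
+// register, since the four dwordx4 stores use fixed immediate offsets
+// 0/16/32/48.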
+
+
+function read_hwreg_from_mem(s, s_rsrc, s_mem_offset)
+    s_buffer_load_dword s, s_rsrc, s_mem_offset     glc:1
+    s_add_u32       s_mem_offset, s_mem_offset, 4
+end
+
+function read_16sgpr_from_mem(s, s_rsrc, s_mem_offset)
+    s_buffer_load_dwordx16 s, s_rsrc, s_mem_offset      glc:1
+    s_sub_u32       s_mem_offset, s_mem_offset, 4*16
+end
+
+
+
+function get_lds_size_bytes(s_lds_size_byte)
+    // SQ LDS granularity is 64DW, while PGM_RSRC2.lds_size is in granularity 128DW
+    s_getreg_b32   s_lds_size_byte, hwreg(HW_REG_LDS_ALLOC, SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT, SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE)          // lds_size
+    s_lshl_b32     s_lds_size_byte, s_lds_size_byte, 8                      //LDS size in bytes = lds_size * 64 DW * 4 bytes    // granularity 64DW
+end
+
+function get_vgpr_size_bytes(s_vgpr_size_byte)
+    s_getreg_b32   s_vgpr_size_byte, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE)  //vgpr_size
+    s_add_u32      s_vgpr_size_byte, s_vgpr_size_byte, 1
+    s_lshl_b32     s_vgpr_size_byte, s_vgpr_size_byte, (2+8) //VGPR size in bytes = (vgpr_size + 1) * 4 regs * 64 threads * 4 bytes (non-zero value)   //FIXME for GFX, zero is possible
+end
+
+function get_sgpr_size_bytes(s_sgpr_size_byte)
+    s_getreg_b32   s_sgpr_size_byte, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE)  //sgpr_size
+    s_add_u32      s_sgpr_size_byte, s_sgpr_size_byte, 1
+    s_lshl_b32     s_sgpr_size_byte, s_sgpr_size_byte, 6 //SGPR size in bytes = (sgpr_size + 1) * 16 regs * 4 bytes (non-zero value)
+end
+
+function get_hwreg_size_bytes
+    return 128 //HWREG size 128 bytes
+end
+
+
+#endif
+
+static const uint32_t cwsr_trap_gfx8_hex[] = {
+	0xbf820001, 0xbf820123,
+	0xb8f4f802, 0x89748674,
+	0xb8f5f803, 0x8675ff75,
+	0x00000400, 0xbf850011,
+	0xc00a1e37, 0x00000000,
+	0xbf8c007f, 0x87777978,
+	0xbf840002, 0xb974f802,
+	0xbe801d78, 0xb8f5f803,
+	0x8675ff75, 0x000001ff,
+	0xbf850002, 0x80708470,
+	0x82718071, 0x8671ff71,
+	0x0000ffff, 0xb974f802,
+	0xbe801f70, 0xb8f5f803,
+	0x8675ff75, 0x00000100,
+	0xbf840006, 0xbefa0080,
+	0xb97a0203, 0x8671ff71,
+	0x0000ffff, 0x80f08870,
+	0x82f18071, 0xbefa0080,
+	0xb97a0283, 0xbef60068,
+	0xbef70069, 0xb8fa1c07,
+	0x8e7a9c7a, 0x87717a71,
+	0xb8fa03c7, 0x8e7a9b7a,
+	0x87717a71, 0xb8faf807,
+	0x867aff7a, 0x00007fff,
+	0xb97af807, 0xbef2007e,
+	0xbef3007f, 0xbefe0180,
+	0xbf900004, 0xbf8e0002,
+	0xbf88fffe, 0xbef8007e,
+	0x8679ff7f, 0x0000ffff,
+	0x8779ff79, 0x00040000,
+	0xbefa0080, 0xbefb00ff,
+	0x00807fac, 0x867aff7f,
+	0x08000000, 0x8f7a837a,
+	0x877b7a7b, 0x867aff7f,
+	0x70000000, 0x8f7a817a,
+	0x877b7a7b, 0xbeef007c,
+	0xbeee0080, 0xb8ee2a05,
+	0x806e816e, 0x8e6e8a6e,
+	0xb8fa1605, 0x807a817a,
+	0x8e7a867a, 0x806e7a6e,
+	0xbefa0084, 0xbefa00ff,
+	0x01000000, 0xbefe007c,
+	0xbefc006e, 0xc0611bfc,
+	0x0000007c, 0x806e846e,
+	0xbefc007e, 0xbefe007c,
+	0xbefc006e, 0xc0611c3c,
+	0x0000007c, 0x806e846e,
+	0xbefc007e, 0xbefe007c,
+	0xbefc006e, 0xc0611c7c,
+	0x0000007c, 0x806e846e,
+	0xbefc007e, 0xbefe007c,
+	0xbefc006e, 0xc0611cbc,
+	0x0000007c, 0x806e846e,
+	0xbefc007e, 0xbefe007c,
+	0xbefc006e, 0xc0611cfc,
+	0x0000007c, 0x806e846e,
+	0xbefc007e, 0xbefe007c,
+	0xbefc006e, 0xc0611d3c,
+	0x0000007c, 0x806e846e,
+	0xbefc007e, 0xb8f5f803,
+	0xbefe007c, 0xbefc006e,
+	0xc0611d7c, 0x0000007c,
+	0x806e846e, 0xbefc007e,
+	0xbefe007c, 0xbefc006e,
+	0xc0611dbc, 0x0000007c,
+	0x806e846e, 0xbefc007e,
+	0xbefe007c, 0xbefc006e,
+	0xc0611dfc, 0x0000007c,
+	0x806e846e, 0xbefc007e,
+	0xb8eff801, 0xbefe007c,
+	0xbefc006e, 0xc0611bfc,
+	0x0000007c, 0x806e846e,
+	0xbefc007e, 0xbefe007c,
+	0xbefc006e, 0xc0611b3c,
+	0x0000007c, 0x806e846e,
+	0xbefc007e, 0xbefe007c,
+	0xbefc006e, 0xc0611b7c,
+	0x0000007c, 0x806e846e,
+	0xbefc007e, 0x867aff7f,
+	0x04000000, 0xbef30080,
+	0x8773737a, 0xb8ee2a05,
+	0x806e816e, 0x8e6e8a6e,
+	0xb8f51605, 0x80758175,
+	0x8e758475, 0x8e7a8275,
+	0xbefa00ff, 0x01000000,
+	0xbef60178, 0x80786e78,
+	0x82798079, 0xbefc0080,
+	0xbe802b00, 0xbe822b02,
+	0xbe842b04, 0xbe862b06,
+	0xbe882b08, 0xbe8a2b0a,
+	0xbe8c2b0c, 0xbe8e2b0e,
+	0xc06b003c, 0x00000000,
+	0xc06b013c, 0x00000010,
+	0xc06b023c, 0x00000020,
+	0xc06b033c, 0x00000030,
+	0x8078c078, 0x82798079,
+	0x807c907c, 0xbf0a757c,
+	0xbf85ffeb, 0xbef80176,
+	0xbeee0080, 0xbefe00c1,
+	0xbeff00c1, 0xbefa00ff,
+	0x01000000, 0xe0724000,
+	0x6e1e0000, 0xe0724100,
+	0x6e1e0100, 0xe0724200,
+	0x6e1e0200, 0xe0724300,
+	0x6e1e0300, 0xbefe00c1,
+	0xbeff00c1, 0xb8f54306,
+	0x8675c175, 0xbf84002c,
+	0xbf8a0000, 0x867aff73,
+	0x04000000, 0xbf840028,
+	0x8e758675, 0x8e758275,
+	0xbefa0075, 0xb8ee2a05,
+	0x806e816e, 0x8e6e8a6e,
+	0xb8fa1605, 0x807a817a,
+	0x8e7a867a, 0x806e7a6e,
+	0x806eff6e, 0x00000080,
+	0xbefa00ff, 0x01000000,
+	0xbefc0080, 0xd28c0002,
+	0x000100c1, 0xd28d0003,
+	0x000204c1, 0xd1060002,
+	0x00011103, 0x7e0602ff,
+	0x00000200, 0xbefc00ff,
+	0x00010000, 0xbe80007b,
+	0x867bff7b, 0xff7fffff,
+	0x877bff7b, 0x00058000,
+	0xd8ec0000, 0x00000002,
+	0xbf8c007f, 0xe0765000,
+	0x6e1e0002, 0x32040702,
+	0xd0c9006a, 0x0000eb02,
+	0xbf87fff7, 0xbefb0000,
+	0xbeee00ff, 0x00000400,
+	0xbefe00c1, 0xbeff00c1,
+	0xb8f52a05, 0x80758175,
+	0x8e758275, 0x8e7a8875,
+	0xbefa00ff, 0x01000000,
+	0xbefc0084, 0xbf0a757c,
+	0xbf840015, 0xbf11017c,
+	0x8075ff75, 0x00001000,
+	0x7e000300, 0x7e020301,
+	0x7e040302, 0x7e060303,
+	0xe0724000, 0x6e1e0000,
+	0xe0724100, 0x6e1e0100,
+	0xe0724200, 0x6e1e0200,
+	0xe0724300, 0x6e1e0300,
+	0x807c847c, 0x806eff6e,
+	0x00000400, 0xbf0a757c,
+	0xbf85ffef, 0xbf9c0000,
+	0xbf8200ca, 0xbef8007e,
+	0x8679ff7f, 0x0000ffff,
+	0x8779ff79, 0x00040000,
+	0xbefa0080, 0xbefb00ff,
+	0x00807fac, 0x8676ff7f,
+	0x08000000, 0x8f768376,
+	0x877b767b, 0x8676ff7f,
+	0x70000000, 0x8f768176,
+	0x877b767b, 0x8676ff7f,
+	0x04000000, 0xbf84001e,
+	0xbefe00c1, 0xbeff00c1,
+	0xb8f34306, 0x8673c173,
+	0xbf840019, 0x8e738673,
+	0x8e738273, 0xbefa0073,
+	0xb8f22a05, 0x80728172,
+	0x8e728a72, 0xb8f61605,
+	0x80768176, 0x8e768676,
+	0x80727672, 0x8072ff72,
+	0x00000080, 0xbefa00ff,
+	0x01000000, 0xbefc0080,
+	0xe0510000, 0x721e0000,
+	0xe0510100, 0x721e0000,
+	0x807cff7c, 0x00000200,
+	0x8072ff72, 0x00000200,
+	0xbf0a737c, 0xbf85fff6,
+	0xbef20080, 0xbefe00c1,
+	0xbeff00c1, 0xb8f32a05,
+	0x80738173, 0x8e738273,
+	0x8e7a8873, 0xbefa00ff,
+	0x01000000, 0xbef60072,
+	0x8072ff72, 0x00000400,
+	0xbefc0084, 0xbf11087c,
+	0x8073ff73, 0x00008000,
+	0xe0524000, 0x721e0000,
+	0xe0524100, 0x721e0100,
+	0xe0524200, 0x721e0200,
+	0xe0524300, 0x721e0300,
+	0xbf8c0f70, 0x7e000300,
+	0x7e020301, 0x7e040302,
+	0x7e060303, 0x807c847c,
+	0x8072ff72, 0x00000400,
+	0xbf0a737c, 0xbf85ffee,
+	0xbf9c0000, 0xe0524000,
+	0x761e0000, 0xe0524100,
+	0x761e0100, 0xe0524200,
+	0x761e0200, 0xe0524300,
+	0x761e0300, 0xb8f22a05,
+	0x80728172, 0x8e728a72,
+	0xb8f61605, 0x80768176,
+	0x8e768676, 0x80727672,
+	0x80f2c072, 0xb8f31605,
+	0x80738173, 0x8e738473,
+	0x8e7a8273, 0xbefa00ff,
+	0x01000000, 0xbefc0073,
+	0xc031003c, 0x00000072,
+	0x80f2c072, 0xbf8c007f,
+	0x80fc907c, 0xbe802d00,
+	0xbe822d02, 0xbe842d04,
+	0xbe862d06, 0xbe882d08,
+	0xbe8a2d0a, 0xbe8c2d0c,
+	0xbe8e2d0e, 0xbf06807c,
+	0xbf84fff1, 0xb8f22a05,
+	0x80728172, 0x8e728a72,
+	0xb8f61605, 0x80768176,
+	0x8e768676, 0x80727672,
+	0xbefa0084, 0xbefa00ff,
+	0x01000000, 0xc0211cfc,
+	0x00000072, 0x80728472,
+	0xc0211c3c, 0x00000072,
+	0x80728472, 0xc0211c7c,
+	0x00000072, 0x80728472,
+	0xc0211bbc, 0x00000072,
+	0x80728472, 0xc0211bfc,
+	0x00000072, 0x80728472,
+	0xc0211d3c, 0x00000072,
+	0x80728472, 0xc0211d7c,
+	0x00000072, 0x80728472,
+	0xc0211a3c, 0x00000072,
+	0x80728472, 0xc0211a7c,
+	0x00000072, 0x80728472,
+	0xc0211dfc, 0x00000072,
+	0x80728472, 0xc0211b3c,
+	0x00000072, 0x80728472,
+	0xc0211b7c, 0x00000072,
+	0x80728472, 0xbf8c007f,
+	0x8671ff71, 0x0000ffff,
+	0xbefc0073, 0xbefe006e,
+	0xbeff006f, 0x867375ff,
+	0x000003ff, 0xb9734803,
+	0x867375ff, 0xfffff800,
+	0x8f738b73, 0xb973a2c3,
+	0xb977f801, 0x8673ff71,
+	0xf0000000, 0x8f739c73,
+	0x8e739073, 0xbef60080,
+	0x87767376, 0x8673ff71,
+	0x08000000, 0x8f739b73,
+	0x8e738f73, 0x87767376,
+	0x8673ff74, 0x00800000,
+	0x8f739773, 0xb976f807,
+	0x86fe7e7e, 0x86ea6a6a,
+	0xb974f802, 0xbf8a0000,
+	0x95807370, 0xbf810000,
+};
+

+ 43 - 3
drivers/gpu/drm/amd/amdkfd/kfd_chardev.c

@@ -117,7 +117,7 @@ static int kfd_open(struct inode *inode, struct file *filep)
 		return -EPERM;
 	}
 
-	process = kfd_create_process(current);
+	process = kfd_create_process(filep);
 	if (IS_ERR(process))
 		return PTR_ERR(process);
 
@@ -206,6 +206,7 @@ static int set_queue_properties_from_user(struct queue_properties *q_properties,
 	q_properties->ctx_save_restore_area_address =
 			args->ctx_save_restore_address;
 	q_properties->ctx_save_restore_area_size = args->ctx_save_restore_size;
+	q_properties->ctl_stack_size = args->ctl_stack_size;
 	if (args->queue_type == KFD_IOC_QUEUE_TYPE_COMPUTE ||
 		args->queue_type == KFD_IOC_QUEUE_TYPE_COMPUTE_AQL)
 		q_properties->type = KFD_QUEUE_TYPE_COMPUTE;
@@ -431,6 +432,38 @@ out:
 	return err;
 }
 
+static int kfd_ioctl_set_trap_handler(struct file *filep,
+					struct kfd_process *p, void *data)
+{
+	struct kfd_ioctl_set_trap_handler_args *args = data;
+	struct kfd_dev *dev;
+	int err = 0;
+	struct kfd_process_device *pdd;
+
+	dev = kfd_device_by_id(args->gpu_id);
+	if (dev == NULL)
+		return -EINVAL;
+
+	mutex_lock(&p->mutex);
+
+	pdd = kfd_bind_process_to_device(dev, p);
+	if (IS_ERR(pdd)) {
+		err = -ESRCH;
+		goto out;
+	}
+
+	if (dev->dqm->ops.set_trap_handler(dev->dqm,
+					&pdd->qpd,
+					args->tba_addr,
+					args->tma_addr))
+		err = -EINVAL;
+
+out:
+	mutex_unlock(&p->mutex);
+
+	return err;
+}
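+
+/* Note: TBA (trap base address) and TMA (trap memory address) locate the
+ * user-mode second-level trap handler; set_trap_handler() records them in
+ * the per-process-device queue data (&pdd->qpd) passed above.
+ */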
+
 static int kfd_ioctl_dbg_register(struct file *filep,
 				struct kfd_process *p, void *data)
 {
@@ -493,7 +526,7 @@ static int kfd_ioctl_dbg_unregister(struct file *filep,
 	long status;
 
 	dev = kfd_device_by_id(args->gpu_id);
-	if (!dev)
+	if (!dev || !dev->dbgmgr)
 		return -EINVAL;
 
 	if (dev->device_info->asic_family == CHIP_CARRIZO) {
@@ -979,7 +1012,10 @@ static const struct amdkfd_ioctl_desc amdkfd_ioctls[] = {
 			kfd_ioctl_set_scratch_backing_va, 0),
 
 	AMDKFD_IOCTL_DEF(AMDKFD_IOC_GET_TILE_CONFIG,
-			kfd_ioctl_get_tile_config, 0)
+			kfd_ioctl_get_tile_config, 0),
+
+	AMDKFD_IOCTL_DEF(AMDKFD_IOC_SET_TRAP_HANDLER,
+			kfd_ioctl_set_trap_handler, 0),
 };
 
 #define AMDKFD_CORE_IOCTL_COUNT	ARRAY_SIZE(amdkfd_ioctls)
@@ -1088,6 +1124,10 @@ static int kfd_mmap(struct file *filp, struct vm_area_struct *vma)
 			KFD_MMAP_EVENTS_MASK) {
 		vma->vm_pgoff = vma->vm_pgoff ^ KFD_MMAP_EVENTS_MASK;
 		return kfd_event_mmap(process, vma);
+	} else if ((vma->vm_pgoff & KFD_MMAP_RESERVED_MEM_MASK) ==
+			KFD_MMAP_RESERVED_MEM_MASK) {
+		vma->vm_pgoff = vma->vm_pgoff ^ KFD_MMAP_RESERVED_MEM_MASK;
+		return kfd_reserved_mem_mmap(process, vma);
 	}
 
 	return -EFAULT;

+ 1267 - 0
drivers/gpu/drm/amd/amdkfd/kfd_crat.c

@@ -0,0 +1,1267 @@
+/*
+ * Copyright 2015-2017 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include <linux/pci.h>
+#include <linux/acpi.h>
+#include <linux/amd-iommu.h>
+#include "kfd_crat.h"
+#include "kfd_priv.h"
+#include "kfd_topology.h"
+
+/* GPU Processor ID base for dGPUs for which VCRAT needs to be created.
+ * GPU processor IDs are expressed with Bit[31] = 1.
+ * The base is set to 0x8000_0000 + 0x1000 to avoid collision with GPU IDs
+ * used in the CRAT.
+ */
+static uint32_t gpu_processor_id_low = 0x80001000;
+
+/* Return the next available gpu_processor_id and increment it for the next GPU
+ *	@total_cu_count - Total CUs present in the GPU including ones
+ *			  masked off
+ */
+static inline unsigned int get_and_inc_gpu_processor_id(
+				unsigned int total_cu_count)
+{
+	int current_id = gpu_processor_id_low;
+
+	gpu_processor_id_low += total_cu_count;
+	return current_id;
+}
+
+/* Static table to describe GPU Cache information */
+struct kfd_gpu_cache_info {
+	uint32_t	cache_size;
+	uint32_t	cache_level;
+	uint32_t	flags;
+	/* Indicates how many Compute Units share this cache
+	 * Value = 1 indicates the cache is not shared
+	 */
+	uint32_t	num_cu_shared;
+};
+
+static struct kfd_gpu_cache_info kaveri_cache_info[] = {
+	{
+		/* TCP L1 Cache per CU */
+		.cache_size = 16,
+		.cache_level = 1,
+		.flags = (CRAT_CACHE_FLAGS_ENABLED |
+				CRAT_CACHE_FLAGS_DATA_CACHE |
+				CRAT_CACHE_FLAGS_SIMD_CACHE),
+		.num_cu_shared = 1,
+
+	},
+	{
+		/* Scalar L1 Instruction Cache (in SQC module) per bank */
+		.cache_size = 16,
+		.cache_level = 1,
+		.flags = (CRAT_CACHE_FLAGS_ENABLED |
+				CRAT_CACHE_FLAGS_INST_CACHE |
+				CRAT_CACHE_FLAGS_SIMD_CACHE),
+		.num_cu_shared = 2,
+	},
+	{
+		/* Scalar L1 Data Cache (in SQC module) per bank */
+		.cache_size = 8,
+		.cache_level = 1,
+		.flags = (CRAT_CACHE_FLAGS_ENABLED |
+				CRAT_CACHE_FLAGS_DATA_CACHE |
+				CRAT_CACHE_FLAGS_SIMD_CACHE),
+		.num_cu_shared = 2,
+	},
+
+	/* TODO: Add L2 Cache information */
+};
+
+
+static struct kfd_gpu_cache_info carrizo_cache_info[] = {
+	{
+		/* TCP L1 Cache per CU */
+		.cache_size = 16,
+		.cache_level = 1,
+		.flags = (CRAT_CACHE_FLAGS_ENABLED |
+				CRAT_CACHE_FLAGS_DATA_CACHE |
+				CRAT_CACHE_FLAGS_SIMD_CACHE),
+		.num_cu_shared = 1,
+	},
+	{
+		/* Scalar L1 Instruction Cache (in SQC module) per bank */
+		.cache_size = 8,
+		.cache_level = 1,
+		.flags = (CRAT_CACHE_FLAGS_ENABLED |
+				CRAT_CACHE_FLAGS_INST_CACHE |
+				CRAT_CACHE_FLAGS_SIMD_CACHE),
+		.num_cu_shared = 4,
+	},
+	{
+		/* Scalar L1 Data Cache (in SQC module) per bank. */
+		.cache_size = 4,
+		.cache_level = 1,
+		.flags = (CRAT_CACHE_FLAGS_ENABLED |
+				CRAT_CACHE_FLAGS_DATA_CACHE |
+				CRAT_CACHE_FLAGS_SIMD_CACHE),
+		.num_cu_shared = 4,
+	},
+
+	/* TODO: Add L2 Cache information */
+};
+
+/* NOTE: In future if more information is added to struct kfd_gpu_cache_info
+ * the following ASICs may need a separate table.
+ */
+#define hawaii_cache_info kaveri_cache_info
+#define tonga_cache_info carrizo_cache_info
+#define fiji_cache_info  carrizo_cache_info
+#define polaris10_cache_info carrizo_cache_info
+#define polaris11_cache_info carrizo_cache_info
+
+static void kfd_populated_cu_info_cpu(struct kfd_topology_device *dev,
+		struct crat_subtype_computeunit *cu)
+{
+	dev->node_props.cpu_cores_count = cu->num_cpu_cores;
+	dev->node_props.cpu_core_id_base = cu->processor_id_low;
+	if (cu->hsa_capability & CRAT_CU_FLAGS_IOMMU_PRESENT)
+		dev->node_props.capability |= HSA_CAP_ATS_PRESENT;
+
+	pr_debug("CU CPU: cores=%d id_base=%d\n", cu->num_cpu_cores,
+			cu->processor_id_low);
+}
+
+static void kfd_populated_cu_info_gpu(struct kfd_topology_device *dev,
+		struct crat_subtype_computeunit *cu)
+{
+	dev->node_props.simd_id_base = cu->processor_id_low;
+	dev->node_props.simd_count = cu->num_simd_cores;
+	dev->node_props.lds_size_in_kb = cu->lds_size_in_kb;
+	dev->node_props.max_waves_per_simd = cu->max_waves_simd;
+	dev->node_props.wave_front_size = cu->wave_front_size;
+	dev->node_props.array_count = cu->array_count;
+	dev->node_props.cu_per_simd_array = cu->num_cu_per_array;
+	dev->node_props.simd_per_cu = cu->num_simd_per_cu;
+	dev->node_props.max_slots_scratch_cu = cu->max_slots_scatch_cu;
+	if (cu->hsa_capability & CRAT_CU_FLAGS_HOT_PLUGGABLE)
+		dev->node_props.capability |= HSA_CAP_HOT_PLUGGABLE;
+	pr_debug("CU GPU: id_base=%d\n", cu->processor_id_low);
+}
+
+/* kfd_parse_subtype_cu - parse compute unit subtypes and attach it to the correct
+ * topology device present in the device_list
+ */
+static int kfd_parse_subtype_cu(struct crat_subtype_computeunit *cu,
+				struct list_head *device_list)
+{
+	struct kfd_topology_device *dev;
+
+	pr_debug("Found CU entry in CRAT table with proximity_domain=%d caps=%x\n",
+			cu->proximity_domain, cu->hsa_capability);
+	list_for_each_entry(dev, device_list, list) {
+		if (cu->proximity_domain == dev->proximity_domain) {
+			if (cu->flags & CRAT_CU_FLAGS_CPU_PRESENT)
+				kfd_populated_cu_info_cpu(dev, cu);
+
+			if (cu->flags & CRAT_CU_FLAGS_GPU_PRESENT)
+				kfd_populated_cu_info_gpu(dev, cu);
+			break;
+		}
+	}
+
+	return 0;
+}
+
+/* kfd_parse_subtype_mem - parse memory subtypes and attach it to the correct
+ * topology device present in the device_list
+ */
+static int kfd_parse_subtype_mem(struct crat_subtype_memory *mem,
+				struct list_head *device_list)
+{
+	struct kfd_mem_properties *props;
+	struct kfd_topology_device *dev;
+
+	pr_debug("Found memory entry in CRAT table with proximity_domain=%d\n",
+			mem->proximity_domain);
+	list_for_each_entry(dev, device_list, list) {
+		if (mem->proximity_domain == dev->proximity_domain) {
+			props = kfd_alloc_struct(props);
+			if (!props)
+				return -ENOMEM;
+
+			/* We're on GPU node */
+			if (dev->node_props.cpu_cores_count == 0) {
+				/* APU */
+				if (mem->visibility_type == 0)
+					props->heap_type =
+						HSA_MEM_HEAP_TYPE_FB_PRIVATE;
+				/* dGPU */
+				else
+					props->heap_type = mem->visibility_type;
+			} else
+				props->heap_type = HSA_MEM_HEAP_TYPE_SYSTEM;
+
+			if (mem->flags & CRAT_MEM_FLAGS_HOT_PLUGGABLE)
+				props->flags |= HSA_MEM_FLAGS_HOT_PLUGGABLE;
+			if (mem->flags & CRAT_MEM_FLAGS_NON_VOLATILE)
+				props->flags |= HSA_MEM_FLAGS_NON_VOLATILE;
+
+			props->size_in_bytes =
+				((uint64_t)mem->length_high << 32) +
+							mem->length_low;
+			props->width = mem->width;
+
+			dev->node_props.mem_banks_count++;
+			list_add_tail(&props->list, &dev->mem_props);
+
+			break;
+		}
+	}
+
+	return 0;
+}
+
+/* kfd_parse_subtype_cache - parse cache subtypes and attach it to the correct
+ * topology device present in the device_list
+ */
+static int kfd_parse_subtype_cache(struct crat_subtype_cache *cache,
+			struct list_head *device_list)
+{
+	struct kfd_cache_properties *props;
+	struct kfd_topology_device *dev;
+	uint32_t id;
+	uint32_t total_num_of_cu;
+
+	id = cache->processor_id_low;
+
+	pr_debug("Found cache entry in CRAT table with processor_id=%d\n", id);
+	list_for_each_entry(dev, device_list, list) {
+		total_num_of_cu = (dev->node_props.array_count *
+					dev->node_props.cu_per_simd_array);
+
+		/* Cache information in CRAT doesn't have proximity_domain
+		 * information as it is associated with a CPU core or GPU
+		 * Compute Unit. So map the cache using the CPU core ID or
+		 * SIMD (GPU) ID.
+		 * TODO: This works because currently we can safely assume that
+		 *  Compute Units are parsed before caches are parsed. In the
+		 *  future, remove this dependency
+		 */
+		if ((id >= dev->node_props.cpu_core_id_base &&
+			id <= dev->node_props.cpu_core_id_base +
+				dev->node_props.cpu_cores_count) ||
+			(id >= dev->node_props.simd_id_base &&
+			id < dev->node_props.simd_id_base +
+				total_num_of_cu)) {
+			props = kfd_alloc_struct(props);
+			if (!props)
+				return -ENOMEM;
+
+			props->processor_id_low = id;
+			props->cache_level = cache->cache_level;
+			props->cache_size = cache->cache_size;
+			props->cacheline_size = cache->cache_line_size;
+			props->cachelines_per_tag = cache->lines_per_tag;
+			props->cache_assoc = cache->associativity;
+			props->cache_latency = cache->cache_latency;
+			memcpy(props->sibling_map, cache->sibling_map,
+					sizeof(props->sibling_map));
+
+			if (cache->flags & CRAT_CACHE_FLAGS_DATA_CACHE)
+				props->cache_type |= HSA_CACHE_TYPE_DATA;
+			if (cache->flags & CRAT_CACHE_FLAGS_INST_CACHE)
+				props->cache_type |= HSA_CACHE_TYPE_INSTRUCTION;
+			if (cache->flags & CRAT_CACHE_FLAGS_CPU_CACHE)
+				props->cache_type |= HSA_CACHE_TYPE_CPU;
+			if (cache->flags & CRAT_CACHE_FLAGS_SIMD_CACHE)
+				props->cache_type |= HSA_CACHE_TYPE_HSACU;
+
+			dev->cache_count++;
+			dev->node_props.caches_count++;
+			list_add_tail(&props->list, &dev->cache_props);
+
+			break;
+		}
+	}
+
+	return 0;
+}
+
+/* kfd_parse_subtype_iolink - parse iolink subtypes and attach it to the correct
+ * topology device present in the device_list
+ */
+static int kfd_parse_subtype_iolink(struct crat_subtype_iolink *iolink,
+					struct list_head *device_list)
+{
+	struct kfd_iolink_properties *props = NULL, *props2;
+	struct kfd_topology_device *dev, *cpu_dev;
+	uint32_t id_from;
+	uint32_t id_to;
+
+	id_from = iolink->proximity_domain_from;
+	id_to = iolink->proximity_domain_to;
+
+	pr_debug("Found IO link entry in CRAT table with id_from=%d\n",
+			id_from);
+	list_for_each_entry(dev, device_list, list) {
+		if (id_from == dev->proximity_domain) {
+			props = kfd_alloc_struct(props);
+			if (!props)
+				return -ENOMEM;
+
+			props->node_from = id_from;
+			props->node_to = id_to;
+			props->ver_maj = iolink->version_major;
+			props->ver_min = iolink->version_minor;
+			props->iolink_type = iolink->io_interface_type;
+
+			if (props->iolink_type == CRAT_IOLINK_TYPE_PCIEXPRESS)
+				props->weight = 20;
+			else
+				props->weight = node_distance(id_from, id_to);
+
+			props->min_latency = iolink->minimum_latency;
+			props->max_latency = iolink->maximum_latency;
+			props->min_bandwidth = iolink->minimum_bandwidth_mbs;
+			props->max_bandwidth = iolink->maximum_bandwidth_mbs;
+			props->rec_transfer_size =
+					iolink->recommended_transfer_size;
+
+			dev->io_link_count++;
+			dev->node_props.io_links_count++;
+			list_add_tail(&props->list, &dev->io_link_props);
+			break;
+		}
+	}
+
+	/* CPU topology is created before GPUs are detected, so CPU->GPU
+	 * links are not built at that time. If a PCIe type is discovered, it
+	 * means a GPU is detected and we are adding GPU->CPU to the topology.
+	 * At this time, also add the corresponding CPU->GPU link.
+	 */
+	if (props && props->iolink_type == CRAT_IOLINK_TYPE_PCIEXPRESS) {
+		cpu_dev = kfd_topology_device_by_proximity_domain(id_to);
+		if (!cpu_dev)
+			return -ENODEV;
+		/* same everything but the other direction */
+		props2 = kmemdup(props, sizeof(*props2), GFP_KERNEL);
+		if (!props2)
+			return -ENOMEM;
+		props2->node_from = id_to;
+		props2->node_to = id_from;
+		props2->kobj = NULL;
+		cpu_dev->io_link_count++;
+		cpu_dev->node_props.io_links_count++;
+		list_add_tail(&props2->list, &cpu_dev->io_link_props);
+	}
+
+	return 0;
+}
+
+/* kfd_parse_subtype - parse subtypes and attach them to the correct topology
+ * device present in the device_list
+ *	@sub_type_hdr - subtype section of crat_image
+ *	@device_list - list of topology devices present in this crat_image
+ */
+static int kfd_parse_subtype(struct crat_subtype_generic *sub_type_hdr,
+				struct list_head *device_list)
+{
+	struct crat_subtype_computeunit *cu;
+	struct crat_subtype_memory *mem;
+	struct crat_subtype_cache *cache;
+	struct crat_subtype_iolink *iolink;
+	int ret = 0;
+
+	switch (sub_type_hdr->type) {
+	case CRAT_SUBTYPE_COMPUTEUNIT_AFFINITY:
+		cu = (struct crat_subtype_computeunit *)sub_type_hdr;
+		ret = kfd_parse_subtype_cu(cu, device_list);
+		break;
+	case CRAT_SUBTYPE_MEMORY_AFFINITY:
+		mem = (struct crat_subtype_memory *)sub_type_hdr;
+		ret = kfd_parse_subtype_mem(mem, device_list);
+		break;
+	case CRAT_SUBTYPE_CACHE_AFFINITY:
+		cache = (struct crat_subtype_cache *)sub_type_hdr;
+		ret = kfd_parse_subtype_cache(cache, device_list);
+		break;
+	case CRAT_SUBTYPE_TLB_AFFINITY:
+		/*
+		 * For now, nothing to do here
+		 */
+		pr_debug("Found TLB entry in CRAT table (not processing)\n");
+		break;
+	case CRAT_SUBTYPE_CCOMPUTE_AFFINITY:
+		/*
+		 * For now, nothing to do here
+		 */
+		pr_debug("Found CCOMPUTE entry in CRAT table (not processing)\n");
+		break;
+	case CRAT_SUBTYPE_IOLINK_AFFINITY:
+		iolink = (struct crat_subtype_iolink *)sub_type_hdr;
+		ret = kfd_parse_subtype_iolink(iolink, device_list);
+		break;
+	default:
+		pr_warn("Unknown subtype %d in CRAT\n",
+				sub_type_hdr->type);
+	}
+
+	return ret;
+}
+
+/* kfd_parse_crat_table - parse CRAT table. For each node present in CRAT,
+ * create a kfd_topology_device and add it to device_list. Also parse
+ * CRAT subtypes and attach them to the appropriate kfd_topology_device
+ *	@crat_image - input image containing CRAT
+ *	@device_list - [OUT] list of kfd_topology_device generated after
+ *		       parsing crat_image
+ *	@proximity_domain - Proximity domain of the first device in the table
+ *
+ *	Return - 0 if successful, else a negative value
+ */
+int kfd_parse_crat_table(void *crat_image, struct list_head *device_list,
+			 uint32_t proximity_domain)
+{
+	struct kfd_topology_device *top_dev = NULL;
+	struct crat_subtype_generic *sub_type_hdr;
+	uint16_t node_id;
+	int ret = 0;
+	struct crat_header *crat_table = (struct crat_header *)crat_image;
+	uint16_t num_nodes;
+	uint32_t image_len;
+
+	if (!crat_image)
+		return -EINVAL;
+
+	if (!list_empty(device_list)) {
+		pr_warn("Error device list should be empty\n");
+		return -EINVAL;
+	}
+
+	num_nodes = crat_table->num_domains;
+	image_len = crat_table->length;
+
+	pr_info("Parsing CRAT table with %d nodes\n", num_nodes);
+
+	for (node_id = 0; node_id < num_nodes; node_id++) {
+		top_dev = kfd_create_topology_device(device_list);
+		if (!top_dev)
+			break;
+		top_dev->proximity_domain = proximity_domain++;
+	}
+
+	if (!top_dev) {
+		ret = -ENOMEM;
+		goto err;
+	}
+
+	memcpy(top_dev->oem_id, crat_table->oem_id, CRAT_OEMID_LENGTH);
+	memcpy(top_dev->oem_table_id, crat_table->oem_table_id,
+			CRAT_OEMTABLEID_LENGTH);
+	top_dev->oem_revision = crat_table->oem_revision;
+
+	sub_type_hdr = (struct crat_subtype_generic *)(crat_table+1);
+	while ((char *)sub_type_hdr + sizeof(struct crat_subtype_generic) <
+			((char *)crat_image) + image_len) {
+		if (sub_type_hdr->flags & CRAT_SUBTYPE_FLAGS_ENABLED) {
+			ret = kfd_parse_subtype(sub_type_hdr, device_list);
+			if (ret)
+				break;
+		}
+
+		sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr +
+				sub_type_hdr->length);
+	}
+
+err:
+	if (ret)
+		kfd_release_topology_device_list(device_list);
+
+	return ret;
+}
+
+/* Helper function. See kfd_fill_gpu_cache_info for parameter description */
+static int fill_in_pcache(struct crat_subtype_cache *pcache,
+				struct kfd_gpu_cache_info *pcache_info,
+				struct kfd_cu_info *cu_info,
+				int mem_available,
+				int cu_bitmask,
+				int cache_type, unsigned int cu_processor_id,
+				int cu_block)
+{
+	unsigned int cu_sibling_map_mask;
+	int first_active_cu;
+
+	/* First check if enough memory is available */
+	if (sizeof(struct crat_subtype_cache) > mem_available)
+		return -ENOMEM;
+
+	cu_sibling_map_mask = cu_bitmask;
+	cu_sibling_map_mask >>= cu_block;
+	cu_sibling_map_mask &=
+		((1 << pcache_info[cache_type].num_cu_shared) - 1);
+	first_active_cu = ffs(cu_sibling_map_mask);
+
+	/* A CU can be inactive. For a shared cache, find the first active
+	 * CU; for a non-shared cache, check whether the CU is active. If
+	 * no active CU is found, skip the entry.
+	 */
+	if (first_active_cu) {
+		memset(pcache, 0, sizeof(struct crat_subtype_cache));
+		pcache->type = CRAT_SUBTYPE_CACHE_AFFINITY;
+		pcache->length = sizeof(struct crat_subtype_cache);
+		pcache->flags = pcache_info[cache_type].flags;
+		pcache->processor_id_low = cu_processor_id
+					 + (first_active_cu - 1);
+		pcache->cache_level = pcache_info[cache_type].cache_level;
+		pcache->cache_size = pcache_info[cache_type].cache_size;
+
+		/* Sibling map is w.r.t. processor_id_low, so shift out
+		 * inactive CU
+		 */
+		cu_sibling_map_mask =
+			cu_sibling_map_mask >> (first_active_cu - 1);
+
+		pcache->sibling_map[0] = (uint8_t)(cu_sibling_map_mask & 0xFF);
+		pcache->sibling_map[1] =
+				(uint8_t)((cu_sibling_map_mask >> 8) & 0xFF);
+		pcache->sibling_map[2] =
+				(uint8_t)((cu_sibling_map_mask >> 16) & 0xFF);
+		pcache->sibling_map[3] =
+				(uint8_t)((cu_sibling_map_mask >> 24) & 0xFF);
+		return 0;
+	}
+	return 1;
+}
+
+/* kfd_fill_gpu_cache_info - Fill GPU cache info using kfd_gpu_cache_info
+ * tables
+ *
+ *	@kdev - [IN] GPU device
+ *	@gpu_processor_id - [IN] GPU processor ID with which these caches
+ *			    are associated
+ *	@available_size - [IN] Amount of memory available in pcache
+ *	@cu_info - [IN] Compute Unit info obtained from KGD
+ *	@pcache - [OUT] memory into which cache data is to be filled in.
+ *	@size_filled - [OUT] amount of data used up in pcache.
+ *	@num_of_entries - [OUT] number of caches added
+ */
+static int kfd_fill_gpu_cache_info(struct kfd_dev *kdev,
+			int gpu_processor_id,
+			int available_size,
+			struct kfd_cu_info *cu_info,
+			struct crat_subtype_cache *pcache,
+			int *size_filled,
+			int *num_of_entries)
+{
+	struct kfd_gpu_cache_info *pcache_info;
+	int num_of_cache_types = 0;
+	int i, j, k;
+	int ct = 0;
+	int mem_available = available_size;
+	unsigned int cu_processor_id;
+	int ret;
+
+	switch (kdev->device_info->asic_family) {
+	case CHIP_KAVERI:
+		pcache_info = kaveri_cache_info;
+		num_of_cache_types = ARRAY_SIZE(kaveri_cache_info);
+		break;
+	case CHIP_HAWAII:
+		pcache_info = hawaii_cache_info;
+		num_of_cache_types = ARRAY_SIZE(hawaii_cache_info);
+		break;
+	case CHIP_CARRIZO:
+		pcache_info = carrizo_cache_info;
+		num_of_cache_types = ARRAY_SIZE(carrizo_cache_info);
+		break;
+	case CHIP_TONGA:
+		pcache_info = tonga_cache_info;
+		num_of_cache_types = ARRAY_SIZE(tonga_cache_info);
+		break;
+	case CHIP_FIJI:
+		pcache_info = fiji_cache_info;
+		num_of_cache_types = ARRAY_SIZE(fiji_cache_info);
+		break;
+	case CHIP_POLARIS10:
+		pcache_info = polaris10_cache_info;
+		num_of_cache_types = ARRAY_SIZE(polaris10_cache_info);
+		break;
+	case CHIP_POLARIS11:
+		pcache_info = polaris11_cache_info;
+		num_of_cache_types = ARRAY_SIZE(polaris11_cache_info);
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	*size_filled = 0;
+	*num_of_entries = 0;
+
+	/* For each type of cache listed in the kfd_gpu_cache_info table,
+	 * go through all available Compute Units.
+	 * The [i,j,k] loop:
+	 *		if kfd_gpu_cache_info.num_cu_shared == 1,
+	 *			it visits every available CU;
+	 *		otherwise,
+	 *			it considers only one CU from each group
+	 *			of num_cu_shared CUs.
+	 */
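+	/* E.g. with hypothetical values num_cu_per_sh = 16 and
+	 * num_cu_shared = 4: k takes the values 0, 4, 8 and 12, and
+	 * cu_processor_id advances by 4 per step, so at most one cache
+	 * entry is emitted per 4-CU group.
+	 */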
+
+	for (ct = 0; ct < num_of_cache_types; ct++) {
+		cu_processor_id = gpu_processor_id;
+		for (i = 0; i < cu_info->num_shader_engines; i++) {
+			for (j = 0; j < cu_info->num_shader_arrays_per_engine;
+				j++) {
+				for (k = 0; k < cu_info->num_cu_per_sh;
+					k += pcache_info[ct].num_cu_shared) {
+
+					ret = fill_in_pcache(pcache,
+						pcache_info,
+						cu_info,
+						mem_available,
+						cu_info->cu_bitmap[i][j],
+						ct,
+						cu_processor_id,
+						k);
+
+					if (ret < 0)
+						break;
+
+					if (!ret) {
+						pcache++;
+						(*num_of_entries)++;
+						mem_available -=
+							sizeof(*pcache);
+						(*size_filled) +=
+							sizeof(*pcache);
+					}
+
+					/* Move to next CU block */
+					cu_processor_id +=
+						pcache_info[ct].num_cu_shared;
+				}
+			}
+		}
+	}
+
+	pr_debug("Added [%d] GPU cache entries\n", *num_of_entries);
+
+	return 0;
+}
+
+/*
+ * kfd_create_crat_image_acpi - Allocates memory for CRAT image and
+ * copies CRAT from ACPI (if available).
+ * NOTE: Call kfd_destroy_crat_image to free CRAT image memory
+ *
+ *	@crat_image: CRAT read from ACPI. If no CRAT in ACPI then
+ *		     crat_image will be NULL
+ *	@size: [OUT] size of crat_image
+ *
+ *	Return 0 if successful else return error code
+ */
+int kfd_create_crat_image_acpi(void **crat_image, size_t *size)
+{
+	struct acpi_table_header *crat_table;
+	acpi_status status;
+	void *pcrat_image;
+
+	if (!crat_image)
+		return -EINVAL;
+
+	*crat_image = NULL;
+
+	/* Fetch the CRAT table from ACPI */
+	status = acpi_get_table(CRAT_SIGNATURE, 0, &crat_table);
+	if (status == AE_NOT_FOUND) {
+		pr_warn("CRAT table not found\n");
+		return -ENODATA;
+	} else if (ACPI_FAILURE(status)) {
+		const char *err = acpi_format_exception(status);
+
+		pr_err("CRAT table error: %s\n", err);
+		return -EINVAL;
+	}
+
+	if (ignore_crat) {
+		pr_info("CRAT table disabled by module option\n");
+		return -ENODATA;
+	}
+
+	pcrat_image = kmalloc(crat_table->length, GFP_KERNEL);
+	if (!pcrat_image)
+		return -ENOMEM;
+
+	memcpy(pcrat_image, crat_table, crat_table->length);
+
+	*crat_image = pcrat_image;
+	*size = crat_table->length;
+
+	return 0;
+}
+
+/* Memory required to create Virtual CRAT.
+ * Since there is no easy way to predict the amount of memory required, the
+ * following amounts are allocated for the CPU and GPU Virtual CRATs. This
+ * is expected to cover all known conditions. But to be safe, an additional
+ * check is put in the code to ensure we don't write past the allocation.
+ */
+#define VCRAT_SIZE_FOR_CPU	(2 * PAGE_SIZE)
+#define VCRAT_SIZE_FOR_GPU	(3 * PAGE_SIZE)
+
+/* kfd_fill_cu_for_cpu - Fill in Compute info for the given CPU NUMA node
+ *
+ *	@numa_node_id: CPU NUMA node id
+ *	@avail_size: Available size in the memory
+ *	@proximity_domain: Proximity domain to report in the CRAT entry
+ *	@sub_type_hdr: Memory into which compute info will be filled in
+ *
+ *	Return 0 if successful else return -ve value
+ */
+static int kfd_fill_cu_for_cpu(int numa_node_id, int *avail_size,
+				int proximity_domain,
+				struct crat_subtype_computeunit *sub_type_hdr)
+{
+	const struct cpumask *cpumask;
+
+	*avail_size -= sizeof(struct crat_subtype_computeunit);
+	if (*avail_size < 0)
+		return -ENOMEM;
+
+	memset(sub_type_hdr, 0, sizeof(struct crat_subtype_computeunit));
+
+	/* Fill in subtype header data */
+	sub_type_hdr->type = CRAT_SUBTYPE_COMPUTEUNIT_AFFINITY;
+	sub_type_hdr->length = sizeof(struct crat_subtype_computeunit);
+	sub_type_hdr->flags = CRAT_SUBTYPE_FLAGS_ENABLED;
+
+	cpumask = cpumask_of_node(numa_node_id);
+
+	/* Fill in CU data */
+	sub_type_hdr->flags |= CRAT_CU_FLAGS_CPU_PRESENT;
+	sub_type_hdr->proximity_domain = proximity_domain;
+	sub_type_hdr->processor_id_low = kfd_numa_node_to_apic_id(numa_node_id);
+	if (sub_type_hdr->processor_id_low == -1)
+		return -EINVAL;
+
+	sub_type_hdr->num_cpu_cores = cpumask_weight(cpumask);
+
+	return 0;
+}
+
+/* kfd_fill_mem_info_for_cpu - Fill in Memory info for the given CPU NUMA node
+ *
+ *	@numa_node_id: CPU NUMA node id
+ *	@avail_size: Available size in the memory
+ *	@proximity_domain: Proximity domain to report in the CRAT entry
+ *	@sub_type_hdr: Memory into which memory info will be filled in
+ *
+ *	Return 0 if successful else return -ve value
+ */
+static int kfd_fill_mem_info_for_cpu(int numa_node_id, int *avail_size,
+			int proximity_domain,
+			struct crat_subtype_memory *sub_type_hdr)
+{
+	uint64_t mem_in_bytes = 0;
+	pg_data_t *pgdat;
+	int zone_type;
+
+	*avail_size -= sizeof(struct crat_subtype_memory);
+	if (*avail_size < 0)
+		return -ENOMEM;
+
+	memset(sub_type_hdr, 0, sizeof(struct crat_subtype_memory));
+
+	/* Fill in subtype header data */
+	sub_type_hdr->type = CRAT_SUBTYPE_MEMORY_AFFINITY;
+	sub_type_hdr->length = sizeof(struct crat_subtype_memory);
+	sub_type_hdr->flags = CRAT_SUBTYPE_FLAGS_ENABLED;
+
+	/* Fill in Memory Subunit data */
+
+	/* Unlike si_meminfo, si_meminfo_node is not exported. So
+	 * the following lines are duplicated from the si_meminfo_node
+	 * function.
+	 */
+	pgdat = NODE_DATA(numa_node_id);
+	for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++)
+		mem_in_bytes += pgdat->node_zones[zone_type].managed_pages;
+	mem_in_bytes <<= PAGE_SHIFT;
+
+	sub_type_hdr->length_low = lower_32_bits(mem_in_bytes);
+	sub_type_hdr->length_high = upper_32_bits(mem_in_bytes);
+	sub_type_hdr->proximity_domain = proximity_domain;
+
+	return 0;
+}
+
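+/* kfd_fill_iolink_info_for_cpu - Fill in IO link info connecting the given
+ * CPU NUMA node to all other online NUMA nodes
+ *
+ *	@numa_node_id: CPU NUMA node id
+ *	@avail_size: Available size in the memory
+ *	@num_entries: [OUT] number of IO link entries filled in
+ *	@sub_type_hdr: Memory into which IO link info will be filled in
+ *
+ *	Return 0 if successful else return -ve value
+ */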
+static int kfd_fill_iolink_info_for_cpu(int numa_node_id, int *avail_size,
+				uint32_t *num_entries,
+				struct crat_subtype_iolink *sub_type_hdr)
+{
+	int nid;
+	struct cpuinfo_x86 *c = &cpu_data(0);
+	uint8_t link_type;
+
+	if (c->x86_vendor == X86_VENDOR_AMD)
+		link_type = CRAT_IOLINK_TYPE_HYPERTRANSPORT;
+	else
+		link_type = CRAT_IOLINK_TYPE_QPI_1_1;
+
+	*num_entries = 0;
+
+	/* Create IO links from this node to other CPU nodes */
+	for_each_online_node(nid) {
+		if (nid == numa_node_id) /* node itself */
+			continue;
+
+		*avail_size -= sizeof(struct crat_subtype_iolink);
+		if (*avail_size < 0)
+			return -ENOMEM;
+
+		memset(sub_type_hdr, 0, sizeof(struct crat_subtype_iolink));
+
+		/* Fill in subtype header data */
+		sub_type_hdr->type = CRAT_SUBTYPE_IOLINK_AFFINITY;
+		sub_type_hdr->length = sizeof(struct crat_subtype_iolink);
+		sub_type_hdr->flags = CRAT_SUBTYPE_FLAGS_ENABLED;
+
+		/* Fill in IO link data */
+		sub_type_hdr->proximity_domain_from = numa_node_id;
+		sub_type_hdr->proximity_domain_to = nid;
+		sub_type_hdr->io_interface_type = link_type;
+
+		(*num_entries)++;
+		sub_type_hdr++;
+	}
+
+	return 0;
+}
+
+/* kfd_create_vcrat_image_cpu - Create Virtual CRAT for CPU
+ *
+ *	@pcrat_image: Memory into which the CPU VCRAT is filled in
+ *	@size:	[IN] allocated size of crat_image.
+ *		[OUT] actual size of data filled in crat_image
+ */
+static int kfd_create_vcrat_image_cpu(void *pcrat_image, size_t *size)
+{
+	struct crat_header *crat_table = (struct crat_header *)pcrat_image;
+	struct acpi_table_header *acpi_table;
+	acpi_status status;
+	struct crat_subtype_generic *sub_type_hdr;
+	int avail_size = *size;
+	int numa_node_id;
+	uint32_t entries = 0;
+	int ret = 0;
+
+	if (!pcrat_image || avail_size < VCRAT_SIZE_FOR_CPU)
+		return -EINVAL;
+
+	/* Fill in CRAT Header.
+	 * Modify length and total_entries as subunits are added.
+	 */
+	avail_size -= sizeof(struct crat_header);
+	if (avail_size < 0)
+		return -ENOMEM;
+
+	memset(crat_table, 0, sizeof(struct crat_header));
+	memcpy(&crat_table->signature, CRAT_SIGNATURE,
+			sizeof(crat_table->signature));
+	crat_table->length = sizeof(struct crat_header);
+
+	status = acpi_get_table("DSDT", 0, &acpi_table);
+	if (status == AE_NOT_FOUND)
+		pr_warn("DSDT table not found for OEM information\n");
+	else {
+		crat_table->oem_revision = acpi_table->revision;
+		memcpy(crat_table->oem_id, acpi_table->oem_id,
+				CRAT_OEMID_LENGTH);
+		memcpy(crat_table->oem_table_id, acpi_table->oem_table_id,
+				CRAT_OEMTABLEID_LENGTH);
+	}
+	crat_table->total_entries = 0;
+	crat_table->num_domains = 0;
+
+	sub_type_hdr = (struct crat_subtype_generic *)(crat_table+1);
+
+	for_each_online_node(numa_node_id) {
+		if (kfd_numa_node_to_apic_id(numa_node_id) == -1)
+			continue;
+
+		/* Fill in Subtype: Compute Unit */
+		ret = kfd_fill_cu_for_cpu(numa_node_id, &avail_size,
+			crat_table->num_domains,
+			(struct crat_subtype_computeunit *)sub_type_hdr);
+		if (ret < 0)
+			return ret;
+		crat_table->length += sub_type_hdr->length;
+		crat_table->total_entries++;
+
+		sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr +
+			sub_type_hdr->length);
+
+		/* Fill in Subtype: Memory */
+		ret = kfd_fill_mem_info_for_cpu(numa_node_id, &avail_size,
+			crat_table->num_domains,
+			(struct crat_subtype_memory *)sub_type_hdr);
+		if (ret < 0)
+			return ret;
+		crat_table->length += sub_type_hdr->length;
+		crat_table->total_entries++;
+
+		sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr +
+			sub_type_hdr->length);
+
+		/* Fill in Subtype: IO Link */
+		ret = kfd_fill_iolink_info_for_cpu(numa_node_id, &avail_size,
+				&entries,
+				(struct crat_subtype_iolink *)sub_type_hdr);
+		if (ret < 0)
+			return ret;
+		crat_table->length += (sub_type_hdr->length * entries);
+		crat_table->total_entries += entries;
+
+		sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr +
+				sub_type_hdr->length * entries);
+
+		crat_table->num_domains++;
+	}
+
+	/* TODO: Add cache Subtype for CPU.
+	 * Currently, CPU cache information is available in function
+	 * detect_cache_attributes(cpu) defined in the file
+	 * ./arch/x86/kernel/cpu/intel_cacheinfo.c. That function is not
+	 * exported, so the code would need to be duplicated here to obtain
+	 * the same information.
+	 */
+
+	*size = crat_table->length;
+	pr_info("Virtual CRAT table created for CPU\n");
+
+	return 0;
+}
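+
+/* For a hypothetical two-node system, the CPU VCRAT built above is laid
+ * out as:
+ *
+ *	crat_header
+ *	node 0: compute unit | memory | iolink (0 -> 1)
+ *	node 1: compute unit | memory | iolink (1 -> 0)
+ */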
+
+static int kfd_fill_gpu_memory_affinity(int *avail_size,
+		struct kfd_dev *kdev, uint8_t type, uint64_t size,
+		struct crat_subtype_memory *sub_type_hdr,
+		uint32_t proximity_domain,
+		const struct kfd_local_mem_info *local_mem_info)
+{
+	*avail_size -= sizeof(struct crat_subtype_memory);
+	if (*avail_size < 0)
+		return -ENOMEM;
+
+	memset((void *)sub_type_hdr, 0, sizeof(struct crat_subtype_memory));
+	sub_type_hdr->type = CRAT_SUBTYPE_MEMORY_AFFINITY;
+	sub_type_hdr->length = sizeof(struct crat_subtype_memory);
+	sub_type_hdr->flags |= CRAT_SUBTYPE_FLAGS_ENABLED;
+
+	sub_type_hdr->proximity_domain = proximity_domain;
+
+	pr_debug("Fill gpu memory affinity - type 0x%x size 0x%llx\n",
+			type, size);
+
+	sub_type_hdr->length_low = lower_32_bits(size);
+	sub_type_hdr->length_high = upper_32_bits(size);
+
+	sub_type_hdr->width = local_mem_info->vram_width;
+	sub_type_hdr->visibility_type = type;
+
+	return 0;
+}
+
+/* kfd_fill_gpu_direct_io_link - Fill in direct io link from GPU
+ * to its NUMA node
+ *	@avail_size: Available size in the memory
+ *	@kdev - [IN] GPU device
+ *	@sub_type_hdr: Memory into which io link info will be filled in
+ *	@proximity_domain - proximity domain of the GPU node
+ *
+ *	Return 0 if successful else return -ve value
+ */
+static int kfd_fill_gpu_direct_io_link(int *avail_size,
+			struct kfd_dev *kdev,
+			struct crat_subtype_iolink *sub_type_hdr,
+			uint32_t proximity_domain)
+{
+	*avail_size -= sizeof(struct crat_subtype_iolink);
+	if (*avail_size < 0)
+		return -ENOMEM;
+
+	memset((void *)sub_type_hdr, 0, sizeof(struct crat_subtype_iolink));
+
+	/* Fill in subtype header data */
+	sub_type_hdr->type = CRAT_SUBTYPE_IOLINK_AFFINITY;
+	sub_type_hdr->length = sizeof(struct crat_subtype_iolink);
+	sub_type_hdr->flags |= CRAT_SUBTYPE_FLAGS_ENABLED;
+
+	/* Fill in IOLINK subtype.
+	 * TODO: Fill-in other fields of iolink subtype
+	 */
+	sub_type_hdr->io_interface_type = CRAT_IOLINK_TYPE_PCIEXPRESS;
+	sub_type_hdr->proximity_domain_from = proximity_domain;
+#ifdef CONFIG_NUMA
+	if (kdev->pdev->dev.numa_node == NUMA_NO_NODE)
+		sub_type_hdr->proximity_domain_to = 0;
+	else
+		sub_type_hdr->proximity_domain_to = kdev->pdev->dev.numa_node;
+#else
+	sub_type_hdr->proximity_domain_to = 0;
+#endif
+	return 0;
+}
+
+/* kfd_create_vcrat_image_gpu - Create Virtual CRAT for GPU
+ *
+ *	@pcrat_image: Memory into which the GPU VCRAT is filled in
+ *	@size:	[IN] allocated size of crat_image.
+ *		[OUT] actual size of data filled in crat_image
+ *	@kdev: KFD device for which the VCRAT is created
+ *	@proximity_domain: Proximity domain of the GPU node
+ */
+static int kfd_create_vcrat_image_gpu(void *pcrat_image,
+				      size_t *size, struct kfd_dev *kdev,
+				      uint32_t proximity_domain)
+{
+	struct crat_header *crat_table = (struct crat_header *)pcrat_image;
+	struct crat_subtype_generic *sub_type_hdr;
+	struct crat_subtype_computeunit *cu;
+	struct kfd_cu_info cu_info;
+	struct amd_iommu_device_info iommu_info;
+	int avail_size = *size;
+	uint32_t total_num_of_cu;
+	int num_of_cache_entries = 0;
+	int cache_mem_filled = 0;
+	int ret = 0;
+	const u32 required_iommu_flags = AMD_IOMMU_DEVICE_FLAG_ATS_SUP |
+					 AMD_IOMMU_DEVICE_FLAG_PRI_SUP |
+					 AMD_IOMMU_DEVICE_FLAG_PASID_SUP;
+	struct kfd_local_mem_info local_mem_info;
+
+	if (!pcrat_image || avail_size < VCRAT_SIZE_FOR_GPU)
+		return -EINVAL;
+
+	/* Fill the CRAT Header.
+	 * Modify length and total_entries as subunits are added.
+	 */
+	avail_size -= sizeof(struct crat_header);
+	if (avail_size < 0)
+		return -ENOMEM;
+
+	memset(crat_table, 0, sizeof(struct crat_header));
+
+	memcpy(&crat_table->signature, CRAT_SIGNATURE,
+			sizeof(crat_table->signature));
+	/* Change length as we add more subtypes */
+	crat_table->length = sizeof(struct crat_header);
+	crat_table->num_domains = 1;
+	crat_table->total_entries = 0;
+
+	/* Fill in Subtype: Compute Unit
+	 * First fill in the sub type header and then sub type data
+	 */
+	avail_size -= sizeof(struct crat_subtype_computeunit);
+	if (avail_size < 0)
+		return -ENOMEM;
+
+	sub_type_hdr = (struct crat_subtype_generic *)(crat_table + 1);
+	memset(sub_type_hdr, 0, sizeof(struct crat_subtype_computeunit));
+
+	sub_type_hdr->type = CRAT_SUBTYPE_COMPUTEUNIT_AFFINITY;
+	sub_type_hdr->length = sizeof(struct crat_subtype_computeunit);
+	sub_type_hdr->flags = CRAT_SUBTYPE_FLAGS_ENABLED;
+
+	/* Fill CU subtype data */
+	cu = (struct crat_subtype_computeunit *)sub_type_hdr;
+	cu->flags |= CRAT_CU_FLAGS_GPU_PRESENT;
+	cu->proximity_domain = proximity_domain;
+
+	kdev->kfd2kgd->get_cu_info(kdev->kgd, &cu_info);
+	cu->num_simd_per_cu = cu_info.simd_per_cu;
+	cu->num_simd_cores = cu_info.simd_per_cu * cu_info.cu_active_number;
+	cu->max_waves_simd = cu_info.max_waves_per_simd;
+
+	cu->wave_front_size = cu_info.wave_front_size;
+	cu->array_count = cu_info.num_shader_arrays_per_engine *
+		cu_info.num_shader_engines;
+	total_num_of_cu = (cu->array_count * cu_info.num_cu_per_sh);
+	cu->processor_id_low = get_and_inc_gpu_processor_id(total_num_of_cu);
+	cu->num_cu_per_array = cu_info.num_cu_per_sh;
+	cu->max_slots_scatch_cu = cu_info.max_scratch_slots_per_cu;
+	cu->num_banks = cu_info.num_shader_engines;
+	cu->lds_size_in_kb = cu_info.lds_size;
+
+	cu->hsa_capability = 0;
+
+	/* Check if this node supports IOMMU. During parsing this flag will
+	 * translate to HSA_CAP_ATS_PRESENT
+	 */
+	iommu_info.flags = 0;
+	if (amd_iommu_device_info(kdev->pdev, &iommu_info) == 0) {
+		if ((iommu_info.flags & required_iommu_flags) ==
+				required_iommu_flags)
+			cu->hsa_capability |= CRAT_CU_FLAGS_IOMMU_PRESENT;
+	}
+
+	crat_table->length += sub_type_hdr->length;
+	crat_table->total_entries++;
+
+	/* Fill in Subtype: Memory. Only on systems with large BAR (no
+	 * private FB), report memory as public. On other systems
+	 * report the total FB size (public+private) as a single
+	 * private heap.
+	 */
+	kdev->kfd2kgd->get_local_mem_info(kdev->kgd, &local_mem_info);
+	sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr +
+			sub_type_hdr->length);
+
+	if (local_mem_info.local_mem_size_private == 0)
+		ret = kfd_fill_gpu_memory_affinity(&avail_size,
+				kdev, HSA_MEM_HEAP_TYPE_FB_PUBLIC,
+				local_mem_info.local_mem_size_public,
+				(struct crat_subtype_memory *)sub_type_hdr,
+				proximity_domain,
+				&local_mem_info);
+	else
+		ret = kfd_fill_gpu_memory_affinity(&avail_size,
+				kdev, HSA_MEM_HEAP_TYPE_FB_PRIVATE,
+				local_mem_info.local_mem_size_public +
+				local_mem_info.local_mem_size_private,
+				(struct crat_subtype_memory *)sub_type_hdr,
+				proximity_domain,
+				&local_mem_info);
+	if (ret < 0)
+		return ret;
+
+	crat_table->length += sizeof(struct crat_subtype_memory);
+	crat_table->total_entries++;
+
+	/* TODO: Fill in cache information. This information is NOT readily
+	 * available in KGD
+	 */
+	sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr +
+		sub_type_hdr->length);
+	ret = kfd_fill_gpu_cache_info(kdev, cu->processor_id_low,
+				avail_size,
+				&cu_info,
+				(struct crat_subtype_cache *)sub_type_hdr,
+				&cache_mem_filled,
+				&num_of_cache_entries);
+
+	if (ret < 0)
+		return ret;
+
+	crat_table->length += cache_mem_filled;
+	crat_table->total_entries += num_of_cache_entries;
+	avail_size -= cache_mem_filled;
+
+	/* Fill in Subtype: IO_LINKS
+	 *  Only direct links are added here, i.e. the link from the GPU
+	 *  to its NUMA node. Indirect links are added by userspace.
+	 */
+	sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr +
+		cache_mem_filled);
+	ret = kfd_fill_gpu_direct_io_link(&avail_size, kdev,
+		(struct crat_subtype_iolink *)sub_type_hdr, proximity_domain);
+
+	if (ret < 0)
+		return ret;
+
+	crat_table->length += sub_type_hdr->length;
+	crat_table->total_entries++;
+
+	*size = crat_table->length;
+	pr_info("Virtual CRAT table created for GPU\n");
+
+	return ret;
+}
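+
+/* The resulting GPU VCRAT image is laid out in the order the subtypes
+ * are appended above:
+ *
+ *	crat_header | compute unit | memory | cache entries ... | iolink
+ */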
+
+/* kfd_create_crat_image_virtual - Allocates memory for CRAT image and
+ *		creates a Virtual CRAT (VCRAT) image
+ *
+ * NOTE: Call kfd_destroy_crat_image to free CRAT image memory
+ *
+ *	@crat_image: VCRAT image created because ACPI does not have a
+ *		     CRAT for this device
+ *	@size: [OUT] size of virtual crat_image
+ *	@flags:	COMPUTE_UNIT_CPU - Create VCRAT for CPU device
+ *		COMPUTE_UNIT_GPU - Create VCRAT for GPU
+ *		(COMPUTE_UNIT_CPU | COMPUTE_UNIT_GPU) - Create VCRAT for APU
+ *			-- this option is not currently implemented.
+ *			The assumption is that all AMD APUs will have a CRAT
+ *	@kdev: Valid kfd_device required if flags contain COMPUTE_UNIT_GPU
+ *
+ *	Return 0 if successful else return -ve value
+ */
+int kfd_create_crat_image_virtual(void **crat_image, size_t *size,
+				  int flags, struct kfd_dev *kdev,
+				  uint32_t proximity_domain)
+{
+	void *pcrat_image = NULL;
+	int ret = 0;
+
+	if (!crat_image)
+		return -EINVAL;
+
+	*crat_image = NULL;
+
+	/* Allocate VCRAT_SIZE_FOR_CPU for a CPU virtual CRAT image and
+	 * VCRAT_SIZE_FOR_GPU for a GPU virtual CRAT image. This should cover
+	 * all current conditions. A check is in place so we do not write
+	 * beyond the allocated size.
+	 */
+	switch (flags) {
+	case COMPUTE_UNIT_CPU:
+		pcrat_image = kmalloc(VCRAT_SIZE_FOR_CPU, GFP_KERNEL);
+		if (!pcrat_image)
+			return -ENOMEM;
+		*size = VCRAT_SIZE_FOR_CPU;
+		ret = kfd_create_vcrat_image_cpu(pcrat_image, size);
+		break;
+	case COMPUTE_UNIT_GPU:
+		if (!kdev)
+			return -EINVAL;
+		pcrat_image = kmalloc(VCRAT_SIZE_FOR_GPU, GFP_KERNEL);
+		if (!pcrat_image)
+			return -ENOMEM;
+		*size = VCRAT_SIZE_FOR_GPU;
+		ret = kfd_create_vcrat_image_gpu(pcrat_image, size, kdev,
+						 proximity_domain);
+		break;
+	case (COMPUTE_UNIT_CPU | COMPUTE_UNIT_GPU):
+		/* TODO: */
+		ret = -EINVAL;
+		pr_err("VCRAT not implemented for APU\n");
+		break;
+	default:
+		ret = -EINVAL;
+	}
+
+	if (!ret)
+		*crat_image = pcrat_image;
+	else
+		kfree(pcrat_image);
+
+	return ret;
+}
+
+
+/* kfd_destroy_crat_image
+ *
+ *	@crat_image: [IN] - crat_image from kfd_create_crat_image_xxx(..)
+ *
+ */
+void kfd_destroy_crat_image(void *crat_image)
+{
+	kfree(crat_image);
+}
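+
+/* A minimal sketch of how these entry points fit together (the actual
+ * callers live in the topology code, not in this file; error handling
+ * and the choice of proximity domain 0 are illustrative only):
+ *
+ *	void *image;
+ *	size_t size;
+ *	LIST_HEAD(devices);
+ *
+ *	if (kfd_create_crat_image_acpi(&image, &size))
+ *		kfd_create_crat_image_virtual(&image, &size,
+ *					      COMPUTE_UNIT_CPU, NULL, 0);
+ *	kfd_parse_crat_table(image, &devices, 0);
+ *	kfd_destroy_crat_image(image);
+ */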

+ 34 - 8
drivers/gpu/drm/amd/amdkfd/kfd_crat.h

@@ -44,6 +44,10 @@
 
 #define CRAT_OEMID_64BIT_MASK ((1ULL << (CRAT_OEMID_LENGTH * 8)) - 1)
 
+/* Compute Unit flags */
+#define COMPUTE_UNIT_CPU	(1 << 0)  /* Create Virtual CRAT for CPU */
+#define COMPUTE_UNIT_GPU	(1 << 1)  /* Create Virtual CRAT for GPU */
+
 struct crat_header {
 	uint32_t	signature;
 	uint32_t	length;
@@ -105,7 +109,7 @@ struct crat_subtype_computeunit {
 	uint8_t		wave_front_size;
 	uint8_t		num_banks;
 	uint16_t	micro_engine_id;
-	uint8_t		num_arrays;
+	uint8_t		array_count;
 	uint8_t		num_cu_per_array;
 	uint8_t		num_simd_per_cu;
 	uint8_t		max_slots_scatch_cu;
@@ -127,13 +131,14 @@ struct crat_subtype_memory {
 	uint8_t		length;
 	uint16_t	reserved;
 	uint32_t	flags;
-	uint32_t	promixity_domain;
+	uint32_t	proximity_domain;
 	uint32_t	base_addr_low;
 	uint32_t	base_addr_high;
 	uint32_t	length_low;
 	uint32_t	length_high;
 	uint32_t	width;
-	uint8_t		reserved2[CRAT_MEMORY_RESERVED_LENGTH];
+	uint8_t		visibility_type; /* for virtual (dGPU) CRAT */
+	uint8_t		reserved2[CRAT_MEMORY_RESERVED_LENGTH - 1];
 };
 
 /*
@@ -222,9 +227,12 @@ struct crat_subtype_ccompute {
 /*
  * HSA IO Link Affinity structure and definitions
  */
-#define CRAT_IOLINK_FLAGS_ENABLED	0x00000001
-#define CRAT_IOLINK_FLAGS_COHERENCY	0x00000002
-#define CRAT_IOLINK_FLAGS_RESERVED	0xfffffffc
+#define CRAT_IOLINK_FLAGS_ENABLED		(1 << 0)
+#define CRAT_IOLINK_FLAGS_NON_COHERENT		(1 << 1)
+#define CRAT_IOLINK_FLAGS_NO_ATOMICS_32_BIT	(1 << 2)
+#define CRAT_IOLINK_FLAGS_NO_ATOMICS_64_BIT	(1 << 3)
+#define CRAT_IOLINK_FLAGS_NO_PEER_TO_PEER_DMA	(1 << 4)
+#define CRAT_IOLINK_FLAGS_RESERVED_MASK		0xffffffe0
 
 /*
  * IO interface types
@@ -232,10 +240,18 @@ struct crat_subtype_ccompute {
 #define CRAT_IOLINK_TYPE_UNDEFINED	0
 #define CRAT_IOLINK_TYPE_HYPERTRANSPORT	1
 #define CRAT_IOLINK_TYPE_PCIEXPRESS	2
-#define CRAT_IOLINK_TYPE_OTHER		3
+#define CRAT_IOLINK_TYPE_AMBA		3
+#define CRAT_IOLINK_TYPE_MIPI		4
+#define CRAT_IOLINK_TYPE_QPI_1_1	5
+#define CRAT_IOLINK_TYPE_RESERVED1	6
+#define CRAT_IOLINK_TYPE_RESERVED2	7
+#define CRAT_IOLINK_TYPE_RAPID_IO	8
+#define CRAT_IOLINK_TYPE_INFINIBAND	9
+#define CRAT_IOLINK_TYPE_RESERVED3	10
+#define CRAT_IOLINK_TYPE_OTHER		11
 #define CRAT_IOLINK_TYPE_MAX		255
 
-#define CRAT_IOLINK_RESERVED_LENGTH 24
+#define CRAT_IOLINK_RESERVED_LENGTH	24
 
 struct crat_subtype_iolink {
 	uint8_t		type;
@@ -291,4 +307,14 @@ struct cdit_header {
 
 #pragma pack()
 
+struct kfd_dev;
+
+int kfd_create_crat_image_acpi(void **crat_image, size_t *size);
+void kfd_destroy_crat_image(void *crat_image);
+int kfd_parse_crat_table(void *crat_image, struct list_head *device_list,
+			 uint32_t proximity_domain);
+int kfd_create_crat_image_virtual(void **crat_image, size_t *size,
+				  int flags, struct kfd_dev *kdev,
+				  uint32_t proximity_domain);
+
 #endif /* KFD_CRAT_H_INCLUDED */

+ 5 - 9
drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.c

@@ -95,7 +95,7 @@ static int dbgdev_diq_submit_ib(struct kfd_dbgdev *dbgdev,
 	ib_packet->bitfields3.ib_base_hi = largep->u.high_part;
 
 	ib_packet->control = (1 << 23) | (1 << 31) |
-			((size_in_bytes / sizeof(uint32_t)) & 0xfffff);
+			((size_in_bytes / 4) & 0xfffff);
 
 	ib_packet->bitfields5.pasid = pasid;
 
@@ -126,8 +126,7 @@ static int dbgdev_diq_submit_ib(struct kfd_dbgdev *dbgdev,
 
 	rm_packet->header.opcode = IT_RELEASE_MEM;
 	rm_packet->header.type = PM4_TYPE_3;
-	rm_packet->header.count = sizeof(struct pm4__release_mem) /
-					sizeof(unsigned int) - 2;
+	rm_packet->header.count = sizeof(struct pm4__release_mem) / 4 - 2;
 
 	rm_packet->bitfields2.event_type = CACHE_FLUSH_AND_INV_TS_EVENT;
 	rm_packet->bitfields2.event_index =
@@ -652,8 +651,7 @@ static int dbgdev_wave_control_diq(struct kfd_dbgdev *dbgdev,
 	packets_vec[0].header.opcode = IT_SET_UCONFIG_REG;
 	packets_vec[0].header.type = PM4_TYPE_3;
 	packets_vec[0].bitfields2.reg_offset =
-			GRBM_GFX_INDEX / (sizeof(uint32_t)) -
-				USERCONFIG_REG_BASE;
+			GRBM_GFX_INDEX / 4 - USERCONFIG_REG_BASE;
 
 	packets_vec[0].bitfields2.insert_vmid = 0;
 	packets_vec[0].reg_data[0] = reg_gfx_index.u32All;
@@ -661,8 +659,7 @@ static int dbgdev_wave_control_diq(struct kfd_dbgdev *dbgdev,
 	packets_vec[1].header.count = 1;
 	packets_vec[1].header.opcode = IT_SET_CONFIG_REG;
 	packets_vec[1].header.type = PM4_TYPE_3;
-	packets_vec[1].bitfields2.reg_offset = SQ_CMD / (sizeof(uint32_t)) -
-						AMD_CONFIG_REG_BASE;
+	packets_vec[1].bitfields2.reg_offset = SQ_CMD / 4 - AMD_CONFIG_REG_BASE;
 
 	packets_vec[1].bitfields2.vmid_shift = SQ_CMD_VMID_OFFSET;
 	packets_vec[1].bitfields2.insert_vmid = 1;
@@ -678,8 +675,7 @@ static int dbgdev_wave_control_diq(struct kfd_dbgdev *dbgdev,
 
 	packets_vec[2].ordinal1 = packets_vec[0].ordinal1;
 	packets_vec[2].bitfields2.reg_offset =
-				GRBM_GFX_INDEX / (sizeof(uint32_t)) -
-					USERCONFIG_REG_BASE;
+				GRBM_GFX_INDEX / 4 - USERCONFIG_REG_BASE;
 
 	packets_vec[2].bitfields2.insert_vmid = 0;
 	packets_vec[2].reg_data[0] = reg_gfx_index.u32All;

+ 75 - 0
drivers/gpu/drm/amd/amdkfd/kfd_debugfs.c

@@ -0,0 +1,75 @@
+/*
+ * Copyright 2016-2017 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include <linux/debugfs.h>
+#include "kfd_priv.h"
+
+static struct dentry *debugfs_root;
+
+static int kfd_debugfs_open(struct inode *inode, struct file *file)
+{
+	int (*show)(struct seq_file *, void *) = inode->i_private;
+
+	return single_open(file, show, NULL);
+}
+
+static const struct file_operations kfd_debugfs_fops = {
+	.owner = THIS_MODULE,
+	.open = kfd_debugfs_open,
+	.read = seq_read,
+	.llseek = seq_lseek,
+	.release = single_release,
+};
+
+void kfd_debugfs_init(void)
+{
+	struct dentry *ent;
+
+	debugfs_root = debugfs_create_dir("kfd", NULL);
+	if (!debugfs_root || debugfs_root == ERR_PTR(-ENODEV)) {
+		pr_warn("Failed to create kfd debugfs dir\n");
+		return;
+	}
+
+	ent = debugfs_create_file("mqds", S_IFREG | 0444, debugfs_root,
+				  kfd_debugfs_mqds_by_process,
+				  &kfd_debugfs_fops);
+	if (!ent)
+		pr_warn("Failed to create mqds in kfd debugfs\n");
+
+	ent = debugfs_create_file("hqds", S_IFREG | 0444, debugfs_root,
+				  kfd_debugfs_hqds_by_device,
+				  &kfd_debugfs_fops);
+	if (!ent)
+		pr_warn("Failed to create hqds in kfd debugfs\n");
+
+	ent = debugfs_create_file("rls", S_IFREG | 0444, debugfs_root,
+				  kfd_debugfs_rls_by_device,
+				  &kfd_debugfs_fops);
+	if (!ent)
+		pr_warn("Failed to create rls in kfd debugfs\n");
+}
+
+void kfd_debugfs_fini(void)
+{
+	debugfs_remove_recursive(debugfs_root);
+}
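+
+/* Any function with the seq_file show signature can be exposed through
+ * kfd_debugfs_fops, because kfd_debugfs_open() recovers the show callback
+ * from inode->i_private. Hypothetical example:
+ *
+ *	static int example_show(struct seq_file *m, void *data)
+ *	{
+ *		seq_puts(m, "example\n");
+ *		return 0;
+ *	}
+ *
+ *	debugfs_create_file("example", S_IFREG | 0444, debugfs_root,
+ *			    example_show, &kfd_debugfs_fops);
+ */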

+ 29 - 2
drivers/gpu/drm/amd/amdkfd/kfd_device.c

@@ -27,6 +27,7 @@
 #include "kfd_priv.h"
 #include "kfd_device_queue_manager.h"
 #include "kfd_pm4_headers_vi.h"
+#include "cwsr_trap_handler_gfx8.asm"
 
 #define MQD_SIZE_ALIGNED 768
 
@@ -38,7 +39,8 @@ static const struct kfd_device_info kaveri_device_info = {
 	.ih_ring_entry_size = 4 * sizeof(uint32_t),
 	.event_interrupt_class = &event_interrupt_class_cik,
 	.num_of_watch_points = 4,
-	.mqd_size_aligned = MQD_SIZE_ALIGNED
+	.mqd_size_aligned = MQD_SIZE_ALIGNED,
+	.supports_cwsr = false,
 };
 
 static const struct kfd_device_info carrizo_device_info = {
@@ -49,7 +51,8 @@ static const struct kfd_device_info carrizo_device_info = {
 	.ih_ring_entry_size = 4 * sizeof(uint32_t),
 	.event_interrupt_class = &event_interrupt_class_cik,
 	.num_of_watch_points = 4,
-	.mqd_size_aligned = MQD_SIZE_ALIGNED
+	.mqd_size_aligned = MQD_SIZE_ALIGNED,
+	.supports_cwsr = true,
 };
 
 struct kfd_deviceid {
@@ -212,6 +215,17 @@ static int iommu_invalid_ppr_cb(struct pci_dev *pdev, int pasid,
 	return AMD_IOMMU_INV_PRI_RSP_INVALID;
 }
 
+static void kfd_cwsr_init(struct kfd_dev *kfd)
+{
+	if (cwsr_enable && kfd->device_info->supports_cwsr) {
+		BUILD_BUG_ON(sizeof(cwsr_trap_gfx8_hex) > PAGE_SIZE);
+
+		kfd->cwsr_isa = cwsr_trap_gfx8_hex;
+		kfd->cwsr_isa_size = sizeof(cwsr_trap_gfx8_hex);
+		kfd->cwsr_enabled = true;
+	}
+}
+
 bool kgd2kfd_device_init(struct kfd_dev *kfd,
 			 const struct kgd2kfd_shared_resources *gpu_resources)
 {
@@ -224,6 +238,17 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd,
 	kfd->vm_info.vmid_num_kfd = kfd->vm_info.last_vmid_kfd
 			- kfd->vm_info.first_vmid_kfd + 1;
 
+	/* Verify module parameters regarding the number of mapped processes */
+	if ((hws_max_conc_proc < 0)
+			|| (hws_max_conc_proc > kfd->vm_info.vmid_num_kfd)) {
+		dev_err(kfd_device,
+			"hws_max_conc_proc %d must be between 0 and %d, use %d instead\n",
+			hws_max_conc_proc, kfd->vm_info.vmid_num_kfd,
+			kfd->vm_info.vmid_num_kfd);
+		kfd->max_proc_per_quantum = kfd->vm_info.vmid_num_kfd;
+	} else
+		kfd->max_proc_per_quantum = hws_max_conc_proc;
+
 	/* calculate max size of mqds needed for queues */
 	size = max_num_of_queues_per_device *
 			kfd->device_info->mqd_size_aligned;
@@ -286,6 +311,8 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd,
 		goto device_iommu_pasid_error;
 	}
 
+	kfd_cwsr_init(kfd);
+
 	if (kfd_resume(kfd))
 		goto kfd_resume_error;
 

+ 102 - 10
drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c

@@ -149,8 +149,7 @@ static void deallocate_vmid(struct device_queue_manager *dqm,
 
 static int create_queue_nocpsch(struct device_queue_manager *dqm,
 				struct queue *q,
-				struct qcm_process_device *qpd,
-				int *allocated_vmid)
+				struct qcm_process_device *qpd)
 {
 	int retval;
 
@@ -170,9 +169,11 @@ static int create_queue_nocpsch(struct device_queue_manager *dqm,
 		if (retval)
 			goto out_unlock;
 	}
-	*allocated_vmid = qpd->vmid;
 	q->properties.vmid = qpd->vmid;
 
+	q->properties.tba_addr = qpd->tba_addr;
+	q->properties.tma_addr = qpd->tma_addr;
+
 	if (q->properties.type == KFD_QUEUE_TYPE_COMPUTE)
 		retval = create_compute_queue_nocpsch(dqm, q, qpd);
 	else if (q->properties.type == KFD_QUEUE_TYPE_SDMA)
@@ -181,10 +182,8 @@ static int create_queue_nocpsch(struct device_queue_manager *dqm,
 		retval = -EINVAL;
 
 	if (retval) {
-		if (list_empty(&qpd->queues_list)) {
+		if (list_empty(&qpd->queues_list))
 			deallocate_vmid(dqm, qpd, q);
-			*allocated_vmid = 0;
-		}
 		goto out_unlock;
 	}
 
@@ -809,16 +808,13 @@ static void destroy_kernel_queue_cpsch(struct device_queue_manager *dqm,
 }
 
 static int create_queue_cpsch(struct device_queue_manager *dqm, struct queue *q,
-			struct qcm_process_device *qpd, int *allocate_vmid)
+			struct qcm_process_device *qpd)
 {
 	int retval;
 	struct mqd_manager *mqd;
 
 	retval = 0;
 
-	if (allocate_vmid)
-		*allocate_vmid = 0;
-
 	mutex_lock(&dqm->lock);
 
 	if (dqm->total_queue_count >= max_num_of_queues_per_device) {
@@ -846,6 +842,9 @@ static int create_queue_cpsch(struct device_queue_manager *dqm, struct queue *q,
 	}
 
 	dqm->asic_ops.init_sdma_vm(dqm, q, qpd);
+
+	q->properties.tba_addr = qpd->tba_addr;
+	q->properties.tma_addr = qpd->tma_addr;
 	retval = mqd->init_mqd(mqd, &q->mqd, &q->mqd_mem_obj,
 				&q->gart_mqd_addr, &q->properties);
 	if (retval)
@@ -1110,6 +1109,26 @@ out:
 	return retval;
 }
 
+static int set_trap_handler(struct device_queue_manager *dqm,
+				struct qcm_process_device *qpd,
+				uint64_t tba_addr,
+				uint64_t tma_addr)
+{
+	uint64_t *tma;
+
+	if (dqm->dev->cwsr_enabled) {
+		/* Jump from CWSR trap handler to user trap */
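+		/* The first two 64-bit words of the TMA region hold the
+		 * user-mode TBA and TMA so the CWSR trap handler can
+		 * chain to the user trap handler.
+		 */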
+		tma = (uint64_t *)(qpd->cwsr_kaddr + KFD_CWSR_TMA_OFFSET);
+		tma[0] = tba_addr;
+		tma[1] = tma_addr;
+	} else {
+		qpd->tba_addr = tba_addr;
+		qpd->tma_addr = tma_addr;
+	}
+
+	return 0;
+}
+
 static int process_termination_nocpsch(struct device_queue_manager *dqm,
 		struct qcm_process_device *qpd)
 {
@@ -1241,6 +1260,7 @@ struct device_queue_manager *device_queue_manager_init(struct kfd_dev *dev)
 		dqm->ops.create_kernel_queue = create_kernel_queue_cpsch;
 		dqm->ops.destroy_kernel_queue = destroy_kernel_queue_cpsch;
 		dqm->ops.set_cache_memory_policy = set_cache_memory_policy;
+		dqm->ops.set_trap_handler = set_trap_handler;
 		dqm->ops.process_termination = process_termination_cpsch;
 		break;
 	case KFD_SCHED_POLICY_NO_HWS:
@@ -1256,6 +1276,7 @@ struct device_queue_manager *device_queue_manager_init(struct kfd_dev *dev)
 		dqm->ops.initialize = initialize_nocpsch;
 		dqm->ops.uninitialize = uninitialize;
 		dqm->ops.set_cache_memory_policy = set_cache_memory_policy;
+		dqm->ops.set_trap_handler = set_trap_handler;
 		dqm->ops.process_termination = process_termination_nocpsch;
 		break;
 	default:
@@ -1290,3 +1311,74 @@ void device_queue_manager_uninit(struct device_queue_manager *dqm)
 	dqm->ops.uninitialize(dqm);
 	kfree(dqm);
 }
+
+#if defined(CONFIG_DEBUG_FS)
+
+static void seq_reg_dump(struct seq_file *m,
+			 uint32_t (*dump)[2], uint32_t n_regs)
+{
+	uint32_t i, count;
+
+	for (i = 0, count = 0; i < n_regs; i++) {
+		if (count == 0 ||
+		    dump[i-1][0] + sizeof(uint32_t) != dump[i][0]) {
+			seq_printf(m, "%s    %08x: %08x",
+				   i ? "\n" : "",
+				   dump[i][0], dump[i][1]);
+			count = 7;
+		} else {
+			seq_printf(m, " %08x", dump[i][1]);
+			count--;
+		}
+	}
+
+	seq_puts(m, "\n");
+}
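+
+/* seq_reg_dump() prints at most eight register values per line, starting
+ * a new line whenever the register offsets stop being contiguous, e.g.
+ * with hypothetical values:
+ *
+ *	    00001000: 00000001 00000002 00000003
+ *	    00002000: 0000000a
+ */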
+
+int dqm_debugfs_hqds(struct seq_file *m, void *data)
+{
+	struct device_queue_manager *dqm = data;
+	uint32_t (*dump)[2], n_regs;
+	int pipe, queue;
+	int r = 0;
+
+	for (pipe = 0; pipe < get_pipes_per_mec(dqm); pipe++) {
+		int pipe_offset = pipe * get_queues_per_pipe(dqm);
+
+		for (queue = 0; queue < get_queues_per_pipe(dqm); queue++) {
+			if (!test_bit(pipe_offset + queue,
+				      dqm->dev->shared_resources.queue_bitmap))
+				continue;
+
+			r = dqm->dev->kfd2kgd->hqd_dump(
+				dqm->dev->kgd, pipe, queue, &dump, &n_regs);
+			if (r)
+				break;
+
+			seq_printf(m, "  CP Pipe %d, Queue %d\n",
+				  pipe, queue);
+			seq_reg_dump(m, dump, n_regs);
+
+			kfree(dump);
+		}
+	}
+
+	for (pipe = 0; pipe < CIK_SDMA_ENGINE_NUM; pipe++) {
+		for (queue = 0; queue < CIK_SDMA_QUEUES_PER_ENGINE; queue++) {
+			r = dqm->dev->kfd2kgd->hqd_sdma_dump(
+				dqm->dev->kgd, pipe, queue, &dump, &n_regs);
+			if (r)
+				break;
+
+			seq_printf(m, "  SDMA Engine %d, RLC %d\n",
+				  pipe, queue);
+			seq_reg_dump(m, dump, n_regs);
+
+			kfree(dump);
+		}
+	}
+
+	return r;
+}
+
+#endif

+ 6 - 2
drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h

@@ -84,8 +84,7 @@ struct device_process_node {
 struct device_queue_manager_ops {
 	int	(*create_queue)(struct device_queue_manager *dqm,
 				struct queue *q,
-				struct qcm_process_device *qpd,
-				int *allocate_vmid);
+				struct qcm_process_device *qpd);
 
 	int	(*destroy_queue)(struct device_queue_manager *dqm,
 				struct qcm_process_device *qpd,
@@ -123,6 +122,11 @@ struct device_queue_manager_ops {
 					   void __user *alternate_aperture_base,
 					   uint64_t alternate_aperture_size);
 
+	int	(*set_trap_handler)(struct device_queue_manager *dqm,
+				    struct qcm_process_device *qpd,
+				    uint64_t tba_addr,
+				    uint64_t tma_addr);
+
 	int (*process_termination)(struct device_queue_manager *dqm,
 			struct qcm_process_device *qpd);
 };

+ 4 - 5
drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c

@@ -116,8 +116,7 @@ int kfd_doorbell_init(struct kfd_dev *kfd)
 	pr_debug("doorbell aperture size  == 0x%08lX\n",
 			kfd->shared_resources.doorbell_aperture_size);
 
-	pr_debug("doorbell kernel address == 0x%08lX\n",
-			(uintptr_t)kfd->doorbell_kernel_ptr);
+	pr_debug("doorbell kernel address == %p\n", kfd->doorbell_kernel_ptr);
 
 	return 0;
 }
@@ -194,8 +193,8 @@ u32 __iomem *kfd_get_kernel_doorbell(struct kfd_dev *kfd,
 
 	pr_debug("Get kernel queue doorbell\n"
 			 "     doorbell offset   == 0x%08X\n"
-			 "     kernel address    == 0x%08lX\n",
-		*doorbell_off, (uintptr_t)(kfd->doorbell_kernel_ptr + inx));
+			 "     kernel address    == %p\n",
+		*doorbell_off, (kfd->doorbell_kernel_ptr + inx));
 
 	return kfd->doorbell_kernel_ptr + inx;
 }
@@ -215,7 +214,7 @@ inline void write_kernel_doorbell(u32 __iomem *db, u32 value)
 {
 	if (db) {
 		writel(value, db);
-		pr_debug("Writing %d to doorbell address 0x%p\n", value, db);
+		pr_debug("Writing %d to doorbell address %p\n", value, db);
 	}
 }
 

+ 7 - 7
drivers/gpu/drm/amd/amdkfd/kfd_events.c

@@ -441,7 +441,7 @@ void kfd_signal_event_interrupt(unsigned int pasid, uint32_t partial_id,
 	/*
 	 * Because we are called from arbitrary context (workqueue) as opposed
 	 * to process context, kfd_process could attempt to exit while we are
-	 * running so the lookup function returns a locked process.
+	 * running so the lookup function increments the process ref count.
 	 */
 	struct kfd_process *p = kfd_lookup_process_by_pasid(pasid);
 
@@ -493,7 +493,7 @@ void kfd_signal_event_interrupt(unsigned int pasid, uint32_t partial_id,
 	}
 
 	mutex_unlock(&p->event_mutex);
-	mutex_unlock(&p->mutex);
+	kfd_unref_process(p);
 }
 
 static struct kfd_event_waiter *alloc_event_waiters(uint32_t num_events)
@@ -847,7 +847,7 @@ void kfd_signal_iommu_event(struct kfd_dev *dev, unsigned int pasid,
 	/*
 	 * Because we are called from arbitrary context (workqueue) as opposed
 	 * to process context, kfd_process could attempt to exit while we are
-	 * running so the lookup function returns a locked process.
+	 * running so the lookup function increments the process ref count.
 	 */
 	struct kfd_process *p = kfd_lookup_process_by_pasid(pasid);
 	struct mm_struct *mm;
@@ -860,7 +860,7 @@ void kfd_signal_iommu_event(struct kfd_dev *dev, unsigned int pasid,
 	 */
 	mm = get_task_mm(p->lead_thread);
 	if (!mm) {
-		mutex_unlock(&p->mutex);
+		kfd_unref_process(p);
 		return; /* Process is exiting */
 	}
 
@@ -903,7 +903,7 @@ void kfd_signal_iommu_event(struct kfd_dev *dev, unsigned int pasid,
 			&memory_exception_data);
 
 	mutex_unlock(&p->event_mutex);
-	mutex_unlock(&p->mutex);
+	kfd_unref_process(p);
 }
 
 void kfd_signal_hw_exception_event(unsigned int pasid)
@@ -911,7 +911,7 @@ void kfd_signal_hw_exception_event(unsigned int pasid)
 	/*
 	 * Because we are called from arbitrary context (workqueue) as opposed
 	 * to process context, kfd_process could attempt to exit while we are
-	 * running so the lookup function returns a locked process.
+	 * running so the lookup function increments the process ref count.
 	 */
 	struct kfd_process *p = kfd_lookup_process_by_pasid(pasid);
 
@@ -924,5 +924,5 @@ void kfd_signal_hw_exception_event(unsigned int pasid)
 	lookup_events_by_type_and_signal(p, KFD_EVENT_TYPE_HW_EXCEPTION, NULL);
 
 	mutex_unlock(&p->event_mutex);
-	mutex_unlock(&p->mutex);
+	kfd_unref_process(p);
 }

+ 6 - 1
drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c

@@ -300,9 +300,14 @@ int kfd_init_apertures(struct kfd_process *process)
 	struct kfd_process_device *pdd;
 
 	/*Iterating over all devices*/
-	while ((dev = kfd_topology_enum_kfd_devices(id)) != NULL &&
+	while (kfd_topology_enum_kfd_devices(id, &dev) == 0 &&
 		id < NUM_OF_SUPPORTED_GPUS) {
 
+		if (!dev) {
+			id++; /* Skip non GPU devices */
+			continue;
+		}
+
 		pdd = kfd_create_process_device_data(dev, process);
 		if (!pdd) {
 			pr_err("Failed to create process device data\n");

+ 1 - 1
drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c

@@ -218,7 +218,7 @@ static int acquire_packet_buffer(struct kernel_queue *kq,
 	rptr = *kq->rptr_kernel;
 	wptr = *kq->wptr_kernel;
 	queue_address = (unsigned int *)kq->pq_kernel_addr;
-	queue_size_dwords = kq->queue->properties.queue_size / sizeof(uint32_t);
+	queue_size_dwords = kq->queue->properties.queue_size / 4;
 
 	pr_debug("rptr: %d\n", rptr);
 	pr_debug("wptr: %d\n", wptr);

+ 17 - 0
drivers/gpu/drm/amd/amdkfd/kfd_module.c

@@ -50,6 +50,15 @@ module_param(sched_policy, int, 0444);
 MODULE_PARM_DESC(sched_policy,
 	"Scheduling policy (0 = HWS (Default), 1 = HWS without over-subscription, 2 = Non-HWS (Used for debugging only)");
 
+int hws_max_conc_proc = 8;
+module_param(hws_max_conc_proc, int, 0444);
+MODULE_PARM_DESC(hws_max_conc_proc,
+	"Max # processes HWS can execute concurrently when sched_policy=0 (0 = no concurrency, #VMIDs for KFD = Maximum(default))");
+
+int cwsr_enable = 1;
+module_param(cwsr_enable, int, 0444);
+MODULE_PARM_DESC(cwsr_enable, "CWSR enable (0 = Off, 1 = On (Default))");
+
 int max_num_of_queues_per_device = KFD_MAX_NUM_OF_QUEUES_PER_DEVICE_DEFAULT;
 module_param(max_num_of_queues_per_device, int, 0444);
 MODULE_PARM_DESC(max_num_of_queues_per_device,
@@ -60,6 +69,11 @@ module_param(send_sigterm, int, 0444);
 MODULE_PARM_DESC(send_sigterm,
 	"Send sigterm to HSA process on unhandled exception (0 = disable, 1 = enable)");
 
+int ignore_crat;
+module_param(ignore_crat, int, 0444);
+MODULE_PARM_DESC(ignore_crat,
+	"Ignore CRAT table during KFD initialization (0 = use CRAT (default), 1 = ignore CRAT)");
+
 static int amdkfd_init_completed;
 
 int kgd2kfd_init(unsigned int interface_version,
@@ -114,6 +128,8 @@ static int __init kfd_module_init(void)
 
 	kfd_process_create_wq();
 
+	kfd_debugfs_init();
+
 	amdkfd_init_completed = 1;
 
 	dev_info(kfd_device, "Initialized module\n");
@@ -130,6 +146,7 @@ static void __exit kfd_module_exit(void)
 {
 	amdkfd_init_completed = 0;
 
+	kfd_debugfs_fini();
 	kfd_process_destroy_wq();
 	kfd_topology_shutdown();
 	kfd_chardev_exit();

+ 4 - 0
drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h

@@ -85,6 +85,10 @@ struct mqd_manager {
 				uint64_t queue_address,	uint32_t pipe_id,
 				uint32_t queue_id);
 
+#if defined(CONFIG_DEBUG_FS)
+	int	(*debugfs_show_mqd)(struct seq_file *m, void *data);
+#endif
+
 	struct mutex	mqd_mutex;
 	struct kfd_dev	*dev;
 };

+ 36 - 12
drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c

@@ -36,6 +36,11 @@ static inline struct cik_mqd *get_mqd(void *mqd)
 	return (struct cik_mqd *)mqd;
 }
 
+static inline struct cik_sdma_rlc_registers *get_sdma_mqd(void *mqd)
+{
+	return (struct cik_sdma_rlc_registers *)mqd;
+}
+
 static int init_mqd(struct mqd_manager *mm, void **mqd,
 		struct kfd_mem_obj **mqd_mem_obj, uint64_t *gart_addr,
 		struct queue_properties *q)
@@ -149,7 +154,7 @@ static int load_mqd(struct mqd_manager *mm, void *mqd, uint32_t pipe_id,
 {
 	/* AQL write pointer counts in 64B packets, PM4/CP counts in dwords. */
 	uint32_t wptr_shift = (p->format == KFD_QUEUE_FORMAT_AQL ? 4 : 0);
-	uint32_t wptr_mask = (uint32_t)((p->queue_size / sizeof(uint32_t)) - 1);
+	uint32_t wptr_mask = (uint32_t)((p->queue_size / 4) - 1);
 
 	return mm->dev->kfd2kgd->hqd_load(mm->dev->kgd, mqd, pipe_id, queue_id,
 					  (uint32_t __user *)p->write_ptr,
@@ -160,7 +165,9 @@ static int load_mqd_sdma(struct mqd_manager *mm, void *mqd,
 			 uint32_t pipe_id, uint32_t queue_id,
 			 struct queue_properties *p, struct mm_struct *mms)
 {
-	return mm->dev->kfd2kgd->hqd_sdma_load(mm->dev->kgd, mqd);
+	return mm->dev->kfd2kgd->hqd_sdma_load(mm->dev->kgd, mqd,
+					       (uint32_t __user *)p->write_ptr,
+					       mms);
 }
 
 static int update_mqd(struct mqd_manager *mm, void *mqd,
@@ -176,8 +183,7 @@ static int update_mqd(struct mqd_manager *mm, void *mqd,
 	 * Calculating queue size which is log base 2 of actual queue size -1
 	 * dwords and another -1 for ffs
 	 */
-	m->cp_hqd_pq_control |= ffs(q->queue_size / sizeof(unsigned int))
-								- 1 - 1;
+	m->cp_hqd_pq_control |= order_base_2(q->queue_size / 4) - 1;
 	m->cp_hqd_pq_base_lo = lower_32_bits((uint64_t)q->queue_address >> 8);
 	m->cp_hqd_pq_base_hi = upper_32_bits((uint64_t)q->queue_address >> 8);
 	m->cp_hqd_pq_rptr_report_addr_lo = lower_32_bits((uint64_t)q->read_ptr);
@@ -202,7 +208,7 @@ static int update_mqd_sdma(struct mqd_manager *mm, void *mqd,
 	struct cik_sdma_rlc_registers *m;
 
 	m = get_sdma_mqd(mqd);
-	m->sdma_rlc_rb_cntl = (ffs(q->queue_size / sizeof(unsigned int)) - 1)
+	m->sdma_rlc_rb_cntl = order_base_2(q->queue_size / 4)
 			<< SDMA0_RLC0_RB_CNTL__RB_SIZE__SHIFT |
 			q->vmid << SDMA0_RLC0_RB_CNTL__RB_VMID__SHIFT |
 			1 << SDMA0_RLC0_RB_CNTL__RPTR_WRITEBACK_ENABLE__SHIFT |
@@ -343,8 +349,7 @@ static int update_mqd_hiq(struct mqd_manager *mm, void *mqd,
 	 * Calculating queue size which is log base 2 of actual queue
 	 * size -1 dwords
 	 */
-	m->cp_hqd_pq_control |= ffs(q->queue_size / sizeof(unsigned int))
-								- 1 - 1;
+	m->cp_hqd_pq_control |= order_base_2(q->queue_size / 4) - 1;
 	m->cp_hqd_pq_base_lo = lower_32_bits((uint64_t)q->queue_address >> 8);
 	m->cp_hqd_pq_base_hi = upper_32_bits((uint64_t)q->queue_address >> 8);
 	m->cp_hqd_pq_rptr_report_addr_lo = lower_32_bits((uint64_t)q->read_ptr);
@@ -360,15 +365,25 @@ static int update_mqd_hiq(struct mqd_manager *mm, void *mqd,
 	return 0;
 }
 
-struct cik_sdma_rlc_registers *get_sdma_mqd(void *mqd)
-{
-	struct cik_sdma_rlc_registers *m;
+#if defined(CONFIG_DEBUG_FS)
 
-	m = (struct cik_sdma_rlc_registers *)mqd;
+static int debugfs_show_mqd(struct seq_file *m, void *data)
+{
+	seq_hex_dump(m, "    ", DUMP_PREFIX_OFFSET, 32, 4,
+		     data, sizeof(struct cik_mqd), false);
+	return 0;
+}
 
-	return m;
+static int debugfs_show_mqd_sdma(struct seq_file *m, void *data)
+{
+	seq_hex_dump(m, "    ", DUMP_PREFIX_OFFSET, 32, 4,
+		     data, sizeof(struct cik_sdma_rlc_registers), false);
+	return 0;
 }
 
+#endif
+
+
 struct mqd_manager *mqd_manager_init_cik(enum KFD_MQD_TYPE type,
 		struct kfd_dev *dev)
 {
@@ -392,6 +407,9 @@ struct mqd_manager *mqd_manager_init_cik(enum KFD_MQD_TYPE type,
 		mqd->update_mqd = update_mqd;
 		mqd->destroy_mqd = destroy_mqd;
 		mqd->is_occupied = is_occupied;
+#if defined(CONFIG_DEBUG_FS)
+		mqd->debugfs_show_mqd = debugfs_show_mqd;
+#endif
 		break;
 	case KFD_MQD_TYPE_HIQ:
 		mqd->init_mqd = init_mqd_hiq;
@@ -400,6 +418,9 @@ struct mqd_manager *mqd_manager_init_cik(enum KFD_MQD_TYPE type,
 		mqd->update_mqd = update_mqd_hiq;
 		mqd->destroy_mqd = destroy_mqd;
 		mqd->is_occupied = is_occupied;
+#if defined(CONFIG_DEBUG_FS)
+		mqd->debugfs_show_mqd = debugfs_show_mqd;
+#endif
 		break;
 	case KFD_MQD_TYPE_SDMA:
 		mqd->init_mqd = init_mqd_sdma;
@@ -408,6 +429,9 @@ struct mqd_manager *mqd_manager_init_cik(enum KFD_MQD_TYPE type,
 		mqd->update_mqd = update_mqd_sdma;
 		mqd->destroy_mqd = destroy_mqd_sdma;
 		mqd->is_occupied = is_occupied_sdma;
+#if defined(CONFIG_DEBUG_FS)
+		mqd->debugfs_show_mqd = debugfs_show_mqd_sdma;
+#endif
 		break;
 	default:
 		kfree(mqd);

+ 162 - 5
drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c

@@ -30,7 +30,7 @@
 #include "vi_structs.h"
 #include "gca/gfx_8_0_sh_mask.h"
 #include "gca/gfx_8_0_enum.h"
-
+#include "oss/oss_3_0_sh_mask.h"
 #define CP_MQD_CONTROL__PRIV_STATE__SHIFT 0x8
 
 static inline struct vi_mqd *get_mqd(void *mqd)
@@ -38,6 +38,11 @@ static inline struct vi_mqd *get_mqd(void *mqd)
 	return (struct vi_mqd *)mqd;
 }
 
+static inline struct vi_sdma_mqd *get_sdma_mqd(void *mqd)
+{
+	return (struct vi_sdma_mqd *)mqd;
+}
+
 static int init_mqd(struct mqd_manager *mm, void **mqd,
 			struct kfd_mem_obj **mqd_mem_obj, uint64_t *gart_addr,
 			struct queue_properties *q)
@@ -84,6 +89,28 @@ static int init_mqd(struct mqd_manager *mm, void **mqd,
 	if (q->format == KFD_QUEUE_FORMAT_AQL)
 		m->cp_hqd_iq_rptr = 1;
 
+	if (q->tba_addr) {
+		m->compute_tba_lo = lower_32_bits(q->tba_addr >> 8);
+		m->compute_tba_hi = upper_32_bits(q->tba_addr >> 8);
+		m->compute_tma_lo = lower_32_bits(q->tma_addr >> 8);
+		m->compute_tma_hi = upper_32_bits(q->tma_addr >> 8);
+		m->compute_pgm_rsrc2 |=
+			(1 << COMPUTE_PGM_RSRC2__TRAP_PRESENT__SHIFT);
+	}
+
+	if (mm->dev->cwsr_enabled && q->ctx_save_restore_area_address) {
+		m->cp_hqd_persistent_state |=
+			(1 << CP_HQD_PERSISTENT_STATE__QSWITCH_MODE__SHIFT);
+		m->cp_hqd_ctx_save_base_addr_lo =
+			lower_32_bits(q->ctx_save_restore_area_address);
+		m->cp_hqd_ctx_save_base_addr_hi =
+			upper_32_bits(q->ctx_save_restore_area_address);
+		m->cp_hqd_ctx_save_size = q->ctx_save_restore_area_size;
+		m->cp_hqd_cntl_stack_size = q->ctl_stack_size;
+		m->cp_hqd_cntl_stack_offset = q->ctl_stack_size;
+		m->cp_hqd_wg_state_offset = q->ctl_stack_size;
+	}
+
 	*mqd = m;
 	if (gart_addr)
 		*gart_addr = addr;
@@ -98,7 +125,7 @@ static int load_mqd(struct mqd_manager *mm, void *mqd,
 {
 	/* AQL write pointer counts in 64B packets, PM4/CP counts in dwords. */
 	uint32_t wptr_shift = (p->format == KFD_QUEUE_FORMAT_AQL ? 4 : 0);
-	uint32_t wptr_mask = (uint32_t)((p->queue_size / sizeof(uint32_t)) - 1);
+	uint32_t wptr_mask = (uint32_t)((p->queue_size / 4) - 1);
 
 	return mm->dev->kfd2kgd->hqd_load(mm->dev->kgd, mqd, pipe_id, queue_id,
 					  (uint32_t __user *)p->write_ptr,
@@ -116,8 +143,7 @@ static int __update_mqd(struct mqd_manager *mm, void *mqd,
 	m->cp_hqd_pq_control = 5 << CP_HQD_PQ_CONTROL__RPTR_BLOCK_SIZE__SHIFT |
 			atc_bit << CP_HQD_PQ_CONTROL__PQ_ATC__SHIFT |
 			mtype << CP_HQD_PQ_CONTROL__MTYPE__SHIFT;
-	m->cp_hqd_pq_control |=
-			ffs(q->queue_size / sizeof(unsigned int)) - 1 - 1;
+	m->cp_hqd_pq_control |=	order_base_2(q->queue_size / 4) - 1;
 	pr_debug("cp_hqd_pq_control 0x%x\n", m->cp_hqd_pq_control);
 
 	m->cp_hqd_pq_base_lo = lower_32_bits((uint64_t)q->queue_address >> 8);
@@ -147,7 +173,7 @@ static int __update_mqd(struct mqd_manager *mm, void *mqd,
 	 * is safe, giving a maximum field value of 0xA.
 	 */
 	m->cp_hqd_eop_control |= min(0xA,
-		ffs(q->eop_ring_buffer_size / sizeof(unsigned int)) - 1 - 1);
+		order_base_2(q->eop_ring_buffer_size / 4) - 1);
 	m->cp_hqd_eop_base_addr_lo =
 			lower_32_bits(q->eop_ring_buffer_address >> 8);
 	m->cp_hqd_eop_base_addr_hi =
@@ -163,6 +189,11 @@ static int __update_mqd(struct mqd_manager *mm, void *mqd,
 				2 << CP_HQD_PQ_CONTROL__SLOT_BASED_WPTR__SHIFT;
 	}
 
+	if (mm->dev->cwsr_enabled && q->ctx_save_restore_area_address)
+		m->cp_hqd_ctx_save_control =
+			atc_bit << CP_HQD_CTX_SAVE_CONTROL__ATC__SHIFT |
+			mtype << CP_HQD_CTX_SAVE_CONTROL__MTYPE__SHIFT;
+
 	q->is_active = (q->queue_size > 0 &&
 			q->queue_address != 0 &&
 			q->queue_percent > 0);
@@ -234,6 +265,117 @@ static int update_mqd_hiq(struct mqd_manager *mm, void *mqd,
 	return retval;
 }
 
+static int init_mqd_sdma(struct mqd_manager *mm, void **mqd,
+		struct kfd_mem_obj **mqd_mem_obj, uint64_t *gart_addr,
+		struct queue_properties *q)
+{
+	int retval;
+	struct vi_sdma_mqd *m;
+
+
+			sizeof(struct vi_sdma_mqd),
+			mqd_mem_obj);
+
+	if (retval != 0)
+		return -ENOMEM;
+
+	m = (struct vi_sdma_mqd *) (*mqd_mem_obj)->cpu_ptr;
+
+	memset(m, 0, sizeof(struct vi_sdma_mqd));
+
+	*mqd = m;
+	if (gart_addr != NULL)
+		*gart_addr = (*mqd_mem_obj)->gpu_addr;
+
+	retval = mm->update_mqd(mm, m, q);
+
+	return retval;
+}
+
+static void uninit_mqd_sdma(struct mqd_manager *mm, void *mqd,
+		struct kfd_mem_obj *mqd_mem_obj)
+{
+	kfd_gtt_sa_free(mm->dev, mqd_mem_obj);
+}
+
+static int load_mqd_sdma(struct mqd_manager *mm, void *mqd,
+		uint32_t pipe_id, uint32_t queue_id,
+		struct queue_properties *p, struct mm_struct *mms)
+{
+	return mm->dev->kfd2kgd->hqd_sdma_load(mm->dev->kgd, mqd,
+					       (uint32_t __user *)p->write_ptr,
+					       mms);
+}
+
+static int update_mqd_sdma(struct mqd_manager *mm, void *mqd,
+		struct queue_properties *q)
+{
+	struct vi_sdma_mqd *m;
+
+	m = get_sdma_mqd(mqd);
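+	/* RB_SIZE is the ring size in log2 dwords: e.g. a hypothetical
+	 * 4 KB ring is 1024 dwords, so order_base_2(1024) = 10.
+	 */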
+	m->sdmax_rlcx_rb_cntl = order_base_2(q->queue_size / 4)
+		<< SDMA0_RLC0_RB_CNTL__RB_SIZE__SHIFT |
+		q->vmid << SDMA0_RLC0_RB_CNTL__RB_VMID__SHIFT |
+		1 << SDMA0_RLC0_RB_CNTL__RPTR_WRITEBACK_ENABLE__SHIFT |
+		6 << SDMA0_RLC0_RB_CNTL__RPTR_WRITEBACK_TIMER__SHIFT;
+
+	m->sdmax_rlcx_rb_base = lower_32_bits(q->queue_address >> 8);
+	m->sdmax_rlcx_rb_base_hi = upper_32_bits(q->queue_address >> 8);
+	m->sdmax_rlcx_rb_rptr_addr_lo = lower_32_bits((uint64_t)q->read_ptr);
+	m->sdmax_rlcx_rb_rptr_addr_hi = upper_32_bits((uint64_t)q->read_ptr);
+	m->sdmax_rlcx_doorbell =
+		q->doorbell_off << SDMA0_RLC0_DOORBELL__OFFSET__SHIFT;
+
+	m->sdmax_rlcx_virtual_addr = q->sdma_vm_addr;
+
+	m->sdma_engine_id = q->sdma_engine_id;
+	m->sdma_queue_id = q->sdma_queue_id;
+
+	q->is_active = (q->queue_size > 0 &&
+			q->queue_address != 0 &&
+			q->queue_percent > 0);
+
+	return 0;
+}
+
+/*
+ * The preempt type is ignored here because there is only one way
+ * to preempt an SDMA queue.
+ */
+static int destroy_mqd_sdma(struct mqd_manager *mm, void *mqd,
+		enum kfd_preempt_type type,
+		unsigned int timeout, uint32_t pipe_id,
+		uint32_t queue_id)
+{
+	return mm->dev->kfd2kgd->hqd_sdma_destroy(mm->dev->kgd, mqd, timeout);
+}
+
+static bool is_occupied_sdma(struct mqd_manager *mm, void *mqd,
+		uint64_t queue_address, uint32_t pipe_id,
+		uint32_t queue_id)
+{
+	return mm->dev->kfd2kgd->hqd_sdma_is_occupied(mm->dev->kgd, mqd);
+}
+
+#if defined(CONFIG_DEBUG_FS)
+
+static int debugfs_show_mqd(struct seq_file *m, void *data)
+{
+	seq_hex_dump(m, "    ", DUMP_PREFIX_OFFSET, 32, 4,
+		     data, sizeof(struct vi_mqd), false);
+	return 0;
+}
+
+static int debugfs_show_mqd_sdma(struct seq_file *m, void *data)
+{
+	seq_hex_dump(m, "    ", DUMP_PREFIX_OFFSET, 32, 4,
+		     data, sizeof(struct vi_sdma_mqd), false);
+	return 0;
+}
+
+#endif
+
 struct mqd_manager *mqd_manager_init_vi(enum KFD_MQD_TYPE type,
 		struct kfd_dev *dev)
 {
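
A note on the RB_SIZE encoding in update_mqd_sdma() above: the ring size is programmed as the log2 of the ring's size in dwords. A standalone sketch (plain user-space C, with the kernel's order_base_2() open-coded) of the field value for a 1 MiB ring:

    #include <stdio.h>
    #include <stdint.h>

    /* Open-coded equivalent of the kernel's order_base_2() for this
     * sketch: ceil(log2(n)).
     */
    static unsigned int order_base_2(uint64_t n)
    {
            unsigned int order = 0;

            while ((1ULL << order) < n)
                    order++;
            return order;
    }

    int main(void)
    {
            uint64_t queue_size = 1 << 20;  /* 1 MiB ring buffer, in bytes */

            /* queue_size / 4 converts bytes to dwords; the RB_SIZE field
             * of SDMA0_RLC0_RB_CNTL takes the log2 of that count, so a
             * 1 MiB ring yields 18 (2^18 = 262144 dwords).
             */
            printf("RB_SIZE = %u\n", order_base_2(queue_size / 4));
            return 0;
    }
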
@@ -257,6 +399,9 @@ struct mqd_manager *mqd_manager_init_vi(enum KFD_MQD_TYPE type,
 		mqd->update_mqd = update_mqd;
 		mqd->destroy_mqd = destroy_mqd;
 		mqd->is_occupied = is_occupied;
+#if defined(CONFIG_DEBUG_FS)
+		mqd->debugfs_show_mqd = debugfs_show_mqd;
+#endif
 		break;
 	case KFD_MQD_TYPE_HIQ:
 		mqd->init_mqd = init_mqd_hiq;
@@ -265,8 +410,20 @@ struct mqd_manager *mqd_manager_init_vi(enum KFD_MQD_TYPE type,
 		mqd->update_mqd = update_mqd_hiq;
 		mqd->destroy_mqd = destroy_mqd;
 		mqd->is_occupied = is_occupied;
+#if defined(CONFIG_DEBUG_FS)
+		mqd->debugfs_show_mqd = debugfs_show_mqd;
+#endif
 		break;
 	case KFD_MQD_TYPE_SDMA:
+		mqd->init_mqd = init_mqd_sdma;
+		mqd->uninit_mqd = uninit_mqd_sdma;
+		mqd->load_mqd = load_mqd_sdma;
+		mqd->update_mqd = update_mqd_sdma;
+		mqd->destroy_mqd = destroy_mqd_sdma;
+		mqd->is_occupied = is_occupied_sdma;
+#if defined(CONFIG_DEBUG_FS)
+		mqd->debugfs_show_mqd = debugfs_show_mqd_sdma;
+#endif
 		break;
 	default:
 		kfree(mqd);

+ 55 - 4
drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c

@@ -45,7 +45,7 @@ static unsigned int build_pm4_header(unsigned int opcode, size_t packet_size)
 
 	header.u32All = 0;
 	header.opcode = opcode;
-	header.count = packet_size/sizeof(uint32_t) - 2;
+	header.count = packet_size / 4 - 2;
 	header.type = PM4_TYPE_3;
 
 	return header.u32All;
@@ -55,15 +55,27 @@ static void pm_calc_rlib_size(struct packet_manager *pm,
 				unsigned int *rlib_size,
 				bool *over_subscription)
 {
-	unsigned int process_count, queue_count;
+	unsigned int process_count, queue_count, compute_queue_count;
 	unsigned int map_queue_size;
+	unsigned int max_proc_per_quantum = 1;
+	struct kfd_dev *dev = pm->dqm->dev;
 
 	process_count = pm->dqm->processes_count;
 	queue_count = pm->dqm->queue_count;
+	compute_queue_count = queue_count - pm->dqm->sdma_queue_count;
 
-	/* check if there is over subscription*/
+	/* Check if there is over-subscription.
+	 * Note: the arbitration between the number of VMIDs and
+	 * hws_max_conc_proc has been done in
+	 * kgd2kfd_device_init().
+	 */
 	*over_subscription = false;
-	if ((process_count > 1) || queue_count > get_queues_num(pm->dqm)) {
+
+	if (dev->max_proc_per_quantum > 1)
+		max_proc_per_quantum = dev->max_proc_per_quantum;
+
+	if ((process_count > max_proc_per_quantum) ||
+	    compute_queue_count > get_queues_num(pm->dqm)) {
 		*over_subscription = true;
 		pr_debug("Over subscribed runlist\n");
 	}
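
To make the check above concrete, here is a standalone sketch with illustrative numbers (the queue-slot count stands in for get_queues_num(), which depends on the hardware):

    #include <stdbool.h>
    #include <stdio.h>

    int main(void)
    {
            unsigned int process_count = 3, compute_queue_count = 24;
            unsigned int max_proc_per_quantum = 2;  /* hws_max_conc_proc */
            unsigned int hw_queue_slots = 96;       /* get_queues_num() stand-in */
            bool over_subscription;

            over_subscription = (process_count > max_proc_per_quantum) ||
                                (compute_queue_count > hw_queue_slots);
            printf("over-subscribed runlist: %s\n",
                   over_subscription ? "yes" : "no");  /* yes: 3 > 2 */
            return 0;
    }

With the module default (hws_max_conc_proc unset, so max_proc_per_quantum stays 1), any second process already forces a chained, over-subscribed runlist.
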
@@ -116,10 +128,24 @@ static int pm_create_runlist(struct packet_manager *pm, uint32_t *buffer,
 			uint64_t ib, size_t ib_size_in_dwords, bool chain)
 {
 	struct pm4_mes_runlist *packet;
+	int concurrent_proc_cnt = 0;
+	struct kfd_dev *kfd = pm->dqm->dev;
 
 	if (WARN_ON(!ib))
 		return -EFAULT;
 
+	/* Determine the number of processes to map together to HW:
+	 * it cannot exceed the number of VMIDs available to the
+	 * scheduler, and it is the smaller of the number of processes
+	 * in the runlist and the kfd module parameter
+	 * hws_max_conc_proc.
+	 * Note: the arbitration between the number of VMIDs and
+	 * hws_max_conc_proc has been done in
+	 * kgd2kfd_device_init().
+	 */
+	concurrent_proc_cnt = min(pm->dqm->processes_count,
+			kfd->max_proc_per_quantum);
+
 	packet = (struct pm4_mes_runlist *)buffer;
 
 	memset(buffer, 0, sizeof(struct pm4_mes_runlist));
@@ -130,6 +156,7 @@ static int pm_create_runlist(struct packet_manager *pm, uint32_t *buffer,
 	packet->bitfields4.chain = chain ? 1 : 0;
 	packet->bitfields4.offload_polling = 0;
 	packet->bitfields4.valid = 1;
+	packet->bitfields4.process_cnt = concurrent_proc_cnt;
 	packet->ordinal2 = lower_32_bits(ib);
 	packet->bitfields3.ib_base_hi = upper_32_bits(ib);
 
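
A small sketch of the clamp above (values are illustrative):

    #include <stdio.h>

    #define min(a, b) ((a) < (b) ? (a) : (b))

    int main(void)
    {
            int processes_count = 5;        /* processes in the runlist */
            int max_proc_per_quantum = 4;   /* from hws_max_conc_proc */

            /* The RUN_LIST packet's process_cnt field tells the HW
             * scheduler how many processes to map concurrently in one
             * quantum; the fifth process here waits for a later pass
             * over the chained runlist.
             */
            printf("process_cnt = %d\n",
                   min(processes_count, max_proc_per_quantum));
            return 0;
    }
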
@@ -251,6 +278,7 @@ static int pm_create_runlist_ib(struct packet_manager *pm,
 		return retval;
 
 	*rl_size_bytes = alloc_size_bytes;
+	pm->ib_size_bytes = alloc_size_bytes;
 
 	pr_debug("Building runlist ib process count: %d queues count %d\n",
 		pm->dqm->processes_count, pm->dqm->queue_count);
@@ -564,3 +592,26 @@ void pm_release_ib(struct packet_manager *pm)
 	}
 	mutex_unlock(&pm->lock);
 }
+
+#if defined(CONFIG_DEBUG_FS)
+
+int pm_debugfs_runlist(struct seq_file *m, void *data)
+{
+	struct packet_manager *pm = data;
+
+	mutex_lock(&pm->lock);
+
+	if (!pm->allocated) {
+		seq_puts(m, "  No active runlist\n");
+		goto out;
+	}
+
+	seq_hex_dump(m, "  ", DUMP_PREFIX_OFFSET, 32, 4,
+		     pm->ib_buffer_obj->cpu_ptr, pm->ib_size_bytes, false);
+
+out:
+	mutex_unlock(&pm->lock);
+	return 0;
+}
+
+#endif
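
For reference, seq_hex_dump() with DUMP_PREFIX_OFFSET, a row size of 32 and a group size of 4 (used by this runlist dump and by the MQD dumps earlier) prints 32 bytes per line as eight native-endian 32-bit words behind an offset prefix. Illustrative output only; the word values below are made up:

      00000000: c0023a00 80000000 00000000 00000001 c0023e00 00000000 00000000 00000000
      00000020: 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000
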

+ 1 - 1
drivers/gpu/drm/amd/amdkfd/kfd_pasid.c

@@ -59,7 +59,7 @@ unsigned int kfd_pasid_alloc(void)
 		struct kfd_dev *dev = NULL;
 		unsigned int i = 0;
 
-		while ((dev = kfd_topology_enum_kfd_devices(i)) != NULL) {
+		while ((kfd_topology_enum_kfd_devices(i, &dev)) == 0) {
 			if (dev && dev->kfd2kgd) {
 				kfd2kgd = dev->kfd2kgd;
 				break;

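
The enumeration helper's new signature reports end-of-list through its return value and hands the device back via an out-parameter; judging by the check above, *kdev may legitimately be NULL for CPU-only topology nodes. A usage fragment of the new convention (kernel context, kfd_priv.h assumed):

	struct kfd_dev *dev;
	uint8_t i = 0;

	/* A return of 0 means idx named a valid topology node; CPU-only
	 * nodes yield dev == NULL, so callers must check it.
	 */
	while (kfd_topology_enum_kfd_devices(i, &dev) == 0) {
		if (dev)
			pr_debug("node %u has a GPU\n", i);
		i++;
	}
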
+ 75 - 4
drivers/gpu/drm/amd/amdkfd/kfd_priv.h

@@ -33,6 +33,8 @@
 #include <linux/kfd_ioctl.h>
 #include <linux/idr.h>
 #include <linux/kfifo.h>
+#include <linux/seq_file.h>
+#include <linux/kref.h>
 #include <kgd_kfd_interface.h>
 
 #include "amd_shared.h"
@@ -41,6 +43,7 @@
 
 #define KFD_MMAP_DOORBELL_MASK 0x8000000000000
 #define KFD_MMAP_EVENTS_MASK 0x4000000000000
+#define KFD_MMAP_RESERVED_MEM_MASK 0x2000000000000
 
 /*
  * When working with cp scheduler we should assign the HIQ manually or via
@@ -62,6 +65,15 @@
 #define KFD_MAX_NUM_OF_PROCESSES 512
 #define KFD_MAX_NUM_OF_QUEUES_PER_PROCESS 1024
 
+/*
+ * Size of the per-process TBA+TMA buffer: 2 pages
+ *
+ * The first page is the TBA used for the CWSR ISA code. The second
+ * page is used as TMA for daisy-chaining a user-mode trap handler.
+ */
+#define KFD_CWSR_TBA_TMA_SIZE (PAGE_SIZE * 2)
+#define KFD_CWSR_TMA_OFFSET PAGE_SIZE
+
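
The per-device layout implied by these constants (a sketch; the addresses are populated by kfd_process_init_cwsr() in kfd_process.c below):

	qpd->tba_addr --> +----------------------------+
	                  | page 0: TBA - CWSR trap    |
	                  |         handler ISA        |
	qpd->tma_addr --> +----------------------------+  (= tba_addr + KFD_CWSR_TMA_OFFSET)
	                  | page 1: TMA - trap memory  |
	                  |         arguments          |
	                  +----------------------------+
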
 /*
  * Kernel module parameter to specify maximum number of supported queues per
  * device
@@ -78,12 +90,26 @@ extern int max_num_of_queues_per_device;
 /* Kernel module parameter to specify the scheduling policy */
 extern int sched_policy;
 
+/*
+ * Kernel module parameter to specify the maximum process
+ * number per HW scheduler
+ */
+extern int hws_max_conc_proc;
+
+extern int cwsr_enable;
+
 /*
  * Kernel module parameter to specify whether to send sigterm to HSA process on
  * unhandled exception
  */
 extern int send_sigterm;
 
+/*
+ * Ignore the CRAT table during KFD initialization; this can be used to
+ * work around broken CRAT tables on some AMD systems
+ */
+extern int ignore_crat;
+
 /**
  * enum kfd_sched_policy
  *
@@ -131,6 +157,7 @@ struct kfd_device_info {
 	size_t ih_ring_entry_size;
 	uint8_t num_of_watch_points;
 	uint16_t mqd_size_aligned;
+	bool supports_cwsr;
 };
 
 struct kfd_mem_obj {
@@ -200,6 +227,14 @@ struct kfd_dev {
 
 	/* Debug manager */
 	struct kfd_dbgmgr           *dbgmgr;
+
+	/* Maximum process number mapped to HW scheduler */
+	unsigned int max_proc_per_quantum;
+
+	/* CWSR */
+	bool cwsr_enabled;
+	const void *cwsr_isa;
+	unsigned int cwsr_isa_size;
 };
 
 /* KGD2KFD callbacks */
@@ -332,6 +367,9 @@ struct queue_properties {
 	uint32_t eop_ring_buffer_size;
 	uint64_t ctx_save_restore_area_address;
 	uint32_t ctx_save_restore_area_size;
+	uint32_t ctl_stack_size;
+	uint64_t tba_addr;
+	uint64_t tma_addr;
 };
 
 /**
@@ -439,6 +477,11 @@ struct qcm_process_device {
 	uint32_t num_gws;
 	uint32_t num_oac;
 	uint32_t sh_hidden_private_base;
+
+	/* CWSR memory */
+	void *cwsr_kaddr;
+	uint64_t tba_addr;
+	uint64_t tma_addr;
 };
 
 
@@ -501,6 +544,9 @@ struct kfd_process {
 	 */
 	void *mm;
 
+	struct kref ref;
+	struct work_struct release_work;
+
 	struct mutex mutex;
 
 	/*
@@ -563,9 +609,10 @@ struct amdkfd_ioctl_desc {
 
 void kfd_process_create_wq(void);
 void kfd_process_destroy_wq(void);
-struct kfd_process *kfd_create_process(const struct task_struct *);
+struct kfd_process *kfd_create_process(struct file *filep);
 struct kfd_process *kfd_get_process(const struct task_struct *);
 struct kfd_process *kfd_lookup_process_by_pasid(unsigned int pasid);
+void kfd_unref_process(struct kfd_process *p);
 
 struct kfd_process_device *kfd_bind_process_to_device(struct kfd_dev *dev,
 						struct kfd_process *p);
@@ -577,6 +624,9 @@ struct kfd_process_device *kfd_get_process_device_data(struct kfd_dev *dev,
 struct kfd_process_device *kfd_create_process_device_data(struct kfd_dev *dev,
 							struct kfd_process *p);
 
+int kfd_reserved_mem_mmap(struct kfd_process *process,
+			  struct vm_area_struct *vma);
+
 /* Process device data iterator */
 struct kfd_process_device *kfd_get_first_process_device_data(
 							struct kfd_process *p);
@@ -624,9 +674,12 @@ int kfd_topology_init(void);
 void kfd_topology_shutdown(void);
 int kfd_topology_add_device(struct kfd_dev *gpu);
 int kfd_topology_remove_device(struct kfd_dev *gpu);
+struct kfd_topology_device *kfd_topology_device_by_proximity_domain(
+						uint32_t proximity_domain);
 struct kfd_dev *kfd_device_by_id(uint32_t gpu_id);
 struct kfd_dev *kfd_device_by_pci_dev(const struct pci_dev *pdev);
-struct kfd_dev *kfd_topology_enum_kfd_devices(uint8_t idx);
+int kfd_topology_enum_kfd_devices(uint8_t idx, struct kfd_dev **kdev);
+int kfd_numa_node_to_apic_id(int numa_node_id);
 
 /* Interrupts */
 int kfd_interrupt_init(struct kfd_dev *dev);
@@ -643,8 +696,6 @@ int kgd2kfd_resume(struct kfd_dev *kfd);
 int kfd_init_apertures(struct kfd_process *process);
 
 /* Queue Context Management */
-struct cik_sdma_rlc_registers *get_sdma_mqd(void *mqd);
-
 int init_queue(struct queue **q, const struct queue_properties *properties);
 void uninit_queue(struct queue *q);
 void print_queue_properties(struct queue_properties *q);
@@ -699,6 +750,7 @@ struct packet_manager {
 	struct mutex lock;
 	bool allocated;
 	struct kfd_mem_obj *ib_buffer_obj;
+	unsigned int ib_size_bytes;
 };
 
 int pm_init(struct packet_manager *pm, struct device_queue_manager *dqm);
@@ -745,4 +797,23 @@ int kfd_event_destroy(struct kfd_process *p, uint32_t event_id);
 
 int dbgdev_wave_reset_wavefronts(struct kfd_dev *dev, struct kfd_process *p);
 
+/* Debugfs */
+#if defined(CONFIG_DEBUG_FS)
+
+void kfd_debugfs_init(void);
+void kfd_debugfs_fini(void);
+int kfd_debugfs_mqds_by_process(struct seq_file *m, void *data);
+int pqm_debugfs_mqds(struct seq_file *m, void *data);
+int kfd_debugfs_hqds_by_device(struct seq_file *m, void *data);
+int dqm_debugfs_hqds(struct seq_file *m, void *data);
+int kfd_debugfs_rls_by_device(struct seq_file *m, void *data);
+int pm_debugfs_runlist(struct seq_file *m, void *data);
+
+#else
+
+static inline void kfd_debugfs_init(void) {}
+static inline void kfd_debugfs_fini(void) {}
+
+#endif
+
 #endif
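
These KFD_MMAP_*_MASK values tag the upper bits of the mmap page offset so that a single mmap entry point can route to doorbells, events, or the new reserved (CWSR) memory. A simplified sketch of the dispatch they enable; the real handler lives in kfd_chardev.c (changed elsewhere in this series), so treat the details as an approximation:

    static int kfd_mmap(struct file *filp, struct vm_area_struct *vma)
    {
            struct kfd_process *process = kfd_get_process(current);

            if (IS_ERR(process))
                    return PTR_ERR(process);

            /* The high bits of vm_pgoff select the mapping type; the low
             * bits carry type-specific data such as the device id.
             */
            if ((vma->vm_pgoff & KFD_MMAP_RESERVED_MEM_MASK) ==
                KFD_MMAP_RESERVED_MEM_MASK) {
                    vma->vm_pgoff ^= KFD_MMAP_RESERVED_MEM_MASK;
                    return kfd_reserved_mem_mmap(process, vma);
            }
            /* ... doorbell and event cases handled analogously ... */
            return -EFAULT;
    }
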

+ 190 - 69
drivers/gpu/drm/amd/amdkfd/kfd_process.c

@@ -24,10 +24,12 @@
 #include <linux/log2.h>
 #include <linux/sched.h>
 #include <linux/sched/mm.h>
+#include <linux/sched/task.h>
 #include <linux/slab.h>
 #include <linux/amd-iommu.h>
 #include <linux/notifier.h>
 #include <linux/compat.h>
+#include <linux/mman.h>
 
 struct mm_struct;
 
@@ -46,13 +48,12 @@ DEFINE_STATIC_SRCU(kfd_processes_srcu);
 
 static struct workqueue_struct *kfd_process_wq;
 
-struct kfd_process_release_work {
-	struct work_struct kfd_work;
-	struct kfd_process *p;
-};
-
 static struct kfd_process *find_process(const struct task_struct *thread);
-static struct kfd_process *create_process(const struct task_struct *thread);
+static void kfd_process_ref_release(struct kref *ref);
+static struct kfd_process *create_process(const struct task_struct *thread,
+					struct file *filep);
+static int kfd_process_init_cwsr(struct kfd_process *p, struct file *filep);
+
 
 void kfd_process_create_wq(void)
 {
@@ -68,9 +69,10 @@ void kfd_process_destroy_wq(void)
 	}
 }
 
-struct kfd_process *kfd_create_process(const struct task_struct *thread)
+struct kfd_process *kfd_create_process(struct file *filep)
 {
 	struct kfd_process *process;
+	struct task_struct *thread = current;
 
 	if (!thread->mm)
 		return ERR_PTR(-EINVAL);
@@ -79,9 +81,6 @@ struct kfd_process *kfd_create_process(const struct task_struct *thread)
 	if (thread->group_leader->mm != thread->mm)
 		return ERR_PTR(-EINVAL);
 
-	/* Take mmap_sem because we call __mmu_notifier_register inside */
-	down_write(&thread->mm->mmap_sem);
-
 	/*
 	 * take kfd processes mutex before starting of process creation
 	 * so there won't be a case where two threads of the same process
@@ -93,14 +92,11 @@ struct kfd_process *kfd_create_process(const struct task_struct *thread)
 	process = find_process(thread);
 	if (process)
 		pr_debug("Process already found\n");
-
-	if (!process)
-		process = create_process(thread);
+	else
+		process = create_process(thread, filep);
 
 	mutex_unlock(&kfd_processes_mutex);
 
-	up_write(&thread->mm->mmap_sem);
-
 	return process;
 }
 
@@ -144,63 +140,75 @@ static struct kfd_process *find_process(const struct task_struct *thread)
 	return p;
 }
 
-static void kfd_process_wq_release(struct work_struct *work)
+void kfd_unref_process(struct kfd_process *p)
+{
+	kref_put(&p->ref, kfd_process_ref_release);
+}
+
+static void kfd_process_destroy_pdds(struct kfd_process *p)
 {
-	struct kfd_process_release_work *my_work;
 	struct kfd_process_device *pdd, *temp;
-	struct kfd_process *p;
 
-	my_work = (struct kfd_process_release_work *) work;
+	list_for_each_entry_safe(pdd, temp, &p->per_device_data,
+				 per_device_list) {
+		pr_debug("Releasing pdd (topology id %d) for process (pasid %d)\n",
+				pdd->dev->id, p->pasid);
 
-	p = my_work->p;
+		list_del(&pdd->per_device_list);
 
-	pr_debug("Releasing process (pasid %d) in workqueue\n",
-			p->pasid);
+		if (pdd->qpd.cwsr_kaddr)
+			free_pages((unsigned long)pdd->qpd.cwsr_kaddr,
+				get_order(KFD_CWSR_TBA_TMA_SIZE));
 
-	mutex_lock(&p->mutex);
+		kfree(pdd);
+	}
+}
 
-	list_for_each_entry_safe(pdd, temp, &p->per_device_data,
-							per_device_list) {
-		pr_debug("Releasing pdd (topology id %d) for process (pasid %d) in workqueue\n",
-				pdd->dev->id, p->pasid);
+/* No process locking is needed in this function, because the process
+ * is not findable any more. We must assume that no other thread is
+ * using it any more, otherwise we couldn't safely free the process
+ * structure in the end.
+ */
+static void kfd_process_wq_release(struct work_struct *work)
+{
+	struct kfd_process *p = container_of(work, struct kfd_process,
+					     release_work);
+	struct kfd_process_device *pdd;
+
+	pr_debug("Releasing process (pasid %d) in workqueue\n", p->pasid);
 
+	list_for_each_entry(pdd, &p->per_device_data, per_device_list) {
 		if (pdd->bound == PDD_BOUND)
 			amd_iommu_unbind_pasid(pdd->dev->pdev, p->pasid);
-
-		list_del(&pdd->per_device_list);
-		kfree(pdd);
 	}
 
+	kfd_process_destroy_pdds(p);
+
 	kfd_event_free_process(p);
 
 	kfd_pasid_free(p->pasid);
 	kfd_free_process_doorbells(p);
 
-	mutex_unlock(&p->mutex);
-
 	mutex_destroy(&p->mutex);
 
-	kfree(p);
+	put_task_struct(p->lead_thread);
 
-	kfree(work);
+	kfree(p);
 }
 
-static void kfd_process_destroy_delayed(struct rcu_head *rcu)
+static void kfd_process_ref_release(struct kref *ref)
 {
-	struct kfd_process_release_work *work;
-	struct kfd_process *p;
+	struct kfd_process *p = container_of(ref, struct kfd_process, ref);
 
-	p = container_of(rcu, struct kfd_process, rcu);
-
-	mmdrop(p->mm);
+	INIT_WORK(&p->release_work, kfd_process_wq_release);
+	queue_work(kfd_process_wq, &p->release_work);
+}
 
-	work = kmalloc(sizeof(struct kfd_process_release_work), GFP_ATOMIC);
+static void kfd_process_destroy_delayed(struct rcu_head *rcu)
+{
+	struct kfd_process *p = container_of(rcu, struct kfd_process, rcu);
 
-	if (work) {
-		INIT_WORK((struct work_struct *) work, kfd_process_wq_release);
-		work->p = p;
-		queue_work(kfd_process_wq, (struct work_struct *) work);
-	}
+	kfd_unref_process(p);
 }
 
 static void kfd_process_notifier_release(struct mmu_notifier *mn,
@@ -244,15 +252,12 @@ static void kfd_process_notifier_release(struct mmu_notifier *mn,
 	kfd_process_dequeue_from_all_devices(p);
 	pqm_uninit(&p->pqm);
 
+	/* Indicate to other users that MM is no longer valid */
+	p->mm = NULL;
+
 	mutex_unlock(&p->mutex);
 
-	/*
-	 * Because we drop mm_count inside kfd_process_destroy_delayed
-	 * and because the mmu_notifier_unregister function also drop
-	 * mm_count we need to take an extra count here.
-	 */
-	mmgrab(p->mm);
-	mmu_notifier_unregister_no_release(&p->mmu_notifier, p->mm);
+	mmu_notifier_unregister_no_release(&p->mmu_notifier, mm);
 	mmu_notifier_call_srcu(&p->rcu, &kfd_process_destroy_delayed);
 }
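
The resulting teardown chain, for orientation: the mmu_notifier .release callback above -> mmu_notifier_call_srcu() -> kfd_process_destroy_delayed() -> kfd_unref_process() -> kref_put() -> kfd_process_ref_release() -> queue_work() -> kfd_process_wq_release(). A minimal standalone sketch of the same kref-plus-workqueue lifetime pattern:

    #include <linux/kref.h>
    #include <linux/slab.h>
    #include <linux/workqueue.h>

    /* The last kref_put() may run in atomic context (e.g. from an MMU
     * notifier), so the heavy teardown is deferred to a workqueue.
     */
    struct obj {
            struct kref ref;                /* kref_init() at creation time */
            struct work_struct release_work;
    };

    static void obj_wq_release(struct work_struct *work)
    {
            struct obj *o = container_of(work, struct obj, release_work);

            /* ... free sub-objects, unbind, etc., then: */
            kfree(o);
    }

    static void obj_ref_release(struct kref *ref)
    {
            struct obj *o = container_of(ref, struct obj, ref);

            INIT_WORK(&o->release_work, obj_wq_release);
            schedule_work(&o->release_work);
    }

    static void obj_put(struct obj *o)
    {
            kref_put(&o->ref, obj_ref_release);
    }
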
 
@@ -260,7 +265,44 @@ static const struct mmu_notifier_ops kfd_process_mmu_notifier_ops = {
 	.release = kfd_process_notifier_release,
 };
 
-static struct kfd_process *create_process(const struct task_struct *thread)
+static int kfd_process_init_cwsr(struct kfd_process *p, struct file *filep)
+{
+	unsigned long  offset;
+	struct kfd_process_device *pdd = NULL;
+	struct kfd_dev *dev = NULL;
+	struct qcm_process_device *qpd = NULL;
+
+	list_for_each_entry(pdd, &p->per_device_data, per_device_list) {
+		dev = pdd->dev;
+		qpd = &pdd->qpd;
+		if (!dev->cwsr_enabled || qpd->cwsr_kaddr)
+			continue;
+		offset = (dev->id | KFD_MMAP_RESERVED_MEM_MASK) << PAGE_SHIFT;
+		qpd->tba_addr = (int64_t)vm_mmap(filep, 0,
+			KFD_CWSR_TBA_TMA_SIZE, PROT_READ | PROT_EXEC,
+			MAP_SHARED, offset);
+
+		if (IS_ERR_VALUE(qpd->tba_addr)) {
+			int err = qpd->tba_addr;
+
+			pr_err("Failure to set tba address. error %d.\n", err);
+			qpd->tba_addr = 0;
+			qpd->cwsr_kaddr = NULL;
+			return err;
+		}
+
+		memcpy(qpd->cwsr_kaddr, dev->cwsr_isa, dev->cwsr_isa_size);
+
+		qpd->tma_addr = qpd->tba_addr + KFD_CWSR_TMA_OFFSET;
+		pr_debug("set tba :0x%llx, tma:0x%llx, cwsr_kaddr:%p for pqm.\n",
+			qpd->tba_addr, qpd->tma_addr, qpd->cwsr_kaddr);
+	}
+
+	return 0;
+}
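
How the pieces above connect (an observation on this series, not new code):

	/* Per CWSR-capable device, kfd_process_init_cwsr():
	 *
	 *   offset = (dev->id | KFD_MMAP_RESERVED_MEM_MASK) << PAGE_SHIFT
	 *   vm_mmap(filep, ...)          -> enters the driver's mmap handler
	 *     -> kfd_reserved_mem_mmap() -> allocates qpd->cwsr_kaddr and
	 *                                   remaps it into the user VMA
	 *   memcpy(qpd->cwsr_kaddr, ...) -> copies the trap-handler ISA so it
	 *                                   becomes visible at qpd->tba_addr
	 */
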
+
+static struct kfd_process *create_process(const struct task_struct *thread,
+					struct file *filep)
 {
 	struct kfd_process *process;
 	int err = -ENOMEM;
@@ -277,13 +319,15 @@ static struct kfd_process *create_process(const struct task_struct *thread)
 	if (kfd_alloc_process_doorbells(process) < 0)
 		goto err_alloc_doorbells;
 
+	kref_init(&process->ref);
+
 	mutex_init(&process->mutex);
 
 	process->mm = thread->mm;
 
 	/* register notifier */
 	process->mmu_notifier.ops = &kfd_process_mmu_notifier_ops;
-	err = __mmu_notifier_register(&process->mmu_notifier, process->mm);
+	err = mmu_notifier_register(&process->mmu_notifier, process->mm);
 	if (err)
 		goto err_mmu_notifier;
 
@@ -291,6 +335,7 @@ static struct kfd_process *create_process(const struct task_struct *thread)
 			(uintptr_t)process->mm);
 
 	process->lead_thread = thread->group_leader;
+	get_task_struct(process->lead_thread);
 
 	INIT_LIST_HEAD(&process->per_device_data);
 
@@ -306,8 +351,14 @@ static struct kfd_process *create_process(const struct task_struct *thread)
 	if (err != 0)
 		goto err_init_apertures;
 
+	err = kfd_process_init_cwsr(process, filep);
+	if (err)
+		goto err_init_cwsr;
+
 	return process;
 
+err_init_cwsr:
+	kfd_process_destroy_pdds(process);
 err_init_apertures:
 	pqm_uninit(&process->pqm);
 err_process_pqm_init:
@@ -343,16 +394,18 @@ struct kfd_process_device *kfd_create_process_device_data(struct kfd_dev *dev,
 	struct kfd_process_device *pdd = NULL;
 
 	pdd = kzalloc(sizeof(*pdd), GFP_KERNEL);
-	if (pdd != NULL) {
-		pdd->dev = dev;
-		INIT_LIST_HEAD(&pdd->qpd.queues_list);
-		INIT_LIST_HEAD(&pdd->qpd.priv_queue_list);
-		pdd->qpd.dqm = dev->dqm;
-		pdd->process = p;
-		pdd->bound = PDD_UNBOUND;
-		pdd->already_dequeued = false;
-		list_add(&pdd->per_device_list, &p->per_device_data);
-	}
+	if (!pdd)
+		return NULL;
+
+	pdd->dev = dev;
+	INIT_LIST_HEAD(&pdd->qpd.queues_list);
+	INIT_LIST_HEAD(&pdd->qpd.priv_queue_list);
+	pdd->qpd.dqm = dev->dqm;
+	pdd->qpd.pqm = &p->pqm;
+	pdd->process = p;
+	pdd->bound = PDD_UNBOUND;
+	pdd->already_dequeued = false;
+	list_add(&pdd->per_device_list, &p->per_device_data);
 
 	return pdd;
 }
@@ -483,6 +536,8 @@ void kfd_process_iommu_unbind_callback(struct kfd_dev *dev, unsigned int pasid)
 
 	mutex_unlock(kfd_get_dbgmgr_mutex());
 
+	mutex_lock(&p->mutex);
+
 	pdd = kfd_get_process_device_data(dev, p);
 	if (pdd)
 		/* For GPU relying on IOMMU, we need to dequeue here
@@ -491,6 +546,8 @@ void kfd_process_iommu_unbind_callback(struct kfd_dev *dev, unsigned int pasid)
 		kfd_process_dequeue_from_device(pdd);
 
 	mutex_unlock(&p->mutex);
+
+	kfd_unref_process(p);
 }
 
 struct kfd_process_device *kfd_get_first_process_device_data(
@@ -515,22 +572,86 @@ bool kfd_has_process_device_data(struct kfd_process *p)
 	return !(list_empty(&p->per_device_data));
 }
 
-/* This returns with process->mutex locked. */
+/* This increments the process->ref counter. */
 struct kfd_process *kfd_lookup_process_by_pasid(unsigned int pasid)
 {
-	struct kfd_process *p;
+	struct kfd_process *p, *ret_p = NULL;
 	unsigned int temp;
 
 	int idx = srcu_read_lock(&kfd_processes_srcu);
 
 	hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) {
 		if (p->pasid == pasid) {
-			mutex_lock(&p->mutex);
+			kref_get(&p->ref);
+			ret_p = p;
 			break;
 		}
 	}
 
 	srcu_read_unlock(&kfd_processes_srcu, idx);
 
-	return p;
+	return ret_p;
 }
+
+int kfd_reserved_mem_mmap(struct kfd_process *process,
+			  struct vm_area_struct *vma)
+{
+	struct kfd_dev *dev = kfd_device_by_id(vma->vm_pgoff);
+	struct kfd_process_device *pdd;
+	struct qcm_process_device *qpd;
+
+	if (!dev)
+		return -EINVAL;
+	if ((vma->vm_end - vma->vm_start) != KFD_CWSR_TBA_TMA_SIZE) {
+		pr_err("Incorrect CWSR mapping size.\n");
+		return -EINVAL;
+	}
+
+	pdd = kfd_get_process_device_data(dev, process);
+	if (!pdd)
+		return -EINVAL;
+	qpd = &pdd->qpd;
+
+	qpd->cwsr_kaddr = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
+					get_order(KFD_CWSR_TBA_TMA_SIZE));
+	if (!qpd->cwsr_kaddr) {
+		pr_err("Error allocating per process CWSR buffer.\n");
+		return -ENOMEM;
+	}
+
+	vma->vm_flags |= VM_IO | VM_DONTCOPY | VM_DONTEXPAND
+		| VM_NORESERVE | VM_DONTDUMP | VM_PFNMAP;
+	/* Mapping pages to user process */
+	return remap_pfn_range(vma, vma->vm_start,
+			       PFN_DOWN(__pa(qpd->cwsr_kaddr)),
+			       KFD_CWSR_TBA_TMA_SIZE, vma->vm_page_prot);
+}
+
+#if defined(CONFIG_DEBUG_FS)
+
+int kfd_debugfs_mqds_by_process(struct seq_file *m, void *data)
+{
+	struct kfd_process *p;
+	unsigned int temp;
+	int r = 0;
+
+	int idx = srcu_read_lock(&kfd_processes_srcu);
+
+	hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) {
+		seq_printf(m, "Process %d PASID %d:\n",
+			   p->lead_thread->tgid, p->pasid);
+
+		mutex_lock(&p->mutex);
+		r = pqm_debugfs_mqds(m, &p->pqm);
+		mutex_unlock(&p->mutex);
+
+		if (r)
+			break;
+	}
+
+	srcu_read_unlock(&kfd_processes_srcu, idx);
+
+	return r;
+}
+
+#endif
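
The new reference-counted lookup implies a caller contract; a usage sketch consistent with kfd_process_iommu_unbind_callback() above:

	struct kfd_process *p;

	p = kfd_lookup_process_by_pasid(pasid);
	if (!p)
		return;		/* no such process */

	mutex_lock(&p->mutex);
	/* ... operate on the process ... */
	mutex_unlock(&p->mutex);

	kfd_unref_process(p);	/* drop the reference the lookup took */
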

+ 71 - 7
drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c

@@ -178,10 +178,8 @@ int pqm_create_queue(struct process_queue_manager *pqm,
 		return retval;
 
 	if (list_empty(&pdd->qpd.queues_list) &&
-	    list_empty(&pdd->qpd.priv_queue_list)) {
-		pdd->qpd.pqm = pqm;
+	    list_empty(&pdd->qpd.priv_queue_list))
 		dev->dqm->ops.register_process(dev->dqm, &pdd->qpd);
-	}
 
 	pqn = kzalloc(sizeof(*pqn), GFP_KERNEL);
 	if (!pqn) {
@@ -203,8 +201,7 @@ int pqm_create_queue(struct process_queue_manager *pqm,
 			goto err_create_queue;
 		pqn->q = q;
 		pqn->kq = NULL;
-		retval = dev->dqm->ops.create_queue(dev->dqm, q, &pdd->qpd,
-						&q->properties.vmid);
+		retval = dev->dqm->ops.create_queue(dev->dqm, q, &pdd->qpd);
 		pr_debug("DQM returned %d for create_queue\n", retval);
 		print_queue(q);
 		break;
@@ -224,8 +221,7 @@ int pqm_create_queue(struct process_queue_manager *pqm,
 			goto err_create_queue;
 		pqn->q = q;
 		pqn->kq = NULL;
-		retval = dev->dqm->ops.create_queue(dev->dqm, q, &pdd->qpd,
-						&q->properties.vmid);
+		retval = dev->dqm->ops.create_queue(dev->dqm, q, &pdd->qpd);
 		pr_debug("DQM returned %d for create_queue\n", retval);
 		print_queue(q);
 		break;
@@ -315,6 +311,10 @@ int pqm_destroy_queue(struct process_queue_manager *pqm, unsigned int qid)
 	if (pqn->q) {
 		dqm = pqn->q->device->dqm;
 		retval = dqm->ops.destroy_queue(dqm, &pdd->qpd, pqn->q);
+		if (retval) {
+			pr_debug("Destroy queue failed, returned %d\n", retval);
+			goto err_destroy_queue;
+		}
 		uninit_queue(pqn->q);
 	}
 
@@ -326,6 +326,7 @@ int pqm_destroy_queue(struct process_queue_manager *pqm, unsigned int qid)
 	    list_empty(&pdd->qpd.priv_queue_list))
 		dqm->ops.unregister_process(dqm, &pdd->qpd);
 
+err_destroy_queue:
 	return retval;
 }
 
@@ -367,4 +368,67 @@ struct kernel_queue *pqm_get_kernel_queue(
 	return NULL;
 }
 
+#if defined(CONFIG_DEBUG_FS)
+
+int pqm_debugfs_mqds(struct seq_file *m, void *data)
+{
+	struct process_queue_manager *pqm = data;
+	struct process_queue_node *pqn;
+	struct queue *q;
+	enum KFD_MQD_TYPE mqd_type;
+	struct mqd_manager *mqd_manager;
+	int r = 0;
+
+	list_for_each_entry(pqn, &pqm->queues, process_queue_list) {
+		if (pqn->q) {
+			q = pqn->q;
+			switch (q->properties.type) {
+			case KFD_QUEUE_TYPE_SDMA:
+				seq_printf(m, "  SDMA queue on device %x\n",
+					   q->device->id);
+				mqd_type = KFD_MQD_TYPE_SDMA;
+				break;
+			case KFD_QUEUE_TYPE_COMPUTE:
+				seq_printf(m, "  Compute queue on device %x\n",
+					   q->device->id);
+				mqd_type = KFD_MQD_TYPE_CP;
+				break;
+			default:
+				seq_printf(m,
+				"  Bad user queue type %d on device %x\n",
+					   q->properties.type, q->device->id);
+				continue;
+			}
+			mqd_manager = q->device->dqm->ops.get_mqd_manager(
+				q->device->dqm, mqd_type);
+		} else if (pqn->kq) {
+			q = pqn->kq->queue;
+			mqd_manager = pqn->kq->mqd;
+			switch (q->properties.type) {
+			case KFD_QUEUE_TYPE_DIQ:
+				seq_printf(m, "  DIQ on device %x\n",
+					   pqn->kq->dev->id);
+				mqd_type = KFD_MQD_TYPE_HIQ;
+				break;
+			default:
+				seq_printf(m,
+				"  Bad kernel queue type %d on device %x\n",
+					   q->properties.type,
+					   pqn->kq->dev->id);
+				continue;
+			}
+		} else {
+			seq_printf(m,
+		"  Weird: Queue node with neither kernel nor user queue\n");
+			continue;
+		}
+
+		r = mqd_manager->debugfs_show_mqd(m, q->mqd);
+		if (r != 0)
+			break;
+	}
+
+	return r;
+}
 
+#endif

+ 620 - 441
drivers/gpu/drm/amd/amdkfd/kfd_topology.c

@@ -28,27 +28,32 @@
 #include <linux/hash.h>
 #include <linux/cpufreq.h>
 #include <linux/log2.h>
+#include <linux/dmi.h>
+#include <linux/atomic.h>
 
 #include "kfd_priv.h"
 #include "kfd_crat.h"
 #include "kfd_topology.h"
+#include "kfd_device_queue_manager.h"
 
+/* topology_device_list - Master list of all topology devices */
 static struct list_head topology_device_list;
-static int topology_crat_parsed;
 static struct kfd_system_properties sys_props;
 
 static DECLARE_RWSEM(topology_lock);
+static atomic_t topology_crat_proximity_domain;
 
-struct kfd_dev *kfd_device_by_id(uint32_t gpu_id)
+struct kfd_topology_device *kfd_topology_device_by_proximity_domain(
+						uint32_t proximity_domain)
 {
 	struct kfd_topology_device *top_dev;
-	struct kfd_dev *device = NULL;
+	struct kfd_topology_device *device = NULL;
 
 	down_read(&topology_lock);
 
 	list_for_each_entry(top_dev, &topology_device_list, list)
-		if (top_dev->gpu_id == gpu_id) {
-			device = top_dev->gpu;
+		if (top_dev->proximity_domain == proximity_domain) {
+			device = top_dev;
 			break;
 		}
 
@@ -57,7 +62,7 @@ struct kfd_dev *kfd_device_by_id(uint32_t gpu_id)
 	return device;
 }
 
-struct kfd_dev *kfd_device_by_pci_dev(const struct pci_dev *pdev)
+struct kfd_dev *kfd_device_by_id(uint32_t gpu_id)
 {
 	struct kfd_topology_device *top_dev;
 	struct kfd_dev *device = NULL;
@@ -65,7 +70,7 @@ struct kfd_dev *kfd_device_by_pci_dev(const struct pci_dev *pdev)
 	down_read(&topology_lock);
 
 	list_for_each_entry(top_dev, &topology_device_list, list)
-		if (top_dev->gpu->pdev == pdev) {
+		if (top_dev->gpu_id == gpu_id) {
 			device = top_dev->gpu;
 			break;
 		}
@@ -75,282 +80,31 @@ struct kfd_dev *kfd_device_by_pci_dev(const struct pci_dev *pdev)
 	return device;
 }
 
-static int kfd_topology_get_crat_acpi(void *crat_image, size_t *size)
-{
-	struct acpi_table_header *crat_table;
-	acpi_status status;
-
-	if (!size)
-		return -EINVAL;
-
-	/*
-	 * Fetch the CRAT table from ACPI
-	 */
-	status = acpi_get_table(CRAT_SIGNATURE, 0, &crat_table);
-	if (status == AE_NOT_FOUND) {
-		pr_warn("CRAT table not found\n");
-		return -ENODATA;
-	} else if (ACPI_FAILURE(status)) {
-		const char *err = acpi_format_exception(status);
-
-		pr_err("CRAT table error: %s\n", err);
-		return -EINVAL;
-	}
-
-	if (*size >= crat_table->length && crat_image != NULL)
-		memcpy(crat_image, crat_table, crat_table->length);
-
-	*size = crat_table->length;
-
-	return 0;
-}
-
-static void kfd_populated_cu_info_cpu(struct kfd_topology_device *dev,
-		struct crat_subtype_computeunit *cu)
-{
-	dev->node_props.cpu_cores_count = cu->num_cpu_cores;
-	dev->node_props.cpu_core_id_base = cu->processor_id_low;
-	if (cu->hsa_capability & CRAT_CU_FLAGS_IOMMU_PRESENT)
-		dev->node_props.capability |= HSA_CAP_ATS_PRESENT;
-
-	pr_info("CU CPU: cores=%d id_base=%d\n", cu->num_cpu_cores,
-			cu->processor_id_low);
-}
-
-static void kfd_populated_cu_info_gpu(struct kfd_topology_device *dev,
-		struct crat_subtype_computeunit *cu)
-{
-	dev->node_props.simd_id_base = cu->processor_id_low;
-	dev->node_props.simd_count = cu->num_simd_cores;
-	dev->node_props.lds_size_in_kb = cu->lds_size_in_kb;
-	dev->node_props.max_waves_per_simd = cu->max_waves_simd;
-	dev->node_props.wave_front_size = cu->wave_front_size;
-	dev->node_props.mem_banks_count = cu->num_banks;
-	dev->node_props.array_count = cu->num_arrays;
-	dev->node_props.cu_per_simd_array = cu->num_cu_per_array;
-	dev->node_props.simd_per_cu = cu->num_simd_per_cu;
-	dev->node_props.max_slots_scratch_cu = cu->max_slots_scatch_cu;
-	if (cu->hsa_capability & CRAT_CU_FLAGS_HOT_PLUGGABLE)
-		dev->node_props.capability |= HSA_CAP_HOT_PLUGGABLE;
-	pr_info("CU GPU: simds=%d id_base=%d\n", cu->num_simd_cores,
-				cu->processor_id_low);
-}
-
-/* kfd_parse_subtype_cu is called when the topology mutex is already acquired */
-static int kfd_parse_subtype_cu(struct crat_subtype_computeunit *cu)
-{
-	struct kfd_topology_device *dev;
-	int i = 0;
-
-	pr_info("Found CU entry in CRAT table with proximity_domain=%d caps=%x\n",
-			cu->proximity_domain, cu->hsa_capability);
-	list_for_each_entry(dev, &topology_device_list, list) {
-		if (cu->proximity_domain == i) {
-			if (cu->flags & CRAT_CU_FLAGS_CPU_PRESENT)
-				kfd_populated_cu_info_cpu(dev, cu);
-
-			if (cu->flags & CRAT_CU_FLAGS_GPU_PRESENT)
-				kfd_populated_cu_info_gpu(dev, cu);
-			break;
-		}
-		i++;
-	}
-
-	return 0;
-}
-
-/*
- * kfd_parse_subtype_mem is called when the topology mutex is
- * already acquired
- */
-static int kfd_parse_subtype_mem(struct crat_subtype_memory *mem)
-{
-	struct kfd_mem_properties *props;
-	struct kfd_topology_device *dev;
-	int i = 0;
-
-	pr_info("Found memory entry in CRAT table with proximity_domain=%d\n",
-			mem->promixity_domain);
-	list_for_each_entry(dev, &topology_device_list, list) {
-		if (mem->promixity_domain == i) {
-			props = kfd_alloc_struct(props);
-			if (props == NULL)
-				return -ENOMEM;
-
-			if (dev->node_props.cpu_cores_count == 0)
-				props->heap_type = HSA_MEM_HEAP_TYPE_FB_PRIVATE;
-			else
-				props->heap_type = HSA_MEM_HEAP_TYPE_SYSTEM;
-
-			if (mem->flags & CRAT_MEM_FLAGS_HOT_PLUGGABLE)
-				props->flags |= HSA_MEM_FLAGS_HOT_PLUGGABLE;
-			if (mem->flags & CRAT_MEM_FLAGS_NON_VOLATILE)
-				props->flags |= HSA_MEM_FLAGS_NON_VOLATILE;
-
-			props->size_in_bytes =
-				((uint64_t)mem->length_high << 32) +
-							mem->length_low;
-			props->width = mem->width;
-
-			dev->mem_bank_count++;
-			list_add_tail(&props->list, &dev->mem_props);
-
-			break;
-		}
-		i++;
-	}
-
-	return 0;
-}
-
-/*
- * kfd_parse_subtype_cache is called when the topology mutex
- * is already acquired
- */
-static int kfd_parse_subtype_cache(struct crat_subtype_cache *cache)
-{
-	struct kfd_cache_properties *props;
-	struct kfd_topology_device *dev;
-	uint32_t id;
-
-	id = cache->processor_id_low;
-
-	pr_info("Found cache entry in CRAT table with processor_id=%d\n", id);
-	list_for_each_entry(dev, &topology_device_list, list)
-		if (id == dev->node_props.cpu_core_id_base ||
-		    id == dev->node_props.simd_id_base) {
-			props = kfd_alloc_struct(props);
-			if (props == NULL)
-				return -ENOMEM;
-
-			props->processor_id_low = id;
-			props->cache_level = cache->cache_level;
-			props->cache_size = cache->cache_size;
-			props->cacheline_size = cache->cache_line_size;
-			props->cachelines_per_tag = cache->lines_per_tag;
-			props->cache_assoc = cache->associativity;
-			props->cache_latency = cache->cache_latency;
-
-			if (cache->flags & CRAT_CACHE_FLAGS_DATA_CACHE)
-				props->cache_type |= HSA_CACHE_TYPE_DATA;
-			if (cache->flags & CRAT_CACHE_FLAGS_INST_CACHE)
-				props->cache_type |= HSA_CACHE_TYPE_INSTRUCTION;
-			if (cache->flags & CRAT_CACHE_FLAGS_CPU_CACHE)
-				props->cache_type |= HSA_CACHE_TYPE_CPU;
-			if (cache->flags & CRAT_CACHE_FLAGS_SIMD_CACHE)
-				props->cache_type |= HSA_CACHE_TYPE_HSACU;
-
-			dev->cache_count++;
-			dev->node_props.caches_count++;
-			list_add_tail(&props->list, &dev->cache_props);
-
-			break;
-		}
-
-	return 0;
-}
-
-/*
- * kfd_parse_subtype_iolink is called when the topology mutex
- * is already acquired
- */
-static int kfd_parse_subtype_iolink(struct crat_subtype_iolink *iolink)
+struct kfd_dev *kfd_device_by_pci_dev(const struct pci_dev *pdev)
 {
-	struct kfd_iolink_properties *props;
-	struct kfd_topology_device *dev;
-	uint32_t i = 0;
-	uint32_t id_from;
-	uint32_t id_to;
-
-	id_from = iolink->proximity_domain_from;
-	id_to = iolink->proximity_domain_to;
+	struct kfd_topology_device *top_dev;
+	struct kfd_dev *device = NULL;
 
-	pr_info("Found IO link entry in CRAT table with id_from=%d\n", id_from);
-	list_for_each_entry(dev, &topology_device_list, list) {
-		if (id_from == i) {
-			props = kfd_alloc_struct(props);
-			if (props == NULL)
-				return -ENOMEM;
-
-			props->node_from = id_from;
-			props->node_to = id_to;
-			props->ver_maj = iolink->version_major;
-			props->ver_min = iolink->version_minor;
-
-			/*
-			 * weight factor (derived from CDIR), currently always 1
-			 */
-			props->weight = 1;
-
-			props->min_latency = iolink->minimum_latency;
-			props->max_latency = iolink->maximum_latency;
-			props->min_bandwidth = iolink->minimum_bandwidth_mbs;
-			props->max_bandwidth = iolink->maximum_bandwidth_mbs;
-			props->rec_transfer_size =
-					iolink->recommended_transfer_size;
-
-			dev->io_link_count++;
-			dev->node_props.io_links_count++;
-			list_add_tail(&props->list, &dev->io_link_props);
+	down_read(&topology_lock);
 
+	list_for_each_entry(top_dev, &topology_device_list, list)
+		if (top_dev->gpu->pdev == pdev) {
+			device = top_dev->gpu;
 			break;
 		}
-		i++;
-	}
 
-	return 0;
-}
-
-static int kfd_parse_subtype(struct crat_subtype_generic *sub_type_hdr)
-{
-	struct crat_subtype_computeunit *cu;
-	struct crat_subtype_memory *mem;
-	struct crat_subtype_cache *cache;
-	struct crat_subtype_iolink *iolink;
-	int ret = 0;
-
-	switch (sub_type_hdr->type) {
-	case CRAT_SUBTYPE_COMPUTEUNIT_AFFINITY:
-		cu = (struct crat_subtype_computeunit *)sub_type_hdr;
-		ret = kfd_parse_subtype_cu(cu);
-		break;
-	case CRAT_SUBTYPE_MEMORY_AFFINITY:
-		mem = (struct crat_subtype_memory *)sub_type_hdr;
-		ret = kfd_parse_subtype_mem(mem);
-		break;
-	case CRAT_SUBTYPE_CACHE_AFFINITY:
-		cache = (struct crat_subtype_cache *)sub_type_hdr;
-		ret = kfd_parse_subtype_cache(cache);
-		break;
-	case CRAT_SUBTYPE_TLB_AFFINITY:
-		/*
-		 * For now, nothing to do here
-		 */
-		pr_info("Found TLB entry in CRAT table (not processing)\n");
-		break;
-	case CRAT_SUBTYPE_CCOMPUTE_AFFINITY:
-		/*
-		 * For now, nothing to do here
-		 */
-		pr_info("Found CCOMPUTE entry in CRAT table (not processing)\n");
-		break;
-	case CRAT_SUBTYPE_IOLINK_AFFINITY:
-		iolink = (struct crat_subtype_iolink *)sub_type_hdr;
-		ret = kfd_parse_subtype_iolink(iolink);
-		break;
-	default:
-		pr_warn("Unknown subtype (%d) in CRAT\n",
-				sub_type_hdr->type);
-	}
+	up_read(&topology_lock);
 
-	return ret;
+	return device;
 }
 
+/* Called with write topology_lock acquired */
 static void kfd_release_topology_device(struct kfd_topology_device *dev)
 {
 	struct kfd_mem_properties *mem;
 	struct kfd_cache_properties *cache;
 	struct kfd_iolink_properties *iolink;
+	struct kfd_perf_properties *perf;
 
 	list_del(&dev->list);
 
@@ -375,25 +129,35 @@ static void kfd_release_topology_device(struct kfd_topology_device *dev)
 		kfree(iolink);
 	}
 
-	kfree(dev);
+	while (dev->perf_props.next != &dev->perf_props) {
+		perf = container_of(dev->perf_props.next,
+				struct kfd_perf_properties, list);
+		list_del(&perf->list);
+		kfree(perf);
+	}
 
-	sys_props.num_devices--;
+	kfree(dev);
 }
 
-static void kfd_release_live_view(void)
+void kfd_release_topology_device_list(struct list_head *device_list)
 {
 	struct kfd_topology_device *dev;
 
-	while (topology_device_list.next != &topology_device_list) {
-		dev = container_of(topology_device_list.next,
-				 struct kfd_topology_device, list);
+	while (!list_empty(device_list)) {
+		dev = list_first_entry(device_list,
+				       struct kfd_topology_device, list);
 		kfd_release_topology_device(dev);
+	}
 }
 
+static void kfd_release_live_view(void)
+{
+	kfd_release_topology_device_list(&topology_device_list);
 	memset(&sys_props, 0, sizeof(sys_props));
 }
 
-static struct kfd_topology_device *kfd_create_topology_device(void)
+struct kfd_topology_device *kfd_create_topology_device(
+				struct list_head *device_list)
 {
 	struct kfd_topology_device *dev;
 
@@ -406,65 +170,13 @@ static struct kfd_topology_device *kfd_create_topology_device(void)
 	INIT_LIST_HEAD(&dev->mem_props);
 	INIT_LIST_HEAD(&dev->cache_props);
 	INIT_LIST_HEAD(&dev->io_link_props);
+	INIT_LIST_HEAD(&dev->perf_props);
 
-	list_add_tail(&dev->list, &topology_device_list);
-	sys_props.num_devices++;
+	list_add_tail(&dev->list, device_list);
 
 	return dev;
 }
 
-static int kfd_parse_crat_table(void *crat_image)
-{
-	struct kfd_topology_device *top_dev;
-	struct crat_subtype_generic *sub_type_hdr;
-	uint16_t node_id;
-	int ret;
-	struct crat_header *crat_table = (struct crat_header *)crat_image;
-	uint16_t num_nodes;
-	uint32_t image_len;
-
-	if (!crat_image)
-		return -EINVAL;
-
-	num_nodes = crat_table->num_domains;
-	image_len = crat_table->length;
-
-	pr_info("Parsing CRAT table with %d nodes\n", num_nodes);
-
-	for (node_id = 0; node_id < num_nodes; node_id++) {
-		top_dev = kfd_create_topology_device();
-		if (!top_dev) {
-			kfd_release_live_view();
-			return -ENOMEM;
-		}
-	}
-
-	sys_props.platform_id =
-		(*((uint64_t *)crat_table->oem_id)) & CRAT_OEMID_64BIT_MASK;
-	sys_props.platform_oem = *((uint64_t *)crat_table->oem_table_id);
-	sys_props.platform_rev = crat_table->revision;
-
-	sub_type_hdr = (struct crat_subtype_generic *)(crat_table+1);
-	while ((char *)sub_type_hdr + sizeof(struct crat_subtype_generic) <
-			((char *)crat_image) + image_len) {
-		if (sub_type_hdr->flags & CRAT_SUBTYPE_FLAGS_ENABLED) {
-			ret = kfd_parse_subtype(sub_type_hdr);
-			if (ret != 0) {
-				kfd_release_live_view();
-				return ret;
-			}
-		}
-
-		sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr +
-				sub_type_hdr->length);
-	}
-
-	sys_props.generation_count++;
-	topology_crat_parsed = 1;
-
-	return 0;
-}
-
 
 #define sysfs_show_gen_prop(buffer, fmt, ...) \
 		snprintf(buffer, PAGE_SIZE, "%s"fmt, buffer, __VA_ARGS__)
@@ -501,11 +213,17 @@ static ssize_t sysprops_show(struct kobject *kobj, struct attribute *attr,
 	return ret;
 }
 
+static void kfd_topology_kobj_release(struct kobject *kobj)
+{
+	kfree(kobj);
+}
+
 static const struct sysfs_ops sysprops_ops = {
 	.show = sysprops_show,
 };
 
 static struct kobj_type sysprops_type = {
+	.release = kfd_topology_kobj_release,
 	.sysfs_ops = &sysprops_ops,
 };
 
@@ -541,6 +259,7 @@ static const struct sysfs_ops iolink_ops = {
 };
 
 static struct kobj_type iolink_type = {
+	.release = kfd_topology_kobj_release,
 	.sysfs_ops = &iolink_ops,
 };
 
@@ -568,6 +287,7 @@ static const struct sysfs_ops mem_ops = {
 };
 
 static struct kobj_type mem_type = {
+	.release = kfd_topology_kobj_release,
 	.sysfs_ops = &mem_ops,
 };
 
@@ -575,7 +295,7 @@ static ssize_t kfd_cache_show(struct kobject *kobj, struct attribute *attr,
 		char *buffer)
 {
 	ssize_t ret;
-	uint32_t i;
+	uint32_t i, j;
 	struct kfd_cache_properties *cache;
 
 	/* Making sure that the buffer is an empty string */
@@ -593,12 +313,18 @@ static ssize_t kfd_cache_show(struct kobject *kobj, struct attribute *attr,
 	sysfs_show_32bit_prop(buffer, "latency", cache->cache_latency);
 	sysfs_show_32bit_prop(buffer, "type", cache->cache_type);
 	snprintf(buffer, PAGE_SIZE, "%ssibling_map ", buffer);
-	for (i = 0; i < KFD_TOPOLOGY_CPU_SIBLINGS; i++)
-		ret = snprintf(buffer, PAGE_SIZE, "%s%d%s",
-				buffer, cache->sibling_map[i],
-				(i == KFD_TOPOLOGY_CPU_SIBLINGS-1) ?
-						"\n" : ",");
-
+	for (i = 0; i < CRAT_SIBLINGMAP_SIZE; i++)
+		for (j = 0; j < sizeof(cache->sibling_map[0])*8; j++) {
+			/* Check each bit */
+			if (cache->sibling_map[i] & (1 << j))
+				ret = snprintf(buffer, PAGE_SIZE,
+					 "%s%d%s", buffer, 1, ",");
+			else
+				ret = snprintf(buffer, PAGE_SIZE,
+					 "%s%d%s", buffer, 0, ",");
+		}
+	/* Replace the last "," with a newline */
+	*(buffer + strlen(buffer) - 1) = 0xA;
 	return ret;
 }
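
A standalone sketch of the sibling_map expansion above: each byte of the CRAT sibling map is printed LSB-first as comma-separated 0/1 flags, and the trailing comma becomes a newline (0x0A), exactly as the driver does:

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
            uint8_t sibling_map[2] = { 0x05, 0x00 }; /* CPUs 0 and 2 share the cache */
            char buf[64];
            int n = 0;

            for (int i = 0; i < 2; i++)
                    for (int j = 0; j < 8; j++)
                            n += sprintf(buf + n, "%d,",
                                         (sibling_map[i] >> j) & 1);
            buf[n - 1] = '\n';      /* replace the last ',' */
            fputs(buf, stdout);     /* prints: 1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0 */
            return 0;
    }
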
 
@@ -607,9 +333,43 @@ static const struct sysfs_ops cache_ops = {
 };
 
 static struct kobj_type cache_type = {
+	.release = kfd_topology_kobj_release,
 	.sysfs_ops = &cache_ops,
 };
 
+/****** Sysfs of Performance Counters ******/
+
+struct kfd_perf_attr {
+	struct kobj_attribute attr;
+	uint32_t data;
+};
+
+static ssize_t perf_show(struct kobject *kobj, struct kobj_attribute *attrs,
+			char *buf)
+{
+	struct kfd_perf_attr *attr;
+
+	buf[0] = 0;
+	attr = container_of(attrs, struct kfd_perf_attr, attr);
+	if (!attr->data) /* invalid data for PMC */
+		return 0;
+	else
+		return sysfs_show_32bit_val(buf, attr->data);
+}
+
+#define KFD_PERF_DESC(_name, _data)			\
+{							\
+	.attr  = __ATTR(_name, 0444, perf_show, NULL),	\
+	.data = _data,					\
+}
+
+static struct kfd_perf_attr perf_attr_iommu[] = {
+	KFD_PERF_DESC(max_concurrent, 0),
+	KFD_PERF_DESC(num_counters, 0),
+	KFD_PERF_DESC(counter_ids, 0),
+};
+/****************************************/
+
 static ssize_t node_show(struct kobject *kobj, struct attribute *attr,
 		char *buffer)
 {
@@ -646,18 +406,8 @@ static ssize_t node_show(struct kobject *kobj, struct attribute *attr,
 			dev->node_props.cpu_cores_count);
 	sysfs_show_32bit_prop(buffer, "simd_count",
 			dev->node_props.simd_count);
-
-	if (dev->mem_bank_count < dev->node_props.mem_banks_count) {
-		pr_info_once("mem_banks_count truncated from %d to %d\n",
-				dev->node_props.mem_banks_count,
-				dev->mem_bank_count);
-		sysfs_show_32bit_prop(buffer, "mem_banks_count",
-				dev->mem_bank_count);
-	} else {
-		sysfs_show_32bit_prop(buffer, "mem_banks_count",
-				dev->node_props.mem_banks_count);
-	}
-
+	sysfs_show_32bit_prop(buffer, "mem_banks_count",
+			dev->node_props.mem_banks_count);
 	sysfs_show_32bit_prop(buffer, "caches_count",
 			dev->node_props.caches_count);
 	sysfs_show_32bit_prop(buffer, "io_links_count",
@@ -705,9 +455,12 @@ static ssize_t node_show(struct kobject *kobj, struct attribute *attr,
 				HSA_CAP_WATCH_POINTS_TOTALBITS_MASK);
 		}
 
+		if (dev->gpu->device_info->asic_family == CHIP_TONGA)
+			dev->node_props.capability |=
+					HSA_CAP_AQL_QUEUE_DOUBLE_MAP;
+
 		sysfs_show_32bit_prop(buffer, "max_engine_clk_fcompute",
-			dev->gpu->kfd2kgd->get_max_engine_clock_in_mhz(
-					dev->gpu->kgd));
+			dev->node_props.max_engine_clk_fcompute);
 
 		sysfs_show_64bit_prop(buffer, "local_mem_size",
 				(unsigned long long int) 0);
@@ -729,6 +482,7 @@ static const struct sysfs_ops node_ops = {
 };
 
 static struct kobj_type node_type = {
+	.release = kfd_topology_kobj_release,
 	.sysfs_ops = &node_ops,
 };
 
@@ -744,6 +498,7 @@ static void kfd_remove_sysfs_node_entry(struct kfd_topology_device *dev)
 	struct kfd_iolink_properties *iolink;
 	struct kfd_cache_properties *cache;
 	struct kfd_mem_properties *mem;
+	struct kfd_perf_properties *perf;
 
 	if (dev->kobj_iolink) {
 		list_for_each_entry(iolink, &dev->io_link_props, list)
@@ -780,6 +535,16 @@ static void kfd_remove_sysfs_node_entry(struct kfd_topology_device *dev)
 		dev->kobj_mem = NULL;
 	}
 
+	if (dev->kobj_perf) {
+		list_for_each_entry(perf, &dev->perf_props, list) {
+			kfree(perf->attr_group);
+			perf->attr_group = NULL;
+		}
+		kobject_del(dev->kobj_perf);
+		kobject_put(dev->kobj_perf);
+		dev->kobj_perf = NULL;
+	}
+
 	if (dev->kobj_node) {
 		sysfs_remove_file(dev->kobj_node, &dev->attr_gpuid);
 		sysfs_remove_file(dev->kobj_node, &dev->attr_name);
@@ -796,8 +561,10 @@ static int kfd_build_sysfs_node_entry(struct kfd_topology_device *dev,
 	struct kfd_iolink_properties *iolink;
 	struct kfd_cache_properties *cache;
 	struct kfd_mem_properties *mem;
+	struct kfd_perf_properties *perf;
 	int ret;
-	uint32_t i;
+	uint32_t i, num_attrs;
+	struct attribute **attrs;
 
 	if (WARN_ON(dev->kobj_node))
 		return -EEXIST;
@@ -826,6 +593,10 @@ static int kfd_build_sysfs_node_entry(struct kfd_topology_device *dev,
 	if (!dev->kobj_iolink)
 		return -ENOMEM;
 
+	dev->kobj_perf = kobject_create_and_add("perf", dev->kobj_node);
+	if (!dev->kobj_perf)
+		return -ENOMEM;
+
 	/*
 	 * Creating sysfs files for node properties
 	 */
@@ -903,11 +674,38 @@ static int kfd_build_sysfs_node_entry(struct kfd_topology_device *dev,
 		if (ret < 0)
 			return ret;
 		i++;
-}
+	}
+
+	/* All hardware blocks have the same number of attributes. */
+	num_attrs = sizeof(perf_attr_iommu)/sizeof(struct kfd_perf_attr);
+	list_for_each_entry(perf, &dev->perf_props, list) {
+		perf->attr_group = kzalloc(sizeof(struct kfd_perf_attr)
+			* num_attrs + sizeof(struct attribute_group),
+			GFP_KERNEL);
+		if (!perf->attr_group)
+			return -ENOMEM;
+
+		attrs = (struct attribute **)(perf->attr_group + 1);
+		if (!strcmp(perf->block_name, "iommu")) {
+		/* The IOMMU's num_counters and counter_ids are already
+		 * exposed under /sys/bus/event_source/devices/amd_iommu,
+		 * so we don't duplicate them here.
+		 */
+			perf_attr_iommu[0].data = perf->max_concurrent;
+			for (i = 0; i < num_attrs; i++)
+				attrs[i] = &perf_attr_iommu[i].attr.attr;
+		}
+		perf->attr_group->name = perf->block_name;
+		perf->attr_group->attrs = attrs;
+		ret = sysfs_create_group(dev->kobj_perf, perf->attr_group);
+		if (ret < 0)
+			return ret;
+	}
 
 	return 0;
 }
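
The allocation in the perf loop above packs the attribute_group and its pointer array into one zeroed block; sizing by struct kfd_perf_attr over-provisions the array, and the zero fill supplies the NULL terminator sysfs_create_group() requires. A tighter standalone sketch of the same idiom:

    #include <linux/slab.h>
    #include <linux/sysfs.h>

    /* Group and NULL-terminated pointer array in one zeroed allocation,
     * so a single kfree(grp) releases both.
     */
    static struct attribute_group *alloc_attr_group(size_t num_attrs)
    {
            struct attribute_group *grp;

            grp = kzalloc(sizeof(*grp) +
                          (num_attrs + 1) * sizeof(struct attribute *),
                          GFP_KERNEL);
            if (grp)
                    grp->attrs = (struct attribute **)(grp + 1);
            return grp;
    }
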
 
+/* Called with write topology lock acquired */
 static int kfd_build_sysfs_node_tree(void)
 {
 	struct kfd_topology_device *dev;
@@ -924,6 +722,7 @@ static int kfd_build_sysfs_node_tree(void)
 	return 0;
 }
 
+/* Called with write topology lock acquired */
 static void kfd_remove_sysfs_node_tree(void)
 {
 	struct kfd_topology_device *dev;
@@ -995,75 +794,246 @@ static void kfd_topology_release_sysfs(void)
 	}
 }
 
+/* Called with write topology_lock acquired */
+static void kfd_topology_update_device_list(struct list_head *temp_list,
+					struct list_head *master_list)
+{
+	while (!list_empty(temp_list)) {
+		list_move_tail(temp_list->next, master_list);
+		sys_props.num_devices++;
+	}
+}
+
+static void kfd_debug_print_topology(void)
+{
+	struct kfd_topology_device *dev;
+
+	down_read(&topology_lock);
+
+	dev = list_last_entry(&topology_device_list,
+			struct kfd_topology_device, list);
+	if (dev) {
+		if (dev->node_props.cpu_cores_count &&
+				dev->node_props.simd_count) {
+			pr_info("Topology: Add APU node [0x%0x:0x%0x]\n",
+				dev->node_props.device_id,
+				dev->node_props.vendor_id);
+		} else if (dev->node_props.cpu_cores_count)
+			pr_info("Topology: Add CPU node\n");
+		else if (dev->node_props.simd_count)
+			pr_info("Topology: Add dGPU node [0x%0x:0x%0x]\n",
+				dev->node_props.device_id,
+				dev->node_props.vendor_id);
+	}
+	up_read(&topology_lock);
+}
+
+/* Helper function for initializing the platform_xx members of
+ * kfd_system_properties. Uses OEM info from the last CPU/APU node.
+ */
+static void kfd_update_system_properties(void)
+{
+	struct kfd_topology_device *dev;
+
+	down_read(&topology_lock);
+	dev = list_last_entry(&topology_device_list,
+			struct kfd_topology_device, list);
+	if (dev) {
+		sys_props.platform_id =
+			(*((uint64_t *)dev->oem_id)) & CRAT_OEMID_64BIT_MASK;
+		sys_props.platform_oem = *((uint64_t *)dev->oem_table_id);
+		sys_props.platform_rev = dev->oem_revision;
+	}
+	up_read(&topology_lock);
+}
+
+static void find_system_memory(const struct dmi_header *dm,
+	void *private)
+{
+	struct kfd_mem_properties *mem;
+	u16 mem_width, mem_clock;
+	struct kfd_topology_device *kdev =
+		(struct kfd_topology_device *)private;
+	const u8 *dmi_data = (const u8 *)(dm + 1);
+
+	if (dm->type == DMI_ENTRY_MEM_DEVICE && dm->length >= 0x15) {
+		mem_width = (u16)(*(const u16 *)(dmi_data + 0x6));
+		mem_clock = (u16)(*(const u16 *)(dmi_data + 0x11));
+		list_for_each_entry(mem, &kdev->mem_props, list) {
+			if (mem_width != 0xFFFF && mem_width != 0)
+				mem->width = mem_width;
+			if (mem_clock != 0)
+				mem->mem_clk_max = mem_clock;
+		}
+	}
+}
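
On the magic numbers in find_system_memory() above -- my reading of the SMBIOS Type 17 (Memory Device) layout, worth double-checking against the spec:

	/*
	 *   dmi_data == (const u8 *)(dm + 1)    skips the 4-byte DMI header
	 *   dmi_data + 0x06  ->  struct offset 0x0A: Data Width (bits)
	 *   dmi_data + 0x11  ->  struct offset 0x15: Speed
	 *
	 * which matches the dm->length >= 0x15 guard; 0xFFFF ("unknown")
	 * and 0 are filtered out before patching the memory properties.
	 */
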
+
+/*
+ * Performance counter information is not part of CRAT, but we would like
+ * to expose it in sysfs under the topology directory for the Thunk to
+ * consume. This function is called before updating sysfs.
+ */
+static int kfd_add_perf_to_topology(struct kfd_topology_device *kdev)
+{
+	struct kfd_perf_properties *props;
+
+	if (amd_iommu_pc_supported()) {
+		props = kfd_alloc_struct(props);
+		if (!props)
+			return -ENOMEM;
+		strcpy(props->block_name, "iommu");
+		props->max_concurrent = amd_iommu_pc_get_max_banks(0) *
+			amd_iommu_pc_get_max_counters(0); /* assume one iommu */
+		list_add_tail(&props->list, &kdev->perf_props);
+	}
+
+	return 0;
+}
+
+/* kfd_add_non_crat_information - Add information that is not currently
+ *	defined in CRAT but is necessary for KFD topology
+ * @kdev - topology device to which the additional info is added
+ */
+static void kfd_add_non_crat_information(struct kfd_topology_device *kdev)
+{
+	/* Check if this is a CPU-only node. */
+	if (!kdev->gpu) {
+		/* Add system memory information */
+		dmi_walk(find_system_memory, kdev);
+	}
+	/* TODO: For GPU node, rearrange code from kfd_topology_add_device */
+}
+
+/* kfd_is_acpi_crat_invalid - CRAT from ACPI is valid only for AMD APU devices.
+ *	Ignore CRAT for all other devices. AMD APU is identified if both CPU
+ *	and GPU cores are present.
+ * @device_list - topology device list created by parsing ACPI CRAT table.
+ * @return - TRUE if invalid, FALSE if valid.
+ */
+static bool kfd_is_acpi_crat_invalid(struct list_head *device_list)
+{
+	struct kfd_topology_device *dev;
+
+	list_for_each_entry(dev, device_list, list) {
+		if (dev->node_props.cpu_cores_count &&
+			dev->node_props.simd_count)
+			return false;
+	}
+	pr_info("Ignoring ACPI CRAT on non-APU system\n");
+	return true;
+}
+
 int kfd_topology_init(void)
 {
 	void *crat_image = NULL;
 	size_t image_size = 0;
 	int ret;
-
-	/*
-	 * Initialize the head for the topology device list
+	struct list_head temp_topology_device_list;
+	int cpu_only_node = 0;
+	struct kfd_topology_device *kdev;
+	int proximity_domain;
+
+	/* topology_device_list - Master list of all topology devices
+	 * temp_topology_device_list - temporary list created while parsing CRAT
+	 * or VCRAT. Once parsing is complete, the contents of the list are
+	 * moved to topology_device_list
 	 */
+
+	/* Initialize the heads of both lists */
 	INIT_LIST_HEAD(&topology_device_list);
+	INIT_LIST_HEAD(&temp_topology_device_list);
 	init_rwsem(&topology_lock);
-	topology_crat_parsed = 0;
 
 	memset(&sys_props, 0, sizeof(sys_props));
 
+	/* Proximity domains in ACPI CRAT tables start counting at
+	 * 0. The same should be true for virtual CRAT tables created
+	 * at this stage. GPUs added later in kfd_topology_add_device
+	 * use a counter.
+	 */
+	proximity_domain = 0;
+
 	/*
-	 * Get the CRAT image from the ACPI
+	 * Get the CRAT image from the ACPI. If ACPI doesn't have one
+	 * or if ACPI CRAT is invalid create a virtual CRAT.
+	 * NOTE: The current implementation expects all AMD APUs to have
+	 *	a CRAT. If no CRAT is available, the node is assumed to be a CPU
 	 */
-	ret = kfd_topology_get_crat_acpi(crat_image, &image_size);
-	if (ret == 0 && image_size > 0) {
-		pr_info("Found CRAT image with size=%zd\n", image_size);
-		crat_image = kmalloc(image_size, GFP_KERNEL);
-		if (!crat_image) {
-			ret = -ENOMEM;
-			pr_err("No memory for allocating CRAT image\n");
-			goto err;
+	ret = kfd_create_crat_image_acpi(&crat_image, &image_size);
+	if (!ret) {
+		ret = kfd_parse_crat_table(crat_image,
+					   &temp_topology_device_list,
+					   proximity_domain);
+		if (ret ||
+		    kfd_is_acpi_crat_invalid(&temp_topology_device_list)) {
+			kfd_release_topology_device_list(
+				&temp_topology_device_list);
+			kfd_destroy_crat_image(crat_image);
+			crat_image = NULL;
 		}
-		ret = kfd_topology_get_crat_acpi(crat_image, &image_size);
-
-		if (ret == 0) {
-			down_write(&topology_lock);
-			ret = kfd_parse_crat_table(crat_image);
-			if (ret == 0)
-				ret = kfd_topology_update_sysfs();
-			up_write(&topology_lock);
-		} else {
-			pr_err("Couldn't get CRAT table size from ACPI\n");
+	}
+
+	if (!crat_image) {
+		ret = kfd_create_crat_image_virtual(&crat_image, &image_size,
+						    COMPUTE_UNIT_CPU, NULL,
+						    proximity_domain);
+		cpu_only_node = 1;
+		if (ret) {
+			pr_err("Error creating VCRAT table for CPU\n");
+			return ret;
 		}
-		kfree(crat_image);
-	} else if (ret == -ENODATA) {
-		ret = 0;
-	} else {
-		pr_err("Couldn't get CRAT table size from ACPI\n");
+
+		ret = kfd_parse_crat_table(crat_image,
+					   &temp_topology_device_list,
+					   proximity_domain);
+		if (ret) {
+			pr_err("Error parsing VCRAT table for CPU\n");
+			goto err;
+		}
+	}
+
+	kdev = list_first_entry(&temp_topology_device_list,
+				struct kfd_topology_device, list);
+	kfd_add_perf_to_topology(kdev);
+
+	down_write(&topology_lock);
+	kfd_topology_update_device_list(&temp_topology_device_list,
+					&topology_device_list);
+	atomic_set(&topology_crat_proximity_domain, sys_props.num_devices-1);
+	ret = kfd_topology_update_sysfs();
+	up_write(&topology_lock);
+
+	if (!ret) {
+		sys_props.generation_count++;
+		kfd_update_system_properties();
+		kfd_debug_print_topology();
+		pr_info("Finished initializing topology\n");
+	} else
+		pr_err("Failed to update topology in sysfs ret=%d\n", ret);
+
+	/* For nodes with GPU, this information gets added
+	 * when GPU is detected (kfd_topology_add_device).
+	 */
+	if (cpu_only_node) {
+		/* Add additional information to CPU only node created above */
+		down_write(&topology_lock);
+		kdev = list_first_entry(&topology_device_list,
+				struct kfd_topology_device, list);
+		up_write(&topology_lock);
+		kfd_add_non_crat_information(kdev);
 	}
 
 err:
-	pr_info("Finished initializing topology ret=%d\n", ret);
+	kfd_destroy_crat_image(crat_image);
 	return ret;
 }
 
 void kfd_topology_shutdown(void)
 {
+	down_write(&topology_lock);
 	kfd_topology_release_sysfs();
 	kfd_release_live_view();
-}
-
-static void kfd_debug_print_topology(void)
-{
-	struct kfd_topology_device *dev;
-	uint32_t i = 0;
-
-	pr_info("DEBUG PRINT OF TOPOLOGY:");
-	list_for_each_entry(dev, &topology_device_list, list) {
-		pr_info("Node: %d\n", i);
-		pr_info("\tGPU assigned: %s\n", (dev->gpu ? "yes" : "no"));
-		pr_info("\tCPU count: %d\n", dev->node_props.cpu_cores_count);
-		pr_info("\tSIMD count: %d", dev->node_props.simd_count);
-		i++;
-	}
+	up_write(&topology_lock);
 }
 
 static uint32_t kfd_generate_gpu_id(struct kfd_dev *gpu)
@@ -1072,11 +1042,15 @@ static uint32_t kfd_generate_gpu_id(struct kfd_dev *gpu)
 	uint32_t buf[7];
 	uint64_t local_mem_size;
 	int i;
+	struct kfd_local_mem_info local_mem_info;
 
 	if (!gpu)
 		return 0;
 
-	local_mem_size = gpu->kfd2kgd->get_vmem_size(gpu->kgd);
+	gpu->kfd2kgd->get_local_mem_info(gpu->kgd, &local_mem_info);
+
+	local_mem_size = local_mem_info.local_mem_size_private +
+			local_mem_info.local_mem_size_public;
 
 	buf[0] = gpu->pdev->devfn;
 	buf[1] = gpu->pdev->subsystem_vendor;
@@ -1091,19 +1065,26 @@ static uint32_t kfd_generate_gpu_id(struct kfd_dev *gpu)
 
 	return hashout;
 }
-
+/* kfd_assign_gpu - Attach @gpu to the correct kfd topology device. If
+ *		the GPU device is not already present in the topology device
+ *		list then return NULL. This means a new topology device has to
+ *		be created for this GPU.
+ * TODO: Rather than assigning @gpu to the first topology device without
+ *		a GPU attached, it would be better to have a more stringent check.
+ */
 static struct kfd_topology_device *kfd_assign_gpu(struct kfd_dev *gpu)
 {
 	struct kfd_topology_device *dev;
 	struct kfd_topology_device *out_dev = NULL;
 
+	down_write(&topology_lock);
 	list_for_each_entry(dev, &topology_device_list, list)
 		if (!dev->gpu && (dev->node_props.simd_count > 0)) {
 			dev->gpu = gpu;
 			out_dev = dev;
 			break;
 		}
-
+	up_write(&topology_lock);
 	return out_dev;
 }
 
@@ -1115,84 +1096,196 @@ static void kfd_notify_gpu_change(uint32_t gpu_id, int arrival)
 	 */
 }
 
+/* kfd_fill_mem_clk_max_info - Since CRAT doesn't have memory clock info,
+ *		patch it in after CRAT parsing.
+ */
+static void kfd_fill_mem_clk_max_info(struct kfd_topology_device *dev)
+{
+	struct kfd_mem_properties *mem;
+	struct kfd_local_mem_info local_mem_info;
+
+	if (!dev)
+		return;
+
+	/* Currently, the amdgpu driver (amdgpu_mc) deals only with GPUs that
+	 * have a single bank of VRAM local memory.
+	 * For dGPUs - VCRAT reports only one bank of local memory.
+	 * For APUs - If the CRAT from ACPI reports more than one bank, then
+	 *	all the banks will report the same mem_clk_max information.
+	 */
+	dev->gpu->kfd2kgd->get_local_mem_info(dev->gpu->kgd,
+		&local_mem_info);
+
+	list_for_each_entry(mem, &dev->mem_props, list)
+		mem->mem_clk_max = local_mem_info.mem_clk_max;
+}
+
+static void kfd_fill_iolink_non_crat_info(struct kfd_topology_device *dev)
+{
+	struct kfd_iolink_properties *link;
+
+	if (!dev || !dev->gpu)
+		return;
+
+	/* GPU only creates direct links, so apply the flag settings to all */
+	if (dev->gpu->device_info->asic_family == CHIP_HAWAII)
+		list_for_each_entry(link, &dev->io_link_props, list)
+			link->flags = CRAT_IOLINK_FLAGS_ENABLED |
+				CRAT_IOLINK_FLAGS_NO_ATOMICS_32_BIT |
+				CRAT_IOLINK_FLAGS_NO_ATOMICS_64_BIT;
+}
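
On the consumer side these flags gate per-link feature use. A minimal decode
sketch (the `link` iterator is as above; `use_pcie_atomics` is an illustrative
local, not a field from this patch):

	/* Usable only when the link is enabled and 64-bit atomics are not
	 * flagged as unsupported (Hawaii sets both NO_ATOMICS flags above).
	 */
	bool use_pcie_atomics =
		(link->flags & CRAT_IOLINK_FLAGS_ENABLED) &&
		!(link->flags & CRAT_IOLINK_FLAGS_NO_ATOMICS_64_BIT);
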
+
 int kfd_topology_add_device(struct kfd_dev *gpu)
 {
 	uint32_t gpu_id;
 	struct kfd_topology_device *dev;
-	int res;
+	struct kfd_cu_info cu_info;
+	int res = 0;
+	struct list_head temp_topology_device_list;
+	void *crat_image = NULL;
+	size_t image_size = 0;
+	int proximity_domain;
+
+	INIT_LIST_HEAD(&temp_topology_device_list);
 
 	gpu_id = kfd_generate_gpu_id(gpu);
 
 	pr_debug("Adding new GPU (ID: 0x%x) to topology\n", gpu_id);
 
-	down_write(&topology_lock);
-	/*
-	 * Try to assign the GPU to existing topology device (generated from
-	 * CRAT table
+	proximity_domain = atomic_inc_return(&topology_crat_proximity_domain);
+
+	/* Check to see if this gpu device exists in the topology_device_list.
+	 * If so, assign the gpu to that device;
+	 * else create a virtual CRAT for this gpu device and then parse that
+	 * CRAT to create a new topology device. Once created, assign the gpu
+	 * to that topology device.
 	 */
 	dev = kfd_assign_gpu(gpu);
 	if (!dev) {
-		pr_info("GPU was not found in the current topology. Extending.\n");
-		kfd_debug_print_topology();
-		dev = kfd_create_topology_device();
-		if (!dev) {
-			res = -ENOMEM;
+		res = kfd_create_crat_image_virtual(&crat_image, &image_size,
+						    COMPUTE_UNIT_GPU, gpu,
+						    proximity_domain);
+		if (res) {
+			pr_err("Error creating VCRAT for GPU (ID: 0x%x)\n",
+			       gpu_id);
+			return res;
+		}
+		res = kfd_parse_crat_table(crat_image,
+					   &temp_topology_device_list,
+					   proximity_domain);
+		if (res) {
+			pr_err("Error parsing VCRAT for GPU (ID: 0x%x)\n",
+			       gpu_id);
 			goto err;
 		}
-		dev->gpu = gpu;
 
-		/*
-		 * TODO: Make a call to retrieve topology information from the
-		 * GPU vBIOS
-		 */
+		down_write(&topology_lock);
+		kfd_topology_update_device_list(&temp_topology_device_list,
+			&topology_device_list);
 
 		/* Update the SYSFS tree, since we added another topology
 		 * device
 		 */
-		if (kfd_topology_update_sysfs() < 0)
-			kfd_topology_release_sysfs();
-
+		res = kfd_topology_update_sysfs();
+		up_write(&topology_lock);
+
+		if (!res)
+			sys_props.generation_count++;
+		else
+			pr_err("Failed to update GPU (ID: 0x%x) to sysfs topology. res=%d\n",
+						gpu_id, res);
+		dev = kfd_assign_gpu(gpu);
+		if (WARN_ON(!dev)) {
+			res = -ENODEV;
+			goto err;
+		}
 	}
 
 	dev->gpu_id = gpu_id;
 	gpu->id = gpu_id;
+
+	/* TODO: Move the following lines to function
+	 *	kfd_add_non_crat_information
+	 */
+
+	/* Fill in additional information that is not available in CRAT but
+	 * is needed for the topology
+	 */
+
+	dev->gpu->kfd2kgd->get_cu_info(dev->gpu->kgd, &cu_info);
+	dev->node_props.simd_arrays_per_engine =
+		cu_info.num_shader_arrays_per_engine;
+
 	dev->node_props.vendor_id = gpu->pdev->vendor;
 	dev->node_props.device_id = gpu->pdev->device;
-	dev->node_props.location_id = (gpu->pdev->bus->number << 24) +
-			(gpu->pdev->devfn & 0xffffff);
-	/*
-	 * TODO: Retrieve max engine clock values from KGD
-	 */
+	dev->node_props.location_id = PCI_DEVID(gpu->pdev->bus->number,
+		gpu->pdev->devfn);
+	dev->node_props.max_engine_clk_fcompute =
+		dev->gpu->kfd2kgd->get_max_engine_clock_in_mhz(dev->gpu->kgd);
+	dev->node_props.max_engine_clk_ccompute =
+		cpufreq_quick_get_max(0) / 1000;
+
+	kfd_fill_mem_clk_max_info(dev);
+	kfd_fill_iolink_non_crat_info(dev);
+
+	switch (dev->gpu->device_info->asic_family) {
+	case CHIP_KAVERI:
+	case CHIP_HAWAII:
+	case CHIP_TONGA:
+		dev->node_props.capability |= ((HSA_CAP_DOORBELL_TYPE_PRE_1_0 <<
+			HSA_CAP_DOORBELL_TYPE_TOTALBITS_SHIFT) &
+			HSA_CAP_DOORBELL_TYPE_TOTALBITS_MASK);
+		break;
+	case CHIP_CARRIZO:
+	case CHIP_FIJI:
+	case CHIP_POLARIS10:
+	case CHIP_POLARIS11:
+		pr_debug("Adding doorbell packet type capability\n");
+		dev->node_props.capability |= ((HSA_CAP_DOORBELL_TYPE_1_0 <<
+			HSA_CAP_DOORBELL_TYPE_TOTALBITS_SHIFT) &
+			HSA_CAP_DOORBELL_TYPE_TOTALBITS_MASK);
+		break;
+	default:
+		WARN(1, "Unexpected ASIC family %u",
+		     dev->gpu->device_info->asic_family);
+	}
 
+	/* Fix errors in CZ CRAT.
+	 * simd_count: Carrizo CRAT reports wrong simd_count, probably
+	 *		because it doesn't consider masked out CUs
+	 * max_waves_per_simd: Carrizo reports wrong max_waves_per_simd
+	 * capability flag: Carrizo CRAT doesn't report IOMMU flags
+	 */
 	if (dev->gpu->device_info->asic_family == CHIP_CARRIZO) {
-		dev->node_props.capability |= HSA_CAP_DOORBELL_PACKET_TYPE;
-		pr_info("Adding doorbell packet type capability\n");
+		dev->node_props.simd_count =
+			cu_info.simd_per_cu * cu_info.cu_active_number;
+		dev->node_props.max_waves_per_simd = 10;
+		dev->node_props.capability |= HSA_CAP_ATS_PRESENT;
 	}
 
-	res = 0;
-
-err:
-	up_write(&topology_lock);
+	kfd_debug_print_topology();
 
-	if (res == 0)
+	if (!res)
 		kfd_notify_gpu_change(gpu_id, 1);
-
+err:
+	kfd_destroy_crat_image(crat_image);
 	return res;
 }
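
Since location_id is now built with PCI_DEVID() (bus number in bits 15:8,
devfn in bits 7:0), readers of the sysfs property can unpack it with the
standard macros from <linux/pci.h>. A sketch, where `props` is a hypothetical
pointer to the node properties:

	u8 bus  = PCI_BUS_NUM(props->location_id);      /* bits 15:8 */
	u8 slot = PCI_SLOT(props->location_id & 0xff);  /* devfn bits 7:3 */
	u8 func = PCI_FUNC(props->location_id & 0xff);  /* devfn bits 2:0 */
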
 
 int kfd_topology_remove_device(struct kfd_dev *gpu)
 {
-	struct kfd_topology_device *dev;
+	struct kfd_topology_device *dev, *tmp;
 	uint32_t gpu_id;
 	int res = -ENODEV;
 
 	down_write(&topology_lock);
 
-	list_for_each_entry(dev, &topology_device_list, list)
+	list_for_each_entry_safe(dev, tmp, &topology_device_list, list)
 		if (dev->gpu == gpu) {
 			gpu_id = dev->gpu_id;
 			kfd_remove_sysfs_node_entry(dev);
 			kfd_release_topology_device(dev);
+			sys_props.num_devices--;
 			res = 0;
 			if (kfd_topology_update_sysfs() < 0)
 				kfd_topology_release_sysfs();
@@ -1201,28 +1294,32 @@ int kfd_topology_remove_device(struct kfd_dev *gpu)
 
 	up_write(&topology_lock);
 
-	if (res == 0)
+	if (!res)
 		kfd_notify_gpu_change(gpu_id, 0);
 
 	return res;
 }
 
-/*
- * When idx is out of bounds, the function will return NULL
+/* kfd_topology_enum_kfd_devices - Enumerate through all devices in KFD
+ *	topology. If a GPU device is found at @idx, then a valid kfd_dev
+ *	pointer is returned through @kdev
+ * Return -	0: On success (@kdev will be NULL for non-GPU nodes)
+ *		-1: When the end of the list is reached
  */
-struct kfd_dev *kfd_topology_enum_kfd_devices(uint8_t idx)
+int kfd_topology_enum_kfd_devices(uint8_t idx, struct kfd_dev **kdev)
 {
 
 	struct kfd_topology_device *top_dev;
-	struct kfd_dev *device = NULL;
 	uint8_t device_idx = 0;
 
+	*kdev = NULL;
 	down_read(&topology_lock);
 
 	list_for_each_entry(top_dev, &topology_device_list, list) {
 		if (device_idx == idx) {
-			device = top_dev->gpu;
-			break;
+			*kdev = top_dev->gpu;
+			up_read(&topology_lock);
+			return 0;
 		}
 
 		device_idx++;
@@ -1230,6 +1327,88 @@ struct kfd_dev *kfd_topology_enum_kfd_devices(uint8_t idx)
 
 	up_read(&topology_lock);
 
-	return device;
+	return -1;
+}
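
With the new calling convention, callers walk the topology by index until -1
is returned. A minimal usage sketch (variable names are illustrative):

	struct kfd_dev *kdev;
	uint8_t idx = 0;

	/* 0 means a valid node was found; kdev stays NULL for CPU-only nodes */
	while (kfd_topology_enum_kfd_devices(idx, &kdev) == 0) {
		if (kdev)
			pr_debug("node %u: gpu_id 0x%x\n", idx, kdev->id);
		idx++;
	}
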
+
+static int kfd_cpumask_to_apic_id(const struct cpumask *cpumask)
+{
+	const struct cpuinfo_x86 *cpuinfo;
+	int first_cpu_of_numa_node;
+
+	if (!cpumask || cpumask == cpu_none_mask)
+		return -1;
+	first_cpu_of_numa_node = cpumask_first(cpumask);
+	if (first_cpu_of_numa_node >= nr_cpu_ids)
+		return -1;
+	cpuinfo = &cpu_data(first_cpu_of_numa_node);
 
+	return cpuinfo->apicid;
 }
+
+/* kfd_numa_node_to_apic_id - Returns the APIC ID of the first logical
+ *	processor of the given NUMA node (@numa_node_id)
+ * Return -1 on failure
+ */
+int kfd_numa_node_to_apic_id(int numa_node_id)
+{
+	if (numa_node_id == -1) {
+		pr_warn("Invalid NUMA Node. Use online CPU mask\n");
+		return kfd_cpumask_to_apic_id(cpu_online_mask);
+	}
+	return kfd_cpumask_to_apic_id(cpumask_of_node(numa_node_id));
+}
+
+#if defined(CONFIG_DEBUG_FS)
+
+int kfd_debugfs_hqds_by_device(struct seq_file *m, void *data)
+{
+	struct kfd_topology_device *dev;
+	unsigned int i = 0;
+	int r = 0;
+
+	down_read(&topology_lock);
+
+	list_for_each_entry(dev, &topology_device_list, list) {
+		if (!dev->gpu) {
+			i++;
+			continue;
+		}
+
+		seq_printf(m, "Node %u, gpu_id %x:\n", i++, dev->gpu->id);
+		r = dqm_debugfs_hqds(m, dev->gpu->dqm);
+		if (r)
+			break;
+	}
+
+	up_read(&topology_lock);
+
+	return r;
+}
+
+int kfd_debugfs_rls_by_device(struct seq_file *m, void *data)
+{
+	struct kfd_topology_device *dev;
+	unsigned int i = 0;
+	int r = 0;
+
+	down_read(&topology_lock);
+
+	list_for_each_entry(dev, &topology_device_list, list) {
+		if (!dev->gpu) {
+			i++;
+			continue;
+		}
+
+		seq_printf(m, "Node %u, gpu_id %x:\n", i++, dev->gpu->id);
+		r = pm_debugfs_runlist(m, &dev->gpu->dqm->packets);
+		if (r)
+			break;
+	}
+
+	up_read(&topology_lock);
+
+	return r;
+}
+
+#endif
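
These show functions are meant to be registered from the new kfd_debugfs.c.
A sketch of the generic seq_file wiring they imply - this is the standard
single_open() pattern under assumed names, not necessarily the exact
kfd_debugfs.c code:

	#include <linux/debugfs.h>
	#include <linux/module.h>
	#include <linux/seq_file.h>

	static struct dentry *debugfs_root;	/* assumed created elsewhere */

	static int kfd_debugfs_open(struct inode *inode, struct file *file)
	{
		int (*show)(struct seq_file *, void *) = inode->i_private;

		return single_open(file, show, NULL);
	}

	static const struct file_operations kfd_debugfs_fops = {
		.owner = THIS_MODULE,
		.open = kfd_debugfs_open,
		.read = seq_read,
		.llseek = seq_lseek,
		.release = single_release,
	};

	static void kfd_debugfs_init_sketch(void)	/* hypothetical */
	{
		/* e.g. expose <debugfs>/kfd/hqds via kfd_debugfs_hqds_by_device() */
		debugfs_create_file("hqds", S_IFREG | 0444, debugfs_root,
				    kfd_debugfs_hqds_by_device,
				    &kfd_debugfs_fops);
	}
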

+ 27 - 6
drivers/gpu/drm/amd/amdkfd/kfd_topology.h

@@ -39,8 +39,13 @@
 #define HSA_CAP_WATCH_POINTS_SUPPORTED		0x00000080
 #define HSA_CAP_WATCH_POINTS_TOTALBITS_MASK	0x00000f00
 #define HSA_CAP_WATCH_POINTS_TOTALBITS_SHIFT	8
-#define HSA_CAP_RESERVED			0xfffff000
-#define HSA_CAP_DOORBELL_PACKET_TYPE		0x00001000
+#define HSA_CAP_DOORBELL_TYPE_TOTALBITS_MASK	0x00003000
+#define HSA_CAP_DOORBELL_TYPE_TOTALBITS_SHIFT	12
+#define HSA_CAP_RESERVED			0xffffc000
+
+#define HSA_CAP_DOORBELL_TYPE_PRE_1_0		0x0
+#define HSA_CAP_DOORBELL_TYPE_1_0		0x1
+#define HSA_CAP_AQL_QUEUE_DOUBLE_MAP		0x00004000
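
The doorbell type is now a two-bit field inside the capability word rather
than a single flag bit, so consumers decode it with the mask/shift pair.
A sketch, with `props` standing in for a node's properties:

	uint32_t db_type = (props->capability &
			    HSA_CAP_DOORBELL_TYPE_TOTALBITS_MASK) >>
			   HSA_CAP_DOORBELL_TYPE_TOTALBITS_SHIFT;

	/* TYPE_1_0 is set for Carrizo/Fiji/Polaris in kfd_topology_add_device */
	bool new_doorbell_format = (db_type == HSA_CAP_DOORBELL_TYPE_1_0);
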
 
 struct kfd_node_properties {
 	uint32_t cpu_cores_count;
@@ -91,8 +96,6 @@ struct kfd_mem_properties {
 	struct attribute	attr;
 };
 
-#define KFD_TOPOLOGY_CPU_SIBLINGS 256
-
 #define HSA_CACHE_TYPE_DATA		0x00000001
 #define HSA_CACHE_TYPE_INSTRUCTION	0x00000002
 #define HSA_CACHE_TYPE_CPU		0x00000004
@@ -109,7 +112,7 @@ struct kfd_cache_properties {
 	uint32_t		cache_assoc;
 	uint32_t		cache_latency;
 	uint32_t		cache_type;
-	uint8_t			sibling_map[KFD_TOPOLOGY_CPU_SIBLINGS];
+	uint8_t			sibling_map[CRAT_SIBLINGMAP_SIZE];
 	struct kobject		*kobj;
 	struct attribute	attr;
 };
@@ -132,24 +135,36 @@ struct kfd_iolink_properties {
 	struct attribute	attr;
 };
 
+struct kfd_perf_properties {
+	struct list_head	list;
+	char			block_name[16];
+	uint32_t		max_concurrent;
+	struct attribute_group	*attr_group;
+};
+
 struct kfd_topology_device {
 	struct list_head		list;
 	uint32_t			gpu_id;
+	uint32_t			proximity_domain;
 	struct kfd_node_properties	node_props;
-	uint32_t			mem_bank_count;
 	struct list_head		mem_props;
 	uint32_t			cache_count;
 	struct list_head		cache_props;
 	uint32_t			io_link_count;
 	struct list_head		io_link_props;
+	struct list_head		perf_props;
 	struct kfd_dev			*gpu;
 	struct kobject			*kobj_node;
 	struct kobject			*kobj_mem;
 	struct kobject			*kobj_cache;
 	struct kobject			*kobj_iolink;
+	struct kobject			*kobj_perf;
 	struct attribute		attr_gpuid;
 	struct attribute		attr_name;
 	struct attribute		attr_props;
+	uint8_t				oem_id[CRAT_OEMID_LENGTH];
+	uint8_t				oem_table_id[CRAT_OEMTABLEID_LENGTH];
+	uint32_t			oem_revision;
 };
 
 struct kfd_system_properties {
@@ -164,6 +179,12 @@ struct kfd_system_properties {
 	struct attribute	attr_props;
 };
 
+struct kfd_topology_device *kfd_create_topology_device(
+		struct list_head *device_list);
+void kfd_release_topology_device_list(struct list_head *device_list);
 
+extern bool amd_iommu_pc_supported(void);
+extern u8 amd_iommu_pc_get_max_banks(u16 devid);
+extern u8 amd_iommu_pc_get_max_counters(u16 devid);
 
 #endif /* __KFD_TOPOLOGY_H__ */

+ 49 - 3
drivers/gpu/drm/amd/include/kgd_kfd_interface.h

@@ -46,6 +46,28 @@ enum kfd_preempt_type {
 	KFD_PREEMPT_TYPE_WAVEFRONT_RESET,
 };
 
+struct kfd_cu_info {
+	uint32_t num_shader_engines;
+	uint32_t num_shader_arrays_per_engine;
+	uint32_t num_cu_per_sh;
+	uint32_t cu_active_number;
+	uint32_t cu_ao_mask;
+	uint32_t simd_per_cu;
+	uint32_t max_waves_per_simd;
+	uint32_t wave_front_size;
+	uint32_t max_scratch_slots_per_cu;
+	uint32_t lds_size;
+	uint32_t cu_bitmap[4][4];
+};
+
+/* For getting GPU local memory information from KGD */
+struct kfd_local_mem_info {
+	uint64_t local_mem_size_private;
+	uint64_t local_mem_size_public;
+	uint32_t vram_width;
+	uint32_t mem_clk_max;
+};
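
kfd_generate_gpu_id() in kfd_topology.c already shows the intended use: total
VRAM is the sum of the private and public apertures. A fragment, assuming a
struct kfd_dev *kfd as elsewhere in amdkfd:

	struct kfd_local_mem_info mem_info;
	uint64_t total_vram;

	kfd->kfd2kgd->get_local_mem_info(kfd->kgd, &mem_info);
	total_vram = mem_info.local_mem_size_private +
		     mem_info.local_mem_size_public;
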
+
 enum kgd_memory_pool {
 	KGD_POOL_SYSTEM_CACHEABLE = 1,
 	KGD_POOL_SYSTEM_WRITECOMBINE = 2,
@@ -106,7 +128,7 @@ struct tile_config {
  *
  * @free_gtt_mem: Frees a buffer that was allocated on the gart aperture
  *
- * @get_vmem_size: Retrieves (physical) size of VRAM
+ * @get_local_mem_info: Retrieves information about GPU local memory
  *
  * @get_gpu_clock_counter: Retrieves GPU clock counter
  *
@@ -131,6 +153,12 @@ struct tile_config {
  * @hqd_sdma_load: Loads the SDMA mqd structure to a H/W SDMA hqd slot.
  * used only for no HWS mode.
  *
+ * @hqd_dump: Dumps CPC HQD registers to an array of address-value pairs.
+ * The array is allocated with kmalloc and must be freed with kfree by the
+ * caller.
+ *
+ * @hqd_sdma_dump: Dumps SDMA HQD registers to an array of address-value
+ * pairs. The array is allocated with kmalloc and must be freed with kfree
+ * by the caller.
+ *
 * @hqd_is_occupied: Checks if a hqd slot is occupied.
  *
  * @hqd_destroy: Destructs and preempts the queue assigned to that hqd slot.
@@ -147,6 +175,10 @@ struct tile_config {
  *
  * @get_tile_config: Returns GPU-specific tiling mode information
  *
+ * @get_cu_info: Retrieves activated CU info
+ *
+ * @get_vram_usage: Returns current VRAM usage
+ *
  * This structure contains function pointers to services that the kgd driver
  * provides to amdkfd driver.
  *
@@ -158,7 +190,8 @@ struct kfd2kgd_calls {
 
 	void (*free_gtt_mem)(struct kgd_dev *kgd, void *mem_obj);
 
-	uint64_t (*get_vmem_size)(struct kgd_dev *kgd);
+	void (*get_local_mem_info)(struct kgd_dev *kgd,
+			struct kfd_local_mem_info *mem_info);
 	uint64_t (*get_gpu_clock_counter)(struct kgd_dev *kgd);
 
 	uint32_t (*get_max_engine_clock_in_mhz)(struct kgd_dev *kgd);
@@ -184,7 +217,16 @@ struct kfd2kgd_calls {
 			uint32_t wptr_shift, uint32_t wptr_mask,
 			struct mm_struct *mm);
 
-	int (*hqd_sdma_load)(struct kgd_dev *kgd, void *mqd);
+	int (*hqd_sdma_load)(struct kgd_dev *kgd, void *mqd,
+			     uint32_t __user *wptr, struct mm_struct *mm);
+
+	int (*hqd_dump)(struct kgd_dev *kgd,
+			uint32_t pipe_id, uint32_t queue_id,
+			uint32_t (**dump)[2], uint32_t *n_regs);
+
+	int (*hqd_sdma_dump)(struct kgd_dev *kgd,
+			     uint32_t engine_id, uint32_t queue_id,
+			     uint32_t (**dump)[2], uint32_t *n_regs);
 
 	bool (*hqd_is_occupied)(struct kgd_dev *kgd, uint64_t queue_address,
 				uint32_t pipe_id, uint32_t queue_id);
@@ -224,6 +266,10 @@ struct kfd2kgd_calls {
 	void (*set_scratch_backing_va)(struct kgd_dev *kgd,
 				uint64_t va, uint32_t vmid);
 	int (*get_tile_config)(struct kgd_dev *kgd, struct tile_config *config);
+
+	void (*get_cu_info)(struct kgd_dev *kgd,
+			struct kfd_cu_info *cu_info);
+	uint64_t (*get_vram_usage)(struct kgd_dev *kgd);
 };
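
Per the kernel-doc above, @hqd_dump hands back a kmalloc'ed array of
{address, value} pairs that the caller must kfree. A consumer sketch (the
kgd handle and pipe/queue ids are assumed to be in scope):

	uint32_t (*dump)[2];
	uint32_t n_regs, i;
	int r;

	r = kfd2kgd->hqd_dump(kgd, pipe_id, queue_id, &dump, &n_regs);
	if (!r) {
		for (i = 0; i < n_regs; i++)
			pr_debug("reg 0x%08x = 0x%08x\n",
				 dump[i][0], dump[i][1]);
		kfree(dump);	/* ownership is with the caller */
	}
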
 
 /**

+ 2 - 0
drivers/gpu/drm/amd/include/vi_structs.h

@@ -153,6 +153,8 @@ struct vi_sdma_mqd {
 	uint32_t reserved_125;
 	uint32_t reserved_126;
 	uint32_t reserved_127;
+	uint32_t sdma_engine_id;
+	uint32_t sdma_queue_id;
 };
 
 struct vi_mqd {

+ 13 - 2
include/uapi/linux/kfd_ioctl.h

@@ -58,7 +58,8 @@ struct kfd_ioctl_create_queue_args {
 	__u64 eop_buffer_address;	/* to KFD */
 	__u64 eop_buffer_size;	/* to KFD */
 	__u64 ctx_save_restore_address; /* to KFD */
-	__u64 ctx_save_restore_size;	/* to KFD */
+	__u32 ctx_save_restore_size;	/* to KFD */
+	__u32 ctl_stack_size;		/* to KFD */
 };
 
 struct kfd_ioctl_destroy_queue_args {
@@ -261,6 +262,13 @@ struct kfd_ioctl_get_tile_config_args {
 	 */
 };
 
+struct kfd_ioctl_set_trap_handler_args {
+	uint64_t tba_addr;		/* to KFD */
+	uint64_t tma_addr;		/* to KFD */
+	uint32_t gpu_id;		/* to KFD */
+	uint32_t pad;
+};
+
 #define AMDKFD_IOCTL_BASE 'K'
 #define AMDKFD_IO(nr)			_IO(AMDKFD_IOCTL_BASE, nr)
 #define AMDKFD_IOR(nr, type)		_IOR(AMDKFD_IOCTL_BASE, nr, type)
@@ -321,7 +329,10 @@ struct kfd_ioctl_get_tile_config_args {
 #define AMDKFD_IOC_GET_TILE_CONFIG                                      \
 		AMDKFD_IOWR(0x12, struct kfd_ioctl_get_tile_config_args)
 
+#define AMDKFD_IOC_SET_TRAP_HANDLER		\
+		AMDKFD_IOW(0x13, struct kfd_ioctl_set_trap_handler_args)
+
 #define AMDKFD_COMMAND_START		0x01
-#define AMDKFD_COMMAND_END		0x13
+#define AMDKFD_COMMAND_END		0x14
 
 #endif
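
From userspace, the new trap-handler ioctl follows the usual /dev/kfd pattern.
A sketch, assuming kfd_fd is an open /dev/kfd descriptor and the TBA/TMA
buffers were set up beforehand (gpu_id comes from the topology sysfs nodes):

	#include <sys/ioctl.h>
	#include <linux/kfd_ioctl.h>

	struct kfd_ioctl_set_trap_handler_args args = {0};

	args.gpu_id   = gpu_id;		/* target GPU */
	args.tba_addr = tba_addr;	/* trap handler base address */
	args.tma_addr = tma_addr;	/* trap memory address */

	if (ioctl(kfd_fd, AMDKFD_IOC_SET_TRAP_HANDLER, &args) == -1)
		perror("AMDKFD_IOC_SET_TRAP_HANDLER");
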