@@ -235,56 +235,115 @@ free_chunk:
 	return ret;
 }
 
-/* Returns how many bytes TTM can move per IB.
+/* Convert microseconds to bytes. */
+static u64 us_to_bytes(struct amdgpu_device *adev, s64 us)
+{
+	if (us <= 0 || !adev->mm_stats.log2_max_MBps)
+		return 0;
+
+	/* Since accum_us is incremented by a million per second, just
+	 * multiply it by the number of MB/s to get the number of bytes.
+	 */
+	return us << adev->mm_stats.log2_max_MBps;
+}
+
+static s64 bytes_to_us(struct amdgpu_device *adev, u64 bytes)
+{
+	if (!adev->mm_stats.log2_max_MBps)
+		return 0;
+
+	return bytes >> adev->mm_stats.log2_max_MBps;
+}
+
+/* Returns how many bytes TTM can move right now. If no bytes can be moved,
+ * it returns 0. If it returns non-zero, it's OK to move at least one buffer,
+ * which means it can go over the threshold once. If that happens, the driver
+ * will be in debt and no other buffer migrations can be done until that debt
+ * is repaid.
+ *
+ * This approach allows moving a buffer of any size (it's important to allow
+ * that).
+ *
+ * The currency is simply time in microseconds and it increases as the clock
+ * ticks. The accumulated microseconds (us) are converted to bytes and
+ * returned.
  */
 static u64 amdgpu_cs_get_threshold_for_moves(struct amdgpu_device *adev)
 {
-	u64 real_vram_size = adev->mc.real_vram_size;
-	u64 vram_usage = atomic64_read(&adev->vram_usage);
+	s64 time_us, increment_us;
+	u64 max_bytes;
+	u64 free_vram, total_vram, used_vram;
 
-	/* This function is based on the current VRAM usage.
+	/* Allow a maximum of 200 accumulated ms. This is basically per-IB
+	 * throttling.
 	 *
-	 * - If all of VRAM is free, allow relocating the number of bytes that
-	 *   is equal to 1/4 of the size of VRAM for this IB.
+	 * It means that in order to get full max MBps, at least 5 IBs per
+	 * second must be submitted and not more than 200ms apart from each
+	 * other.
+	 */
+	const s64 us_upper_bound = 200000;
 
-	 * - If more than one half of VRAM is occupied, only allow relocating
-	 *   1 MB of data for this IB.
-	 *
-	 * - From 0 to one half of used VRAM, the threshold decreases
-	 *   linearly.
-	 *    __________________
-	 * 1/4 of -|\               |
-	 * VRAM    | \              |
-	 *         |  \             |
-	 *         |   \            |
-	 *         |    \           |
-	 *         |     \          |
-	 *         |      \         |
-	 *         |       \________|1 MB
-	 *         |----------------|
-	 *    VRAM 0 %             100 %
-	 *         used            used
-	 *
-	 * Note: It's a threshold, not a limit. The threshold must be crossed
-	 * for buffer relocations to stop, so any buffer of an arbitrary size
-	 * can be moved as long as the threshold isn't crossed before
-	 * the relocation takes place. We don't want to disable buffer
-	 * relocations completely.
+	if (!adev->mm_stats.log2_max_MBps)
+		return 0;
+
+	total_vram = adev->mc.real_vram_size - adev->vram_pin_size;
+	used_vram = atomic64_read(&adev->vram_usage);
+	free_vram = used_vram >= total_vram ? 0 : total_vram - used_vram;
+
+	spin_lock(&adev->mm_stats.lock);
+
+	/* Increase the amount of accumulated us. */
+	time_us = ktime_to_us(ktime_get());
+	increment_us = time_us - adev->mm_stats.last_update_us;
+	adev->mm_stats.last_update_us = time_us;
+	adev->mm_stats.accum_us = min(adev->mm_stats.accum_us + increment_us,
+				      us_upper_bound);
+
+	/* This prevents the short period of low performance when the VRAM
+	 * usage is low and the driver is in debt or doesn't have enough
+	 * accumulated us to fill VRAM quickly.
 	 *
-	 * The idea is that buffers should be placed in VRAM at creation time
-	 * and TTM should only do a minimum number of relocations during
-	 * command submission. In practice, you need to submit at least
-	 * a dozen IBs to move all buffers to VRAM if they are in GTT.
+	 * The situation can occur in these cases:
+	 * - a lot of VRAM is freed by userspace
+	 * - the presence of a big buffer causes a lot of evictions
+	 *   (solution: split buffers into smaller ones)
 	 *
-	 * Also, things can get pretty crazy under memory pressure and actual
-	 * VRAM usage can change a lot, so playing safe even at 50% does
-	 * consistently increase performance.
+	 * If 128 MB or 1/8th of VRAM is free, start filling it now by setting
+	 * accum_us to a positive number.
 	 */
+	if (free_vram >= 128 * 1024 * 1024 || free_vram >= total_vram / 8) {
+		s64 min_us;
+
+		/* Be more aggressive on dGPUs. Try to fill a portion of free
+		 * VRAM now.
+		 */
+		if (!(adev->flags & AMD_IS_APU))
+			min_us = bytes_to_us(adev, free_vram / 4);
+		else
+			min_us = 0; /* Reset accum_us on APUs. */
+
+		adev->mm_stats.accum_us = max(min_us, adev->mm_stats.accum_us);
+	}
 
-	u64 half_vram = real_vram_size >> 1;
-	u64 half_free_vram = vram_usage >= half_vram ? 0 : half_vram - vram_usage;
-	u64 bytes_moved_threshold = half_free_vram >> 1;
-	return max(bytes_moved_threshold, 1024*1024ull);
+	/* This returns 0 if the driver is in debt to disallow (optional)
+	 * buffer moves.
+	 */
+	max_bytes = us_to_bytes(adev, adev->mm_stats.accum_us);
+
+	spin_unlock(&adev->mm_stats.lock);
+	return max_bytes;
+}
+
+/* Report how many bytes have really been moved for the last command
+ * submission. This can result in a debt that can stop buffer migrations
+ * temporarily.
+ */
+static void amdgpu_cs_report_moved_bytes(struct amdgpu_device *adev,
+					 u64 num_bytes)
+{
+	spin_lock(&adev->mm_stats.lock);
+	adev->mm_stats.accum_us -= bytes_to_us(adev, num_bytes);
+	spin_unlock(&adev->mm_stats.lock);
 }
 
 static int amdgpu_cs_bo_validate(struct amdgpu_cs_parser *p,
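To make the conversion helpers above concrete: accum_us grows by one million per second of wall-clock time, and log2_max_MBps stores the throttle rate as a power-of-two MB/s value, so a single shift turns microseconds into bytes. A standalone userspace sketch of the arithmetic (not part of the patch; the 256 MB/s rate is an assumed example value):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/* Assumed example throttle rate: 256 MB/s -> log2_max_MBps = 8. */
	const unsigned int log2_max_MBps = 8;

	/* A fully accumulated 200 ms window, in microseconds. */
	int64_t accum_us = 200000;

	/* us << log2(MB/s) == us * MB/s; since accum_us gains 10^6 per
	 * second and 1 MB is roughly 10^6 bytes, the result is bytes. */
	uint64_t max_bytes = (uint64_t)accum_us << log2_max_MBps;

	/* Prints 51200000: one full window permits ~51 MB of moves. */
	printf("%llu bytes may be moved\n", (unsigned long long)max_bytes);
	return 0;
}

So a full 200 ms window at 256 MB/s allows roughly 51 MB of migrations for one IB, which sustains exactly the advertised rate as long as IBs arrive at least every 200 ms.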
@@ -297,15 +356,10 @@ static int amdgpu_cs_bo_validate(struct amdgpu_cs_parser *p,
 	if (bo->pin_count)
 		return 0;
 
-	/* Avoid moving this one if we have moved too many buffers
-	 * for this IB already.
-	 *
-	 * Note that this allows moving at least one buffer of
-	 * any size, because it doesn't take the current "bo"
-	 * into account. We don't want to disallow buffer moves
-	 * completely.
+	/* Don't move this buffer if we have depleted our allowance
+	 * to move it. Don't move anything if the threshold is zero.
 	 */
-	if (p->bytes_moved <= p->bytes_moved_threshold)
+	if (p->bytes_moved < p->bytes_moved_threshold)
 		domain = bo->prefered_domains;
 	else
 		domain = bo->allowed_domains;
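The change from <= to < is what lets a zero threshold forbid moves outright: under the old non-strict comparison, bytes_moved == bytes_moved_threshold == 0 still selected the preferred (move-triggering) domains. A condensed, hypothetical restatement of that decision, with plain integers standing in for the domain masks:

#include <stdint.h>

/* Hypothetical helper, not in the patch: choosing 'preferred' may
 * trigger a migration, choosing 'allowed' never does. */
static unsigned int pick_domain(uint64_t bytes_moved, uint64_t threshold,
				unsigned int preferred, unsigned int allowed)
{
	/* Strict '<': a zero threshold (driver in debt) always falls
	 * through to the non-moving allowed domains. */
	return bytes_moved < threshold ? preferred : allowed;
}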
@@ -494,6 +548,8 @@ static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser *p,
 			goto error_validate;
 	}
 
+	amdgpu_cs_report_moved_bytes(p->adev, p->bytes_moved);
+
 	fpriv->vm.last_eviction_counter =
 		atomic64_read(&p->adev->num_evictions);
 
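Taken together: each submission obtains a threshold from the accumulated time budget, validates buffers against it (moving at most one buffer past it), and then charges the real number of moved bytes back, possibly leaving a debt. A self-contained userspace simulation of that cycle; the 256 MB/s rate, the helper names, and the buffer sizes are illustrative only, not the kernel's:

#include <stdint.h>
#include <stdio.h>

#define LOG2_MAX_MBPS	8	/* assumed example rate: 256 MB/s */
#define US_UPPER_BOUND	200000	/* 200 ms cap, as in the patch */

static int64_t accum_us;	/* may go negative: that is the debt */

static uint64_t get_threshold(int64_t elapsed_us)
{
	accum_us += elapsed_us;
	if (accum_us > US_UPPER_BOUND)
		accum_us = US_UPPER_BOUND;
	return accum_us > 0 ? (uint64_t)accum_us << LOG2_MAX_MBPS : 0;
}

static void report_moved(uint64_t bytes)
{
	accum_us -= (int64_t)(bytes >> LOG2_MAX_MBPS);
}

int main(void)
{
	/* IB 1 arrives 100 ms after the last update: the threshold is
	 * 25600000 bytes, but one oversized 100 MiB move still goes
	 * through, overshooting the budget. */
	printf("IB1 threshold: %llu\n",
	       (unsigned long long)get_threshold(100000));
	report_moved(100ull << 20);
	printf("accum_us: %lld (in debt)\n", (long long)accum_us);

	/* IB 2 arrives 50 ms later: the debt has not been repaid, the
	 * threshold is 0 and no optional buffer moves are allowed. */
	printf("IB2 threshold: %llu\n",
	       (unsigned long long)get_threshold(50000));
	return 0;
}

Here the single oversized move suppresses further migrations until about 260 ms of additional wall-clock time has accumulated, which is the debt-repayment behavior the first hunk's comment describes.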