@@ -22,11 +22,11 @@ static int throtl_quantum = 32;
#define DFL_THROTL_SLICE_HD (HZ / 10)
#define DFL_THROTL_SLICE_SSD (HZ / 50)
#define MAX_THROTL_SLICE (HZ)
-#define DFL_IDLE_THRESHOLD_SSD (1000L) /* 1 ms */
-#define DFL_IDLE_THRESHOLD_HD (100L * 1000) /* 100 ms */
#define MAX_IDLE_TIME (5L * 1000 * 1000) /* 5 s */
-/* default latency target is 0, eg, guarantee IO latency by default */
-#define DFL_LATENCY_TARGET (0)
+#define MIN_THROTL_BPS (320 * 1024)
+#define MIN_THROTL_IOPS (10)
+#define DFL_LATENCY_TARGET (-1L)
+#define DFL_IDLE_THRESHOLD (0)

#define SKIP_LATENCY (((u64)1) << BLK_STAT_RES_SHIFT)
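The replaced defaults are worth noting: DFL_LATENCY_TARGET is now -1L and DFL_IDLE_THRESHOLD is 0, and both act as "not configured" sentinels rather than usable settings (previously a 0 target meant "guarantee IO latency by default"). Because the latency target is stored in an unsigned long, -1L converts to the maximum unsigned value, so the sentinel can never collide with a real user-supplied target. A standalone userspace sketch of that conversion (illustration only, not kernel code):

    /* The -1L sentinel stored in an unsigned field compares reliably. */
    #include <assert.h>
    #include <limits.h>

    #define DFL_LATENCY_TARGET (-1L)

    int main(void)
    {
            unsigned long latency_target = DFL_LATENCY_TARGET; /* becomes ULONG_MAX */

            assert(latency_target == ULONG_MAX);
            assert(latency_target == (unsigned long)DFL_LATENCY_TARGET);
            return 0;
    }

MIN_THROTL_BPS (320 KiB/s) and MIN_THROTL_IOPS (10) are floors used below so a group whose low limit is unset in one dimension still makes forward progress.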
@@ -157,6 +157,7 @@ struct throtl_grp {
unsigned long last_check_time;

unsigned long latency_target; /* us */
+ unsigned long latency_target_conf; /* us */
/* When did we start a new slice */
unsigned long slice_start[2];
unsigned long slice_end[2];
@@ -165,6 +166,7 @@ struct throtl_grp {
unsigned long checked_last_finish_time; /* ns / 1024 */
unsigned long avg_idletime; /* ns / 1024 */
unsigned long idletime_threshold; /* us */
+ unsigned long idletime_threshold_conf; /* us */

unsigned int bio_cnt; /* total bios */
unsigned int bad_bio_cnt; /* bios exceeding latency threshold */
@@ -201,8 +203,6 @@ struct throtl_data
unsigned int limit_index;
bool limit_valid[LIMIT_CNT];

- unsigned long dft_idletime_threshold; /* us */
-
unsigned long low_upgrade_time;
unsigned long low_downgrade_time;

@@ -294,8 +294,14 @@ static uint64_t tg_bps_limit(struct throtl_grp *tg, int rw)

td = tg->td;
ret = tg->bps[rw][td->limit_index];
- if (ret == 0 && td->limit_index == LIMIT_LOW)
- return tg->bps[rw][LIMIT_MAX];
+ if (ret == 0 && td->limit_index == LIMIT_LOW) {
+ /* intermediate node or iops isn't 0 */
+ if (!list_empty(&blkg->blkcg->css.children) ||
+ tg->iops[rw][td->limit_index])
+ return U64_MAX;
+ else
+ return MIN_THROTL_BPS;
+ }

if (td->limit_index == LIMIT_MAX && tg->bps[rw][LIMIT_LOW] &&
tg->bps[rw][LIMIT_LOW] != tg->bps[rw][LIMIT_MAX]) {
@@ -315,10 +321,17 @@ static unsigned int tg_iops_limit(struct throtl_grp *tg, int rw)

if (cgroup_subsys_on_dfl(io_cgrp_subsys) && !blkg->parent)
return UINT_MAX;
+
td = tg->td;
ret = tg->iops[rw][td->limit_index];
- if (ret == 0 && tg->td->limit_index == LIMIT_LOW)
- return tg->iops[rw][LIMIT_MAX];
+ if (ret == 0 && tg->td->limit_index == LIMIT_LOW) {
+ /* intermediate node or bps isn't 0 */
+ if (!list_empty(&blkg->blkcg->css.children) ||
+ tg->bps[rw][td->limit_index])
+ return UINT_MAX;
+ else
+ return MIN_THROTL_IOPS;
+ }

if (td->limit_index == LIMIT_MAX && tg->iops[rw][LIMIT_LOW] &&
tg->iops[rw][LIMIT_LOW] != tg->iops[rw][LIMIT_MAX]) {
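Both limit helpers now handle an unconfigured (zero) LIMIT_LOW value the same way: an intermediate cgroup (its blkcg has children), or one that configured the other dimension, is treated as unlimited for this dimension, while a leaf with neither dimension set falls back to the MIN_THROTL_* floor instead of inheriting LIMIT_MAX. A standalone sketch of the bps-side decision (simplified stand-ins for the kernel types, illustration only):

    #include <stdbool.h>
    #include <stdint.h>

    #define MIN_THROTL_BPS (320 * 1024)
    #define BPS_UNLIMITED ((uint64_t)~0ULL) /* stands in for U64_MAX */

    /* Mirrors the new branch in tg_bps_limit() when td->limit_index == LIMIT_LOW. */
    static uint64_t low_bps_limit(uint64_t bps_low, unsigned int iops_low,
                                  bool blkcg_has_children)
    {
            if (bps_low)                          /* explicitly configured */
                    return bps_low;
            if (blkcg_has_children || iops_low)   /* intermediate node, or iops set */
                    return BPS_UNLIMITED;         /* don't throttle bps here */
            return MIN_THROTL_BPS;                /* leaf: small floor, never starved */
    }

tg_iops_limit() is symmetric, checking bps instead and returning UINT_MAX or MIN_THROTL_IOPS.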
@@ -482,6 +495,9 @@ static struct blkg_policy_data *throtl_pd_alloc(gfp_t gfp, int node)
/* LIMIT_LOW will have default value 0 */

tg->latency_target = DFL_LATENCY_TARGET;
+ tg->latency_target_conf = DFL_LATENCY_TARGET;
+ tg->idletime_threshold = DFL_IDLE_THRESHOLD;
+ tg->idletime_threshold_conf = DFL_IDLE_THRESHOLD;

return &tg->pd;
}
@@ -510,8 +526,6 @@ static void throtl_pd_init(struct blkg_policy_data *pd)
if (cgroup_subsys_on_dfl(io_cgrp_subsys) && blkg->parent)
sq->parent_sq = &blkg_to_tg(blkg->parent)->service_queue;
tg->td = td;
-
- tg->idletime_threshold = td->dft_idletime_threshold;
}

/*
@@ -1349,7 +1363,7 @@ static int tg_print_conf_uint(struct seq_file *sf, void *v)
return 0;
}

-static void tg_conf_updated(struct throtl_grp *tg)
+static void tg_conf_updated(struct throtl_grp *tg, bool global)
{
struct throtl_service_queue *sq = &tg->service_queue;
struct cgroup_subsys_state *pos_css;
@@ -1367,8 +1381,26 @@ static void tg_conf_updated(struct throtl_grp *tg)
* restrictions in the whole hierarchy and allows them to bypass
* blk-throttle.
*/
- blkg_for_each_descendant_pre(blkg, pos_css, tg_to_blkg(tg))
- tg_update_has_rules(blkg_to_tg(blkg));
+ blkg_for_each_descendant_pre(blkg, pos_css,
+ global ? tg->td->queue->root_blkg : tg_to_blkg(tg)) {
+ struct throtl_grp *this_tg = blkg_to_tg(blkg);
+ struct throtl_grp *parent_tg;
+
+ tg_update_has_rules(this_tg);
+ /* ignore root/second level */
+ if (!cgroup_subsys_on_dfl(io_cgrp_subsys) || !blkg->parent ||
+ !blkg->parent->parent)
+ continue;
+ parent_tg = blkg_to_tg(blkg->parent);
+ /*
+ * make sure all children have a lower idle time threshold and
+ * a higher latency target
+ */
+ this_tg->idletime_threshold = min(this_tg->idletime_threshold,
+ parent_tg->idletime_threshold);
+ this_tg->latency_target = max(this_tg->latency_target,
+ parent_tg->latency_target);
+ }

/*
* We're already holding queue_lock and know @tg is valid. Let's
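When called with global == true (a LIMIT_LOW write that can change validity for the whole tree, see tg_set_limit below), the walk starts from the queue's root_blkg instead of the changed group. Because blkg_for_each_descendant_pre() visits a parent before its children, clamping each group against its parent propagates constraints all the way down: a child may never idle longer, nor demand a looser latency target, than its parent. A standalone sketch of the clamp (illustration only, simplified types):

    struct grp {
            unsigned long idletime_threshold;  /* us */
            unsigned long latency_target;      /* us */
    };

    /* The parent is already clamped when the pre-order walk reaches the child. */
    static void clamp_to_parent(struct grp *child, const struct grp *parent)
    {
            if (child->idletime_threshold > parent->idletime_threshold)
                    child->idletime_threshold = parent->idletime_threshold;
            if (child->latency_target < parent->latency_target)
                    child->latency_target = parent->latency_target;
    }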
@@ -1413,7 +1445,7 @@ static ssize_t tg_set_conf(struct kernfs_open_file *of,
else
*(unsigned int *)((void *)tg + of_cft(of)->private) = v;

- tg_conf_updated(tg);
+ tg_conf_updated(tg, false);
ret = 0;
out_finish:
blkg_conf_finish(&ctx);
@@ -1497,34 +1529,34 @@ static u64 tg_prfill_limit(struct seq_file *sf, struct blkg_policy_data *pd,
tg->iops_conf[READ][off] == iops_dft &&
tg->iops_conf[WRITE][off] == iops_dft &&
(off != LIMIT_LOW ||
- (tg->idletime_threshold == tg->td->dft_idletime_threshold &&
- tg->latency_target == DFL_LATENCY_TARGET)))
+ (tg->idletime_threshold_conf == DFL_IDLE_THRESHOLD &&
+ tg->latency_target_conf == DFL_LATENCY_TARGET)))
return 0;

- if (tg->bps_conf[READ][off] != bps_dft)
+ if (tg->bps_conf[READ][off] != U64_MAX)
snprintf(bufs[0], sizeof(bufs[0]), "%llu",
tg->bps_conf[READ][off]);
- if (tg->bps_conf[WRITE][off] != bps_dft)
+ if (tg->bps_conf[WRITE][off] != U64_MAX)
snprintf(bufs[1], sizeof(bufs[1]), "%llu",
tg->bps_conf[WRITE][off]);
- if (tg->iops_conf[READ][off] != iops_dft)
+ if (tg->iops_conf[READ][off] != UINT_MAX)
snprintf(bufs[2], sizeof(bufs[2]), "%u",
tg->iops_conf[READ][off]);
- if (tg->iops_conf[WRITE][off] != iops_dft)
+ if (tg->iops_conf[WRITE][off] != UINT_MAX)
snprintf(bufs[3], sizeof(bufs[3]), "%u",
tg->iops_conf[WRITE][off]);
if (off == LIMIT_LOW) {
- if (tg->idletime_threshold == ULONG_MAX)
+ if (tg->idletime_threshold_conf == ULONG_MAX)
strcpy(idle_time, " idle=max");
else
snprintf(idle_time, sizeof(idle_time), " idle=%lu",
- tg->idletime_threshold);
+ tg->idletime_threshold_conf);

- if (tg->latency_target == ULONG_MAX)
+ if (tg->latency_target_conf == ULONG_MAX)
strcpy(latency_time, " latency=max");
else
snprintf(latency_time, sizeof(latency_time),
- " latency=%lu", tg->latency_target);
+ " latency=%lu", tg->latency_target_conf);
}

seq_printf(sf, "%s rbps=%s wbps=%s riops=%s wiops=%s%s%s\n",
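With the switch to the *_conf fields, io.low echoes back what the user actually wrote rather than the hierarchy-clamped effective values. Assuming the bufs strings default to "max" earlier in this function (that initialization is outside this hunk), unset bps/iops knobs read back as "max" while idle and latency show the configured numbers. A read of the file would then produce a line roughly like this (device number and values are hypothetical):

    8:16 rbps=2097152 wbps=max riops=max wiops=max idle=200 latency=4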
@@ -1563,8 +1595,8 @@ static ssize_t tg_set_limit(struct kernfs_open_file *of,
v[2] = tg->iops_conf[READ][index];
v[3] = tg->iops_conf[WRITE][index];

- idle_time = tg->idletime_threshold;
- latency_time = tg->latency_target;
+ idle_time = tg->idletime_threshold_conf;
+ latency_time = tg->latency_target_conf;
while (true) {
char tok[27]; /* wiops=18446744073709551616 */
char *p;
@@ -1623,17 +1655,33 @@ static ssize_t tg_set_limit(struct kernfs_open_file *of,
tg->iops_conf[READ][LIMIT_MAX]);
tg->iops[WRITE][LIMIT_LOW] = min(tg->iops_conf[WRITE][LIMIT_LOW],
tg->iops_conf[WRITE][LIMIT_MAX]);
+ tg->idletime_threshold_conf = idle_time;
+ tg->latency_target_conf = latency_time;
+
+ /* force user to configure all settings for low limit */
+ if (!(tg->bps[READ][LIMIT_LOW] || tg->iops[READ][LIMIT_LOW] ||
+ tg->bps[WRITE][LIMIT_LOW] || tg->iops[WRITE][LIMIT_LOW]) ||
+ tg->idletime_threshold_conf == DFL_IDLE_THRESHOLD ||
+ tg->latency_target_conf == DFL_LATENCY_TARGET) {
+ tg->bps[READ][LIMIT_LOW] = 0;
+ tg->bps[WRITE][LIMIT_LOW] = 0;
+ tg->iops[READ][LIMIT_LOW] = 0;
+ tg->iops[WRITE][LIMIT_LOW] = 0;
+ tg->idletime_threshold = DFL_IDLE_THRESHOLD;
+ tg->latency_target = DFL_LATENCY_TARGET;
+ } else if (index == LIMIT_LOW) {
+ tg->idletime_threshold = tg->idletime_threshold_conf;
+ tg->latency_target = tg->latency_target_conf;
+ }

- if (index == LIMIT_LOW) {
- blk_throtl_update_limit_valid(tg->td);
- if (tg->td->limit_valid[LIMIT_LOW])
+ blk_throtl_update_limit_valid(tg->td);
+ if (tg->td->limit_valid[LIMIT_LOW]) {
+ if (index == LIMIT_LOW)
tg->td->limit_index = LIMIT_LOW;
- tg->idletime_threshold = (idle_time == ULONG_MAX) ?
- ULONG_MAX : idle_time;
- tg->latency_target = (latency_time == ULONG_MAX) ?
- ULONG_MAX : latency_time;
- }
- tg_conf_updated(tg);
+ } else
+ tg->td->limit_index = LIMIT_MAX;
+ tg_conf_updated(tg, index == LIMIT_LOW &&
+ tg->td->limit_valid[LIMIT_LOW]);
ret = 0;
out_finish:
blkg_conf_finish(&ctx);
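The effect of the new validation: a low limit only arms when at least one of the four bps/iops knobs is nonzero and both idle and latency are configured; anything less zeroes the whole LIMIT_LOW configuration, and the queue stays at (or returns to) LIMIT_MAX. Illustrative writes to a group's io.low file (hypothetical device number and values):

    8:16 rbps=2097152                      <- incomplete: no idle/latency, low limit stays off
    8:16 rbps=2097152 idle=200 latency=4   <- complete: LIMIT_LOW becomes valid and selectable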
@@ -1722,17 +1770,25 @@ static bool throtl_tg_is_idle(struct throtl_grp *tg)
/*
* cgroup is idle if:
* - single idle is too long, longer than a fixed value (in case user
- * configure a too big threshold) or 4 times of slice
+ * configures too big a threshold) or 4 times the idletime threshold
* - average think time is more than threshold
* - IO latency is largely below threshold
*/
- unsigned long time = jiffies_to_usecs(4 * tg->td->throtl_slice);
-
- time = min_t(unsigned long, MAX_IDLE_TIME, time);
- return (ktime_get_ns() >> 10) - tg->last_finish_time > time ||
- tg->avg_idletime > tg->idletime_threshold ||
- (tg->latency_target && tg->bio_cnt &&
+ unsigned long time;
+ bool ret;
+
+ time = min_t(unsigned long, MAX_IDLE_TIME, 4 * tg->idletime_threshold);
+ ret = tg->latency_target == DFL_LATENCY_TARGET ||
+ tg->idletime_threshold == DFL_IDLE_THRESHOLD ||
+ (ktime_get_ns() >> 10) - tg->last_finish_time > time ||
+ tg->avg_idletime > tg->idletime_threshold ||
+ (tg->latency_target && tg->bio_cnt &&
tg->bad_bio_cnt * 5 < tg->bio_cnt);
+ throtl_log(&tg->service_queue,
+ "avg_idle=%ld, idle_threshold=%ld, bad_bio=%d, total_bio=%d, is_idle=%d, scale=%d",
+ tg->avg_idletime, tg->idletime_threshold, tg->bad_bio_cnt,
+ tg->bio_cnt, ret, tg->td->scale);
+ return ret;
}

static bool throtl_tg_can_upgrade(struct throtl_grp *tg)
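Two behavioral changes hide in this hunk: a group with an unconfigured latency target or idle threshold now always reports idle (so it cannot pin the hierarchy at LIMIT_LOW), and the "single idle too long" cutoff is derived from the group's own idletime_threshold instead of the throttle slice. A standalone condensation of the test (simplified types, sentinels folded into booleans, illustration only):

    #include <stdbool.h>

    #define MAX_IDLE_TIME (5L * 1000 * 1000)   /* 5 s, in us */

    struct grp {
            bool latency_unset, idle_unset;    /* the DFL_* sentinel checks */
            unsigned long idletime_threshold;  /* us */
            unsigned long avg_idletime;        /* us */
            unsigned long since_last_finish;   /* us since last IO completion */
            unsigned int bio_cnt, bad_bio_cnt;
    };

    static bool tg_is_idle(const struct grp *tg)
    {
            unsigned long time = 4 * tg->idletime_threshold;

            if (time > MAX_IDLE_TIME)
                    time = MAX_IDLE_TIME;
            return tg->latency_unset || tg->idle_unset ||
                   tg->since_last_finish > time ||
                   tg->avg_idletime > tg->idletime_threshold ||
                   /* fewer than a fifth of IOs missed the latency target */
                   (tg->bio_cnt && tg->bad_bio_cnt * 5 < tg->bio_cnt);
    }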
@@ -1828,6 +1884,7 @@ static void throtl_upgrade_state(struct throtl_data *td)
struct cgroup_subsys_state *pos_css;
struct blkcg_gq *blkg;

+ throtl_log(&td->service_queue, "upgrade to max");
td->limit_index = LIMIT_MAX;
td->low_upgrade_time = jiffies;
td->scale = 0;
@@ -1850,6 +1907,7 @@ static void throtl_downgrade_state(struct throtl_data *td, int new)
{
td->scale /= 2;

+ throtl_log(&td->service_queue, "downgrade, scale %d", td->scale);
if (td->scale) {
td->low_upgrade_time = jiffies - td->scale * td->throtl_slice;
return;
@@ -2023,6 +2081,11 @@ static void throtl_update_latency_buckets(struct throtl_data *td)
td->avg_buckets[i].valid = true;
last_latency = td->avg_buckets[i].latency;
}
+
+ for (i = 0; i < LATENCY_BUCKET_SIZE; i++)
+ throtl_log(&td->service_queue,
+ "Latency bucket %d: latency=%ld, valid=%d", i,
+ td->avg_buckets[i].latency, td->avg_buckets[i].valid);
}
#else
static inline void throtl_update_latency_buckets(struct throtl_data *td)
@@ -2354,19 +2417,14 @@ void blk_throtl_exit(struct request_queue *q)
void blk_throtl_register_queue(struct request_queue *q)
{
struct throtl_data *td;
- struct cgroup_subsys_state *pos_css;
- struct blkcg_gq *blkg;

td = q->td;
BUG_ON(!td);

- if (blk_queue_nonrot(q)) {
+ if (blk_queue_nonrot(q))
td->throtl_slice = DFL_THROTL_SLICE_SSD;
- td->dft_idletime_threshold = DFL_IDLE_THRESHOLD_SSD;
- } else {
+ else
td->throtl_slice = DFL_THROTL_SLICE_HD;
- td->dft_idletime_threshold = DFL_IDLE_THRESHOLD_HD;
- }
#ifndef CONFIG_BLK_DEV_THROTTLING_LOW
/* if no low limit, use previous default */
td->throtl_slice = DFL_THROTL_SLICE_HD;
@@ -2375,18 +2433,6 @@ void blk_throtl_register_queue(struct request_queue *q)
td->track_bio_latency = !q->mq_ops && !q->request_fn;
if (!td->track_bio_latency)
blk_stat_enable_accounting(q);
-
- /*
- * some tg are created before queue is fully initialized, eg, nonrot
- * isn't initialized yet
- */
- rcu_read_lock();
- blkg_for_each_descendant_post(blkg, pos_css, q->root_blkg) {
- struct throtl_grp *tg = blkg_to_tg(blkg);
-
- tg->idletime_threshold = td->dft_idletime_threshold;
- }
- rcu_read_unlock();
}

#ifdef CONFIG_BLK_DEV_THROTTLING_LOW