|
@@ -140,6 +140,13 @@ struct throtl_grp {
|
|
|
/* Number of bio's dispatched in current slice */
|
|
|
unsigned int io_disp[2];
|
|
|
|
|
|
+ unsigned long last_low_overflow_time[2];
|
|
|
+
|
|
|
+ uint64_t last_bytes_disp[2];
|
|
|
+ unsigned int last_io_disp[2];
|
|
|
+
|
|
|
+ unsigned long last_check_time;
|
|
|
+
|
|
|
/* When did we start a new slice */
|
|
|
unsigned long slice_start[2];
|
|
|
unsigned long slice_end[2];
|
|
@@ -159,6 +166,9 @@ struct throtl_data
|
|
|
struct work_struct dispatch_work;
|
|
|
unsigned int limit_index;
|
|
|
bool limit_valid[LIMIT_CNT];
|
|
|
+
|
|
|
+ unsigned long low_upgrade_time;
|
|
|
+ unsigned long low_downgrade_time;
|
|
|
};
|
|
|
|
|
|
static void throtl_pending_timer_fn(unsigned long arg);
|
|
@@ -898,6 +908,8 @@ static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio)
|
|
|
/* Charge the bio to the group */
|
|
|
tg->bytes_disp[rw] += bio->bi_iter.bi_size;
|
|
|
tg->io_disp[rw]++;
|
|
|
+ tg->last_bytes_disp[rw] += bio->bi_iter.bi_size;
|
|
|
+ tg->last_io_disp[rw]++;
|
|
|
|
|
|
/*
|
|
|
* BIO_THROTTLED is used to prevent the same bio to be throttled
|
|
@@ -1527,6 +1539,45 @@ static struct blkcg_policy blkcg_policy_throtl = {
|
|
|
.pd_free_fn = throtl_pd_free,
|
|
|
};
|
|
|
|
|
|
+/*
+ * Return the earlier of @tg's READ and WRITE last_low_overflow_time stamps.
+ *
+ * A direction that has neither a bps nor an iops LIMIT_LOW value set does
+ * not use its stored stamp; the current jiffies value is used for it instead.
+ */
+static unsigned long __tg_last_low_overflow_time(struct throtl_grp *tg)
+{
+	unsigned long now = jiffies;
+	unsigned long rtime = now, wtime = now;
+
+	if (tg->bps[READ][LIMIT_LOW] || tg->iops[READ][LIMIT_LOW])
+		rtime = tg->last_low_overflow_time[READ];
+	if (tg->bps[WRITE][LIMIT_LOW] || tg->iops[WRITE][LIMIT_LOW])
+		wtime = tg->last_low_overflow_time[WRITE];
+
+	/*
+	 * These are jiffies values and may wrap, so order them with
+	 * time_before() instead of a plain numeric min().
+	 */
+	return time_before(rtime, wtime) ? rtime : wtime;
+}
|
|
|
+
|
|
|
+/*
+ * Return the latest low-limit overflow time found on the path from @tg up
+ * through its ancestors (following service_queue.parent_sq until sq_to_tg()
+ * yields NULL).
+ *
+ * tg should not be an intermediate node.
+ */
+static unsigned long tg_last_low_overflow_time(struct throtl_grp *tg)
+{
+	struct throtl_grp *parent;
+	unsigned long ret = __tg_last_low_overflow_time(tg);
+	unsigned long ptime;
+
+	for (parent = sq_to_tg(tg->service_queue.parent_sq); parent;
+	     parent = sq_to_tg(parent->service_queue.parent_sq)) {
+		/*
+		 * A parent with no low limit configured always counts as
+		 * having reached its low limit, so its overflow time tells
+		 * the children nothing; skip it.
+		 */
+		if (!parent->bps[READ][LIMIT_LOW] &&
+		    !parent->iops[READ][LIMIT_LOW] &&
+		    !parent->bps[WRITE][LIMIT_LOW] &&
+		    !parent->iops[WRITE][LIMIT_LOW])
+			continue;
+
+		/* Take one snapshot so compare and assign use the same value */
+		ptime = __tg_last_low_overflow_time(parent);
+		if (time_after(ptime, ret))
+			ret = ptime;
+	}
+	return ret;
+}
|
|
|
+
|
|
|
static bool throtl_tg_can_upgrade(struct throtl_grp *tg)
|
|
|
{
|
|
|
struct throtl_service_queue *sq = &tg->service_queue;
|
|
@@ -1570,6 +1621,9 @@ static bool throtl_can_upgrade(struct throtl_data *td,
|
|
|
if (td->limit_index != LIMIT_LOW)
|
|
|
return false;
|
|
|
|
|
|
+ if (time_before(jiffies, td->low_downgrade_time + throtl_slice))
|
|
|
+ return false;
|
|
|
+
|
|
|
rcu_read_lock();
|
|
|
blkg_for_each_descendant_post(blkg, pos_css, td->queue->root_blkg) {
|
|
|
struct throtl_grp *tg = blkg_to_tg(blkg);
|
|
@@ -1593,6 +1647,7 @@ static void throtl_upgrade_state(struct throtl_data *td)
|
|
|
struct blkcg_gq *blkg;
|
|
|
|
|
|
td->limit_index = LIMIT_MAX;
|
|
|
+ td->low_upgrade_time = jiffies;
|
|
|
rcu_read_lock();
|
|
|
blkg_for_each_descendant_post(blkg, pos_css, td->queue->root_blkg) {
|
|
|
struct throtl_grp *tg = blkg_to_tg(blkg);
|
|
@@ -1608,6 +1663,99 @@ static void throtl_upgrade_state(struct throtl_data *td)
|
|
|
queue_work(kthrotld_workqueue, &td->dispatch_work);
|
|
|
}
|
|
|
|
|
|
+/*
+ * Move @td to limit index @new and remember the current jiffies as the
+ * moment of the downgrade (td->low_downgrade_time).
+ */
+static void throtl_downgrade_state(struct throtl_data *td, int new)
+{
+	td->low_downgrade_time = jiffies;
+	td->limit_index = new;
+}
|
|
|
+
|
|
|
+/*
+ * Tell whether @tg allows a downgrade: at least one throtl_slice must have
+ * passed both since the last upgrade of its throtl_data and since the time
+ * reported by tg_last_low_overflow_time(). If the cgroup is below its low
+ * limit, a downgrade is considered so that other cgroups get throttled.
+ */
+static bool throtl_tg_can_downgrade(struct throtl_grp *tg)
+{
+	unsigned long now = jiffies;
+
+	/* Too soon after the last upgrade: do not look any further */
+	if (time_before(now, tg->td->low_upgrade_time + throtl_slice))
+		return false;
+
+	return time_after_eq(now,
+			     tg_last_low_overflow_time(tg) + throtl_slice);
+}
|
|
|
+
|
|
|
+/*
+ * Check @tg and then each ancestor in turn with throtl_tg_can_downgrade().
+ * The walk ends when there is no parent group or when the parent's blkg has
+ * no ->parent of its own; any refusal along the way yields false.
+ */
+static bool throtl_hierarchy_can_downgrade(struct throtl_grp *tg)
+{
+	do {
+		if (!throtl_tg_can_downgrade(tg))
+			return false;
+		tg = sq_to_tg(tg->service_queue.parent_sq);
+	} while (tg && tg_to_blkg(tg)->parent);
+
+	return true;
+}
|
|
|
+
|
|
|
+/*
+ * Stamp tg->last_low_overflow_time[rw] with @now if the bytes or ios
+ * accumulated for direction @rw over @elapsed_time jiffies amount to a
+ * per-second rate at or above the configured LIMIT_LOW bps/iops value.
+ */
+static void throtl_tg_stamp_low_overflow(struct throtl_grp *tg, int rw,
+					 unsigned long elapsed_time,
+					 unsigned long now)
+{
+	uint64_t rate;
+
+	/*
+	 * NOTE(review): do_div() takes a 32-bit divisor; confirm that
+	 * elapsed_time cannot exceed that range on 64-bit builds.
+	 */
+	if (tg->bps[rw][LIMIT_LOW]) {
+		rate = tg->last_bytes_disp[rw] * HZ;
+		do_div(rate, elapsed_time);
+		if (rate >= tg->bps[rw][LIMIT_LOW])
+			tg->last_low_overflow_time[rw] = now;
+	}
+
+	if (tg->iops[rw][LIMIT_LOW]) {
+		/*
+		 * last_io_disp[] is an unsigned int; widen before scaling
+		 * by HZ so a large accumulated count cannot wrap.
+		 */
+		rate = (uint64_t)tg->last_io_disp[rw] * HZ;
+		do_div(rate, elapsed_time);
+		if (rate >= tg->iops[rw][LIMIT_LOW])
+			tg->last_low_overflow_time[rw] = now;
+	}
+}
+
+/*
+ * Decide whether the queue should drop from LIMIT_MAX back to LIMIT_LOW.
+ *
+ * Does nothing unless the throtl_data is at LIMIT_MAX with a valid
+ * LIMIT_LOW, @tg's blkcg has no children, and at least one throtl_slice
+ * has passed since tg->last_check_time. Otherwise it refreshes the
+ * per-direction overflow stamps from the counters gathered since the last
+ * check, downgrades if the whole hierarchy agrees, and clears the counters.
+ */
+static void throtl_downgrade_check(struct throtl_grp *tg)
+{
+	unsigned long elapsed_time;
+	unsigned long now = jiffies;
+
+	if (tg->td->limit_index != LIMIT_MAX ||
+	    !tg->td->limit_valid[LIMIT_LOW])
+		return;
+	if (!list_empty(&tg_to_blkg(tg)->blkcg->css.children))
+		return;
+	if (time_after(tg->last_check_time + throtl_slice, now))
+		return;
+
+	elapsed_time = now - tg->last_check_time;
+	tg->last_check_time = now;
+
+	/* An overflow was recorded less than a slice ago: nothing to do yet */
+	if (time_before(now, tg_last_low_overflow_time(tg) + throtl_slice))
+		return;
+
+	throtl_tg_stamp_low_overflow(tg, READ, elapsed_time, now);
+	throtl_tg_stamp_low_overflow(tg, WRITE, elapsed_time, now);
+
+	/*
+	 * If cgroup is below low limit, consider downgrade and throttle other
+	 * cgroups
+	 */
+	if (throtl_hierarchy_can_downgrade(tg))
+		throtl_downgrade_state(tg->td, LIMIT_LOW);
+
+	/* Start a fresh measurement window */
+	tg->last_bytes_disp[READ] = 0;
+	tg->last_bytes_disp[WRITE] = 0;
+	tg->last_io_disp[READ] = 0;
+	tg->last_io_disp[WRITE] = 0;
+}
|
|
|
+
|
|
|
bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg,
|
|
|
struct bio *bio)
|
|
|
{
|
|
@@ -1632,12 +1780,16 @@ bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg,
|
|
|
|
|
|
again:
|
|
|
while (true) {
|
|
|
+ if (tg->last_low_overflow_time[rw] == 0)
|
|
|
+ tg->last_low_overflow_time[rw] = jiffies;
|
|
|
+ throtl_downgrade_check(tg);
|
|
|
/* throtl is FIFO - if bios are already queued, should queue */
|
|
|
if (sq->nr_queued[rw])
|
|
|
break;
|
|
|
|
|
|
/* if above limits, break to queue */
|
|
|
if (!tg_may_dispatch(tg, bio, NULL)) {
|
|
|
+ tg->last_low_overflow_time[rw] = jiffies;
|
|
|
if (throtl_can_upgrade(tg->td, tg)) {
|
|
|
throtl_upgrade_state(tg->td);
|
|
|
goto again;
|
|
@@ -1681,6 +1833,8 @@ again:
|
|
|
tg->io_disp[rw], tg_iops_limit(tg, rw),
|
|
|
sq->nr_queued[READ], sq->nr_queued[WRITE]);
|
|
|
|
|
|
+ tg->last_low_overflow_time[rw] = jiffies;
|
|
|
+
|
|
|
bio_associate_current(bio);
|
|
|
tg->td->nr_queued[rw]++;
|
|
|
throtl_add_bio_tg(bio, qn, tg);
|
|
@@ -1791,6 +1945,8 @@ int blk_throtl_init(struct request_queue *q)
|
|
|
|
|
|
td->limit_valid[LIMIT_MAX] = true;
|
|
|
td->limit_index = LIMIT_MAX;
|
|
|
+ td->low_upgrade_time = jiffies;
|
|
|
+ td->low_downgrade_time = jiffies;
|
|
|
/* activate policy */
|
|
|
ret = blkcg_activate_policy(q, &blkcg_policy_throtl);
|
|
|
if (ret)
|