@@ -4270,7 +4270,7 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
 	}
 	spin_unlock(&memcg->event_list_lock);
 
-	memcg->low = 0;
+	page_counter_set_low(&memcg->memory, 0);
 
 	memcg_offline_kmem(memcg);
 	wb_memcg_offline(memcg);
@@ -4319,12 +4319,12 @@ static void mem_cgroup_css_reset(struct cgroup_subsys_state *css)
 {
 	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
 
-	memcg->low = 0;
 	page_counter_set_max(&memcg->memory, PAGE_COUNTER_MAX);
 	page_counter_set_max(&memcg->swap, PAGE_COUNTER_MAX);
 	page_counter_set_max(&memcg->memsw, PAGE_COUNTER_MAX);
 	page_counter_set_max(&memcg->kmem, PAGE_COUNTER_MAX);
 	page_counter_set_max(&memcg->tcpmem, PAGE_COUNTER_MAX);
+	page_counter_set_low(&memcg->memory, 0);
 	memcg->high = PAGE_COUNTER_MAX;
 	memcg->soft_limit = PAGE_COUNTER_MAX;
 	memcg_wb_domain_size_changed(memcg);
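
Both the offline and reset paths now clear the protection through page_counter_set_low() instead of writing memcg->low directly, so the page counter can re-propagate the protected usage up the hierarchy. That helper lives in mm/page_counter.c and is introduced by a companion patch in this series; the sketch below is only an approximation of its expected shape (the field names low, low_usage and children_low_usage are the ones the callers here rely on), following the low_usage definition given in the mem_cgroup_low() comment further down. It is not the verbatim implementation.

/*
 * Sketch only, not the verbatim mm/page_counter.c code.
 *
 * low_usage is the part of the current usage covered by memory.low
 * (the usage if it is below the low boundary, 0 otherwise); the parent
 * accumulates it over all children in children_low_usage.
 */
static void propagate_low_usage(struct page_counter *c, unsigned long usage)
{
	unsigned long low_usage, old;
	long delta;

	if (!c->parent)
		return;

	if (!c->low && !atomic_long_read(&c->low_usage))
		return;

	if (usage <= c->low)
		low_usage = usage;
	else
		low_usage = 0;

	old = atomic_long_xchg(&c->low_usage, low_usage);
	delta = low_usage - old;
	if (delta)
		atomic_long_add(delta, &c->parent->children_low_usage);
}

void page_counter_set_low(struct page_counter *counter, unsigned long nr_pages)
{
	struct page_counter *c;

	counter->low = nr_pages;

	/* A new boundary changes the protected usage at every level. */
	for (c = counter; c; c = c->parent)
		propagate_low_usage(c, atomic_long_read(&c->usage));
}
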
@@ -5064,7 +5064,7 @@ static u64 memory_current_read(struct cgroup_subsys_state *css,
 static int memory_low_show(struct seq_file *m, void *v)
 {
 	struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
-	unsigned long low = READ_ONCE(memcg->low);
+	unsigned long low = READ_ONCE(memcg->memory.low);
 
 	if (low == PAGE_COUNTER_MAX)
 		seq_puts(m, "max\n");
@@ -5086,7 +5086,7 @@ static ssize_t memory_low_write(struct kernfs_open_file *of,
 	if (err)
 		return err;
 
-	memcg->low = low;
+	page_counter_set_low(&memcg->memory, low);
 
 	return nbytes;
 }
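
For reference, memory_low_write() is reached from userspace by writing a byte count (or the string "max") to a cgroup's memory.low file. A minimal illustration, with a hypothetical cgroup path:

/* Illustration only: set memory.low for a hypothetical cgroup. */
#include <stdio.h>
#include <stdlib.h>

int main(void)
{
	const char *path = "/sys/fs/cgroup/example/memory.low";	/* hypothetical path */
	FILE *f = fopen(path, "w");

	if (!f) {
		perror("fopen");
		return EXIT_FAILURE;
	}
	fprintf(f, "%llu\n", 512ULL << 20);	/* protect up to 512 MiB */
	fclose(f);
	return 0;
}
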
@@ -5348,36 +5348,72 @@ struct cgroup_subsys memory_cgrp_subsys = {
  * @root: the top ancestor of the sub-tree being checked
  * @memcg: the memory cgroup to check
  *
- * Returns %true if memory consumption of @memcg, and that of all
- * ancestors up to (but not including) @root, is below the normal range.
+ * WARNING: This function is not stateless! It can only be used as part
+ *          of a top-down tree iteration, not for isolated queries.
  *
- * @root is exclusive; it is never low when looked at directly and isn't
- * checked when traversing the hierarchy.
+ * Returns %true if memory consumption of @memcg is below the normal range.
  *
- * Excluding @root enables using memory.low to prioritize memory usage
- * between cgroups within a subtree of the hierarchy that is limited by
- * memory.high or memory.max.
+ * @root is exclusive; it is never low when looked at directly
  *
- * For example, given cgroup A with children B and C:
+ * To provide a proper hierarchical behavior, effective memory.low value
+ * is used.
  *
- *    A
- *   / \
- *  B   C
+ * Effective memory.low is always equal or less than the original memory.low.
+ * If there is no memory.low overcommittment (which is always true for
+ * top-level memory cgroups), these two values are equal.
+ * Otherwise, it's a part of parent's effective memory.low,
+ * calculated as a cgroup's memory.low usage divided by sum of sibling's
+ * memory.low usages, where memory.low usage is the size of actually
+ * protected memory.
  *
- * and
+ *                                             low_usage
+ *      elow = min( memory.low, parent->elow * ------------------ ),
+ *                                             siblings_low_usage
  *
- *  1. A/memory.current > A/memory.high
- *  2. A/B/memory.current < A/B/memory.low
- *  3. A/C/memory.current >= A/C/memory.low
+ *             | memory.current, if memory.current < memory.low
+ * low_usage = |
+ *             | 0, otherwise.
  *
- * As 'A' is high, i.e. triggers reclaim from 'A', and 'B' is low, we
- * should reclaim from 'C' until 'A' is no longer high or until we can
- * no longer reclaim from 'C'. If 'A', i.e. @root, isn't excluded by
- * mem_cgroup_low when reclaming from 'A', then 'B' won't be considered
- * low and we will reclaim indiscriminately from both 'B' and 'C'.
+ *
+ * Such definition of the effective memory.low provides the expected
+ * hierarchical behavior: parent's memory.low value is limiting
+ * children, unprotected memory is reclaimed first and cgroups,
+ * which are not using their guarantee do not affect actual memory
+ * distribution.
+ *
+ * For example, if there are memcgs A, A/B, A/C, A/D and A/E:
+ *
+ *     A      A/memory.low = 2G, A/memory.current = 6G
+ *    //\\
+ *   BC  DE   B/memory.low = 3G  B/memory.current = 2G
+ *            C/memory.low = 1G  C/memory.current = 2G
+ *            D/memory.low = 0   D/memory.current = 2G
+ *            E/memory.low = 10G E/memory.current = 0
+ *
+ * and the memory pressure is applied, the following memory distribution
+ * is expected (approximately):
+ *
+ *     A/memory.current = 2G
+ *
+ *     B/memory.current = 1.3G
+ *     C/memory.current = 0.6G
+ *     D/memory.current = 0
+ *     E/memory.current = 0
+ *
+ * These calculations require constant tracking of the actual low usages
+ * (see propagate_low_usage()), as well as recursive calculation of
+ * effective memory.low values. But as we do call mem_cgroup_low()
+ * path for each memory cgroup top-down from the reclaim,
+ * it's possible to optimize this part, and save calculated elow
+ * for next usage. This part is intentionally racy, but it's ok,
+ * as memory.low is a best-effort mechanism.
  */
 bool mem_cgroup_low(struct mem_cgroup *root, struct mem_cgroup *memcg)
 {
+	unsigned long usage, low_usage, siblings_low_usage;
+	unsigned long elow, parent_elow;
+	struct mem_cgroup *parent;
+
 	if (mem_cgroup_disabled())
 		return false;
 
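
To make the expected distribution in the comment concrete: with the low_usage = min(usage, memory.low) form that mem_cgroup_low() applies below, siblings_low_usage is 2G + 1G + 0 + 0 = 3G, so B's elow is min(3G, 2G * 2/3) ≈ 1.33G and C's is min(1G, 2G * 1/3) ≈ 0.67G, which is where reclaim is expected to settle their usage. The standalone program below (not kernel code; the tree and numbers are hard-coded from the example) runs that calculation once:

/* Standalone illustration of the elow calculation; units are gigabytes. */
#include <stdio.h>

static double min_d(double a, double b)
{
	return a < b ? a : b;
}

int main(void)
{
	double parent_elow = 2.0;			/* A's elow == A/memory.low */
	double low[]   = { 3.0, 1.0, 0.0, 10.0 };	/* B, C, D, E */
	double usage[] = { 2.0, 2.0, 2.0,  0.0 };
	const char *name[] = { "B", "C", "D", "E" };
	double siblings_low_usage = 0.0;
	int i;

	for (i = 0; i < 4; i++)
		siblings_low_usage += min_d(usage[i], low[i]);	/* 2 + 1 + 0 + 0 = 3G */

	for (i = 0; i < 4; i++) {
		double elow = min_d(low[i], parent_elow);
		double low_usage = min_d(usage[i], low[i]);

		if (elow && low_usage && siblings_low_usage)
			elow = min_d(elow, parent_elow * low_usage /
					   siblings_low_usage);

		/* Prints ~1.33G for B, ~0.67G for C, 0 for D; E keeps a
		 * nominal elow but has no usage left to protect. */
		printf("%s: elow = %.2fG, below elow: %s\n",
		       name[i], elow, usage[i] < elow ? "yes" : "no");
	}
	return 0;
}
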
@@ -5386,12 +5422,30 @@ bool mem_cgroup_low(struct mem_cgroup *root, struct mem_cgroup *memcg)
 	if (memcg == root)
 		return false;
 
-	for (; memcg != root; memcg = parent_mem_cgroup(memcg)) {
-		if (page_counter_read(&memcg->memory) >= memcg->low)
-			return false;
-	}
+	elow = memcg->memory.low;
+	usage = page_counter_read(&memcg->memory);
+	parent = parent_mem_cgroup(memcg);
 
-	return true;
+	if (parent == root)
+		goto exit;
+
+	parent_elow = READ_ONCE(parent->memory.elow);
+	elow = min(elow, parent_elow);
+
+	if (!elow || !parent_elow)
+		goto exit;
+
+	low_usage = min(usage, memcg->memory.low);
+	siblings_low_usage = atomic_long_read(
+		&parent->memory.children_low_usage);
+
+	if (!low_usage || !siblings_low_usage)
+		goto exit;
+
+	elow = min(elow, parent_elow * low_usage / siblings_low_usage);
+exit:
+	memcg->memory.elow = elow;
+	return usage < elow;
 }
 
 /**
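
As the WARNING in the comment notes, the return value depends on parent->memory.elow having been computed earlier in the same walk. A caller is therefore expected to iterate the tree top-down with mem_cgroup_iter(), visiting each parent before its children, roughly like the reclaim loop in mm/vmscan.c. The fragment below is a paraphrase for illustration only, not part of this patch:

	/* Paraphrased caller sketch; memcg, root and reclaim are assumed
	 * to be set up as in the surrounding reclaim code. */
	memcg = mem_cgroup_iter(root, NULL, &reclaim);
	do {
		if (mem_cgroup_low(root, memcg)) {
			/* Below the effective protection: normally skip
			 * this group instead of reclaiming from it. */
			continue;
		}
		/* ... shrink this memcg's LRU lists ... */
	} while ((memcg = mem_cgroup_iter(root, memcg, &reclaim)));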