@@ -39,7 +39,6 @@
#include <linux/limits.h>
#include <linux/export.h>
#include <linux/mutex.h>
-#include <linux/rbtree.h>
#include <linux/slab.h>
#include <linux/swap.h>
#include <linux/swapops.h>
@@ -85,26 +84,12 @@ static int really_do_swap_account __initdata = 0;
#endif
-/*
- * Statistics for memory cgroup.
- */
-enum mem_cgroup_stat_index {
-	/*
-	 * For MEM_CONTAINER_TYPE_ALL, usage = pagecache + rss.
-	 */
-	MEM_CGROUP_STAT_CACHE,		/* # of pages charged as cache */
-	MEM_CGROUP_STAT_RSS,		/* # of pages charged as anon rss */
-	MEM_CGROUP_STAT_RSS_HUGE,	/* # of pages charged as anon huge */
-	MEM_CGROUP_STAT_FILE_MAPPED,	/* # of pages charged as file rss */
-	MEM_CGROUP_STAT_SWAP,		/* # of pages, swapped out */
-	MEM_CGROUP_STAT_NSTATS,
-};
-
static const char * const mem_cgroup_stat_names[] = {
	"cache",
	"rss",
	"rss_huge",
	"mapped_file",
+	"writeback",
	"swap",
};
@@ -175,10 +160,6 @@ struct mem_cgroup_per_zone {
	struct mem_cgroup_reclaim_iter reclaim_iter[DEF_PRIORITY + 1];
-	struct rb_node		tree_node;	/* RB tree node */
-	unsigned long long	usage_in_excess;/* Set to the value by which */
-						/* the soft limit is exceeded*/
-	bool			on_tree;
	struct mem_cgroup	*memcg;		/* Back pointer, we cannot */
						/* use container_of	   */
};
@@ -187,26 +168,6 @@ struct mem_cgroup_per_node {
	struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES];
};
-/*
- * Cgroups above their limits are maintained in a RB-Tree, independent of
- * their hierarchy representation
- */
-
-struct mem_cgroup_tree_per_zone {
-	struct rb_root rb_root;
-	spinlock_t lock;
-};
-
-struct mem_cgroup_tree_per_node {
-	struct mem_cgroup_tree_per_zone rb_tree_per_zone[MAX_NR_ZONES];
-};
-
-struct mem_cgroup_tree {
-	struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES];
-};
-
-static struct mem_cgroup_tree soft_limit_tree __read_mostly;
-
struct mem_cgroup_threshold {
	struct eventfd_ctx *eventfd;
	u64 threshold;
@@ -280,6 +241,7 @@ struct mem_cgroup {
	bool		oom_lock;
	atomic_t	under_oom;
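+	/*
+	 * Number of wakeups issued on this memcg's OOM waitqueue; OOM
+	 * waiters compare snapshots of it to detect wakeups they raced with.
+	 */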
+	atomic_t	oom_wakeups;
	int	swappiness;
	/* OOM-Killer disable */
@@ -304,7 +266,7 @@ struct mem_cgroup {
	 * Should we move charges of a task when a task is moved into this
	 * mem_cgroup ? And what type of charges should we move ?
	 */
-	unsigned long move_charge_at_immigrate;
+	unsigned long	move_charge_at_immigrate;
	/*
	 * set > 0 if pages under this cgroup are moving to other cgroup.
	 */
@@ -341,6 +303,22 @@ struct mem_cgroup {
	atomic_t	numainfo_events;
	atomic_t	numainfo_updating;
#endif
+	/*
+	 * Protects soft_contributed transitions.
+	 * See mem_cgroup_update_soft_limit
+	 */
+	spinlock_t soft_lock;
+
+	/*
+	 * If true then this group has increased parents' children_in_excess
+	 * when it got over the soft limit.
+	 * When a group falls below the soft limit, parents' children_in_excess
+	 * is decreased and soft_contributed changed to false.
+	 */
+	bool soft_contributed;
+
+	/* Number of children that are in soft limit excess */
+	atomic_t children_in_excess;
	struct mem_cgroup_per_node *nodeinfo[0];
	/* WARNING: nodeinfo must be the last member here */
@@ -444,7 +422,6 @@ static bool move_file(void)
 * limit reclaim to prevent infinite loops, if they ever occur.
 */
#define	MEM_CGROUP_MAX_RECLAIM_LOOPS		100
-#define	MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS	2
enum charge_type {
	MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
@@ -671,164 +648,6 @@ page_cgroup_zoneinfo(struct mem_cgroup *memcg, struct page *page)
	return mem_cgroup_zoneinfo(memcg, nid, zid);
}
-static struct mem_cgroup_tree_per_zone *
-soft_limit_tree_node_zone(int nid, int zid)
-{
-	return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
-}
-
-static struct mem_cgroup_tree_per_zone *
-soft_limit_tree_from_page(struct page *page)
-{
-	int nid = page_to_nid(page);
-	int zid = page_zonenum(page);
-
-	return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
-}
-
-static void
-__mem_cgroup_insert_exceeded(struct mem_cgroup *memcg,
-				struct mem_cgroup_per_zone *mz,
-				struct mem_cgroup_tree_per_zone *mctz,
-				unsigned long long new_usage_in_excess)
-{
-	struct rb_node **p = &mctz->rb_root.rb_node;
-	struct rb_node *parent = NULL;
-	struct mem_cgroup_per_zone *mz_node;
-
-	if (mz->on_tree)
-		return;
-
-	mz->usage_in_excess = new_usage_in_excess;
-	if (!mz->usage_in_excess)
-		return;
-	while (*p) {
-		parent = *p;
-		mz_node = rb_entry(parent, struct mem_cgroup_per_zone,
-						tree_node);
-		if (mz->usage_in_excess < mz_node->usage_in_excess)
-			p = &(*p)->rb_left;
-		/*
-		 * We can't avoid mem cgroups that are over their soft
-		 * limit by the same amount
-		 */
-		else if (mz->usage_in_excess >= mz_node->usage_in_excess)
-			p = &(*p)->rb_right;
-	}
-	rb_link_node(&mz->tree_node, parent, p);
-	rb_insert_color(&mz->tree_node, &mctz->rb_root);
-	mz->on_tree = true;
-}
-
-static void
-__mem_cgroup_remove_exceeded(struct mem_cgroup *memcg,
-				struct mem_cgroup_per_zone *mz,
-				struct mem_cgroup_tree_per_zone *mctz)
-{
-	if (!mz->on_tree)
-		return;
-	rb_erase(&mz->tree_node, &mctz->rb_root);
-	mz->on_tree = false;
-}
-
-static void
-mem_cgroup_remove_exceeded(struct mem_cgroup *memcg,
-				struct mem_cgroup_per_zone *mz,
-				struct mem_cgroup_tree_per_zone *mctz)
-{
-	spin_lock(&mctz->lock);
-	__mem_cgroup_remove_exceeded(memcg, mz, mctz);
-	spin_unlock(&mctz->lock);
-}
-
-
-static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page)
-{
-	unsigned long long excess;
-	struct mem_cgroup_per_zone *mz;
-	struct mem_cgroup_tree_per_zone *mctz;
-	int nid = page_to_nid(page);
-	int zid = page_zonenum(page);
-	mctz = soft_limit_tree_from_page(page);
-
-	/*
-	 * Necessary to update all ancestors when hierarchy is used.
-	 * because their event counter is not touched.
-	 */
-	for (; memcg; memcg = parent_mem_cgroup(memcg)) {
-		mz = mem_cgroup_zoneinfo(memcg, nid, zid);
-		excess = res_counter_soft_limit_excess(&memcg->res);
-		/*
-		 * We have to update the tree if mz is on RB-tree or
-		 * mem is over its softlimit.
-		 */
-		if (excess || mz->on_tree) {
-			spin_lock(&mctz->lock);
-			/* if on-tree, remove it */
-			if (mz->on_tree)
-				__mem_cgroup_remove_exceeded(memcg, mz, mctz);
-			/*
-			 * Insert again. mz->usage_in_excess will be updated.
-			 * If excess is 0, no tree ops.
-			 */
-			__mem_cgroup_insert_exceeded(memcg, mz, mctz, excess);
-			spin_unlock(&mctz->lock);
-		}
-	}
-}
-
-static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg)
-{
-	int node, zone;
-	struct mem_cgroup_per_zone *mz;
-	struct mem_cgroup_tree_per_zone *mctz;
-
-	for_each_node(node) {
-		for (zone = 0; zone < MAX_NR_ZONES; zone++) {
-			mz = mem_cgroup_zoneinfo(memcg, node, zone);
-			mctz = soft_limit_tree_node_zone(node, zone);
-			mem_cgroup_remove_exceeded(memcg, mz, mctz);
-		}
-	}
-}
-
-static struct mem_cgroup_per_zone *
-__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
-{
-	struct rb_node *rightmost = NULL;
-	struct mem_cgroup_per_zone *mz;
-
-retry:
-	mz = NULL;
-	rightmost = rb_last(&mctz->rb_root);
-	if (!rightmost)
-		goto done;		/* Nothing to reclaim from */
-
-	mz = rb_entry(rightmost, struct mem_cgroup_per_zone, tree_node);
-	/*
-	 * Remove the node now but someone else can add it back,
-	 * we will to add it back at the end of reclaim to its correct
-	 * position in the tree.
-	 */
-	__mem_cgroup_remove_exceeded(mz->memcg, mz, mctz);
-	if (!res_counter_soft_limit_excess(&mz->memcg->res) ||
-		!css_tryget(&mz->memcg->css))
-		goto retry;
-done:
-	return mz;
-}
-
-static struct mem_cgroup_per_zone *
-mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
-{
-	struct mem_cgroup_per_zone *mz;
-
-	spin_lock(&mctz->lock);
-	mz = __mem_cgroup_largest_soft_limit_node(mctz);
-	spin_unlock(&mctz->lock);
-	return mz;
-}
-
/*
 * Implementation Note: reading percpu statistics for memcg.
 *
@@ -1002,6 +821,48 @@ static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
	return false;
}
+/*
+ * Called from rate-limited memcg_check_events when enough
+ * MEM_CGROUP_TARGET_SOFTLIMIT events are accumulated and it makes sure
+ * that all the parents up the hierarchy will be notified that this group
+ * is in excess or that it is not in excess anymore. memcg->soft_contributed
+ * makes the transition a single action whenever the state flips from one to
+ * the other.
+ */
+static void mem_cgroup_update_soft_limit(struct mem_cgroup *memcg)
+{
+	unsigned long long excess = res_counter_soft_limit_excess(&memcg->res);
+	struct mem_cgroup *parent = memcg;
+	int delta = 0;
+
+	spin_lock(&memcg->soft_lock);
+	if (excess) {
+		if (!memcg->soft_contributed) {
+			delta = 1;
+			memcg->soft_contributed = true;
+		}
+	} else {
+		if (memcg->soft_contributed) {
+			delta = -1;
+			memcg->soft_contributed = false;
+		}
+	}
+
+	/*
+	 * Necessary to update all ancestors when hierarchy is used
+	 * because their event counter is not touched.
+	 * We track children even outside the hierarchy for the root
+	 * cgroup because tree walk starting at root should visit
+	 * all cgroups and we want to prevent from pointless tree
+	 * walk if no children are below the limit.
+	 */
+	while (delta && (parent = parent_mem_cgroup(parent)))
+		atomic_add(delta, &parent->children_in_excess);
+	if (memcg != root_mem_cgroup && !root_mem_cgroup->use_hierarchy)
+		atomic_add(delta, &root_mem_cgroup->children_in_excess);
+	spin_unlock(&memcg->soft_lock);
+}
+
/*
 * Check events in order.
 *
@@ -1025,7 +886,7 @@ static void memcg_check_events(struct mem_cgroup *memcg, struct page *page)
	mem_cgroup_threshold(memcg);
	if (unlikely(do_softlimit))
-		mem_cgroup_update_tree(memcg, page);
+		mem_cgroup_update_soft_limit(memcg);
#if MAX_NUMNODES > 1
	if (unlikely(do_numainfo))
		atomic_inc(&memcg->numainfo_events);
@@ -1068,6 +929,15 @@ struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
	return memcg;
}
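+/*
+ * Apply the iterator filter @cond to @memcg under @root. A NULL @cond
+ * means no filtering, i.e. every group is visited.
+ */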
+static enum mem_cgroup_filter_t
+mem_cgroup_filter(struct mem_cgroup *memcg, struct mem_cgroup *root,
+		mem_cgroup_iter_filter cond)
+{
+	if (!cond)
+		return VISIT;
+	return cond(memcg, root);
+}
+
/*
 * Returns a next (in a pre-order walk) alive memcg (with elevated css
 * ref. count) or NULL if the whole root's subtree has been visited.
@@ -1075,7 +945,7 @@ struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
 * helper function to be used by mem_cgroup_iter
 */
static struct mem_cgroup *__mem_cgroup_iter_next(struct mem_cgroup *root,
-		struct mem_cgroup *last_visited)
+		struct mem_cgroup *last_visited, mem_cgroup_iter_filter cond)
{
	struct cgroup_subsys_state *prev_css, *next_css;
@@ -1093,11 +963,31 @@ skip_node:
	if (next_css) {
		struct mem_cgroup *mem = mem_cgroup_from_css(next_css);
-		if (css_tryget(&mem->css))
-			return mem;
-		else {
+		switch (mem_cgroup_filter(mem, root, cond)) {
+		case SKIP:
			prev_css = next_css;
			goto skip_node;
+		case SKIP_TREE:
+			if (mem == root)
+				return NULL;
+			/*
+			 * css_rightmost_descendant is not an optimal way to
+			 * skip through a subtree (especially for imbalanced
+			 * trees leaning to right) but that's what we have right
+			 * now. More effective solution would be traversing
+			 * right-up for first non-NULL without calling
+			 * css_next_descendant_pre afterwards.
+			 */
+			prev_css = css_rightmost_descendant(next_css);
+			goto skip_node;
+		case VISIT:
+			if (css_tryget(&mem->css))
+				return mem;
+			else {
+				prev_css = next_css;
+				goto skip_node;
+			}
+			break;
		}
	}
@@ -1161,6 +1051,7 @@ static void mem_cgroup_iter_update(struct mem_cgroup_reclaim_iter *iter,
 * @root: hierarchy root
 * @prev: previously returned memcg, NULL on first invocation
 * @reclaim: cookie for shared reclaim walks, NULL for full walks
+ * @cond: filter for visited nodes, NULL for no filter
 *
 * Returns references to children of the hierarchy below @root, or
 * @root itself, or %NULL after a full round-trip.
@@ -1173,15 +1064,18 @@ static void mem_cgroup_iter_update(struct mem_cgroup_reclaim_iter *iter,
 * divide up the memcgs in the hierarchy among all concurrent
 * reclaimers operating on the same zone and priority.
 */
-struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
+struct mem_cgroup *mem_cgroup_iter_cond(struct mem_cgroup *root,
				   struct mem_cgroup *prev,
-				   struct mem_cgroup_reclaim_cookie *reclaim)
+				   struct mem_cgroup_reclaim_cookie *reclaim,
+				   mem_cgroup_iter_filter cond)
{
	struct mem_cgroup *memcg = NULL;
	struct mem_cgroup *last_visited = NULL;
-	if (mem_cgroup_disabled())
-		return NULL;
+	if (mem_cgroup_disabled()) {
+		/* first call must return non-NULL, second return NULL */
+		return (struct mem_cgroup *)(unsigned long)!prev;
+	}
	if (!root)
		root = root_mem_cgroup;
@@ -1192,7 +1086,9 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
	if (!root->use_hierarchy && root != root_mem_cgroup) {
		if (prev)
			goto out_css_put;
-		return root;
+		if (mem_cgroup_filter(root, root, cond) == VISIT)
+			return root;
+		return NULL;
	}
	rcu_read_lock();
@@ -1215,7 +1111,7 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
		last_visited = mem_cgroup_iter_load(iter, root, &seq);
	}
-		memcg = __mem_cgroup_iter_next(root, last_visited);
+		memcg = __mem_cgroup_iter_next(root, last_visited, cond);
	if (reclaim) {
		mem_cgroup_iter_update(iter, last_visited, memcg, seq);
@@ -1226,7 +1122,11 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
			reclaim->generation = iter->generation;
	}
-		if (prev && !memcg)
+		/*
+		 * We have finished the whole tree walk or no group has been
+		 * visited because filter told us to skip the root node.
+		 */
+		if (!memcg && (prev || (cond && !last_visited)))
			goto out_unlock;
	}
out_unlock:
@@ -1867,6 +1767,7 @@ static unsigned long mem_cgroup_reclaim(struct mem_cgroup *memcg,
	return total;
}
+#if MAX_NUMNODES > 1
/**
 * test_mem_cgroup_node_reclaimable
 * @memcg: the target memcg
@@ -1889,7 +1790,6 @@ static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *memcg,
	return false;
}
-#if MAX_NUMNODES > 1
/*
 * Always updating the nodemask is not very good - even if we have an empty
@@ -1957,115 +1857,64 @@ int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
	return node;
}
-/*
- * Check all nodes whether it contains reclaimable pages or not.
- * For quick scan, we make use of scan_nodes. This will allow us to skip
- * unused nodes. But scan_nodes is lazily updated and may not cotain
- * enough new information. We need to do double check.
- */
-static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap)
-{
-	int nid;
-
-	/*
-	 * quick check...making use of scan_node.
-	 * We can skip unused nodes.
-	 */
-	if (!nodes_empty(memcg->scan_nodes)) {
-		for (nid = first_node(memcg->scan_nodes);
-		     nid < MAX_NUMNODES;
-		     nid = next_node(nid, memcg->scan_nodes)) {
-
-			if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap))
-				return true;
-		}
-	}
-	/*
-	 * Check rest of nodes.
-	 */
-	for_each_node_state(nid, N_MEMORY) {
-		if (node_isset(nid, memcg->scan_nodes))
-			continue;
-		if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap))
-			return true;
-	}
-	return false;
-}
-
#else
int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
{
	return 0;
}
-static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap)
-{
-	return test_mem_cgroup_node_reclaimable(memcg, 0, noswap);
-}
#endif
-static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,
-				   struct zone *zone,
-				   gfp_t gfp_mask,
-				   unsigned long *total_scanned)
-{
-	struct mem_cgroup *victim = NULL;
-	int total = 0;
-	int loop = 0;
-	unsigned long excess;
-	unsigned long nr_scanned;
-	struct mem_cgroup_reclaim_cookie reclaim = {
-		.zone = zone,
-		.priority = 0,
-	};
-
-	excess = res_counter_soft_limit_excess(&root_memcg->res) >> PAGE_SHIFT;
-
-	while (1) {
-		victim = mem_cgroup_iter(root_memcg, victim, &reclaim);
-		if (!victim) {
-			loop++;
-			if (loop >= 2) {
-				/*
-				 * If we have not been able to reclaim
-				 * anything, it might because there are
-				 * no reclaimable pages under this hierarchy
-				 */
-				if (!total)
-					break;
-				/*
-				 * We want to do more targeted reclaim.
-				 * excess >> 2 is not to excessive so as to
-				 * reclaim too much, nor too less that we keep
-				 * coming back to reclaim from this cgroup
-				 */
-				if (total >= (excess >> 2) ||
-					(loop > MEM_CGROUP_MAX_RECLAIM_LOOPS))
-					break;
-			}
-			continue;
-		}
-		if (!mem_cgroup_reclaimable(victim, false))
-			continue;
-		total += mem_cgroup_shrink_node_zone(victim, gfp_mask, false,
-						     zone, &nr_scanned);
-		*total_scanned += nr_scanned;
-		if (!res_counter_soft_limit_excess(&root_memcg->res))
+/*
+ * A group is eligible for the soft limit reclaim under the given root
+ * hierarchy if
+ *	a) it is over its soft limit
+ *	b) any parent up the hierarchy is over its soft limit
+ *
+ * If the given group doesn't have any children over the limit then it
+ * doesn't make any sense to iterate its subtree.
+ */
+enum mem_cgroup_filter_t
+mem_cgroup_soft_reclaim_eligible(struct mem_cgroup *memcg,
+		struct mem_cgroup *root)
+{
+	struct mem_cgroup *parent;
+
+	if (!memcg)
+		memcg = root_mem_cgroup;
+	parent = memcg;
+
+	if (res_counter_soft_limit_excess(&memcg->res))
+		return VISIT;
+
+	/*
+	 * If any parent up to the root in the hierarchy is over its soft limit
+	 * then we have to obey and reclaim from this group as well.
+	 */
+	while ((parent = parent_mem_cgroup(parent))) {
+		if (res_counter_soft_limit_excess(&parent->res))
+			return VISIT;
+		if (parent == root)
			break;
	}
-		mem_cgroup_iter_break(root_memcg, victim);
-	return total;
+
+	if (!atomic_read(&memcg->children_in_excess))
+		return SKIP_TREE;
+	return SKIP;
}
+static DEFINE_SPINLOCK(memcg_oom_lock);
+
/*
 * Check OOM-Killer is already running under our hierarchy.
 * If someone is running, return false.
- * Has to be called with memcg_oom_lock
 */
-static bool mem_cgroup_oom_lock(struct mem_cgroup *memcg)
+static bool mem_cgroup_oom_trylock(struct mem_cgroup *memcg)
{
	struct mem_cgroup *iter, *failed = NULL;
+	spin_lock(&memcg_oom_lock);
+
	for_each_mem_cgroup_tree(iter, memcg) {
		if (iter->oom_lock) {
			/*
@@ -2079,33 +1928,33 @@ static bool mem_cgroup_oom_lock(struct mem_cgroup *memcg)
		iter->oom_lock = true;
	}
-	if (!failed)
-		return true;
-
-	/*
-	 * OK, we failed to lock the whole subtree so we have to clean up
-	 * what we set up to the failing subtree
-	 */
-	for_each_mem_cgroup_tree(iter, memcg) {
-		if (iter == failed) {
-			mem_cgroup_iter_break(memcg, iter);
-			break;
+	if (failed) {
+		/*
+		 * OK, we failed to lock the whole subtree so we have
+		 * to clean up what we set up to the failing subtree
+		 */
+		for_each_mem_cgroup_tree(iter, memcg) {
+			if (iter == failed) {
+				mem_cgroup_iter_break(memcg, iter);
+				break;
+			}
+			iter->oom_lock = false;
		}
-		iter->oom_lock = false;
	}
-	return false;
+
+	spin_unlock(&memcg_oom_lock);
+
+	return !failed;
}
-/*
- * Has to be called with memcg_oom_lock
- */
-static int mem_cgroup_oom_unlock(struct mem_cgroup *memcg)
+static void mem_cgroup_oom_unlock(struct mem_cgroup *memcg)
{
	struct mem_cgroup *iter;
+	spin_lock(&memcg_oom_lock);
	for_each_mem_cgroup_tree(iter, memcg)
		iter->oom_lock = false;
-	return 0;
+	spin_unlock(&memcg_oom_lock);
}
static void mem_cgroup_mark_under_oom(struct mem_cgroup *memcg)
@@ -2129,7 +1978,6 @@ static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg)
	atomic_add_unless(&iter->under_oom, -1, 0);
}
-static DEFINE_SPINLOCK(memcg_oom_lock);
static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);
struct oom_wait_info {
@@ -2159,6 +2007,7 @@ static int memcg_oom_wake_function(wait_queue_t *wait,
static void memcg_wakeup_oom(struct mem_cgroup *memcg)
{
+	atomic_inc(&memcg->oom_wakeups);
	/* for filtering, pass "memcg" as argument. */
	__wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg);
}
@@ -2170,56 +2019,136 @@ static void memcg_oom_recover(struct mem_cgroup *memcg)
}
/*
- * try to call OOM killer. returns false if we should exit memory-reclaim loop.
+ * try to call OOM killer
 */
-static bool mem_cgroup_handle_oom(struct mem_cgroup *memcg, gfp_t mask,
-				  int order)
+static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
{
-	struct oom_wait_info owait;
-	bool locked, need_to_kill;
+	bool locked;
+	int wakeups;
-	owait.memcg = memcg;
-	owait.wait.flags = 0;
-	owait.wait.func = memcg_oom_wake_function;
-	owait.wait.private = current;
-	INIT_LIST_HEAD(&owait.wait.task_list);
-	need_to_kill = true;
-	mem_cgroup_mark_under_oom(memcg);
+	if (!current->memcg_oom.may_oom)
+		return;
+
+	current->memcg_oom.in_memcg_oom = 1;
-	/* At first, try to OOM lock hierarchy under memcg.*/
-	spin_lock(&memcg_oom_lock);
-	locked = mem_cgroup_oom_lock(memcg);
	/*
-	 * Even if signal_pending(), we can't quit charge() loop without
-	 * accounting. So, UNINTERRUPTIBLE is appropriate. But SIGKILL
-	 * under OOM is always welcomed, use TASK_KILLABLE here.
+	 * As with any blocking lock, a contender needs to start
+	 * listening for wakeups before attempting the trylock,
+	 * otherwise it can miss the wakeup from the unlock and sleep
+	 * indefinitely. This is just open-coded because our locking
+	 * is so particular to memcg hierarchies.
	 */
-	prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
-	if (!locked || memcg->oom_kill_disable)
-		need_to_kill = false;
+	wakeups = atomic_read(&memcg->oom_wakeups);
+	mem_cgroup_mark_under_oom(memcg);
+
+	locked = mem_cgroup_oom_trylock(memcg);
+
	if (locked)
		mem_cgroup_oom_notify(memcg);
-	spin_unlock(&memcg_oom_lock);
-	if (need_to_kill) {
-		finish_wait(&memcg_oom_waitq, &owait.wait);
+	if (locked && !memcg->oom_kill_disable) {
+		mem_cgroup_unmark_under_oom(memcg);
		mem_cgroup_out_of_memory(memcg, mask, order);
+		mem_cgroup_oom_unlock(memcg);
+		/*
+		 * There is no guarantee that an OOM-lock contender
+		 * sees the wakeups triggered by the OOM kill
+		 * uncharges. Wake any sleepers explicitly.
+		 */
+		memcg_oom_recover(memcg);
	} else {
-		schedule();
-		finish_wait(&memcg_oom_waitq, &owait.wait);
+		/*
+		 * A system call can just return -ENOMEM, but if this
+		 * is a page fault and somebody else is handling the
+		 * OOM already, we need to sleep on the OOM waitqueue
+		 * for this memcg until the situation is resolved.
+		 * Which can take some time because it might be
+		 * handled by a userspace task.
+		 *
+		 * However, this is the charge context, which means
+		 * that we may sit on a large call stack and hold
+		 * various filesystem locks, the mmap_sem etc. and we
+		 * don't want the OOM handler to deadlock on them
+		 * while we sit here and wait. Store the current OOM
+		 * context in the task_struct, then return -ENOMEM.
+		 * At the end of the page fault handler, with the
+		 * stack unwound, pagefault_out_of_memory() will check
+		 * back with us by calling
+		 * mem_cgroup_oom_synchronize(), possibly putting the
+		 * task to sleep.
+		 */
+		current->memcg_oom.oom_locked = locked;
+		current->memcg_oom.wakeups = wakeups;
+		css_get(&memcg->css);
+		current->memcg_oom.wait_on_memcg = memcg;
	}
-	spin_lock(&memcg_oom_lock);
-	if (locked)
-		mem_cgroup_oom_unlock(memcg);
-	memcg_wakeup_oom(memcg);
-	spin_unlock(&memcg_oom_lock);
+}
-	mem_cgroup_unmark_under_oom(memcg);
+/**
+ * mem_cgroup_oom_synchronize - complete memcg OOM handling
+ *
+ * This has to be called at the end of a page fault if the memcg
+ * OOM handler was enabled and the fault is returning %VM_FAULT_OOM.
+ *
+ * Memcg supports userspace OOM handling, so failed allocations must
+ * sleep on a waitqueue until the userspace task resolves the
+ * situation. Sleeping directly in the charge context with all kinds
+ * of locks held is not a good idea, instead we remember an OOM state
+ * in the task and mem_cgroup_oom_synchronize() has to be called at
+ * the end of the page fault to put the task to sleep and clean up the
+ * OOM state.
+ *
+ * Returns %true if an ongoing memcg OOM situation was detected and
+ * finalized, %false otherwise.
+ */
+bool mem_cgroup_oom_synchronize(void)
+{
+	struct oom_wait_info owait;
+	struct mem_cgroup *memcg;
-	if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current))
+	/* OOM is global, do not handle */
+	if (!current->memcg_oom.in_memcg_oom)
		return false;
-	/* Give chance to dying process */
-	schedule_timeout_uninterruptible(1);
+
+	/*
+	 * We invoked the OOM killer but there is a chance that a kill
+	 * did not free up any charges. Everybody else might already
+	 * be sleeping, so restart the fault and keep the rampage
+	 * going until some charges are released.
+	 */
+	memcg = current->memcg_oom.wait_on_memcg;
+	if (!memcg)
+		goto out;
+
+	if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current))
+		goto out_memcg;
+
+	owait.memcg = memcg;
+	owait.wait.flags = 0;
+	owait.wait.func = memcg_oom_wake_function;
+	owait.wait.private = current;
+	INIT_LIST_HEAD(&owait.wait.task_list);
+
+	prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
+	/* Only sleep if we didn't miss any wakeups since OOM */
+	if (atomic_read(&memcg->oom_wakeups) == current->memcg_oom.wakeups)
+		schedule();
+	finish_wait(&memcg_oom_waitq, &owait.wait);
+out_memcg:
+	mem_cgroup_unmark_under_oom(memcg);
+	if (current->memcg_oom.oom_locked) {
+		mem_cgroup_oom_unlock(memcg);
+		/*
+		 * There is no guarantee that an OOM-lock contender
+		 * sees the wakeups triggered by the OOM kill
+		 * uncharges. Wake any sleepers explicitly.
+		 */
+		memcg_oom_recover(memcg);
+	}
+	css_put(&memcg->css);
+	current->memcg_oom.wait_on_memcg = NULL;
+out:
+	current->memcg_oom.in_memcg_oom = 0;
	return true;
}
@@ -2288,7 +2217,7 @@ void __mem_cgroup_end_update_page_stat(struct page *page, unsigned long *flags)
}
void mem_cgroup_update_page_stat(struct page *page,
-				 enum mem_cgroup_page_stat_item idx, int val)
+				 enum mem_cgroup_stat_index idx, int val)
{
	struct mem_cgroup *memcg;
	struct page_cgroup *pc = lookup_page_cgroup(page);
@@ -2297,18 +2226,11 @@ void mem_cgroup_update_page_stat(struct page *page,
	if (mem_cgroup_disabled())
		return;
+	VM_BUG_ON(!rcu_read_lock_held());
	memcg = pc->mem_cgroup;
	if (unlikely(!memcg || !PageCgroupUsed(pc)))
		return;
-	switch (idx) {
-	case MEMCG_NR_FILE_MAPPED:
-		idx = MEM_CGROUP_STAT_FILE_MAPPED;
-		break;
-	default:
-		BUG();
-	}
-
	this_cpu_add(memcg->stat->count[idx], val);
}
@@ -2450,7 +2372,7 @@ static void drain_all_stock(struct mem_cgroup *root_memcg, bool sync)
		flush_work(&stock->work);
	}
out:
-	put_online_cpus();
+	put_online_cpus();
}
/*
@@ -2532,12 +2454,11 @@ enum {
	CHARGE_RETRY,		/* need to retry but retry is not bad */
	CHARGE_NOMEM,		/* we can't do more. return -ENOMEM */
	CHARGE_WOULDBLOCK,	/* GFP_WAIT wasn't set and no enough res. */
-	CHARGE_OOM_DIE,		/* the current is killed because of OOM */
};
static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
				unsigned int nr_pages, unsigned int min_pages,
-				bool oom_check)
+				bool invoke_oom)
{
	unsigned long csize = nr_pages * PAGE_SIZE;
	struct mem_cgroup *mem_over_limit;
@@ -2594,14 +2515,10 @@ static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
	if (mem_cgroup_wait_acct_move(mem_over_limit))
		return CHARGE_RETRY;
-	/* If we don't need to call oom-killer at el, return immediately */
-	if (!oom_check)
-		return CHARGE_NOMEM;
-	/* check OOM */
-	if (!mem_cgroup_handle_oom(mem_over_limit, gfp_mask, get_order(csize)))
-		return CHARGE_OOM_DIE;
+	if (invoke_oom)
+		mem_cgroup_oom(mem_over_limit, gfp_mask, get_order(csize));
-	return CHARGE_RETRY;
+	return CHARGE_NOMEM;
}
/*
@@ -2704,7 +2621,7 @@ again:
	}
	do {
-		bool oom_check;
+		bool invoke_oom = oom && !nr_oom_retries;
		/* If killed, bypass charge */
		if (fatal_signal_pending(current)) {
@@ -2712,14 +2629,8 @@ again:
			goto bypass;
		}
-		oom_check = false;
-		if (oom && !nr_oom_retries) {
-			oom_check = true;
-			nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;
-		}
-
-		ret = mem_cgroup_do_charge(memcg, gfp_mask, batch, nr_pages,
-		    oom_check);
+		ret = mem_cgroup_do_charge(memcg, gfp_mask, batch,
+					   nr_pages, invoke_oom);
		switch (ret) {
		case CHARGE_OK:
			break;
@@ -2732,16 +2643,12 @@ again:
			css_put(&memcg->css);
			goto nomem;
		case CHARGE_NOMEM: /* OOM routine works */
-			if (!oom) {
+			if (!oom || invoke_oom) {
				css_put(&memcg->css);
				goto nomem;
			}
-			/* If oom, we never return -ENOMEM */
			nr_oom_retries--;
			break;
-		case CHARGE_OOM_DIE: /* Killed by OOM Killer */
-			css_put(&memcg->css);
-			goto bypass;
		}
	} while (ret != CHARGE_OK);
@@ -2882,7 +2789,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
	 * is accessed after testing USED bit. To make pc->mem_cgroup visible
	 * before USED bit, we need memory barrier here.
	 * See mem_cgroup_add_lru_list(), etc.
-	 */
+	 */
	smp_wmb();
	SetPageCgroupUsed(pc);
@@ -2905,9 +2812,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
	unlock_page_cgroup(pc);
	/*
-	 * "charge_statistics" updated event counter. Then, check it.
-	 * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree.
-	 * if they exceeds softlimit.
+	 * "charge_statistics" updated event counter.
	 */
	memcg_check_events(memcg, page);
}
@@ -3626,9 +3531,9 @@ __memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **_memcg, int order)
 * the page allocator. Therefore, the following sequence when backed by
 * the SLUB allocator:
 *
- *	memcg_stop_kmem_account();
- *	kmalloc(<large_number>)
- *	memcg_resume_kmem_account();
+ *	memcg_stop_kmem_account();
+ *	kmalloc(<large_number>)
+ *	memcg_resume_kmem_account();
 *
 * would effectively ignore the fact that we should skip accounting,
 * since it will drive us directly to this function without passing
@@ -3750,6 +3655,20 @@ void mem_cgroup_split_huge_fixup(struct page *head)
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
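+/*
+ * Move @nr_pages worth of the per-cpu page state counter @idx from
+ * @from to @to as part of moving a page's charge between groups.
+ */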
+static inline
+void mem_cgroup_move_account_page_stat(struct mem_cgroup *from,
+					struct mem_cgroup *to,
+					unsigned int nr_pages,
+					enum mem_cgroup_stat_index idx)
+{
+	/* Update stat data for mem_cgroup */
+	preempt_disable();
+	WARN_ON_ONCE(from->stat->count[idx] < nr_pages);
+	__this_cpu_add(from->stat->count[idx], -nr_pages);
+	__this_cpu_add(to->stat->count[idx], nr_pages);
+	preempt_enable();
+}
+
/**
 * mem_cgroup_move_account - move account of the page
 * @page: the page
@@ -3795,13 +3714,14 @@ static int mem_cgroup_move_account(struct page *page,
	move_lock_mem_cgroup(from, &flags);
-	if (!anon && page_mapped(page)) {
-		/* Update mapped_file data for mem_cgroup */
-		preempt_disable();
-		__this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
-		__this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
-		preempt_enable();
-	}
+	if (!anon && page_mapped(page))
+		mem_cgroup_move_account_page_stat(from, to, nr_pages,
+			MEM_CGROUP_STAT_FILE_MAPPED);
+
+	if (PageWriteback(page))
+		mem_cgroup_move_account_page_stat(from, to, nr_pages,
+			MEM_CGROUP_STAT_WRITEBACK);
+
	mem_cgroup_charge_statistics(from, page, anon, -nr_pages);
	/* caller should have done css_get */
@@ -4657,7 +4577,7 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
					MEM_CGROUP_RECLAIM_SHRINK);
	curusage = res_counter_read_u64(&memcg->res, RES_USAGE);
	/* Usage is reduced ? */
-	if (curusage >= oldusage)
+	if (curusage >= oldusage)
		retry_count--;
	else
		oldusage = curusage;
@@ -4678,7 +4598,7 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
	int enlarge = 0;
	/* see mem_cgroup_resize_res_limit */
-	retry_count = children * MEM_CGROUP_RECLAIM_RETRIES;
+	retry_count = children * MEM_CGROUP_RECLAIM_RETRIES;
	oldusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
	while (retry_count) {
		if (signal_pending(current)) {
@@ -4727,98 +4647,6 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
	return ret;
}
-unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
-					    gfp_t gfp_mask,
-					    unsigned long *total_scanned)
-{
-	unsigned long nr_reclaimed = 0;
-	struct mem_cgroup_per_zone *mz, *next_mz = NULL;
-	unsigned long reclaimed;
-	int loop = 0;
-	struct mem_cgroup_tree_per_zone *mctz;
-	unsigned long long excess;
-	unsigned long nr_scanned;
-
-	if (order > 0)
-		return 0;
-
-	mctz = soft_limit_tree_node_zone(zone_to_nid(zone), zone_idx(zone));
-	/*
-	 * This loop can run a while, specially if mem_cgroup's continuously
-	 * keep exceeding their soft limit and putting the system under
-	 * pressure
-	 */
-	do {
-		if (next_mz)
-			mz = next_mz;
-		else
-			mz = mem_cgroup_largest_soft_limit_node(mctz);
-		if (!mz)
-			break;
-
-		nr_scanned = 0;
-		reclaimed = mem_cgroup_soft_reclaim(mz->memcg, zone,
-						    gfp_mask, &nr_scanned);
-		nr_reclaimed += reclaimed;
-		*total_scanned += nr_scanned;
-		spin_lock(&mctz->lock);
-
-		/*
-		 * If we failed to reclaim anything from this memory cgroup
-		 * it is time to move on to the next cgroup
-		 */
-		next_mz = NULL;
-		if (!reclaimed) {
-			do {
-				/*
-				 * Loop until we find yet another one.
-				 *
-				 * By the time we get the soft_limit lock
-				 * again, someone might have aded the
-				 * group back on the RB tree. Iterate to
-				 * make sure we get a different mem.
-				 * mem_cgroup_largest_soft_limit_node returns
-				 * NULL if no other cgroup is present on
-				 * the tree
-				 */
-				next_mz =
-				__mem_cgroup_largest_soft_limit_node(mctz);
-				if (next_mz == mz)
-					css_put(&next_mz->memcg->css);
-				else /* next_mz == NULL or other memcg */
-					break;
-			} while (1);
-		}
-		__mem_cgroup_remove_exceeded(mz->memcg, mz, mctz);
-		excess = res_counter_soft_limit_excess(&mz->memcg->res);
-		/*
-		 * One school of thought says that we should not add
-		 * back the node to the tree if reclaim returns 0.
-		 * But our reclaim could return 0, simply because due
-		 * to priority we are exposing a smaller subset of
-		 * memory to reclaim from. Consider this as a longer
-		 * term TODO.
-		 */
-		/* If excess == 0, no tree ops */
-		__mem_cgroup_insert_exceeded(mz->memcg, mz, mctz, excess);
-		spin_unlock(&mctz->lock);
-		css_put(&mz->memcg->css);
-		loop++;
-		/*
-		 * Could not reclaim anything and there are no more
-		 * mem cgroups to try or we seem to be looping without
-		 * reclaiming anything.
-		 */
-		if (!nr_reclaimed &&
-			(next_mz == NULL ||
-			loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS))
-			break;
-	} while (!nr_reclaimed);
-	if (next_mz)
-		css_put(&next_mz->memcg->css);
-	return nr_reclaimed;
-}
-
/**
 * mem_cgroup_force_empty_list - clears LRU of a group
 * @memcg: group to clear
@@ -4990,18 +4818,12 @@ static int mem_cgroup_force_empty_write(struct cgroup_subsys_state *css,
					unsigned int event)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
-	int ret;
	if (mem_cgroup_is_root(memcg))
		return -EINVAL;
-	css_get(&memcg->css);
-	ret = mem_cgroup_force_empty(memcg);
-	css_put(&memcg->css);
-
-	return ret;
+	return mem_cgroup_force_empty(memcg);
}
-
static u64 mem_cgroup_hierarchy_read(struct cgroup_subsys_state *css,
				     struct cftype *cft)
{
@@ -5139,7 +4961,7 @@ static int memcg_update_kmem_limit(struct cgroup_subsys_state *css, u64 val)
	 */
	mutex_lock(&memcg_create_mutex);
	mutex_lock(&set_limit_mutex);
-	if (!memcg->kmem_account_flags && val != RESOURCE_MAX) {
+	if (!memcg->kmem_account_flags && val != RES_COUNTER_MAX) {
		if (cgroup_task_count(css->cgroup) || memcg_has_children(memcg)) {
			ret = -EBUSY;
			goto out;
@@ -5149,7 +4971,7 @@ static int memcg_update_kmem_limit(struct cgroup_subsys_state *css, u64 val)
		ret = memcg_update_cache_sizes(memcg);
		if (ret) {
-			res_counter_set_limit(&memcg->kmem, RESOURCE_MAX);
+			res_counter_set_limit(&memcg->kmem, RES_COUNTER_MAX);
			goto out;
		}
		static_key_slow_inc(&memcg_kmem_enabled_key);
@@ -6089,8 +5911,6 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
	for (zone = 0; zone < MAX_NR_ZONES; zone++) {
		mz = &pn->zoneinfo[zone];
		lruvec_init(&mz->lruvec);
-		mz->usage_in_excess = 0;
-		mz->on_tree = false;
		mz->memcg = memcg;
	}
	memcg->nodeinfo[node] = pn;
@@ -6146,7 +5966,6 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)
	int node;
	size_t size = memcg_size();
-	mem_cgroup_remove_from_trees(memcg);
	free_css_id(&mem_cgroup_subsys, &memcg->css);
	for_each_node(node)
@@ -6183,29 +6002,6 @@ struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg)
}
EXPORT_SYMBOL(parent_mem_cgroup);
-static void __init mem_cgroup_soft_limit_tree_init(void)
-{
-	struct mem_cgroup_tree_per_node *rtpn;
-	struct mem_cgroup_tree_per_zone *rtpz;
-	int tmp, node, zone;
-
-	for_each_node(node) {
-		tmp = node;
-		if (!node_state(node, N_NORMAL_MEMORY))
-			tmp = -1;
-		rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, tmp);
-		BUG_ON(!rtpn);
-
-		soft_limit_tree.rb_tree_per_node[node] = rtpn;
-
-		for (zone = 0; zone < MAX_NR_ZONES; zone++) {
-			rtpz = &rtpn->rb_tree_per_zone[zone];
-			rtpz->rb_root = RB_ROOT;
-			spin_lock_init(&rtpz->lock);
-		}
-	}
-}
-
static struct cgroup_subsys_state * __ref
mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
{
@@ -6235,6 +6031,7 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
	mutex_init(&memcg->thresholds_lock);
	spin_lock_init(&memcg->move_lock);
	vmpressure_init(&memcg->vmpressure);
+	spin_lock_init(&memcg->soft_lock);
	return &memcg->css;
@@ -6312,6 +6109,13 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
	mem_cgroup_invalidate_reclaim_iterators(memcg);
	mem_cgroup_reparent_charges(memcg);
+	if (memcg->soft_contributed) {
+		while ((memcg = parent_mem_cgroup(memcg)))
+			atomic_dec(&memcg->children_in_excess);
+
+		if (memcg != root_mem_cgroup && !root_mem_cgroup->use_hierarchy)
+			atomic_dec(&root_mem_cgroup->children_in_excess);
+	}
	mem_cgroup_destroy_all_caches(memcg);
	vmpressure_cleanup(&memcg->vmpressure);
}
@@ -6986,7 +6790,6 @@ static int __init mem_cgroup_init(void)
{
	hotcpu_notifier(memcg_cpu_hotplug_callback, 0);
	enable_swap_cgroup();
-	mem_cgroup_soft_limit_tree_init();
	memcg_stock_init();
	return 0;
}
|