@@ -10,6 +10,10 @@
* Copyright (C) 2009 Nokia Corporation
* Author: Kirill A. Shutemov
*
+ * Kernel Memory Controller
+ * Copyright (C) 2012 Parallels Inc. and Google Inc.
+ * Authors: Glauber Costa and Suleiman Souhlal
+ *
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
@@ -267,6 +271,10 @@ struct mem_cgroup {
|
|
|
struct work_struct work_freeing;
|
|
|
};
|
|
|
|
|
|
+ /*
|
|
|
+ * the counter to account for kernel memory usage.
|
|
|
+ */
|
|
|
+ struct res_counter kmem;
|
|
|
/*
|
|
|
* Per cgroup active and inactive list, similar to the
|
|
|
* per zone LRU lists.
|
|
@@ -282,6 +290,7 @@ struct mem_cgroup {
|
|
|
* Should the accounting and control be hierarchical, per subtree?
|
|
|
*/
|
|
|
bool use_hierarchy;
|
|
|
+ unsigned long kmem_account_flags; /* See KMEM_ACCOUNTED_*, below */
|
|
|
|
|
|
bool oom_lock;
|
|
|
atomic_t under_oom;
|
|
@@ -332,8 +341,61 @@ struct mem_cgroup {
|
|
|
#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET)
|
|
|
struct tcp_memcontrol tcp_mem;
|
|
|
#endif
|
|
|
+#if defined(CONFIG_MEMCG_KMEM)
|
|
|
+ /* analogous to slab_common's slab_caches list. per-memcg */
|
|
|
+ struct list_head memcg_slab_caches;
|
|
|
+ /* Not a spinlock, we can take a lot of time walking the list */
|
|
|
+ struct mutex slab_caches_mutex;
|
|
|
+ /* Index in the kmem_cache->memcg_params->memcg_caches array */
|
|
|
+ int kmemcg_id;
|
|
|
+#endif
|
|
|
};
|
|
|
|
|
|
+/* internal only representation about the status of kmem accounting. */
|
|
|
+enum {
|
|
|
+ KMEM_ACCOUNTED_ACTIVE = 0, /* accounted by this cgroup itself */
|
|
|
+ KMEM_ACCOUNTED_ACTIVATED, /* static key enabled. */
|
|
|
+ KMEM_ACCOUNTED_DEAD, /* dead memcg with pending kmem charges */
|
|
|
+};
|
|
|
+
|
|
|
+/* We account when limit is on, but only after call sites are patched */
|
|
|
+#define KMEM_ACCOUNTED_MASK \
|
|
|
+ ((1 << KMEM_ACCOUNTED_ACTIVE) | (1 << KMEM_ACCOUNTED_ACTIVATED))
|
|
|
+
|
|
|
+#ifdef CONFIG_MEMCG_KMEM
|
|
|
+static inline void memcg_kmem_set_active(struct mem_cgroup *memcg)
|
|
|
+{
|
|
|
+ set_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags);
|
|
|
+}
|
|
|
+
|
|
|
+static bool memcg_kmem_is_active(struct mem_cgroup *memcg)
|
|
|
+{
|
|
|
+ return test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags);
|
|
|
+}
|
|
|
+
|
|
|
+static void memcg_kmem_set_activated(struct mem_cgroup *memcg)
|
|
|
+{
|
|
|
+ set_bit(KMEM_ACCOUNTED_ACTIVATED, &memcg->kmem_account_flags);
|
|
|
+}
|
|
|
+
|
|
|
+static void memcg_kmem_clear_activated(struct mem_cgroup *memcg)
|
|
|
+{
|
|
|
+ clear_bit(KMEM_ACCOUNTED_ACTIVATED, &memcg->kmem_account_flags);
|
|
|
+}
|
|
|
+
|
|
|
+static void memcg_kmem_mark_dead(struct mem_cgroup *memcg)
|
|
|
+{
|
|
|
+ if (test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags))
|
|
|
+ set_bit(KMEM_ACCOUNTED_DEAD, &memcg->kmem_account_flags);
|
|
|
+}
|
|
|
+
|
|
|
+static bool memcg_kmem_test_and_clear_dead(struct mem_cgroup *memcg)
|
|
|
+{
|
|
|
+ return test_and_clear_bit(KMEM_ACCOUNTED_DEAD,
|
|
|
+ &memcg->kmem_account_flags);
|
|
|
+}
|
|
|
+#endif
|
|
|
+
|
|
|
/* Stuffs for move charges at task migration. */
|
|
|
/*
|
|
|
* Types of charges to be moved. "move_charge_at_immitgrate" is treated as a
|
|
@@ -388,9 +450,13 @@ enum charge_type {
|
|
|
};
|
|
|
|
|
|
/* for encoding cft->private value on file */
|
|
|
-#define _MEM (0)
|
|
|
-#define _MEMSWAP (1)
|
|
|
-#define _OOM_TYPE (2)
|
|
|
+enum res_type {
|
|
|
+ _MEM,
|
|
|
+ _MEMSWAP,
|
|
|
+ _OOM_TYPE,
|
|
|
+ _KMEM,
|
|
|
+};
|
|
|
+
|
|
|
#define MEMFILE_PRIVATE(x, val) ((x) << 16 | (val))
|
|
|
#define MEMFILE_TYPE(val) ((val) >> 16 & 0xffff)
|
|
|
#define MEMFILE_ATTR(val) ((val) & 0xffff)
|
|
@@ -487,6 +553,75 @@ static void disarm_sock_keys(struct mem_cgroup *memcg)
|
|
|
}
|
|
|
#endif
|
|
|
|
|
|
+#ifdef CONFIG_MEMCG_KMEM
|
|
|
+/*
|
|
|
+ * This will be the memcg's index in each cache's ->memcg_params->memcg_caches.
|
|
|
+ * There are two main reasons for not using the css_id for this:
|
|
|
+ * 1) this works better in sparse environments, where we have a lot of memcgs,
+ * but only a few of them are kmem-limited. For instance, if we have 200
+ * memcgs and none but the 200th is kmem-limited, we would still need a
+ * 200-entry array for that.
+ *
+ * 2) In order not to violate the cgroup API, we would like to do all memory
+ * allocation in ->create(). At that point, we haven't yet allocated the
+ * css_id. Having a separate index prevents us from messing with the cgroup
+ * core for this.
|
|
|
+ *
|
|
|
+ * The current size of the caches array is stored in
|
|
|
+ * memcg_limited_groups_array_size. It will double each time we have to
|
|
|
+ * increase it.
|
|
|
+ */
|
|
|
+static DEFINE_IDA(kmem_limited_groups);
|
|
|
+int memcg_limited_groups_array_size;
|
|
|
+
|
|
|
+/*
|
|
|
+ * MIN_SIZE is larger than 1 because we would like to avoid going through
+ * the alloc/free process all the time. In a small machine, 4 kmem-limited
+ * cgroups is a reasonable guess. In the future, it could be made a parameter
+ * or a tunable, but that is not strictly necessary.
|
|
|
+ *
|
|
|
+ * MAX_SIZE should be as large as the number of css_ids. Ideally, we could get
|
|
|
+ * this constant directly from cgroup, but it is understandable that this is
|
|
|
+ * better kept as an internal representation in cgroup.c. In any case, the
|
|
|
+ * css_id space is not getting any smaller, and we don't have to necessarily
|
|
|
+ * increase ours as well if it increases.
|
|
|
+ */
|
|
|
+#define MEMCG_CACHES_MIN_SIZE 4
|
|
|
+#define MEMCG_CACHES_MAX_SIZE 65535
|
|
|
+
|
|
|
+/*
|
|
|
+ * A lot of the calls to the cache allocation functions are expected to be
|
|
|
+ * inlined by the compiler. Since the calls to memcg_kmem_get_cache are
|
|
|
+ * conditional to this static branch, we'll have to allow modules that do
+ * kmem_cache_alloc and the like to see this symbol as well.
|
|
|
+ */
|
|
|
+struct static_key memcg_kmem_enabled_key;
|
|
|
+EXPORT_SYMBOL(memcg_kmem_enabled_key);
|
|
|
+
|
|
|
+static void disarm_kmem_keys(struct mem_cgroup *memcg)
|
|
|
+{
|
|
|
+ if (memcg_kmem_is_active(memcg)) {
|
|
|
+ static_key_slow_dec(&memcg_kmem_enabled_key);
|
|
|
+ ida_simple_remove(&kmem_limited_groups, memcg->kmemcg_id);
|
|
|
+ }
|
|
|
+ /*
|
|
|
+ * This check can't live in the kmem destruction function,
+ * since the charges will outlive the cgroup.
|
|
|
+ */
|
|
|
+ WARN_ON(res_counter_read_u64(&memcg->kmem, RES_USAGE) != 0);
|
|
|
+}
|
|
|
+#else
|
|
|
+static void disarm_kmem_keys(struct mem_cgroup *memcg)
|
|
|
+{
|
|
|
+}
|
|
|
+#endif /* CONFIG_MEMCG_KMEM */
|
|
|
+
|
|
|
+static void disarm_static_keys(struct mem_cgroup *memcg)
|
|
|
+{
|
|
|
+ disarm_sock_keys(memcg);
|
|
|
+ disarm_kmem_keys(memcg);
|
|
|
+}
|
|
|
+
|
|
|
static void drain_all_stock_async(struct mem_cgroup *memcg);
|
|
|
|
|
|
static struct mem_cgroup_per_zone *
|
|
@@ -1453,6 +1588,10 @@ done:
|
|
|
res_counter_read_u64(&memcg->memsw, RES_USAGE) >> 10,
|
|
|
res_counter_read_u64(&memcg->memsw, RES_LIMIT) >> 10,
|
|
|
res_counter_read_u64(&memcg->memsw, RES_FAILCNT));
|
|
|
+ printk(KERN_INFO "kmem: usage %llukB, limit %llukB, failcnt %llu\n",
|
|
|
+ res_counter_read_u64(&memcg->kmem, RES_USAGE) >> 10,
|
|
|
+ res_counter_read_u64(&memcg->kmem, RES_LIMIT) >> 10,
|
|
|
+ res_counter_read_u64(&memcg->kmem, RES_FAILCNT));
|
|
|
}
|
|
|
|
|
|
/*
|
|
@@ -2060,20 +2199,28 @@ struct memcg_stock_pcp {
|
|
|
static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock);
|
|
|
static DEFINE_MUTEX(percpu_charge_mutex);
|
|
|
|
|
|
-/*
|
|
|
- * Try to consume stocked charge on this cpu. If success, one page is consumed
|
|
|
- * from local stock and true is returned. If the stock is 0 or charges from a
|
|
|
- * cgroup which is not current target, returns false. This stock will be
|
|
|
- * refilled.
|
|
|
+/**
|
|
|
+ * consume_stock: Try to consume stocked charge on this cpu.
|
|
|
+ * @memcg: memcg to consume from.
|
|
|
+ * @nr_pages: how many pages to charge.
|
|
|
+ *
|
|
|
+ * The charges will only happen if @memcg matches the current cpu's memcg
|
|
|
+ * stock, and at least @nr_pages are available in that stock. Failure to
|
|
|
+ * service an allocation will refill the stock.
|
|
|
+ *
|
|
|
+ * returns true if successful, false otherwise.
|
|
|
*/
|
|
|
-static bool consume_stock(struct mem_cgroup *memcg)
|
|
|
+static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
|
|
|
{
|
|
|
struct memcg_stock_pcp *stock;
|
|
|
bool ret = true;
|
|
|
|
|
|
+ if (nr_pages > CHARGE_BATCH)
|
|
|
+ return false;
|
|
|
+
|
|
|
stock = &get_cpu_var(memcg_stock);
|
|
|
- if (memcg == stock->cached && stock->nr_pages)
|
|
|
- stock->nr_pages--;
|
|
|
+ if (memcg == stock->cached && stock->nr_pages >= nr_pages)
|
|
|
+ stock->nr_pages -= nr_pages;
|
|
|
else /* need to call res_counter_charge */
|
|
|
ret = false;
|
|
|
put_cpu_var(memcg_stock);
|
|
@@ -2250,7 +2397,8 @@ enum {
|
|
|
};
|
|
|
|
|
|
static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
|
|
|
- unsigned int nr_pages, bool oom_check)
|
|
|
+ unsigned int nr_pages, unsigned int min_pages,
|
|
|
+ bool oom_check)
|
|
|
{
|
|
|
unsigned long csize = nr_pages * PAGE_SIZE;
|
|
|
struct mem_cgroup *mem_over_limit;
|
|
@@ -2273,18 +2421,18 @@ static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
|
|
|
} else
|
|
|
mem_over_limit = mem_cgroup_from_res_counter(fail_res, res);
|
|
|
/*
|
|
|
- * nr_pages can be either a huge page (HPAGE_PMD_NR), a batch
|
|
|
- * of regular pages (CHARGE_BATCH), or a single regular page (1).
|
|
|
- *
|
|
|
* Never reclaim on behalf of optional batching, retry with a
|
|
|
* single page instead.
|
|
|
*/
|
|
|
- if (nr_pages == CHARGE_BATCH)
|
|
|
+ if (nr_pages > min_pages)
|
|
|
return CHARGE_RETRY;
|
|
|
|
|
|
if (!(gfp_mask & __GFP_WAIT))
|
|
|
return CHARGE_WOULDBLOCK;
|
|
|
|
|
|
+ if (gfp_mask & __GFP_NORETRY)
|
|
|
+ return CHARGE_NOMEM;
|
|
|
+
|
|
|
ret = mem_cgroup_reclaim(mem_over_limit, gfp_mask, flags);
|
|
|
if (mem_cgroup_margin(mem_over_limit) >= nr_pages)
|
|
|
return CHARGE_RETRY;
|
|
@@ -2297,7 +2445,7 @@ static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
|
|
|
* unlikely to succeed so close to the limit, and we fall back
|
|
|
* to regular pages anyway in case of failure.
|
|
|
*/
|
|
|
- if (nr_pages == 1 && ret)
|
|
|
+ if (nr_pages <= (1 << PAGE_ALLOC_COSTLY_ORDER) && ret)
|
|
|
return CHARGE_RETRY;
|
|
|
|
|
|
/*
|
|
@@ -2371,7 +2519,7 @@ again:
|
|
|
memcg = *ptr;
|
|
|
if (mem_cgroup_is_root(memcg))
|
|
|
goto done;
|
|
|
- if (nr_pages == 1 && consume_stock(memcg))
|
|
|
+ if (consume_stock(memcg, nr_pages))
|
|
|
goto done;
|
|
|
css_get(&memcg->css);
|
|
|
} else {
|
|
@@ -2396,7 +2544,7 @@ again:
|
|
|
rcu_read_unlock();
|
|
|
goto done;
|
|
|
}
|
|
|
- if (nr_pages == 1 && consume_stock(memcg)) {
|
|
|
+ if (consume_stock(memcg, nr_pages)) {
|
|
|
/*
|
|
|
* It seems dagerous to access memcg without css_get().
|
|
|
* But considering how consume_stok works, it's not
|
|
@@ -2431,7 +2579,8 @@ again:
|
|
|
nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;
|
|
|
}
|
|
|
|
|
|
- ret = mem_cgroup_do_charge(memcg, gfp_mask, batch, oom_check);
|
|
|
+ ret = mem_cgroup_do_charge(memcg, gfp_mask, batch, nr_pages,
|
|
|
+ oom_check);
|
|
|
switch (ret) {
|
|
|
case CHARGE_OK:
|
|
|
break;
|
|
@@ -2624,183 +2773,943 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
|
|
|
memcg_check_events(memcg, page);
|
|
|
}
|
|
|
|
|
|
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
|
|
|
+static DEFINE_MUTEX(set_limit_mutex);
|
|
|
+
|
|
|
+#ifdef CONFIG_MEMCG_KMEM
|
|
|
+static inline bool memcg_can_account_kmem(struct mem_cgroup *memcg)
|
|
|
+{
|
|
|
+ return !mem_cgroup_disabled() && !mem_cgroup_is_root(memcg) &&
|
|
|
+ (memcg->kmem_account_flags & KMEM_ACCOUNTED_MASK);
|
|
|
+}
|
|
|
|
|
|
-#define PCGF_NOCOPY_AT_SPLIT (1 << PCG_LOCK | 1 << PCG_MIGRATION)
|
|
|
/*
|
|
|
- * Because tail pages are not marked as "used", set it. We're under
|
|
|
- * zone->lru_lock, 'splitting on pmd' and compound_lock.
|
|
|
- * charge/uncharge will be never happen and move_account() is done under
|
|
|
- * compound_lock(), so we don't have to take care of races.
|
|
|
+ * This is a bit cumbersome, but it is rarely used and avoids a backpointer
|
|
|
+ * in the memcg_cache_params struct.
|
|
|
*/
|
|
|
-void mem_cgroup_split_huge_fixup(struct page *head)
|
|
|
+static struct kmem_cache *memcg_params_to_cache(struct memcg_cache_params *p)
|
|
|
{
|
|
|
- struct page_cgroup *head_pc = lookup_page_cgroup(head);
|
|
|
- struct page_cgroup *pc;
|
|
|
- int i;
|
|
|
+ struct kmem_cache *cachep;
|
|
|
|
|
|
- if (mem_cgroup_disabled())
|
|
|
- return;
|
|
|
- for (i = 1; i < HPAGE_PMD_NR; i++) {
|
|
|
- pc = head_pc + i;
|
|
|
- pc->mem_cgroup = head_pc->mem_cgroup;
|
|
|
- smp_wmb();/* see __commit_charge() */
|
|
|
- pc->flags = head_pc->flags & ~PCGF_NOCOPY_AT_SPLIT;
|
|
|
- }
|
|
|
+ VM_BUG_ON(p->is_root_cache);
|
|
|
+ cachep = p->root_cache;
|
|
|
+ return cachep->memcg_params->memcg_caches[memcg_cache_id(p->memcg)];
|
|
|
}
|
|
|
-#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
|
|
|
|
|
|
-/**
|
|
|
- * mem_cgroup_move_account - move account of the page
|
|
|
- * @page: the page
|
|
|
- * @nr_pages: number of regular pages (>1 for huge pages)
|
|
|
- * @pc: page_cgroup of the page.
|
|
|
- * @from: mem_cgroup which the page is moved from.
|
|
|
- * @to: mem_cgroup which the page is moved to. @from != @to.
|
|
|
- *
|
|
|
- * The caller must confirm following.
|
|
|
- * - page is not on LRU (isolate_page() is useful.)
|
|
|
- * - compound_lock is held when nr_pages > 1
|
|
|
- *
|
|
|
- * This function doesn't do "charge" to new cgroup and doesn't do "uncharge"
|
|
|
- * from old cgroup.
|
|
|
- */
|
|
|
-static int mem_cgroup_move_account(struct page *page,
|
|
|
- unsigned int nr_pages,
|
|
|
- struct page_cgroup *pc,
|
|
|
- struct mem_cgroup *from,
|
|
|
- struct mem_cgroup *to)
|
|
|
+#ifdef CONFIG_SLABINFO
|
|
|
+static int mem_cgroup_slabinfo_read(struct cgroup *cont, struct cftype *cft,
|
|
|
+ struct seq_file *m)
|
|
|
{
|
|
|
- unsigned long flags;
|
|
|
- int ret;
|
|
|
- bool anon = PageAnon(page);
|
|
|
+ struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
|
|
|
+ struct memcg_cache_params *params;
|
|
|
|
|
|
- VM_BUG_ON(from == to);
|
|
|
- VM_BUG_ON(PageLRU(page));
|
|
|
- /*
|
|
|
- * The page is isolated from LRU. So, collapse function
|
|
|
- * will not handle this page. But page splitting can happen.
|
|
|
- * Do this check under compound_page_lock(). The caller should
|
|
|
- * hold it.
|
|
|
- */
|
|
|
- ret = -EBUSY;
|
|
|
- if (nr_pages > 1 && !PageTransHuge(page))
|
|
|
- goto out;
|
|
|
+ if (!memcg_can_account_kmem(memcg))
|
|
|
+ return -EIO;
|
|
|
|
|
|
- lock_page_cgroup(pc);
|
|
|
+ print_slabinfo_header(m);
|
|
|
|
|
|
- ret = -EINVAL;
|
|
|
- if (!PageCgroupUsed(pc) || pc->mem_cgroup != from)
|
|
|
- goto unlock;
|
|
|
+ mutex_lock(&memcg->slab_caches_mutex);
|
|
|
+ list_for_each_entry(params, &memcg->memcg_slab_caches, list)
|
|
|
+ cache_show(memcg_params_to_cache(params), m);
|
|
|
+ mutex_unlock(&memcg->slab_caches_mutex);
|
|
|
|
|
|
- move_lock_mem_cgroup(from, &flags);
|
|
|
+ return 0;
|
|
|
+}
|
|
|
+#endif
|
|
|
|
|
|
- if (!anon && page_mapped(page)) {
|
|
|
- /* Update mapped_file data for mem_cgroup */
|
|
|
- preempt_disable();
|
|
|
- __this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
|
|
|
- __this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
|
|
|
- preempt_enable();
|
|
|
- }
|
|
|
- mem_cgroup_charge_statistics(from, anon, -nr_pages);
|
|
|
+static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, u64 size)
|
|
|
+{
|
|
|
+ struct res_counter *fail_res;
|
|
|
+ struct mem_cgroup *_memcg;
|
|
|
+ int ret = 0;
|
|
|
+ bool may_oom;
|
|
|
+
|
|
|
+ ret = res_counter_charge(&memcg->kmem, size, &fail_res);
|
|
|
+ if (ret)
|
|
|
+ return ret;
|
|
|
|
|
|
- /* caller should have done css_get */
|
|
|
- pc->mem_cgroup = to;
|
|
|
- mem_cgroup_charge_statistics(to, anon, nr_pages);
|
|
|
- move_unlock_mem_cgroup(from, &flags);
|
|
|
- ret = 0;
|
|
|
-unlock:
|
|
|
- unlock_page_cgroup(pc);
|
|
|
/*
|
|
|
- * check events
|
|
|
+ * Conditions under which we can wait for the oom_killer. Those are
|
|
|
+ * the same conditions tested by the core page allocator
|
|
|
*/
|
|
|
- memcg_check_events(to, page);
|
|
|
- memcg_check_events(from, page);
|
|
|
-out:
|
|
|
+ may_oom = (gfp & __GFP_FS) && !(gfp & __GFP_NORETRY);
|
|
|
+
|
|
|
+ _memcg = memcg;
|
|
|
+ ret = __mem_cgroup_try_charge(NULL, gfp, size >> PAGE_SHIFT,
|
|
|
+ &_memcg, may_oom);
|
|
|
+
|
|
|
+ if (ret == -EINTR) {
|
|
|
+ /*
|
|
|
+ * __mem_cgroup_try_charge() chose to bypass to root due to
+ * OOM kill or fatal signal. Since our only options are to
+ * either fail the allocation or charge it to this cgroup, do
+ * it as a temporary condition. But we can't fail. From a
+ * kmem/slab perspective, the cache has already been selected
+ * by mem_cgroup_kmem_get_cache(), so it is too late to change
|
|
|
+ * our minds.
|
|
|
+ *
|
|
|
+ * This condition will only trigger if the task entered
|
|
|
+ * memcg_charge_kmem in a sane state, but was OOM-killed during
|
|
|
+ * __mem_cgroup_try_charge() above. Tasks that were already
|
|
|
+ * dying when the allocation triggers should have been already
|
|
|
+ * directed to the root cgroup in memcontrol.h
|
|
|
+ */
|
|
|
+ res_counter_charge_nofail(&memcg->res, size, &fail_res);
|
|
|
+ if (do_swap_account)
|
|
|
+ res_counter_charge_nofail(&memcg->memsw, size,
|
|
|
+ &fail_res);
|
|
|
+ ret = 0;
|
|
|
+ } else if (ret)
|
|
|
+ res_counter_uncharge(&memcg->kmem, size);
|
|
|
+
|
|
|
return ret;
|
|
|
}
|
|
|
|
|
|
-/**
|
|
|
- * mem_cgroup_move_parent - moves page to the parent group
|
|
|
- * @page: the page to move
|
|
|
- * @pc: page_cgroup of the page
|
|
|
- * @child: page's cgroup
|
|
|
- *
|
|
|
- * move charges to its parent or the root cgroup if the group has no
|
|
|
- * parent (aka use_hierarchy==0).
|
|
|
- * Although this might fail (get_page_unless_zero, isolate_lru_page or
|
|
|
- * mem_cgroup_move_account fails) the failure is always temporary and
|
|
|
- * it signals a race with a page removal/uncharge or migration. In the
|
|
|
- * first case the page is on the way out and it will vanish from the LRU
|
|
|
- * on the next attempt and the call should be retried later.
|
|
|
- * Isolation from the LRU fails only if page has been isolated from
|
|
|
- * the LRU since we looked at it and that usually means either global
|
|
|
- * reclaim or migration going on. The page will either get back to the
|
|
|
- * LRU or vanish.
|
|
|
- * Finaly mem_cgroup_move_account fails only if the page got uncharged
|
|
|
- * (!PageCgroupUsed) or moved to a different group. The page will
|
|
|
- * disappear in the next attempt.
|
|
|
- */
|
|
|
-static int mem_cgroup_move_parent(struct page *page,
|
|
|
- struct page_cgroup *pc,
|
|
|
- struct mem_cgroup *child)
|
|
|
+static void memcg_uncharge_kmem(struct mem_cgroup *memcg, u64 size)
|
|
|
{
|
|
|
- struct mem_cgroup *parent;
|
|
|
- unsigned int nr_pages;
|
|
|
- unsigned long uninitialized_var(flags);
|
|
|
- int ret;
|
|
|
+ res_counter_uncharge(&memcg->res, size);
|
|
|
+ if (do_swap_account)
|
|
|
+ res_counter_uncharge(&memcg->memsw, size);
|
|
|
|
|
|
- VM_BUG_ON(mem_cgroup_is_root(child));
|
|
|
+ /* Not down to 0 */
|
|
|
+ if (res_counter_uncharge(&memcg->kmem, size))
|
|
|
+ return;
|
|
|
|
|
|
- ret = -EBUSY;
|
|
|
- if (!get_page_unless_zero(page))
|
|
|
- goto out;
|
|
|
- if (isolate_lru_page(page))
|
|
|
- goto put;
|
|
|
+ if (memcg_kmem_test_and_clear_dead(memcg))
|
|
|
+ mem_cgroup_put(memcg);
|
|
|
+}
|
|
|
|
|
|
- nr_pages = hpage_nr_pages(page);
|
|
|
+void memcg_cache_list_add(struct mem_cgroup *memcg, struct kmem_cache *cachep)
|
|
|
+{
|
|
|
+ if (!memcg)
|
|
|
+ return;
|
|
|
|
|
|
- parent = parent_mem_cgroup(child);
|
|
|
+ mutex_lock(&memcg->slab_caches_mutex);
|
|
|
+ list_add(&cachep->memcg_params->list, &memcg->memcg_slab_caches);
|
|
|
+ mutex_unlock(&memcg->slab_caches_mutex);
|
|
|
+}
|
|
|
+
|
|
|
+/*
|
|
|
+ * helper for accessing a memcg's index. It will be used as an index in the
|
|
|
+ * child cache array in kmem_cache, and also to derive its name. This function
|
|
|
+ * will return -1 when this is not a kmem-limited memcg.
|
|
|
+ */
|
|
|
+int memcg_cache_id(struct mem_cgroup *memcg)
|
|
|
+{
|
|
|
+ return memcg ? memcg->kmemcg_id : -1;
|
|
|
+}
|
|
|
+
|
|
|
+/*
|
|
|
+ * This ends up being protected by the set_limit mutex, during normal
|
|
|
+ * operation, because that is its main call site.
|
|
|
+ *
|
|
|
+ * But when we create a new cache, we can call this as well if its parent
|
|
|
+ * is kmem-limited. That will have to hold set_limit_mutex as well.
|
|
|
+ */
|
|
|
+int memcg_update_cache_sizes(struct mem_cgroup *memcg)
|
|
|
+{
|
|
|
+ int num, ret;
|
|
|
+
|
|
|
+ num = ida_simple_get(&kmem_limited_groups,
|
|
|
+ 0, MEMCG_CACHES_MAX_SIZE, GFP_KERNEL);
|
|
|
+ if (num < 0)
|
|
|
+ return num;
|
|
|
/*
|
|
|
- * If no parent, move charges to root cgroup.
|
|
|
+ * After this point, kmem_accounted (that we test atomically at
+ * the beginning of this conditional) is no longer 0. This
|
|
|
+ * guarantees only one process will set the following boolean
|
|
|
+ * to true. We don't need test_and_set because we're protected
|
|
|
+ * by the set_limit_mutex anyway.
|
|
|
*/
|
|
|
- if (!parent)
|
|
|
- parent = root_mem_cgroup;
|
|
|
+ memcg_kmem_set_activated(memcg);
|
|
|
|
|
|
- if (nr_pages > 1) {
|
|
|
- VM_BUG_ON(!PageTransHuge(page));
|
|
|
- flags = compound_lock_irqsave(page);
|
|
|
+ ret = memcg_update_all_caches(num+1);
|
|
|
+ if (ret) {
|
|
|
+ ida_simple_remove(&kmem_limited_groups, num);
|
|
|
+ memcg_kmem_clear_activated(memcg);
|
|
|
+ return ret;
|
|
|
}
|
|
|
|
|
|
- ret = mem_cgroup_move_account(page, nr_pages,
|
|
|
- pc, child, parent);
|
|
|
- if (!ret)
|
|
|
- __mem_cgroup_cancel_local_charge(child, nr_pages);
|
|
|
+ memcg->kmemcg_id = num;
|
|
|
+ INIT_LIST_HEAD(&memcg->memcg_slab_caches);
|
|
|
+ mutex_init(&memcg->slab_caches_mutex);
|
|
|
+ return 0;
|
|
|
+}
|
|
|
|
|
|
- if (nr_pages > 1)
|
|
|
- compound_unlock_irqrestore(page, flags);
|
|
|
- putback_lru_page(page);
|
|
|
-put:
|
|
|
- put_page(page);
|
|
|
-out:
|
|
|
- return ret;
|
|
|
+static size_t memcg_caches_array_size(int num_groups)
|
|
|
+{
|
|
|
+ ssize_t size;
|
|
|
+ if (num_groups <= 0)
|
|
|
+ return 0;
|
|
|
+
|
|
|
+ size = 2 * num_groups;
|
|
|
+ if (size < MEMCG_CACHES_MIN_SIZE)
|
|
|
+ size = MEMCG_CACHES_MIN_SIZE;
|
|
|
+ else if (size > MEMCG_CACHES_MAX_SIZE)
|
|
|
+ size = MEMCG_CACHES_MAX_SIZE;
|
|
|
+
|
|
|
+ return size;
|
|
|
}
|
|
|
|
|
|
/*
|
|
|
- * Charge the memory controller for page usage.
|
|
|
- * Return
|
|
|
- * 0 if the charge was successful
|
|
|
- * < 0 if the cgroup is over its limit
|
|
|
+ * We should update the current array size iff all cache updates succeed. This
|
|
|
+ * can only be done from the slab side. The slab mutex needs to be held when
|
|
|
+ * calling this.
|
|
|
*/
|
|
|
-static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
|
|
|
- gfp_t gfp_mask, enum charge_type ctype)
|
|
|
+void memcg_update_array_size(int num)
|
|
|
{
|
|
|
- struct mem_cgroup *memcg = NULL;
|
|
|
- unsigned int nr_pages = 1;
|
|
|
- bool oom = true;
|
|
|
- int ret;
|
|
|
+ if (num > memcg_limited_groups_array_size)
|
|
|
+ memcg_limited_groups_array_size = memcg_caches_array_size(num);
|
|
|
+}
|
|
|
|
|
|
- if (PageTransHuge(page)) {
|
|
|
+int memcg_update_cache_size(struct kmem_cache *s, int num_groups)
|
|
|
+{
|
|
|
+ struct memcg_cache_params *cur_params = s->memcg_params;
|
|
|
+
|
|
|
+ VM_BUG_ON(s->memcg_params && !s->memcg_params->is_root_cache);
|
|
|
+
|
|
|
+ if (num_groups > memcg_limited_groups_array_size) {
|
|
|
+ int i;
|
|
|
+ ssize_t size = memcg_caches_array_size(num_groups);
|
|
|
+
|
|
|
+ size *= sizeof(void *);
|
|
|
+ size += sizeof(struct memcg_cache_params);
|
|
|
+
|
|
|
+ s->memcg_params = kzalloc(size, GFP_KERNEL);
|
|
|
+ if (!s->memcg_params) {
|
|
|
+ s->memcg_params = cur_params;
|
|
|
+ return -ENOMEM;
|
|
|
+ }
|
|
|
+
|
|
|
+ s->memcg_params->is_root_cache = true;
|
|
|
+
|
|
|
+ /*
|
|
|
+ * There is a chance it will be bigger than
+ * memcg_limited_groups_array_size, if we failed an allocation
+ * in a cache, in which case all caches updated before it will
+ * have a bigger array.
+ *
+ * But if that is the case, the data after
+ * memcg_limited_groups_array_size is certainly unused.
|
|
|
+ */
|
|
|
+ for (i = 0; i < memcg_limited_groups_array_size; i++) {
|
|
|
+ if (!cur_params->memcg_caches[i])
|
|
|
+ continue;
|
|
|
+ s->memcg_params->memcg_caches[i] =
|
|
|
+ cur_params->memcg_caches[i];
|
|
|
+ }
|
|
|
+
|
|
|
+ /*
|
|
|
+ * Ideally, we would wait until all caches succeed, and only
|
|
|
+ * then free the old one. But this is not worth the extra
|
|
|
+ * pointer per-cache we'd have to have for this.
|
|
|
+ *
|
|
|
+ * It is not a big deal if some caches are left with a size
|
|
|
+ * bigger than the others. And all updates will reset this
|
|
|
+ * anyway.
|
|
|
+ */
|
|
|
+ kfree(cur_params);
|
|
|
+ }
|
|
|
+ return 0;
|
|
|
+}
|
|
|
+
|
|
|
+int memcg_register_cache(struct mem_cgroup *memcg, struct kmem_cache *s,
|
|
|
+ struct kmem_cache *root_cache)
|
|
|
+{
|
|
|
+ size_t size = sizeof(struct memcg_cache_params);
|
|
|
+
|
|
|
+ if (!memcg_kmem_enabled())
|
|
|
+ return 0;
|
|
|
+
|
|
|
+ if (!memcg)
|
|
|
+ size += memcg_limited_groups_array_size * sizeof(void *);
|
|
|
+
|
|
|
+ s->memcg_params = kzalloc(size, GFP_KERNEL);
|
|
|
+ if (!s->memcg_params)
|
|
|
+ return -ENOMEM;
|
|
|
+
|
|
|
+ if (memcg) {
|
|
|
+ s->memcg_params->memcg = memcg;
|
|
|
+ s->memcg_params->root_cache = root_cache;
|
|
|
+ }
|
|
|
+ return 0;
|
|
|
+}
|
|
|
+
|
|
|
+void memcg_release_cache(struct kmem_cache *s)
|
|
|
+{
|
|
|
+ struct kmem_cache *root;
|
|
|
+ struct mem_cgroup *memcg;
|
|
|
+ int id;
|
|
|
+
|
|
|
+ /*
|
|
|
+ * This happens, for instance, when a root cache goes away before we
|
|
|
+ * add any memcg.
|
|
|
+ */
|
|
|
+ if (!s->memcg_params)
|
|
|
+ return;
|
|
|
+
|
|
|
+ if (s->memcg_params->is_root_cache)
|
|
|
+ goto out;
|
|
|
+
|
|
|
+ memcg = s->memcg_params->memcg;
|
|
|
+ id = memcg_cache_id(memcg);
|
|
|
+
|
|
|
+ root = s->memcg_params->root_cache;
|
|
|
+ root->memcg_params->memcg_caches[id] = NULL;
|
|
|
+ mem_cgroup_put(memcg);
|
|
|
+
|
|
|
+ mutex_lock(&memcg->slab_caches_mutex);
|
|
|
+ list_del(&s->memcg_params->list);
|
|
|
+ mutex_unlock(&memcg->slab_caches_mutex);
|
|
|
+
|
|
|
+out:
|
|
|
+ kfree(s->memcg_params);
|
|
|
+}
|
|
|
+
|
|
|
+/*
|
|
|
+ * During the creation of a new cache, we need to disable our accounting
+ * mechanism altogether. This is true even if we are not creating, but rather
+ * just enqueuing new caches to be created.
|
|
|
+ *
|
|
|
+ * This is because that process will trigger allocations; some visible, like
|
|
|
+ * explicit kmallocs to auxiliary data structures, name strings and internal
|
|
|
+ * cache structures; some well concealed, like INIT_WORK() that can allocate
|
|
|
+ * objects during debug.
|
|
|
+ *
|
|
|
+ * If any allocation happens during memcg_kmem_get_cache, we will recurse back
|
|
|
+ * to it. This may not be a bounded recursion: since the first cache creation
|
|
|
+ * failed to complete (waiting on the allocation), we'll just try to create the
|
|
|
+ * cache again, failing at the same point.
|
|
|
+ *
|
|
|
+ * memcg_kmem_get_cache is prepared to abort after seeing a positive count of
|
|
|
+ * memcg_kmem_skip_account. So we enclose anything that might allocate memory
|
|
|
+ * inside the following two functions.
|
|
|
+ */
|
|
|
+static inline void memcg_stop_kmem_account(void)
|
|
|
+{
|
|
|
+ VM_BUG_ON(!current->mm);
|
|
|
+ current->memcg_kmem_skip_account++;
|
|
|
+}
|
|
|
+
|
|
|
+static inline void memcg_resume_kmem_account(void)
|
|
|
+{
|
|
|
+ VM_BUG_ON(!current->mm);
|
|
|
+ current->memcg_kmem_skip_account--;
|
|
|
+}
|
|
|
+
|
|
|
+static void kmem_cache_destroy_work_func(struct work_struct *w)
|
|
|
+{
|
|
|
+ struct kmem_cache *cachep;
|
|
|
+ struct memcg_cache_params *p;
|
|
|
+
|
|
|
+ p = container_of(w, struct memcg_cache_params, destroy);
|
|
|
+
|
|
|
+ cachep = memcg_params_to_cache(p);
|
|
|
+
|
|
|
+ /*
|
|
|
+ * If we get down to 0 after shrink, we could delete right away.
|
|
|
+ * However, memcg_release_pages() already puts us back in the workqueue
|
|
|
+ * in that case. If we proceed deleting, we'll get a dangling
|
|
|
+ * reference, and removing the object from the workqueue in that case
|
|
|
+ * is unnecessary complication. We are not a fast path.
|
|
|
+ *
|
|
|
+ * Note that this case is fundamentally different from racing with
|
|
|
+ * shrink_slab(): if mem_cgroup_destroy_cache() is called in
+ * kmem_cache_shrink, not only would we be reinserting a dead cache
|
|
|
+ * into the queue, but doing so from inside the worker racing to
|
|
|
+ * destroy it.
|
|
|
+ *
|
|
|
+ * So if we aren't down to zero, we'll just schedule a worker and try
|
|
|
+ * again
|
|
|
+ */
|
|
|
+ if (atomic_read(&cachep->memcg_params->nr_pages) != 0) {
|
|
|
+ kmem_cache_shrink(cachep);
|
|
|
+ if (atomic_read(&cachep->memcg_params->nr_pages) == 0)
|
|
|
+ return;
|
|
|
+ } else
|
|
|
+ kmem_cache_destroy(cachep);
|
|
|
+}
|
|
|
+
|
|
|
+void mem_cgroup_destroy_cache(struct kmem_cache *cachep)
|
|
|
+{
|
|
|
+ if (!cachep->memcg_params->dead)
|
|
|
+ return;
|
|
|
+
|
|
|
+ /*
|
|
|
+ * There are many ways in which we can get here.
|
|
|
+ *
|
|
|
+ * We can get to a memory-pressure situation while the delayed work is
|
|
|
+ * still pending to run. The vmscan shrinkers can then release all
|
|
|
+ * cache memory and get us to destruction. If this is the case, we'll
|
|
|
+ * be executed twice, which is a bug (the second time will execute over
|
|
|
+ * bogus data). In this case, cancelling the work should be fine.
|
|
|
+ *
|
|
|
+ * But we can also get here from the worker itself, if
|
|
|
+ * kmem_cache_shrink is enough to shake all the remaining objects and
|
|
|
+ * get the page count to 0. In this case, we'll deadlock if we try to
|
|
|
+ * cancel the work (the worker runs with an internal lock held, which
|
|
|
+ * is the same lock we would hold for cancel_work_sync().)
|
|
|
+ *
|
|
|
+ * Since we can't possibly know who got us here, just refrain from
|
|
|
+ * running if there is already work pending
|
|
|
+ */
|
|
|
+ if (work_pending(&cachep->memcg_params->destroy))
|
|
|
+ return;
|
|
|
+ /*
|
|
|
+ * We have to defer the actual destroying to a workqueue, because
|
|
|
+ * we might currently be in a context that cannot sleep.
|
|
|
+ */
|
|
|
+ schedule_work(&cachep->memcg_params->destroy);
|
|
|
+}
|
|
|
+
|
|
|
+static char *memcg_cache_name(struct mem_cgroup *memcg, struct kmem_cache *s)
|
|
|
+{
|
|
|
+ char *name;
|
|
|
+ struct dentry *dentry;
|
|
|
+
|
|
|
+ rcu_read_lock();
|
|
|
+ dentry = rcu_dereference(memcg->css.cgroup->dentry);
|
|
|
+ rcu_read_unlock();
|
|
|
+
|
|
|
+ BUG_ON(dentry == NULL);
|
|
|
+
|
|
|
+ name = kasprintf(GFP_KERNEL, "%s(%d:%s)", s->name,
|
|
|
+ memcg_cache_id(memcg), dentry->d_name.name);
|
|
|
+
|
|
|
+ return name;
|
|
|
+}
|
|
|
+
|
|
|
+static struct kmem_cache *kmem_cache_dup(struct mem_cgroup *memcg,
|
|
|
+ struct kmem_cache *s)
|
|
|
+{
|
|
|
+ char *name;
|
|
|
+ struct kmem_cache *new;
|
|
|
+
|
|
|
+ name = memcg_cache_name(memcg, s);
|
|
|
+ if (!name)
|
|
|
+ return NULL;
|
|
|
+
|
|
|
+ new = kmem_cache_create_memcg(memcg, name, s->object_size, s->align,
|
|
|
+ (s->flags & ~SLAB_PANIC), s->ctor, s);
|
|
|
+
|
|
|
+ if (new)
|
|
|
+ new->allocflags |= __GFP_KMEMCG;
|
|
|
+
|
|
|
+ kfree(name);
|
|
|
+ return new;
|
|
|
+}
|
|
|
+
|
|
|
+/*
|
|
|
+ * This lock protects updaters, not readers. We want readers to be as fast as
|
|
|
+ * they can, and they will either see NULL or a valid cache value. Our model
+ * allows them to see NULL, in which case the root memcg will be selected.
+ *
+ * We need this lock because multiple allocations to the same cache can span
+ * more than one worker. Only one of them can create the cache.
|
|
|
+ */
|
|
|
+static DEFINE_MUTEX(memcg_cache_mutex);
|
|
|
+static struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg,
|
|
|
+ struct kmem_cache *cachep)
|
|
|
+{
|
|
|
+ struct kmem_cache *new_cachep;
|
|
|
+ int idx;
|
|
|
+
|
|
|
+ BUG_ON(!memcg_can_account_kmem(memcg));
|
|
|
+
|
|
|
+ idx = memcg_cache_id(memcg);
|
|
|
+
|
|
|
+ mutex_lock(&memcg_cache_mutex);
|
|
|
+ new_cachep = cachep->memcg_params->memcg_caches[idx];
|
|
|
+ if (new_cachep)
|
|
|
+ goto out;
|
|
|
+
|
|
|
+ new_cachep = kmem_cache_dup(memcg, cachep);
|
|
|
+ if (new_cachep == NULL) {
|
|
|
+ new_cachep = cachep;
|
|
|
+ goto out;
|
|
|
+ }
|
|
|
+
|
|
|
+ mem_cgroup_get(memcg);
|
|
|
+ atomic_set(&new_cachep->memcg_params->nr_pages , 0);
|
|
|
+
|
|
|
+ cachep->memcg_params->memcg_caches[idx] = new_cachep;
|
|
|
+ /*
|
|
|
+ * the readers won't lock; make sure everybody sees the updated value,
+ * so they won't put stuff in the queue again for no reason.
|
|
|
+ */
|
|
|
+ wmb();
|
|
|
+out:
|
|
|
+ mutex_unlock(&memcg_cache_mutex);
|
|
|
+ return new_cachep;
|
|
|
+}
|
|
|
+
|
|
|
+void kmem_cache_destroy_memcg_children(struct kmem_cache *s)
|
|
|
+{
|
|
|
+ struct kmem_cache *c;
|
|
|
+ int i;
|
|
|
+
|
|
|
+ if (!s->memcg_params)
|
|
|
+ return;
|
|
|
+ if (!s->memcg_params->is_root_cache)
|
|
|
+ return;
|
|
|
+
|
|
|
+ /*
|
|
|
+ * If the cache is being destroyed, we trust that there is no one else
|
|
|
+ * requesting objects from it. Even if there are, the sanity checks in
|
|
|
+ * kmem_cache_destroy should catch this ill case.
|
|
|
+ *
|
|
|
+ * Still, we don't want anyone else freeing memcg_caches under our
|
|
|
+ * noses, which can happen if a new memcg comes to life. As usual,
|
|
|
+ * we'll take the set_limit_mutex to protect ourselves against this.
|
|
|
+ */
|
|
|
+ mutex_lock(&set_limit_mutex);
|
|
|
+ for (i = 0; i < memcg_limited_groups_array_size; i++) {
|
|
|
+ c = s->memcg_params->memcg_caches[i];
|
|
|
+ if (!c)
|
|
|
+ continue;
|
|
|
+
|
|
|
+ /*
|
|
|
+ * We will now manually delete the caches, so to avoid races
|
|
|
+ * we need to cancel all pending destruction workers and
|
|
|
+ * proceed with destruction ourselves.
|
|
|
+ *
|
|
|
+ * kmem_cache_destroy() will call kmem_cache_shrink internally,
|
|
|
+ * and that could spawn the workers again: it is likely that
|
|
|
+ * the cache still has active pages until this very moment.
|
|
|
+ * This would lead us back to mem_cgroup_destroy_cache.
|
|
|
+ *
|
|
|
+ * But that will not execute at all if the "dead" flag is not
|
|
|
+ * set, so flip it down to guarantee we are in control.
|
|
|
+ */
|
|
|
+ c->memcg_params->dead = false;
|
|
|
+ cancel_work_sync(&c->memcg_params->destroy);
|
|
|
+ kmem_cache_destroy(c);
|
|
|
+ }
|
|
|
+ mutex_unlock(&set_limit_mutex);
|
|
|
+}
|
|
|
+
|
|
|
+struct create_work {
|
|
|
+ struct mem_cgroup *memcg;
|
|
|
+ struct kmem_cache *cachep;
|
|
|
+ struct work_struct work;
|
|
|
+};
|
|
|
+
|
|
|
+static void mem_cgroup_destroy_all_caches(struct mem_cgroup *memcg)
|
|
|
+{
|
|
|
+ struct kmem_cache *cachep;
|
|
|
+ struct memcg_cache_params *params;
|
|
|
+
|
|
|
+ if (!memcg_kmem_is_active(memcg))
|
|
|
+ return;
|
|
|
+
|
|
|
+ mutex_lock(&memcg->slab_caches_mutex);
|
|
|
+ list_for_each_entry(params, &memcg->memcg_slab_caches, list) {
|
|
|
+ cachep = memcg_params_to_cache(params);
|
|
|
+ cachep->memcg_params->dead = true;
|
|
|
+ INIT_WORK(&cachep->memcg_params->destroy,
|
|
|
+ kmem_cache_destroy_work_func);
|
|
|
+ schedule_work(&cachep->memcg_params->destroy);
|
|
|
+ }
|
|
|
+ mutex_unlock(&memcg->slab_caches_mutex);
|
|
|
+}
|
|
|
+
|
|
|
+static void memcg_create_cache_work_func(struct work_struct *w)
|
|
|
+{
|
|
|
+ struct create_work *cw;
|
|
|
+
|
|
|
+ cw = container_of(w, struct create_work, work);
|
|
|
+ memcg_create_kmem_cache(cw->memcg, cw->cachep);
|
|
|
+ /* Drop the reference gotten when we enqueued. */
|
|
|
+ css_put(&cw->memcg->css);
|
|
|
+ kfree(cw);
|
|
|
+}
|
|
|
+
|
|
|
+/*
|
|
|
+ * Enqueue the creation of a per-memcg kmem_cache.
|
|
|
+ * Called with rcu_read_lock.
|
|
|
+ */
|
|
|
+static void __memcg_create_cache_enqueue(struct mem_cgroup *memcg,
|
|
|
+ struct kmem_cache *cachep)
|
|
|
+{
|
|
|
+ struct create_work *cw;
|
|
|
+
|
|
|
+ cw = kmalloc(sizeof(struct create_work), GFP_NOWAIT);
|
|
|
+ if (cw == NULL)
|
|
|
+ return;
|
|
|
+
|
|
|
+ /* The corresponding put will be done in the workqueue. */
|
|
|
+ if (!css_tryget(&memcg->css)) {
|
|
|
+ kfree(cw);
|
|
|
+ return;
|
|
|
+ }
|
|
|
+
|
|
|
+ cw->memcg = memcg;
|
|
|
+ cw->cachep = cachep;
|
|
|
+
|
|
|
+ INIT_WORK(&cw->work, memcg_create_cache_work_func);
|
|
|
+ schedule_work(&cw->work);
|
|
|
+}
|
|
|
+
|
|
|
+static void memcg_create_cache_enqueue(struct mem_cgroup *memcg,
|
|
|
+ struct kmem_cache *cachep)
|
|
|
+{
|
|
|
+ /*
|
|
|
+ * We need to stop accounting when we kmalloc, because if the
|
|
|
+ * corresponding kmalloc cache is not yet created, the first allocation
|
|
|
+ * in __memcg_create_cache_enqueue will recurse.
|
|
|
+ *
|
|
|
+ * However, it is better to enclose the whole function. Depending on
|
|
|
+ * the debugging options enabled, INIT_WORK(), for instance, can
|
|
|
+ * trigger an allocation. This too, will make us recurse. Because at
|
|
|
+ * this point we can't allow ourselves back into memcg_kmem_get_cache,
|
|
|
+ * the safest choice is to do it like this, wrapping the whole function.
|
|
|
+ */
|
|
|
+ memcg_stop_kmem_account();
|
|
|
+ __memcg_create_cache_enqueue(memcg, cachep);
|
|
|
+ memcg_resume_kmem_account();
|
|
|
+}
|
|
|
+/*
|
|
|
+ * Return the kmem_cache we're supposed to use for a slab allocation.
|
|
|
+ * We try to use the current memcg's version of the cache.
|
|
|
+ *
|
|
|
+ * If the cache does not exist yet and we are the first user of it,
|
|
|
+ * we either create it immediately, if possible, or create it asynchronously
|
|
|
+ * in a workqueue.
|
|
|
+ * In the latter case, we will let the current allocation go through with
|
|
|
+ * the original cache.
|
|
|
+ *
|
|
|
+ * Can't be called in interrupt context or from kernel threads.
|
|
|
+ * This function needs to be called with rcu_read_lock() held.
|
|
|
+ */
|
|
|
+struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep,
|
|
|
+ gfp_t gfp)
|
|
|
+{
|
|
|
+ struct mem_cgroup *memcg;
|
|
|
+ int idx;
|
|
|
+
|
|
|
+ VM_BUG_ON(!cachep->memcg_params);
|
|
|
+ VM_BUG_ON(!cachep->memcg_params->is_root_cache);
|
|
|
+
|
|
|
+ if (!current->mm || current->memcg_kmem_skip_account)
|
|
|
+ return cachep;
|
|
|
+
|
|
|
+ rcu_read_lock();
|
|
|
+ memcg = mem_cgroup_from_task(rcu_dereference(current->mm->owner));
|
|
|
+ rcu_read_unlock();
|
|
|
+
|
|
|
+ if (!memcg_can_account_kmem(memcg))
|
|
|
+ return cachep;
|
|
|
+
|
|
|
+ idx = memcg_cache_id(memcg);
|
|
|
+
|
|
|
+ /*
|
|
|
+ * barrier to make sure we're always seeing the up-to-date value. The
|
|
|
+ * code updating memcg_caches will issue a write barrier to match this.
|
|
|
+ */
|
|
|
+ read_barrier_depends();
|
|
|
+ if (unlikely(cachep->memcg_params->memcg_caches[idx] == NULL)) {
|
|
|
+ /*
|
|
|
+ * If we are in a safe context (can wait, and not in interrupt
|
|
|
+ * context), we could be predictable and return right away.
|
|
|
+ * This would guarantee that the allocation being performed
|
|
|
+ * already belongs in the new cache.
|
|
|
+ *
|
|
|
+ * However, there are some clashes that can arise from locking.
+ * For instance, because we acquire the slab_mutex while doing
+ * kmem_cache_dup, this means no further allocation could happen
+ * with the slab_mutex held.
+ *
+ * Also, because cache creation issues get_online_cpus(), this
+ * creates a lock chain: memcg_slab_mutex -> cpu_hotplug_mutex,
+ * that ends up reversed during cpu hotplug. (cpuset allocates
+ * a bunch of GFP_KERNEL memory during cpuup). Due to all that,
+ * it is better to defer everything.
|
|
|
+ */
|
|
|
+ memcg_create_cache_enqueue(memcg, cachep);
|
|
|
+ return cachep;
|
|
|
+ }
|
|
|
+
|
|
|
+ return cachep->memcg_params->memcg_caches[idx];
|
|
|
+}
|
|
|
+EXPORT_SYMBOL(__memcg_kmem_get_cache);
|
|
|
+
|
|
|
+/*
|
|
|
+ * We need to verify if the allocation against current->mm->owner's memcg is
|
|
|
+ * possible for the given order. But the page is not allocated yet, so we'll
|
|
|
+ * need a further commit step to do the final arrangements.
|
|
|
+ *
|
|
|
+ * It is possible for the task to switch cgroups in the meantime, so at
|
|
|
+ * commit time, we can't rely on task conversion any longer. We'll then use
|
|
|
+ * the handle argument to return to the caller which cgroup we should commit
|
|
|
+ * against. We could also return the memcg directly and avoid the pointer
|
|
|
+ * passing, but a boolean return value gives better semantics considering
|
|
|
+ * the compiled-out case as well.
|
|
|
+ *
|
|
|
+ * Returning true means the allocation is possible.
|
|
|
+ */
|
|
|
+bool
|
|
|
+__memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **_memcg, int order)
|
|
|
+{
|
|
|
+ struct mem_cgroup *memcg;
|
|
|
+ int ret;
|
|
|
+
|
|
|
+ *_memcg = NULL;
|
|
|
+ memcg = try_get_mem_cgroup_from_mm(current->mm);
|
|
|
+
|
|
|
+ /*
|
|
|
+ * Very rare case described in mem_cgroup_from_task. Unfortunately there
+ * isn't much we can do without complicating this too much, and it would
+ * be gfp-dependent anyway. Just let it go.
|
|
|
+ */
|
|
|
+ if (unlikely(!memcg))
|
|
|
+ return true;
|
|
|
+
|
|
|
+ if (!memcg_can_account_kmem(memcg)) {
|
|
|
+ css_put(&memcg->css);
|
|
|
+ return true;
|
|
|
+ }
|
|
|
+
|
|
|
+ ret = memcg_charge_kmem(memcg, gfp, PAGE_SIZE << order);
|
|
|
+ if (!ret)
|
|
|
+ *_memcg = memcg;
|
|
|
+
|
|
|
+ css_put(&memcg->css);
|
|
|
+ return (ret == 0);
|
|
|
+}
|
|
|
+
|
|
|
+void __memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg,
|
|
|
+ int order)
|
|
|
+{
|
|
|
+ struct page_cgroup *pc;
|
|
|
+
|
|
|
+ VM_BUG_ON(mem_cgroup_is_root(memcg));
|
|
|
+
|
|
|
+ /* The page allocation failed. Revert */
|
|
|
+ if (!page) {
|
|
|
+ memcg_uncharge_kmem(memcg, PAGE_SIZE << order);
|
|
|
+ return;
|
|
|
+ }
|
|
|
+
|
|
|
+ pc = lookup_page_cgroup(page);
|
|
|
+ lock_page_cgroup(pc);
|
|
|
+ pc->mem_cgroup = memcg;
|
|
|
+ SetPageCgroupUsed(pc);
|
|
|
+ unlock_page_cgroup(pc);
|
|
|
+}
|
|
|
+
|
|
|
+void __memcg_kmem_uncharge_pages(struct page *page, int order)
|
|
|
+{
|
|
|
+ struct mem_cgroup *memcg = NULL;
|
|
|
+ struct page_cgroup *pc;
|
|
|
+
|
|
|
+
|
|
|
+ pc = lookup_page_cgroup(page);
|
|
|
+ /*
|
|
|
+ * Fast unlocked return. Theoretically might have changed, have to
|
|
|
+ * check again after locking.
|
|
|
+ */
|
|
|
+ if (!PageCgroupUsed(pc))
|
|
|
+ return;
|
|
|
+
|
|
|
+ lock_page_cgroup(pc);
|
|
|
+ if (PageCgroupUsed(pc)) {
|
|
|
+ memcg = pc->mem_cgroup;
|
|
|
+ ClearPageCgroupUsed(pc);
|
|
|
+ }
|
|
|
+ unlock_page_cgroup(pc);
|
|
|
+
|
|
|
+ /*
|
|
|
+ * We trust that only if there is a memcg associated with the page, it
|
|
|
+ * is a valid allocation
|
|
|
+ */
|
|
|
+ if (!memcg)
|
|
|
+ return;
|
|
|
+
|
|
|
+ VM_BUG_ON(mem_cgroup_is_root(memcg));
|
|
|
+ memcg_uncharge_kmem(memcg, PAGE_SIZE << order);
|
|
|
+}
|
|
|
+#else
|
|
|
+static inline void mem_cgroup_destroy_all_caches(struct mem_cgroup *memcg)
|
|
|
+{
|
|
|
+}
|
|
|
+#endif /* CONFIG_MEMCG_KMEM */
|
|
|
+
|
|
|
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
|
|
|
+
|
|
|
+#define PCGF_NOCOPY_AT_SPLIT (1 << PCG_LOCK | 1 << PCG_MIGRATION)
|
|
|
+/*
|
|
|
+ * Because tail pages are not marked as "used", set it. We're under
|
|
|
+ * zone->lru_lock, 'splitting on pmd' and compound_lock.
|
|
|
+ * charge/uncharge will be never happen and move_account() is done under
|
|
|
+ * compound_lock(), so we don't have to take care of races.
|
|
|
+ */
|
|
|
+void mem_cgroup_split_huge_fixup(struct page *head)
|
|
|
+{
|
|
|
+ struct page_cgroup *head_pc = lookup_page_cgroup(head);
|
|
|
+ struct page_cgroup *pc;
|
|
|
+ int i;
|
|
|
+
|
|
|
+ if (mem_cgroup_disabled())
|
|
|
+ return;
|
|
|
+ for (i = 1; i < HPAGE_PMD_NR; i++) {
|
|
|
+ pc = head_pc + i;
|
|
|
+ pc->mem_cgroup = head_pc->mem_cgroup;
|
|
|
+ smp_wmb();/* see __commit_charge() */
|
|
|
+ pc->flags = head_pc->flags & ~PCGF_NOCOPY_AT_SPLIT;
|
|
|
+ }
|
|
|
+}
|
|
|
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
|
|
|
+
|
|
|
+/**
|
|
|
+ * mem_cgroup_move_account - move account of the page
|
|
|
+ * @page: the page
|
|
|
+ * @nr_pages: number of regular pages (>1 for huge pages)
|
|
|
+ * @pc: page_cgroup of the page.
|
|
|
+ * @from: mem_cgroup which the page is moved from.
|
|
|
+ * @to: mem_cgroup which the page is moved to. @from != @to.
|
|
|
+ *
|
|
|
+ * The caller must confirm following.
|
|
|
+ * - page is not on LRU (isolate_page() is useful.)
|
|
|
+ * - compound_lock is held when nr_pages > 1
|
|
|
+ *
|
|
|
+ * This function doesn't do "charge" to new cgroup and doesn't do "uncharge"
|
|
|
+ * from old cgroup.
|
|
|
+ */
|
|
|
+static int mem_cgroup_move_account(struct page *page,
|
|
|
+ unsigned int nr_pages,
|
|
|
+ struct page_cgroup *pc,
|
|
|
+ struct mem_cgroup *from,
|
|
|
+ struct mem_cgroup *to)
|
|
|
+{
|
|
|
+ unsigned long flags;
|
|
|
+ int ret;
|
|
|
+ bool anon = PageAnon(page);
|
|
|
+
|
|
|
+ VM_BUG_ON(from == to);
|
|
|
+ VM_BUG_ON(PageLRU(page));
|
|
|
+ /*
|
|
|
+ * The page is isolated from LRU. So, collapse function
|
|
|
+ * will not handle this page. But page splitting can happen.
|
|
|
+ * Do this check under compound_page_lock(). The caller should
|
|
|
+ * hold it.
|
|
|
+ */
|
|
|
+ ret = -EBUSY;
|
|
|
+ if (nr_pages > 1 && !PageTransHuge(page))
|
|
|
+ goto out;
|
|
|
+
|
|
|
+ lock_page_cgroup(pc);
|
|
|
+
|
|
|
+ ret = -EINVAL;
|
|
|
+ if (!PageCgroupUsed(pc) || pc->mem_cgroup != from)
|
|
|
+ goto unlock;
|
|
|
+
|
|
|
+ move_lock_mem_cgroup(from, &flags);
|
|
|
+
|
|
|
+ if (!anon && page_mapped(page)) {
|
|
|
+ /* Update mapped_file data for mem_cgroup */
|
|
|
+ preempt_disable();
|
|
|
+ __this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
|
|
|
+ __this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
|
|
|
+ preempt_enable();
|
|
|
+ }
|
|
|
+ mem_cgroup_charge_statistics(from, anon, -nr_pages);
|
|
|
+
|
|
|
+ /* caller should have done css_get */
|
|
|
+ pc->mem_cgroup = to;
|
|
|
+ mem_cgroup_charge_statistics(to, anon, nr_pages);
|
|
|
+ move_unlock_mem_cgroup(from, &flags);
|
|
|
+ ret = 0;
|
|
|
+unlock:
|
|
|
+ unlock_page_cgroup(pc);
|
|
|
+ /*
|
|
|
+ * check events
|
|
|
+ */
|
|
|
+ memcg_check_events(to, page);
|
|
|
+ memcg_check_events(from, page);
|
|
|
+out:
|
|
|
+ return ret;
|
|
|
+}
|
|
|
+
|
|
|
+/**
|
|
|
+ * mem_cgroup_move_parent - moves page to the parent group
|
|
|
+ * @page: the page to move
|
|
|
+ * @pc: page_cgroup of the page
|
|
|
+ * @child: page's cgroup
|
|
|
+ *
|
|
|
+ * move charges to its parent or the root cgroup if the group has no
|
|
|
+ * parent (aka use_hierarchy==0).
|
|
|
+ * Although this might fail (get_page_unless_zero, isolate_lru_page or
|
|
|
+ * mem_cgroup_move_account fails) the failure is always temporary and
|
|
|
+ * it signals a race with a page removal/uncharge or migration. In the
|
|
|
+ * first case the page is on the way out and it will vanish from the LRU
|
|
|
+ * on the next attempt and the call should be retried later.
|
|
|
+ * Isolation from the LRU fails only if page has been isolated from
|
|
|
+ * the LRU since we looked at it and that usually means either global
|
|
|
+ * reclaim or migration going on. The page will either get back to the
|
|
|
+ * LRU or vanish.
|
|
|
+ * Finaly mem_cgroup_move_account fails only if the page got uncharged
|
|
|
+ * (!PageCgroupUsed) or moved to a different group. The page will
|
|
|
+ * disappear in the next attempt.
|
|
|
+ */
|
|
|
+static int mem_cgroup_move_parent(struct page *page,
|
|
|
+ struct page_cgroup *pc,
|
|
|
+ struct mem_cgroup *child)
|
|
|
+{
|
|
|
+ struct mem_cgroup *parent;
|
|
|
+ unsigned int nr_pages;
|
|
|
+ unsigned long uninitialized_var(flags);
|
|
|
+ int ret;
|
|
|
+
|
|
|
+ VM_BUG_ON(mem_cgroup_is_root(child));
|
|
|
+
|
|
|
+ ret = -EBUSY;
|
|
|
+ if (!get_page_unless_zero(page))
|
|
|
+ goto out;
|
|
|
+ if (isolate_lru_page(page))
|
|
|
+ goto put;
|
|
|
+
|
|
|
+ nr_pages = hpage_nr_pages(page);
|
|
|
+
|
|
|
+ parent = parent_mem_cgroup(child);
|
|
|
+ /*
|
|
|
+ * If no parent, move charges to root cgroup.
|
|
|
+ */
|
|
|
+ if (!parent)
|
|
|
+ parent = root_mem_cgroup;
|
|
|
+
|
|
|
+ if (nr_pages > 1) {
|
|
|
+ VM_BUG_ON(!PageTransHuge(page));
|
|
|
+ flags = compound_lock_irqsave(page);
|
|
|
+ }
|
|
|
+
|
|
|
+ ret = mem_cgroup_move_account(page, nr_pages,
|
|
|
+ pc, child, parent);
|
|
|
+ if (!ret)
|
|
|
+ __mem_cgroup_cancel_local_charge(child, nr_pages);
|
|
|
+
|
|
|
+ if (nr_pages > 1)
|
|
|
+ compound_unlock_irqrestore(page, flags);
|
|
|
+ putback_lru_page(page);
|
|
|
+put:
|
|
|
+ put_page(page);
|
|
|
+out:
|
|
|
+ return ret;
|
|
|
+}
|
|
|
+
|
|
|
+/*
|
|
|
+ * Charge the memory controller for page usage.
|
|
|
+ * Return
|
|
|
+ * 0 if the charge was successful
|
|
|
+ * < 0 if the cgroup is over its limit
|
|
|
+ */
|
|
|
+static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
|
|
|
+ gfp_t gfp_mask, enum charge_type ctype)
|
|
|
+{
|
|
|
+ struct mem_cgroup *memcg = NULL;
|
|
|
+ unsigned int nr_pages = 1;
|
|
|
+ bool oom = true;
|
|
|
+ int ret;
|
|
|
+
|
|
|
+ if (PageTransHuge(page)) {
|
|
|
nr_pages <<= compound_order(page);
|
|
|
VM_BUG_ON(!PageTransHuge(page));
|
|
|
/*
|
|
@@ -3486,8 +4395,6 @@ void mem_cgroup_print_bad_page(struct page *page)
|
|
|
}
|
|
|
#endif
|
|
|
|
|
|
-static DEFINE_MUTEX(set_limit_mutex);
|
|
|
-
|
|
|
static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
|
|
|
unsigned long long val)
|
|
|
{
|
|
@@ -3772,6 +4679,7 @@ static void mem_cgroup_force_empty_list(struct mem_cgroup *memcg,
|
|
|
static void mem_cgroup_reparent_charges(struct mem_cgroup *memcg)
|
|
|
{
|
|
|
int node, zid;
|
|
|
+ u64 usage;
|
|
|
|
|
|
do {
|
|
|
/* This is for making all *used* pages to be on LRU. */
|
|
@@ -3792,13 +4700,20 @@ static void mem_cgroup_reparent_charges(struct mem_cgroup *memcg)
|
|
|
cond_resched();
|
|
|
|
|
|
/*
|
|
|
+ * Kernel memory may not necessarily be trackable to a specific
+ * process, so such charges are not migrated, and therefore we
+ * can't expect their value to drop to 0 here.
+ * Having res filled up with kmem only is enough.
|
|
|
+ *
|
|
|
* This is a safety check because mem_cgroup_force_empty_list
|
|
|
* could have raced with mem_cgroup_replace_page_cache callers
|
|
|
* so the lru seemed empty but the page could have been added
|
|
|
* right after the check. RES_USAGE should be safe as we always
|
|
|
* charge before adding to the LRU.
|
|
|
*/
|
|
|
- } while (res_counter_read_u64(&memcg->res, RES_USAGE) > 0);
|
|
|
+ usage = res_counter_read_u64(&memcg->res, RES_USAGE) -
|
|
|
+ res_counter_read_u64(&memcg->kmem, RES_USAGE);
|
|
|
+ } while (usage > 0);
|
|
|
}
|
|
|
|
|
|
/*
|
|
@@ -3942,7 +4857,8 @@ static ssize_t mem_cgroup_read(struct cgroup *cont, struct cftype *cft,
|
|
|
struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
|
|
|
char str[64];
|
|
|
u64 val;
|
|
|
- int type, name, len;
|
|
|
+ int name, len;
|
|
|
+ enum res_type type;
|
|
|
|
|
|
type = MEMFILE_TYPE(cft->private);
|
|
|
name = MEMFILE_ATTR(cft->private);
|
|
@@ -3963,6 +4879,9 @@ static ssize_t mem_cgroup_read(struct cgroup *cont, struct cftype *cft,
|
|
|
else
|
|
|
val = res_counter_read_u64(&memcg->memsw, name);
|
|
|
break;
|
|
|
+ case _KMEM:
|
|
|
+ val = res_counter_read_u64(&memcg->kmem, name);
|
|
|
+ break;
|
|
|
default:
|
|
|
BUG();
|
|
|
}
|
|
@@ -3970,6 +4889,125 @@ static ssize_t mem_cgroup_read(struct cgroup *cont, struct cftype *cft,
|
|
|
len = scnprintf(str, sizeof(str), "%llu\n", (unsigned long long)val);
|
|
|
return simple_read_from_buffer(buf, nbytes, ppos, str, len);
|
|
|
}
|
|
|
+
|
|
|
+static int memcg_update_kmem_limit(struct cgroup *cont, u64 val)
|
|
|
+{
|
|
|
+ int ret = -EINVAL;
|
|
|
+#ifdef CONFIG_MEMCG_KMEM
|
|
|
+ bool must_inc_static_branch = false;
|
|
|
+
|
|
|
+ struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
|
|
|
+ /*
|
|
|
+ * For simplicity, we won't allow this to be disabled. It also can't
|
|
|
+ * be changed if the cgroup has children already, or if tasks had
|
|
|
+ * already joined.
|
|
|
+ *
|
|
|
+ * If tasks join before we set the limit, a person looking at
|
|
|
+ * kmem.usage_in_bytes will have no way to determine when it took
|
|
|
+ * place, which makes the value quite meaningless.
|
|
|
+ *
|
|
|
+ * After it first became limited, changes in the value of the limit are
|
|
|
+ * of course permitted.
|
|
|
+ *
|
|
|
+ * Taking the cgroup_lock is really offensive, but it is so far the only
|
|
|
+ * way to guarantee that no children will appear. There are plenty of
|
|
|
+ * other offenders, and they should all go away. Fine grained locking
|
|
|
+ * is probably the way to go here. When we are fully hierarchical, we
|
|
|
+ * can also get rid of the use_hierarchy check.
|
|
|
+ */
|
|
|
+ cgroup_lock();
|
|
|
+ mutex_lock(&set_limit_mutex);
|
|
|
+ if (!memcg->kmem_account_flags && val != RESOURCE_MAX) {
|
|
|
+ if (cgroup_task_count(cont) || (memcg->use_hierarchy &&
|
|
|
+ !list_empty(&cont->children))) {
|
|
|
+ ret = -EBUSY;
|
|
|
+ goto out;
|
|
|
+ }
|
|
|
+ ret = res_counter_set_limit(&memcg->kmem, val);
|
|
|
+ VM_BUG_ON(ret);
|
|
|
+
|
|
|
+ ret = memcg_update_cache_sizes(memcg);
|
|
|
+ if (ret) {
|
|
|
+ res_counter_set_limit(&memcg->kmem, RESOURCE_MAX);
|
|
|
+ goto out;
|
|
|
+ }
|
|
|
+ must_inc_static_branch = true;
|
|
|
+ /*
|
|
|
+ * kmem charges can outlive the cgroup. In the case of slab
|
|
|
+ * pages, for instance, a page contain objects from various
|
|
|
+ * processes, so it is unfeasible to migrate them away. We
|
|
|
+ * need to reference count the memcg because of that.
|
|
|
+ */
|
|
|
+ mem_cgroup_get(memcg);
|
|
|
+ } else
|
|
|
+ ret = res_counter_set_limit(&memcg->kmem, val);
|
|
|
+out:
|
|
|
+ mutex_unlock(&set_limit_mutex);
|
|
|
+ cgroup_unlock();
|
|
|
+
|
|
|
+ /*
|
|
|
+ * We are by now familiar with the fact that we can't inc the static
|
|
|
+ * branch inside cgroup_lock. See disarm functions for details. A
|
|
|
+ * worker here is overkill, but also wrong: After the limit is set, we
|
|
|
+ * must start accounting right away. Since this operation can't fail,
|
|
|
+ * we can safely defer it to here - no rollback will be needed.
|
|
|
+ *
|
|
|
+ * The boolean used to control this is also safe, because
|
|
|
+ * KMEM_ACCOUNTED_ACTIVATED guarantees that only one process will be
|
|
|
+ * able to set it to true;
|
|
|
+ */
|
|
|
+ if (must_inc_static_branch) {
|
|
|
+ static_key_slow_inc(&memcg_kmem_enabled_key);
|
|
|
+ /*
|
|
|
+ * setting the active bit after the inc will guarantee no one
|
|
|
+ * starts accounting before all call sites are patched
|
|
|
+ */
|
|
|
+ memcg_kmem_set_active(memcg);
|
|
|
+ }
|
|
|
+
|
|
|
+#endif
|
|
|
+ return ret;
|
|
|
+}
|
|
|
+
|
|
|
+static int memcg_propagate_kmem(struct mem_cgroup *memcg)
|
|
|
+{
|
|
|
+ int ret = 0;
|
|
|
+ struct mem_cgroup *parent = parent_mem_cgroup(memcg);
|
|
|
+ if (!parent)
|
|
|
+ goto out;
|
|
|
+
|
|
|
+ memcg->kmem_account_flags = parent->kmem_account_flags;
|
|
|
+#ifdef CONFIG_MEMCG_KMEM
|
|
|
+ /*
|
|
|
+ * When that happens, we need to disable the static branch only on those
+ * memcgs that enabled it. To achieve this, we would be forced to
+ * complicate the code by keeping track of which memcgs were the ones
+ * that actually enabled limits, and which ones got it from their
+ * parents.
|
|
|
+ *
|
|
|
+ * It is a lot simpler just to do static_key_slow_inc() on every child
|
|
|
+ * that is accounted.
|
|
|
+ */
|
|
|
+ if (!memcg_kmem_is_active(memcg))
|
|
|
+ goto out;
|
|
|
+
|
|
|
+ /*
|
|
|
+ * destroy(), called if we fail, will issue static_key_slow_dec() and
+ * mem_cgroup_put() if kmem is enabled. We have to either call them
+ * unconditionally, or clear the KMEM_ACTIVE flag. I personally find
+ * this more consistent, since it always leads to the same destroy path.
|
|
|
+ */
|
|
|
+ mem_cgroup_get(memcg);
|
|
|
+ static_key_slow_inc(&memcg_kmem_enabled_key);
|
|
|
+
|
|
|
+ mutex_lock(&set_limit_mutex);
|
|
|
+ ret = memcg_update_cache_sizes(memcg);
|
|
|
+ mutex_unlock(&set_limit_mutex);
|
|
|
+#endif
|
|
|
+out:
|
|
|
+ return ret;
|
|
|
+}
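memcg_propagate_kmem() copies the parent's accounting flags and, when the parent is already accounted, takes one reference and one static-key increment on behalf of the new child, so every accounted memcg pins the key exactly once and the destroy path stays uniform. A hedged userspace sketch of that per-node bookkeeping follows; the hypothetical helpers key_get()/key_put() stand in for static_key_slow_inc()/dec() plus mem_cgroup_get()/put().

#include <stdbool.h>
#include <stdio.h>

static int key_refs;                    /* stands in for the jump label's reference count */
static void key_get(void) { key_refs++; }
static void key_put(void) { key_refs--; }

struct group {
    bool accounted;                     /* analogue of KMEM_ACCOUNTED_ACTIVE */
};

/* a child inherits the parent's state and pins the key on its own behalf */
static void group_init(struct group *child, const struct group *parent)
{
    child->accounted = parent ? parent->accounted : false;
    if (child->accounted)
        key_get();
}

/* teardown never has to ask who originally enabled accounting */
static void group_destroy(struct group *g)
{
    if (g->accounted)
        key_put();
}

int main(void)
{
    struct group root = { .accounted = true }, child;

    key_get();                          /* root enabled the limit itself */
    group_init(&child, &root);
    printf("key refs after init: %d\n", key_refs);      /* 2 */
    group_destroy(&child);
    group_destroy(&root);
    printf("key refs after destroy: %d\n", key_refs);   /* 0 */
    return 0;
}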
+
 /*
 * The user of this function is...
 * RES_LIMIT.
@@ -3978,7 +5016,8 @@ static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft,
 const char *buffer)
 {
 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
- int type, name;
+ enum res_type type;
+ int name;
 unsigned long long val;
 int ret;
@@ -4000,8 +5039,12 @@ static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft,
 break;
 if (type == _MEM)
 ret = mem_cgroup_resize_limit(memcg, val);
- else
+ else if (type == _MEMSWAP)
 ret = mem_cgroup_resize_memsw_limit(memcg, val);
+ else if (type == _KMEM)
+ ret = memcg_update_kmem_limit(cont, val);
+ else
+ return -EINVAL;
 break;
 case RES_SOFT_LIMIT:
 ret = res_counter_memparse_write_strategy(buffer, &val);
@@ -4054,7 +5097,8 @@ out:
 static int mem_cgroup_reset(struct cgroup *cont, unsigned int event)
 {
 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
- int type, name;
+ int name;
+ enum res_type type;

 type = MEMFILE_TYPE(event);
 name = MEMFILE_ATTR(event);
@@ -4066,14 +5110,22 @@ static int mem_cgroup_reset(struct cgroup *cont, unsigned int event)
 case RES_MAX_USAGE:
 if (type == _MEM)
 res_counter_reset_max(&memcg->res);
- else
+ else if (type == _MEMSWAP)
 res_counter_reset_max(&memcg->memsw);
+ else if (type == _KMEM)
+ res_counter_reset_max(&memcg->kmem);
+ else
+ return -EINVAL;
 break;
 case RES_FAILCNT:
 if (type == _MEM)
 res_counter_reset_failcnt(&memcg->res);
- else
+ else if (type == _MEMSWAP)
 res_counter_reset_failcnt(&memcg->memsw);
+ else if (type == _KMEM)
+ res_counter_reset_failcnt(&memcg->kmem);
+ else
+ return -EINVAL;
 break;
}
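Both mem_cgroup_write() and mem_cgroup_reset() rely on cft->private carrying a (resource type, attribute) pair that MEMFILE_TYPE() and MEMFILE_ATTR() unpack before dispatching on _MEM, _MEMSWAP or _KMEM. A standalone sketch of that encoding idea is below; the 16-bit split and the demo names are assumptions for illustration, not quoted from this patch.

#include <assert.h>

/* assumed layout: high 16 bits = resource type, low 16 bits = attribute */
#define FILE_PRIVATE(type, attr)    (((type) << 16) | (attr))
#define FILE_TYPE(val)              (((val) >> 16) & 0xffff)
#define FILE_ATTR(val)              ((val) & 0xffff)

enum demo_res_type { DEMO_MEM, DEMO_MEMSWAP, DEMO_KMEM };
enum { DEMO_RES_USAGE, DEMO_RES_LIMIT, DEMO_RES_MAX_USAGE, DEMO_RES_FAILCNT };

int main(void)
{
    int private = FILE_PRIVATE(DEMO_KMEM, DEMO_RES_FAILCNT);

    /* the write/reset handlers recover both halves and branch on them */
    assert(FILE_TYPE(private) == DEMO_KMEM);
    assert(FILE_ATTR(private) == DEMO_RES_FAILCNT);
    return 0;
}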
@@ -4390,7 +5442,7 @@ static int mem_cgroup_usage_register_event(struct cgroup *cgrp,
 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
 struct mem_cgroup_thresholds *thresholds;
 struct mem_cgroup_threshold_ary *new;
- int type = MEMFILE_TYPE(cft->private);
+ enum res_type type = MEMFILE_TYPE(cft->private);
 u64 threshold, usage;
 int i, size, ret;
@@ -4473,7 +5525,7 @@ static void mem_cgroup_usage_unregister_event(struct cgroup *cgrp,
 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
 struct mem_cgroup_thresholds *thresholds;
 struct mem_cgroup_threshold_ary *new;
- int type = MEMFILE_TYPE(cft->private);
+ enum res_type type = MEMFILE_TYPE(cft->private);
 u64 usage;
 int i, j, size;
@@ -4551,7 +5603,7 @@ static int mem_cgroup_oom_register_event(struct cgroup *cgrp,
 {
 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
 struct mem_cgroup_eventfd_list *event;
- int type = MEMFILE_TYPE(cft->private);
+ enum res_type type = MEMFILE_TYPE(cft->private);

 BUG_ON(type != _OOM_TYPE);
 event = kmalloc(sizeof(*event), GFP_KERNEL);
@@ -4576,7 +5628,7 @@ static void mem_cgroup_oom_unregister_event(struct cgroup *cgrp,
 {
 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
 struct mem_cgroup_eventfd_list *ev, *tmp;
- int type = MEMFILE_TYPE(cft->private);
+ enum res_type type = MEMFILE_TYPE(cft->private);

 BUG_ON(type != _OOM_TYPE);
@@ -4635,12 +5687,33 @@ static int mem_cgroup_oom_control_write(struct cgroup *cgrp,
 #ifdef CONFIG_MEMCG_KMEM
 static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
 {
+ int ret;
+
+ memcg->kmemcg_id = -1;
+ ret = memcg_propagate_kmem(memcg);
+ if (ret)
+ return ret;
+
 return mem_cgroup_sockets_init(memcg, ss);
 };

 static void kmem_cgroup_destroy(struct mem_cgroup *memcg)
 {
 mem_cgroup_sockets_destroy(memcg);
+
+ memcg_kmem_mark_dead(memcg);
+
+ if (res_counter_read_u64(&memcg->kmem, RES_USAGE) != 0)
+ return;
+
+ /*
+ * Charges already down to 0, undo mem_cgroup_get() done in the charge
+ * path here, being careful not to race with memcg_uncharge_kmem: it is
+ * possible that the charges went down to 0 between mark_dead and the
+ * res_counter read, so in that case, we don't need the put
+ */
+ if (memcg_kmem_test_and_clear_dead(memcg))
+ mem_cgroup_put(memcg);
}
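kmem_cgroup_destroy() and memcg_uncharge_kmem() both race to drop the reference taken when the limit was set; the DEAD bit plus a test-and-clear ensures exactly one side performs the final put even if usage reaches zero concurrently with destruction. A minimal userspace sketch of that handshake with C11 atomics follows; the names are illustrative, with the kernel side using memcg_kmem_mark_dead() and memcg_kmem_test_and_clear_dead().

#include <stdatomic.h>
#include <stdio.h>

#define DEAD    0x1

static atomic_uint flags;
static atomic_uint usage;               /* outstanding kmem charges */
static atomic_int refcnt = 1;           /* reference taken when the limit was set */

static void put(void) { atomic_fetch_sub(&refcnt, 1); }

/* returns nonzero only for the caller that actually cleared DEAD */
static unsigned int test_and_clear_dead(void)
{
    return atomic_fetch_and(&flags, ~DEAD) & DEAD;
}

static void destroy(void)
{
    atomic_fetch_or(&flags, DEAD);      /* mark_dead */
    if (atomic_load(&usage) != 0)
        return;                         /* the uncharge side will do the put */
    if (test_and_clear_dead())
        put();
}

static void uncharge(unsigned int nr)
{
    if (atomic_fetch_sub(&usage, nr) - nr != 0)
        return;
    /* last charge gone: drop the reference unless destroy() already did */
    if (test_and_clear_dead())
        put();
}

int main(void)
{
    atomic_store(&usage, 1);
    destroy();                          /* usage still nonzero: no put yet */
    uncharge(1);                        /* last charge: this side does the put */
    printf("refcnt = %d\n", atomic_load(&refcnt));      /* 0 */
    return 0;
}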
 #else
 static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
@@ -4748,6 +5821,37 @@ static struct cftype mem_cgroup_files[] = {
 .trigger = mem_cgroup_reset,
 .read = mem_cgroup_read,
 },
+#endif
+#ifdef CONFIG_MEMCG_KMEM
+ {
+ .name = "kmem.limit_in_bytes",
+ .private = MEMFILE_PRIVATE(_KMEM, RES_LIMIT),
+ .write_string = mem_cgroup_write,
+ .read = mem_cgroup_read,
+ },
+ {
+ .name = "kmem.usage_in_bytes",
+ .private = MEMFILE_PRIVATE(_KMEM, RES_USAGE),
+ .read = mem_cgroup_read,
+ },
+ {
+ .name = "kmem.failcnt",
+ .private = MEMFILE_PRIVATE(_KMEM, RES_FAILCNT),
+ .trigger = mem_cgroup_reset,
+ .read = mem_cgroup_read,
+ },
+ {
+ .name = "kmem.max_usage_in_bytes",
+ .private = MEMFILE_PRIVATE(_KMEM, RES_MAX_USAGE),
+ .trigger = mem_cgroup_reset,
+ .read = mem_cgroup_read,
+ },
+#ifdef CONFIG_SLABINFO
+ {
+ .name = "kmem.slabinfo",
+ .read_seq_string = mem_cgroup_slabinfo_read,
+ },
+#endif
 #endif
 { }, /* terminate */
};
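The new kmem.* files sit next to the existing memory controller knobs and use the same read/write interface: writing a byte value to kmem.limit_in_bytes enables accounting for an empty, childless group, after which kmem.usage_in_bytes and kmem.failcnt can be polled. A small userspace example is sketched below; the /sys/fs/cgroup/memory mount point and the "demo" group name are assumptions about the local setup, not dictated by the patch.

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

/* assumed v1 hierarchy mounted at /sys/fs/cgroup/memory with a "demo" group */
#define GROUP   "/sys/fs/cgroup/memory/demo/"

static int write_file(const char *name, const char *val)
{
    char path[256];
    int fd, ret;

    snprintf(path, sizeof(path), GROUP "%s", name);
    fd = open(path, O_WRONLY);
    if (fd < 0)
        return -1;
    ret = write(fd, val, strlen(val)) < 0 ? -1 : 0;
    close(fd);
    return ret;
}

static void show_file(const char *name)
{
    char path[256], buf[64];
    ssize_t n;
    int fd;

    snprintf(path, sizeof(path), GROUP "%s", name);
    fd = open(path, O_RDONLY);
    if (fd < 0)
        return;
    n = read(fd, buf, sizeof(buf) - 1);
    if (n > 0) {
        buf[n] = '\0';
        printf("%s: %s", name, buf);
    }
    close(fd);
}

int main(void)
{
    /* must be done while the group is empty and has no children */
    if (write_file("memory.kmem.limit_in_bytes", "134217728"))
        perror("set kmem limit");
    show_file("memory.kmem.usage_in_bytes");
    show_file("memory.kmem.failcnt");
    return 0;
}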
@@ -4816,16 +5920,29 @@ out_free:
 }

 /*
- * Helpers for freeing a kmalloc()ed/vzalloc()ed mem_cgroup by RCU,
- * but in process context. The work_freeing structure is overlaid
- * on the rcu_freeing structure, which itself is overlaid on memsw.
+ * At destroying mem_cgroup, references from swap_cgroup can remain.
+ * (scanning all at force_empty is too costly...)
+ *
+ * Instead of clearing all references at force_empty, we remember
+ * the number of reference from swap_cgroup and free mem_cgroup when
+ * it goes down to 0.
+ *
+ * Removal of cgroup itself succeeds regardless of refs from swap.
 */
-static void free_work(struct work_struct *work)
+
+static void __mem_cgroup_free(struct mem_cgroup *memcg)
 {
- struct mem_cgroup *memcg;
+ int node;
 int size = sizeof(struct mem_cgroup);

- memcg = container_of(work, struct mem_cgroup, work_freeing);
+ mem_cgroup_remove_from_trees(memcg);
+ free_css_id(&mem_cgroup_subsys, &memcg->css);
+
+ for_each_node(node)
+ free_mem_cgroup_per_zone_info(memcg, node);
+
+ free_percpu(memcg->stat);
+
 /*
 * We need to make sure that (at least for now), the jump label
 * destruction code runs outside of the cgroup lock. This is because
@@ -4837,45 +5954,34 @@ static void free_work(struct work_struct *work)
 * to move this code around, and make sure it is outside
 * the cgroup_lock.
 */
- disarm_sock_keys(memcg);
+ disarm_static_keys(memcg);
 if (size < PAGE_SIZE)
 kfree(memcg);
 else
 vfree(memcg);
 }

-static void free_rcu(struct rcu_head *rcu_head)
-{
- struct mem_cgroup *memcg;
-
- memcg = container_of(rcu_head, struct mem_cgroup, rcu_freeing);
- INIT_WORK(&memcg->work_freeing, free_work);
- schedule_work(&memcg->work_freeing);
-}

 /*
- * At destroying mem_cgroup, references from swap_cgroup can remain.
- * (scanning all at force_empty is too costly...)
- *
- * Instead of clearing all references at force_empty, we remember
- * the number of reference from swap_cgroup and free mem_cgroup when
- * it goes down to 0.
- *
- * Removal of cgroup itself succeeds regardless of refs from swap.
+ * Helpers for freeing a kmalloc()ed/vzalloc()ed mem_cgroup by RCU,
+ * but in process context. The work_freeing structure is overlaid
+ * on the rcu_freeing structure, which itself is overlaid on memsw.
 */
-
-static void __mem_cgroup_free(struct mem_cgroup *memcg)
+static void free_work(struct work_struct *work)
 {
- int node;
+ struct mem_cgroup *memcg;

- mem_cgroup_remove_from_trees(memcg);
- free_css_id(&mem_cgroup_subsys, &memcg->css);
+ memcg = container_of(work, struct mem_cgroup, work_freeing);
+ __mem_cgroup_free(memcg);
+}

- for_each_node(node)
- free_mem_cgroup_per_zone_info(memcg, node);
+static void free_rcu(struct rcu_head *rcu_head)
+{
+ struct mem_cgroup *memcg;

- free_percpu(memcg->stat);
- call_rcu(&memcg->rcu_freeing, free_rcu);
+ memcg = container_of(rcu_head, struct mem_cgroup, rcu_freeing);
+ INIT_WORK(&memcg->work_freeing, free_work);
+ schedule_work(&memcg->work_freeing);
}
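The reordered helpers above split teardown into two deferrals: the final put only queues an RCU callback, the RCU callback only schedules a work item, and __mem_cgroup_free() then runs in process context, where the jump-label teardown done by disarm_static_keys() and a possible vfree() are safe. Below is a condensed, hedged sketch of the same chain for a hypothetical struct foo, written in kernel style to mirror the pattern; it is not code from this patch, which overlays the two freeing handles on existing fields instead of a dedicated union.

#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/workqueue.h>

struct foo {
        /* ... payload ... */
        union {
                struct rcu_head rcu_freeing;
                struct work_struct work_freeing;        /* reused after the grace period */
        };
};

static void foo_free_work(struct work_struct *work)
{
        struct foo *f = container_of(work, struct foo, work_freeing);

        /* process context: may sleep, may patch jump labels, may vfree() */
        kfree(f);
}

static void foo_free_rcu(struct rcu_head *head)
{
        struct foo *f = container_of(head, struct foo, rcu_freeing);

        /* RCU callback context: just bounce the rest to a workqueue */
        INIT_WORK(&f->work_freeing, foo_free_work);
        schedule_work(&f->work_freeing);
}

static void foo_put_last(struct foo *f)
{
        /* wait one grace period before the storage is reused */
        call_rcu(&f->rcu_freeing, foo_free_rcu);
}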

 static void mem_cgroup_get(struct mem_cgroup *memcg)
@@ -4887,7 +5993,7 @@ static void __mem_cgroup_put(struct mem_cgroup *memcg, int count)
 {
 if (atomic_sub_and_test(count, &memcg->refcnt)) {
 struct mem_cgroup *parent = parent_mem_cgroup(memcg);
- __mem_cgroup_free(memcg);
+ call_rcu(&memcg->rcu_freeing, free_rcu);
 if (parent)
 mem_cgroup_put(parent);
 }
@@ -4994,6 +6100,8 @@ mem_cgroup_css_alloc(struct cgroup *cont)
 if (parent && parent->use_hierarchy) {
 res_counter_init(&memcg->res, &parent->res);
 res_counter_init(&memcg->memsw, &parent->memsw);
+ res_counter_init(&memcg->kmem, &parent->kmem);
+
 /*
 * We increment refcnt of the parent to ensure that we can
 * safely access it on res_counter_charge/uncharge.
@@ -5004,6 +6112,7 @@ mem_cgroup_css_alloc(struct cgroup *cont)
 } else {
 res_counter_init(&memcg->res, NULL);
 res_counter_init(&memcg->memsw, NULL);
+ res_counter_init(&memcg->kmem, NULL);
 /*
 * Deeper hierachy with use_hierarchy == false doesn't make
 * much sense so let cgroup subsystem know about this
@@ -5043,6 +6152,7 @@ static void mem_cgroup_css_offline(struct cgroup *cont)
 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);

 mem_cgroup_reparent_charges(memcg);
+ mem_cgroup_destroy_all_caches(memcg);
 }

static void mem_cgroup_css_free(struct cgroup *cont)
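The css_alloc hunks above chain each new group's kmem res_counter to its parent's whenever use_hierarchy is set, which is what makes a child's kernel-memory charges count against every ancestor's limit as well. Below is a hedged, single-threaded toy sketch of that parent-linked counter idea; the real res_counter additionally takes a spinlock and reports which counter in the chain failed.

#include <limits.h>
#include <stdbool.h>
#include <stdio.h>

struct counter {
    unsigned long usage, limit;
    struct counter *parent;             /* NULL for the root */
};

static void counter_init(struct counter *c, struct counter *parent)
{
    c->usage = 0;
    c->limit = ULONG_MAX;               /* RESOURCE_MAX analogue */
    c->parent = parent;
}

/* a charge propagates upwards; roll back if any ancestor is over its limit */
static bool counter_charge(struct counter *c, unsigned long nr)
{
    struct counter *cur, *undo;

    for (cur = c; cur; cur = cur->parent) {
        if (cur->usage + nr > cur->limit)
            goto fail;
        cur->usage += nr;
    }
    return true;
fail:
    for (undo = c; undo != cur; undo = undo->parent)
        undo->usage -= nr;
    return false;
}

int main(void)
{
    struct counter parent, child;

    counter_init(&parent, NULL);
    counter_init(&child, &parent);
    parent.limit = 4096;

    printf("charge child 4096: %d\n", counter_charge(&child, 4096));    /* 1 */
    printf("charge child 1:    %d\n", counter_charge(&child, 1));       /* 0 */
    printf("parent usage: %lu\n", parent.usage);                        /* 4096 */
    return 0;
}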