|
@@ -23,6 +23,11 @@
|
|
|
#include <linux/vm_event_item.h>
|
|
|
#include <linux/hardirq.h>
|
|
|
#include <linux/jump_label.h>
|
|
|
+#include <linux/page_counter.h>
|
|
|
+#include <linux/vmpressure.h>
|
|
|
+#include <linux/eventfd.h>
|
|
|
+#include <linux/mmzone.h>
|
|
|
+#include <linux/writeback.h>
|
|
|
|
|
|
struct mem_cgroup;
|
|
|
struct page;
|
|
@@ -67,12 +72,221 @@ enum mem_cgroup_events_index {
|
|
|
MEMCG_NR_EVENTS,
|
|
|
};
|
|
|
|
|
|
+/*
|
|
|
+ * Per memcg event counter is incremented at every pagein/pageout. With THP,
|
|
|
+ * it will be incremented by the number of pages. This counter is used to
|
|
|
+ * trigger some periodic events. This is straightforward and better
|
|
|
+ * than using jiffies etc. to handle periodic memcg events.
|
|
|
+ */
|
|
|
+enum mem_cgroup_events_target {
|
|
|
+ MEM_CGROUP_TARGET_THRESH,
|
|
|
+ MEM_CGROUP_TARGET_SOFTLIMIT,
|
|
|
+ MEM_CGROUP_TARGET_NUMAINFO,
|
|
|
+ MEM_CGROUP_NTARGETS,
|
|
|
+};
|
|
|
+
|
|
|
+/*
|
|
|
+ * Bits in struct cg_proto.flags
|
|
|
+ */
|
|
|
+enum cg_proto_flags {
|
|
|
+ /* Currently active and new sockets should be assigned to cgroups */
|
|
|
+ MEMCG_SOCK_ACTIVE,
|
|
|
+ /* It was ever activated; we must disarm static keys on destruction */
|
|
|
+ MEMCG_SOCK_ACTIVATED,
|
|
|
+};
|
|
|
+
|
|
|
+struct cg_proto {
|
|
|
+ struct page_counter memory_allocated; /* Current allocated memory. */
|
|
|
+ struct percpu_counter sockets_allocated; /* Current number of sockets. */
|
|
|
+ int memory_pressure;
|
|
|
+ long sysctl_mem[3];
|
|
|
+ unsigned long flags; /* bits from enum cg_proto_flags */
|
|
|
+ /*
|
|
|
+ * memcg field is used to find which memcg we belong to directly.
|
|
|
+ * Each memcg struct can hold more than one cg_proto, so container_of
|
|
|
+ * won't really cut it.
|
|
|
+ *
|
|
|
+ * The elegant solution would be having an inverse function to
|
|
|
+ * proto_cgroup in struct proto, but that means polluting the structure
|
|
|
+ * for everybody, instead of just for memcg users.
|
|
|
+ */
|
|
|
+ struct mem_cgroup *memcg;
|
|
|
+};
|
|
|
+
|
|
|
#ifdef CONFIG_MEMCG
|
|
|
+struct mem_cgroup_stat_cpu {
|
|
|
+ long count[MEM_CGROUP_STAT_NSTATS];
|
|
|
+ unsigned long events[MEMCG_NR_EVENTS];
|
|
|
+ unsigned long nr_page_events;
|
|
|
+ unsigned long targets[MEM_CGROUP_NTARGETS];
|
|
|
+};
|
|
|
+
|
|
|
+struct mem_cgroup_reclaim_iter {
|
|
|
+ struct mem_cgroup *position;
|
|
|
+ /* scan generation, increased every round-trip */
|
|
|
+ unsigned int generation;
|
|
|
+};
|
|
|
+
|
|
|
+/*
|
|
|
+ * per-zone information in memory controller.
|
|
|
+ */
|
|
|
+struct mem_cgroup_per_zone {
|
|
|
+ struct lruvec lruvec;
|
|
|
+ unsigned long lru_size[NR_LRU_LISTS];
|
|
|
+
|
|
|
+ struct mem_cgroup_reclaim_iter iter[DEF_PRIORITY + 1];
|
|
|
+
|
|
|
+ struct rb_node tree_node; /* RB tree node */
|
|
|
+ unsigned long usage_in_excess;/* Set to the value by which */
|
|
|
+ /* the soft limit is exceeded*/
|
|
|
+ bool on_tree;
|
|
|
+ struct mem_cgroup *memcg; /* Back pointer, we cannot */
|
|
|
+ /* use container_of */
|
|
|
+};
|
|
|
+
|
|
|
+struct mem_cgroup_per_node {
|
|
|
+ struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES];
|
|
|
+};
|
|
|
+
|
|
|
+struct mem_cgroup_threshold {
|
|
|
+ struct eventfd_ctx *eventfd;
|
|
|
+ unsigned long threshold;
|
|
|
+};
|
|
|
+
|
|
|
+/* Sized array of thresholds, with a cursor (current_threshold) into it */
|
|
|
+struct mem_cgroup_threshold_ary {
|
|
|
+ /* An array index points to threshold just below or equal to usage. */
|
|
|
+ int current_threshold;
|
|
|
+ /* Size of entries[] */
|
|
|
+ unsigned int size;
|
|
|
+ /* Array of thresholds */
|
|
|
+ struct mem_cgroup_threshold entries[0];
|
|
|
+};
|
|
|
+
|
|
|
+struct mem_cgroup_thresholds {
|
|
|
+ /* Primary thresholds array */
|
|
|
+ struct mem_cgroup_threshold_ary *primary;
|
|
|
+ /*
|
|
|
+ * Spare threshold array.
|
|
|
+ * This is needed to make mem_cgroup_unregister_event() "never fail".
|
|
|
+ * It must be able to store at least primary->size - 1 entries.
|
|
|
+ */
|
|
|
+ struct mem_cgroup_threshold_ary *spare;
|
|
|
+};
|
|
|
+
|
|
|
+/*
|
|
|
+ * The memory controller data structure. The memory controller controls both
|
|
|
+ * page cache and RSS per cgroup. We would eventually like to provide
|
|
|
+ * statistics based on the statistics developed by Rik Van Riel for clock-pro,
|
|
|
+ * to help the administrator determine what knobs to tune.
|
|
|
+ */
|
|
|
+struct mem_cgroup {
|
|
|
+ struct cgroup_subsys_state css;
|
|
|
+
|
|
|
+ /* Accounted resources */
|
|
|
+ struct page_counter memory;
|
|
|
+ struct page_counter memsw;
|
|
|
+ struct page_counter kmem;
|
|
|
+
|
|
|
+ /* Normal memory consumption range */
|
|
|
+ unsigned long low;
|
|
|
+ unsigned long high;
|
|
|
+
|
|
|
+ unsigned long soft_limit;
|
|
|
+
|
|
|
+ /* vmpressure notifications */
|
|
|
+ struct vmpressure vmpressure;
|
|
|
+
|
|
|
+ /* css_online() has been completed */
|
|
|
+ int initialized;
|
|
|
+
|
|
|
+ /*
|
|
|
+ * Should the accounting and control be hierarchical, per subtree?
|
|
|
+ */
|
|
|
+ bool use_hierarchy;
|
|
|
+
|
|
|
+ /* protected by memcg_oom_lock */
|
|
|
+ bool oom_lock;
|
|
|
+ int under_oom;
|
|
|
+
|
|
|
+ int swappiness;
|
|
|
+ /* OOM-Killer disable */
|
|
|
+ int oom_kill_disable;
|
|
|
+
|
|
|
+ /* protect arrays of thresholds */
|
|
|
+ struct mutex thresholds_lock;
|
|
|
+
|
|
|
+ /* thresholds for memory usage. RCU-protected */
|
|
|
+ struct mem_cgroup_thresholds thresholds;
|
|
|
+
|
|
|
+ /* thresholds for mem+swap usage. RCU-protected */
|
|
|
+ struct mem_cgroup_thresholds memsw_thresholds;
|
|
|
+
|
|
|
+ /* For oom notifier event fd */
|
|
|
+ struct list_head oom_notify;
|
|
|
+
|
|
|
+ /*
|
|
|
+ * Should we move charges of a task when a task is moved into this
|
|
|
+ * mem_cgroup ? And what type of charges should we move ?
|
|
|
+ */
|
|
|
+ unsigned long move_charge_at_immigrate;
|
|
|
+ /*
|
|
|
+ * set > 0 if pages under this cgroup are moving to another cgroup.
|
|
|
+ */
|
|
|
+ atomic_t moving_account;
|
|
|
+ /* taken only while moving_account > 0 */
|
|
|
+ spinlock_t move_lock;
|
|
|
+ struct task_struct *move_lock_task;
|
|
|
+ unsigned long move_lock_flags;
|
|
|
+ /*
|
|
|
+ * percpu counter.
|
|
|
+ */
|
|
|
+ struct mem_cgroup_stat_cpu __percpu *stat;
|
|
|
+ spinlock_t pcp_counter_lock;
|
|
|
+
|
|
|
+#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET)
|
|
|
+ struct cg_proto tcp_mem;
|
|
|
+#endif
|
|
|
+#if defined(CONFIG_MEMCG_KMEM)
|
|
|
+ /* Index in the kmem_cache->memcg_params.memcg_caches array */
|
|
|
+ int kmemcg_id;
|
|
|
+ bool kmem_acct_activated;
|
|
|
+ bool kmem_acct_active;
|
|
|
+#endif
|
|
|
+
|
|
|
+ int last_scanned_node;
|
|
|
+#if MAX_NUMNODES > 1
|
|
|
+ nodemask_t scan_nodes;
|
|
|
+ atomic_t numainfo_events;
|
|
|
+ atomic_t numainfo_updating;
|
|
|
+#endif
|
|
|
+
|
|
|
+#ifdef CONFIG_CGROUP_WRITEBACK
|
|
|
+ struct list_head cgwb_list;
|
|
|
+ struct wb_domain cgwb_domain;
|
|
|
+#endif
|
|
|
+
|
|
|
+ /* List of events which userspace wants to receive */
|
|
|
+ struct list_head event_list;
|
|
|
+ spinlock_t event_list_lock;
|
|
|
+
|
|
|
+ struct mem_cgroup_per_node *nodeinfo[0];
|
|
|
+ /* WARNING: nodeinfo must be the last member here */
|
|
|
+};
|
|
|
extern struct cgroup_subsys_state *mem_cgroup_root_css;
|
|
|
|
|
|
-void mem_cgroup_events(struct mem_cgroup *memcg,
|
|
|
+/**
|
|
|
+ * mem_cgroup_events - count memory events against a cgroup
|
|
|
+ * @memcg: the memory cgroup
|
|
|
+ * @idx: the event index
|
|
|
+ * @nr: the number of events to account for
|
|
|
+ */
|
|
|
+static inline void mem_cgroup_events(struct mem_cgroup *memcg,
|
|
|
enum mem_cgroup_events_index idx,
|
|
|
- unsigned int nr);
|
|
|
+ unsigned int nr)
|
|
|
+{
|
|
|
+ this_cpu_add(memcg->stat->events[idx], nr);
|
|
|
+}
|
|
|
|
|
|
bool mem_cgroup_low(struct mem_cgroup *root, struct mem_cgroup *memcg);
|
|
|
|
|
@@ -90,15 +304,31 @@ void mem_cgroup_migrate(struct page *oldpage, struct page *newpage,
|
|
|
struct lruvec *mem_cgroup_zone_lruvec(struct zone *, struct mem_cgroup *);
|
|
|
struct lruvec *mem_cgroup_page_lruvec(struct page *, struct zone *);
|
|
|
|
|
|
-bool mem_cgroup_is_descendant(struct mem_cgroup *memcg,
|
|
|
- struct mem_cgroup *root);
|
|
|
bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg);
|
|
|
|
|
|
extern struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page);
|
|
|
extern struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p);
|
|
|
|
|
|
extern struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg);
|
|
|
-extern struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *css);
|
|
|
+static inline
|
|
|
+struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *css){
|
|
|
+ return css ? container_of(css, struct mem_cgroup, css) : NULL;
|
|
|
+}
|
|
|
+
|
|
|
+struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *,
|
|
|
+ struct mem_cgroup *,
|
|
|
+ struct mem_cgroup_reclaim_cookie *);
|
|
|
+void mem_cgroup_iter_break(struct mem_cgroup *, struct mem_cgroup *);
|
|
|
+
|
|
|
+static inline bool mem_cgroup_is_descendant(struct mem_cgroup *memcg,
|
|
|
+ struct mem_cgroup *root)
|
|
|
+{
|
|
|
+ if (root == memcg)
|
|
|
+ return true;
|
|
|
+ if (!root->use_hierarchy)
|
|
|
+ return false;
|
|
|
+ return cgroup_is_descendant(memcg->css.cgroup, root->css.cgroup);
|
|
|
+}
|
|
|
|
|
|
static inline bool mm_match_cgroup(struct mm_struct *mm,
|
|
|
struct mem_cgroup *memcg)
|
|
@@ -114,22 +344,65 @@ static inline bool mm_match_cgroup(struct mm_struct *mm,
|
|
|
return match;
|
|
|
}
|
|
|
|
|
|
-extern struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *memcg);
|
|
|
extern struct cgroup_subsys_state *mem_cgroup_css_from_page(struct page *page);
|
|
|
|
|
|
-struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *,
|
|
|
- struct mem_cgroup *,
|
|
|
- struct mem_cgroup_reclaim_cookie *);
|
|
|
-void mem_cgroup_iter_break(struct mem_cgroup *, struct mem_cgroup *);
|
|
|
+static inline bool mem_cgroup_disabled(void)
|
|
|
+{
|
|
|
+ if (memory_cgrp_subsys.disabled)
|
|
|
+ return true;
|
|
|
+ return false;
|
|
|
+}
|
|
|
|
|
|
/*
|
|
|
* For memory reclaim.
|
|
|
*/
|
|
|
-int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec);
|
|
|
-bool mem_cgroup_lruvec_online(struct lruvec *lruvec);
|
|
|
int mem_cgroup_select_victim_node(struct mem_cgroup *memcg);
|
|
|
-unsigned long mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list);
|
|
|
-void mem_cgroup_update_lru_size(struct lruvec *, enum lru_list, int);
|
|
|
+
|
|
|
+void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
|
|
|
+ int nr_pages);
|
|
|
+
|
|
|
+static inline bool mem_cgroup_lruvec_online(struct lruvec *lruvec)
|
|
|
+{
|
|
|
+ struct mem_cgroup_per_zone *mz;
|
|
|
+ struct mem_cgroup *memcg;
|
|
|
+
|
|
|
+ if (mem_cgroup_disabled())
|
|
|
+ return true;
|
|
|
+
|
|
|
+ mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec);
|
|
|
+ memcg = mz->memcg;
|
|
|
+
|
|
|
+ return !!(memcg->css.flags & CSS_ONLINE);
|
|
|
+}
|
|
|
+
|
|
|
+static inline
|
|
|
+unsigned long mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru)
|
|
|
+{
|
|
|
+ struct mem_cgroup_per_zone *mz;
|
|
|
+
|
|
|
+ mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec);
|
|
|
+ return mz->lru_size[lru];
|
|
|
+}
|
|
|
+
|
|
|
+static inline int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec)
|
|
|
+{
|
|
|
+ unsigned long inactive_ratio;
|
|
|
+ unsigned long inactive;
|
|
|
+ unsigned long active;
|
|
|
+ unsigned long gb;
|
|
|
+
|
|
|
+ inactive = mem_cgroup_get_lru_size(lruvec, LRU_INACTIVE_ANON);
|
|
|
+ active = mem_cgroup_get_lru_size(lruvec, LRU_ACTIVE_ANON);
|
|
|
+
|
|
|
+ gb = (inactive + active) >> (30 - PAGE_SHIFT);
|
|
|
+ if (gb)
|
|
|
+ inactive_ratio = int_sqrt(10 * gb);
|
|
|
+ else
|
|
|
+ inactive_ratio = 1;
|
|
|
+
|
|
|
+ return inactive * inactive_ratio < active;
|
|
|
+}
|
|
|
+
|
|
|
extern void mem_cgroup_print_oom_info(struct mem_cgroup *memcg,
|
|
|
struct task_struct *p);
|
|
|
|
|
@@ -156,18 +429,26 @@ bool mem_cgroup_oom_synchronize(bool wait);
|
|
|
extern int do_swap_account;
|
|
|
#endif
|
|
|
|
|
|
-static inline bool mem_cgroup_disabled(void)
|
|
|
-{
|
|
|
- if (memory_cgrp_subsys.disabled)
|
|
|
- return true;
|
|
|
- return false;
|
|
|
-}
|
|
|
-
|
|
|
struct mem_cgroup *mem_cgroup_begin_page_stat(struct page *page);
|
|
|
-void mem_cgroup_update_page_stat(struct mem_cgroup *memcg,
|
|
|
- enum mem_cgroup_stat_index idx, int val);
|
|
|
void mem_cgroup_end_page_stat(struct mem_cgroup *memcg);
|
|
|
|
|
|
+/**
|
|
|
+ * mem_cgroup_update_page_stat - update page state statistics
|
|
|
+ * @memcg: memcg to account against
|
|
|
+ * @idx: page state item to account
|
|
|
+ * @val: number of pages (positive or negative)
|
|
|
+ *
|
|
|
+ * See mem_cgroup_begin_page_stat() for locking requirements.
|
|
|
+ */
|
|
|
+static inline void mem_cgroup_update_page_stat(struct mem_cgroup *memcg,
|
|
|
+ enum mem_cgroup_stat_index idx, int val)
|
|
|
+{
|
|
|
+ VM_BUG_ON(!rcu_read_lock_held());
|
|
|
+
|
|
|
+ if (memcg)
|
|
|
+ this_cpu_add(memcg->stat->count[idx], val);
|
|
|
+}
|
|
|
+
|
|
|
static inline void mem_cgroup_inc_page_stat(struct mem_cgroup *memcg,
|
|
|
enum mem_cgroup_stat_index idx)
|
|
|
{
|
|
@@ -184,13 +465,31 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
|
|
|
gfp_t gfp_mask,
|
|
|
unsigned long *total_scanned);
|
|
|
|
|
|
-void __mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx);
|
|
|
static inline void mem_cgroup_count_vm_event(struct mm_struct *mm,
|
|
|
enum vm_event_item idx)
|
|
|
{
|
|
|
+ struct mem_cgroup *memcg;
|
|
|
+
|
|
|
if (mem_cgroup_disabled())
|
|
|
return;
|
|
|
- __mem_cgroup_count_vm_event(mm, idx);
|
|
|
+
|
|
|
+ rcu_read_lock();
|
|
|
+ memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
|
|
|
+ if (unlikely(!memcg))
|
|
|
+ goto out;
|
|
|
+
|
|
|
+ switch (idx) {
|
|
|
+ case PGFAULT:
|
|
|
+ this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGFAULT]);
|
|
|
+ break;
|
|
|
+ case PGMAJFAULT:
|
|
|
+ this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGMAJFAULT]);
|
|
|
+ break;
|
|
|
+ default:
|
|
|
+ BUG();
|
|
|
+ }
|
|
|
+out:
|
|
|
+ rcu_read_unlock();
|
|
|
}
|
|
|
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
|
|
|
void mem_cgroup_split_huge_fixup(struct page *head);
|
|
@@ -275,12 +574,6 @@ static inline bool task_in_mem_cgroup(struct task_struct *task,
|
|
|
return true;
|
|
|
}
|
|
|
|
|
|
-static inline struct cgroup_subsys_state
|
|
|
- *mem_cgroup_css(struct mem_cgroup *memcg)
|
|
|
-{
|
|
|
- return NULL;
|
|
|
-}
|
|
|
-
|
|
|
static inline struct mem_cgroup *
|
|
|
mem_cgroup_iter(struct mem_cgroup *root,
|
|
|
struct mem_cgroup *prev,
|
|
@@ -444,7 +737,10 @@ static inline bool memcg_kmem_enabled(void)
|
|
|
return static_key_false(&memcg_kmem_enabled_key);
|
|
|
}
|
|
|
|
|
|
-bool memcg_kmem_is_active(struct mem_cgroup *memcg);
|
|
|
+static inline bool memcg_kmem_is_active(struct mem_cgroup *memcg)
|
|
|
+{
|
|
|
+ return memcg->kmem_acct_active;
|
|
|
+}
|
|
|
|
|
|
/*
|
|
|
* In general, we'll do everything in our power to not incur in any overhead
|
|
@@ -463,7 +759,15 @@ void __memcg_kmem_commit_charge(struct page *page,
|
|
|
struct mem_cgroup *memcg, int order);
|
|
|
void __memcg_kmem_uncharge_pages(struct page *page, int order);
|
|
|
|
|
|
-int memcg_cache_id(struct mem_cgroup *memcg);
|
|
|
+/*
|
|
|
+ * Helper for accessing a memcg's index. It will be used as an index in the
|
|
|
+ * child cache array in kmem_cache, and also to derive its name. This function
|
|
|
+ * will return -1 for a NULL memcg or when this is not a kmem-limited memcg.
|
|
|
+ */
|
|
|
+static inline int memcg_cache_id(struct mem_cgroup *memcg)
|
|
|
+{
|
|
|
+ return memcg ? memcg->kmemcg_id : -1;
|
|
|
+}
|
|
|
|
|
|
struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep);
|
|
|
void __memcg_kmem_put_cache(struct kmem_cache *cachep);
|