12 years ago · d206e09036
--- a/Documentation/cgroups/00-INDEX
+++ b/Documentation/cgroups/00-INDEX
@@ -1,7 +1,11 @@
 
															 00-INDEX
														
 
															 	- this file
														
 
															+blkio-controller.txt
														
 
															+	- Description for Block IO Controller, implementation and usage details.
														
 
															 cgroups.txt
														
 
															 	- Control Groups definition, implementation details, examples and API.
														
 
															+cgroup_event_listener.c
														
 
															+	- A user program for cgroup listener.
														
 
															 cpuacct.txt
														
 
															 	- CPU Accounting Controller; account CPU usage for groups of tasks.
														
 
															 cpusets.txt
														
@@ -10,9 +14,13 @@ devices.txt
 
															 	- Device Whitelist Controller; description, interface and security.
														
 
															 freezer-subsystem.txt
														
 
															 	- checkpointing; rationale to not use signals, interface.
														
 
															+hugetlb.txt
														
 
															+	- HugeTLB Controller implementation and usage details.
														
 
															 memcg_test.txt
														
 
															 	- Memory Resource Controller; implementation details.
														
 
															 memory.txt
														
 
															 	- Memory Resource Controller; design, accounting, interface, testing.
														
 
															+net_prio.txt
														
 
															+	- Network priority cgroups details and usages.
														
 
															 resource_counter.txt
														
 
															 	- Resource Counter API.
														
--- a/Documentation/cgroups/cgroups.txt
+++ b/Documentation/cgroups/cgroups.txt
@@ -299,11 +299,9 @@ a cgroup hierarchy's release_agent path is empty.
 
															 1.5 What does clone_children do ?
														
 
															 ---------------------------------
														
 
															-If the clone_children flag is enabled (1) in a cgroup, then all
														
 
															-cgroups created beneath will call the post_clone callbacks for each
														
 
															-subsystem of the newly created cgroup. Usually when this callback is
														
 
															-implemented for a subsystem, it copies the values of the parent
														
 
															-subsystem, this is the case for the cpuset.
														
 
															+This flag only affects the cpuset controller. If the clone_children
														
 
															+flag is enabled (1) in a cgroup, a new cpuset cgroup will copy its
														
 
															+configuration from the parent during initialization.
														
 
															 1.6 How do I use cgroups ?
														
 
															 --------------------------
														
@@ -553,16 +551,16 @@ call to cgroup_unload_subsys(). It should also set its_subsys.module =
 
															 THIS_MODULE in its .c file.
														
 
															 Each subsystem may export the following methods. The only mandatory
														
 
															-methods are create/destroy. Any others that are null are presumed to
														
 
															+methods are css_alloc/free. Any others that are null are presumed to
														
 
															 be successful no-ops.
														
 
															-struct cgroup_subsys_state *create(struct cgroup *cgrp)
														
 
															+struct cgroup_subsys_state *css_alloc(struct cgroup *cgrp)
														
 
															 (cgroup_mutex held by caller)
														
 
															-Called to create a subsystem state object for a cgroup. The
														
 
															+Called to allocate a subsystem state object for a cgroup. The
														
 
															 subsystem should allocate its subsystem state object for the passed
														
 
															 cgroup, returning a pointer to the new object on success or a
														
 
															-negative error code. On success, the subsystem pointer should point to
														
 
															+ERR_PTR() value. On success, the subsystem pointer should point to
														
 
															 a structure of type cgroup_subsys_state (typically embedded in a
														
 
															 larger subsystem-specific object), which will be initialized by the
														
 
															 cgroup system. Note that this will be called at initialization to
														
@@ -571,24 +569,33 @@ identified by the passed cgroup object having a NULL parent (since
 
															 it's the root of the hierarchy) and may be an appropriate place for
														
 
															 initialization code.
														
 
															-void destroy(struct cgroup *cgrp)
														
 
															+int css_online(struct cgroup *cgrp)
														
 
															 (cgroup_mutex held by caller)
														
 
															-The cgroup system is about to destroy the passed cgroup; the subsystem
														
 
															-should do any necessary cleanup and free its subsystem state
														
 
															-object. By the time this method is called, the cgroup has already been
														
 
															-unlinked from the file system and from the child list of its parent;
														
 
															-cgroup->parent is still valid. (Note - can also be called for a
														
 
															-newly-created cgroup if an error occurs after this subsystem's
														
 
															-create() method has been called for the new cgroup).
														
 
															+Called after @cgrp has successfully completed all allocations and been made
														
 
															+visible to cgroup_for_each_child/descendant_*() iterators. The
														
 
															+subsystem may choose to fail creation by returning -errno. This
														
 
															+callback can be used to implement reliable state sharing and
														
 
															+propagation along the hierarchy. See the comment on
														
 
															+cgroup_for_each_descendant_pre() for details.
														
 
															-int pre_destroy(struct cgroup *cgrp);
														
 
															+void css_offline(struct cgroup *cgrp);
														
 
															-Called before checking the reference count on each subsystem. This may
														
 
															-be useful for subsystems which have some extra references even if
														
 
															-there are not tasks in the cgroup. If pre_destroy() returns error code,
														
 
															-rmdir() will fail with it. From this behavior, pre_destroy() can be
														
 
															-called multiple times against a cgroup.
														
 
															+This is the counterpart of css_online() and is called iff css_online()
														
 
															+has succeeded on @cgrp. This signifies the beginning of the end of
														
 
															+@cgrp. @cgrp is being removed and the subsystem should start dropping
														
 
															+all references it's holding on @cgrp. When all references are dropped,
														
 
															+cgroup removal will proceed to the next step - css_free(). After this
														
 
															+callback, @cgrp should be considered dead to the subsystem.
														
 
															+
														
 
															+void css_free(struct cgroup *cgrp)
														
 
															+(cgroup_mutex held by caller)
														
 
															+
														
 
															+The cgroup system is about to free @cgrp; the subsystem should free
														
 
															+its subsystem state object. By the time this method is called, @cgrp
														
 
															+is completely unused; @cgrp->parent is still valid. (Note - can also
														
 
															+be called for a newly-created cgroup if an error occurs after this
														
 
															+subsystem's css_alloc() method has been called for the new cgroup).
														
 
															 int can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
														
 
															 (cgroup_mutex held by caller)
														
@@ -635,14 +642,6 @@ void exit(struct task_struct *task)
 
															 Called during task exit.
														
 
															-void post_clone(struct cgroup *cgrp)
														
 
															-(cgroup_mutex held by caller)
														
 
															-
														
 
															-Called during cgroup_create() to do any parameter
														
 
															-initialization which might be required before a task could attach.  For
														
 
															-example, in cpusets, no task may attach before 'cpus' and 'mems' are set
														
 
															-up.
														
 
															-
														
 
															 void bind(struct cgroup *root)
														
 
															 (cgroup_mutex held by caller)
														
--- a/Documentation/cgroups/freezer-subsystem.txt
+++ b/Documentation/cgroups/freezer-subsystem.txt
@@ -49,13 +49,49 @@ prevent the freeze/unfreeze cycle from becoming visible to the tasks
 
															 being frozen. This allows the bash example above and gdb to run as
														
 
															 expected.
														
 
															-The freezer subsystem in the container filesystem defines a file named
														
 
															-freezer.state. Writing "FROZEN" to the state file will freeze all tasks in the
														
 
															-cgroup. Subsequently writing "THAWED" will unfreeze the tasks in the cgroup.
														
 
															-Reading will return the current state.
														
 
															+The cgroup freezer is hierarchical. Freezing a cgroup freezes all
														
 
															+tasks belonging to the cgroup and all its descendant cgroups. Each
														
 
															+cgroup has its own state (self-state) and the state inherited from the
														
 
															+parent (parent-state). Iff both states are THAWED, the cgroup is
														
 
															+THAWED.
														
 
															-Note freezer.state doesn't exist in root cgroup, which means root cgroup
														
 
															-is non-freezable.
														
 
															+The following cgroupfs files are created by cgroup freezer.
														
 
															+
														
 
															+* freezer.state: Read-write.
														
 
															+
														
 
															+  When read, returns the effective state of the cgroup - "THAWED",
														
 
															+  "FREEZING" or "FROZEN". This is the combined self and parent-states.
														
 
															+  If any is freezing, the cgroup is freezing (FREEZING or FROZEN).
														
 
															+
														
 
															+  FREEZING cgroup transitions into FROZEN state when all tasks
														
 
															+  belonging to the cgroup and its descendants become frozen. Note that
														
 
															+  a cgroup reverts to FREEZING from FROZEN after a new task is added
														
 
															+  to the cgroup or one of its descendant cgroups until the new task is
														
 
															+  frozen.
														
 
															+
														
 
															+  When written, sets the self-state of the cgroup. Two values are
														
 
															+  allowed - "FROZEN" and "THAWED". If FROZEN is written, the cgroup,
														
 
															+  if not already freezing, enters FREEZING state along with all its
														
 
															+  descendant cgroups.
														
 
															+
														
 
															+  If THAWED is written, the self-state of the cgroup is changed to
														
 
															+  THAWED.  Note that the effective state may not change to THAWED if
														
 
															+  the parent-state is still freezing. If a cgroup's effective state
														
 
															+  becomes THAWED, all its descendants which are freezing because of
														
 
															+  the cgroup also leave the freezing state.
														
 
															+
														
 
															+* freezer.self_freezing: Read only.
														
 
															+
														
 
															+  Shows the self-state. 0 if the self-state is THAWED; otherwise, 1.
														
 
															+  This value is 1 iff the last write to freezer.state was "FROZEN".
														
 
															+
														
 
															+* freezer.parent_freezing: Read only.
														
 
															+
														
 
															+  Shows the parent-state.  0 if none of the cgroup's ancestors is
														
 
															+  frozen; otherwise, 1.
														
 
															+
														
 
															+The root cgroup is non-freezable and the above interface files don't
														
 
															+exist.
														
 
															 * Examples of usage :
														
@@ -85,18 +121,3 @@ to unfreeze all tasks in the container :
 
															 This is the basic mechanism which should do the right thing for user space task
														
 
															 in a simple scenario.
														
 
															-
														
 
															-It's important to note that freezing can be incomplete. In that case we return
														
 
															-EBUSY. This means that some tasks in the cgroup are busy doing something that
														
 
															-prevents us from completely freezing the cgroup at this time. After EBUSY,
														
 
															-the cgroup will remain partially frozen -- reflected by freezer.state reporting
														
 
															-"FREEZING" when read. The state will remain "FREEZING" until one of these
														
 
															-things happens:
														
 
															-
														
 
															-	1) Userspace cancels the freezing operation by writing "THAWED" to
														
 
															-		the freezer.state file
														
 
															-	2) Userspace retries the freezing operation by writing "FROZEN" to
														
 
															-		the freezer.state file (writing "FREEZING" is not legal
														
 
															-		and returns EINVAL)
														
 
															-	3) The tasks that blocked the cgroup from entering the "FROZEN"
														
 
															-		state disappear from the cgroup's set of tasks.
														
--- a/Documentation/cgroups/net_prio.txt
+++ b/Documentation/cgroups/net_prio.txt
@@ -51,3 +51,5 @@ One usage for the net_prio cgroup is with mqprio qdisc allowing application
 
															 traffic to be steered to hardware/driver based traffic classes. These mappings
														
 
															 can then be managed by administrators or other networking protocols such as
														
 
															 DCBX.
														
 
															+
														
 
															+A new net_prio cgroup inherits the parent's configuration.
														
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -600,7 +600,7 @@ struct cftype blkcg_files[] = {
 
															 };
														
 
															 /**
														
 
															- * blkcg_pre_destroy - cgroup pre_destroy callback
														
 
															+ * blkcg_css_offline - cgroup css_offline callback
														
 
															  * @cgroup: cgroup of interest
														
 
															  *
														
 
															  * This function is called when @cgroup is about to go away and responsible
														
@@ -610,7 +610,7 @@ struct cftype blkcg_files[] = {
 
															  *
														
 
															  * This is the blkcg counterpart of ioc_release_fn().
														
 
															  */
														
 
															-static int blkcg_pre_destroy(struct cgroup *cgroup)
														
 
															+static void blkcg_css_offline(struct cgroup *cgroup)
														
 
															 {
														
 
															 	struct blkcg *blkcg = cgroup_to_blkcg(cgroup);
														
@@ -632,10 +632,9 @@ static int blkcg_pre_destroy(struct cgroup *cgroup)
 
															 	}
														
 
															 	spin_unlock_irq(&blkcg->lock);
														
 
															-	return 0;
														
 
															 }
														
 
															-static void blkcg_destroy(struct cgroup *cgroup)
														
 
															+static void blkcg_css_free(struct cgroup *cgroup)
														
 
															 {
														
 
															 	struct blkcg *blkcg = cgroup_to_blkcg(cgroup);
														
@@ -643,7 +642,7 @@ static void blkcg_destroy(struct cgroup *cgroup)
 
															 		kfree(blkcg);
														
 
															 }
														
 
															-static struct cgroup_subsys_state *blkcg_create(struct cgroup *cgroup)
														
 
															+static struct cgroup_subsys_state *blkcg_css_alloc(struct cgroup *cgroup)
														
 
															 {
														
 
															 	static atomic64_t id_seq = ATOMIC64_INIT(0);
														
 
															 	struct blkcg *blkcg;
														
@@ -740,10 +739,10 @@ static int blkcg_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
 
															 struct cgroup_subsys blkio_subsys = {
														
 
															 	.name = "blkio",
														
 
															-	.create = blkcg_create,
														
 
															+	.css_alloc = blkcg_css_alloc,
														
 
															+	.css_offline = blkcg_css_offline,
														
 
															+	.css_free = blkcg_css_free,
														
 
															 	.can_attach = blkcg_can_attach,
														
 
															-	.pre_destroy = blkcg_pre_destroy,
														
 
															-	.destroy = blkcg_destroy,
														
 
															 	.subsys_id = blkio_subsys_id,
														
 
															 	.base_cftypes = blkcg_files,
														
 
															 	.module = THIS_MODULE,
														
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -12,6 +12,7 @@
 
															 #include <linux/cpumask.h>
														
 
															 #include <linux/nodemask.h>
														
 
															 #include <linux/rcupdate.h>
														
 
															+#include <linux/rculist.h>
														
 
															 #include <linux/cgroupstats.h>
														
 
															 #include <linux/prio_heap.h>
														
 
															 #include <linux/rwsem.h>
														
@@ -34,7 +35,6 @@ extern int cgroup_lock_is_held(void);
 
															 extern bool cgroup_lock_live_group(struct cgroup *cgrp);
														
 
															 extern void cgroup_unlock(void);
														
 
															 extern void cgroup_fork(struct task_struct *p);
														
 
															-extern void cgroup_fork_callbacks(struct task_struct *p);
														
 
															 extern void cgroup_post_fork(struct task_struct *p);
														
 
															 extern void cgroup_exit(struct task_struct *p, int run_callbacks);
														
 
															 extern int cgroupstats_build(struct cgroupstats *stats,
														
@@ -66,7 +66,7 @@ struct cgroup_subsys_state {
 
															 	/*
														
 
															 	 * State maintained by the cgroup system to allow subsystems
														
 
															 	 * to be "busy". Should be accessed via css_get(),
														
 
															-	 * css_tryget() and and css_put().
														
 
															+	 * css_tryget() and css_put().
														
 
															 	 */
														
 
															 	atomic_t refcnt;
														
@@ -81,9 +81,8 @@ struct cgroup_subsys_state {
 
															 /* bits in struct cgroup_subsys_state flags field */
														
 
															 enum {
														
 
															-	CSS_ROOT, /* This CSS is the root of the subsystem */
														
 
															-	CSS_REMOVED, /* This CSS is dead */
														
 
															-	CSS_CLEAR_CSS_REFS,		/* @ss->__DEPRECATED_clear_css_refs */
														
 
															+	CSS_ROOT	= (1 << 0), /* this CSS is the root of the subsystem */
														
 
															+	CSS_ONLINE	= (1 << 1), /* between ->css_online() and ->css_offline() */
														
 
															 };
														
 
															 /* Caller must verify that the css is not for root cgroup */
														
@@ -102,15 +101,10 @@ static inline void __css_get(struct cgroup_subsys_state *css, int count)
 
															 static inline void css_get(struct cgroup_subsys_state *css)
														
 
															 {
														
 
															 	/* We don't need to reference count the root state */
														
 
															-	if (!test_bit(CSS_ROOT, &css->flags))
														
 
															+	if (!(css->flags & CSS_ROOT))
														
 
															 		__css_get(css, 1);
														
 
															 }
														
 
															-static inline bool css_is_removed(struct cgroup_subsys_state *css)
														
 
															-{
														
 
															-	return test_bit(CSS_REMOVED, &css->flags);
														
 
															-}
														
 
															-
														
 
															 /*
														
 
															  * Call css_tryget() to take a reference on a css if your existing
														
 
															  * (known-valid) reference isn't already ref-counted. Returns false if
														
@@ -120,7 +114,7 @@ static inline bool css_is_removed(struct cgroup_subsys_state *css)
 
															 extern bool __css_tryget(struct cgroup_subsys_state *css);
														
 
															 static inline bool css_tryget(struct cgroup_subsys_state *css)
														
 
															 {
														
 
															-	if (test_bit(CSS_ROOT, &css->flags))
														
 
															+	if (css->flags & CSS_ROOT)
														
 
															 		return true;
														
 
															 	return __css_tryget(css);
														
 
															 }
														
@@ -133,7 +127,7 @@ static inline bool css_tryget(struct cgroup_subsys_state *css)
 
															 extern void __css_put(struct cgroup_subsys_state *css);
														
 
															 static inline void css_put(struct cgroup_subsys_state *css)
														
 
															 {
														
 
															-	if (!test_bit(CSS_ROOT, &css->flags))
														
 
															+	if (!(css->flags & CSS_ROOT))
														
 
															 		__css_put(css);
														
 
															 }
														
@@ -149,13 +143,11 @@ enum {
 
															 	/* Control Group requires release notifications to userspace */
														
 
															 	CGRP_NOTIFY_ON_RELEASE,
														
 
															 	/*
														
 
															-	 * A thread in rmdir() is wating for this cgroup.
														
 
															-	 */
														
 
															-	CGRP_WAIT_ON_RMDIR,
														
 
															-	/*
														
 
															-	 * Clone cgroup values when creating a new child cgroup
														
 
															+	 * Clone the parent's configuration when creating a new child
														
 
															+	 * cpuset cgroup.  For historical reasons, this option can be
														
 
															+	 * specified at mount time and thus is implemented here.
														
 
															 	 */
														
 
															-	CGRP_CLONE_CHILDREN,
														
 
															+	CGRP_CPUSET_CLONE_CHILDREN,
														
 
															 };
														
 
															 struct cgroup {
														
@@ -167,6 +159,8 @@ struct cgroup {
 
															 	 */
														
 
															 	atomic_t count;
														
 
															+	int id;				/* ida allocated in-hierarchy ID */
														
 
															+
														
 
															 	/*
														
 
															 	 * We link our 'sibling' struct into our parent's 'children'.
														
 
															 	 * Our children link their 'sibling' into our 'children'.
														
@@ -176,7 +170,7 @@ struct cgroup {
 
															 	struct list_head files;		/* my files */
														
 
															 	struct cgroup *parent;		/* my parent */
														
 
															-	struct dentry __rcu *dentry;	/* cgroup fs entry, RCU protected */
														
 
															+	struct dentry *dentry;		/* cgroup fs entry, RCU protected */
														
 
															 	/* Private pointers for each registered subsystem */
														
 
															 	struct cgroup_subsys_state *subsys[CGROUP_SUBSYS_COUNT];
														
@@ -282,7 +276,7 @@ struct cgroup_map_cb {
 
															 /* cftype->flags */
														
 
															 #define CFTYPE_ONLY_ON_ROOT	(1U << 0)	/* only create on root cg */
														
 
															-#define CFTYPE_NOT_ON_ROOT	(1U << 1)	/* don't create onp root cg */
														
 
															+#define CFTYPE_NOT_ON_ROOT	(1U << 1)	/* don't create on root cg */
														
 
															 #define MAX_CFTYPE_NAME		64
														
@@ -421,23 +415,6 @@ int cgroup_task_count(const struct cgroup *cgrp);
 
															 /* Return true if cgrp is a descendant of the task's cgroup */
														
 
															 int cgroup_is_descendant(const struct cgroup *cgrp, struct task_struct *task);
														
 
															-/*
														
 
															- * When the subsys has to access css and may add permanent refcnt to css,
														
 
															- * it should take care of racy conditions with rmdir(). Following set of
														
 
															- * functions, is for stop/restart rmdir if necessary.
														
 
															- * Because these will call css_get/put, "css" should be alive css.
														
 
															- *
														
 
															- *  cgroup_exclude_rmdir();
														
 
															- *  ...do some jobs which may access arbitrary empty cgroup
														
 
															- *  cgroup_release_and_wakeup_rmdir();
														
 
															- *
														
 
															- *  When someone removes a cgroup while cgroup_exclude_rmdir() holds it,
														
 
															- *  it sleeps and cgroup_release_and_wakeup_rmdir() will wake him up.
														
 
															- */
														
 
															-
														
 
															-void cgroup_exclude_rmdir(struct cgroup_subsys_state *css);
														
 
															-void cgroup_release_and_wakeup_rmdir(struct cgroup_subsys_state *css);
														
 
															-
														
 
															 /*
														
 
															  * Control Group taskset, used to pass around set of tasks to cgroup_subsys
														
 
															  * methods.
														
@@ -466,16 +443,17 @@ int cgroup_taskset_size(struct cgroup_taskset *tset);
 
															  */
														
 
															 struct cgroup_subsys {
														
 
															-	struct cgroup_subsys_state *(*create)(struct cgroup *cgrp);
														
 
															-	int (*pre_destroy)(struct cgroup *cgrp);
														
 
															-	void (*destroy)(struct cgroup *cgrp);
														
 
															+	struct cgroup_subsys_state *(*css_alloc)(struct cgroup *cgrp);
														
 
															+	int (*css_online)(struct cgroup *cgrp);
														
 
															+	void (*css_offline)(struct cgroup *cgrp);
														
 
															+	void (*css_free)(struct cgroup *cgrp);
														
 
															+
														
 
															 	int (*can_attach)(struct cgroup *cgrp, struct cgroup_taskset *tset);
														
 
															 	void (*cancel_attach)(struct cgroup *cgrp, struct cgroup_taskset *tset);
														
 
															 	void (*attach)(struct cgroup *cgrp, struct cgroup_taskset *tset);
														
 
															 	void (*fork)(struct task_struct *task);
														
 
															 	void (*exit)(struct cgroup *cgrp, struct cgroup *old_cgrp,
														
 
															 		     struct task_struct *task);
														
 
															-	void (*post_clone)(struct cgroup *cgrp);
														
 
															 	void (*bind)(struct cgroup *root);
														
 
															 	int subsys_id;
														
@@ -488,17 +466,6 @@ struct cgroup_subsys {
 
															 	 */
														
 
															 	bool use_id;
														
 
															-	/*
														
 
															-	 * If %true, cgroup removal will try to clear css refs by retrying
														
 
															-	 * ss->pre_destroy() until there's no css ref left.  This behavior
														
 
															-	 * is strictly for backward compatibility and will be removed as
														
 
															-	 * soon as the current user (memcg) is updated.
														
 
															-	 *
														
 
															-	 * If %false, ss->pre_destroy() can't fail and cgroup removal won't
														
 
															-	 * wait for css refs to drop to zero before proceeding.
														
 
															-	 */
														
 
															-	bool __DEPRECATED_clear_css_refs;
														
 
															-
														
 
															 	/*
														
 
															 	 * If %false, this subsystem is properly hierarchical -
														
 
															 	 * configuration, resource accounting and restriction on a parent
														
@@ -572,6 +539,100 @@ static inline struct cgroup* task_cgroup(struct task_struct *task,
 
															 	return task_subsys_state(task, subsys_id)->cgroup;
														
 
															 }
														
 
															+/**
														
 
															+ * cgroup_for_each_child - iterate through children of a cgroup
														
 
															+ * @pos: the cgroup * to use as the loop cursor
														
 
															+ * @cgroup: cgroup whose children to walk
														
 
															+ *
														
 
															+ * Walk @cgroup's children.  Must be called under rcu_read_lock().  A child
														
 
															+ * cgroup which hasn't finished ->css_online() or already has finished
														
 
															+ * ->css_offline() may show up during traversal and it's each subsystem's
														
 
															+ * responsibility to verify that each @pos is alive.
														
 
															+ *
														
 
															+ * If a subsystem synchronizes against the parent in its ->css_online() and
														
 
															+ * before starting iterating, a cgroup which finished ->css_online() is
														
 
															+ * guaranteed to be visible in the future iterations.
														
 
															+ */
														
 
															+#define cgroup_for_each_child(pos, cgroup)				\
														
 
															+	list_for_each_entry_rcu(pos, &(cgroup)->children, sibling)
														
 
															+
														
 
															+struct cgroup *cgroup_next_descendant_pre(struct cgroup *pos,
														
 
															+					  struct cgroup *cgroup);
														
 
															+
														
 
															+/**
														
 
															+ * cgroup_for_each_descendant_pre - pre-order walk of a cgroup's descendants
														
 
															+ * @pos: the cgroup * to use as the loop cursor
														
 
															+ * @cgroup: cgroup whose descendants to walk
														
 
															+ *
														
 
															+ * Walk @cgroup's descendants.  Must be called under rcu_read_lock().  A
														
 
															+ * descendant cgroup which hasn't finished ->css_online() or already has
														
 
															+ * finished ->css_offline() may show up during traversal and it's each
														
 
															+ * subsystem's responsibility to verify that each @pos is alive.
														
 
															+ *
														
 
															+ * If a subsystem synchronizes against the parent in its ->css_online() and
														
 
															+ * before starting iterating, and synchronizes against @pos on each
														
 
															+ * iteration, any descendant cgroup which finished ->css_online() is
														
 
															+ * guaranteed to be visible in the future iterations.
														
 
															+ *
														
 
															+ * In other words, the following guarantees that a descendant can't escape
														
 
															+ * state updates of its ancestors.
														
 
															+ *
														
 
															+ * my_online(@cgrp)
														
 
															+ * {
														
 
															+ *	Lock @cgrp->parent and @cgrp;
														
 
															+ *	Inherit state from @cgrp->parent;
														
 
															+ *	Unlock both.
														
 
															+ * }
														
 
															+ *
														
 
															+ * my_update_state(@cgrp)
														
 
															+ * {
														
 
															+ *	Lock @cgrp;
														
 
															+ *	Update @cgrp's state;
														
 
															+ *	Unlock @cgrp;
														
 
															+ *
														
 
															+ *	cgroup_for_each_descendant_pre(@pos, @cgrp) {
														
 
															+ *		Lock @pos;
														
 
															+ *		Verify @pos is alive and inherit state from @pos->parent;
														
 
															+ *		Unlock @pos;
														
 
															+ *	}
														
 
															+ * }
														
 
															+ *
														
 
															+ * As long as the inheriting step, including checking the parent state, is
														
 
															+ * enclosed inside @pos locking, double-locking the parent isn't necessary
														
 
															+ * while inheriting.  The state update to the parent is guaranteed to be
														
 
															+ * visible by walking order and, as long as inheriting operations to the
														
 
															+ * same @pos are atomic to each other, multiple updates racing each other
														
 
															+ * still result in the correct state.  It's guaranteed that at least one
														
 
															+ * inheritance happens for any cgroup after the latest update to its
														
 
															+ * parent.
														
 
															+ *
														
 
															+ * If checking parent's state requires locking the parent, each inheriting
														
 
															+ * iteration should lock and unlock both @pos->parent and @pos.
														
 
															+ *
														
 
															+ * Alternatively, a subsystem may choose to use a single global lock to
														
 
															+ * synchronize ->css_online() and ->css_offline() against tree-walking
														
 
															+ * operations.
														
 
															+ */
														
 
															+#define cgroup_for_each_descendant_pre(pos, cgroup)			\
														
 
															+	for (pos = cgroup_next_descendant_pre(NULL, (cgroup)); (pos);	\
														
 
															+	     pos = cgroup_next_descendant_pre((pos), (cgroup)))
														
 
															+
														
 
															+struct cgroup *cgroup_next_descendant_post(struct cgroup *pos,
														
 
															+					   struct cgroup *cgroup);
														
 
															+
														
 
															+/**
														
 
															+ * cgroup_for_each_descendant_post - post-order walk of a cgroup's descendants
														
 
															+ * @pos: the cgroup * to use as the loop cursor
														
 
															+ * @cgroup: cgroup whose descendants to walk
														
 
															+ *
														
 
															+ * Similar to cgroup_for_each_descendant_pre() but performs post-order
														
 
															+ * traversal instead.  Note that the walk visibility guarantee described in
														
 
															+ * pre-order walk doesn't apply the same to post-order walks.
														
 
															+ */
														
 
															+#define cgroup_for_each_descendant_post(pos, cgroup)			\
														
 
															+	for (pos = cgroup_next_descendant_post(NULL, (cgroup)); (pos);	\
														
 
															+	     pos = cgroup_next_descendant_post((pos), (cgroup)))
														
 
															+
														
 
															 /* A cgroup_iter should be treated as an opaque object */
														
 
															 struct cgroup_iter {
														
 
															 	struct list_head *cg_link;
														
--- a/include/linux/freezer.h
+++ b/include/linux/freezer.h
@@ -75,35 +75,68 @@ static inline bool cgroup_freezing(struct task_struct *task)
 
															  */
														
 
															-/* Tell the freezer not to count the current task as freezable. */
														
 
															+/**
														
 
															+ * freezer_do_not_count - tell freezer to ignore %current
														
 
															+ *
														
 
															+ * Tell freezers to ignore the current task when determining whether the
														
 
															+ * target frozen state is reached.  IOW, the current task will be
														
 
															+ * considered frozen enough by freezers.
														
 
															+ *
														
 
															+ * The caller shouldn't do anything which isn't allowed for a frozen task
														
 
															+ * until freezer_count() is called.  Usually, freezer[_do_not]_count() pair
														
 
															+ * wrap a scheduling operation and nothing much else.
														
 
															+ */
														
 
															 static inline void freezer_do_not_count(void)
														
 
															 {
														
 
															 	current->flags |= PF_FREEZER_SKIP;
														
 
															 }
														
 
															-/*
														
 
															- * Tell the freezer to count the current task as freezable again and try to
														
 
															- * freeze it.
														
 
															+/**
														
 
															+ * freezer_count - tell freezer to stop ignoring %current
														
 
															+ *
														
 
															+ * Undo freezer_do_not_count().  It tells freezers that %current should be
														
 
															+ * considered again and tries to freeze if freezing condition is already in
														
 
															+ * effect.
														
 
															  */
														
 
															 static inline void freezer_count(void)
														
 
															 {
														
 
															 	current->flags &= ~PF_FREEZER_SKIP;
														
 
															+	/*
														
 
															+	 * If freezing is in progress, the following paired with smp_mb()
														
 
															+	 * in freezer_should_skip() ensures that either we see %true
														
 
															+	 * freezing() or freezer_should_skip() sees !PF_FREEZER_SKIP.
														
 
															+	 */
														
 
															+	smp_mb();
														
 
															 	try_to_freeze();
														
 
															 }
														
 
															-/*
														
 
															- * Check if the task should be counted as freezable by the freezer
														
 
															+/**
														
 
															+ * freezer_should_skip - whether to skip a task when determining frozen
														
 
															+ *			 state is reached
														
 
															+ * @p: task in question
														
 
															+ *
														
 
															+ * This function is used by freezers after establishing %true freezing() to
														
 
															+ * test whether a task should be skipped when determining the target frozen
														
 
															+ * state is reached.  IOW, if this function returns %true, @p is considered
														
 
															+ * frozen enough.
														
 
															  */
														
 
															-static inline int freezer_should_skip(struct task_struct *p)
														
 
															+static inline bool freezer_should_skip(struct task_struct *p)
														
 
															 {
														
 
															-	return !!(p->flags & PF_FREEZER_SKIP);
														
 
															+	/*
														
 
															+	 * The following smp_mb() paired with the one in freezer_count()
														
 
															+	 * ensures that either freezer_count() sees %true freezing() or we
														
 
															+	 * see cleared %PF_FREEZER_SKIP and return %false.  This makes it
														
 
															+	 * impossible for a task to slip frozen state testing after
														
 
															+	 * clearing %PF_FREEZER_SKIP.
														
 
															+	 */
														
 
															+	smp_mb();
														
 
															+	return p->flags & PF_FREEZER_SKIP;
														
 
															 }
														
 
															 /*
														
 
															- * These macros are intended to be used whenever you want allow a task that's
														
 
															- * sleeping in TASK_UNINTERRUPTIBLE or TASK_KILLABLE state to be frozen. Note
														
 
															- * that neither return any clear indication of whether a freeze event happened
														
 
															- * while in this function.
														
 
															+ * These macros are intended to be used whenever you want to allow a sleeping
														
 
															+ * task to be frozen. Note that neither return any clear indication of
														
 
															+ * whether a freeze event happened while in this function.
														
 
															  */
														
 
															 /* Like schedule(), but should not block the freezer. */
														
--- a/include/net/netprio_cgroup.h
+++ b/include/net/netprio_cgroup.h
@@ -27,7 +27,6 @@ struct netprio_map {
 
															 struct cgroup_netprio_state {
														
 
															 	struct cgroup_subsys_state css;
														
 
															-	u32 prioidx;
														
 
															 };
														
 
															 extern void sock_update_netprioidx(struct sock *sk, struct task_struct *task);
														
@@ -36,13 +35,12 @@ extern void sock_update_netprioidx(struct sock *sk, struct task_struct *task);
 
															 static inline u32 task_netprioidx(struct task_struct *p)
														
 
															 {
														
 
															-	struct cgroup_netprio_state *state;
														
 
															+	struct cgroup_subsys_state *css;
														
 
															 	u32 idx;
														
 
															 	rcu_read_lock();
														
 
															-	state = container_of(task_subsys_state(p, net_prio_subsys_id),
														
 
															-			     struct cgroup_netprio_state, css);
														
 
															-	idx = state->prioidx;
														
 
															+	css = task_subsys_state(p, net_prio_subsys_id);
														
 
															+	idx = css->cgroup->id;
														
 
															 	rcu_read_unlock();
														
 
															 	return idx;
														
 
															 }
														
@@ -57,8 +55,7 @@ static inline u32 task_netprioidx(struct task_struct *p)
 
															 	rcu_read_lock();
														
 
															 	css = task_subsys_state(p, net_prio_subsys_id);
														
 
															 	if (css)
														
 
															-		idx = container_of(css,
														
 
															-				   struct cgroup_netprio_state, css)->prioidx;
														
 
															+		idx = css->cgroup->id;
														
 
															 	rcu_read_unlock();
														
 
															 	return idx;
														
 
															 }
														
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -138,6 +138,9 @@ struct cgroupfs_root {
 
															 	/* Hierarchy-specific flags */
														
 
															 	unsigned long flags;
														
 
															+	/* IDs for cgroups in this hierarchy */
														
 
															+	struct ida cgroup_ida;
														
 
															+
														
 
															 	/* The path to use for release notifications. */
														
 
															 	char release_agent_path[PATH_MAX];
														
@@ -171,8 +174,8 @@ struct css_id {
 
															 	 * The css to which this ID points. This pointer is set to valid value
														
 
															 	 * after cgroup is populated. If cgroup is removed, this will be NULL.
														
 
															 	 * This pointer is expected to be RCU-safe because destroy()
														
 
															-	 * is called after synchronize_rcu(). But for safe use, css_is_removed()
														
 
															-	 * css_tryget() should be used for avoiding race.
														
 
															+	 * is called after synchronize_rcu(). But for safe use, css_tryget()
														
 
															+	 * should be used for avoiding race.
														
 
															 	 */
														
 
															 	struct cgroup_subsys_state __rcu *css;
														
 
															 	/*
														
@@ -242,6 +245,10 @@ static DEFINE_SPINLOCK(hierarchy_id_lock);
 
															  */
														
 
															 static int need_forkexit_callback __read_mostly;
														
 
															+static int cgroup_destroy_locked(struct cgroup *cgrp);
														
 
															+static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys,
														
 
															+			      struct cftype cfts[], bool is_add);
														
 
															+
														
 
															 #ifdef CONFIG_PROVE_LOCKING
														
 
															 int cgroup_lock_is_held(void)
														
 
															 {
														
@@ -294,11 +301,6 @@ static int notify_on_release(const struct cgroup *cgrp)
 
															 	return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
														
 
															 }
														
 
															-static int clone_children(const struct cgroup *cgrp)
														
 
															-{
														
 
															-	return test_bit(CGRP_CLONE_CHILDREN, &cgrp->flags);
														
 
															-}
														
 
															-
														
 
															 /*
														
 
															  * for_each_subsys() allows you to iterate on each subsystem attached to
														
 
															  * an active hierarchy
														
@@ -782,12 +784,12 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task,
 
															  *	The task_lock() exception
														
 
															  *
														
 
															  * The need for this exception arises from the action of
														
 
															- * cgroup_attach_task(), which overwrites one tasks cgroup pointer with
														
 
															+ * cgroup_attach_task(), which overwrites one task's cgroup pointer with
														
 
															  * another.  It does so using cgroup_mutex, however there are
														
 
															  * several performance critical places that need to reference
														
 
															  * task->cgroup without the expense of grabbing a system global
														
 
															  * mutex.  Therefore except as noted below, when dereferencing or, as
														
 
															- * in cgroup_attach_task(), modifying a task'ss cgroup pointer we use
														
 
															+ * in cgroup_attach_task(), modifying a task's cgroup pointer we use
														
 
															  * task_lock(), which acts on a spinlock (task->alloc_lock) already in
														
 
															  * the task_struct routinely used for such matters.
														
 
															  *
														
@@ -854,30 +856,6 @@ static struct inode *cgroup_new_inode(umode_t mode, struct super_block *sb)
 
															 	return inode;
														
 
															 }
														
 
															-/*
														
 
															- * Call subsys's pre_destroy handler.
														
 
															- * This is called before css refcnt check.
														
 
															- */
														
 
															-static int cgroup_call_pre_destroy(struct cgroup *cgrp)
														
 
															-{
														
 
															-	struct cgroup_subsys *ss;
														
 
															-	int ret = 0;
														
 
															-
														
 
															-	for_each_subsys(cgrp->root, ss) {
														
 
															-		if (!ss->pre_destroy)
														
 
															-			continue;
														
 
															-
														
 
															-		ret = ss->pre_destroy(cgrp);
														
 
															-		if (ret) {
														
 
															-			/* ->pre_destroy() failure is being deprecated */
														
 
															-			WARN_ON_ONCE(!ss->__DEPRECATED_clear_css_refs);
														
 
															-			break;
														
 
															-		}
														
 
															-	}
														
 
															-
														
 
															-	return ret;
														
 
															-}
														
 
															-
														
 
															 static void cgroup_diput(struct dentry *dentry, struct inode *inode)
														
 
															 {
														
 
															 	/* is dentry a directory ? if so, kfree() associated cgroup */
														
@@ -898,7 +876,7 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode)
 
															 		 * Release the subsystem state objects.
														
 
															 		 */
														
 
															 		for_each_subsys(cgrp->root, ss)
														
 
															-			ss->destroy(cgrp);
														
 
															+			ss->css_free(cgrp);
														
 
															 		cgrp->root->number_of_cgroups--;
														
 
															 		mutex_unlock(&cgroup_mutex);
														
@@ -917,6 +895,7 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode)
 
															 		simple_xattrs_free(&cgrp->xattrs);
														
 
															+		ida_simple_remove(&cgrp->root->cgroup_ida, cgrp->id);
														
 
															 		kfree_rcu(cgrp, rcu_head);
														
 
															 	} else {
														
 
															 		struct cfent *cfe = __d_cfe(dentry);
														
@@ -987,7 +966,7 @@ static void cgroup_clear_directory(struct dentry *dir, bool base_files,
 
															 		if (!test_bit(ss->subsys_id, &subsys_mask))
														
 
															 			continue;
														
 
															 		list_for_each_entry(set, &ss->cftsets, node)
														
 
															-			cgroup_rm_file(cgrp, set->cfts);
														
 
															+			cgroup_addrm_files(cgrp, NULL, set->cfts, false);
														
 
															 	}
														
 
															 	if (base_files) {
														
 
															 		while (!list_empty(&cgrp->files))
														
@@ -1014,33 +993,6 @@ static void cgroup_d_remove_dir(struct dentry *dentry)
 
															 	remove_dir(dentry);
														
 
															 }
														
 
															-/*
														
 
															- * A queue for waiters to do rmdir() cgroup. A tasks will sleep when
														
 
															- * cgroup->count == 0 && list_empty(&cgroup->children) && subsys has some
														
 
															- * reference to css->refcnt. In general, this refcnt is expected to goes down
														
 
															- * to zero, soon.
														
 
															- *
														
 
															- * CGRP_WAIT_ON_RMDIR flag is set under cgroup's inode->i_mutex;
														
 
															- */
														
 
															-static DECLARE_WAIT_QUEUE_HEAD(cgroup_rmdir_waitq);
														
 
															-
														
 
															-static void cgroup_wakeup_rmdir_waiter(struct cgroup *cgrp)
														
 
															-{
														
 
															-	if (unlikely(test_and_clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags)))
														
 
															-		wake_up_all(&cgroup_rmdir_waitq);
														
 
															-}
														
 
															-
														
 
															-void cgroup_exclude_rmdir(struct cgroup_subsys_state *css)
														
 
															-{
														
 
															-	css_get(css);
														
 
															-}
														
 
															-
														
 
															-void cgroup_release_and_wakeup_rmdir(struct cgroup_subsys_state *css)
														
 
															-{
														
 
															-	cgroup_wakeup_rmdir_waiter(css->cgroup);
														
 
															-	css_put(css);
														
 
															-}
														
 
															-
														
 
															 /*
														
 
															  * Call with cgroup_mutex held. Drops reference counts on modules, including
														
 
															  * any duplicate ones that parse_cgroupfs_options took. If this function
														
@@ -1150,7 +1102,7 @@ static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry)
 
															 		seq_puts(seq, ",xattr");
														
 
															 	if (strlen(root->release_agent_path))
														
 
															 		seq_printf(seq, ",release_agent=%s", root->release_agent_path);
														
 
															-	if (clone_children(&root->top_cgroup))
														
 
															+	if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->top_cgroup.flags))
														
 
															 		seq_puts(seq, ",clone_children");
														
 
															 	if (strlen(root->name))
														
 
															 		seq_printf(seq, ",name=%s", root->name);
														
@@ -1162,7 +1114,7 @@ struct cgroup_sb_opts {
 
															 	unsigned long subsys_mask;
														
 
															 	unsigned long flags;
														
 
															 	char *release_agent;
														
 
															-	bool clone_children;
														
 
															+	bool cpuset_clone_children;
														
 
															 	char *name;
														
 
															 	/* User explicitly requested empty subsystem */
														
 
															 	bool none;
														
@@ -1213,7 +1165,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
 
															 			continue;
														
 
															 		}
														
 
															 		if (!strcmp(token, "clone_children")) {
														
 
															-			opts->clone_children = true;
														
 
															+			opts->cpuset_clone_children = true;
														
 
															 			continue;
														
 
															 		}
														
 
															 		if (!strcmp(token, "xattr")) {
														
@@ -1397,14 +1349,21 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
 
															 		goto out_unlock;
														
 
															 	}
														
 
															+	/*
														
 
															+	 * Clear out the files of subsystems that should be removed, do
														
 
															+	 * this before rebind_subsystems, since rebind_subsystems may
														
 
															+	 * change this hierarchy's subsys_list.
														
 
															+	 */
														
 
															+	cgroup_clear_directory(cgrp->dentry, false, removed_mask);
														
 
															+
														
 
															 	ret = rebind_subsystems(root, opts.subsys_mask);
														
 
															 	if (ret) {
														
 
															+		/* rebind_subsystems failed, re-populate the removed files */
														
 
															+		cgroup_populate_dir(cgrp, false, removed_mask);
														
 
															 		drop_parsed_module_refcounts(opts.subsys_mask);
														
 
															 		goto out_unlock;
														
 
															 	}
														
 
															-	/* clear out any existing files and repopulate subsystem files */
														
 
															-	cgroup_clear_directory(cgrp->dentry, false, removed_mask);
														
 
															 	/* re-populate subsystem files */
														
 
															 	cgroup_populate_dir(cgrp, false, added_mask);
														
@@ -1432,6 +1391,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
 
															 	INIT_LIST_HEAD(&cgrp->children);
														
 
															 	INIT_LIST_HEAD(&cgrp->files);
														
 
															 	INIT_LIST_HEAD(&cgrp->css_sets);
														
 
															+	INIT_LIST_HEAD(&cgrp->allcg_node);
														
 
															 	INIT_LIST_HEAD(&cgrp->release_list);
														
 
															 	INIT_LIST_HEAD(&cgrp->pidlists);
														
 
															 	mutex_init(&cgrp->pidlist_mutex);
														
@@ -1450,8 +1410,8 @@ static void init_cgroup_root(struct cgroupfs_root *root)
 
															 	root->number_of_cgroups = 1;
														
 
															 	cgrp->root = root;
														
 
															 	cgrp->top_cgroup = cgrp;
														
 
															-	list_add_tail(&cgrp->allcg_node, &root->allcg_list);
														
 
															 	init_cgroup_housekeeping(cgrp);
														
 
															+	list_add_tail(&cgrp->allcg_node, &root->allcg_list);
														
 
															 }
														
 
															 static bool init_root_id(struct cgroupfs_root *root)
														
@@ -1518,12 +1478,13 @@ static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts)
 
															 	root->subsys_mask = opts->subsys_mask;
														
 
															 	root->flags = opts->flags;
														
 
															+	ida_init(&root->cgroup_ida);
														
 
															 	if (opts->release_agent)
														
 
															 		strcpy(root->release_agent_path, opts->release_agent);
														
 
															 	if (opts->name)
														
 
															 		strcpy(root->name, opts->name);
														
 
															-	if (opts->clone_children)
														
 
															-		set_bit(CGRP_CLONE_CHILDREN, &root->top_cgroup.flags);
														
 
															+	if (opts->cpuset_clone_children)
														
 
															+		set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->top_cgroup.flags);
														
 
															 	return root;
														
 
															 }
														
@@ -1536,6 +1497,7 @@ static void cgroup_drop_root(struct cgroupfs_root *root)
 
															 	spin_lock(&hierarchy_id_lock);
														
 
															 	ida_remove(&hierarchy_ida, root->hierarchy_id);
														
 
															 	spin_unlock(&hierarchy_id_lock);
														
 
															+	ida_destroy(&root->cgroup_ida);
														
 
															 	kfree(root);
														
 
															 }
														
@@ -1701,7 +1663,6 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
 
															 		free_cg_links(&tmp_cg_links);
														
 
															-		BUG_ON(!list_empty(&root_cgrp->sibling));
														
 
															 		BUG_ON(!list_empty(&root_cgrp->children));
														
 
															 		BUG_ON(root->number_of_cgroups != 1);
														
@@ -1750,7 +1711,6 @@ static void cgroup_kill_sb(struct super_block *sb) {
 
															 	BUG_ON(root->number_of_cgroups != 1);
														
 
															 	BUG_ON(!list_empty(&cgrp->children));
														
 
															-	BUG_ON(!list_empty(&cgrp->sibling));
														
 
															 	mutex_lock(&cgroup_mutex);
														
 
															 	mutex_lock(&cgroup_root_mutex);
														
@@ -1808,9 +1768,11 @@ static struct kobject *cgroup_kobj;
 
															  */
														
 
															 int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
														
 
															 {
														
 
															+	struct dentry *dentry = cgrp->dentry;
														
 
															 	char *start;
														
 
															-	struct dentry *dentry = rcu_dereference_check(cgrp->dentry,
														
 
															-						      cgroup_lock_is_held());
														
 
															+
														
 
															+	rcu_lockdep_assert(rcu_read_lock_held() || cgroup_lock_is_held(),
														
 
															+			   "cgroup_path() called without proper locking");
														
 
															 	if (!dentry || cgrp == dummytop) {
														
 
															 		/*
														
@@ -1821,9 +1783,9 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
 
															 		return 0;
														
 
															 	}
														
 
															-	start = buf + buflen;
														
 
															+	start = buf + buflen - 1;
														
 
															-	*--start = '\0';
														
 
															+	*start = '\0';
														
 
															 	for (;;) {
														
 
															 		int len = dentry->d_name.len;
														
@@ -1834,8 +1796,7 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
 
															 		if (!cgrp)
														
 
															 			break;
														
 
															-		dentry = rcu_dereference_check(cgrp->dentry,
														
 
															-					       cgroup_lock_is_held());
														
 
															+		dentry = cgrp->dentry;
														
 
															 		if (!cgrp->parent)
														
 
															 			continue;
														
 
															 		if (--start < buf)
														
@@ -1930,9 +1891,7 @@ EXPORT_SYMBOL_GPL(cgroup_taskset_size);
 
															 /*
														
 
															  * cgroup_task_migrate - move a task from one cgroup to another.
														
 
															  *
														
 
															- * 'guarantee' is set if the caller promises that a new css_set for the task
														
 
															- * will already exist. If not set, this function might sleep, and can fail with
														
 
															- * -ENOMEM. Must be called with cgroup_mutex and threadgroup locked.
														
 
															+ * Must be called with cgroup_mutex and threadgroup locked.
														
 
															  */
														
 
															 static void cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp,
														
 
															 				struct task_struct *tsk, struct css_set *newcg)
														
@@ -2025,12 +1984,6 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
 
															 	}
														
 
															 	synchronize_rcu();
														
 
															-
														
 
															-	/*
														
 
															-	 * wake up rmdir() waiter. the rmdir should fail since the cgroup
														
 
															-	 * is no longer empty.
														
 
															-	 */
														
 
															-	cgroup_wakeup_rmdir_waiter(cgrp);
														
 
															 out:
														
 
															 	if (retval) {
														
 
															 		for_each_subsys(root, ss) {
														
@@ -2200,7 +2153,6 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
 
															 	 * step 5: success! and cleanup
														
 
															 	 */
														
 
															 	synchronize_rcu();
														
 
															-	cgroup_wakeup_rmdir_waiter(cgrp);
														
 
															 	retval = 0;
														
 
															 out_put_css_set_refs:
														
 
															 	if (retval) {
														
@@ -2711,10 +2663,17 @@ static int cgroup_create_file(struct dentry *dentry, umode_t mode,
 
															 		/* start off with i_nlink == 2 (for "." entry) */
														
 
															 		inc_nlink(inode);
														
 
															+		inc_nlink(dentry->d_parent->d_inode);
														
 
															-		/* start with the directory inode held, so that we can
														
 
															-		 * populate it without racing with another mkdir */
														
 
															-		mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD);
														
 
															+		/*
														
 
															+		 * Control reaches here with cgroup_mutex held.
														
 
															+		 * @inode->i_mutex should nest outside cgroup_mutex but we
														
 
															+		 * want to populate it immediately without releasing
														
 
															+		 * cgroup_mutex.  As @inode isn't visible to anyone else
														
 
															+		 * yet, trylock will always succeed without affecting
														
 
															+		 * lockdep checks.
														
 
															+		 */
														
 
															+		WARN_ON_ONCE(!mutex_trylock(&inode->i_mutex));
														
 
															 	} else if (S_ISREG(mode)) {
														
 
															 		inode->i_size = 0;
														
 
															 		inode->i_fop = &cgroup_file_operations;
														
@@ -2725,32 +2684,6 @@ static int cgroup_create_file(struct dentry *dentry, umode_t mode,
 
															 	return 0;
														
 
															 }
														
 
															-/*
														
 
															- * cgroup_create_dir - create a directory for an object.
														
 
															- * @cgrp: the cgroup we create the directory for. It must have a valid
														
 
															- *        ->parent field. And we are going to fill its ->dentry field.
														
 
															- * @dentry: dentry of the new cgroup
														
 
															- * @mode: mode to set on new directory.
														
 
															- */
														
 
															-static int cgroup_create_dir(struct cgroup *cgrp, struct dentry *dentry,
														
 
															-				umode_t mode)
														
 
															-{
														
 
															-	struct dentry *parent;
														
 
															-	int error = 0;
														
 
															-
														
 
															-	parent = cgrp->parent->dentry;
														
 
															-	error = cgroup_create_file(dentry, S_IFDIR | mode, cgrp->root->sb);
														
 
															-	if (!error) {
														
 
															-		dentry->d_fsdata = cgrp;
														
 
															-		inc_nlink(parent->d_inode);
														
 
															-		rcu_assign_pointer(cgrp->dentry, dentry);
														
 
															-		dget(dentry);
														
 
															-	}
														
 
															-	dput(dentry);
														
 
															-
														
 
															-	return error;
														
 
															-}
														
 
															-
														
 
															 /**
														
 
															  * cgroup_file_mode - deduce file mode of a control file
														
 
															  * @cft: the control file in question
														
@@ -2791,12 +2724,6 @@ static int cgroup_add_file(struct cgroup *cgrp, struct cgroup_subsys *subsys,
 
															 	simple_xattrs_init(&cft->xattrs);
														
 
															-	/* does @cft->flags tell us to skip creation on @cgrp? */
														
 
															-	if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgrp->parent)
														
 
															-		return 0;
														
 
															-	if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgrp->parent)
														
 
															-		return 0;
														
 
															-
														
 
															 	if (subsys && !test_bit(ROOT_NOPREFIX, &cgrp->root->flags)) {
														
 
															 		strcpy(name, subsys->name);
														
 
															 		strcat(name, ".");
														
@@ -2837,6 +2764,12 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys,
 
															 	int err, ret = 0;
														
 
															 	for (cft = cfts; cft->name[0] != '\0'; cft++) {
														
 
															+		/* does cft->flags tell us to skip this file on @cgrp? */
														
 
															+		if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgrp->parent)
														
 
															+			continue;
														
 
															+		if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgrp->parent)
														
 
															+			continue;
														
 
															+
														
 
															 		if (is_add)
														
 
															 			err = cgroup_add_file(cgrp, subsys, cft);
														
 
															 		else
														
@@ -3044,6 +2977,92 @@ static void cgroup_enable_task_cg_lists(void)
 
															 	write_unlock(&css_set_lock);
														
 
															 }
														
 
															+/**
														
 
															+ * cgroup_next_descendant_pre - find the next descendant for pre-order walk
														
 
															+ * @pos: the current position (%NULL to initiate traversal)
														
 
															+ * @cgroup: cgroup whose descendants to walk
														
 
															+ *
														
 
															+ * To be used by cgroup_for_each_descendant_pre().  Find the next
														
 
															+ * descendant to visit for pre-order traversal of @cgroup's descendants.
														
 
															+ */
														
 
															+struct cgroup *cgroup_next_descendant_pre(struct cgroup *pos,
														
 
															+					  struct cgroup *cgroup)
														
 
															+{
														
 
															+	struct cgroup *next;
														
 
															+
														
 
															+	WARN_ON_ONCE(!rcu_read_lock_held());
														
 
															+
														
 
															+	/* if first iteration, pretend we just visited @cgroup */
														
 
															+	if (!pos) {
														
 
															+		if (list_empty(&cgroup->children))
														
 
															+			return NULL;
														
 
															+		pos = cgroup;
														
 
															+	}
														
 
															+
														
 
															+	/* visit the first child if exists */
														
 
															+	next = list_first_or_null_rcu(&pos->children, struct cgroup, sibling);
														
 
															+	if (next)
														
 
															+		return next;
														
 
															+
														
 
															+	/* no child, visit my or the closest ancestor's next sibling */
														
 
															+	do {
														
 
															+		next = list_entry_rcu(pos->sibling.next, struct cgroup,
														
 
															+				      sibling);
														
 
															+		if (&next->sibling != &pos->parent->children)
														
 
															+			return next;
														
 
															+
														
 
															+		pos = pos->parent;
														
 
															+	} while (pos != cgroup);
														
 
															+
														
 
															+	return NULL;
														
 
															+}
														
 
															+EXPORT_SYMBOL_GPL(cgroup_next_descendant_pre);
														
 
															+
														
 
															+static struct cgroup *cgroup_leftmost_descendant(struct cgroup *pos)
														
 
															+{
														
 
															+	struct cgroup *last;
														
 
															+
														
 
															+	do {
														
 
															+		last = pos;
														
 
															+		pos = list_first_or_null_rcu(&pos->children, struct cgroup,
														
 
															+					     sibling);
														
 
															+	} while (pos);
														
 
															+
														
 
															+	return last;
														
 
															+}
														
 
															+
														
 
															+/**
														
 
															+ * cgroup_next_descendant_post - find the next descendant for post-order walk
														
 
															+ * @pos: the current position (%NULL to initiate traversal)
														
 
															+ * @cgroup: cgroup whose descendants to walk
														
 
															+ *
														
 
															+ * To be used by cgroup_for_each_descendant_post().  Find the next
														
 
															+ * descendant to visit for post-order traversal of @cgroup's descendants.
														
 
															+ */
														
 
															+struct cgroup *cgroup_next_descendant_post(struct cgroup *pos,
														
 
															+					   struct cgroup *cgroup)
														
 
															+{
														
 
															+	struct cgroup *next;
														
 
															+
														
 
															+	WARN_ON_ONCE(!rcu_read_lock_held());
														
 
															+
														
 
															+	/* if first iteration, visit the leftmost descendant */
														
 
															+	if (!pos) {
														
 
															+		next = cgroup_leftmost_descendant(cgroup);
														
 
															+		return next != cgroup ? next : NULL;
														
 
															+	}
														
 
															+
														
 
															+	/* if there's an unvisited sibling, visit its leftmost descendant */
														
 
															+	next = list_entry_rcu(pos->sibling.next, struct cgroup, sibling);
														
 
															+	if (&next->sibling != &pos->parent->children)
														
 
															+		return cgroup_leftmost_descendant(next);
														
 
															+
														
 
															+	/* no sibling left, visit parent */
														
 
															+	next = pos->parent;
														
 
															+	return next != cgroup ? next : NULL;
														
 
															+}
														
 
															+EXPORT_SYMBOL_GPL(cgroup_next_descendant_post);
														
 
															+
														
 
															 void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it)
														
 
															 	__acquires(css_set_lock)
														
 
															 {
														
@@ -3757,7 +3776,7 @@ static int cgroup_event_wake(wait_queue_t *wait, unsigned mode,
 
															 	if (flags & POLLHUP) {
														
 
															 		__remove_wait_queue(event->wqh, &event->wait);
														
 
															 		spin_lock(&cgrp->event_list_lock);
														
 
															-		list_del(&event->list);
														
 
															+		list_del_init(&event->list);
														
 
															 		spin_unlock(&cgrp->event_list_lock);
														
 
															 		/*
														
 
															 		 * We are in atomic context, but cgroup_event_remove() may
														
@@ -3894,7 +3913,7 @@ fail:
 
															 static u64 cgroup_clone_children_read(struct cgroup *cgrp,
														
 
															 				    struct cftype *cft)
														
 
															 {
														
 
															-	return clone_children(cgrp);
														
 
															+	return test_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags);
														
 
															 }
														
 
															 static int cgroup_clone_children_write(struct cgroup *cgrp,
														
@@ -3902,9 +3921,9 @@ static int cgroup_clone_children_write(struct cgroup *cgrp,
 
															 				     u64 val)
														
 
															 {
														
 
															 	if (val)
														
 
															-		set_bit(CGRP_CLONE_CHILDREN, &cgrp->flags);
														
 
															+		set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags);
														
 
															 	else
														
 
															-		clear_bit(CGRP_CLONE_CHILDREN, &cgrp->flags);
														
 
															+		clear_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags);
														
 
															 	return 0;
														
 
															 }
														
@@ -4017,19 +4036,57 @@ static void init_cgroup_css(struct cgroup_subsys_state *css,
 
															 	css->flags = 0;
														
 
															 	css->id = NULL;
														
 
															 	if (cgrp == dummytop)
														
 
															-		set_bit(CSS_ROOT, &css->flags);
														
 
															+		css->flags |= CSS_ROOT;
														
 
															 	BUG_ON(cgrp->subsys[ss->subsys_id]);
														
 
															 	cgrp->subsys[ss->subsys_id] = css;
														
 
															 	/*
														
 
															-	 * If !clear_css_refs, css holds an extra ref to @cgrp->dentry
														
 
															-	 * which is put on the last css_put().  dput() requires process
														
 
															-	 * context, which css_put() may be called without.  @css->dput_work
														
 
															-	 * will be used to invoke dput() asynchronously from css_put().
														
 
															+	 * css holds an extra ref to @cgrp->dentry which is put on the last
														
 
															+	 * css_put().  dput() requires process context, which css_put() may
														
 
															+	 * be called without.  @css->dput_work will be used to invoke
														
 
															+	 * dput() asynchronously from css_put().
														
 
															 	 */
														
 
															 	INIT_WORK(&css->dput_work, css_dput_fn);
														
 
															-	if (ss->__DEPRECATED_clear_css_refs)
														
 
															-		set_bit(CSS_CLEAR_CSS_REFS, &css->flags);
														
 
															+}
														
 
															+
														
 
															+/* invoke ->css_online() on a new CSS and mark it online if successful */
														
 
															+static int online_css(struct cgroup_subsys *ss, struct cgroup *cgrp)
														
 
															+{
														
 
															+	int ret = 0;
														
 
															+
														
 
															+	lockdep_assert_held(&cgroup_mutex);
														
 
															+
														
 
															+	if (ss->css_online)
														
 
															+		ret = ss->css_online(cgrp);
														
 
															+	if (!ret)
														
 
															+		cgrp->subsys[ss->subsys_id]->flags |= CSS_ONLINE;
														
 
															+	return ret;
														
 
															+}
														
 
															+
														
 
															+/* if the CSS is online, invoke ->css_offline() on it and mark it offline */
														
 
															+static void offline_css(struct cgroup_subsys *ss, struct cgroup *cgrp)
														
 
															+	__releases(&cgroup_mutex) __acquires(&cgroup_mutex)
														
 
															+{
														
 
															+	struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
														
 
															+
														
 
															+	lockdep_assert_held(&cgroup_mutex);
														
 
															+
														
 
															+	if (!(css->flags & CSS_ONLINE))
														
 
															+		return;
														
 
															+
														
 
															+	/*
														
 
															+	 * css_offline() should be called with cgroup_mutex unlocked.  See
														
 
															+	 * 3fa59dfbc3 ("cgroup: fix potential deadlock in pre_destroy") for
														
 
															+	 * details.  This temporary unlocking should go away once
														
 
															+	 * cgroup_mutex is unexported from controllers.
														
 
															+	 */
														
 
															+	if (ss->css_offline) {
														
 
															+		mutex_unlock(&cgroup_mutex);
														
 
															+		ss->css_offline(cgrp);
														
 
															+		mutex_lock(&cgroup_mutex);
														
 
															+	}
														
 
															+
														
 
															+	cgrp->subsys[ss->subsys_id]->flags &= ~CSS_ONLINE;
														
 
															 }
														
 
															 /*
														
@@ -4049,10 +4106,27 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
 
															 	struct cgroup_subsys *ss;
														
 
															 	struct super_block *sb = root->sb;
														
 
															+	/* allocate the cgroup and its ID, 0 is reserved for the root */
														
 
															 	cgrp = kzalloc(sizeof(*cgrp), GFP_KERNEL);
														
 
															 	if (!cgrp)
														
 
															 		return -ENOMEM;
														
 
															+	cgrp->id = ida_simple_get(&root->cgroup_ida, 1, 0, GFP_KERNEL);
														
 
															+	if (cgrp->id < 0)
														
 
															+		goto err_free_cgrp;
														
 
															+
														
 
															+	/*
														
 
															+	 * Only live parents can have children.  Note that the liveliness
														
 
															+	 * check isn't strictly necessary because cgroup_mkdir() and
														
 
															+	 * cgroup_rmdir() are fully synchronized by i_mutex; however, do it
														
 
															+	 * anyway so that locking is contained inside cgroup proper and we
														
 
															+	 * don't get nasty surprises if we ever grow another caller.
														
 
															+	 */
														
 
															+	if (!cgroup_lock_live_group(parent)) {
														
 
															+		err = -ENODEV;
														
 
															+		goto err_free_id;
														
 
															+	}
														
 
															+
														
 
															 	/* Grab a reference on the superblock so the hierarchy doesn't
														
 
															 	 * get deleted on unmount if there are child cgroups.  This
														
 
															 	 * can be done outside cgroup_mutex, since the sb can't
														
@@ -4060,8 +4134,6 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
 
															 	 * fs */
														
 
															 	atomic_inc(&sb->s_active);
														
 
															-	mutex_lock(&cgroup_mutex);
														
 
															-
														
 
															 	init_cgroup_housekeeping(cgrp);
														
 
															 	cgrp->parent = parent;
														
@@ -4071,26 +4143,51 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
 
															 	if (notify_on_release(parent))
														
 
															 		set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
														
 
															-	if (clone_children(parent))
														
 
															-		set_bit(CGRP_CLONE_CHILDREN, &cgrp->flags);
														
 
															+	if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &parent->flags))
														
 
															+		set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags);
														
 
															 	for_each_subsys(root, ss) {
														
 
															 		struct cgroup_subsys_state *css;
														
 
															-		css = ss->create(cgrp);
														
 
															+		css = ss->css_alloc(cgrp);
														
 
															 		if (IS_ERR(css)) {
														
 
															 			err = PTR_ERR(css);
														
 
															-			goto err_destroy;
														
 
															+			goto err_free_all;
														
 
															 		}
														
 
															 		init_cgroup_css(css, ss, cgrp);
														
 
															 		if (ss->use_id) {
														
 
															 			err = alloc_css_id(ss, parent, cgrp);
														
 
															 			if (err)
														
 
															-				goto err_destroy;
														
 
															+				goto err_free_all;
														
 
															 		}
														
 
															-		/* At error, ->destroy() callback has to free assigned ID. */
														
 
															-		if (clone_children(parent) && ss->post_clone)
														
 
															-			ss->post_clone(cgrp);
														
 
															+	}
														
 
															+
														
 
															+	/*
														
 
															+	 * Create directory.  cgroup_create_file() returns with the new
														
 
															+	 * directory locked on success so that it can be populated without
														
 
															+	 * dropping cgroup_mutex.
														
 
															+	 */
														
 
															+	err = cgroup_create_file(dentry, S_IFDIR | mode, sb);
														
 
															+	if (err < 0)
														
 
															+		goto err_free_all;
														
 
															+	lockdep_assert_held(&dentry->d_inode->i_mutex);
														
 
															+
														
 
															+	/* allocation complete, commit to creation */
														
 
															+	dentry->d_fsdata = cgrp;
														
 
															+	cgrp->dentry = dentry;
														
 
															+	list_add_tail(&cgrp->allcg_node, &root->allcg_list);
														
 
															+	list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children);
														
 
															+	root->number_of_cgroups++;
														
 
															+
														
 
															+	/* each css holds a ref to the cgroup's dentry */
														
 
															+	for_each_subsys(root, ss)
														
 
															+		dget(dentry);
														
 
															+
														
 
															+	/* creation succeeded, notify subsystems */
														
 
															+	for_each_subsys(root, ss) {
														
 
															+		err = online_css(ss, cgrp);
														
 
															+		if (err)
														
 
															+			goto err_destroy;
														
 
															 		if (ss->broken_hierarchy && !ss->warned_broken_hierarchy &&
														
 
															 		    parent->parent) {
														
@@ -4102,50 +4199,34 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
 
															 		}
														
 
															 	}
														
 
															-	list_add(&cgrp->sibling, &cgrp->parent->children);
														
 
															-	root->number_of_cgroups++;
														
 
															-
														
 
															-	err = cgroup_create_dir(cgrp, dentry, mode);
														
 
															-	if (err < 0)
														
 
															-		goto err_remove;
														
 
															-
														
 
															-	/* If !clear_css_refs, each css holds a ref to the cgroup's dentry */
														
 
															-	for_each_subsys(root, ss)
														
 
															-		if (!ss->__DEPRECATED_clear_css_refs)
														
 
															-			dget(dentry);
														
 
															-
														
 
															-	/* The cgroup directory was pre-locked for us */
														
 
															-	BUG_ON(!mutex_is_locked(&cgrp->dentry->d_inode->i_mutex));
														
 
															-
														
 
															-	list_add_tail(&cgrp->allcg_node, &root->allcg_list);
														
 
															-
														
 
															 	err = cgroup_populate_dir(cgrp, true, root->subsys_mask);
														
 
															-	/* If err < 0, we have a half-filled directory - oh well ;) */
														
 
															+	if (err)
														
 
															+		goto err_destroy;
														
 
															 	mutex_unlock(&cgroup_mutex);
														
 
															 	mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
														
 
															 	return 0;
														
 
															- err_remove:
														
 
															-
														
 
															-	list_del(&cgrp->sibling);
														
 
															-	root->number_of_cgroups--;
														
 
															-
														
 
															- err_destroy:
														
 
															-
														
 
															+err_free_all:
														
 
															 	for_each_subsys(root, ss) {
														
 
															 		if (cgrp->subsys[ss->subsys_id])
														
 
															-			ss->destroy(cgrp);
														
 
															+			ss->css_free(cgrp);
														
 
															 	}
														
 
															-
														
 
															 	mutex_unlock(&cgroup_mutex);
														
 
															-
														
 
															 	/* Release the reference count that we took on the superblock */
														
 
															 	deactivate_super(sb);
														
 
															-
														
 
															+err_free_id:
														
 
															+	ida_simple_remove(&root->cgroup_ida, cgrp->id);
														
 
															+err_free_cgrp:
														
 
															 	kfree(cgrp);
														
 
															 	return err;
														
 
															+
														
 
															+err_destroy:
														
 
															+	cgroup_destroy_locked(cgrp);
														
 
															+	mutex_unlock(&cgroup_mutex);
														
 
															+	mutex_unlock(&dentry->d_inode->i_mutex);
														
 
															+	return err;
														
 
															 }
														
 
															 static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
														
@@ -4197,153 +4278,60 @@ static int cgroup_has_css_refs(struct cgroup *cgrp)
 
															 	return 0;
														
 
															 }
														
 
															-/*
														
 
															- * Atomically mark all (or else none) of the cgroup's CSS objects as
														
 
															- * CSS_REMOVED. Return true on success, or false if the cgroup has
														
 
															- * busy subsystems. Call with cgroup_mutex held
														
 
															- *
														
 
															- * Depending on whether a subsys has __DEPRECATED_clear_css_refs set or
														
 
															- * not, cgroup removal behaves differently.
														
 
															- *
														
 
															- * If clear is set, css refcnt for the subsystem should be zero before
														
 
															- * cgroup removal can be committed.  This is implemented by
														
 
															- * CGRP_WAIT_ON_RMDIR and retry logic around ->pre_destroy(), which may be
														
 
															- * called multiple times until all css refcnts reach zero and is allowed to
														
 
															- * veto removal on any invocation.  This behavior is deprecated and will be
														
 
															- * removed as soon as the existing user (memcg) is updated.
														
 
															- *
														
 
															- * If clear is not set, each css holds an extra reference to the cgroup's
														
 
															- * dentry and cgroup removal proceeds regardless of css refs.
														
 
															- * ->pre_destroy() will be called at least once and is not allowed to fail.
														
 
															- * On the last put of each css, whenever that may be, the extra dentry ref
														
 
															- * is put so that dentry destruction happens only after all css's are
														
 
															- * released.
														
 
															- */
														
 
															-static int cgroup_clear_css_refs(struct cgroup *cgrp)
														
 
															+static int cgroup_destroy_locked(struct cgroup *cgrp)
														
 
															+	__releases(&cgroup_mutex) __acquires(&cgroup_mutex)
														
 
															 {
														
 
															+	struct dentry *d = cgrp->dentry;
														
 
															+	struct cgroup *parent = cgrp->parent;
														
 
															+	DEFINE_WAIT(wait);
														
 
															+	struct cgroup_event *event, *tmp;
														
 
															 	struct cgroup_subsys *ss;
														
 
															-	unsigned long flags;
														
 
															-	bool failed = false;
														
 
															+	LIST_HEAD(tmp_list);
														
 
															+
														
 
															+	lockdep_assert_held(&d->d_inode->i_mutex);
														
 
															+	lockdep_assert_held(&cgroup_mutex);
														
 
															-	local_irq_save(flags);
														
 
															+	if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children))
														
 
															+		return -EBUSY;
														
 
															 	/*
														
 
															-	 * Block new css_tryget() by deactivating refcnt.  If all refcnts
														
 
															-	 * for subsystems w/ clear_css_refs set were 1 at the moment of
														
 
															-	 * deactivation, we succeeded.
														
 
															+	 * Block new css_tryget() by deactivating refcnt and mark @cgrp
														
 
															+	 * removed.  This makes future css_tryget() and child creation
														
 
															+	 * attempts fail thus maintaining the removal conditions verified
														
 
															+	 * above.
														
 
															 	 */
														
 
															 	for_each_subsys(cgrp->root, ss) {
														
 
															 		struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
														
 
															 		WARN_ON(atomic_read(&css->refcnt) < 0);
														
 
															 		atomic_add(CSS_DEACT_BIAS, &css->refcnt);
														
 
															-
														
 
															-		if (ss->__DEPRECATED_clear_css_refs)
														
 
															-			failed |= css_refcnt(css) != 1;
														
 
															-	}
														
 
															-
														
 
															-	/*
														
 
															-	 * If succeeded, set REMOVED and put all the base refs; otherwise,
														
 
															-	 * restore refcnts to positive values.  Either way, all in-progress
														
 
															-	 * css_tryget() will be released.
														
 
															-	 */
														
 
															-	for_each_subsys(cgrp->root, ss) {
														
 
															-		struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
														
 
															-
														
 
															-		if (!failed) {
														
 
															-			set_bit(CSS_REMOVED, &css->flags);
														
 
															-			css_put(css);
														
 
															-		} else {
														
 
															-			atomic_sub(CSS_DEACT_BIAS, &css->refcnt);
														
 
															-		}
														
 
															 	}
														
 
															+	set_bit(CGRP_REMOVED, &cgrp->flags);
														
 
															-	local_irq_restore(flags);
														
 
															-	return !failed;
														
 
															-}
														
 
															-
														
 
															-static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
														
 
															-{
														
 
															-	struct cgroup *cgrp = dentry->d_fsdata;
														
 
															-	struct dentry *d;
														
 
															-	struct cgroup *parent;
														
 
															-	DEFINE_WAIT(wait);
														
 
															-	struct cgroup_event *event, *tmp;
														
 
															-	int ret;
														
 
															-
														
 
															-	/* the vfs holds both inode->i_mutex already */
														
 
															-again:
														
 
															-	mutex_lock(&cgroup_mutex);
														
 
															-	if (atomic_read(&cgrp->count) != 0) {
														
 
															-		mutex_unlock(&cgroup_mutex);
														
 
															-		return -EBUSY;
														
 
															-	}
														
 
															-	if (!list_empty(&cgrp->children)) {
														
 
															-		mutex_unlock(&cgroup_mutex);
														
 
															-		return -EBUSY;
														
 
															-	}
														
 
															-	mutex_unlock(&cgroup_mutex);
														
 
															-
														
 
															-	/*
														
 
															-	 * In general, subsystem has no css->refcnt after pre_destroy(). But
														
 
															-	 * in racy cases, subsystem may have to get css->refcnt after
														
 
															-	 * pre_destroy() and it makes rmdir return with -EBUSY. This sometimes
														
 
															-	 * make rmdir return -EBUSY too often. To avoid that, we use waitqueue
														
 
															-	 * for cgroup's rmdir. CGRP_WAIT_ON_RMDIR is for synchronizing rmdir
														
 
															-	 * and subsystem's reference count handling. Please see css_get/put
														
 
															-	 * and css_tryget() and cgroup_wakeup_rmdir_waiter() implementation.
														
 
															-	 */
														
 
															-	set_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
														
 
															+	/* tell subsystems to initiate destruction */
														
 
															+	for_each_subsys(cgrp->root, ss)
														
 
															+		offline_css(ss, cgrp);
														
 
															 	/*
														
 
															-	 * Call pre_destroy handlers of subsys. Notify subsystems
														
 
															-	 * that rmdir() request comes.
														
 
															+	 * Put all the base refs.  Each css holds an extra reference to the
														
 
															+	 * cgroup's dentry and cgroup removal proceeds regardless of css
														
 
															+	 * refs.  On the last put of each css, whenever that may be, the
														
 
															+	 * extra dentry ref is put so that dentry destruction happens only
														
 
															+	 * after all css's are released.
														
 
															 	 */
														
 
															-	ret = cgroup_call_pre_destroy(cgrp);
														
 
															-	if (ret) {
														
 
															-		clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
														
 
															-		return ret;
														
 
															-	}
														
 
															-
														
 
															-	mutex_lock(&cgroup_mutex);
														
 
															-	parent = cgrp->parent;
														
 
															-	if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children)) {
														
 
															-		clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
														
 
															-		mutex_unlock(&cgroup_mutex);
														
 
															-		return -EBUSY;
														
 
															-	}
														
 
															-	prepare_to_wait(&cgroup_rmdir_waitq, &wait, TASK_INTERRUPTIBLE);
														
 
															-	if (!cgroup_clear_css_refs(cgrp)) {
														
 
															-		mutex_unlock(&cgroup_mutex);
														
 
															-		/*
														
 
															-		 * Because someone may call cgroup_wakeup_rmdir_waiter() before
														
 
															-		 * prepare_to_wait(), we need to check this flag.
														
 
															-		 */
														
 
															-		if (test_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags))
														
 
															-			schedule();
														
 
															-		finish_wait(&cgroup_rmdir_waitq, &wait);
														
 
															-		clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
														
 
															-		if (signal_pending(current))
														
 
															-			return -EINTR;
														
 
															-		goto again;
														
 
															-	}
														
 
															-	/* NO css_tryget() can success after here. */
														
 
															-	finish_wait(&cgroup_rmdir_waitq, &wait);
														
 
															-	clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
														
 
															+	for_each_subsys(cgrp->root, ss)
														
 
															+		css_put(cgrp->subsys[ss->subsys_id]);
														
 
															 	raw_spin_lock(&release_list_lock);
														
 
															-	set_bit(CGRP_REMOVED, &cgrp->flags);
														
 
															 	if (!list_empty(&cgrp->release_list))
														
 
															 		list_del_init(&cgrp->release_list);
														
 
															 	raw_spin_unlock(&release_list_lock);
														
 
															 	/* delete this cgroup from parent->children */
														
 
															-	list_del_init(&cgrp->sibling);
														
 
															-
														
 
															+	list_del_rcu(&cgrp->sibling);
														
 
															 	list_del_init(&cgrp->allcg_node);
														
 
															-	d = dget(cgrp->dentry);
														
 
															-
														
 
															+	dget(d);
														
 
															 	cgroup_d_remove_dir(d);
														
 
															 	dput(d);
														
@@ -4353,21 +4341,35 @@ again:
 
															 	/*
														
 
															 	 * Unregister events and notify userspace.
														
 
															 	 * Notify userspace about cgroup removing only after rmdir of cgroup
														
 
															-	 * directory to avoid race between userspace and kernelspace
														
 
															+	 * directory to avoid race between userspace and kernelspace. Use
														
 
															+	 * a temporary list to avoid a deadlock with cgroup_event_wake(). Since
														
 
															+	 * cgroup_event_wake() is called with the wait queue head locked,
														
 
															+	 * remove_wait_queue() cannot be called while holding event_list_lock.
														
 
															 	 */
														
 
															 	spin_lock(&cgrp->event_list_lock);
														
 
															-	list_for_each_entry_safe(event, tmp, &cgrp->event_list, list) {
														
 
															-		list_del(&event->list);
														
 
															+	list_splice_init(&cgrp->event_list, &tmp_list);
														
 
															+	spin_unlock(&cgrp->event_list_lock);
														
 
															+	list_for_each_entry_safe(event, tmp, &tmp_list, list) {
														
 
															+		list_del_init(&event->list);
														
 
															 		remove_wait_queue(event->wqh, &event->wait);
														
 
															 		eventfd_signal(event->eventfd, 1);
														
 
															 		schedule_work(&event->remove);
														
 
															 	}
														
 
															-	spin_unlock(&cgrp->event_list_lock);
														
 
															-	mutex_unlock(&cgroup_mutex);
														
 
															 	return 0;
														
 
															 }
														
 
															+static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
														
 
															+{
														
 
															+	int ret;
														
 
															+
														
 
															+	mutex_lock(&cgroup_mutex);
														
 
															+	ret = cgroup_destroy_locked(dentry->d_fsdata);
														
 
															+	mutex_unlock(&cgroup_mutex);
														
 
															+
														
 
															+	return ret;
														
 
															+}
														
 
															+
														
 
															 static void __init_or_module cgroup_init_cftsets(struct cgroup_subsys *ss)
														
 
															 {
														
 
															 	INIT_LIST_HEAD(&ss->cftsets);
														
@@ -4388,13 +4390,15 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
 
															 	printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name);
														
 
															+	mutex_lock(&cgroup_mutex);
														
 
															+
														
 
															 	/* init base cftset */
														
 
															 	cgroup_init_cftsets(ss);
														
 
															 	/* Create the top cgroup state for this subsystem */
														
 
															 	list_add(&ss->sibling, &rootnode.subsys_list);
														
 
															 	ss->root = &rootnode;
														
 
															-	css = ss->create(dummytop);
														
 
															+	css = ss->css_alloc(dummytop);
														
 
															 	/* We don't handle early failures gracefully */
														
 
															 	BUG_ON(IS_ERR(css));
														
 
															 	init_cgroup_css(css, ss, dummytop);
														
@@ -4403,7 +4407,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
 
															 	 * pointer to this state - since the subsystem is
														
 
															 	 * newly registered, all tasks and hence the
														
 
															 	 * init_css_set is in the subsystem's top cgroup. */
														
 
															-	init_css_set.subsys[ss->subsys_id] = dummytop->subsys[ss->subsys_id];
														
 
															+	init_css_set.subsys[ss->subsys_id] = css;
														
 
															 	need_forkexit_callback |= ss->fork || ss->exit;
														
@@ -4413,6 +4417,9 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
 
															 	BUG_ON(!list_empty(&init_task.tasks));
														
 
															 	ss->active = 1;
														
 
															+	BUG_ON(online_css(ss, dummytop));
														
 
															+
														
 
															+	mutex_unlock(&cgroup_mutex);
														
 
															 	/* this function shouldn't be used with modular subsystems, since they
														
 
															 	 * need to register a subsys_id, among other things */
														
@@ -4430,12 +4437,12 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
 
															  */
														
 
															 int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
														
 
															 {
														
 
															-	int i;
														
 
															 	struct cgroup_subsys_state *css;
														
 
															+	int i, ret;
														
 
															 	/* check name and function validity */
														
 
															 	if (ss->name == NULL || strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN ||
														
 
															-	    ss->create == NULL || ss->destroy == NULL)
														
 
															+	    ss->css_alloc == NULL || ss->css_free == NULL)
														
 
															 		return -EINVAL;
														
 
															 	/*
														
@@ -4464,10 +4471,11 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
 
															 	subsys[ss->subsys_id] = ss;
														
 
															 	/*
														
 
															-	 * no ss->create seems to need anything important in the ss struct, so
														
 
															-	 * this can happen first (i.e. before the rootnode attachment).
														
 
															+	 * no ss->css_alloc seems to need anything important in the ss
														
 
															+	 * struct, so this can happen first (i.e. before the rootnode
														
 
															+	 * attachment).
														
 
															 	 */
														
 
															-	css = ss->create(dummytop);
														
 
															+	css = ss->css_alloc(dummytop);
														
 
															 	if (IS_ERR(css)) {
														
 
															 		/* failure case - need to deassign the subsys[] slot. */
														
 
															 		subsys[ss->subsys_id] = NULL;
														
@@ -4482,14 +4490,9 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
 
															 	init_cgroup_css(css, ss, dummytop);
														
 
															 	/* init_idr must be after init_cgroup_css because it sets css->id. */
														
 
															 	if (ss->use_id) {
														
 
															-		int ret = cgroup_init_idr(ss, css);
														
 
															-		if (ret) {
														
 
															-			dummytop->subsys[ss->subsys_id] = NULL;
														
 
															-			ss->destroy(dummytop);
														
 
															-			subsys[ss->subsys_id] = NULL;
														
 
															-			mutex_unlock(&cgroup_mutex);
														
 
															-			return ret;
														
 
															-		}
														
 
															+		ret = cgroup_init_idr(ss, css);
														
 
															+		if (ret)
														
 
															+			goto err_unload;
														
 
															 	}
														
 
															 	/*
														
@@ -4522,10 +4525,19 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
 
															 	write_unlock(&css_set_lock);
														
 
															 	ss->active = 1;
														
 
															+	ret = online_css(ss, dummytop);
														
 
															+	if (ret)
														
 
															+		goto err_unload;
														
 
															 	/* success! */
														
 
															 	mutex_unlock(&cgroup_mutex);
														
 
															 	return 0;
														
 
															+
														
 
															+err_unload:
														
 
															+	mutex_unlock(&cgroup_mutex);
														
 
															+	/* @ss can't be mounted here as try_module_get() would fail */
														
 
															+	cgroup_unload_subsys(ss);
														
 
															+	return ret;
														
 
															 }
														
 
															 EXPORT_SYMBOL_GPL(cgroup_load_subsys);
														
@@ -4552,6 +4564,15 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss)
 
															 	BUG_ON(ss->root != &rootnode);
														
 
															 	mutex_lock(&cgroup_mutex);
														
 
															+
														
 
															+	offline_css(ss, dummytop);
														
 
															+	ss->active = 0;
														
 
															+
														
 
															+	if (ss->use_id) {
														
 
															+		idr_remove_all(&ss->idr);
														
 
															+		idr_destroy(&ss->idr);
														
 
															+	}
														
 
															+
														
 
															 	/* deassign the subsys_id */
														
 
															 	subsys[ss->subsys_id] = NULL;
														
@@ -4567,7 +4588,6 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss)
 
															 		struct css_set *cg = link->cg;
														
 
															 		hlist_del(&cg->hlist);
														
 
															-		BUG_ON(!cg->subsys[ss->subsys_id]);
														
 
															 		cg->subsys[ss->subsys_id] = NULL;
														
 
															 		hhead = css_set_hash(cg->subsys);
														
 
															 		hlist_add_head(&cg->hlist, hhead);
														
@@ -4575,12 +4595,12 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss)
 
															 	write_unlock(&css_set_lock);
														
 
															 	/*
														
 
															-	 * remove subsystem's css from the dummytop and free it - need to free
														
 
															-	 * before marking as null because ss->destroy needs the cgrp->subsys
														
 
															-	 * pointer to find their state. note that this also takes care of
														
 
															-	 * freeing the css_id.
														
 
															+	 * remove subsystem's css from the dummytop and free it - need to
														
 
															+	 * free before marking as null because ss->css_free needs the
														
 
															+	 * cgrp->subsys pointer to find their state. note that this also
														
 
															+	 * takes care of freeing the css_id.
														
 
															 	 */
														
 
															-	ss->destroy(dummytop);
														
 
															+	ss->css_free(dummytop);
														
 
															 	dummytop->subsys[ss->subsys_id] = NULL;
														
 
															 	mutex_unlock(&cgroup_mutex);
														
@@ -4624,8 +4644,8 @@ int __init cgroup_init_early(void)
 
															 		BUG_ON(!ss->name);
														
 
															 		BUG_ON(strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN);
														
 
															-		BUG_ON(!ss->create);
														
 
															-		BUG_ON(!ss->destroy);
														
 
															+		BUG_ON(!ss->css_alloc);
														
 
															+		BUG_ON(!ss->css_free);
														
 
															 		if (ss->subsys_id != i) {
														
 
															 			printk(KERN_ERR "cgroup: Subsys %s id == %d\n",
														
 
															 			       ss->name, ss->subsys_id);
														
@@ -4831,45 +4851,20 @@ void cgroup_fork(struct task_struct *child)
 
															 	INIT_LIST_HEAD(&child->cg_list);
														
 
															 }
														
 
															-/**
														
 
															- * cgroup_fork_callbacks - run fork callbacks
														
 
															- * @child: the new task
														
 
															- *
														
 
															- * Called on a new task very soon before adding it to the
														
 
															- * tasklist. No need to take any locks since no-one can
														
 
															- * be operating on this task.
														
 
															- */
														
 
															-void cgroup_fork_callbacks(struct task_struct *child)
														
 
															-{
														
 
															-	if (need_forkexit_callback) {
														
 
															-		int i;
														
 
															-		for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
														
 
															-			struct cgroup_subsys *ss = subsys[i];
														
 
															-
														
 
															-			/*
														
 
															-			 * forkexit callbacks are only supported for
														
 
															-			 * builtin subsystems.
														
 
															-			 */
														
 
															-			if (!ss || ss->module)
														
 
															-				continue;
														
 
															-
														
 
															-			if (ss->fork)
														
 
															-				ss->fork(child);
														
 
															-		}
														
 
															-	}
														
 
															-}
														
 
															-
														
 
															 /**
														
 
															  * cgroup_post_fork - called on a new task after adding it to the task list
														
 
															  * @child: the task in question
														
 
															  *
														
 
															- * Adds the task to the list running through its css_set if necessary.
														
 
															- * Has to be after the task is visible on the task list in case we race
														
 
															- * with the first call to cgroup_iter_start() - to guarantee that the
														
 
															- * new task ends up on its list.
														
 
															+ * Adds the task to the list running through its css_set if necessary and
														
 
															+ * calls the subsystem fork() callbacks.  Has to be after the task is
														
 
															+ * visible on the task list in case we race with the first call to
														
 
															+ * cgroup_iter_start() - to guarantee that the new task ends up on its
														
 
															+ * list.
														
 
															  */
														
 
															 void cgroup_post_fork(struct task_struct *child)
														
 
															 {
														
 
															+	int i;
														
 
															+
														
 
															 	/*
														
 
															 	 * use_task_css_set_links is set to 1 before we walk the tasklist
														
 
															 	 * under the tasklist_lock and we read it here after we added the child
														
@@ -4889,7 +4884,30 @@ void cgroup_post_fork(struct task_struct *child)
 
															 		task_unlock(child);
														
 
															 		write_unlock(&css_set_lock);
														
 
															 	}
														
 
															+
														
 
															+	/*
														
 
															+	 * Call ss->fork().  This must happen after @child is linked on
														
 
															+	 * css_set; otherwise, @child might change state between ->fork()
														
 
															+	 * and addition to css_set.
														
 
															+	 */
														
 
															+	if (need_forkexit_callback) {
														
 
															+		for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
														
 
															+			struct cgroup_subsys *ss = subsys[i];
														
 
															+
														
 
															+			/*
														
 
															+			 * fork/exit callbacks are supported only for
														
 
															+			 * builtin subsystems and we don't need further
														
 
															+			 * synchronization as they never go away.
														
 
															+			 */
														
 
															+			if (!ss || ss->module)
														
 
															+				continue;
														
 
															+
														
 
															+			if (ss->fork)
														
 
															+				ss->fork(child);
														
 
															+		}
														
 
															+	}
														
 
															 }
														
 
															+
														
 
															 /**
														
 
															  * cgroup_exit - detach cgroup from exiting task
														
 
															  * @tsk: pointer to task_struct of exiting process
														
@@ -5022,15 +5040,17 @@ static void check_for_release(struct cgroup *cgrp)
 
															 /* Caller must verify that the css is not for root cgroup */
														
 
															 bool __css_tryget(struct cgroup_subsys_state *css)
														
 
															 {
														
 
															-	do {
														
 
															-		int v = css_refcnt(css);
														
 
															+	while (true) {
														
 
															+		int t, v;
														
 
															-		if (atomic_cmpxchg(&css->refcnt, v, v + 1) == v)
														
 
															+		v = css_refcnt(css);
														
 
															+		t = atomic_cmpxchg(&css->refcnt, v, v + 1);
														
 
															+		if (likely(t == v))
														
 
															 			return true;
														
 
															+		else if (t < 0)
														
 
															+			return false;
														
 
															 		cpu_relax();
														
 
															-	} while (!test_bit(CSS_REMOVED, &css->flags));
														
 
															-
														
 
															-	return false;
														
 
															+	}
														
 
															 }
														
 
															 EXPORT_SYMBOL_GPL(__css_tryget);
														
@@ -5049,11 +5069,9 @@ void __css_put(struct cgroup_subsys_state *css)
 
															 			set_bit(CGRP_RELEASABLE, &cgrp->flags);
														
 
															 			check_for_release(cgrp);
														
 
															 		}
														
 
															-		cgroup_wakeup_rmdir_waiter(cgrp);
														
 
															 		break;
														
 
															 	case 0:
														
 
															-		if (!test_bit(CSS_CLEAR_CSS_REFS, &css->flags))
														
 
															-			schedule_work(&css->dput_work);
														
 
															+		schedule_work(&css->dput_work);
														
 
															 		break;
														
 
															 	}
														
 
															 	rcu_read_unlock();
														
@@ -5439,7 +5457,7 @@ struct cgroup_subsys_state *cgroup_css_from_dir(struct file *f, int id)
 
															 }
														
 
															 #ifdef CONFIG_CGROUP_DEBUG
														
 
															-static struct cgroup_subsys_state *debug_create(struct cgroup *cont)
														
 
															+static struct cgroup_subsys_state *debug_css_alloc(struct cgroup *cont)
														
 
															 {
														
 
															 	struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL);
														
@@ -5449,7 +5467,7 @@ static struct cgroup_subsys_state *debug_create(struct cgroup *cont)
 
															 	return css;
														
 
															 }
														
 
															-static void debug_destroy(struct cgroup *cont)
														
 
															+static void debug_css_free(struct cgroup *cont)
														
 
															 {
														
 
															 	kfree(cont->subsys[debug_subsys_id]);
														
 
															 }
														
@@ -5578,8 +5596,8 @@ static struct cftype debug_files[] =  {
 
															 struct cgroup_subsys debug_subsys = {
														
 
															 	.name = "debug",
														
 
															-	.create = debug_create,
														
 
															-	.destroy = debug_destroy,
														
 
															+	.css_alloc = debug_css_alloc,
														
 
															+	.css_free = debug_css_free,
														
 
															 	.subsys_id = debug_subsys_id,
														
 
															 	.base_cftypes = debug_files,
														
 
															 };
														
--- a/kernel/cgroup_freezer.c
+++ b/kernel/cgroup_freezer.c
@@ -22,24 +22,33 @@
 
															 #include <linux/freezer.h>
														
 
															 #include <linux/seq_file.h>
														
 
															-enum freezer_state {
														
 
															-	CGROUP_THAWED = 0,
														
 
															-	CGROUP_FREEZING,
														
 
															-	CGROUP_FROZEN,
														
 
															+/*
														
 
															+ * A cgroup is freezing if any FREEZING flags are set.  FREEZING_SELF is
														
 
															+ * set if "FROZEN" is written to freezer.state cgroupfs file, and cleared
														
 
															+ * for "THAWED".  FREEZING_PARENT is set if the parent freezer is FREEZING
														
 
															+ * for whatever reason.  IOW, a cgroup has FREEZING_PARENT set if one of
														
 
															+ * its ancestors has FREEZING_SELF set.
														
 
															+ */
														
 
															+enum freezer_state_flags {
														
 
															+	CGROUP_FREEZER_ONLINE	= (1 << 0), /* freezer is fully online */
														
 
															+	CGROUP_FREEZING_SELF	= (1 << 1), /* this freezer is freezing */
														
 
															+	CGROUP_FREEZING_PARENT	= (1 << 2), /* the parent freezer is freezing */
														
 
															+	CGROUP_FROZEN		= (1 << 3), /* this and its descendants frozen */
														
 
															+
														
 
															+	/* mask for all FREEZING flags */
														
 
															+	CGROUP_FREEZING		= CGROUP_FREEZING_SELF | CGROUP_FREEZING_PARENT,
														
 
															 };
														
 
															 struct freezer {
														
 
															-	struct cgroup_subsys_state css;
														
 
															-	enum freezer_state state;
														
 
															-	spinlock_t lock; /* protects _writes_ to state */
														
 
															+	struct cgroup_subsys_state	css;
														
 
															+	unsigned int			state;
														
 
															+	spinlock_t			lock;
														
 
															 };
														
 
															-static inline struct freezer *cgroup_freezer(
														
 
															-		struct cgroup *cgroup)
														
 
															+static inline struct freezer *cgroup_freezer(struct cgroup *cgroup)
														
 
															 {
														
 
															-	return container_of(
														
 
															-		cgroup_subsys_state(cgroup, freezer_subsys_id),
														
 
															-		struct freezer, css);
														
 
															+	return container_of(cgroup_subsys_state(cgroup, freezer_subsys_id),
														
 
															+			    struct freezer, css);
														
 
															 }
														
 
															 static inline struct freezer *task_freezer(struct task_struct *task)
														
@@ -48,14 +57,21 @@ static inline struct freezer *task_freezer(struct task_struct *task)
 
															 			    struct freezer, css);
														
 
															 }
														
 
															+static struct freezer *parent_freezer(struct freezer *freezer)
														
 
															+{
														
 
															+	struct cgroup *pcg = freezer->css.cgroup->parent;
														
 
															+
														
 
															+	if (pcg)
														
 
															+		return cgroup_freezer(pcg);
														
 
															+	return NULL;
														
 
															+}
														
 
															+
														
 
															 bool cgroup_freezing(struct task_struct *task)
														
 
															 {
														
 
															-	enum freezer_state state;
														
 
															 	bool ret;
														
 
															 	rcu_read_lock();
														
 
															-	state = task_freezer(task)->state;
														
 
															-	ret = state == CGROUP_FREEZING || state == CGROUP_FROZEN;
														
 
															+	ret = task_freezer(task)->state & CGROUP_FREEZING;
														
 
															 	rcu_read_unlock();
														
 
															 	return ret;
														
@@ -65,70 +81,18 @@ bool cgroup_freezing(struct task_struct *task)
 
															  * cgroups_write_string() limits the size of freezer state strings to
														
 
															  * CGROUP_LOCAL_BUFFER_SIZE
														
 
															  */
														
 
															-static const char *freezer_state_strs[] = {
														
 
															-	"THAWED",
														
 
															-	"FREEZING",
														
 
															-	"FROZEN",
														
 
															+static const char *freezer_state_strs(unsigned int state)
														
 
															+{
														
 
															+	if (state & CGROUP_FROZEN)
														
 
															+		return "FROZEN";
														
 
															+	if (state & CGROUP_FREEZING)
														
 
															+		return "FREEZING";
														
 
															+	return "THAWED";
														
 
															 };
														
 
															-/*
														
 
															- * State diagram
														
 
															- * Transitions are caused by userspace writes to the freezer.state file.
														
 
															- * The values in parenthesis are state labels. The rest are edge labels.
														
 
															- *
														
 
															- * (THAWED) --FROZEN--> (FREEZING) --FROZEN--> (FROZEN)
														
 
															- *    ^ ^                    |                     |
														
 
															- *    | \_______THAWED_______/                     |
														
 
															- *    \__________________________THAWED____________/
														
 
															- */
														
 
															-
														
 
															 struct cgroup_subsys freezer_subsys;
														
 
															-/* Locks taken and their ordering
														
 
															- * ------------------------------
														
 
															- * cgroup_mutex (AKA cgroup_lock)
														
 
															- * freezer->lock
														
 
															- * css_set_lock
														
 
															- * task->alloc_lock (AKA task_lock)
														
 
															- * task->sighand->siglock
														
 
															- *
														
 
															- * cgroup code forces css_set_lock to be taken before task->alloc_lock
														
 
															- *
														
 
															- * freezer_create(), freezer_destroy():
														
 
															- * cgroup_mutex [ by cgroup core ]
														
 
															- *
														
 
															- * freezer_can_attach():
														
 
															- * cgroup_mutex (held by caller of can_attach)
														
 
															- *
														
 
															- * freezer_fork() (preserving fork() performance means can't take cgroup_mutex):
														
 
															- * freezer->lock
														
 
															- *  sighand->siglock (if the cgroup is freezing)
														
 
															- *
														
 
															- * freezer_read():
														
 
															- * cgroup_mutex
														
 
															- *  freezer->lock
														
 
															- *   write_lock css_set_lock (cgroup iterator start)
														
 
															- *    task->alloc_lock
														
 
															- *   read_lock css_set_lock (cgroup iterator start)
														
 
															- *
														
 
															- * freezer_write() (freeze):
														
 
															- * cgroup_mutex
														
 
															- *  freezer->lock
														
 
															- *   write_lock css_set_lock (cgroup iterator start)
														
 
															- *    task->alloc_lock
														
 
															- *   read_lock css_set_lock (cgroup iterator start)
														
 
															- *    sighand->siglock (fake signal delivery inside freeze_task())
														
 
															- *
														
 
															- * freezer_write() (unfreeze):
														
 
															- * cgroup_mutex
														
 
															- *  freezer->lock
														
 
															- *   write_lock css_set_lock (cgroup iterator start)
														
 
															- *    task->alloc_lock
														
 
															- *   read_lock css_set_lock (cgroup iterator start)
														
 
															- *    task->alloc_lock (inside __thaw_task(), prevents race with refrigerator())
														
 
															- *     sighand->siglock
														
 
															- */
														
 
															-static struct cgroup_subsys_state *freezer_create(struct cgroup *cgroup)
														
 
															+static struct cgroup_subsys_state *freezer_css_alloc(struct cgroup *cgroup)
														
 
															 {
														
 
															 	struct freezer *freezer;
														
@@ -137,160 +101,244 @@ static struct cgroup_subsys_state *freezer_create(struct cgroup *cgroup)
 
															 		return ERR_PTR(-ENOMEM);
														
 
															 	spin_lock_init(&freezer->lock);
														
 
															-	freezer->state = CGROUP_THAWED;
														
 
															 	return &freezer->css;
														
 
															 }
														
 
															-static void freezer_destroy(struct cgroup *cgroup)
														
 
															+/**
														
 
															+ * freezer_css_online - commit creation of a freezer cgroup
														
 
															+ * @cgroup: cgroup being created
														
 
															+ *
														
 
															+ * We're committing to creation of @cgroup.  Mark it online and inherit
														
 
															+ * parent's freezing state while holding both parent's and our
														
 
															+ * freezer->lock.
														
 
															+ */
														
 
															+static int freezer_css_online(struct cgroup *cgroup)
														
 
															+{
														
 
															+	struct freezer *freezer = cgroup_freezer(cgroup);
														
 
															+	struct freezer *parent = parent_freezer(freezer);
														
 
															+
														
 
															+	/*
														
 
															+	 * The following double locking and freezing state inheritance
														
 
															+	 * guarantee that @cgroup can never escape ancestors' freezing
														
 
															+	 * states.  See cgroup_for_each_descendant_pre() for details.
														
 
															+	 */
														
 
															+	if (parent)
														
 
															+		spin_lock_irq(&parent->lock);
														
 
															+	spin_lock_nested(&freezer->lock, SINGLE_DEPTH_NESTING);
														
 
															+
														
 
															+	freezer->state |= CGROUP_FREEZER_ONLINE;
														
 
															+
														
 
															+	if (parent && (parent->state & CGROUP_FREEZING)) {
														
 
															+		freezer->state |= CGROUP_FREEZING_PARENT | CGROUP_FROZEN;
														
 
															+		atomic_inc(&system_freezing_cnt);
														
 
															+	}
														
 
															+
														
 
															+	spin_unlock(&freezer->lock);
														
 
															+	if (parent)
														
 
															+		spin_unlock_irq(&parent->lock);
														
 
															+
														
 
															+	return 0;
														
 
															+}
														
 
															+
														
 
															+/**
														
 
															+ * freezer_css_offline - initiate destruction of @cgroup
														
 
															+ * @cgroup: cgroup being destroyed
														
 
															+ *
														
 
															+ * @cgroup is going away.  Mark it dead and decrement system_freezing_cnt
														
 
															+ * if it was holding one.
														
 
															+ */
														
 
															+static void freezer_css_offline(struct cgroup *cgroup)
														
 
															 {
														
 
															 	struct freezer *freezer = cgroup_freezer(cgroup);
														
 
															-	if (freezer->state != CGROUP_THAWED)
														
 
															+	spin_lock_irq(&freezer->lock);
														
 
															+
														
 
															+	if (freezer->state & CGROUP_FREEZING)
														
 
															 		atomic_dec(&system_freezing_cnt);
														
 
															-	kfree(freezer);
														
 
															+
														
 
															+	freezer->state = 0;
														
 
															+
														
 
															+	spin_unlock_irq(&freezer->lock);
														
 
															 }
														
 
															-/* task is frozen or will freeze immediately when next it gets woken */
														
 
															-static bool is_task_frozen_enough(struct task_struct *task)
														
 
															+static void freezer_css_free(struct cgroup *cgroup)
														
 
															 {
														
 
															-	return frozen(task) ||
														
 
															-		(task_is_stopped_or_traced(task) && freezing(task));
														
 
															+	kfree(cgroup_freezer(cgroup));
														
 
															 }
														
 
															 /*
														
 
															- * The call to cgroup_lock() in the freezer.state write method prevents
														
 
															- * a write to that file racing against an attach, and hence the
														
 
															- * can_attach() result will remain valid until the attach completes.
														
 
															+ * Tasks can be migrated into a different freezer anytime regardless of its
														
 
															+ * current state.  freezer_attach() is responsible for making new tasks
														
 
															+ * conform to the current state.
														
 
															+ *
														
 
															+ * Freezer state changes and task migration are synchronized via
														
 
															+ * @freezer->lock.  freezer_attach() makes the new tasks conform to the
														
 
															+ * current state and all following state changes can see the new tasks.
														
 
															  */
														
 
															-static int freezer_can_attach(struct cgroup *new_cgroup,
														
 
															-			      struct cgroup_taskset *tset)
														
 
															+static void freezer_attach(struct cgroup *new_cgrp, struct cgroup_taskset *tset)
														
 
															 {
														
 
															-	struct freezer *freezer;
														
 
															+	struct freezer *freezer = cgroup_freezer(new_cgrp);
														
 
															 	struct task_struct *task;
														
 
															+	bool clear_frozen = false;
														
 
															+
														
 
															+	spin_lock_irq(&freezer->lock);
														
 
															 	/*
														
 
															-	 * Anything frozen can't move or be moved to/from.
														
 
															+	 * Make the new tasks conform to the current state of @new_cgrp.
														
 
															+	 * For simplicity, when migrating any task to a FROZEN cgroup, we
														
 
															+	 * revert it to FREEZING and let update_if_frozen() determine the
														
 
															+	 * correct state later.
														
 
															+	 *
														
 
															+	 * Tasks in @tset are on @new_cgrp but may not conform to its
														
 
															+	 * current state before executing the following - !frozen tasks may
														
 
															+	 * be visible in a FROZEN cgroup and frozen tasks in a THAWED one.
														
 
															 	 */
														
 
															-	cgroup_taskset_for_each(task, new_cgroup, tset)
														
 
															-		if (cgroup_freezing(task))
														
 
															-			return -EBUSY;
														
 
															+	cgroup_taskset_for_each(task, new_cgrp, tset) {
														
 
															+		if (!(freezer->state & CGROUP_FREEZING)) {
														
 
															+			__thaw_task(task);
														
 
															+		} else {
														
 
															+			freeze_task(task);
														
 
															+			freezer->state &= ~CGROUP_FROZEN;
														
 
															+			clear_frozen = true;
														
 
															+		}
														
 
															+	}
														
 
															-	freezer = cgroup_freezer(new_cgroup);
														
 
															-	if (freezer->state != CGROUP_THAWED)
														
 
															-		return -EBUSY;
														
 
															+	spin_unlock_irq(&freezer->lock);
														
 
															-	return 0;
														
 
															+	/*
														
 
															+	 * Propagate FROZEN clearing upwards.  We may race with
														
 
															+	 * update_if_frozen(), but as long as both work bottom-up, either
														
 
															+	 * update_if_frozen() sees child's FROZEN cleared or we clear the
														
 
															+	 * parent's FROZEN later.  No parent w/ !FROZEN children can be
														
 
															+	 * left FROZEN.
														
 
															+	 */
														
 
															+	while (clear_frozen && (freezer = parent_freezer(freezer))) {
														
 
															+		spin_lock_irq(&freezer->lock);
														
 
															+		freezer->state &= ~CGROUP_FROZEN;
														
 
															+		clear_frozen = freezer->state & CGROUP_FREEZING;
														
 
															+		spin_unlock_irq(&freezer->lock);
														
 
															+	}
														
 
															 }
														
 
															 static void freezer_fork(struct task_struct *task)
														
 
															 {
														
 
															 	struct freezer *freezer;
														
 
															-	/*
														
 
															-	 * No lock is needed, since the task isn't on tasklist yet,
														
 
															-	 * so it can't be moved to another cgroup, which means the
														
 
															-	 * freezer won't be removed and will be valid during this
														
 
															-	 * function call.  Nevertheless, apply RCU read-side critical
														
 
															-	 * section to suppress RCU lockdep false positives.
														
 
															-	 */
														
 
															 	rcu_read_lock();
														
 
															 	freezer = task_freezer(task);
														
 
															-	rcu_read_unlock();
														
 
															 	/*
														
 
															 	 * The root cgroup is non-freezable, so we can skip the
														
 
															 	 * following check.
														
 
															 	 */
														
 
															 	if (!freezer->css.cgroup->parent)
														
 
															-		return;
														
 
															+		goto out;
														
 
															 	spin_lock_irq(&freezer->lock);
														
 
															-	BUG_ON(freezer->state == CGROUP_FROZEN);
														
 
															-
														
 
															-	/* Locking avoids race with FREEZING -> THAWED transitions. */
														
 
															-	if (freezer->state == CGROUP_FREEZING)
														
 
															+	if (freezer->state & CGROUP_FREEZING)
														
 
															 		freeze_task(task);
														
 
															 	spin_unlock_irq(&freezer->lock);
														
 
															+out:
														
 
															+	rcu_read_unlock();
														
 
															 }
														
 
															-/*
														
 
															- * caller must hold freezer->lock
														
 
															+/**
														
 
															+ * update_if_frozen - update whether a cgroup finished freezing
														
 
															+ * @cgroup: cgroup of interest
														
 
															+ *
														
 
															+ * Once FREEZING is initiated, transition to FROZEN is lazily updated by
														
 
															+ * calling this function.  If the current state is FREEZING but not FROZEN,
														
 
															+ * this function checks whether all tasks of this cgroup and the descendant
														
 
															+ * cgroups finished freezing and, if so, sets FROZEN.
														
 
															+ *
														
 
															+ * The caller is responsible for grabbing RCU read lock and calling
														
 
															+ * update_if_frozen() on all descendants prior to invoking this function.
														
 
															+ *
														
 
															+ * Task states and freezer state might disagree while tasks are being
														
 
															+ * migrated into or out of @cgroup, so we can't verify task states against
														
 
															+ * @freezer state here.  See freezer_attach() for details.
														
 
															  */
														
 
															-static void update_if_frozen(struct cgroup *cgroup,
														
 
															-				 struct freezer *freezer)
														
 
															+static void update_if_frozen(struct cgroup *cgroup)
														
 
															 {
														
 
															+	struct freezer *freezer = cgroup_freezer(cgroup);
														
 
															+	struct cgroup *pos;
														
 
															 	struct cgroup_iter it;
														
 
															 	struct task_struct *task;
														
 
															-	unsigned int nfrozen = 0, ntotal = 0;
														
 
															-	enum freezer_state old_state = freezer->state;
														
 
															-	cgroup_iter_start(cgroup, &it);
														
 
															-	while ((task = cgroup_iter_next(cgroup, &it))) {
														
 
															-		ntotal++;
														
 
															-		if (freezing(task) && is_task_frozen_enough(task))
														
 
															-			nfrozen++;
														
 
															+	WARN_ON_ONCE(!rcu_read_lock_held());
														
 
															+
														
 
															+	spin_lock_irq(&freezer->lock);
														
 
															+
														
 
															+	if (!(freezer->state & CGROUP_FREEZING) ||
														
 
															+	    (freezer->state & CGROUP_FROZEN))
														
 
															+		goto out_unlock;
														
 
															+
														
 
															+	/* are all (live) children frozen? */
														
 
															+	cgroup_for_each_child(pos, cgroup) {
														
 
															+		struct freezer *child = cgroup_freezer(pos);
														
 
															+
														
 
															+		if ((child->state & CGROUP_FREEZER_ONLINE) &&
														
 
															+		    !(child->state & CGROUP_FROZEN))
														
 
															+			goto out_unlock;
														
 
															 	}
														
 
															-	if (old_state == CGROUP_THAWED) {
														
 
															-		BUG_ON(nfrozen > 0);
														
 
															-	} else if (old_state == CGROUP_FREEZING) {
														
 
															-		if (nfrozen == ntotal)
														
 
															-			freezer->state = CGROUP_FROZEN;
														
 
															-	} else { /* old_state == CGROUP_FROZEN */
														
 
															-		BUG_ON(nfrozen != ntotal);
														
 
															+	/* are all tasks frozen? */
														
 
															+	cgroup_iter_start(cgroup, &it);
														
 
															+
														
 
															+	while ((task = cgroup_iter_next(cgroup, &it))) {
														
 
															+		if (freezing(task)) {
														
 
															+			/*
														
 
															+			 * freezer_should_skip() indicates that the task
														
 
															+			 * should be skipped when determining freezing
														
 
															+			 * completion.  Consider it frozen in addition to
														
 
															+			 * the usual frozen condition.
														
 
															+			 */
														
 
															+			if (!frozen(task) && !freezer_should_skip(task))
														
 
															+				goto out_iter_end;
														
 
															+		}
														
 
															 	}
														
 
															+	freezer->state |= CGROUP_FROZEN;
														
 
															+out_iter_end:
														
 
															 	cgroup_iter_end(cgroup, &it);
														
 
															+out_unlock:
														
 
															+	spin_unlock_irq(&freezer->lock);
														
 
															 }
														
 
															 static int freezer_read(struct cgroup *cgroup, struct cftype *cft,
														
 
															 			struct seq_file *m)
														
 
															 {
														
 
															-	struct freezer *freezer;
														
 
															-	enum freezer_state state;
														
 
															+	struct cgroup *pos;
														
 
															-	if (!cgroup_lock_live_group(cgroup))
														
 
															-		return -ENODEV;
														
 
															+	rcu_read_lock();
														
 
															-	freezer = cgroup_freezer(cgroup);
														
 
															-	spin_lock_irq(&freezer->lock);
														
 
															-	state = freezer->state;
														
 
															-	if (state == CGROUP_FREEZING) {
														
 
															-		/* We change from FREEZING to FROZEN lazily if the cgroup was
														
 
															-		 * only partially frozen when we exitted write. */
														
 
															-		update_if_frozen(cgroup, freezer);
														
 
															-		state = freezer->state;
														
 
															-	}
														
 
															-	spin_unlock_irq(&freezer->lock);
														
 
															-	cgroup_unlock();
														
 
															+	/* update states bottom-up */
														
 
															+	cgroup_for_each_descendant_post(pos, cgroup)
														
 
															+		update_if_frozen(pos);
														
 
															+	update_if_frozen(cgroup);
														
 
															+
														
 
															+	rcu_read_unlock();
														
 
															-	seq_puts(m, freezer_state_strs[state]);
														
 
															+	seq_puts(m, freezer_state_strs(cgroup_freezer(cgroup)->state));
														
 
															 	seq_putc(m, '\n');
														
 
															 	return 0;
														
 
															 }
														
 
															-static int try_to_freeze_cgroup(struct cgroup *cgroup, struct freezer *freezer)
														
 
															+static void freeze_cgroup(struct freezer *freezer)
														
 
															 {
														
 
															+	struct cgroup *cgroup = freezer->css.cgroup;
														
 
															 	struct cgroup_iter it;
														
 
															 	struct task_struct *task;
														
 
															-	unsigned int num_cant_freeze_now = 0;
														
 
															 	cgroup_iter_start(cgroup, &it);
														
 
															-	while ((task = cgroup_iter_next(cgroup, &it))) {
														
 
															-		if (!freeze_task(task))
														
 
															-			continue;
														
 
															-		if (is_task_frozen_enough(task))
														
 
															-			continue;
														
 
															-		if (!freezing(task) && !freezer_should_skip(task))
														
 
															-			num_cant_freeze_now++;
														
 
															-	}
														
 
															+	while ((task = cgroup_iter_next(cgroup, &it)))
														
 
															+		freeze_task(task);
														
 
															 	cgroup_iter_end(cgroup, &it);
														
 
															-
														
 
															-	return num_cant_freeze_now ? -EBUSY : 0;
														
 
															 }
														
 
															-static void unfreeze_cgroup(struct cgroup *cgroup, struct freezer *freezer)
														
 
															+static void unfreeze_cgroup(struct freezer *freezer)
														
 
															 {
														
 
															+	struct cgroup *cgroup = freezer->css.cgroup;
														
 
															 	struct cgroup_iter it;
														
 
															 	struct task_struct *task;
														
@@ -300,59 +348,111 @@ static void unfreeze_cgroup(struct cgroup *cgroup, struct freezer *freezer)
 
															 	cgroup_iter_end(cgroup, &it);
														
 
															 }
														
 
															-static int freezer_change_state(struct cgroup *cgroup,
														
 
															-				enum freezer_state goal_state)
														
 
															+/**
														
 
															+ * freezer_apply_state - apply state change to a single cgroup_freezer
														
 
															+ * @freezer: freezer to apply state change to
														
 
															+ * @freeze: whether to freeze or unfreeze
														
 
															+ * @state: CGROUP_FREEZING_* flag to set or clear
														
 
															+ *
														
 
															+ * Set or clear @state on @freezer according to @freeze, and perform
														
 
															+ * freezing or thawing as necessary.
														
 
															+ */
														
 
															+static void freezer_apply_state(struct freezer *freezer, bool freeze,
														
 
															+				unsigned int state)
														
 
															 {
														
 
															-	struct freezer *freezer;
														
 
															-	int retval = 0;
														
 
															-
														
 
															-	freezer = cgroup_freezer(cgroup);
														
 
															+	/* also synchronizes against task migration, see freezer_attach() */
														
 
															+	lockdep_assert_held(&freezer->lock);
														
 
															-	spin_lock_irq(&freezer->lock);
														
 
															+	if (!(freezer->state & CGROUP_FREEZER_ONLINE))
														
 
															+		return;
														
 
															-	update_if_frozen(cgroup, freezer);
														
 
															-
														
 
															-	switch (goal_state) {
														
 
															-	case CGROUP_THAWED:
														
 
															-		if (freezer->state != CGROUP_THAWED)
														
 
															-			atomic_dec(&system_freezing_cnt);
														
 
															-		freezer->state = CGROUP_THAWED;
														
 
															-		unfreeze_cgroup(cgroup, freezer);
														
 
															-		break;
														
 
															-	case CGROUP_FROZEN:
														
 
															-		if (freezer->state == CGROUP_THAWED)
														
 
															+	if (freeze) {
														
 
															+		if (!(freezer->state & CGROUP_FREEZING))
														
 
															 			atomic_inc(&system_freezing_cnt);
														
 
															-		freezer->state = CGROUP_FREEZING;
														
 
															-		retval = try_to_freeze_cgroup(cgroup, freezer);
														
 
															-		break;
														
 
															-	default:
														
 
															-		BUG();
														
 
															+		freezer->state |= state;
														
 
															+		freeze_cgroup(freezer);
														
 
															+	} else {
														
 
															+		bool was_freezing = freezer->state & CGROUP_FREEZING;
														
 
															+
														
 
															+		freezer->state &= ~state;
														
 
															+
														
 
															+		if (!(freezer->state & CGROUP_FREEZING)) {
														
 
															+			if (was_freezing)
														
 
															+				atomic_dec(&system_freezing_cnt);
														
 
															+			freezer->state &= ~CGROUP_FROZEN;
														
 
															+			unfreeze_cgroup(freezer);
														
 
															+		}
														
 
															 	}
														
 
															+}
														
 
															+/**
														
 
															+ * freezer_change_state - change the freezing state of a cgroup_freezer
														
 
															+ * @freezer: freezer of interest
														
 
															+ * @freeze: whether to freeze or thaw
														
 
															+ *
														
 
															+ * Freeze or thaw @freezer according to @freeze.  The operations are
														
 
															+ * recursive - all descendants of @freezer will be affected.
														
 
															+ */
														
 
															+static void freezer_change_state(struct freezer *freezer, bool freeze)
														
 
															+{
														
 
															+	struct cgroup *pos;
														
 
															+
														
 
															+	/* update @freezer */
														
 
															+	spin_lock_irq(&freezer->lock);
														
 
															+	freezer_apply_state(freezer, freeze, CGROUP_FREEZING_SELF);
														
 
															 	spin_unlock_irq(&freezer->lock);
														
 
															-	return retval;
														
 
															+	/*
														
 
															+	 * Update all its descendants in pre-order traversal.  Each
														
 
															+	 * descendant will try to inherit its parent's FREEZING state as
														
 
															+	 * CGROUP_FREEZING_PARENT.
														
 
															+	 */
														
 
															+	rcu_read_lock();
														
 
															+	cgroup_for_each_descendant_pre(pos, freezer->css.cgroup) {
														
 
															+		struct freezer *pos_f = cgroup_freezer(pos);
														
 
															+		struct freezer *parent = parent_freezer(pos_f);
														
 
															+
														
 
															+		/*
														
 
															+		 * Our update to @parent->state is already visible which is
														
 
															+		 * all we need.  No need to lock @parent.  For more info on
														
 
															+		 * synchronization, see freezer_css_online().
														
 
															+		 */
														
 
															+		spin_lock_irq(&pos_f->lock);
														
 
															+		freezer_apply_state(pos_f, parent->state & CGROUP_FREEZING,
														
 
															+				    CGROUP_FREEZING_PARENT);
														
 
															+		spin_unlock_irq(&pos_f->lock);
														
 
															+	}
														
 
															+	rcu_read_unlock();
														
 
															 }
														
 
															-static int freezer_write(struct cgroup *cgroup,
														
 
															-			 struct cftype *cft,
														
 
															+static int freezer_write(struct cgroup *cgroup, struct cftype *cft,
														
 
															 			 const char *buffer)
														
 
															 {
														
 
															-	int retval;
														
 
															-	enum freezer_state goal_state;
														
 
															+	bool freeze;
														
 
															-	if (strcmp(buffer, freezer_state_strs[CGROUP_THAWED]) == 0)
														
 
															-		goal_state = CGROUP_THAWED;
														
 
															-	else if (strcmp(buffer, freezer_state_strs[CGROUP_FROZEN]) == 0)
														
 
															-		goal_state = CGROUP_FROZEN;
														
 
															+	if (strcmp(buffer, freezer_state_strs(0)) == 0)
														
 
															+		freeze = false;
														
 
															+	else if (strcmp(buffer, freezer_state_strs(CGROUP_FROZEN)) == 0)
														
 
															+		freeze = true;
														
 
															 	else
														
 
															 		return -EINVAL;
														
 
															-	if (!cgroup_lock_live_group(cgroup))
														
 
															-		return -ENODEV;
														
 
															-	retval = freezer_change_state(cgroup, goal_state);
														
 
															-	cgroup_unlock();
														
 
															-	return retval;
														
 
															+	freezer_change_state(cgroup_freezer(cgroup), freeze);
														
 
															+	return 0;
														
 
															+}
														
 
															+
														
 
															+static u64 freezer_self_freezing_read(struct cgroup *cgroup, struct cftype *cft)
														
 
															+{
														
 
															+	struct freezer *freezer = cgroup_freezer(cgroup);
														
 
															+
														
 
															+	return (bool)(freezer->state & CGROUP_FREEZING_SELF);
														
 
															+}
														
 
															+
														
 
															+static u64 freezer_parent_freezing_read(struct cgroup *cgroup, struct cftype *cft)
														
 
															+{
														
 
															+	struct freezer *freezer = cgroup_freezer(cgroup);
														
 
															+
														
 
															+	return (bool)(freezer->state & CGROUP_FREEZING_PARENT);
														
 
															 }
														
 
															 static struct cftype files[] = {
														
@@ -362,23 +462,27 @@ static struct cftype files[] = {
 
															 		.read_seq_string = freezer_read,
														
 
															 		.write_string = freezer_write,
														
 
															 	},
														
 
															+	{
														
 
															+		.name = "self_freezing",
														
 
															+		.flags = CFTYPE_NOT_ON_ROOT,
														
 
															+		.read_u64 = freezer_self_freezing_read,
														
 
															+	},
														
 
															+	{
														
 
															+		.name = "parent_freezing",
														
 
															+		.flags = CFTYPE_NOT_ON_ROOT,
														
 
															+		.read_u64 = freezer_parent_freezing_read,
														
 
															+	},
														
 
															 	{ }	/* terminate */
														
 
															 };
														
 
															 struct cgroup_subsys freezer_subsys = {
														
 
															 	.name		= "freezer",
														
 
															-	.create		= freezer_create,
														
 
															-	.destroy	= freezer_destroy,
														
 
															+	.css_alloc	= freezer_css_alloc,
														
 
															+	.css_online	= freezer_css_online,
														
 
															+	.css_offline	= freezer_css_offline,
														
 
															+	.css_free	= freezer_css_free,
														
 
															 	.subsys_id	= freezer_subsys_id,
														
 
															-	.can_attach	= freezer_can_attach,
														
 
															+	.attach		= freezer_attach,
														
 
															 	.fork		= freezer_fork,
														
 
															 	.base_cftypes	= files,
														
 
															-
														
 
															-	/*
														
 
															-	 * freezer subsys doesn't handle hierarchy at all.  Frozen state
														
 
															-	 * should be inherited through the hierarchy - if a parent is
														
 
															-	 * frozen, all its children should be frozen.  Fix it and remove
														
 
															-	 * the following.
														
 
															-	 */
														
 
															-	.broken_hierarchy = true,
														
 
															 };
														
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1784,56 +1784,20 @@ static struct cftype files[] = {
 
															 };
														
 
															 /*
														
 
															- * post_clone() is called during cgroup_create() when the
														
 
															- * clone_children mount argument was specified.  The cgroup
														
 
															- * can not yet have any tasks.
														
 
															- *
														
 
															- * Currently we refuse to set up the cgroup - thereby
														
 
															- * refusing the task to be entered, and as a result refusing
														
 
															- * the sys_unshare() or clone() which initiated it - if any
														
 
															- * sibling cpusets have exclusive cpus or mem.
														
 
															- *
														
 
															- * If this becomes a problem for some users who wish to
														
 
															- * allow that scenario, then cpuset_post_clone() could be
														
 
															- * changed to grant parent->cpus_allowed-sibling_cpus_exclusive
														
 
															- * (and likewise for mems) to the new cgroup. Called with cgroup_mutex
														
 
															- * held.
														
 
															- */
														
 
															-static void cpuset_post_clone(struct cgroup *cgroup)
														
 
															-{
														
 
															-	struct cgroup *parent, *child;
														
 
															-	struct cpuset *cs, *parent_cs;
														
 
															-
														
 
															-	parent = cgroup->parent;
														
 
															-	list_for_each_entry(child, &parent->children, sibling) {
														
 
															-		cs = cgroup_cs(child);
														
 
															-		if (is_mem_exclusive(cs) || is_cpu_exclusive(cs))
														
 
															-			return;
														
 
															-	}
														
 
															-	cs = cgroup_cs(cgroup);
														
 
															-	parent_cs = cgroup_cs(parent);
														
 
															-
														
 
															-	mutex_lock(&callback_mutex);
														
 
															-	cs->mems_allowed = parent_cs->mems_allowed;
														
 
															-	cpumask_copy(cs->cpus_allowed, parent_cs->cpus_allowed);
														
 
															-	mutex_unlock(&callback_mutex);
														
 
															-	return;
														
 
															-}
														
 
															-
														
 
															-/*
														
 
															- *	cpuset_create - create a cpuset
														
 
															+ *	cpuset_css_alloc - allocate a cpuset css
														
 
															  *	cont:	control group that the new cpuset will be part of
														
 
															  */
														
 
															-static struct cgroup_subsys_state *cpuset_create(struct cgroup *cont)
														
 
															+static struct cgroup_subsys_state *cpuset_css_alloc(struct cgroup *cont)
														
 
															 {
														
 
															-	struct cpuset *cs;
														
 
															-	struct cpuset *parent;
														
 
															+	struct cgroup *parent_cg = cont->parent;
														
 
															+	struct cgroup *tmp_cg;
														
 
															+	struct cpuset *parent, *cs;
														
 
															-	if (!cont->parent) {
														
 
															+	if (!parent_cg)
														
 
															 		return &top_cpuset.css;
														
 
															-	}
														
 
															-	parent = cgroup_cs(cont->parent);
														
 
															+	parent = cgroup_cs(parent_cg);
														
 
															+
														
 
															 	cs = kmalloc(sizeof(*cs), GFP_KERNEL);
														
 
															 	if (!cs)
														
 
															 		return ERR_PTR(-ENOMEM);
														
@@ -1855,7 +1819,36 @@ static struct cgroup_subsys_state *cpuset_create(struct cgroup *cont)
 
															 	cs->parent = parent;
														
 
															 	number_of_cpusets++;
														
 
															-	return &cs->css ;
														
 
															+
														
 
															+	if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &cont->flags))
														
 
															+		goto skip_clone;
														
 
															+
														
 
															+	/*
														
 
															+	 * Clone @parent's configuration if CGRP_CPUSET_CLONE_CHILDREN is
														
 
															+	 * set.  This flag handling is implemented in cgroup core for
														
 
															+	 * historical reasons - the flag may be specified during mount.
														
 
															+	 *
														
 
															+	 * Currently, if any sibling cpusets have exclusive cpus or mem, we
														
 
															+	 * refuse to clone the configuration - thereby refusing the task to
														
 
															+	 * be entered, and as a result refusing the sys_unshare() or
														
 
															+	 * clone() which initiated it.  If this becomes a problem for some
														
 
															+	 * users who wish to allow that scenario, then this could be
														
 
															+	 * changed to grant parent->cpus_allowed-sibling_cpus_exclusive
														
 
															+	 * (and likewise for mems) to the new cgroup.
														
 
															+	 */
														
 
															+	list_for_each_entry(tmp_cg, &parent_cg->children, sibling) {
														
 
															+		struct cpuset *tmp_cs = cgroup_cs(tmp_cg);
														
 
															+
														
 
															+		if (is_mem_exclusive(tmp_cs) || is_cpu_exclusive(tmp_cs))
														
 
															+			goto skip_clone;
														
 
															+	}
														
 
															+
														
 
															+	mutex_lock(&callback_mutex);
														
 
															+	cs->mems_allowed = parent->mems_allowed;
														
 
															+	cpumask_copy(cs->cpus_allowed, parent->cpus_allowed);
														
 
															+	mutex_unlock(&callback_mutex);
														
 
															+skip_clone:
														
 
															+	return &cs->css;
														
 
															 }
														
 
															 /*
														
@@ -1864,7 +1857,7 @@ static struct cgroup_subsys_state *cpuset_create(struct cgroup *cont)
 
															  * will call async_rebuild_sched_domains().
														
 
															  */
														
 
															-static void cpuset_destroy(struct cgroup *cont)
														
 
															+static void cpuset_css_free(struct cgroup *cont)
														
 
															 {
														
 
															 	struct cpuset *cs = cgroup_cs(cont);
														
@@ -1878,11 +1871,10 @@ static void cpuset_destroy(struct cgroup *cont)
 
															 struct cgroup_subsys cpuset_subsys = {
														
 
															 	.name = "cpuset",
														
 
															-	.create = cpuset_create,
														
 
															-	.destroy = cpuset_destroy,
														
 
															+	.css_alloc = cpuset_css_alloc,
														
 
															+	.css_free = cpuset_css_free,
														
 
															 	.can_attach = cpuset_can_attach,
														
 
															 	.attach = cpuset_attach,
														
 
															-	.post_clone = cpuset_post_clone,
														
 
															 	.subsys_id = cpuset_subsys_id,
														
 
															 	.base_cftypes = files,
														
 
															 	.early_init = 1,
														
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -7434,7 +7434,7 @@ unlock:
 
															 device_initcall(perf_event_sysfs_init);
														
 
															 #ifdef CONFIG_CGROUP_PERF
														
 
															-static struct cgroup_subsys_state *perf_cgroup_create(struct cgroup *cont)
														
 
															+static struct cgroup_subsys_state *perf_cgroup_css_alloc(struct cgroup *cont)
														
 
															 {
														
 
															 	struct perf_cgroup *jc;
														
@@ -7451,7 +7451,7 @@ static struct cgroup_subsys_state *perf_cgroup_create(struct cgroup *cont)
 
															 	return &jc->css;
														
 
															 }
														
 
															-static void perf_cgroup_destroy(struct cgroup *cont)
														
 
															+static void perf_cgroup_css_free(struct cgroup *cont)
														
 
															 {
														
 
															 	struct perf_cgroup *jc;
														
 
															 	jc = container_of(cgroup_subsys_state(cont, perf_subsys_id),
														
@@ -7492,8 +7492,8 @@ static void perf_cgroup_exit(struct cgroup *cgrp, struct cgroup *old_cgrp,
 
															 struct cgroup_subsys perf_subsys = {
														
 
															 	.name		= "perf_event",
														
 
															 	.subsys_id	= perf_subsys_id,
														
 
															-	.create		= perf_cgroup_create,
														
 
															-	.destroy	= perf_cgroup_destroy,
														
 
															+	.css_alloc	= perf_cgroup_css_alloc,
														
 
															+	.css_free	= perf_cgroup_css_free,
														
 
															 	.exit		= perf_cgroup_exit,
														
 
															 	.attach		= perf_cgroup_attach,
														
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1137,7 +1137,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 
															 {
														
 
															 	int retval;
														
 
															 	struct task_struct *p;
														
 
															-	int cgroup_callbacks_done = 0;
														
 
															 	if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))
														
 
															 		return ERR_PTR(-EINVAL);
														
@@ -1395,12 +1394,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 
															 	INIT_LIST_HEAD(&p->thread_group);
														
 
															 	p->task_works = NULL;
														
 
															-	/* Now that the task is set up, run cgroup callbacks if
														
 
															-	 * necessary. We need to run them before the task is visible
														
 
															-	 * on the tasklist. */
														
 
															-	cgroup_fork_callbacks(p);
														
 
															-	cgroup_callbacks_done = 1;
														
 
															-
														
 
															 	/* Need tasklist lock for parent etc handling! */
														
 
															 	write_lock_irq(&tasklist_lock);
														
@@ -1505,7 +1498,7 @@ bad_fork_cleanup_cgroup:
 
															 #endif
														
 
															 	if (clone_flags & CLONE_THREAD)
														
 
															 		threadgroup_change_end(current);
														
 
															-	cgroup_exit(p, cgroup_callbacks_done);
														
 
															+	cgroup_exit(p, 0);
														
 
															 	delayacct_tsk_free(p);
														
 
															 	module_put(task_thread_info(p)->exec_domain->module);
														
 
															 bad_fork_cleanup_count:
														
--- a/kernel/freezer.c
+++ b/kernel/freezer.c
@@ -116,17 +116,10 @@ bool freeze_task(struct task_struct *p)
 
															 		return false;
														
 
															 	}
														
 
															-	if (!(p->flags & PF_KTHREAD)) {
														
 
															+	if (!(p->flags & PF_KTHREAD))
														
 
															 		fake_signal_wake_up(p);
														
 
															-		/*
														
 
															-		 * fake_signal_wake_up() goes through p's scheduler
														
 
															-		 * lock and guarantees that TASK_STOPPED/TRACED ->
														
 
															-		 * TASK_RUNNING transition can't race with task state
														
 
															-		 * testing in try_to_freeze_tasks().
														
 
															-		 */
														
 
															-	} else {
														
 
															+	else
														
 
															 		wake_up_state(p, TASK_INTERRUPTIBLE);
														
 
															-	}
														
 
															 	spin_unlock_irqrestore(&freezer_lock, flags);
														
 
															 	return true;
														
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -48,18 +48,7 @@ static int try_to_freeze_tasks(bool user_only)
 
															 			if (p == current || !freeze_task(p))
														
 
															 				continue;
														
 
															-			/*
														
 
															-			 * Now that we've done set_freeze_flag, don't
														
 
															-			 * perturb a task in TASK_STOPPED or TASK_TRACED.
														
 
															-			 * It is "frozen enough".  If the task does wake
														
 
															-			 * up, it will immediately call try_to_freeze.
														
 
															-			 *
														
 
															-			 * Because freeze_task() goes through p's scheduler lock, it's
														
 
															-			 * guaranteed that TASK_STOPPED/TRACED -> TASK_RUNNING
														
 
															-			 * transition can't race with task state testing here.
														
 
															-			 */
														
 
															-			if (!task_is_stopped_or_traced(p) &&
														
 
															-			    !freezer_should_skip(p))
														
 
															+			if (!freezer_should_skip(p))
														
 
															 				todo++;
														
 
															 		} while_each_thread(g, p);
														
 
															 		read_unlock(&tasklist_lock);
														
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -7484,7 +7484,7 @@ static inline struct task_group *cgroup_tg(struct cgroup *cgrp)
 
															 			    struct task_group, css);
														
 
															 }
														
 
															-static struct cgroup_subsys_state *cpu_cgroup_create(struct cgroup *cgrp)
														
 
															+static struct cgroup_subsys_state *cpu_cgroup_css_alloc(struct cgroup *cgrp)
														
 
															 {
														
 
															 	struct task_group *tg, *parent;
														
@@ -7501,7 +7501,7 @@ static struct cgroup_subsys_state *cpu_cgroup_create(struct cgroup *cgrp)
 
															 	return &tg->css;
														
 
															 }
														
 
															-static void cpu_cgroup_destroy(struct cgroup *cgrp)
														
 
															+static void cpu_cgroup_css_free(struct cgroup *cgrp)
														
 
															 {
														
 
															 	struct task_group *tg = cgroup_tg(cgrp);
														
@@ -7861,8 +7861,8 @@ static struct cftype cpu_files[] = {
 
															 struct cgroup_subsys cpu_cgroup_subsys = {
														
 
															 	.name		= "cpu",
														
 
															-	.create		= cpu_cgroup_create,
														
 
															-	.destroy	= cpu_cgroup_destroy,
														
 
															+	.css_alloc	= cpu_cgroup_css_alloc,
														
 
															+	.css_free	= cpu_cgroup_css_free,
														
 
															 	.can_attach	= cpu_cgroup_can_attach,
														
 
															 	.attach		= cpu_cgroup_attach,
														
 
															 	.exit		= cpu_cgroup_exit,
														
@@ -7885,7 +7885,7 @@ struct cgroup_subsys cpu_cgroup_subsys = {
 
															 struct cpuacct root_cpuacct;
														
 
															 /* create a new cpu accounting group */
														
 
															-static struct cgroup_subsys_state *cpuacct_create(struct cgroup *cgrp)
														
 
															+static struct cgroup_subsys_state *cpuacct_css_alloc(struct cgroup *cgrp)
														
 
															 {
														
 
															 	struct cpuacct *ca;
														
@@ -7915,7 +7915,7 @@ out:
 
															 }
														
 
															 /* destroy an existing cpu accounting group */
														
 
															-static void cpuacct_destroy(struct cgroup *cgrp)
														
 
															+static void cpuacct_css_free(struct cgroup *cgrp)
														
 
															 {
														
 
															 	struct cpuacct *ca = cgroup_ca(cgrp);
														
@@ -8086,8 +8086,8 @@ void cpuacct_charge(struct task_struct *tsk, u64 cputime)
 
															 struct cgroup_subsys cpuacct_subsys = {
														
 
															 	.name = "cpuacct",
														
 
															-	.create = cpuacct_create,
														
 
															-	.destroy = cpuacct_destroy,
														
 
															+	.css_alloc = cpuacct_css_alloc,
														
 
															+	.css_free = cpuacct_css_free,
														
 
															 	.subsys_id = cpuacct_subsys_id,
														
 
															 	.base_cftypes = files,
														
 
															 };
														
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1908,7 +1908,7 @@ static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info)
 
															 		preempt_disable();
														
 
															 		read_unlock(&tasklist_lock);
														
 
															 		preempt_enable_no_resched();
														
 
															-		schedule();
														
 
															+		freezable_schedule();
														
 
															 	} else {
														
 
															 		/*
														
 
															 		 * By the time we got the lock, our tracer went away.
														
@@ -1929,13 +1929,6 @@ static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info)
 
															 		read_unlock(&tasklist_lock);
														
 
															 	}
														
 
															-	/*
														
 
															-	 * While in TASK_TRACED, we were considered "frozen enough".
														
 
															-	 * Now that we woke up, it's crucial if we're supposed to be
														
 
															-	 * frozen that we freeze now before running anything substantial.
														
 
															-	 */
														
 
															-	try_to_freeze();
														
 
															-
														
 
															 	/*
														
 
															 	 * We are back.  Now reacquire the siglock before touching
														
 
															 	 * last_siginfo, so that we are sure to have synchronized with
														
@@ -2092,7 +2085,7 @@ static bool do_signal_stop(int signr)
 
															 		}
														
 
															 		/* Now we don't run again until woken by SIGCONT or SIGKILL */
														
 
															-		schedule();
														
 
															+		freezable_schedule();
														
 
															 		return true;
														
 
															 	} else {
														
 
															 		/*
														
@@ -2200,15 +2193,14 @@ int get_signal_to_deliver(siginfo_t *info, struct k_sigaction *return_ka,
 
															 	if (unlikely(uprobe_deny_signal()))
														
 
															 		return 0;
														
 
															-relock:
														
 
															 	/*
														
 
															-	 * We'll jump back here after any time we were stopped in TASK_STOPPED.
														
 
															-	 * While in TASK_STOPPED, we were considered "frozen enough".
														
 
															-	 * Now that we woke up, it's crucial if we're supposed to be
														
 
															-	 * frozen that we freeze now before running anything substantial.
														
 
															+	 * Do this once, we can't return to user-mode if freezing() == T.
														
 
															+	 * do_signal_stop() and ptrace_stop() do freezable_schedule() and
														
 
															+	 * thus do not need another check after return.
														
 
															 	 */
														
 
															 	try_to_freeze();
														
 
															+relock:
														
 
															 	spin_lock_irq(&sighand->siglock);
														
 
															 	/*
														
 
															 	 * Every stopped thread goes here after wakeup. Check to see if
														
--- a/mm/hugetlb_cgroup.c
+++ b/mm/hugetlb_cgroup.c
@@ -77,7 +77,7 @@ static inline bool hugetlb_cgroup_have_usage(struct cgroup *cg)
 
															 	return false;
														
 
															 }
														
 
															-static struct cgroup_subsys_state *hugetlb_cgroup_create(struct cgroup *cgroup)
														
 
															+static struct cgroup_subsys_state *hugetlb_cgroup_css_alloc(struct cgroup *cgroup)
														
 
															 {
														
 
															 	int idx;
														
 
															 	struct cgroup *parent_cgroup;
														
@@ -101,7 +101,7 @@ static struct cgroup_subsys_state *hugetlb_cgroup_create(struct cgroup *cgroup)
 
															 	return &h_cgroup->css;
														
 
															 }
														
 
															-static void hugetlb_cgroup_destroy(struct cgroup *cgroup)
														
 
															+static void hugetlb_cgroup_css_free(struct cgroup *cgroup)
														
 
															 {
														
 
															 	struct hugetlb_cgroup *h_cgroup;
														
@@ -155,18 +155,13 @@ out:
 
															  * Force the hugetlb cgroup to empty the hugetlb resources by moving them to
														
 
															  * the parent cgroup.
														
 
															  */
														
 
															-static int hugetlb_cgroup_pre_destroy(struct cgroup *cgroup)
														
 
															+static void hugetlb_cgroup_css_offline(struct cgroup *cgroup)
														
 
															 {
														
 
															 	struct hstate *h;
														
 
															 	struct page *page;
														
 
															-	int ret = 0, idx = 0;
														
 
															+	int idx = 0;
														
 
															 	do {
														
 
															-		if (cgroup_task_count(cgroup) ||
														
 
															-		    !list_empty(&cgroup->children)) {
														
 
															-			ret = -EBUSY;
														
 
															-			goto out;
														
 
															-		}
														
 
															 		for_each_hstate(h) {
														
 
															 			spin_lock(&hugetlb_lock);
														
 
															 			list_for_each_entry(page, &h->hugepage_activelist, lru)
														
@@ -177,8 +172,6 @@ static int hugetlb_cgroup_pre_destroy(struct cgroup *cgroup)
 
															 		}
														
 
															 		cond_resched();
														
 
															 	} while (hugetlb_cgroup_have_usage(cgroup));
														
 
															-out:
														
 
															-	return ret;
														
 
															 }
														
 
															 int hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages,
														
@@ -411,8 +404,8 @@ void hugetlb_cgroup_migrate(struct page *oldhpage, struct page *newhpage)
 
															 struct cgroup_subsys hugetlb_subsys = {
														
 
															 	.name = "hugetlb",
														
 
															-	.create     = hugetlb_cgroup_create,
														
 
															-	.pre_destroy = hugetlb_cgroup_pre_destroy,
														
 
															-	.destroy    = hugetlb_cgroup_destroy,
														
 
															-	.subsys_id  = hugetlb_subsys_id,
														
 
															+	.css_alloc	= hugetlb_cgroup_css_alloc,
														
 
															+	.css_offline	= hugetlb_cgroup_css_offline,
														
 
															+	.css_free	= hugetlb_cgroup_css_free,
														
 
															+	.subsys_id	= hugetlb_subsys_id,
														
 
															 };
														
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2370,7 +2370,6 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
 
															 again:
														
 
															 	if (*ptr) { /* css should be a valid one */
														
 
															 		memcg = *ptr;
														
 
															-		VM_BUG_ON(css_is_removed(&memcg->css));
														
 
															 		if (mem_cgroup_is_root(memcg))
														
 
															 			goto done;
														
 
															 		if (nr_pages == 1 && consume_stock(memcg))
														
@@ -2510,9 +2509,9 @@ static void __mem_cgroup_cancel_local_charge(struct mem_cgroup *memcg,
 
															 /*
														
 
															  * A helper function to get mem_cgroup from ID. must be called under
														
 
															- * rcu_read_lock(). The caller must check css_is_removed() or some if
														
 
															- * it's concern. (dropping refcnt from swap can be called against removed
														
 
															- * memcg.)
														
 
															+ * rcu_read_lock().  The caller is responsible for calling css_tryget if
														
 
															+ * the mem_cgroup is used for charging. (dropping refcnt from swap can be
														
 
															+ * called against removed memcg.)
														
 
															  */
														
 
															 static struct mem_cgroup *mem_cgroup_lookup(unsigned short id)
														
 
															 {
														
@@ -2709,13 +2708,6 @@ static int mem_cgroup_move_account(struct page *page,
 
															 	/* caller should have done css_get */
														
 
															 	pc->mem_cgroup = to;
														
 
															 	mem_cgroup_charge_statistics(to, anon, nr_pages);
														
 
															-	/*
														
 
															-	 * We charges against "to" which may not have any tasks. Then, "to"
														
 
															-	 * can be under rmdir(). But in current implementation, caller of
														
 
															-	 * this function is just force_empty() and move charge, so it's
														
 
															-	 * guaranteed that "to" is never removed. So, we don't check rmdir
														
 
															-	 * status here.
														
 
															-	 */
														
 
															 	move_unlock_mem_cgroup(from, &flags);
														
 
															 	ret = 0;
														
 
															 unlock:
														
@@ -2729,10 +2721,27 @@ out:
 
															 	return ret;
														
 
															 }
														
 
															-/*
														
 
															- * move charges to its parent.
														
 
															+/**
														
 
															+ * mem_cgroup_move_parent - moves page to the parent group
														
 
															+ * @page: the page to move
														
 
															+ * @pc: page_cgroup of the page
														
 
															+ * @child: page's cgroup
														
 
															+ *
														
 
															+ * move charges to its parent or the root cgroup if the group has no
														
 
															+ * parent (aka use_hierarchy==0).
														
 
															+ * Although this might fail (get_page_unless_zero, isolate_lru_page or
														
 
															+ * mem_cgroup_move_account fails) the failure is always temporary and
														
 
															+ * it signals a race with a page removal/uncharge or migration. In the
														
 
															+ * first case the page is on the way out and it will vanish from the LRU
														
 
															+ * on the next attempt and the call should be retried later.
														
 
															+ * Isolation from the LRU fails only if page has been isolated from
														
 
															+ * the LRU since we looked at it and that usually means either global
														
 
															+ * reclaim or migration going on. The page will either get back to the
														
 
															+ * LRU or vanish.
														
 
															+ * Finally mem_cgroup_move_account fails only if the page got uncharged
														
 
															+ * (!PageCgroupUsed) or moved to a different group. The page will
														
 
															+ * disappear in the next attempt.
														
 
															  */
														
 
															-
														
 
															 static int mem_cgroup_move_parent(struct page *page,
														
 
															 				  struct page_cgroup *pc,
														
 
															 				  struct mem_cgroup *child)
														
@@ -2742,9 +2751,7 @@ static int mem_cgroup_move_parent(struct page *page,
 
															 	unsigned long uninitialized_var(flags);
														
 
															 	int ret;
														
 
															-	/* Is ROOT ? */
														
 
															-	if (mem_cgroup_is_root(child))
														
 
															-		return -EINVAL;
														
 
															+	VM_BUG_ON(mem_cgroup_is_root(child));
														
 
															 	ret = -EBUSY;
														
 
															 	if (!get_page_unless_zero(page))
														
@@ -2761,8 +2768,10 @@ static int mem_cgroup_move_parent(struct page *page,
 
															 	if (!parent)
														
 
															 		parent = root_mem_cgroup;
														
 
															-	if (nr_pages > 1)
														
 
															+	if (nr_pages > 1) {
														
 
															+		VM_BUG_ON(!PageTransHuge(page));
														
 
															 		flags = compound_lock_irqsave(page);
														
 
															+	}
														
 
															 	ret = mem_cgroup_move_account(page, nr_pages,
														
 
															 				pc, child, parent);
														
@@ -2904,7 +2913,6 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *memcg,
 
															 		return;
														
 
															 	if (!memcg)
														
 
															 		return;
														
 
															-	cgroup_exclude_rmdir(&memcg->css);
														
 
															 	__mem_cgroup_commit_charge(memcg, page, 1, ctype, true);
														
 
															 	/*
														
@@ -2918,12 +2926,6 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *memcg,
 
															 		swp_entry_t ent = {.val = page_private(page)};
														
 
															 		mem_cgroup_uncharge_swap(ent);
														
 
															 	}
														
 
															-	/*
														
 
															-	 * At swapin, we may charge account against cgroup which has no tasks.
														
 
															-	 * So, rmdir()->pre_destroy() can be called while we do this charge.
														
 
															-	 * In that case, we need to call pre_destroy() again. check it here.
														
 
															-	 */
														
 
															-	cgroup_release_and_wakeup_rmdir(&memcg->css);
														
 
															 }
														
 
															 void mem_cgroup_commit_charge_swapin(struct page *page,
														
@@ -3371,8 +3373,7 @@ void mem_cgroup_end_migration(struct mem_cgroup *memcg,
 
															 	if (!memcg)
														
 
															 		return;
														
 
															-	/* blocks rmdir() */
														
 
															-	cgroup_exclude_rmdir(&memcg->css);
														
 
															+
														
 
															 	if (!migration_ok) {
														
 
															 		used = oldpage;
														
 
															 		unused = newpage;
														
@@ -3406,13 +3407,6 @@ void mem_cgroup_end_migration(struct mem_cgroup *memcg,
 
															 	 */
														
 
															 	if (anon)
														
 
															 		mem_cgroup_uncharge_page(used);
														
 
															-	/*
														
 
															-	 * At migration, we may charge account against cgroup which has no
														
 
															-	 * tasks.
														
 
															-	 * So, rmdir()->pre_destroy() can be called while we do this charge.
														
 
															-	 * In that case, we need to call pre_destroy() again. check it here.
														
 
															-	 */
														
 
															-	cgroup_release_and_wakeup_rmdir(&memcg->css);
														
 
															 }
														
 
															 /*
														
@@ -3712,17 +3706,22 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
 
															 	return nr_reclaimed;
														
 
															 }
														
 
															-/*
														
 
															+/**
														
 
															+ * mem_cgroup_force_empty_list - clears LRU of a group
														
 
															+ * @memcg: group to clear
														
 
															+ * @node: NUMA node
														
 
															+ * @zid: zone id
														
 
															+ * @lru: lru to clear
														
 
															+ *
														
 
															  * Traverse a specified page_cgroup list and try to drop them all.  This doesn't
														
 
															- * reclaim the pages page themselves - it just removes the page_cgroups.
														
 
															- * Returns true if some page_cgroups were not freed, indicating that the caller
														
 
															- * must retry this operation.
														
 
															+ * reclaim the pages themselves - pages are moved to the parent (or root)
														
 
															+ * group.
														
 
															  */
														
 
															-static bool mem_cgroup_force_empty_list(struct mem_cgroup *memcg,
														
 
															+static void mem_cgroup_force_empty_list(struct mem_cgroup *memcg,
														
 
															 				int node, int zid, enum lru_list lru)
														
 
															 {
														
 
															 	struct lruvec *lruvec;
														
 
															-	unsigned long flags, loop;
														
 
															+	unsigned long flags;
														
 
															 	struct list_head *list;
														
 
															 	struct page *busy;
														
 
															 	struct zone *zone;
														
@@ -3731,11 +3730,8 @@ static bool mem_cgroup_force_empty_list(struct mem_cgroup *memcg,
 
															 	lruvec = mem_cgroup_zone_lruvec(zone, memcg);
														
 
															 	list = &lruvec->lists[lru];
														
 
															-	loop = mem_cgroup_get_lru_size(lruvec, lru);
														
 
															-	/* give some margin against EBUSY etc...*/
														
 
															-	loop += 256;
														
 
															 	busy = NULL;
														
 
															-	while (loop--) {
														
 
															+	do {
														
 
															 		struct page_cgroup *pc;
														
 
															 		struct page *page;
														
@@ -3761,76 +3757,72 @@ static bool mem_cgroup_force_empty_list(struct mem_cgroup *memcg,
 
															 			cond_resched();
														
 
															 		} else
														
 
															 			busy = NULL;
														
 
															-	}
														
 
															-	return !list_empty(list);
														
 
															+	} while (!list_empty(list));
														
 
															 }
														
 
															 /*
														
 
															- * make mem_cgroup's charge to be 0 if there is no task.
														
 
															+ * make mem_cgroup's charge to be 0 if there is no task by moving
														
 
															+ * all the charges and pages to the parent.
														
 
															  * This enables deleting this mem_cgroup.
														
 
															+ *
														
 
															+ * Caller is responsible for holding css reference on the memcg.
														
 
															  */
														
 
															-static int mem_cgroup_force_empty(struct mem_cgroup *memcg, bool free_all)
														
 
															+static void mem_cgroup_reparent_charges(struct mem_cgroup *memcg)
														
 
															 {
														
 
															-	int ret;
														
 
															-	int node, zid, shrink;
														
 
															-	int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
														
 
															-	struct cgroup *cgrp = memcg->css.cgroup;
														
 
															-
														
 
															-	css_get(&memcg->css);
														
 
															+	int node, zid;
														
 
															-	shrink = 0;
														
 
															-	/* should free all ? */
														
 
															-	if (free_all)
														
 
															-		goto try_to_free;
														
 
															-move_account:
														
 
															 	do {
														
 
															-		ret = -EBUSY;
														
 
															-		if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children))
														
 
															-			goto out;
														
 
															 		/* This is for making all *used* pages to be on LRU. */
														
 
															 		lru_add_drain_all();
														
 
															 		drain_all_stock_sync(memcg);
														
 
															-		ret = 0;
														
 
															 		mem_cgroup_start_move(memcg);
														
 
															 		for_each_node_state(node, N_HIGH_MEMORY) {
														
 
															-			for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) {
														
 
															+			for (zid = 0; zid < MAX_NR_ZONES; zid++) {
														
 
															 				enum lru_list lru;
														
 
															 				for_each_lru(lru) {
														
 
															-					ret = mem_cgroup_force_empty_list(memcg,
														
 
															+					mem_cgroup_force_empty_list(memcg,
														
 
															 							node, zid, lru);
														
 
															-					if (ret)
														
 
															-						break;
														
 
															 				}
														
 
															 			}
														
 
															-			if (ret)
														
 
															-				break;
														
 
															 		}
														
 
															 		mem_cgroup_end_move(memcg);
														
 
															 		memcg_oom_recover(memcg);
														
 
															 		cond_resched();
														
 
															-	/* "ret" should also be checked to ensure all lists are empty. */
														
 
															-	} while (res_counter_read_u64(&memcg->res, RES_USAGE) > 0 || ret);
														
 
															-out:
														
 
															-	css_put(&memcg->css);
														
 
															-	return ret;
														
 
															-try_to_free:
														
 
															+		/*
														
 
															+		 * This is a safety check because mem_cgroup_force_empty_list
														
 
															+		 * could have raced with mem_cgroup_replace_page_cache callers
														
 
															+		 * so the lru seemed empty but the page could have been added
														
 
															+		 * right after the check. RES_USAGE should be safe as we always
														
 
															+		 * charge before adding to the LRU.
														
 
															+		 */
														
 
															+	} while (res_counter_read_u64(&memcg->res, RES_USAGE) > 0);
														
 
															+}
														
 
															+
														
 
															+/*
														
 
															+ * Reclaims as many pages from the given memcg as possible and moves
														
 
															+ * the rest to the parent.
														
 
															+ *
														
 
															+ * Caller is responsible for holding css reference for memcg.
														
 
															+ */
														
 
															+static int mem_cgroup_force_empty(struct mem_cgroup *memcg)
														
 
															+{
														
 
															+	int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
														
 
															+	struct cgroup *cgrp = memcg->css.cgroup;
														
 
															+
														
 
															 	/* returns EBUSY if there is a task or if we come here twice. */
														
 
															-	if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children) || shrink) {
														
 
															-		ret = -EBUSY;
														
 
															-		goto out;
														
 
															-	}
														
 
															+	if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children))
														
 
															+		return -EBUSY;
														
 
															+
														
 
															 	/* we call try-to-free pages for make this cgroup empty */
														
 
															 	lru_add_drain_all();
														
 
															 	/* try to free all pages in this cgroup */
														
 
															-	shrink = 1;
														
 
															 	while (nr_retries && res_counter_read_u64(&memcg->res, RES_USAGE) > 0) {
														
 
															 		int progress;
														
 
															-		if (signal_pending(current)) {
														
 
															-			ret = -EINTR;
														
 
															-			goto out;
														
 
															-		}
														
 
															+		if (signal_pending(current))
														
 
															+			return -EINTR;
														
 
															+
														
 
															 		progress = try_to_free_mem_cgroup_pages(memcg, GFP_KERNEL,
														
 
															 						false);
														
 
															 		if (!progress) {
														
@@ -3841,13 +3833,23 @@ try_to_free:
 
															 	}
														
 
															 	lru_add_drain();
														
 
															-	/* try move_account...there may be some *locked* pages. */
														
 
															-	goto move_account;
														
 
															+	mem_cgroup_reparent_charges(memcg);
														
 
															+
														
 
															+	return 0;
														
 
															 }
														
 
															 static int mem_cgroup_force_empty_write(struct cgroup *cont, unsigned int event)
														
 
															 {
														
 
															-	return mem_cgroup_force_empty(mem_cgroup_from_cont(cont), true);
														
 
															+	struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
														
 
															+	int ret;
														
 
															+
														
 
															+	if (mem_cgroup_is_root(memcg))
														
 
															+		return -EINVAL;
														
 
															+	css_get(&memcg->css);
														
 
															+	ret = mem_cgroup_force_empty(memcg);
														
 
															+	css_put(&memcg->css);
														
 
															+
														
 
															+	return ret;
														
 
															 }
														
@@ -4953,7 +4955,7 @@ err_cleanup:
 
															 }
														
 
															 static struct cgroup_subsys_state * __ref
														
 
															-mem_cgroup_create(struct cgroup *cont)
														
 
															+mem_cgroup_css_alloc(struct cgroup *cont)
														
 
															 {
														
 
															 	struct mem_cgroup *memcg, *parent;
														
 
															 	long error = -ENOMEM;
														
@@ -5034,14 +5036,14 @@ free_out:
 
															 	return ERR_PTR(error);
														
 
															 }
														
 
															-static int mem_cgroup_pre_destroy(struct cgroup *cont)
														
 
															+static void mem_cgroup_css_offline(struct cgroup *cont)
														
 
															 {
														
 
															 	struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
														
 
															-	return mem_cgroup_force_empty(memcg, false);
														
 
															+	mem_cgroup_reparent_charges(memcg);
														
 
															 }
														
 
															-static void mem_cgroup_destroy(struct cgroup *cont)
														
 
															+static void mem_cgroup_css_free(struct cgroup *cont)
														
 
															 {
														
 
															 	struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
														
@@ -5631,16 +5633,15 @@ static void mem_cgroup_move_task(struct cgroup *cont,
 
															 struct cgroup_subsys mem_cgroup_subsys = {
														
 
															 	.name = "memory",
														
 
															 	.subsys_id = mem_cgroup_subsys_id,
														
 
															-	.create = mem_cgroup_create,
														
 
															-	.pre_destroy = mem_cgroup_pre_destroy,
														
 
															-	.destroy = mem_cgroup_destroy,
														
 
															+	.css_alloc = mem_cgroup_css_alloc,
														
 
															+	.css_offline = mem_cgroup_css_offline,
														
 
															+	.css_free = mem_cgroup_css_free,
														
 
															 	.can_attach = mem_cgroup_can_attach,
														
 
															 	.cancel_attach = mem_cgroup_cancel_attach,
														
 
															 	.attach = mem_cgroup_move_task,
														
 
															 	.base_cftypes = mem_cgroup_files,
														
 
															 	.early_init = 0,
														
 
															 	.use_id = 1,
														
 
															-	.__DEPRECATED_clear_css_refs = true,
														
 
															 };
														
 
															 #ifdef CONFIG_MEMCG_SWAP
														
--- a/net/core/netprio_cgroup.c
+++ b/net/core/netprio_cgroup.c
@@ -27,11 +27,7 @@
 
															 #include <linux/fdtable.h>
														
 
															-#define PRIOIDX_SZ 128
														
 
															-
														
 
															-static unsigned long prioidx_map[PRIOIDX_SZ];
														
 
															-static DEFINE_SPINLOCK(prioidx_map_lock);
														
 
															-static atomic_t max_prioidx = ATOMIC_INIT(0);
														
 
															+#define PRIOMAP_MIN_SZ		128
														
 
															 static inline struct cgroup_netprio_state *cgrp_netprio_state(struct cgroup *cgrp)
														
 
															 {
														
@@ -39,136 +35,157 @@ static inline struct cgroup_netprio_state *cgrp_netprio_state(struct cgroup *cgr
 
															 			    struct cgroup_netprio_state, css);
														
 
															 }
														
 
															-static int get_prioidx(u32 *prio)
														
 
															-{
														
 
															-	unsigned long flags;
														
 
															-	u32 prioidx;
														
 
															-
														
 
															-	spin_lock_irqsave(&prioidx_map_lock, flags);
														
 
															-	prioidx = find_first_zero_bit(prioidx_map, sizeof(unsigned long) * PRIOIDX_SZ);
														
 
															-	if (prioidx == sizeof(unsigned long) * PRIOIDX_SZ) {
														
 
															-		spin_unlock_irqrestore(&prioidx_map_lock, flags);
														
 
															-		return -ENOSPC;
														
 
															-	}
														
 
															-	set_bit(prioidx, prioidx_map);
														
 
															-	if (atomic_read(&max_prioidx) < prioidx)
														
 
															-		atomic_set(&max_prioidx, prioidx);
														
 
															-	spin_unlock_irqrestore(&prioidx_map_lock, flags);
														
 
															-	*prio = prioidx;
														
 
															-	return 0;
														
 
															-}
														
 
															-
														
 
															-static void put_prioidx(u32 idx)
														
 
															+/*
														
 
															+ * Extend @dev->priomap so that it's large enough to accommodate
														
 
															+ * @target_idx.  @dev->priomap.priomap_len > @target_idx after successful
														
 
															+ * return.  Must be called under rtnl lock.
														
 
															+ */
														
 
															+static int extend_netdev_table(struct net_device *dev, u32 target_idx)
														
 
															 {
														
 
															-	unsigned long flags;
														
 
															-
														
 
															-	spin_lock_irqsave(&prioidx_map_lock, flags);
														
 
															-	clear_bit(idx, prioidx_map);
														
 
															-	spin_unlock_irqrestore(&prioidx_map_lock, flags);
														
 
															-}
														
 
															+	struct netprio_map *old, *new;
														
 
															+	size_t new_sz, new_len;
														
 
															-static int extend_netdev_table(struct net_device *dev, u32 new_len)
														
 
															-{
														
 
															-	size_t new_size = sizeof(struct netprio_map) +
														
 
															-			   ((sizeof(u32) * new_len));
														
 
															-	struct netprio_map *new_priomap = kzalloc(new_size, GFP_KERNEL);
														
 
															-	struct netprio_map *old_priomap;
														
 
															+	/* is the existing priomap large enough? */
														
 
															+	old = rtnl_dereference(dev->priomap);
														
 
															+	if (old && old->priomap_len > target_idx)
														
 
															+		return 0;
														
 
															-	old_priomap  = rtnl_dereference(dev->priomap);
														
 
															+	/*
														
 
															+	 * Determine the new size.  Let's keep it power-of-two.  We start
														
 
															+	 * from PRIOMAP_MIN_SZ and double it until it's large enough to
														
 
															+	 * accommodate @target_idx.
														
 
															+	 */
														
 
															+	new_sz = PRIOMAP_MIN_SZ;
														
 
															+	while (true) {
														
 
															+		new_len = (new_sz - offsetof(struct netprio_map, priomap)) /
														
 
															+			sizeof(new->priomap[0]);
														
 
															+		if (new_len > target_idx)
														
 
															+			break;
														
 
															+		new_sz *= 2;
														
 
															+		/* overflowed? */
														
 
															+		if (WARN_ON(new_sz < PRIOMAP_MIN_SZ))
														
 
															+			return -ENOSPC;
														
 
															+	}
														
 
															-	if (!new_priomap) {
														
 
															+	/* allocate & copy */
														
 
															+	new = kzalloc(new_sz, GFP_KERNEL);
														
 
															+	if (!new) {
														
 
															 		pr_warn("Unable to alloc new priomap!\n");
														
 
															 		return -ENOMEM;
														
 
															 	}
														
 
															-	if (old_priomap)
														
 
															-		memcpy(new_priomap->priomap, old_priomap->priomap,
														
 
															-		       old_priomap->priomap_len *
														
 
															-		       sizeof(old_priomap->priomap[0]));
														
 
															+	if (old)
														
 
															+		memcpy(new->priomap, old->priomap,
														
 
															+		       old->priomap_len * sizeof(old->priomap[0]));
														
 
															-	new_priomap->priomap_len = new_len;
														
 
															+	new->priomap_len = new_len;
														
 
															-	rcu_assign_pointer(dev->priomap, new_priomap);
														
 
															-	if (old_priomap)
														
 
															-		kfree_rcu(old_priomap, rcu);
														
 
															+	/* install the new priomap */
														
 
															+	rcu_assign_pointer(dev->priomap, new);
														
 
															+	if (old)
														
 
															+		kfree_rcu(old, rcu);
														
 
															 	return 0;
														
 
															 }
														
 
															-static int write_update_netdev_table(struct net_device *dev)
														
 
															+/**
														
 
															+ * netprio_prio - return the effective netprio of a cgroup-net_device pair
														
 
															+ * @cgrp: cgroup part of the target pair
														
 
															+ * @dev: net_device part of the target pair
														
 
															+ *
														
 
															+ * Should be called under RCU read or rtnl lock.
														
 
															+ */
														
 
															+static u32 netprio_prio(struct cgroup *cgrp, struct net_device *dev)
														
 
															+{
														
 
															+	struct netprio_map *map = rcu_dereference_rtnl(dev->priomap);
														
 
															+
														
 
															+	if (map && cgrp->id < map->priomap_len)
														
 
															+		return map->priomap[cgrp->id];
														
 
															+	return 0;
														
 
															+}
														
 
															+
														
 
															+/**
														
 
															+ * netprio_set_prio - set netprio on a cgroup-net_device pair
														
 
															+ * @cgrp: cgroup part of the target pair
														
 
															+ * @dev: net_device part of the target pair
														
 
															+ * @prio: prio to set
														
 
															+ *
														
 
															+ * Set netprio to @prio on @cgrp-@dev pair.  Should be called under rtnl
														
 
															+ * lock and may fail under memory pressure for non-zero @prio.
														
 
															+ */
														
 
															+static int netprio_set_prio(struct cgroup *cgrp, struct net_device *dev,
														
 
															+			    u32 prio)
														
 
															 {
														
 
															-	int ret = 0;
														
 
															-	u32 max_len;
														
 
															 	struct netprio_map *map;
														
 
															+	int ret;
														
 
															-	max_len = atomic_read(&max_prioidx) + 1;
														
 
															+	/* avoid extending priomap for zero writes */
														
 
															 	map = rtnl_dereference(dev->priomap);
														
 
															-	if (!map || map->priomap_len < max_len)
														
 
															-		ret = extend_netdev_table(dev, max_len);
														
 
															+	if (!prio && (!map || map->priomap_len <= cgrp->id))
														
 
															+		return 0;
														
 
															-	return ret;
														
 
															+	ret = extend_netdev_table(dev, cgrp->id);
														
 
															+	if (ret)
														
 
															+		return ret;
														
 
															+
														
 
															+	map = rtnl_dereference(dev->priomap);
														
 
															+	map->priomap[cgrp->id] = prio;
														
 
															+	return 0;
														
 
															 }
														
 
															-static struct cgroup_subsys_state *cgrp_create(struct cgroup *cgrp)
														
 
															+static struct cgroup_subsys_state *cgrp_css_alloc(struct cgroup *cgrp)
														
 
															 {
														
 
															 	struct cgroup_netprio_state *cs;
														
 
															-	int ret = -EINVAL;
														
 
															 	cs = kzalloc(sizeof(*cs), GFP_KERNEL);
														
 
															 	if (!cs)
														
 
															 		return ERR_PTR(-ENOMEM);
														
 
															-	if (cgrp->parent && cgrp_netprio_state(cgrp->parent)->prioidx)
														
 
															-		goto out;
														
 
															-
														
 
															-	ret = get_prioidx(&cs->prioidx);
														
 
															-	if (ret < 0) {
														
 
															-		pr_warn("No space in priority index array\n");
														
 
															-		goto out;
														
 
															-	}
														
 
															-
														
 
															 	return &cs->css;
														
 
															-out:
														
 
															-	kfree(cs);
														
 
															-	return ERR_PTR(ret);
														
 
															 }
														
 
															-static void cgrp_destroy(struct cgroup *cgrp)
														
 
															+static int cgrp_css_online(struct cgroup *cgrp)
														
 
															 {
														
 
															-	struct cgroup_netprio_state *cs;
														
 
															+	struct cgroup *parent = cgrp->parent;
														
 
															 	struct net_device *dev;
														
 
															-	struct netprio_map *map;
														
 
															+	int ret = 0;
														
 
															+
														
 
															+	if (!parent)
														
 
															+		return 0;
														
 
															-	cs = cgrp_netprio_state(cgrp);
														
 
															 	rtnl_lock();
														
 
															+	/*
														
 
															+	 * Inherit prios from the parent.  As all prios are set during
														
 
															+	 * onlining, there is no need to clear them on offline.
														
 
															+	 */
														
 
															 	for_each_netdev(&init_net, dev) {
														
 
															-		map = rtnl_dereference(dev->priomap);
														
 
															-		if (map && cs->prioidx < map->priomap_len)
														
 
															-			map->priomap[cs->prioidx] = 0;
														
 
															+		u32 prio = netprio_prio(parent, dev);
														
 
															+
														
 
															+		ret = netprio_set_prio(cgrp, dev, prio);
														
 
															+		if (ret)
														
 
															+			break;
														
 
															 	}
														
 
															 	rtnl_unlock();
														
 
															-	put_prioidx(cs->prioidx);
														
 
															-	kfree(cs);
														
 
															+	return ret;
														
 
															+}
														
 
															+
														
 
															+static void cgrp_css_free(struct cgroup *cgrp)
														
 
															+{
														
 
															+	kfree(cgrp_netprio_state(cgrp));
														
 
															 }
														
 
															 static u64 read_prioidx(struct cgroup *cgrp, struct cftype *cft)
														
 
															 {
														
 
															-	return (u64)cgrp_netprio_state(cgrp)->prioidx;
														
 
															+	return cgrp->id;
														
 
															 }
														
 
															 static int read_priomap(struct cgroup *cont, struct cftype *cft,
														
 
															 			struct cgroup_map_cb *cb)
														
 
															 {
														
 
															 	struct net_device *dev;
														
 
															-	u32 prioidx = cgrp_netprio_state(cont)->prioidx;
														
 
															-	u32 priority;
														
 
															-	struct netprio_map *map;
														
 
															 	rcu_read_lock();
														
 
															-	for_each_netdev_rcu(&init_net, dev) {
														
 
															-		map = rcu_dereference(dev->priomap);
														
 
															-		priority = (map && prioidx < map->priomap_len) ? map->priomap[prioidx] : 0;
														
 
															-		cb->fill(cb, dev->name, priority);
														
 
															-	}
														
 
															+	for_each_netdev_rcu(&init_net, dev)
														
 
															+		cb->fill(cb, dev->name, netprio_prio(cont, dev));
														
 
															 	rcu_read_unlock();
														
 
															 	return 0;
														
 
															 }
														
@@ -176,66 +193,24 @@ static int read_priomap(struct cgroup *cont, struct cftype *cft,
 
															 static int write_priomap(struct cgroup *cgrp, struct cftype *cft,
														
 
															 			 const char *buffer)
														
 
															 {
														
 
															-	char *devname = kstrdup(buffer, GFP_KERNEL);
														
 
															-	int ret = -EINVAL;
														
 
															-	u32 prioidx = cgrp_netprio_state(cgrp)->prioidx;
														
 
															-	unsigned long priority;
														
 
															-	char *priostr;
														
 
															+	char devname[IFNAMSIZ + 1];
														
 
															 	struct net_device *dev;
														
 
															-	struct netprio_map *map;
														
 
															-
														
 
															-	if (!devname)
														
 
															-		return -ENOMEM;
														
 
															-
														
 
															-	/*
														
 
															-	 * Minimally sized valid priomap string
														
 
															-	 */
														
 
															-	if (strlen(devname) < 3)
														
 
															-		goto out_free_devname;
														
 
															-
														
 
															-	priostr = strstr(devname, " ");
														
 
															-	if (!priostr)
														
 
															-		goto out_free_devname;
														
 
															-
														
 
															-	/*
														
 
															-	 *Separate the devname from the associated priority
														
 
															-	 *and advance the priostr pointer to the priority value
														
 
															-	 */
														
 
															-	*priostr = '\0';
														
 
															-	priostr++;
														
 
															-
														
 
															-	/*
														
 
															-	 * If the priostr points to NULL, we're at the end of the passed
														
 
															-	 * in string, and its not a valid write
														
 
															-	 */
														
 
															-	if (*priostr == '\0')
														
 
															-		goto out_free_devname;
														
 
															-
														
 
															-	ret = kstrtoul(priostr, 10, &priority);
														
 
															-	if (ret < 0)
														
 
															-		goto out_free_devname;
														
 
															+	u32 prio;
														
 
															+	int ret;
														
 
															-	ret = -ENODEV;
														
 
															+	if (sscanf(buffer, "%"__stringify(IFNAMSIZ)"s %u", devname, &prio) != 2)
														
 
															+		return -EINVAL;
														
 
															 	dev = dev_get_by_name(&init_net, devname);
														
 
															 	if (!dev)
														
 
															-		goto out_free_devname;
														
 
															+		return -ENODEV;
														
 
															 	rtnl_lock();
														
 
															-	ret = write_update_netdev_table(dev);
														
 
															-	if (ret < 0)
														
 
															-		goto out_put_dev;
														
 
															-	map = rtnl_dereference(dev->priomap);
														
 
															-	if (map)
														
 
															-		map->priomap[prioidx] = priority;
														
 
															+	ret = netprio_set_prio(cgrp, dev, prio);
														
 
															-out_put_dev:
														
 
															 	rtnl_unlock();
														
 
															 	dev_put(dev);
														
 
															-
														
 
															-out_free_devname:
														
 
															-	kfree(devname);
														
 
															 	return ret;
														
 
															 }
														
@@ -276,22 +251,13 @@ static struct cftype ss_files[] = {
 
															 struct cgroup_subsys net_prio_subsys = {
														
 
															 	.name		= "net_prio",
														
 
															-	.create		= cgrp_create,
														
 
															-	.destroy	= cgrp_destroy,
														
 
															+	.css_alloc	= cgrp_css_alloc,
														
 
															+	.css_online	= cgrp_css_online,
														
 
															+	.css_free	= cgrp_css_free,
														
 
															 	.attach		= net_prio_attach,
														
 
															 	.subsys_id	= net_prio_subsys_id,
														
 
															 	.base_cftypes	= ss_files,
														
 
															 	.module		= THIS_MODULE,
														
 
															-
														
 
															-	/*
														
 
															-	 * net_prio has artificial limit on the number of cgroups and
														
 
															-	 * disallows nesting making it impossible to co-mount it with other
														
 
															-	 * hierarchical subsystems.  Remove the artificially low PRIOIDX_SZ
														
 
															-	 * limit and properly nest configuration such that children follow
														
 
															-	 * their parents' configurations by default and are allowed to
														
 
															-	 * override and remove the following.
														
 
															-	 */
														
 
															-	.broken_hierarchy = true,
														
 
															 };
														
 
															 static int netprio_device_event(struct notifier_block *unused,
														
--- a/net/sched/cls_cgroup.c
+++ b/net/sched/cls_cgroup.c
@@ -34,21 +34,25 @@ static inline struct cgroup_cls_state *task_cls_state(struct task_struct *p)
 
															 			    struct cgroup_cls_state, css);
														
 
															 }
														
 
															-static struct cgroup_subsys_state *cgrp_create(struct cgroup *cgrp)
														
 
															+static struct cgroup_subsys_state *cgrp_css_alloc(struct cgroup *cgrp)
														
 
															 {
														
 
															 	struct cgroup_cls_state *cs;
														
 
															 	cs = kzalloc(sizeof(*cs), GFP_KERNEL);
														
 
															 	if (!cs)
														
 
															 		return ERR_PTR(-ENOMEM);
														
 
															+	return &cs->css;
														
 
															+}
														
 
															+static int cgrp_css_online(struct cgroup *cgrp)
														
 
															+{
														
 
															 	if (cgrp->parent)
														
 
															-		cs->classid = cgrp_cls_state(cgrp->parent)->classid;
														
 
															-
														
 
															-	return &cs->css;
														
 
															+		cgrp_cls_state(cgrp)->classid =
														
 
															+			cgrp_cls_state(cgrp->parent)->classid;
														
 
															+	return 0;
														
 
															 }
														
 
															-static void cgrp_destroy(struct cgroup *cgrp)
														
 
															+static void cgrp_css_free(struct cgroup *cgrp)
														
 
															 {
														
 
															 	kfree(cgrp_cls_state(cgrp));
														
 
															 }
														
@@ -75,20 +79,12 @@ static struct cftype ss_files[] = {
 
															 struct cgroup_subsys net_cls_subsys = {
														
 
															 	.name		= "net_cls",
														
 
															-	.create		= cgrp_create,
														
 
															-	.destroy	= cgrp_destroy,
														
 
															+	.css_alloc	= cgrp_css_alloc,
														
 
															+	.css_online	= cgrp_css_online,
														
 
															+	.css_free	= cgrp_css_free,
														
 
															 	.subsys_id	= net_cls_subsys_id,
														
 
															 	.base_cftypes	= ss_files,
														
 
															 	.module		= THIS_MODULE,
														
 
															-
														
 
															-	/*
														
 
															-	 * While net_cls cgroup has the rudimentary hierarchy support of
														
 
															-	 * inheriting the parent's classid on cgroup creation, it doesn't
														
 
															-	 * properly propagates config changes in ancestors to their
														
 
															-	 * descendents.  A child should follow the parent's configuration
														
 
															-	 * but be allowed to override it.  Fix it and remove the following.
														
 
															-	 */
														
 
															-	.broken_hierarchy = true,
														
 
															 };
														
 
															 struct cls_cgroup_head {
														
--- a/security/device_cgroup.c
+++ b/security/device_cgroup.c
@@ -82,6 +82,8 @@ static int dev_exceptions_copy(struct list_head *dest, struct list_head *orig)
 
															 {
														
 
															 	struct dev_exception_item *ex, *tmp, *new;
														
 
															+	lockdep_assert_held(&devcgroup_mutex);
														
 
															+
														
 
															 	list_for_each_entry(ex, orig, list) {
														
 
															 		new = kmemdup(ex, sizeof(*ex), GFP_KERNEL);
														
 
															 		if (!new)
														
@@ -107,6 +109,8 @@ static int dev_exception_add(struct dev_cgroup *dev_cgroup,
 
															 {
														
 
															 	struct dev_exception_item *excopy, *walk;
														
 
															+	lockdep_assert_held(&devcgroup_mutex);
														
 
															+
														
 
															 	excopy = kmemdup(ex, sizeof(*ex), GFP_KERNEL);
														
 
															 	if (!excopy)
														
 
															 		return -ENOMEM;
														
@@ -137,6 +141,8 @@ static void dev_exception_rm(struct dev_cgroup *dev_cgroup,
 
															 {
														
 
															 	struct dev_exception_item *walk, *tmp;
														
 
															+	lockdep_assert_held(&devcgroup_mutex);
														
 
															+
														
 
															 	list_for_each_entry_safe(walk, tmp, &dev_cgroup->exceptions, list) {
														
 
															 		if (walk->type != ex->type)
														
 
															 			continue;
														
@@ -163,6 +169,8 @@ static void dev_exception_clean(struct dev_cgroup *dev_cgroup)
 
															 {
														
 
															 	struct dev_exception_item *ex, *tmp;
														
 
															+	lockdep_assert_held(&devcgroup_mutex);
														
 
															+
														
 
															 	list_for_each_entry_safe(ex, tmp, &dev_cgroup->exceptions, list) {
														
 
															 		list_del_rcu(&ex->list);
														
 
															 		kfree_rcu(ex, rcu);
														
@@ -172,7 +180,7 @@ static void dev_exception_clean(struct dev_cgroup *dev_cgroup)
 
															 /*
														
 
															  * called from kernel/cgroup.c with cgroup_lock() held.
														
 
															  */
														
 
															-static struct cgroup_subsys_state *devcgroup_create(struct cgroup *cgroup)
														
 
															+static struct cgroup_subsys_state *devcgroup_css_alloc(struct cgroup *cgroup)
														
 
															 {
														
 
															 	struct dev_cgroup *dev_cgroup, *parent_dev_cgroup;
														
 
															 	struct cgroup *parent_cgroup;
														
@@ -202,7 +210,7 @@ static struct cgroup_subsys_state *devcgroup_create(struct cgroup *cgroup)
 
															 	return &dev_cgroup->css;
														
 
															 }
														
 
															-static void devcgroup_destroy(struct cgroup *cgroup)
														
 
															+static void devcgroup_css_free(struct cgroup *cgroup)
														
 
															 {
														
 
															 	struct dev_cgroup *dev_cgroup;
														
@@ -298,6 +306,10 @@ static int may_access(struct dev_cgroup *dev_cgroup,
 
															 	struct dev_exception_item *ex;
														
 
															 	bool match = false;
														
 
															+	rcu_lockdep_assert(rcu_read_lock_held() ||
														
 
															+			   lockdep_is_held(&devcgroup_mutex),
														
 
															+			   "device_cgroup::may_access() called without proper synchronization");
														
 
															+
														
 
															 	list_for_each_entry_rcu(ex, &dev_cgroup->exceptions, list) {
														
 
															 		if ((refex->type & DEV_BLOCK) && !(ex->type & DEV_BLOCK))
														
 
															 			continue;
														
@@ -552,8 +564,8 @@ static struct cftype dev_cgroup_files[] = {
 
															 struct cgroup_subsys devices_subsys = {
														
 
															 	.name = "devices",
														
 
															 	.can_attach = devcgroup_can_attach,
														
 
															-	.create = devcgroup_create,
														
 
															-	.destroy = devcgroup_destroy,
														
 
															+	.css_alloc = devcgroup_css_alloc,
														
 
															+	.css_free = devcgroup_css_free,
														
 
															 	.subsys_id = devices_subsys_id,
														
 
															 	.base_cftypes = dev_cgroup_files,