8 years ago · f7878dc3a9
--- a/Documentation/cgroup-v1/rdma.txt
+++ b/Documentation/cgroup-v1/rdma.txt
@@ -0,0 +1,109 @@
 
				+				RDMA Controller
			
 
				+				----------------
			
 
				+
			
 
				+Contents
			
 
				+--------
			
 
				+
			
 
				+1. Overview
			
 
				+  1-1. What is RDMA controller?
			
 
				+  1-2. Why RDMA controller needed?
			
 
				+  1-3. How is RDMA controller implemented?
			
 
				+2. Usage Examples
			
 
				+
			
 
				+1. Overview
			
 
				+
			
 
				+1-1. What is RDMA controller?
			
 
				+-----------------------------
			
 
				+
			
 
				+RDMA controller allows user to limit RDMA/IB specific resources that a given
			
 
				+set of processes can use. These processes are grouped using RDMA controller.
			
 
				+
			
 
				+RDMA controller defines two resources which can be limited for processes of a
			
 
				+cgroup.
			
 
				+
			
 
				+1-2. Why RDMA controller needed?
			
 
				+--------------------------------
			
 
				+
			
 
				+Currently user space applications can easily take away all the rdma verb
			
 
				+specific resources such as AH, CQ, QP, MR etc. As a result, other applications
			
 
				+in other cgroup or kernel space ULPs may not even get chance to allocate any
			
 
				+rdma resources. This can lead to service unavailability.
			
 
				+
			
 
				+Therefore RDMA controller is needed through which resource consumption
			
 
				+of processes can be limited. Through this controller different rdma
			
 
				+resources can be accounted.
			
 
				+
			
 
				+1-3. How is RDMA controller implemented?
			
 
				+----------------------------------------
			
 
				+
			
 
				+RDMA cgroup allows limit configuration of resources. Rdma cgroup maintains
			
 
				+resource accounting per cgroup, per device using resource pool structure.
			
 
				+Each such resource pool is currently limited to 64 resources
			
 
				+by rdma cgroup, which can be extended later if required.
			
 
				+
			
 
				+This resource pool object is linked to the cgroup css. Typically there
			
 
				+are 0 to 4 resource pool instances per cgroup, per device in most use cases.
			
 
				+But nothing prevents having more. At present hundreds of RDMA devices per
			
 
				+single cgroup may not be handled optimally, however there is no
			
 
				+known use case or requirement for such configuration either.
			
 
				+
			
 
				+Since RDMA resources can be allocated from any process and can be freed by any
			
 
				+of the child processes which shares the address space, rdma resources are
			
 
				+always owned by the creator cgroup css. This allows process migration from one
			
 
				+to other cgroup without major complexity of transferring resource ownership;
			
 
				+because such ownership is not really present due to shared nature of
			
 
				+rdma resources. Linking resources around css also ensures that cgroups can be
			
 
				+deleted after processes migrated. This allows process migration as well with
			
 
				+active resources, even though that is not a primary use case.
			
 
				+
			
 
				+Whenever RDMA resource charging occurs, owner rdma cgroup is returned to
			
 
				+the caller. Same rdma cgroup should be passed while uncharging the resource.
			
 
				+This also allows process migrated with active RDMA resource to charge
			
 
				+to new owner cgroup for new resource. It also allows to uncharge resource of
			
 
				+a process from previously charged cgroup which is migrated to new cgroup,
			
 
				+even though that is not a primary use case.
			
 
				+
			
 
				+Resource pool object is created in following situations.
			
 
				+(a) User sets the limit and no previous resource pool exists for the device
			
 
				+of interest for the cgroup.
			
 
				+(b) No resource limits were configured, but IB/RDMA stack tries to
			
 
				+charge the resource. The pool is needed so that resources charged while
			
 
				+applications run without limits can still be uncharged correctly once limits
			
 
				+are enforced later; otherwise usage count will drop to negative.
			
 
				+
			
 
				+Resource pool is destroyed if all the resource limits are set to max and
			
 
				+it is the last resource getting deallocated.
			
 
				+
			
 
				+User should set all the limits to max value if it intends to remove/unconfigure
			
 
				+the resource pool for a particular device.
			
 
				+
			
 
				+IB stack honors limits enforced by the rdma controller. When application
			
 
				+queries maximum resource limits of IB device, it returns minimum of
			
 
				+what is configured by user for a given cgroup and what is supported by
			
 
				+IB device.
			
 
				+
			
 
				+Following resources can be accounted by rdma controller.
			
 
				+  hca_handle	Maximum number of HCA Handles
			
 
				+  hca_object 	Maximum number of HCA Objects
			
 
				+
			
 
				+2. Usage Examples
			
 
				+-----------------
			
 
				+
			
 
				+(a) Configure resource limit:
			
 
				+echo mlx4_0 hca_handle=2 hca_object=2000 > /sys/fs/cgroup/rdma/1/rdma.max
			
 
				+echo ocrdma1 hca_handle=3 > /sys/fs/cgroup/rdma/2/rdma.max
			
 
				+
			
 
				+(b) Query resource limit:
			
 
				+cat /sys/fs/cgroup/rdma/2/rdma.max
			
 
				+#Output:
			
 
				+mlx4_0 hca_handle=2 hca_object=2000
			
 
				+ocrdma1 hca_handle=3 hca_object=max
			
 
				+
			
 
				+(c) Query current usage:
			
 
				+cat /sys/fs/cgroup/rdma/2/rdma.current
			
 
				+#Output:
			
 
				+mlx4_0 hca_handle=1 hca_object=20
			
 
				+ocrdma1 hca_handle=1 hca_object=23
			
 
				+
			
 
				+(d) Delete resource limit:
			
 
				+echo mlx4_0 hca_handle=max hca_object=max > /sys/fs/cgroup/rdma/1/rdma.max
			
--- a/Documentation/cgroup-v2.txt
+++ b/Documentation/cgroup-v2.txt
@@ -47,6 +47,12 @@ CONTENTS
 
				   5-3. IO
			
 
				     5-3-1. IO Interface Files
			
 
				     5-3-2. Writeback
			
 
				+  5-4. PID
			
 
				+    5-4-1. PID Interface Files
			
 
				+  5-5. RDMA
			
 
				+    5-5-1. RDMA Interface Files
			
 
				+  5-6. Misc
			
 
				+    5-6-1. perf_event
			
 
				 6. Namespace
			
 
				   6-1. Basics
			
 
				   6-2. The Root and Views
			
@@ -328,14 +334,12 @@ a process with a non-root euid to migrate a target process into a
 
				 cgroup by writing its PID to the "cgroup.procs" file, the following
			
 
				 conditions must be met.
			
 
				 
			
 
				-- The writer's euid must match either uid or suid of the target process.
			
 
				-
			
 
				 - The writer must have write access to the "cgroup.procs" file.
			
 
				 
			
 
				 - The writer must have write access to the "cgroup.procs" file of the
			
 
				   common ancestor of the source and destination cgroups.
			
 
				 
			
 
				-The above three constraints ensure that while a delegatee may migrate
			
 
				+The above two constraints ensure that while a delegatee may migrate
			
 
				 processes around freely in the delegated sub-hierarchy it can't pull
			
 
				 in from or push out to outside the sub-hierarchy.
			
 
				 
			
@@ -350,10 +354,10 @@ all processes under C0 and C1 belong to U0.
 
				 
			
 
				 Let's also say U0 wants to write the PID of a process which is
			
 
				 currently in C10 into "C00/cgroup.procs".  U0 has write access to the
			
 
				-file and uid match on the process; however, the common ancestor of the
			
 
				-source cgroup C10 and the destination cgroup C00 is above the points
			
 
				-of delegation and U0 would not have write access to its "cgroup.procs"
			
 
				-files and thus the write will be denied with -EACCES.
			
 
				+file; however, the common ancestor of the source cgroup C10 and the
			
 
				+destination cgroup C00 is above the points of delegation and U0 would
			
 
				+not have write access to its "cgroup.procs" files and thus the write
			
 
				+will be denied with -EACCES.
			
 
				 
			
 
				 
			
 
				 2-6. Guidelines
			
@@ -1119,6 +1123,91 @@ writeback as follows.
 
				 	vm.dirty[_background]_ratio.
			
 
				 
			
 
				 
			
 
				+5-4. PID
			
 
				+
			
 
				+The process number controller is used to allow a cgroup to stop any
			
 
				+new tasks from being fork()'d or clone()'d after a specified limit is
			
 
				+reached.
			
 
				+
			
 
				+The number of tasks in a cgroup can be exhausted in ways which other
			
 
				+controllers cannot prevent, thus warranting its own controller.  For
			
 
				+example, a fork bomb is likely to exhaust the number of tasks before
			
 
				+hitting memory restrictions.
			
 
				+
			
 
				+Note that PIDs used in this controller refer to TIDs, process IDs as
			
 
				+used by the kernel.
			
 
				+
			
 
				+
			
 
				+5-4-1. PID Interface Files
			
 
				+
			
 
				+  pids.max
			
 
				+
			
 
				+ A read-write single value file which exists on non-root cgroups.  The
			
 
				+ default is "max".
			
 
				+
			
 
				+ Hard limit of number of processes.
			
 
				+
			
 
				+  pids.current
			
 
				+
			
 
				+ A read-only single value file which exists on all cgroups.
			
 
				+
			
 
				+ The number of processes currently in the cgroup and its descendants.
			
 
				+
			
 
				+Organisational operations are not blocked by cgroup policies, so it is
			
 
				+possible to have pids.current > pids.max.  This can be done by either
			
 
				+setting the limit to be smaller than pids.current, or attaching enough
			
 
				+processes to the cgroup such that pids.current is larger than
			
 
				+pids.max.  However, it is not possible to violate a cgroup PID policy
			
 
				+through fork() or clone(). These will return -EAGAIN if the creation
			
 
				+of a new process would cause a cgroup policy to be violated.
			
 
				+
			
 
				+
			
 
				+5-5. RDMA
			
 
				+
			
 
				+The "rdma" controller regulates the distribution and accounting of
			
 
				+RDMA resources.
			
 
				+
			
 
				+5-5-1. RDMA Interface Files
			
 
				+
			
 
				+  rdma.max
			
 
				+	A read-write nested-keyed file that exists for all the cgroups
			
 
				+	except root that describes current configured resource limit
			
 
				+	for a RDMA/IB device.
			
 
				+
			
 
				+	Lines are keyed by device name and are not ordered.
			
 
				+	Each line contains space separated resource name and its configured
			
 
				+	limit that can be distributed.
			
 
				+
			
 
				+	The following nested keys are defined.
			
 
				+
			
 
				+	  hca_handle	Maximum number of HCA Handles
			
 
				+	  hca_object 	Maximum number of HCA Objects
			
 
				+
			
 
				+	An example for mlx4 and ocrdma device follows.
			
 
				+
			
 
				+	  mlx4_0 hca_handle=2 hca_object=2000
			
 
				+	  ocrdma1 hca_handle=3 hca_object=max
			
 
				+
			
 
				+  rdma.current
			
 
				+	A read-only file that describes current resource usage.
			
 
				+	It exists for all the cgroups except root.
			
 
				+
			
 
				+	An example for mlx4 and ocrdma device follows.
			
 
				+
			
 
				+	  mlx4_0 hca_handle=1 hca_object=20
			
 
				+	  ocrdma1 hca_handle=1 hca_object=23
			
 
				+
			
 
				+
			
 
				+5-6. Misc
			
 
				+
			
 
				+5-6-1. perf_event
			
 
				+
			
 
				+perf_event controller, if not mounted on a legacy hierarchy, is
			
 
				+automatically enabled on the v2 hierarchy so that perf events can
			
 
				+always be filtered by cgroup v2 path.  The controller can still be
			
 
				+moved to a legacy hierarchy after v2 hierarchy is populated.
			
 
				+
			
 
				+
			
 
				 6. Namespace
			
 
				 
			
 
				 6-1. Basics
			
--- a/drivers/infiniband/core/Makefile
+++ b/drivers/infiniband/core/Makefile
@@ -13,6 +13,7 @@ ib_core-y :=			packer.o ud_header.o verbs.o cq.o rw.o sysfs.o \
 
				 				multicast.o mad.o smi.o agent.o mad_rmpp.o
			
 
				 ib_core-$(CONFIG_INFINIBAND_USER_MEM) += umem.o
			
 
				 ib_core-$(CONFIG_INFINIBAND_ON_DEMAND_PAGING) += umem_odp.o umem_rbtree.o
			
 
				+ib_core-$(CONFIG_CGROUP_RDMA) += cgroup.o
			
 
				 
			
 
				 ib_cm-y :=			cm.o
			
 
				 
			
--- a/drivers/infiniband/core/cgroup.c
+++ b/drivers/infiniband/core/cgroup.c
@@ -0,0 +1,62 @@
 
				+/*
			
 
				+ * Copyright (C) 2016 Parav Pandit <pandit.parav@gmail.com>
			
 
				+ *
			
 
				+ * This program is free software; you can redistribute it and/or modify it
			
 
				+ * under the terms and conditions of the GNU General Public License,
			
 
				+ * version 2, as published by the Free Software Foundation.
			
 
				+ *
			
 
				+ * This program is distributed in the hope it will be useful, but WITHOUT
			
 
				+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
			
 
				+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
			
 
				+ * more details.
			
 
				+ */
			
 
				+
			
 
				+#include "core_priv.h"
			
 
				+
			
 
				+/**
			
 
				+ * ib_device_register_rdmacg - register with rdma cgroup.
			
 
				+ * @device: device to register to participate in resource
			
 
				+ *          accounting by rdma cgroup.
			
 
				+ *
			
 
				+ * Register with the rdma cgroup. Should be called before
			
 
				+ * exposing rdma device to user space applications to avoid
			
 
				+ * resource accounting leak.
			
 
				+ * Returns 0 on success or otherwise failure code.
			
 
				+ */
			
 
				+int ib_device_register_rdmacg(struct ib_device *device)
			
 
				+{
			
 
				+	device->cg_device.name = device->name;
			
 
				+	return rdmacg_register_device(&device->cg_device);
			
 
				+}
			
 
				+
			
 
				+/**
			
 
				+ * ib_device_unregister_rdmacg - unregister with rdma cgroup.
			
 
				+ * @device: device to unregister.
			
 
				+ *
			
 
				+ * Unregister with the rdma cgroup. Should be called after
			
 
				+ * all the resources are deallocated, and after a stage when any
			
 
				+ * other resource allocation by user application cannot be done
			
 
				+ * for this device to avoid any leak in accounting.
			
 
				+ */
			
 
				+void ib_device_unregister_rdmacg(struct ib_device *device)
			
 
				+{
			
 
				+	rdmacg_unregister_device(&device->cg_device);
			
 
				+}
			
 
				+
			
 
				+int ib_rdmacg_try_charge(struct ib_rdmacg_object *cg_obj,
			
 
				+			 struct ib_device *device,
			
 
				+			 enum rdmacg_resource_type resource_index)
			
 
				+{
			
 
				+	return rdmacg_try_charge(&cg_obj->cg, &device->cg_device,
			
 
				+				 resource_index);
			
 
				+}
			
 
				+EXPORT_SYMBOL(ib_rdmacg_try_charge);
			
 
				+
			
 
				+void ib_rdmacg_uncharge(struct ib_rdmacg_object *cg_obj,
			
 
				+			struct ib_device *device,
			
 
				+			enum rdmacg_resource_type resource_index)
			
 
				+{
			
 
				+	rdmacg_uncharge(cg_obj->cg, &device->cg_device,
			
 
				+			resource_index);
			
 
				+}
			
 
				+EXPORT_SYMBOL(ib_rdmacg_uncharge);
			
--- a/drivers/infiniband/core/core_priv.h
+++ b/drivers/infiniband/core/core_priv.h
@@ -35,6 +35,7 @@
 
				 
			
 
				 #include <linux/list.h>
			
 
				 #include <linux/spinlock.h>
			
 
				+#include <linux/cgroup_rdma.h>
			
 
				 
			
 
				 #include <rdma/ib_verbs.h>
			
 
				 
			
@@ -124,6 +125,35 @@ int ib_cache_setup_one(struct ib_device *device);
 
				 void ib_cache_cleanup_one(struct ib_device *device);
			
 
				 void ib_cache_release_one(struct ib_device *device);
			
 
				 
			
 
				+#ifdef CONFIG_CGROUP_RDMA
			
 
				+int ib_device_register_rdmacg(struct ib_device *device);
			
 
				+void ib_device_unregister_rdmacg(struct ib_device *device);
			
 
				+
			
 
				+int ib_rdmacg_try_charge(struct ib_rdmacg_object *cg_obj,
			
 
				+			 struct ib_device *device,
			
 
				+			 enum rdmacg_resource_type resource_index);
			
 
				+
			
 
				+void ib_rdmacg_uncharge(struct ib_rdmacg_object *cg_obj,
			
 
				+			struct ib_device *device,
			
 
				+			enum rdmacg_resource_type resource_index);
			
 
				+#else
			
 
				+static inline int ib_device_register_rdmacg(struct ib_device *device)
			
 
				+{ return 0; }
			
 
				+
			
 
				+static inline void ib_device_unregister_rdmacg(struct ib_device *device)
			
 
				+{ }
			
 
				+
			
 
				+static inline int ib_rdmacg_try_charge(struct ib_rdmacg_object *cg_obj,
			
 
				+				       struct ib_device *device,
			
 
				+				       enum rdmacg_resource_type resource_index)
			
 
				+{ return 0; }
			
 
				+
			
 
				+static inline void ib_rdmacg_uncharge(struct ib_rdmacg_object *cg_obj,
			
 
				+				      struct ib_device *device,
			
 
				+				      enum rdmacg_resource_type resource_index)
			
 
				+{ }
			
 
				+#endif
			
 
				+
			
 
				 static inline bool rdma_is_upper_dev_rcu(struct net_device *dev,
			
 
				 					 struct net_device *upper)
			
 
				 {
			
--- a/drivers/infiniband/core/device.c
+++ b/drivers/infiniband/core/device.c
@@ -369,10 +369,18 @@ int ib_register_device(struct ib_device *device,
 
				 		goto out;
			
 
				 	}
			
 
				 
			
 
				+	ret = ib_device_register_rdmacg(device);
			
 
				+	if (ret) {
			
 
				+		pr_warn("Couldn't register device with rdma cgroup\n");
			
 
				+		ib_cache_cleanup_one(device);
			
 
				+		goto out;
			
 
				+	}
			
 
				+
			
 
				 	memset(&device->attrs, 0, sizeof(device->attrs));
			
 
				 	ret = device->query_device(device, &device->attrs, &uhw);
			
 
				 	if (ret) {
			
 
				 		pr_warn("Couldn't query the device attributes\n");
			
 
				+		ib_device_unregister_rdmacg(device);
			
 
				 		ib_cache_cleanup_one(device);
			
 
				 		goto out;
			
 
				 	}
			
@@ -381,6 +389,7 @@ int ib_register_device(struct ib_device *device,
 
				 	if (ret) {
			
 
				 		pr_warn("Couldn't register device %s with driver model\n",
			
 
				 			device->name);
			
 
				+		ib_device_unregister_rdmacg(device);
			
 
				 		ib_cache_cleanup_one(device);
			
 
				 		goto out;
			
 
				 	}
			
@@ -430,6 +439,7 @@ void ib_unregister_device(struct ib_device *device)
 
				 
			
 
				 	mutex_unlock(&device_mutex);
			
 
				 
			
 
				+	ib_device_unregister_rdmacg(device);
			
 
				 	ib_device_unregister_sysfs(device);
			
 
				 	ib_cache_cleanup_one(device);
			
 
				 
			
--- a/drivers/infiniband/core/uverbs_cmd.c
+++ b/drivers/infiniband/core/uverbs_cmd.c
@@ -316,6 +316,7 @@ ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file,
 
				 	struct ib_udata                   udata;
			
 
				 	struct ib_ucontext		 *ucontext;
			
 
				 	struct file			 *filp;
			
 
				+	struct ib_rdmacg_object		 cg_obj;
			
 
				 	int ret;
			
 
				 
			
 
				 	if (out_len < sizeof resp)
			
@@ -335,13 +336,18 @@ ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file,
 
				 		   (unsigned long) cmd.response + sizeof resp,
			
 
				 		   in_len - sizeof cmd, out_len - sizeof resp);
			
 
				 
			
 
				+	ret = ib_rdmacg_try_charge(&cg_obj, ib_dev, RDMACG_RESOURCE_HCA_HANDLE);
			
 
				+	if (ret)
			
 
				+		goto err;
			
 
				+
			
 
				 	ucontext = ib_dev->alloc_ucontext(ib_dev, &udata);
			
 
				 	if (IS_ERR(ucontext)) {
			
 
				 		ret = PTR_ERR(ucontext);
			
 
				-		goto err;
			
 
				+		goto err_alloc;
			
 
				 	}
			
 
				 
			
 
				 	ucontext->device = ib_dev;
			
 
				+	ucontext->cg_obj = cg_obj;
			
 
				 	INIT_LIST_HEAD(&ucontext->pd_list);
			
 
				 	INIT_LIST_HEAD(&ucontext->mr_list);
			
 
				 	INIT_LIST_HEAD(&ucontext->mw_list);
			
@@ -407,6 +413,9 @@ err_free:
 
				 	put_pid(ucontext->tgid);
			
 
				 	ib_dev->dealloc_ucontext(ucontext);
			
 
				 
			
 
				+err_alloc:
			
 
				+	ib_rdmacg_uncharge(&cg_obj, ib_dev, RDMACG_RESOURCE_HCA_HANDLE);
			
 
				+
			
 
				 err:
			
 
				 	mutex_unlock(&file->mutex);
			
 
				 	return ret;
			
@@ -561,6 +570,13 @@ ssize_t ib_uverbs_alloc_pd(struct ib_uverbs_file *file,
 
				 		return -ENOMEM;
			
 
				 
			
 
				 	init_uobj(uobj, 0, file->ucontext, &pd_lock_class);
			
 
				+	ret = ib_rdmacg_try_charge(&uobj->cg_obj, ib_dev,
			
 
				+				   RDMACG_RESOURCE_HCA_OBJECT);
			
 
				+	if (ret) {
			
 
				+		kfree(uobj);
			
 
				+		return ret;
			
 
				+	}
			
 
				+
			
 
				 	down_write(&uobj->mutex);
			
 
				 
			
 
				 	pd = ib_dev->alloc_pd(ib_dev, file->ucontext, &udata);
			
@@ -605,6 +621,7 @@ err_idr:
 
				 	ib_dealloc_pd(pd);
			
 
				 
			
 
				 err:
			
 
				+	ib_rdmacg_uncharge(&uobj->cg_obj, ib_dev, RDMACG_RESOURCE_HCA_OBJECT);
			
 
				 	put_uobj_write(uobj);
			
 
				 	return ret;
			
 
				 }
			
@@ -637,6 +654,8 @@ ssize_t ib_uverbs_dealloc_pd(struct ib_uverbs_file *file,
 
				 	if (ret)
			
 
				 		goto err_put;
			
 
				 
			
 
				+	ib_rdmacg_uncharge(&uobj->cg_obj, ib_dev, RDMACG_RESOURCE_HCA_OBJECT);
			
 
				+
			
 
				 	uobj->live = 0;
			
 
				 	put_uobj_write(uobj);
			
 
				 
			
@@ -1006,6 +1025,10 @@ ssize_t ib_uverbs_reg_mr(struct ib_uverbs_file *file,
 
				 			goto err_put;
			
 
				 		}
			
 
				 	}
			
 
				+	ret = ib_rdmacg_try_charge(&uobj->cg_obj, ib_dev,
			
 
				+				   RDMACG_RESOURCE_HCA_OBJECT);
			
 
				+	if (ret)
			
 
				+		goto err_charge;
			
 
				 
			
 
				 	mr = pd->device->reg_user_mr(pd, cmd.start, cmd.length, cmd.hca_va,
			
 
				 				     cmd.access_flags, &udata);
			
@@ -1054,6 +1077,9 @@ err_unreg:
 
				 	ib_dereg_mr(mr);
			
 
				 
			
 
				 err_put:
			
 
				+	ib_rdmacg_uncharge(&uobj->cg_obj, ib_dev, RDMACG_RESOURCE_HCA_OBJECT);
			
 
				+
			
 
				+err_charge:
			
 
				 	put_pd_read(pd);
			
 
				 
			
 
				 err_free:
			
@@ -1178,6 +1204,8 @@ ssize_t ib_uverbs_dereg_mr(struct ib_uverbs_file *file,
 
				 	if (ret)
			
 
				 		return ret;
			
 
				 
			
 
				+	ib_rdmacg_uncharge(&uobj->cg_obj, ib_dev, RDMACG_RESOURCE_HCA_OBJECT);
			
 
				+
			
 
				 	idr_remove_uobj(&ib_uverbs_mr_idr, uobj);
			
 
				 
			
 
				 	mutex_lock(&file->mutex);
			
@@ -1226,6 +1254,11 @@ ssize_t ib_uverbs_alloc_mw(struct ib_uverbs_file *file,
 
				 		   in_len - sizeof(cmd) - sizeof(struct ib_uverbs_cmd_hdr),
			
 
				 		   out_len - sizeof(resp));
			
 
				 
			
 
				+	ret = ib_rdmacg_try_charge(&uobj->cg_obj, ib_dev,
			
 
				+				   RDMACG_RESOURCE_HCA_OBJECT);
			
 
				+	if (ret)
			
 
				+		goto err_charge;
			
 
				+
			
 
				 	mw = pd->device->alloc_mw(pd, cmd.mw_type, &udata);
			
 
				 	if (IS_ERR(mw)) {
			
 
				 		ret = PTR_ERR(mw);
			
@@ -1271,6 +1304,9 @@ err_unalloc:
 
				 	uverbs_dealloc_mw(mw);
			
 
				 
			
 
				 err_put:
			
 
				+	ib_rdmacg_uncharge(&uobj->cg_obj, ib_dev, RDMACG_RESOURCE_HCA_OBJECT);
			
 
				+
			
 
				+err_charge:
			
 
				 	put_pd_read(pd);
			
 
				 
			
 
				 err_free:
			
@@ -1306,6 +1342,8 @@ ssize_t ib_uverbs_dealloc_mw(struct ib_uverbs_file *file,
 
				 	if (ret)
			
 
				 		return ret;
			
 
				 
			
 
				+	ib_rdmacg_uncharge(&uobj->cg_obj, ib_dev, RDMACG_RESOURCE_HCA_OBJECT);
			
 
				+
			
 
				 	idr_remove_uobj(&ib_uverbs_mw_idr, uobj);
			
 
				 
			
 
				 	mutex_lock(&file->mutex);
			
@@ -1405,6 +1443,11 @@ static struct ib_ucq_object *create_cq(struct ib_uverbs_file *file,
 
				 	if (cmd_sz > offsetof(typeof(*cmd), flags) + sizeof(cmd->flags))
			
 
				 		attr.flags = cmd->flags;
			
 
				 
			
 
				+	ret = ib_rdmacg_try_charge(&obj->uobject.cg_obj, ib_dev,
			
 
				+				   RDMACG_RESOURCE_HCA_OBJECT);
			
 
				+	if (ret)
			
 
				+		goto err_charge;
			
 
				+
			
 
				 	cq = ib_dev->create_cq(ib_dev, &attr,
			
 
				 					     file->ucontext, uhw);
			
 
				 	if (IS_ERR(cq)) {
			
@@ -1452,6 +1495,10 @@ err_free:
 
				 	ib_destroy_cq(cq);
			
 
				 
			
 
				 err_file:
			
 
				+	ib_rdmacg_uncharge(&obj->uobject.cg_obj, ib_dev,
			
 
				+			   RDMACG_RESOURCE_HCA_OBJECT);
			
 
				+
			
 
				+err_charge:
			
 
				 	if (ev_file)
			
 
				 		ib_uverbs_release_ucq(file, ev_file, obj);
			
 
				 
			
@@ -1732,6 +1779,8 @@ ssize_t ib_uverbs_destroy_cq(struct ib_uverbs_file *file,
 
				 	if (ret)
			
 
				 		return ret;
			
 
				 
			
 
				+	ib_rdmacg_uncharge(&uobj->cg_obj, ib_dev, RDMACG_RESOURCE_HCA_OBJECT);
			
 
				+
			
 
				 	idr_remove_uobj(&ib_uverbs_cq_idr, uobj);
			
 
				 
			
 
				 	mutex_lock(&file->mutex);
			
@@ -1905,6 +1954,11 @@ static int create_qp(struct ib_uverbs_file *file,
 
				 			goto err_put;
			
 
				 		}
			
 
				 
			
 
				+	ret = ib_rdmacg_try_charge(&obj->uevent.uobject.cg_obj, device,
			
 
				+				   RDMACG_RESOURCE_HCA_OBJECT);
			
 
				+	if (ret)
			
 
				+		goto err_put;
			
 
				+
			
 
				 	if (cmd->qp_type == IB_QPT_XRC_TGT)
			
 
				 		qp = ib_create_qp(pd, &attr);
			
 
				 	else
			
@@ -1912,7 +1966,7 @@ static int create_qp(struct ib_uverbs_file *file,
 
				 
			
 
				 	if (IS_ERR(qp)) {
			
 
				 		ret = PTR_ERR(qp);
			
 
				-		goto err_put;
			
 
				+		goto err_create;
			
 
				 	}
			
 
				 
			
 
				 	if (cmd->qp_type != IB_QPT_XRC_TGT) {
			
@@ -1993,6 +2047,10 @@ err_cb:
 
				 err_destroy:
			
 
				 	ib_destroy_qp(qp);
			
 
				 
			
 
				+err_create:
			
 
				+	ib_rdmacg_uncharge(&obj->uevent.uobject.cg_obj, device,
			
 
				+			   RDMACG_RESOURCE_HCA_OBJECT);
			
 
				+
			
 
				 err_put:
			
 
				 	if (xrcd)
			
 
				 		put_xrcd_read(xrcd_uobj);
			
@@ -2519,6 +2577,8 @@ ssize_t ib_uverbs_destroy_qp(struct ib_uverbs_file *file,
 
				 	if (ret)
			
 
				 		return ret;
			
 
				 
			
 
				+	ib_rdmacg_uncharge(&uobj->cg_obj, ib_dev, RDMACG_RESOURCE_HCA_OBJECT);
			
 
				+
			
 
				 	if (obj->uxrcd)
			
 
				 		atomic_dec(&obj->uxrcd->refcnt);
			
 
				 
			
@@ -2970,11 +3030,16 @@ ssize_t ib_uverbs_create_ah(struct ib_uverbs_file *file,
 
				 	memset(&attr.dmac, 0, sizeof(attr.dmac));
			
 
				 	memcpy(attr.grh.dgid.raw, cmd.attr.grh.dgid, 16);
			
 
				 
			
 
				+	ret = ib_rdmacg_try_charge(&uobj->cg_obj, ib_dev,
			
 
				+				   RDMACG_RESOURCE_HCA_OBJECT);
			
 
				+	if (ret)
			
 
				+		goto err_charge;
			
 
				+
			
 
				 	ah = pd->device->create_ah(pd, &attr, &udata);
			
 
				 
			
 
				 	if (IS_ERR(ah)) {
			
 
				 		ret = PTR_ERR(ah);
			
 
				-		goto err_put;
			
 
				+		goto err_create;
			
 
				 	}
			
 
				 
			
 
				 	ah->device  = pd->device;
			
@@ -3013,7 +3078,10 @@ err_copy:
 
				 err_destroy:
			
 
				 	ib_destroy_ah(ah);
			
 
				 
			
 
				-err_put:
			
 
				+err_create:
			
 
				+	ib_rdmacg_uncharge(&uobj->cg_obj, ib_dev, RDMACG_RESOURCE_HCA_OBJECT);
			
 
				+
			
 
				+err_charge:
			
 
				 	put_pd_read(pd);
			
 
				 
			
 
				 err:
			
@@ -3047,6 +3115,8 @@ ssize_t ib_uverbs_destroy_ah(struct ib_uverbs_file *file,
 
				 	if (ret)
			
 
				 		return ret;
			
 
				 
			
 
				+	ib_rdmacg_uncharge(&uobj->cg_obj, ib_dev, RDMACG_RESOURCE_HCA_OBJECT);
			
 
				+
			
 
				 	idr_remove_uobj(&ib_uverbs_ah_idr, uobj);
			
 
				 
			
 
				 	mutex_lock(&file->mutex);
			
@@ -3861,10 +3931,16 @@ int ib_uverbs_ex_create_flow(struct ib_uverbs_file *file,
 
				 		err = -EINVAL;
			
 
				 		goto err_free;
			
 
				 	}
			
 
				+
			
 
				+	err = ib_rdmacg_try_charge(&uobj->cg_obj, ib_dev,
			
 
				+				   RDMACG_RESOURCE_HCA_OBJECT);
			
 
				+	if (err)
			
 
				+		goto err_free;
			
 
				+
			
 
				 	flow_id = ib_create_flow(qp, flow_attr, IB_FLOW_DOMAIN_USER);
			
 
				 	if (IS_ERR(flow_id)) {
			
 
				 		err = PTR_ERR(flow_id);
			
 
				-		goto err_free;
			
 
				+		goto err_create;
			
 
				 	}
			
 
				 	flow_id->uobject = uobj;
			
 
				 	uobj->object = flow_id;
			
@@ -3897,6 +3973,8 @@ err_copy:
 
				 	idr_remove_uobj(&ib_uverbs_rule_idr, uobj);
			
 
				 destroy_flow:
			
 
				 	ib_destroy_flow(flow_id);
			
 
				+err_create:
			
 
				+	ib_rdmacg_uncharge(&uobj->cg_obj, ib_dev, RDMACG_RESOURCE_HCA_OBJECT);
			
 
				 err_free:
			
 
				 	kfree(flow_attr);
			
 
				 err_put:
			
@@ -3936,8 +4014,11 @@ int ib_uverbs_ex_destroy_flow(struct ib_uverbs_file *file,
 
				 	flow_id = uobj->object;
			
 
				 
			
 
				 	ret = ib_destroy_flow(flow_id);
			
 
				-	if (!ret)
			
 
				+	if (!ret) {
			
 
				+		ib_rdmacg_uncharge(&uobj->cg_obj, ib_dev,
			
 
				+				   RDMACG_RESOURCE_HCA_OBJECT);
			
 
				 		uobj->live = 0;
			
 
				+	}
			
 
				 
			
 
				 	put_uobj_write(uobj);
			
 
				 
			
@@ -4005,6 +4086,11 @@ static int __uverbs_create_xsrq(struct ib_uverbs_file *file,
 
				 	obj->uevent.events_reported = 0;
			
 
				 	INIT_LIST_HEAD(&obj->uevent.event_list);
			
 
				 
			
 
				+	ret = ib_rdmacg_try_charge(&obj->uevent.uobject.cg_obj, ib_dev,
			
 
				+				   RDMACG_RESOURCE_HCA_OBJECT);
			
 
				+	if (ret)
			
 
				+		goto err_put_cq;
			
 
				+
			
 
				 	srq = pd->device->create_srq(pd, &attr, udata);
			
 
				 	if (IS_ERR(srq)) {
			
 
				 		ret = PTR_ERR(srq);
			
@@ -4069,6 +4155,8 @@ err_destroy:
 
				 	ib_destroy_srq(srq);
			
 
				 
			
 
				 err_put:
			
 
				+	ib_rdmacg_uncharge(&obj->uevent.uobject.cg_obj, ib_dev,
			
 
				+			   RDMACG_RESOURCE_HCA_OBJECT);
			
 
				 	put_pd_read(pd);
			
 
				 
			
 
				 err_put_cq:
			
@@ -4255,6 +4343,8 @@ ssize_t ib_uverbs_destroy_srq(struct ib_uverbs_file *file,
 
				 	if (ret)
			
 
				 		return ret;
			
 
				 
			
 
				+	ib_rdmacg_uncharge(&uobj->cg_obj, ib_dev, RDMACG_RESOURCE_HCA_OBJECT);
			
 
				+
			
 
				 	if (srq_type == IB_SRQT_XRC) {
			
 
				 		us = container_of(obj, struct ib_usrq_object, uevent);
			
 
				 		atomic_dec(&us->uxrcd->refcnt);
			
--- a/drivers/infiniband/core/uverbs_main.c
+++ b/drivers/infiniband/core/uverbs_main.c
@@ -51,6 +51,7 @@
 
				 #include <rdma/ib.h>
			
 
				 
			
 
				 #include "uverbs.h"
			
 
				+#include "core_priv.h"
			
 
				 
			
 
				 MODULE_AUTHOR("Roland Dreier");
			
 
				 MODULE_DESCRIPTION("InfiniBand userspace verbs access");
			
@@ -237,6 +238,8 @@ static int ib_uverbs_cleanup_ucontext(struct ib_uverbs_file *file,
 
				 
			
 
				 		idr_remove_uobj(&ib_uverbs_ah_idr, uobj);
			
 
				 		ib_destroy_ah(ah);
			
 
				+		ib_rdmacg_uncharge(&uobj->cg_obj, context->device,
			
 
				+				   RDMACG_RESOURCE_HCA_OBJECT);
			
 
				 		kfree(uobj);
			
 
				 	}
			
 
				 
			
@@ -246,6 +249,8 @@ static int ib_uverbs_cleanup_ucontext(struct ib_uverbs_file *file,
 
				 
			
 
				 		idr_remove_uobj(&ib_uverbs_mw_idr, uobj);
			
 
				 		uverbs_dealloc_mw(mw);
			
 
				+		ib_rdmacg_uncharge(&uobj->cg_obj, context->device,
			
 
				+				   RDMACG_RESOURCE_HCA_OBJECT);
			
 
				 		kfree(uobj);
			
 
				 	}
			
 
				 
			
@@ -254,6 +259,8 @@ static int ib_uverbs_cleanup_ucontext(struct ib_uverbs_file *file,
 
				 
			
 
				 		idr_remove_uobj(&ib_uverbs_rule_idr, uobj);
			
 
				 		ib_destroy_flow(flow_id);
			
 
				+		ib_rdmacg_uncharge(&uobj->cg_obj, context->device,
			
 
				+				   RDMACG_RESOURCE_HCA_OBJECT);
			
 
				 		kfree(uobj);
			
 
				 	}
			
 
				 
			
@@ -266,6 +273,8 @@ static int ib_uverbs_cleanup_ucontext(struct ib_uverbs_file *file,
 
				 		if (qp == qp->real_qp)
			
 
				 			ib_uverbs_detach_umcast(qp, uqp);
			
 
				 		ib_destroy_qp(qp);
			
 
				+		ib_rdmacg_uncharge(&uobj->cg_obj, context->device,
			
 
				+				   RDMACG_RESOURCE_HCA_OBJECT);
			
 
				 		ib_uverbs_release_uevent(file, &uqp->uevent);
			
 
				 		kfree(uqp);
			
 
				 	}
			
@@ -298,6 +307,8 @@ static int ib_uverbs_cleanup_ucontext(struct ib_uverbs_file *file,
 
				 
			
 
				 		idr_remove_uobj(&ib_uverbs_srq_idr, uobj);
			
 
				 		ib_destroy_srq(srq);
			
 
				+		ib_rdmacg_uncharge(&uobj->cg_obj, context->device,
			
 
				+				   RDMACG_RESOURCE_HCA_OBJECT);
			
 
				 		ib_uverbs_release_uevent(file, uevent);
			
 
				 		kfree(uevent);
			
 
				 	}
			
@@ -310,6 +321,8 @@ static int ib_uverbs_cleanup_ucontext(struct ib_uverbs_file *file,
 
				 
			
 
				 		idr_remove_uobj(&ib_uverbs_cq_idr, uobj);
			
 
				 		ib_destroy_cq(cq);
			
 
				+		ib_rdmacg_uncharge(&uobj->cg_obj, context->device,
			
 
				+				   RDMACG_RESOURCE_HCA_OBJECT);
			
 
				 		ib_uverbs_release_ucq(file, ev_file, ucq);
			
 
				 		kfree(ucq);
			
 
				 	}
			
@@ -319,6 +332,8 @@ static int ib_uverbs_cleanup_ucontext(struct ib_uverbs_file *file,
 
				 
			
 
				 		idr_remove_uobj(&ib_uverbs_mr_idr, uobj);
			
 
				 		ib_dereg_mr(mr);
			
 
				+		ib_rdmacg_uncharge(&uobj->cg_obj, context->device,
			
 
				+				   RDMACG_RESOURCE_HCA_OBJECT);
			
 
				 		kfree(uobj);
			
 
				 	}
			
 
				 
			
@@ -339,11 +354,16 @@ static int ib_uverbs_cleanup_ucontext(struct ib_uverbs_file *file,
 
				 
			
 
				 		idr_remove_uobj(&ib_uverbs_pd_idr, uobj);
			
 
				 		ib_dealloc_pd(pd);
			
 
				+		ib_rdmacg_uncharge(&uobj->cg_obj, context->device,
			
 
				+				   RDMACG_RESOURCE_HCA_OBJECT);
			
 
				 		kfree(uobj);
			
 
				 	}
			
 
				 
			
 
				 	put_pid(context->tgid);
			
 
				 
			
 
				+	ib_rdmacg_uncharge(&context->cg_obj, context->device,
			
 
				+			   RDMACG_RESOURCE_HCA_HANDLE);
			
 
				+
			
 
				 	return context->device->dealloc_ucontext(context);
			
 
				 }
			
 
				 
			
--- a/fs/kernfs/dir.c
+++ b/fs/kernfs/dir.c
@@ -478,7 +478,7 @@ static void kernfs_drain(struct kernfs_node *kn)
 
				 		rwsem_release(&kn->dep_map, 1, _RET_IP_);
			
 
				 	}
			
 
				 
			
 
				-	kernfs_unmap_bin_file(kn);
			
 
				+	kernfs_drain_open_files(kn);
			
 
				 
			
 
				 	mutex_lock(&kernfs_mutex);
			
 
				 }
			
--- a/fs/kernfs/file.c
+++ b/fs/kernfs/file.c
@@ -515,7 +515,7 @@ static int kernfs_fop_mmap(struct file *file, struct vm_area_struct *vma)
 
				 		goto out_put;
			
 
				 
			
 
				 	rc = 0;
			
 
				-	of->mmapped = 1;
			
 
				+	of->mmapped = true;
			
 
				 	of->vm_ops = vma->vm_ops;
			
 
				 	vma->vm_ops = &kernfs_vm_ops;
			
 
				 out_put:
			
@@ -707,7 +707,8 @@ static int kernfs_fop_open(struct inode *inode, struct file *file)
 
				 	if (error)
			
 
				 		goto err_free;
			
 
				 
			
 
				-	((struct seq_file *)file->private_data)->private = of;
			
 
				+	of->seq_file = file->private_data;
			
 
				+	of->seq_file->private = of;
			
 
				 
			
 
				 	/* seq_file clears PWRITE unconditionally, restore it if WRITE */
			
 
				 	if (file->f_mode & FMODE_WRITE)
			
@@ -716,13 +717,22 @@ static int kernfs_fop_open(struct inode *inode, struct file *file)
 
				 	/* make sure we have open node struct */
			
 
				 	error = kernfs_get_open_node(kn, of);
			
 
				 	if (error)
			
 
				-		goto err_close;
			
 
				+		goto err_seq_release;
			
 
				+
			
 
				+	if (ops->open) {
			
 
				+		/* nobody has access to @of yet, skip @of->mutex */
			
 
				+		error = ops->open(of);
			
 
				+		if (error)
			
 
				+			goto err_put_node;
			
 
				+	}
			
 
				 
			
 
				 	/* open succeeded, put active references */
			
 
				 	kernfs_put_active(kn);
			
 
				 	return 0;
			
 
				 
			
 
				-err_close:
			
 
				+err_put_node:
			
 
				+	kernfs_put_open_node(kn, of);
			
 
				+err_seq_release:
			
 
				 	seq_release(inode, file);
			
 
				 err_free:
			
 
				 	kfree(of->prealloc_buf);
			
@@ -732,11 +742,41 @@ err_out:
 
				 	return error;
			
 
				 }
			
 
				 
			
 
				+/* used from release/drain to ensure that ->release() is called exactly once */
			
 
				+static void kernfs_release_file(struct kernfs_node *kn,
			
 
				+				struct kernfs_open_file *of)
			
 
				+{
			
 
				+	/*
			
 
				+	 * @of is guaranteed to have no other file operations in flight and
			
 
				+	 * we just want to synchronize release and drain paths.
			
 
				+	 * @kernfs_open_file_mutex is enough.  @of->mutex can't be used
			
 
				+	 * here because drain path may be called from places which can
			
 
				+	 * cause circular dependency.
			
 
				+	 */
			
 
				+	lockdep_assert_held(&kernfs_open_file_mutex);
			
 
				+
			
 
				+	if (!of->released) {
			
 
				+		/*
			
 
				+		 * A file is never detached without being released and we
			
 
				+		 * need to be able to release files which are deactivated
			
 
				+		 * and being drained.  Don't use kernfs_ops().
			
 
				+		 */
			
 
				+		kn->attr.ops->release(of);
			
 
				+		of->released = true;
			
 
				+	}
			
 
				+}
			
 
				+
			
 
				 static int kernfs_fop_release(struct inode *inode, struct file *filp)
			
 
				 {
			
 
				 	struct kernfs_node *kn = filp->f_path.dentry->d_fsdata;
			
 
				 	struct kernfs_open_file *of = kernfs_of(filp);
			
 
				 
			
 
				+	if (kn->flags & KERNFS_HAS_RELEASE) {
			
 
				+		mutex_lock(&kernfs_open_file_mutex);
			
 
				+		kernfs_release_file(kn, of);
			
 
				+		mutex_unlock(&kernfs_open_file_mutex);
			
 
				+	}
			
 
				+
			
 
				 	kernfs_put_open_node(kn, of);
			
 
				 	seq_release(inode, filp);
			
 
				 	kfree(of->prealloc_buf);
			
@@ -745,12 +785,12 @@ static int kernfs_fop_release(struct inode *inode, struct file *filp)
 
				 	return 0;
			
 
				 }
			
 
				 
			
 
				-void kernfs_unmap_bin_file(struct kernfs_node *kn)
			
 
				+void kernfs_drain_open_files(struct kernfs_node *kn)
			
 
				 {
			
 
				 	struct kernfs_open_node *on;
			
 
				 	struct kernfs_open_file *of;
			
 
				 
			
 
				-	if (!(kn->flags & KERNFS_HAS_MMAP))
			
 
				+	if (!(kn->flags & (KERNFS_HAS_MMAP | KERNFS_HAS_RELEASE)))
			
 
				 		return;
			
 
				 
			
 
				 	spin_lock_irq(&kernfs_open_node_lock);
			
@@ -762,10 +802,16 @@ void kernfs_unmap_bin_file(struct kernfs_node *kn)
 
				 		return;
			
 
				 
			
 
				 	mutex_lock(&kernfs_open_file_mutex);
			
 
				+
			
 
				 	list_for_each_entry(of, &on->files, list) {
			
 
				 		struct inode *inode = file_inode(of->file);
			
 
				-		unmap_mapping_range(inode->i_mapping, 0, 0, 1);
			
 
				+
			
 
				+		if (kn->flags & KERNFS_HAS_MMAP)
			
 
				+			unmap_mapping_range(inode->i_mapping, 0, 0, 1);
			
 
				+
			
 
				+		kernfs_release_file(kn, of);
			
 
				 	}
			
 
				+
			
 
				 	mutex_unlock(&kernfs_open_file_mutex);
			
 
				 
			
 
				 	kernfs_put_open_node(kn, NULL);
			
@@ -964,6 +1010,8 @@ struct kernfs_node *__kernfs_create_file(struct kernfs_node *parent,
 
				 		kn->flags |= KERNFS_HAS_SEQ_SHOW;
			
 
				 	if (ops->mmap)
			
 
				 		kn->flags |= KERNFS_HAS_MMAP;
			
 
				+	if (ops->release)
			
 
				+		kn->flags |= KERNFS_HAS_RELEASE;
			
 
				 
			
 
				 	rc = kernfs_add_one(kn);
			
 
				 	if (rc) {
			
--- a/fs/kernfs/kernfs-internal.h
+++ b/fs/kernfs/kernfs-internal.h
@@ -104,7 +104,7 @@ struct kernfs_node *kernfs_new_node(struct kernfs_node *parent,
 
				  */
			
 
				 extern const struct file_operations kernfs_file_fops;
			
 
				 
			
 
				-void kernfs_unmap_bin_file(struct kernfs_node *kn);
			
 
				+void kernfs_drain_open_files(struct kernfs_node *kn);
			
 
				 
			
 
				 /*
			
 
				  * symlink.c
			
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -148,14 +148,18 @@ struct cgroup_subsys_state {
 
				  * set for a task.
			
 
				  */
			
 
				 struct css_set {
			
 
				-	/* Reference count */
			
 
				-	atomic_t refcount;
			
 
				-
			
 
				 	/*
			
 
				-	 * List running through all cgroup groups in the same hash
			
 
				-	 * slot. Protected by css_set_lock
			
 
				+	 * Set of subsystem states, one for each subsystem. This array is
			
 
				+	 * immutable after creation apart from the init_css_set during
			
 
				+	 * subsystem registration (at boot time).
			
 
				 	 */
			
 
				-	struct hlist_node hlist;
			
 
				+	struct cgroup_subsys_state *subsys[CGROUP_SUBSYS_COUNT];
			
 
				+
			
 
				+	/* reference count */
			
 
				+	atomic_t refcount;
			
 
				+
			
 
				+	/* the default cgroup associated with this css_set */
			
 
				+	struct cgroup *dfl_cgrp;
			
 
				 
			
 
				 	/*
			
 
				 	 * Lists running through all tasks using this cgroup group.
			
@@ -167,21 +171,29 @@ struct css_set {
 
				 	struct list_head tasks;
			
 
				 	struct list_head mg_tasks;
			
 
				 
			
 
				+	/* all css_task_iters currently walking this cset */
			
 
				+	struct list_head task_iters;
			
 
				+
			
 
				 	/*
			
 
				-	 * List of cgrp_cset_links pointing at cgroups referenced from this
			
 
				-	 * css_set.  Protected by css_set_lock.
			
 
				+	 * On the default hierarchy, ->subsys[ssid] may point to a css
			
 
				+	 * attached to an ancestor instead of the cgroup this css_set is
			
 
				+	 * associated with.  The following node is anchored at
			
 
				+	 * ->subsys[ssid]->cgroup->e_csets[ssid] and provides a way to
			
 
				+	 * iterate through all css's attached to a given cgroup.
			
 
				 	 */
			
 
				-	struct list_head cgrp_links;
			
 
				+	struct list_head e_cset_node[CGROUP_SUBSYS_COUNT];
			
 
				 
			
 
				-	/* the default cgroup associated with this css_set */
			
 
				-	struct cgroup *dfl_cgrp;
			
 
				+	/*
			
 
				+	 * List running through all cgroup groups in the same hash
			
 
				+	 * slot. Protected by css_set_lock
			
 
				+	 */
			
 
				+	struct hlist_node hlist;
			
 
				 
			
 
				 	/*
			
 
				-	 * Set of subsystem states, one for each subsystem. This array is
			
 
				-	 * immutable after creation apart from the init_css_set during
			
 
				-	 * subsystem registration (at boot time).
			
 
				+	 * List of cgrp_cset_links pointing at cgroups referenced from this
			
 
				+	 * css_set.  Protected by css_set_lock.
			
 
				 	 */
			
 
				-	struct cgroup_subsys_state *subsys[CGROUP_SUBSYS_COUNT];
			
 
				+	struct list_head cgrp_links;
			
 
				 
			
 
				 	/*
			
 
				 	 * List of csets participating in the on-going migration either as
			
@@ -201,18 +213,6 @@ struct css_set {
 
				 	struct cgroup *mg_dst_cgrp;
			
 
				 	struct css_set *mg_dst_cset;
			
 
				 
			
 
				-	/*
			
 
				-	 * On the default hierarhcy, ->subsys[ssid] may point to a css
			
 
				-	 * attached to an ancestor instead of the cgroup this css_set is
			
 
				-	 * associated with.  The following node is anchored at
			
 
				-	 * ->subsys[ssid]->cgroup->e_csets[ssid] and provides a way to
			
 
				-	 * iterate through all css's attached to a given cgroup.
			
 
				-	 */
			
 
				-	struct list_head e_cset_node[CGROUP_SUBSYS_COUNT];
			
 
				-
			
 
				-	/* all css_task_iters currently walking this cset */
			
 
				-	struct list_head task_iters;
			
 
				-
			
 
				 	/* dead and being drained, ignore for migration */
			
 
				 	bool dead;
			
 
				 
			
@@ -388,6 +388,9 @@ struct cftype {
 
				 	struct list_head node;		/* anchored at ss->cfts */
			
 
				 	struct kernfs_ops *kf_ops;
			
 
				 
			
 
				+	int (*open)(struct kernfs_open_file *of);
			
 
				+	void (*release)(struct kernfs_open_file *of);
			
 
				+
			
 
				 	/*
			
 
				 	 * read_u64() is a shortcut for the common case of returning a
			
 
				 	 * single integer. Use it in place of read()
			
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -266,7 +266,7 @@ void css_task_iter_end(struct css_task_iter *it);
 
				  * cgroup_taskset_for_each_leader - iterate group leaders in a cgroup_taskset
			
 
				  * @leader: the loop cursor
			
 
				  * @dst_css: the destination css
			
 
				- * @tset: takset to iterate
			
 
				+ * @tset: taskset to iterate
			
 
				  *
			
 
				  * Iterate threadgroup leaders of @tset.  For single-task migrations, @tset
			
 
				  * may not contain any.
			
--- a/include/linux/cgroup_rdma.h
+++ b/include/linux/cgroup_rdma.h
@@ -0,0 +1,53 @@
 
				+/*
			
 
				+ * Copyright (C) 2016 Parav Pandit <pandit.parav@gmail.com>
			
 
				+ *
			
 
				+ * This file is subject to the terms and conditions of version 2 of the GNU
			
 
				+ * General Public License. See the file COPYING in the main directory of the
			
 
				+ * Linux distribution for more details.
			
 
				+ */
			
 
				+
			
 
				+#ifndef _CGROUP_RDMA_H
			
 
				+#define _CGROUP_RDMA_H
			
 
				+
			
 
				+#include <linux/cgroup.h>
			
 
				+
			
 
				+enum rdmacg_resource_type {
			
 
				+	RDMACG_RESOURCE_HCA_HANDLE,
			
 
				+	RDMACG_RESOURCE_HCA_OBJECT,
			
 
				+	RDMACG_RESOURCE_MAX,
			
 
				+};
			
 
				+
			
 
				+#ifdef CONFIG_CGROUP_RDMA
			
 
				+
			
 
				+struct rdma_cgroup {
			
 
				+	struct cgroup_subsys_state	css;
			
 
				+
			
 
				+	/*
			
 
				+	 * head to keep track of all resource pools
			
 
				+	 * that belong to this cgroup.
			
 
				+	 */
			
 
				+	struct list_head		rpools;
			
 
				+};
			
 
				+
			
 
				+struct rdmacg_device {
			
 
				+	struct list_head	dev_node;
			
 
				+	struct list_head	rpools;
			
 
				+	char			*name;
			
 
				+};
			
 
				+
			
 
				+/*
			
 
				+ * APIs for RDMA/IB stack to publish when a device wants to
			
 
				+ * participate in resource accounting
			
 
				+ */
			
 
				+int rdmacg_register_device(struct rdmacg_device *device);
			
 
				+void rdmacg_unregister_device(struct rdmacg_device *device);
			
 
				+
			
 
				+/* APIs for RDMA/IB stack to charge/uncharge pool specific resources */
			
 
				+int rdmacg_try_charge(struct rdma_cgroup **rdmacg,
			
 
				+		      struct rdmacg_device *device,
			
 
				+		      enum rdmacg_resource_type index);
			
 
				+void rdmacg_uncharge(struct rdma_cgroup *cg,
			
 
				+		     struct rdmacg_device *device,
			
 
				+		     enum rdmacg_resource_type index);
			
 
				+#endif	/* CONFIG_CGROUP_RDMA */
			
 
				+#endif	/* _CGROUP_RDMA_H */
			
--- a/include/linux/cgroup_subsys.h
+++ b/include/linux/cgroup_subsys.h
@@ -56,6 +56,10 @@ SUBSYS(hugetlb)
 
				 SUBSYS(pids)
			
 
				 #endif
			
 
				 
			
 
				+#if IS_ENABLED(CONFIG_CGROUP_RDMA)
			
 
				+SUBSYS(rdma)
			
 
				+#endif
			
 
				+
			
 
				 /*
			
 
				  * The following subsystems are not supported on the default hierarchy.
			
 
				  */
			
--- a/include/linux/kernfs.h
+++ b/include/linux/kernfs.h
@@ -46,6 +46,7 @@ enum kernfs_node_flag {
 
				 	KERNFS_SUICIDAL		= 0x0400,
			
 
				 	KERNFS_SUICIDED		= 0x0800,
			
 
				 	KERNFS_EMPTY_DIR	= 0x1000,
			
 
				+	KERNFS_HAS_RELEASE	= 0x2000,
			
 
				 };
			
 
				 
			
 
				 /* @flags for kernfs_create_root() */
			
@@ -175,6 +176,7 @@ struct kernfs_open_file {
 
				 	/* published fields */
			
 
				 	struct kernfs_node	*kn;
			
 
				 	struct file		*file;
			
 
				+	struct seq_file		*seq_file;
			
 
				 	void			*priv;
			
 
				 
			
 
				 	/* private fields, do not use outside kernfs proper */
			
@@ -185,11 +187,19 @@ struct kernfs_open_file {
 
				 	char			*prealloc_buf;
			
 
				 
			
 
				 	size_t			atomic_write_len;
			
 
				-	bool			mmapped;
			
 
				+	bool			mmapped:1;
			
 
				+	bool			released:1;
			
 
				 	const struct vm_operations_struct *vm_ops;
			
 
				 };
			
 
				 
			
 
				 struct kernfs_ops {
			
 
				+	/*
			
 
				+	 * Optional open/release methods.  Both are called with
			
 
				+	 * @of->seq_file populated.
			
 
				+	 */
			
 
				+	int (*open)(struct kernfs_open_file *of);
			
 
				+	void (*release)(struct kernfs_open_file *of);
			
 
				+
			
 
				 	/*
			
 
				 	 * Read is handled by either seq_file or raw_read().
			
 
				 	 *
			
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -60,6 +60,7 @@
 
				 #include <linux/atomic.h>
			
 
				 #include <linux/mmu_notifier.h>
			
 
				 #include <linux/uaccess.h>
			
 
				+#include <linux/cgroup_rdma.h>
			
 
				 
			
 
				 extern struct workqueue_struct *ib_wq;
			
 
				 extern struct workqueue_struct *ib_comp_wq;
			
@@ -1356,6 +1357,12 @@ struct ib_fmr_attr {
 
				 
			
 
				 struct ib_umem;
			
 
				 
			
 
				+struct ib_rdmacg_object {
			
 
				+#ifdef CONFIG_CGROUP_RDMA
			
 
				+	struct rdma_cgroup	*cg;		/* owner rdma cgroup */
			
 
				+#endif
			
 
				+};
			
 
				+
			
 
				 struct ib_ucontext {
			
 
				 	struct ib_device       *device;
			
 
				 	struct list_head	pd_list;
			
@@ -1388,6 +1395,8 @@ struct ib_ucontext {
 
				 	struct list_head	no_private_counters;
			
 
				 	int                     odp_mrs_count;
			
 
				 #endif
			
 
				+
			
 
				+	struct ib_rdmacg_object	cg_obj;
			
 
				 };
			
 
				 
			
 
				 struct ib_uobject {
			
@@ -1395,6 +1404,7 @@ struct ib_uobject {
 
				 	struct ib_ucontext     *context;	/* associated user context */
			
 
				 	void		       *object;		/* containing object */
			
 
				 	struct list_head	list;		/* link to context's list */
			
 
				+	struct ib_rdmacg_object	cg_obj;		/* rdmacg object */
			
 
				 	int			id;		/* index into kernel idr */
			
 
				 	struct kref		ref;
			
 
				 	struct rw_semaphore	mutex;		/* protects .live */
			
@@ -2128,6 +2138,10 @@ struct ib_device {
 
				 	struct attribute_group	     *hw_stats_ag;
			
 
				 	struct rdma_hw_stats         *hw_stats;
			
 
				 
			
 
				+#ifdef CONFIG_CGROUP_RDMA
			
 
				+	struct rdmacg_device         cg_device;
			
 
				+#endif
			
 
				+
			
 
				 	/**
			
 
				 	 * The following mandatory functions are used only at device
			
 
				 	 * registration.  Keep functions such as these at the end of this
			
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1078,6 +1078,16 @@ config CGROUP_PIDS
 
				 	  since the PIDs limit only affects a process's ability to fork, not to
			
 
				 	  attach to a cgroup.
			
 
				 
			
 
				+config CGROUP_RDMA
			
 
				+	bool "RDMA controller"
			
 
				+	help
			
 
				+	  Provides enforcement of RDMA resources defined by IB stack.
			
 
				+	  It is fairly easy for consumers to exhaust RDMA resources, which
			
 
				+	  can result in resource unavailability to other consumers.
			
 
				+	  RDMA controller is designed to stop this from happening.
			
 
				+	  Attaching processes with active RDMA resources to the cgroup
			
 
				+	  hierarchy is allowed even if it can cross the hierarchy's limit.
			
 
				+
			
 
				 config CGROUP_FREEZER
			
 
				 	bool "Freezer controller"
			
 
				 	help
			
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -64,10 +64,7 @@ obj-$(CONFIG_KEXEC) += kexec.o
 
				 obj-$(CONFIG_KEXEC_FILE) += kexec_file.o
			
 
				 obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o
			
 
				 obj-$(CONFIG_COMPAT) += compat.o
			
 
				-obj-$(CONFIG_CGROUPS) += cgroup.o
			
 
				-obj-$(CONFIG_CGROUP_FREEZER) += cgroup_freezer.o
			
 
				-obj-$(CONFIG_CGROUP_PIDS) += cgroup_pids.o
			
 
				-obj-$(CONFIG_CPUSETS) += cpuset.o
			
 
				+obj-$(CONFIG_CGROUPS) += cgroup/
			
 
				 obj-$(CONFIG_UTS_NS) += utsname.o
			
 
				 obj-$(CONFIG_USER_NS) += user_namespace.o
			
 
				 obj-$(CONFIG_PID_NS) += pid_namespace.o
			
--- a/kernel/cgroup/Makefile
+++ b/kernel/cgroup/Makefile
@@ -0,0 +1,6 @@
 
				+obj-y := cgroup.o namespace.o cgroup-v1.o
			
 
				+
			
 
				+obj-$(CONFIG_CGROUP_FREEZER) += freezer.o
			
 
				+obj-$(CONFIG_CGROUP_PIDS) += pids.o
			
 
				+obj-$(CONFIG_CGROUP_RDMA) += rdma.o
			
 
				+obj-$(CONFIG_CPUSETS) += cpuset.o
			
--- a/kernel/cgroup/cgroup-internal.h
+++ b/kernel/cgroup/cgroup-internal.h
@@ -0,0 +1,214 @@
 
				+#ifndef __CGROUP_INTERNAL_H
			
 
				+#define __CGROUP_INTERNAL_H
			
 
				+
			
 
				+#include <linux/cgroup.h>
			
 
				+#include <linux/kernfs.h>
			
 
				+#include <linux/workqueue.h>
			
 
				+#include <linux/list.h>
			
 
				+
			
 
				+/*
			
 
				+ * A cgroup can be associated with multiple css_sets as different tasks may
			
 
				+ * belong to different cgroups on different hierarchies.  In the other
			
 
				+ * direction, a css_set is naturally associated with multiple cgroups.
			
 
				+ * This M:N relationship is represented by the following link structure
			
 
				+ * which exists for each association and allows traversing the associations
			
 
				+ * from both sides.
			
 
				+ */
			
 
				+struct cgrp_cset_link {
			
 
				+	/* the cgroup and css_set this link associates */
			
 
				+	struct cgroup		*cgrp;
			
 
				+	struct css_set		*cset;
			
 
				+
			
 
				+	/* list of cgrp_cset_links anchored at cgrp->cset_links */
			
 
				+	struct list_head	cset_link;
			
 
				+
			
 
				+	/* list of cgrp_cset_links anchored at css_set->cgrp_links */
			
 
				+	struct list_head	cgrp_link;
			
 
				+};
			
 
				+
			
 
				+/* used to track tasks and csets during migration */
			
 
				+struct cgroup_taskset {
			
 
				+	/* the src and dst cset list running through cset->mg_node */
			
 
				+	struct list_head	src_csets;
			
 
				+	struct list_head	dst_csets;
			
 
				+
			
 
				+	/* the subsys currently being processed */
			
 
				+	int			ssid;
			
 
				+
			
 
				+	/*
			
 
				+	 * Fields for cgroup_taskset_*() iteration.
			
 
				+	 *
			
 
				+	 * Before migration is committed, the target migration tasks are on
			
 
				+	 * ->mg_tasks of the csets on ->src_csets.  After, on ->mg_tasks of
			
 
				+	 * the csets on ->dst_csets.  ->csets points to either ->src_csets
			
 
				+	 * or ->dst_csets depending on whether migration is committed.
			
 
				+	 *
			
 
				+	 * ->cur_cset and ->cur_task point to the current task position
			
 
				+	 * during iteration.
			
 
				+	 */
			
 
				+	struct list_head	*csets;
			
 
				+	struct css_set		*cur_cset;
			
 
				+	struct task_struct	*cur_task;
			
 
				+};
			
 
				+
			
 
				+/* migration context also tracks preloading */
			
 
				+struct cgroup_mgctx {
			
 
				+	/*
			
 
				+	 * Preloaded source and destination csets.  Used to guarantee
			
 
				+	 * atomic success or failure on actual migration.
			
 
				+	 */
			
 
				+	struct list_head	preloaded_src_csets;
			
 
				+	struct list_head	preloaded_dst_csets;
			
 
				+
			
 
				+	/* tasks and csets to migrate */
			
 
				+	struct cgroup_taskset	tset;
			
 
				+
			
 
				+	/* subsystems affected by migration */
			
 
				+	u16			ss_mask;
			
 
				+};
			
 
				+
			
 
				+#define CGROUP_TASKSET_INIT(tset)						\
			
 
				+{										\
			
 
				+	.src_csets		= LIST_HEAD_INIT(tset.src_csets),		\
			
 
				+	.dst_csets		= LIST_HEAD_INIT(tset.dst_csets),		\
			
 
				+	.csets			= &tset.src_csets,				\
			
 
				+}
			
 
				+
			
 
				+#define CGROUP_MGCTX_INIT(name)							\
			
 
				+{										\
			
 
				+	LIST_HEAD_INIT(name.preloaded_src_csets),				\
			
 
				+	LIST_HEAD_INIT(name.preloaded_dst_csets),				\
			
 
				+	CGROUP_TASKSET_INIT(name.tset),						\
			
 
				+}
			
 
				+
			
 
				+#define DEFINE_CGROUP_MGCTX(name)						\
			
 
				+	struct cgroup_mgctx name = CGROUP_MGCTX_INIT(name)
			
 
				+
			
 
				+struct cgroup_sb_opts {
			
 
				+	u16 subsys_mask;
			
 
				+	unsigned int flags;
			
 
				+	char *release_agent;
			
 
				+	bool cpuset_clone_children;
			
 
				+	char *name;
			
 
				+	/* User explicitly requested empty subsystem */
			
 
				+	bool none;
			
 
				+};
			
 
				+
			
 
				+extern struct mutex cgroup_mutex;
			
 
				+extern spinlock_t css_set_lock;
			
 
				+extern struct cgroup_subsys *cgroup_subsys[];
			
 
				+extern struct list_head cgroup_roots;
			
 
				+extern struct file_system_type cgroup_fs_type;
			
 
				+
			
 
				+/* iterate across the hierarchies */
			
 
				+#define for_each_root(root)						\
			
 
				+	list_for_each_entry((root), &cgroup_roots, root_list)
			
 
				+
			
 
				+/**
			
 
				+ * for_each_subsys - iterate all enabled cgroup subsystems
			
 
				+ * @ss: the iteration cursor
			
 
				+ * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end
			
 
				+ */
			
 
				+#define for_each_subsys(ss, ssid)					\
			
 
				+	for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT &&		\
			
 
				+	     (((ss) = cgroup_subsys[ssid]) || true); (ssid)++)
			
 
				+
			
 
				+static inline bool cgroup_is_dead(const struct cgroup *cgrp)
			
 
				+{
			
 
				+	return !(cgrp->self.flags & CSS_ONLINE);
			
 
				+}
			
 
				+
			
 
				+static inline bool notify_on_release(const struct cgroup *cgrp)
			
 
				+{
			
 
				+	return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
			
 
				+}
			
 
				+
			
 
				+void put_css_set_locked(struct css_set *cset);
			
 
				+
			
 
				+static inline void put_css_set(struct css_set *cset)
			
 
				+{
			
 
				+	unsigned long flags;
			
 
				+
			
 
				+	/*
			
 
				+	 * Ensure that the refcount doesn't hit zero while any readers
			
 
				+	 * can see it. Similar to atomic_dec_and_lock(), but for an
			
 
				+	 * rwlock
			
 
				+	 */
			
 
				+	if (atomic_add_unless(&cset->refcount, -1, 1))
			
 
				+		return;
			
 
				+
			
 
				+	spin_lock_irqsave(&css_set_lock, flags);
			
 
				+	put_css_set_locked(cset);
			
 
				+	spin_unlock_irqrestore(&css_set_lock, flags);
			
 
				+}
			
 
				+
			
 
				+/*
			
 
				+ * refcounted get/put for css_set objects
			
 
				+ */
			
 
				+static inline void get_css_set(struct css_set *cset)
			
 
				+{
			
 
				+	atomic_inc(&cset->refcount);
			
 
				+}
			
 
				+
			
 
				+bool cgroup_ssid_enabled(int ssid);
			
 
				+bool cgroup_on_dfl(const struct cgroup *cgrp);
			
 
				+
			
 
				+struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root);
			
 
				+struct cgroup *task_cgroup_from_root(struct task_struct *task,
			
 
				+				     struct cgroup_root *root);
			
 
				+struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn, bool drain_offline);
			
 
				+void cgroup_kn_unlock(struct kernfs_node *kn);
			
 
				+int cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen,
			
 
				+			  struct cgroup_namespace *ns);
			
 
				+
			
 
				+void cgroup_free_root(struct cgroup_root *root);
			
 
				+void init_cgroup_root(struct cgroup_root *root, struct cgroup_sb_opts *opts);
			
 
				+int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask);
			
 
				+int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask);
			
 
				+struct dentry *cgroup_do_mount(struct file_system_type *fs_type, int flags,
			
 
				+			       struct cgroup_root *root, unsigned long magic,
			
 
				+			       struct cgroup_namespace *ns);
			
 
				+
			
 
				+bool cgroup_may_migrate_to(struct cgroup *dst_cgrp);
			
 
				+void cgroup_migrate_finish(struct cgroup_mgctx *mgctx);
			
 
				+void cgroup_migrate_add_src(struct css_set *src_cset, struct cgroup *dst_cgrp,
			
 
				+			    struct cgroup_mgctx *mgctx);
			
 
				+int cgroup_migrate_prepare_dst(struct cgroup_mgctx *mgctx);
			
 
				+int cgroup_migrate(struct task_struct *leader, bool threadgroup,
			
 
				+		   struct cgroup_mgctx *mgctx);
			
 
				+
			
 
				+int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader,
			
 
				+		       bool threadgroup);
			
 
				+ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
			
 
				+			     size_t nbytes, loff_t off, bool threadgroup);
			
 
				+ssize_t cgroup_procs_write(struct kernfs_open_file *of, char *buf, size_t nbytes,
			
 
				+			   loff_t off);
			
 
				+
			
 
				+void cgroup_lock_and_drain_offline(struct cgroup *cgrp);
			
 
				+
			
 
				+int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode);
			
 
				+int cgroup_rmdir(struct kernfs_node *kn);
			
 
				+int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node,
			
 
				+		     struct kernfs_root *kf_root);
			
 
				+
			
 
				+/*
			
 
				+ * namespace.c
			
 
				+ */
			
 
				+extern const struct proc_ns_operations cgroupns_operations;
			
 
				+
			
 
				+/*
			
 
				+ * cgroup-v1.c
			
 
				+ */
			
 
				+extern struct cftype cgroup1_base_files[];
			
 
				+extern const struct file_operations proc_cgroupstats_operations;
			
 
				+extern struct kernfs_syscall_ops cgroup1_kf_syscall_ops;
			
 
				+
			
 
				+bool cgroup1_ssid_disabled(int ssid);
			
 
				+void cgroup1_pidlist_destroy_all(struct cgroup *cgrp);
			
 
				+void cgroup1_release_agent(struct work_struct *work);
			
 
				+void cgroup1_check_for_release(struct cgroup *cgrp);
			
 
				+struct dentry *cgroup1_mount(struct file_system_type *fs_type, int flags,
			
 
				+			     void *data, unsigned long magic,
			
 
				+			     struct cgroup_namespace *ns);
			
 
				+
			
 
				+#endif /* __CGROUP_INTERNAL_H */
			
--- a/kernel/cgroup/cgroup-v1.c
+++ b/kernel/cgroup/cgroup-v1.c
@@ -0,0 +1,1395 @@
 
				+#include "cgroup-internal.h"
			
 
				+
			
 
				+#include <linux/ctype.h>
			
 
				+#include <linux/kmod.h>
			
 
				+#include <linux/sort.h>
			
 
				+#include <linux/delay.h>
			
 
				+#include <linux/mm.h>
			
 
				+#include <linux/slab.h>
			
 
				+#include <linux/vmalloc.h>
			
 
				+#include <linux/delayacct.h>
			
 
				+#include <linux/pid_namespace.h>
			
 
				+#include <linux/cgroupstats.h>
			
 
				+
			
 
				+#include <trace/events/cgroup.h>
			
 
				+
			
 
				+/*
			
 
				+ * pidlists linger the following amount before being destroyed.  The goal
			
 
				+ * is avoiding frequent destruction in the middle of consecutive read calls.
			
 
				+ * Expiring in the middle is a performance problem not a correctness one.
			
 
				+ * 1 sec should be enough.
			
 
				+ */
			
 
				+#define CGROUP_PIDLIST_DESTROY_DELAY	HZ
			
 
				+
			
 
				+/* Controllers blocked by the commandline in v1 */
			
 
				+static u16 cgroup_no_v1_mask;
			
 
				+
			
 
				+/*
			
 
				+ * pidlist destructions need to be flushed on cgroup destruction.  Use a
			
 
				+ * separate workqueue as flush domain.
			
 
				+ */
			
 
				+static struct workqueue_struct *cgroup_pidlist_destroy_wq;
			
 
				+
			
 
				+/*
			
 
				+ * Protects cgroup_root->release_agent_path.  Modifying it also requires
			
 
				+ * cgroup_mutex.  Reading requires either cgroup_mutex or this spinlock.
			
 
				+ */
			
 
				+static DEFINE_SPINLOCK(release_agent_path_lock);
			
 
				+
			
 
				+bool cgroup1_ssid_disabled(int ssid)
			
 
				+{
			
 
				+	return cgroup_no_v1_mask & (1 << ssid);
			
 
				+}
			
 
				+
			
 
				+/**
			
 
				+ * cgroup_attach_task_all - attach task 'tsk' to all cgroups of task 'from'
			
 
				+ * @from: attach to all cgroups of a given task
			
 
				+ * @tsk: the task to be attached
			
 
				+ */
			
 
				+int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
			
 
				+{
			
 
				+	struct cgroup_root *root;
			
 
				+	int retval = 0;
			
 
				+
			
 
				+	mutex_lock(&cgroup_mutex);
			
 
				+	percpu_down_write(&cgroup_threadgroup_rwsem);
			
 
				+	for_each_root(root) {
			
 
				+		struct cgroup *from_cgrp;
			
 
				+
			
 
				+		if (root == &cgrp_dfl_root)
			
 
				+			continue;
			
 
				+
			
 
				+		spin_lock_irq(&css_set_lock);
			
 
				+		from_cgrp = task_cgroup_from_root(from, root);
			
 
				+		spin_unlock_irq(&css_set_lock);
			
 
				+
			
 
				+		retval = cgroup_attach_task(from_cgrp, tsk, false);
			
 
				+		if (retval)
			
 
				+			break;
			
 
				+	}
			
 
				+	percpu_up_write(&cgroup_threadgroup_rwsem);
			
 
				+	mutex_unlock(&cgroup_mutex);
			
 
				+
			
 
				+	return retval;
			
 
				+}
			
 
				+EXPORT_SYMBOL_GPL(cgroup_attach_task_all);
			
 
				+
			
 
				+/**
			
 
				+ * cgroup_transfer_tasks - move tasks from one cgroup to another
			
 
				+ * @to: cgroup to which the tasks will be moved
			
 
				+ * @from: cgroup in which the tasks currently reside
			
 
				+ *
			
 
				+ * Locking rules between cgroup_post_fork() and the migration path
			
 
				+ * guarantee that, if a task is forking while being migrated, the new child
			
 
				+ * is either visible in the source cgroup after the
			
 
				+ * parent's migration is complete or put into the target cgroup.  No task
			
 
				+ * can slip out of migration through forking.
			
 
				+ */
			
 
				+int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
			
 
				+{
			
 
				+	DEFINE_CGROUP_MGCTX(mgctx);
			
 
				+	struct cgrp_cset_link *link;
			
 
				+	struct css_task_iter it;
			
 
				+	struct task_struct *task;
			
 
				+	int ret;
			
 
				+
			
 
				+	if (cgroup_on_dfl(to))
			
 
				+		return -EINVAL;
			
 
				+
			
 
				+	if (!cgroup_may_migrate_to(to))
			
 
				+		return -EBUSY;
			
 
				+
			
 
				+	mutex_lock(&cgroup_mutex);
			
 
				+
			
 
				+	percpu_down_write(&cgroup_threadgroup_rwsem);
			
 
				+
			
 
				+	/* all tasks in @from are being moved, all csets are source */
			
 
				+	spin_lock_irq(&css_set_lock);
			
 
				+	list_for_each_entry(link, &from->cset_links, cset_link)
			
 
				+		cgroup_migrate_add_src(link->cset, to, &mgctx);
			
 
				+	spin_unlock_irq(&css_set_lock);
			
 
				+
			
 
				+	ret = cgroup_migrate_prepare_dst(&mgctx);
			
 
				+	if (ret)
			
 
				+		goto out_err;
			
 
				+
			
 
				+	/*
			
 
				+	 * Migrate tasks one-by-one until @from is empty.  This fails iff
			
 
				+	 * ->can_attach() fails.
			
 
				+	 */
			
 
				+	do {
			
 
				+		css_task_iter_start(&from->self, &it);
			
 
				+		task = css_task_iter_next(&it);
			
 
				+		if (task)
			
 
				+			get_task_struct(task);
			
 
				+		css_task_iter_end(&it);
			
 
				+
			
 
				+		if (task) {
			
 
				+			ret = cgroup_migrate(task, false, &mgctx);
			
 
				+			if (!ret)
			
 
				+				trace_cgroup_transfer_tasks(to, task, false);
			
 
				+			put_task_struct(task);
			
 
				+		}
			
 
				+	} while (task && !ret);
			
 
				+out_err:
			
 
				+	cgroup_migrate_finish(&mgctx);
			
 
				+	percpu_up_write(&cgroup_threadgroup_rwsem);
			
 
				+	mutex_unlock(&cgroup_mutex);
			
 
				+	return ret;
			
 
				+}
			
 
				+
			
 
				+/*
			
 
				+ * Stuff for reading the 'tasks'/'procs' files.
			
 
				+ *
			
 
				+ * Reading this file can return large amounts of data if a cgroup has
			
 
				+ * *lots* of attached tasks. So it may need several calls to read(),
			
 
				+ * but we cannot guarantee that the information we produce is correct
			
 
				+ * unless we produce it entirely atomically.
			
 
				+ *
			
 
				+ */
			
 
				+
			
 
				+/* which pidlist file are we talking about? */
			
 
				+enum cgroup_filetype {
			
 
				+	CGROUP_FILE_PROCS,
			
 
				+	CGROUP_FILE_TASKS,
			
 
				+};
			
 
				+
			
 
				+/*
			
 
				+ * A pidlist is a list of pids that virtually represents the contents of one
			
 
				+ * of the cgroup files ("procs" or "tasks"). We keep a list of such pidlists,
			
 
				+ * a pair (one each for procs, tasks) for each pid namespace that's relevant
			
 
				+ * to the cgroup.
			
 
				+ */
			
 
				+struct cgroup_pidlist {
			
 
				+	/*
			
 
				+	 * used to find which pidlist is wanted. doesn't change as long as
			
 
				+	 * this particular list stays in the list.
			
 
				+	*/
			
 
				+	struct { enum cgroup_filetype type; struct pid_namespace *ns; } key;
			
 
				+	/* array of pids (tasks) or tgids (procs) */
			
 
				+	pid_t *list;
			
 
				+	/* how many elements the above list has */
			
 
				+	int length;
			
 
				+	/* each of these stored in a list by its cgroup */
			
 
				+	struct list_head links;
			
 
				+	/* pointer to the cgroup we belong to, for list removal purposes */
			
 
				+	struct cgroup *owner;
			
 
				+	/* for delayed destruction */
			
 
				+	struct delayed_work destroy_dwork;
			
 
				+};
			
 
				+
			
 
				+/*
			
 
				+ * The following two functions "fix" the issue where there are more pids
			
 
				+ * than kmalloc will give memory for; in such cases, we use vmalloc/vfree.
			
 
				+ * TODO: replace with a kernel-wide solution to this problem
			
 
				+ */
			
 
				+#define PIDLIST_TOO_LARGE(c) ((c) * sizeof(pid_t) > (PAGE_SIZE * 2))
			
 
				+static void *pidlist_allocate(int count)
			
 
				+{
			
 
				+	if (PIDLIST_TOO_LARGE(count))
			
 
				+		return vmalloc(count * sizeof(pid_t));
			
 
				+	else
			
 
				+		return kmalloc(count * sizeof(pid_t), GFP_KERNEL);
			
 
				+}
			
 
				+
			
 
				+static void pidlist_free(void *p)
			
 
				+{
			
 
				+	kvfree(p);
			
 
				+}
			
 
				+
			
 
				+/*
			
 
				+ * Used to destroy all pidlists lingering waiting for destroy timer.  None
			
 
				+ * should be left afterwards.
			
 
				+ */
			
 
				+void cgroup1_pidlist_destroy_all(struct cgroup *cgrp)
			
 
				+{
			
 
				+	struct cgroup_pidlist *l, *tmp_l;
			
 
				+
			
 
				+	mutex_lock(&cgrp->pidlist_mutex);
			
 
				+	list_for_each_entry_safe(l, tmp_l, &cgrp->pidlists, links)
			
 
				+		mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork, 0);
			
 
				+	mutex_unlock(&cgrp->pidlist_mutex);
			
 
				+
			
 
				+	flush_workqueue(cgroup_pidlist_destroy_wq);
			
 
				+	BUG_ON(!list_empty(&cgrp->pidlists));
			
 
				+}
			
 
				+
			
 
				+static void cgroup_pidlist_destroy_work_fn(struct work_struct *work)
			
 
				+{
			
 
				+	struct delayed_work *dwork = to_delayed_work(work);
			
 
				+	struct cgroup_pidlist *l = container_of(dwork, struct cgroup_pidlist,
			
 
				+						destroy_dwork);
			
 
				+	struct cgroup_pidlist *tofree = NULL;
			
 
				+
			
 
				+	mutex_lock(&l->owner->pidlist_mutex);
			
 
				+
			
 
				+	/*
			
 
				+	 * Destroy iff we didn't get queued again.  The state won't change
			
 
				+	 * as destroy_dwork can only be queued while locked.
			
 
				+	 */
			
 
				+	if (!delayed_work_pending(dwork)) {
			
 
				+		list_del(&l->links);
			
 
				+		pidlist_free(l->list);
			
 
				+		put_pid_ns(l->key.ns);
			
 
				+		tofree = l;
			
 
				+	}
			
 
				+
			
 
				+	mutex_unlock(&l->owner->pidlist_mutex);
			
 
				+	kfree(tofree);
			
 
				+}
			
 
				+
			
 
				+/*
			
 
				+ * pidlist_uniq - given a sorted pid list, strip out all duplicate entries
			
 
				+ * Returns the number of unique elements.
			
 
				+ */
			
 
				+static int pidlist_uniq(pid_t *list, int length)
			
 
				+{
			
 
				+	int src, dest = 1;
			
 
				+
			
 
				+	/*
			
 
				+	 * we presume the 0th element is unique, so src starts at 1. trivial
			
 
				+	 * edge cases first; no work needs to be done for either
			
 
				+	 */
			
 
				+	if (length == 0 || length == 1)
			
 
				+		return length;
			
 
				+	/* src and dest walk down the list; dest counts unique elements */
			
 
				+	for (src = 1; src < length; src++) {
			
 
				+		/* find next unique element */
			
 
				+		while (list[src] == list[src-1]) {
			
 
				+			src++;
			
 
				+			if (src == length)
			
 
				+				goto after;
			
 
				+		}
			
 
				+		/* dest always points to where the next unique element goes */
			
 
				+		list[dest] = list[src];
			
 
				+		dest++;
			
 
				+	}
			
 
				+after:
			
 
				+	return dest;
			
 
				+}
			
 
				+
			
 
				+/*
			
 
				+ * The two pid files - tasks and cgroup.procs - guaranteed that the result
			
 
				+ * is sorted, which forced this whole pidlist fiasco.  As pid order is
			
 
				+ * different per namespace, each namespace needs differently sorted list,
			
 
				+ * making it impossible to use, for example, single rbtree of member tasks
			
 
				+ * sorted by task pointer.  As pidlists can be fairly large, allocating one
			
 
				+ * per open file is dangerous, so cgroup had to implement shared pool of
			
 
				+ * pidlists keyed by cgroup and namespace.
			
 
				+ */
			
 
				+static int cmppid(const void *a, const void *b)
			
 
				+{
			
 
				+	return *(pid_t *)a - *(pid_t *)b;
			
 
				+}
			
 
				+
			
 
				+static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
			
 
				+						  enum cgroup_filetype type)
			
 
				+{
			
 
				+	struct cgroup_pidlist *l;
			
 
				+	/* don't need task_nsproxy() if we're looking at ourself */
			
 
				+	struct pid_namespace *ns = task_active_pid_ns(current);
			
 
				+
			
 
				+	lockdep_assert_held(&cgrp->pidlist_mutex);
			
 
				+
			
 
				+	list_for_each_entry(l, &cgrp->pidlists, links)
			
 
				+		if (l->key.type == type && l->key.ns == ns)
			
 
				+			return l;
			
 
				+	return NULL;
			
 
				+}
			
 
				+
			
 
				+/*
			
 
				+ * find the appropriate pidlist for our purpose (given procs vs tasks)
			
 
				+ * for the current pid namespace, creating it if none exists yet.  The
			
 
				+ * caller must hold @cgrp->pidlist_mutex.  Returns NULL if we're out of
			
 
				+ * memory.
			
 
				+ */
			
 
				+static struct cgroup_pidlist *cgroup_pidlist_find_create(struct cgroup *cgrp,
			
 
				+						enum cgroup_filetype type)
			
 
				+{
			
 
				+	struct cgroup_pidlist *l;
			
 
				+
			
 
				+	lockdep_assert_held(&cgrp->pidlist_mutex);
			
 
				+
			
 
				+	l = cgroup_pidlist_find(cgrp, type);
			
 
				+	if (l)
			
 
				+		return l;
			
 
				+
			
 
				+	/* entry not found; create a new one */
			
 
				+	l = kzalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL);
			
 
				+	if (!l)
			
 
				+		return l;
			
 
				+
			
 
				+	INIT_DELAYED_WORK(&l->destroy_dwork, cgroup_pidlist_destroy_work_fn);
			
 
				+	l->key.type = type;
			
 
				+	/* don't need task_nsproxy() if we're looking at ourself */
			
 
				+	l->key.ns = get_pid_ns(task_active_pid_ns(current));
			
 
				+	l->owner = cgrp;
			
 
				+	list_add(&l->links, &cgrp->pidlists);
			
 
				+	return l;
			
 
				+}
			
 
				+
			
 
				+/**
			
 
				+ * cgroup_task_count - count the number of tasks in a cgroup.
			
 
				+ * @cgrp: the cgroup in question
			
 
				+ *
			
 
				+ * Return the number of tasks in the cgroup.  The returned number can be
			
 
				+ * higher than the actual number of tasks due to css_set references from
			
 
				+ * namespace roots and temporary usages.
			
 
				+ */
			
 
				+static int cgroup_task_count(const struct cgroup *cgrp)
			
 
				+{
			
 
				+	int count = 0;
			
 
				+	struct cgrp_cset_link *link;
			
 
				+
			
 
				+	spin_lock_irq(&css_set_lock);
			
 
				+	list_for_each_entry(link, &cgrp->cset_links, cset_link)
			
 
				+		count += atomic_read(&link->cset->refcount);
			
 
				+	spin_unlock_irq(&css_set_lock);
			
 
				+	return count;
			
 
				+}
			
 
				+
			
 
				+/*
			
 
				+ * Load a cgroup's pidarray with either procs' tgids or tasks' pids
			
 
				+ */
			
 
				+static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
			
 
				+			      struct cgroup_pidlist **lp)
			
 
				+{
			
 
				+	pid_t *array;
			
 
				+	int length;
			
 
				+	int pid, n = 0; /* used for populating the array */
			
 
				+	struct css_task_iter it;
			
 
				+	struct task_struct *tsk;
			
 
				+	struct cgroup_pidlist *l;
			
 
				+
			
 
				+	lockdep_assert_held(&cgrp->pidlist_mutex);
			
 
				+
			
 
				+	/*
			
 
				+	 * If cgroup gets more users after we read count, we won't have
			
 
				+	 * enough space - tough.  This race is indistinguishable to the
			
 
				+	 * caller from the case that the additional cgroup users didn't
			
 
				+	 * show up until sometime later on.
			
 
				+	 */
			
 
				+	length = cgroup_task_count(cgrp);
			
 
				+	array = pidlist_allocate(length);
			
 
				+	if (!array)
			
 
				+		return -ENOMEM;
			
 
				+	/* now, populate the array */
			
 
				+	css_task_iter_start(&cgrp->self, &it);
			
 
				+	while ((tsk = css_task_iter_next(&it))) {
			
 
				+		if (unlikely(n == length))
			
 
				+			break;
			
 
				+		/* get tgid or pid for procs or tasks file respectively */
			
 
				+		if (type == CGROUP_FILE_PROCS)
			
 
				+			pid = task_tgid_vnr(tsk);
			
 
				+		else
			
 
				+			pid = task_pid_vnr(tsk);
			
 
				+		if (pid > 0) /* make sure to only use valid results */
			
 
				+			array[n++] = pid;
			
 
				+	}
			
 
				+	css_task_iter_end(&it);
			
 
				+	length = n;
			
 
				+	/* now sort & (if procs) strip out duplicates */
			
 
				+	sort(array, length, sizeof(pid_t), cmppid, NULL);
			
 
				+	if (type == CGROUP_FILE_PROCS)
			
 
				+		length = pidlist_uniq(array, length);
			
 
				+
			
 
				+	l = cgroup_pidlist_find_create(cgrp, type);
			
 
				+	if (!l) {
			
 
				+		pidlist_free(array);
			
 
				+		return -ENOMEM;
			
 
				+	}
			
 
				+
			
 
				+	/* store array, freeing old if necessary */
			
 
				+	pidlist_free(l->list);
			
 
				+	l->list = array;
			
 
				+	l->length = length;
			
 
				+	*lp = l;
			
 
				+	return 0;
			
 
				+}
			
 
				+
			
 
				+/*
			
 
				+ * seq_file methods for the tasks/procs files. The seq_file position is the
			
 
				+ * next pid to display; the seq_file iterator is a pointer to the pid
			
 
				+ * in the cgroup->l->list array.
			
 
				+ */
			
 
				+
			
 
				+static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos)
			
 
				+{
			
 
				+	/*
			
 
				+	 * Initially we receive a position value that corresponds to
			
 
				+	 * one more than the last pid shown (or 0 on the first call or
			
 
				+	 * after a seek to the start). Use a binary-search to find the
			
 
				+	 * next pid to display, if any
			
 
				+	 */
			
 
				+	struct kernfs_open_file *of = s->private;
			
 
				+	struct cgroup *cgrp = seq_css(s)->cgroup;
			
 
				+	struct cgroup_pidlist *l;
			
 
				+	enum cgroup_filetype type = seq_cft(s)->private;
			
 
				+	int index = 0, pid = *pos;
			
 
				+	int *iter, ret;
			
 
				+
			
 
				+	mutex_lock(&cgrp->pidlist_mutex);
			
 
				+
			
 
				+	/*
			
 
				+	 * !NULL @of->priv indicates that this isn't the first start()
			
 
				+	 * after open.  If the matching pidlist is around, we can use that.
			
 
				+	 * Look for it.  Note that @of->priv can't be used directly.  It
			
 
				+	 * could already have been destroyed.
			
 
				+	 */
			
 
				+	if (of->priv)
			
 
				+		of->priv = cgroup_pidlist_find(cgrp, type);
			
 
				+
			
 
				+	/*
			
 
				+	 * Either this is the first start() after open or the matching
			
 
				+	 * pidlist has been destroyed in between.  Create a new one.
			
 
				+	 */
			
 
				+	if (!of->priv) {
			
 
				+		ret = pidlist_array_load(cgrp, type,
			
 
				+					 (struct cgroup_pidlist **)&of->priv);
			
 
				+		if (ret)
			
 
				+			return ERR_PTR(ret);
			
 
				+	}
			
 
				+	l = of->priv;
			
 
				+
			
 
				+	if (pid) {
			
 
				+		int end = l->length;
			
 
				+
			
 
				+		while (index < end) {
			
 
				+			int mid = (index + end) / 2;
			
 
				+			if (l->list[mid] == pid) {
			
 
				+				index = mid;
			
 
				+				break;
			
 
				+			} else if (l->list[mid] <= pid)
			
 
				+				index = mid + 1;
			
 
				+			else
			
 
				+				end = mid;
			
 
				+		}
			
 
				+	}
			
 
				+	/* If we're off the end of the array, we're done */
			
 
				+	if (index >= l->length)
			
 
				+		return NULL;
			
 
				+	/* Update the abstract position to be the actual pid that we found */
			
 
				+	iter = l->list + index;
			
 
				+	*pos = *iter;
			
 
				+	return iter;
			
 
				+}
			
 
				+
			
 
				+static void cgroup_pidlist_stop(struct seq_file *s, void *v)
			
 
				+{
			
 
				+	struct kernfs_open_file *of = s->private;
			
 
				+	struct cgroup_pidlist *l = of->priv;
			
 
				+
			
 
				+	if (l)
			
 
				+		mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork,
			
 
				+				 CGROUP_PIDLIST_DESTROY_DELAY);
			
 
				+	mutex_unlock(&seq_css(s)->cgroup->pidlist_mutex);
			
 
				+}
			
 
				+
			
 
				+static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos)
			
 
				+{
			
 
				+	struct kernfs_open_file *of = s->private;
			
 
				+	struct cgroup_pidlist *l = of->priv;
			
 
				+	pid_t *p = v;
			
 
				+	pid_t *end = l->list + l->length;
			
 
				+	/*
			
 
				+	 * Advance to the next pid in the array. If this goes off the
			
 
				+	 * end, we're done
			
 
				+	 */
			
 
				+	p++;
			
 
				+	if (p >= end) {
			
 
				+		return NULL;
			
 
				+	} else {
			
 
				+		*pos = *p;
			
 
				+		return p;
			
 
				+	}
			
 
				+}
			
 
				+
			
 
				+static int cgroup_pidlist_show(struct seq_file *s, void *v)
			
 
				+{
			
 
				+	seq_printf(s, "%d\n", *(int *)v);
			
 
				+
			
 
				+	return 0;
			
 
				+}
			
 
				+
			
 
				+static ssize_t cgroup_tasks_write(struct kernfs_open_file *of,
			
 
				+				  char *buf, size_t nbytes, loff_t off)
			
 
				+{
			
 
				+	return __cgroup_procs_write(of, buf, nbytes, off, false);
			
 
				+}
			
 
				+
			
 
				+static ssize_t cgroup_release_agent_write(struct kernfs_open_file *of,
			
 
				+					  char *buf, size_t nbytes, loff_t off)
			
 
				+{
			
 
				+	struct cgroup *cgrp;
			
 
				+
			
 
				+	BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX);
			
 
				+
			
 
				+	cgrp = cgroup_kn_lock_live(of->kn, false);
			
 
				+	if (!cgrp)
			
 
				+		return -ENODEV;
			
 
				+	spin_lock(&release_agent_path_lock);
			
 
				+	strlcpy(cgrp->root->release_agent_path, strstrip(buf),
			
 
				+		sizeof(cgrp->root->release_agent_path));
			
 
				+	spin_unlock(&release_agent_path_lock);
			
 
				+	cgroup_kn_unlock(of->kn);
			
 
				+	return nbytes;
			
 
				+}
			
 
				+
			
 
				+static int cgroup_release_agent_show(struct seq_file *seq, void *v)
			
 
				+{
			
 
				+	struct cgroup *cgrp = seq_css(seq)->cgroup;
			
 
				+
			
 
				+	spin_lock(&release_agent_path_lock);
			
 
				+	seq_puts(seq, cgrp->root->release_agent_path);
			
 
				+	spin_unlock(&release_agent_path_lock);
			
 
				+	seq_putc(seq, '\n');
			
 
				+	return 0;
			
 
				+}
			
 
				+
			
 
				+static int cgroup_sane_behavior_show(struct seq_file *seq, void *v)
			
 
				+{
			
 
				+	seq_puts(seq, "0\n");
			
 
				+	return 0;
			
 
				+}
			
 
				+
			
 
				+static u64 cgroup_read_notify_on_release(struct cgroup_subsys_state *css,
			
 
				+					 struct cftype *cft)
			
 
				+{
			
 
				+	return notify_on_release(css->cgroup);
			
 
				+}
			
 
				+
			
 
				+static int cgroup_write_notify_on_release(struct cgroup_subsys_state *css,
			
 
				+					  struct cftype *cft, u64 val)
			
 
				+{
			
 
				+	if (val)
			
 
				+		set_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags);
			
 
				+	else
			
 
				+		clear_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags);
			
 
				+	return 0;
			
 
				+}
			
 
				+
			
 
				+static u64 cgroup_clone_children_read(struct cgroup_subsys_state *css,
			
 
				+				      struct cftype *cft)
			
 
				+{
			
 
				+	return test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
			
 
				+}
			
 
				+
			
 
				+static int cgroup_clone_children_write(struct cgroup_subsys_state *css,
			
 
				+				       struct cftype *cft, u64 val)
			
 
				+{
			
 
				+	if (val)
			
 
				+		set_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
			
 
				+	else
			
 
				+		clear_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
			
 
				+	return 0;
			
 
				+}
			
 
				+
			
 
				+/* cgroup core interface files for the legacy hierarchies */
			
 
				+struct cftype cgroup1_base_files[] = {
			
 
				+	{
			
 
				+		.name = "cgroup.procs",
			
 
				+		.seq_start = cgroup_pidlist_start,
			
 
				+		.seq_next = cgroup_pidlist_next,
			
 
				+		.seq_stop = cgroup_pidlist_stop,
			
 
				+		.seq_show = cgroup_pidlist_show,
			
 
				+		.private = CGROUP_FILE_PROCS,
			
 
				+		.write = cgroup_procs_write,
			
 
				+	},
			
 
				+	{
			
 
				+		.name = "cgroup.clone_children",
			
 
				+		.read_u64 = cgroup_clone_children_read,
			
 
				+		.write_u64 = cgroup_clone_children_write,
			
 
				+	},
			
 
				+	{
			
 
				+		.name = "cgroup.sane_behavior",
			
 
				+		.flags = CFTYPE_ONLY_ON_ROOT,
			
 
				+		.seq_show = cgroup_sane_behavior_show,
			
 
				+	},
			
 
				+	{
			
 
				+		.name = "tasks",
			
 
				+		.seq_start = cgroup_pidlist_start,
			
 
				+		.seq_next = cgroup_pidlist_next,
			
 
				+		.seq_stop = cgroup_pidlist_stop,
			
 
				+		.seq_show = cgroup_pidlist_show,
			
 
				+		.private = CGROUP_FILE_TASKS,
			
 
				+		.write = cgroup_tasks_write,
			
 
				+	},
			
 
				+	{
			
 
				+		.name = "notify_on_release",
			
 
				+		.read_u64 = cgroup_read_notify_on_release,
			
 
				+		.write_u64 = cgroup_write_notify_on_release,
			
 
				+	},
			
 
				+	{
			
 
				+		.name = "release_agent",
			
 
				+		.flags = CFTYPE_ONLY_ON_ROOT,
			
 
				+		.seq_show = cgroup_release_agent_show,
			
 
				+		.write = cgroup_release_agent_write,
			
 
				+		.max_write_len = PATH_MAX - 1,
			
 
				+	},
			
 
				+	{ }	/* terminate */
			
 
				+};
			
 
				+
			
 
				+/* Display information about each subsystem and each hierarchy */
			
 
				+static int proc_cgroupstats_show(struct seq_file *m, void *v)
			
 
				+{
			
 
				+	struct cgroup_subsys *ss;
			
 
				+	int i;
			
 
				+
			
 
				+	seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n");
			
 
				+	/*
			
 
				+	 * ideally we don't want subsystems moving around while we do this.
			
 
				+	 * cgroup_mutex is also necessary to guarantee an atomic snapshot of
			
 
				+	 * subsys/hierarchy state.
			
 
				+	 */
			
 
				+	mutex_lock(&cgroup_mutex);
			
 
				+
			
 
				+	for_each_subsys(ss, i)
			
 
				+		seq_printf(m, "%s\t%d\t%d\t%d\n",
			
 
				+			   ss->legacy_name, ss->root->hierarchy_id,
			
 
				+			   atomic_read(&ss->root->nr_cgrps),
			
 
				+			   cgroup_ssid_enabled(i));
			
 
				+
			
 
				+	mutex_unlock(&cgroup_mutex);
			
 
				+	return 0;
			
 
				+}
			
 
				+
			
 
				+static int cgroupstats_open(struct inode *inode, struct file *file)
			
 
				+{
			
 
				+	return single_open(file, proc_cgroupstats_show, NULL);
			
 
				+}
			
 
				+
			
 
				+const struct file_operations proc_cgroupstats_operations = {
			
 
				+	.open = cgroupstats_open,
			
 
				+	.read = seq_read,
			
 
				+	.llseek = seq_lseek,
			
 
				+	.release = single_release,
			
 
				+};
			
 
				+
			
 
				+/**
			
 
				+ * cgroupstats_build - build and fill cgroupstats
			
 
				+ * @stats: cgroupstats to fill information into
			
 
				+ * @dentry: A dentry belonging to the cgroup for which stats have
			
 
				+ * been requested.
			
 
				+ *
			
 
				+ * Build and fill cgroupstats so that taskstats can export it to user
			
 
				+ * space.
			
 
				+ */
			
 
				+int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
			
 
				+{
			
 
				+	struct kernfs_node *kn = kernfs_node_from_dentry(dentry);
			
 
				+	struct cgroup *cgrp;
			
 
				+	struct css_task_iter it;
			
 
				+	struct task_struct *tsk;
			
 
				+
			
 
				+	/* it should be a kernfs_node belonging to cgroupfs and be a directory */
			
 
				+	if (dentry->d_sb->s_type != &cgroup_fs_type || !kn ||
			
 
				+	    kernfs_type(kn) != KERNFS_DIR)
			
 
				+		return -EINVAL;
			
 
				+
			
 
				+	mutex_lock(&cgroup_mutex);
			
 
				+
			
 
				+	/*
			
 
				+	 * We aren't being called from kernfs and there's no guarantee on
			
 
				+	 * @kn->priv's validity.  For this and css_tryget_online_from_dir(),
			
 
				+	 * @kn->priv is RCU safe.  Let's do the RCU dancing.
			
 
				+	 */
			
 
				+	rcu_read_lock();
			
 
				+	cgrp = rcu_dereference(*(void __rcu __force **)&kn->priv);
			
 
				+	if (!cgrp || cgroup_is_dead(cgrp)) {
			
 
				+		rcu_read_unlock();
			
 
				+		mutex_unlock(&cgroup_mutex);
			
 
				+		return -ENOENT;
			
 
				+	}
			
 
				+	rcu_read_unlock();
			
 
				+
			
 
				+	css_task_iter_start(&cgrp->self, &it);
			
 
				+	while ((tsk = css_task_iter_next(&it))) {
			
 
				+		switch (tsk->state) {
			
 
				+		case TASK_RUNNING:
			
 
				+			stats->nr_running++;
			
 
				+			break;
			
 
				+		case TASK_INTERRUPTIBLE:
			
 
				+			stats->nr_sleeping++;
			
 
				+			break;
			
 
				+		case TASK_UNINTERRUPTIBLE:
			
 
				+			stats->nr_uninterruptible++;
			
 
				+			break;
			
 
				+		case TASK_STOPPED:
			
 
				+			stats->nr_stopped++;
			
 
				+			break;
			
 
				+		default:
			
 
				+			if (delayacct_is_task_waiting_on_io(tsk))
			
 
				+				stats->nr_io_wait++;
			
 
				+			break;
			
 
				+		}
			
 
				+	}
			
 
				+	css_task_iter_end(&it);
			
 
				+
			
 
				+	mutex_unlock(&cgroup_mutex);
			
 
				+	return 0;
			
 
				+}
			
 
				+
			
 
				+void cgroup1_check_for_release(struct cgroup *cgrp)
			
 
				+{
			
 
				+	if (notify_on_release(cgrp) && !cgroup_is_populated(cgrp) &&
			
 
				+	    !css_has_online_children(&cgrp->self) && !cgroup_is_dead(cgrp))
			
 
				+		schedule_work(&cgrp->release_agent_work);
			
 
				+}
			
 
				+
			
 
				+/*
			
 
				+ * Notify userspace when a cgroup is released, by running the
			
 
				+ * configured release agent with the name of the cgroup (path
			
 
				+ * relative to the root of cgroup file system) as the argument.
			
 
				+ *
			
 
				+ * Most likely, this user command will try to rmdir this cgroup.
			
 
				+ *
			
 
				+ * This races with the possibility that some other task will be
			
 
				+ * attached to this cgroup before it is removed, or that some other
			
 
				+ * user task will 'mkdir' a child cgroup of this cgroup.  That's ok.
			
 
				+ * The presumed 'rmdir' will fail quietly if this cgroup is no longer
			
 
				+ * unused, and this cgroup will be reprieved from its death sentence,
			
 
				+ * to continue to serve a useful existence.  Next time it's released,
			
 
				+ * we will get notified again, if it still has 'notify_on_release' set.
			
 
				+ *
			
 
				+ * The final arg to call_usermodehelper() is UMH_WAIT_EXEC, which
			
 
				+ * means only wait until the task is successfully execve()'d.  The
			
 
				+ * separate release agent task is forked by call_usermodehelper(),
			
 
				+ * then control in this thread returns here, without waiting for the
			
 
				+ * release agent task.  We don't bother to wait because the caller of
			
 
				+ * this routine has no use for the exit status of the release agent
			
 
				+ * task, so no sense holding our caller up for that.
			
 
				+ */
			
 
				+void cgroup1_release_agent(struct work_struct *work)
			
 
				+{
			
 
				+	struct cgroup *cgrp =
			
 
				+		container_of(work, struct cgroup, release_agent_work);
			
 
				+	char *pathbuf = NULL, *agentbuf = NULL;
			
 
				+	char *argv[3], *envp[3];
			
 
				+	int ret;
			
 
				+
			
 
				+	mutex_lock(&cgroup_mutex);
			
 
				+
			
 
				+	pathbuf = kmalloc(PATH_MAX, GFP_KERNEL);
			
 
				+	agentbuf = kstrdup(cgrp->root->release_agent_path, GFP_KERNEL);
			
 
				+	if (!pathbuf || !agentbuf)
			
 
				+		goto out;
			
 
				+
			
 
				+	spin_lock_irq(&css_set_lock);
			
 
				+	ret = cgroup_path_ns_locked(cgrp, pathbuf, PATH_MAX, &init_cgroup_ns);
			
 
				+	spin_unlock_irq(&css_set_lock);
			
 
				+	if (ret < 0 || ret >= PATH_MAX)
			
 
				+		goto out;
			
 
				+
			
 
				+	argv[0] = agentbuf;
			
 
				+	argv[1] = pathbuf;
			
 
				+	argv[2] = NULL;
			
 
				+
			
 
				+	/* minimal command environment */
			
 
				+	envp[0] = "HOME=/";
			
 
				+	envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
			
 
				+	envp[2] = NULL;
			
 
				+
			
 
				+	mutex_unlock(&cgroup_mutex);
			
 
				+	call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
			
 
				+	goto out_free;
			
 
				+out:
			
 
				+	mutex_unlock(&cgroup_mutex);
			
 
				+out_free:
			
 
				+	kfree(agentbuf);
			
 
				+	kfree(pathbuf);
			
 
				+}
			
 
				+
			
 
				+/*
			
 
				+ * cgroup1_rename - Only allow simple rename of directories in place.
			
 
				+ */
			
 
				+static int cgroup1_rename(struct kernfs_node *kn, struct kernfs_node *new_parent,
			
 
				+			  const char *new_name_str)
			
 
				+{
			
 
				+	struct cgroup *cgrp = kn->priv;
			
 
				+	int ret;
			
 
				+
			
 
				+	if (kernfs_type(kn) != KERNFS_DIR)
			
 
				+		return -ENOTDIR;
			
 
				+	if (kn->parent != new_parent)
			
 
				+		return -EIO;
			
 
				+
			
 
				+	/*
			
 
				+	 * We're gonna grab cgroup_mutex which nests outside kernfs
			
 
				+	 * active_ref.  kernfs_rename() doesn't require active_ref
			
 
				+	 * protection.  Break them before grabbing cgroup_mutex.
			
 
				+	 */
			
 
				+	kernfs_break_active_protection(new_parent);
			
 
				+	kernfs_break_active_protection(kn);
			
 
				+
			
 
				+	mutex_lock(&cgroup_mutex);
			
 
				+
			
 
				+	ret = kernfs_rename(kn, new_parent, new_name_str);
			
 
				+	if (!ret)
			
 
				+		trace_cgroup_rename(cgrp);
			
 
				+
			
 
				+	mutex_unlock(&cgroup_mutex);
			
 
				+
			
 
				+	kernfs_unbreak_active_protection(kn);
			
 
				+	kernfs_unbreak_active_protection(new_parent);
			
 
				+	return ret;
			
 
				+}
			
 
				+
			
 
				+static int cgroup1_show_options(struct seq_file *seq, struct kernfs_root *kf_root)
			
 
				+{
			
 
				+	struct cgroup_root *root = cgroup_root_from_kf(kf_root);
			
 
				+	struct cgroup_subsys *ss;
			
 
				+	int ssid;
			
 
				+
			
 
				+	for_each_subsys(ss, ssid)
			
 
				+		if (root->subsys_mask & (1 << ssid))
			
 
				+			seq_show_option(seq, ss->legacy_name, NULL);
			
 
				+	if (root->flags & CGRP_ROOT_NOPREFIX)
			
 
				+		seq_puts(seq, ",noprefix");
			
 
				+	if (root->flags & CGRP_ROOT_XATTR)
			
 
				+		seq_puts(seq, ",xattr");
			
 
				+
			
 
				+	spin_lock(&release_agent_path_lock);
			
 
				+	if (strlen(root->release_agent_path))
			
 
				+		seq_show_option(seq, "release_agent",
			
 
				+				root->release_agent_path);
			
 
				+	spin_unlock(&release_agent_path_lock);
			
 
				+
			
 
				+	if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags))
			
 
				+		seq_puts(seq, ",clone_children");
			
 
				+	if (strlen(root->name))
			
 
				+		seq_show_option(seq, "name", root->name);
			
 
				+	return 0;
			
 
				+}
			
 
				+
			
 
				+static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
			
 
				+{
			
 
				+	char *token, *o = data;
			
 
				+	bool all_ss = false, one_ss = false;
			
 
				+	u16 mask = U16_MAX;
			
 
				+	struct cgroup_subsys *ss;
			
 
				+	int nr_opts = 0;
			
 
				+	int i;
			
 
				+
			
 
				+#ifdef CONFIG_CPUSETS
			
 
				+	mask = ~((u16)1 << cpuset_cgrp_id);
			
 
				+#endif
			
 
				+
			
 
				+	memset(opts, 0, sizeof(*opts));
			
 
				+
			
 
				+	while ((token = strsep(&o, ",")) != NULL) {
			
 
				+		nr_opts++;
			
 
				+
			
 
				+		if (!*token)
			
 
				+			return -EINVAL;
			
 
				+		if (!strcmp(token, "none")) {
			
 
				+			/* Explicitly have no subsystems */
			
 
				+			opts->none = true;
			
 
				+			continue;
			
 
				+		}
			
 
				+		if (!strcmp(token, "all")) {
			
 
				+			/* Mutually exclusive option 'all' + subsystem name */
			
 
				+			if (one_ss)
			
 
				+				return -EINVAL;
			
 
				+			all_ss = true;
			
 
				+			continue;
			
 
				+		}
			
 
				+		if (!strcmp(token, "noprefix")) {
			
 
				+			opts->flags |= CGRP_ROOT_NOPREFIX;
			
 
				+			continue;
			
 
				+		}
			
 
				+		if (!strcmp(token, "clone_children")) {
			
 
				+			opts->cpuset_clone_children = true;
			
 
				+			continue;
			
 
				+		}
			
 
				+		if (!strcmp(token, "xattr")) {
			
 
				+			opts->flags |= CGRP_ROOT_XATTR;
			
 
				+			continue;
			
 
				+		}
			
 
				+		if (!strncmp(token, "release_agent=", 14)) {
			
 
				+			/* Specifying two release agents is forbidden */
			
 
				+			if (opts->release_agent)
			
 
				+				return -EINVAL;
			
 
				+			opts->release_agent =
			
 
				+				kstrndup(token + 14, PATH_MAX - 1, GFP_KERNEL);
			
 
				+			if (!opts->release_agent)
			
 
				+				return -ENOMEM;
			
 
				+			continue;
			
 
				+		}
			
 
				+		if (!strncmp(token, "name=", 5)) {
			
 
				+			const char *name = token + 5;
			
 
				+			/* Can't specify an empty name */
			
 
				+			if (!strlen(name))
			
 
				+				return -EINVAL;
			
 
				+			/* Must match [\w.-]+ */
			
 
				+			for (i = 0; i < strlen(name); i++) {
			
 
				+				char c = name[i];
			
 
				+				if (isalnum(c))
			
 
				+					continue;
			
 
				+				if ((c == '.') || (c == '-') || (c == '_'))
			
 
				+					continue;
			
 
				+				return -EINVAL;
			
 
				+			}
			
 
				+			/* Specifying two names is forbidden */
			
 
				+			if (opts->name)
			
 
				+				return -EINVAL;
			
 
				+			opts->name = kstrndup(name,
			
 
				+					      MAX_CGROUP_ROOT_NAMELEN - 1,
			
 
				+					      GFP_KERNEL);
			
 
				+			if (!opts->name)
			
 
				+				return -ENOMEM;
			
 
				+
			
 
				+			continue;
			
 
				+		}
			
 
				+
			
 
				+		for_each_subsys(ss, i) {
			
 
				+			if (strcmp(token, ss->legacy_name))
			
 
				+				continue;
			
 
				+			if (!cgroup_ssid_enabled(i))
			
 
				+				continue;
			
 
				+			if (cgroup1_ssid_disabled(i))
			
 
				+				continue;
			
 
				+
			
 
				+			/* Mutually exclusive option 'all' + subsystem name */
			
 
				+			if (all_ss)
			
 
				+				return -EINVAL;
			
 
				+			opts->subsys_mask |= (1 << i);
			
 
				+			one_ss = true;
			
 
				+
			
 
				+			break;
			
 
				+		}
			
 
				+		if (i == CGROUP_SUBSYS_COUNT)
			
 
				+			return -ENOENT;
			
 
				+	}
			
 
				+
			
 
				+	/*
			
 
				+	 * If the 'all' option was specified select all the subsystems,
			
 
				+	 * otherwise if 'none', 'name=' and a subsystem name options were
			
 
				+	 * not specified, let's default to 'all'
			
 
				+	 */
			
 
				+	if (all_ss || (!one_ss && !opts->none && !opts->name))
			
 
				+		for_each_subsys(ss, i)
			
 
				+			if (cgroup_ssid_enabled(i) && !cgroup1_ssid_disabled(i))
			
 
				+				opts->subsys_mask |= (1 << i);
			
 
				+
			
 
				+	/*
			
 
				+	 * We either have to specify by name or by subsystems. (So all
			
 
				+	 * empty hierarchies must have a name).
			
 
				+	 */
			
 
				+	if (!opts->subsys_mask && !opts->name)
			
 
				+		return -EINVAL;
			
 
				+
			
 
				+	/*
			
 
				+	 * Option noprefix was introduced just for backward compatibility
			
 
				+	 * with the old cpuset, so we allow noprefix only if mounting just
			
 
				+	 * the cpuset subsystem.
			
 
				+	 */
			
 
				+	if ((opts->flags & CGRP_ROOT_NOPREFIX) && (opts->subsys_mask & mask))
			
 
				+		return -EINVAL;
			
 
				+
			
 
				+	/* Can't specify "none" and some subsystems */
			
 
				+	if (opts->subsys_mask && opts->none)
			
 
				+		return -EINVAL;
			
 
				+
			
 
				+	return 0;
			
 
				+}
			
 
				+
			
 
				+static int cgroup1_remount(struct kernfs_root *kf_root, int *flags, char *data)
			
 
				+{
			
 
				+	int ret = 0;
			
 
				+	struct cgroup_root *root = cgroup_root_from_kf(kf_root);
			
 
				+	struct cgroup_sb_opts opts;
			
 
				+	u16 added_mask, removed_mask;
			
 
				+
			
 
				+	cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp);
			
 
				+
			
 
				+	/* See what subsystems are wanted */
			
 
				+	ret = parse_cgroupfs_options(data, &opts);
			
 
				+	if (ret)
			
 
				+		goto out_unlock;
			
 
				+
			
 
				+	if (opts.subsys_mask != root->subsys_mask || opts.release_agent)
			
 
				+		pr_warn("option changes via remount are deprecated (pid=%d comm=%s)\n",
			
 
				+			task_tgid_nr(current), current->comm);
			
 
				+
			
 
				+	added_mask = opts.subsys_mask & ~root->subsys_mask;
			
 
				+	removed_mask = root->subsys_mask & ~opts.subsys_mask;
			
 
				+
			
 
				+	/* Don't allow flags or name to change at remount */
			
 
				+	if ((opts.flags ^ root->flags) ||
			
 
				+	    (opts.name && strcmp(opts.name, root->name))) {
			
 
				+		pr_err("option or name mismatch, new: 0x%x \"%s\", old: 0x%x \"%s\"\n",
			
 
				+		       opts.flags, opts.name ?: "", root->flags, root->name);
			
 
				+		ret = -EINVAL;
			
 
				+		goto out_unlock;
			
 
				+	}
			
 
				+
			
 
				+	/* remounting is not allowed for populated hierarchies */
			
 
				+	if (!list_empty(&root->cgrp.self.children)) {
			
 
				+		ret = -EBUSY;
			
 
				+		goto out_unlock;
			
 
				+	}
			
 
				+
			
 
				+	ret = rebind_subsystems(root, added_mask);
			
 
				+	if (ret)
			
 
				+		goto out_unlock;
			
 
				+
			
 
				+	WARN_ON(rebind_subsystems(&cgrp_dfl_root, removed_mask));
			
 
				+
			
 
				+	if (opts.release_agent) {
			
 
				+		spin_lock(&release_agent_path_lock);
			
 
				+		strcpy(root->release_agent_path, opts.release_agent);
			
 
				+		spin_unlock(&release_agent_path_lock);
			
 
				+	}
			
 
				+
			
 
				+	trace_cgroup_remount(root);
			
 
				+
			
 
				+ out_unlock:
			
 
				+	kfree(opts.release_agent);
			
 
				+	kfree(opts.name);
			
 
				+	mutex_unlock(&cgroup_mutex);
			
 
				+	return ret;
			
 
				+}
			
 
				+
			
 
				+struct kernfs_syscall_ops cgroup1_kf_syscall_ops = {
			
 
				+	.rename			= cgroup1_rename,
			
 
				+	.show_options		= cgroup1_show_options,
			
 
				+	.remount_fs		= cgroup1_remount,
			
 
				+	.mkdir			= cgroup_mkdir,
			
 
				+	.rmdir			= cgroup_rmdir,
			
 
				+	.show_path		= cgroup_show_path,
			
 
				+};
			
 
				+
			
 
				+struct dentry *cgroup1_mount(struct file_system_type *fs_type, int flags,
			
 
				+			     void *data, unsigned long magic,
			
 
				+			     struct cgroup_namespace *ns)
			
 
				+{
			
 
				+	struct super_block *pinned_sb = NULL;
			
 
				+	struct cgroup_sb_opts opts;
			
 
				+	struct cgroup_root *root;
			
 
				+	struct cgroup_subsys *ss;
			
 
				+	struct dentry *dentry;
			
 
				+	int i, ret;
			
 
				+
			
 
				+	cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp);
			
 
				+
			
 
				+	/* First find the desired set of subsystems */
			
 
				+	ret = parse_cgroupfs_options(data, &opts);
			
 
				+	if (ret)
			
 
				+		goto out_unlock;
			
 
				+
			
 
				+	/*
			
 
				+	 * Destruction of cgroup root is asynchronous, so subsystems may
			
 
				+	 * still be dying after the previous unmount.  Let's drain the
			
 
				+	 * dying subsystems.  We just need to ensure that the ones
			
 
				+	 * unmounted previously finish dying and don't care about new ones
			
 
				+	 * starting.  Testing ref liveness is good enough.
			
 
				+	 */
			
 
				+	for_each_subsys(ss, i) {
			
 
				+		if (!(opts.subsys_mask & (1 << i)) ||
			
 
				+		    ss->root == &cgrp_dfl_root)
			
 
				+			continue;
			
 
				+
			
 
				+		if (!percpu_ref_tryget_live(&ss->root->cgrp.self.refcnt)) {
			
 
				+			mutex_unlock(&cgroup_mutex);
			
 
				+			msleep(10);
			
 
				+			ret = restart_syscall();
			
 
				+			goto out_free;
			
 
				+		}
			
 
				+		cgroup_put(&ss->root->cgrp);
			
 
				+	}
			
 
				+
			
 
				+	for_each_root(root) {
			
 
				+		bool name_match = false;
			
 
				+
			
 
				+		if (root == &cgrp_dfl_root)
			
 
				+			continue;
			
 
				+
			
 
				+		/*
			
 
				+		 * If we asked for a name then it must match.  Also, if
			
 
				+		 * name matches but subsys_mask doesn't, we should fail.
			
 
				+		 * Remember whether name matched.
			
 
				+		 */
			
 
				+		if (opts.name) {
			
 
				+			if (strcmp(opts.name, root->name))
			
 
				+				continue;
			
 
				+			name_match = true;
			
 
				+		}
			
 
				+
			
 
				+		/*
			
 
				+		 * If we asked for subsystems (or explicitly for no
			
 
				+		 * subsystems) then they must match.
			
 
				+		 */
			
 
				+		if ((opts.subsys_mask || opts.none) &&
			
 
				+		    (opts.subsys_mask != root->subsys_mask)) {
			
 
				+			if (!name_match)
			
 
				+				continue;
			
 
				+			ret = -EBUSY;
			
 
				+			goto out_unlock;
			
 
				+		}
			
 
				+
			
 
				+		if (root->flags ^ opts.flags)
			
 
				+			pr_warn("new mount options do not match the existing superblock, will be ignored\n");
			
 
				+
			
 
				+		/*
			
 
				+		 * We want to reuse @root whose lifetime is governed by its
			
 
				+		 * ->cgrp.  Let's check whether @root is alive and keep it
			
 
				+		 * that way.  As cgroup_kill_sb() can happen anytime, we
			
 
				+		 * want to block it by pinning the sb so that @root doesn't
			
 
				+		 * get killed before mount is complete.
			
 
				+		 *
			
 
				+		 * With the sb pinned, tryget_live can reliably indicate
			
 
				+		 * whether @root can be reused.  If it's being killed,
			
 
				+		 * drain it.  We can use wait_queue for the wait but this
			
 
				+		 * path is super cold.  Let's just sleep a bit and retry.
			
 
				+		 */
			
 
				+		pinned_sb = kernfs_pin_sb(root->kf_root, NULL);
			
 
				+		if (IS_ERR(pinned_sb) ||
			
 
				+		    !percpu_ref_tryget_live(&root->cgrp.self.refcnt)) {
			
 
				+			mutex_unlock(&cgroup_mutex);
			
 
				+			if (!IS_ERR_OR_NULL(pinned_sb))
			
 
				+				deactivate_super(pinned_sb);
			
 
				+			msleep(10);
			
 
				+			ret = restart_syscall();
			
 
				+			goto out_free;
			
 
				+		}
			
 
				+
			
 
				+		ret = 0;
			
 
				+		goto out_unlock;
			
 
				+	}
			
 
				+
			
 
				+	/*
			
 
				+	 * No such thing, create a new one.  name= matching without subsys
			
 
				+	 * specification is allowed for already existing hierarchies but we
			
 
				+	 * can't create new one without subsys specification.
			
 
				+	 */
			
 
				+	if (!opts.subsys_mask && !opts.none) {
			
 
				+		ret = -EINVAL;
			
 
				+		goto out_unlock;
			
 
				+	}
			
 
				+
			
 
				+	/* Hierarchies may only be created in the initial cgroup namespace. */
			
 
				+	if (ns != &init_cgroup_ns) {
			
 
				+		ret = -EPERM;
			
 
				+		goto out_unlock;
			
 
				+	}
			
 
				+
			
 
				+	root = kzalloc(sizeof(*root), GFP_KERNEL);
			
 
				+	if (!root) {
			
 
				+		ret = -ENOMEM;
			
 
				+		goto out_unlock;
			
 
				+	}
			
 
				+
			
 
				+	init_cgroup_root(root, &opts);
			
 
				+
			
 
				+	ret = cgroup_setup_root(root, opts.subsys_mask);
			
 
				+	if (ret)
			
 
				+		cgroup_free_root(root);
			
 
				+
			
 
				+out_unlock:
			
 
				+	mutex_unlock(&cgroup_mutex);
			
 
				+out_free:
			
 
				+	kfree(opts.release_agent);
			
 
				+	kfree(opts.name);
			
 
				+
			
 
				+	if (ret)
			
 
				+		return ERR_PTR(ret);
			
 
				+
			
 
				+	dentry = cgroup_do_mount(&cgroup_fs_type, flags, root,
			
 
				+				 CGROUP_SUPER_MAGIC, ns);
			
 
				+
			
 
				+	/*
			
 
				+	 * If @pinned_sb, we're reusing an existing root and holding an
			
 
				+	 * extra ref on its sb.  Mount is complete.  Put the extra ref.
			
 
				+	 */
			
 
				+	if (pinned_sb)
			
 
				+		deactivate_super(pinned_sb);
			
 
				+
			
 
				+	return dentry;
			
 
				+}
			
 
				+
			
 
				+static int __init cgroup1_wq_init(void)
			
 
				+{
			
 
				+	/*
			
 
				+	 * Used to destroy pidlists; kept separate to serve as a flush domain.
			
 
				+	 * Cap @max_active to 1 too.
			
 
				+	 */
			
 
				+	cgroup_pidlist_destroy_wq = alloc_workqueue("cgroup_pidlist_destroy",
			
 
				+						    0, 1);
			
 
				+	BUG_ON(!cgroup_pidlist_destroy_wq);
			
 
				+	return 0;
			
 
				+}
			
 
				+core_initcall(cgroup1_wq_init);
			
 
				+
			
 
				+static int __init cgroup_no_v1(char *str)
			
 
				+{
			
 
				+	struct cgroup_subsys *ss;
			
 
				+	char *token;
			
 
				+	int i;
			
 
				+
			
 
				+	while ((token = strsep(&str, ",")) != NULL) {
			
 
				+		if (!*token)
			
 
				+			continue;
			
 
				+
			
 
				+		if (!strcmp(token, "all")) {
			
 
				+			cgroup_no_v1_mask = U16_MAX;
			
 
				+			break;
			
 
				+		}
			
 
				+
			
 
				+		for_each_subsys(ss, i) {
			
 
				+			if (strcmp(token, ss->name) &&
			
 
				+			    strcmp(token, ss->legacy_name))
			
 
				+				continue;
			
 
				+
			
 
				+			cgroup_no_v1_mask |= 1 << i;
			
 
				+		}
			
 
				+	}
			
 
				+	return 1;
			
 
				+}
			
 
				+__setup("cgroup_no_v1=", cgroup_no_v1);
			
 
				+
			
 
				+
			
 
				+#ifdef CONFIG_CGROUP_DEBUG
			
 
				+static struct cgroup_subsys_state *
			
 
				+debug_css_alloc(struct cgroup_subsys_state *parent_css)
			
 
				+{
			
 
				+	struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL);
			
 
				+
			
 
				+	if (!css)
			
 
				+		return ERR_PTR(-ENOMEM);
			
 
				+
			
 
				+	return css;
			
 
				+}
			
 
				+
			
 
				+static void debug_css_free(struct cgroup_subsys_state *css)
			
 
				+{
			
 
				+	kfree(css);
			
 
				+}
			
 
				+
			
 
				+static u64 debug_taskcount_read(struct cgroup_subsys_state *css,
			
 
				+				struct cftype *cft)
			
 
				+{
			
 
				+	return cgroup_task_count(css->cgroup);
			
 
				+}
			
 
				+
			
 
				+static u64 current_css_set_read(struct cgroup_subsys_state *css,
			
 
				+				struct cftype *cft)
			
 
				+{
			
 
				+	return (u64)(unsigned long)current->cgroups;
			
 
				+}
			
 
				+
			
 
				+static u64 current_css_set_refcount_read(struct cgroup_subsys_state *css,
			
 
				+					 struct cftype *cft)
			
 
				+{
			
 
				+	u64 count;
			
 
				+
			
 
				+	rcu_read_lock();
			
 
				+	count = atomic_read(&task_css_set(current)->refcount);
			
 
				+	rcu_read_unlock();
			
 
				+	return count;
			
 
				+}
			
 
				+
			
 
				+static int current_css_set_cg_links_read(struct seq_file *seq, void *v)
			
 
				+{
			
 
				+	struct cgrp_cset_link *link;
			
 
				+	struct css_set *cset;
			
 
				+	char *name_buf;
			
 
				+
			
 
				+	name_buf = kmalloc(NAME_MAX + 1, GFP_KERNEL);
			
 
				+	if (!name_buf)
			
 
				+		return -ENOMEM;
			
 
				+
			
 
				+	spin_lock_irq(&css_set_lock);
			
 
				+	rcu_read_lock();
			
 
				+	cset = rcu_dereference(current->cgroups);
			
 
				+	list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
			
 
				+		struct cgroup *c = link->cgrp;
			
 
				+
			
 
				+		cgroup_name(c, name_buf, NAME_MAX + 1);
			
 
				+		seq_printf(seq, "Root %d group %s\n",
			
 
				+			   c->root->hierarchy_id, name_buf);
			
 
				+	}
			
 
				+	rcu_read_unlock();
			
 
				+	spin_unlock_irq(&css_set_lock);
			
 
				+	kfree(name_buf);
			
 
				+	return 0;
			
 
				+}
			
 
				+
			
 
				+#define MAX_TASKS_SHOWN_PER_CSS 25
			
 
				+static int cgroup_css_links_read(struct seq_file *seq, void *v)
			
 
				+{
			
 
				+	struct cgroup_subsys_state *css = seq_css(seq);
			
 
				+	struct cgrp_cset_link *link;
			
 
				+
			
 
				+	spin_lock_irq(&css_set_lock);
			
 
				+	list_for_each_entry(link, &css->cgroup->cset_links, cset_link) {
			
 
				+		struct css_set *cset = link->cset;
			
 
				+		struct task_struct *task;
			
 
				+		int count = 0;
			
 
				+
			
 
				+		seq_printf(seq, "css_set %p\n", cset);
			
 
				+
			
 
				+		list_for_each_entry(task, &cset->tasks, cg_list) {
			
 
				+			if (count++ > MAX_TASKS_SHOWN_PER_CSS)
			
 
				+				goto overflow;
			
 
				+			seq_printf(seq, "  task %d\n", task_pid_vnr(task));
			
 
				+		}
			
 
				+
			
 
				+		list_for_each_entry(task, &cset->mg_tasks, cg_list) {
			
 
				+			if (count++ > MAX_TASKS_SHOWN_PER_CSS)
			
 
				+				goto overflow;
			
 
				+			seq_printf(seq, "  task %d\n", task_pid_vnr(task));
			
 
				+		}
			
 
				+		continue;
			
 
				+	overflow:
			
 
				+		seq_puts(seq, "  ...\n");
			
 
				+	}
			
 
				+	spin_unlock_irq(&css_set_lock);
			
 
				+	return 0;
			
 
				+}
			
 
				+
			
 
				+static u64 releasable_read(struct cgroup_subsys_state *css, struct cftype *cft)
			
 
				+{
			
 
				+	return (!cgroup_is_populated(css->cgroup) &&
			
 
				+		!css_has_online_children(&css->cgroup->self));
			
 
				+}
			
 
				+
			
 
				+static struct cftype debug_files[] =  {
			
 
				+	{
			
 
				+		.name = "taskcount",
			
 
				+		.read_u64 = debug_taskcount_read,
			
 
				+	},
			
 
				+
			
 
				+	{
			
 
				+		.name = "current_css_set",
			
 
				+		.read_u64 = current_css_set_read,
			
 
				+	},
			
 
				+
			
 
				+	{
			
 
				+		.name = "current_css_set_refcount",
			
 
				+		.read_u64 = current_css_set_refcount_read,
			
 
				+	},
			
 
				+
			
 
				+	{
			
 
				+		.name = "current_css_set_cg_links",
			
 
				+		.seq_show = current_css_set_cg_links_read,
			
 
				+	},
			
 
				+
			
 
				+	{
			
 
				+		.name = "cgroup_css_links",
			
 
				+		.seq_show = cgroup_css_links_read,
			
 
				+	},
			
 
				+
			
 
				+	{
			
 
				+		.name = "releasable",
			
 
				+		.read_u64 = releasable_read,
			
 
				+	},
			
 
				+
			
 
				+	{ }	/* terminate */
			
 
				+};
			
 
				+
			
 
				+struct cgroup_subsys debug_cgrp_subsys = {
			
 
				+	.css_alloc = debug_css_alloc,
			
 
				+	.css_free = debug_css_free,
			
 
				+	.legacy_cftypes = debug_files,
			
 
				+};
			
 
				+#endif /* CONFIG_CGROUP_DEBUG */
			
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -28,15 +28,13 @@
 
				 
			
 
				 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
			
 
				 
			
 
				-#include <linux/cgroup.h>
			
 
				+#include "cgroup-internal.h"
			
 
				+
			
 
				 #include <linux/cred.h>
			
 
				-#include <linux/ctype.h>
			
 
				 #include <linux/errno.h>
			
 
				 #include <linux/init_task.h>
			
 
				 #include <linux/kernel.h>
			
 
				-#include <linux/list.h>
			
 
				 #include <linux/magic.h>
			
 
				-#include <linux/mm.h>
			
 
				 #include <linux/mutex.h>
			
 
				 #include <linux/mount.h>
			
 
				 #include <linux/pagemap.h>
			
@@ -47,16 +45,9 @@
 
				 #include <linux/spinlock.h>
			
 
				 #include <linux/percpu-rwsem.h>
			
 
				 #include <linux/string.h>
			
 
				-#include <linux/sort.h>
			
 
				-#include <linux/kmod.h>
			
 
				-#include <linux/delayacct.h>
			
 
				-#include <linux/cgroupstats.h>
			
 
				 #include <linux/hashtable.h>
			
 
				-#include <linux/pid_namespace.h>
			
 
				 #include <linux/idr.h>
			
 
				-#include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */
			
 
				 #include <linux/kthread.h>
			
 
				-#include <linux/delay.h>
			
 
				 #include <linux/atomic.h>
			
 
				 #include <linux/cpuset.h>
			
 
				 #include <linux/proc_ns.h>
			
@@ -67,14 +58,6 @@
 
				 #define CREATE_TRACE_POINTS
			
 
				 #include <trace/events/cgroup.h>
			
 
				 
			
 
				-/*
			
 
				- * pidlists linger the following amount before being destroyed.  The goal
			
 
				- * is avoiding frequent destruction in the middle of consecutive read calls
			
 
				- * Expiring in the middle is a performance problem not a correctness one.
			
 
				- * 1 sec should be enough.
			
 
				- */
			
 
				-#define CGROUP_PIDLIST_DESTROY_DELAY	HZ
			
 
				-
			
 
				 #define CGROUP_FILE_NAME_MAX		(MAX_CGROUP_TYPE_NAMELEN +	\
			
 
				 					 MAX_CFTYPE_NAME + 2)
			
 
				 
			
@@ -88,14 +71,12 @@
 
				  * These locks are exported if CONFIG_PROVE_RCU so that accessors in
			
 
				  * cgroup.h can use them for lockdep annotations.
			
 
				  */
			
 
				-#ifdef CONFIG_PROVE_RCU
			
 
				 DEFINE_MUTEX(cgroup_mutex);
			
 
				 DEFINE_SPINLOCK(css_set_lock);
			
 
				+
			
 
				+#ifdef CONFIG_PROVE_RCU
			
 
				 EXPORT_SYMBOL_GPL(cgroup_mutex);
			
 
				 EXPORT_SYMBOL_GPL(css_set_lock);
			
 
				-#else
			
 
				-static DEFINE_MUTEX(cgroup_mutex);
			
 
				-static DEFINE_SPINLOCK(css_set_lock);
			
 
				 #endif
			
 
				 
			
 
				 /*
			
@@ -110,12 +91,6 @@ static DEFINE_SPINLOCK(cgroup_idr_lock);
 
				  */
			
 
				 static DEFINE_SPINLOCK(cgroup_file_kn_lock);
			
 
				 
			
 
				-/*
			
 
				- * Protects cgroup_subsys->release_agent_path.  Modifying it also requires
			
 
				- * cgroup_mutex.  Reading requires either cgroup_mutex or this spinlock.
			
 
				- */
			
 
				-static DEFINE_SPINLOCK(release_agent_path_lock);
			
 
				-
			
 
				 struct percpu_rw_semaphore cgroup_threadgroup_rwsem;
			
 
				 
			
 
				 #define cgroup_assert_mutex_or_rcu_locked()				\
			
@@ -131,15 +106,9 @@ struct percpu_rw_semaphore cgroup_threadgroup_rwsem;
 
				  */
			
 
				 static struct workqueue_struct *cgroup_destroy_wq;
			
 
				 
			
 
				-/*
			
 
				- * pidlist destructions need to be flushed on cgroup destruction.  Use a
			
 
				- * separate workqueue as flush domain.
			
 
				- */
			
 
				-static struct workqueue_struct *cgroup_pidlist_destroy_wq;
			
 
				-
			
 
				 /* generate an array of cgroup subsystem pointers */
			
 
				 #define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys,
			
 
				-static struct cgroup_subsys *cgroup_subsys[] = {
			
 
				+struct cgroup_subsys *cgroup_subsys[] = {
			
 
				 #include <linux/cgroup_subsys.h>
			
 
				 };
			
 
				 #undef SUBSYS
			
@@ -186,18 +155,14 @@ EXPORT_SYMBOL_GPL(cgrp_dfl_root);
 
				  */
			
 
				 static bool cgrp_dfl_visible;
			
 
				 
			
 
				-/* Controllers blocked by the commandline in v1 */
			
 
				-static u16 cgroup_no_v1_mask;
			
 
				-
			
 
				 /* some controllers are not supported in the default hierarchy */
			
 
				 static u16 cgrp_dfl_inhibit_ss_mask;
			
 
				 
			
 
				 /* some controllers are implicitly enabled on the default hierarchy */
			
 
				-static unsigned long cgrp_dfl_implicit_ss_mask;
			
 
				+static u16 cgrp_dfl_implicit_ss_mask;
			
 
				 
			
 
				 /* The list of hierarchy roots */
			
 
				-
			
 
				-static LIST_HEAD(cgroup_roots);
			
 
				+LIST_HEAD(cgroup_roots);
			
 
				 static int cgroup_root_count;
			
 
				 
			
 
				 /* hierarchy ID allocation and mapping, protected by cgroup_mutex */
			
@@ -213,13 +178,13 @@ static DEFINE_IDR(cgroup_hierarchy_idr);
 
				 static u64 css_serial_nr_next = 1;
			
 
				 
			
 
				 /*
			
 
				- * These bitmask flags indicate whether tasks in the fork and exit paths have
			
 
				- * fork/exit handlers to call. This avoids us having to do extra work in the
			
 
				- * fork/exit path to check which subsystems have fork/exit callbacks.
			
 
				+ * These bitmasks identify subsystems with specific features to avoid
			
 
				+ * having to do iterative checks repeatedly.
			
 
				  */
			
 
				 static u16 have_fork_callback __read_mostly;
			
 
				 static u16 have_exit_callback __read_mostly;
			
 
				 static u16 have_free_callback __read_mostly;
			
 
				+static u16 have_canfork_callback __read_mostly;
			
 
				 
			
 
				 /* cgroup namespace for init task */
			
 
				 struct cgroup_namespace init_cgroup_ns = {
			
@@ -230,15 +195,9 @@ struct cgroup_namespace init_cgroup_ns = {
 
				 	.root_cset	= &init_css_set,
			
 
				 };
			
 
				 
			
 
				-/* Ditto for the can_fork callback. */
			
 
				-static u16 have_canfork_callback __read_mostly;
			
 
				-
			
 
				 static struct file_system_type cgroup2_fs_type;
			
 
				-static struct cftype cgroup_dfl_base_files[];
			
 
				-static struct cftype cgroup_legacy_base_files[];
			
 
				+static struct cftype cgroup_base_files[];
			
 
				 
			
 
				-static int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask);
			
 
				-static void cgroup_lock_and_drain_offline(struct cgroup *cgrp);
			
 
				 static int cgroup_apply_control(struct cgroup *cgrp);
			
 
				 static void cgroup_finalize_control(struct cgroup *cgrp, int ret);
			
 
				 static void css_task_iter_advance(struct css_task_iter *it);
			
@@ -259,7 +218,7 @@ static int cgroup_addrm_files(struct cgroup_subsys_state *css,
 
				  * is fine for individual subsystems but unsuitable for cgroup core.  This
			
 
				  * is slower static_key_enabled() based test indexed by @ssid.
			
 
				  */
			
 
				-static bool cgroup_ssid_enabled(int ssid)
			
 
				+bool cgroup_ssid_enabled(int ssid)
			
 
				 {
			
 
				 	if (CGROUP_SUBSYS_COUNT == 0)
			
 
				 		return false;
			
@@ -267,11 +226,6 @@ static bool cgroup_ssid_enabled(int ssid)
 
				 	return static_key_enabled(cgroup_subsys_enabled_key[ssid]);
			
 
				 }
			
 
				 
			
 
				-static bool cgroup_ssid_no_v1(int ssid)
			
 
				-{
			
 
				-	return cgroup_no_v1_mask & (1 << ssid);
			
 
				-}
			
 
				-
			
 
				 /**
			
 
				  * cgroup_on_dfl - test whether a cgroup is on the default hierarchy
			
 
				  * @cgrp: the cgroup of interest
			
@@ -325,7 +279,7 @@ static bool cgroup_ssid_no_v1(int ssid)
 
				  *
			
 
				  * - debug: disallowed on the default hierarchy.
			
 
				  */
			
 
				-static bool cgroup_on_dfl(const struct cgroup *cgrp)
			
 
				+bool cgroup_on_dfl(const struct cgroup *cgrp)
			
 
				 {
			
 
				 	return cgrp->root == &cgrp_dfl_root;
			
 
				 }
			
@@ -481,12 +435,6 @@ out_unlock:
 
				 	return css;
			
 
				 }
			
 
				 
			
 
				-/* convenient tests for these bits */
			
 
				-static inline bool cgroup_is_dead(const struct cgroup *cgrp)
			
 
				-{
			
 
				-	return !(cgrp->self.flags & CSS_ONLINE);
			
 
				-}
			
 
				-
			
 
				 static void cgroup_get(struct cgroup *cgrp)
			
 
				 {
			
 
				 	WARN_ON_ONCE(cgroup_is_dead(cgrp));
			
@@ -518,11 +466,6 @@ struct cgroup_subsys_state *of_css(struct kernfs_open_file *of)
 
				 }
			
 
				 EXPORT_SYMBOL_GPL(of_css);
			
 
				 
			
 
				-static int notify_on_release(const struct cgroup *cgrp)
			
 
				-{
			
 
				-	return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
			
 
				-}
			
 
				-
			
 
				 /**
			
 
				  * for_each_css - iterate all css's of a cgroup
			
 
				  * @css: the iteration cursor
			
@@ -552,15 +495,6 @@ static int notify_on_release(const struct cgroup *cgrp)
 
				 			;						\
			
 
				 		else
			
 
				 
			
 
				-/**
			
 
				- * for_each_subsys - iterate all enabled cgroup subsystems
			
 
				- * @ss: the iteration cursor
			
 
				- * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end
			
 
				- */
			
 
				-#define for_each_subsys(ss, ssid)					\
			
 
				-	for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT &&		\
			
 
				-	     (((ss) = cgroup_subsys[ssid]) || true); (ssid)++)
			
 
				-
			
 
				 /**
			
 
				  * do_each_subsys_mask - filter for_each_subsys with a bitmask
			
 
				  * @ss: the iteration cursor
			
@@ -585,10 +519,6 @@ static int notify_on_release(const struct cgroup *cgrp)
 
				 	}								\
			
 
				 } while (false)
			
 
				 
			
 
				-/* iterate across the hierarchies */
			
 
				-#define for_each_root(root)						\
			
 
				-	list_for_each_entry((root), &cgroup_roots, root_list)
			
 
				-
			
 
				 /* iterate over child cgrps, lock should be held throughout iteration */
			
 
				 #define cgroup_for_each_live_child(child, cgrp)				\
			
 
				 	list_for_each_entry((child), &(cgrp)->self.children, self.sibling) \
			
@@ -615,29 +545,6 @@ static int notify_on_release(const struct cgroup *cgrp)
 
				 			;						\
			
 
				 		else
			
 
				 
			
 
				-static void cgroup_release_agent(struct work_struct *work);
			
 
				-static void check_for_release(struct cgroup *cgrp);
			
 
				-
			
 
				-/*
			
 
				- * A cgroup can be associated with multiple css_sets as different tasks may
			
 
				- * belong to different cgroups on different hierarchies.  In the other
			
 
				- * direction, a css_set is naturally associated with multiple cgroups.
			
 
				- * This M:N relationship is represented by the following link structure
			
 
				- * which exists for each association and allows traversing the associations
			
 
				- * from both sides.
			
 
				- */
			
 
				-struct cgrp_cset_link {
			
 
				-	/* the cgroup and css_set this link associates */
			
 
				-	struct cgroup		*cgrp;
			
 
				-	struct css_set		*cset;
			
 
				-
			
 
				-	/* list of cgrp_cset_links anchored at cgrp->cset_links */
			
 
				-	struct list_head	cset_link;
			
 
				-
			
 
				-	/* list of cgrp_cset_links anchored at css_set->cgrp_links */
			
 
				-	struct list_head	cgrp_link;
			
 
				-};
			
 
				-
			
 
				 /*
			
 
				  * The default css_set - used by init and its children prior to any
			
 
				  * hierarchies being mounted. It contains a pointer to the root state
			
@@ -647,12 +554,12 @@ struct cgrp_cset_link {
 
				  */
			
 
				 struct css_set init_css_set = {
			
 
				 	.refcount		= ATOMIC_INIT(1),
			
 
				-	.cgrp_links		= LIST_HEAD_INIT(init_css_set.cgrp_links),
			
 
				 	.tasks			= LIST_HEAD_INIT(init_css_set.tasks),
			
 
				 	.mg_tasks		= LIST_HEAD_INIT(init_css_set.mg_tasks),
			
 
				+	.task_iters		= LIST_HEAD_INIT(init_css_set.task_iters),
			
 
				+	.cgrp_links		= LIST_HEAD_INIT(init_css_set.cgrp_links),
			
 
				 	.mg_preload_node	= LIST_HEAD_INIT(init_css_set.mg_preload_node),
			
 
				 	.mg_node		= LIST_HEAD_INIT(init_css_set.mg_node),
			
 
				-	.task_iters		= LIST_HEAD_INIT(init_css_set.task_iters),
			
 
				 };
			
 
				 
			
 
				 static int css_set_count	= 1;	/* 1 for init_css_set */
			
@@ -699,7 +606,7 @@ static void cgroup_update_populated(struct cgroup *cgrp, bool populated)
 
				 		if (!trigger)
			
 
				 			break;
			
 
				 
			
 
				-		check_for_release(cgrp);
			
 
				+		cgroup1_check_for_release(cgrp);
			
 
				 		cgroup_file_notify(&cgrp->events_file);
			
 
				 
			
 
				 		cgrp = cgroup_parent(cgrp);
			
@@ -808,7 +715,7 @@ static unsigned long css_set_hash(struct cgroup_subsys_state *css[])
 
				 	return key;
			
 
				 }
			
 
				 
			
 
				-static void put_css_set_locked(struct css_set *cset)
			
 
				+void put_css_set_locked(struct css_set *cset)
			
 
				 {
			
 
				 	struct cgrp_cset_link *link, *tmp_link;
			
 
				 	struct cgroup_subsys *ss;
			
@@ -838,31 +745,6 @@ static void put_css_set_locked(struct css_set *cset)
 
				 	kfree_rcu(cset, rcu_head);
			
 
				 }
			
 
				 
			
 
				-static void put_css_set(struct css_set *cset)
			
 
				-{
			
 
				-	unsigned long flags;
			
 
				-
			
 
				-	/*
			
 
				-	 * Ensure that the refcount doesn't hit zero while any readers
			
 
				-	 * can see it. Similar to atomic_dec_and_lock(), but for an
			
 
				-	 * rwlock
			
 
				-	 */
			
 
				-	if (atomic_add_unless(&cset->refcount, -1, 1))
			
 
				-		return;
			
 
				-
			
 
				-	spin_lock_irqsave(&css_set_lock, flags);
			
 
				-	put_css_set_locked(cset);
			
 
				-	spin_unlock_irqrestore(&css_set_lock, flags);
			
 
				-}
			
 
				-
			
 
				-/*
			
 
				- * refcounted get/put for css_set objects
			
 
				- */
			
 
				-static inline void get_css_set(struct css_set *cset)
			
 
				-{
			
 
				-	atomic_inc(&cset->refcount);
			
 
				-}
			
 
				-
			
 
				 /**
			
 
				  * compare_css_sets - helper function for find_existing_css_set().
			
 
				  * @cset: candidate css_set being tested
			
@@ -1095,13 +977,13 @@ static struct css_set *find_css_set(struct css_set *old_cset,
 
				 	}
			
 
				 
			
 
				 	atomic_set(&cset->refcount, 1);
			
 
				-	INIT_LIST_HEAD(&cset->cgrp_links);
			
 
				 	INIT_LIST_HEAD(&cset->tasks);
			
 
				 	INIT_LIST_HEAD(&cset->mg_tasks);
			
 
				-	INIT_LIST_HEAD(&cset->mg_preload_node);
			
 
				-	INIT_LIST_HEAD(&cset->mg_node);
			
 
				 	INIT_LIST_HEAD(&cset->task_iters);
			
 
				 	INIT_HLIST_NODE(&cset->hlist);
			
 
				+	INIT_LIST_HEAD(&cset->cgrp_links);
			
 
				+	INIT_LIST_HEAD(&cset->mg_preload_node);
			
 
				+	INIT_LIST_HEAD(&cset->mg_node);
			
 
				 
			
 
				 	/* Copy the set of subsystem state objects generated in
			
 
				 	 * find_existing_css_set() */
			
@@ -1138,7 +1020,7 @@ static struct css_set *find_css_set(struct css_set *old_cset,
 
				 	return cset;
			
 
				 }
			
 
				 
			
 
				-static struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root)
			
 
				+struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root)
			
 
				 {
			
 
				 	struct cgroup *root_cgrp = kf_root->kn->priv;
			
 
				 
			
@@ -1166,7 +1048,7 @@ static void cgroup_exit_root_id(struct cgroup_root *root)
 
				 	idr_remove(&cgroup_hierarchy_idr, root->hierarchy_id);
			
 
				 }
			
 
				 
			
 
				-static void cgroup_free_root(struct cgroup_root *root)
			
 
				+void cgroup_free_root(struct cgroup_root *root)
			
 
				 {
			
 
				 	if (root) {
			
 
				 		idr_destroy(&root->cgroup_idr);
			
@@ -1283,8 +1165,8 @@ static struct cgroup *cset_cgroup_from_root(struct css_set *cset,
 
				  * Return the cgroup for "task" from the given hierarchy. Must be
			
 
				  * called with cgroup_mutex and css_set_lock held.
			
 
				  */
			
 
				-static struct cgroup *task_cgroup_from_root(struct task_struct *task,
			
 
				-					    struct cgroup_root *root)
			
 
				+struct cgroup *task_cgroup_from_root(struct task_struct *task,
			
 
				+				     struct cgroup_root *root)
			
 
				 {
			
 
				 	/*
			
 
				 	 * No need to lock the task - since we hold cgroup_mutex the
			
@@ -1321,7 +1203,6 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task,
 
				  */
			
 
				 
			
 
				 static struct kernfs_syscall_ops cgroup_kf_syscall_ops;
			
 
				-static const struct file_operations proc_cgroupstats_operations;
			
 
				 
			
 
				 static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft,
			
 
				 			      char *buf)
			
@@ -1415,7 +1296,7 @@ static u16 cgroup_calc_subtree_ss_mask(u16 subtree_control, u16 this_ss_mask)
 
				  * inaccessible any time.  If the caller intends to continue to access the
			
 
				  * cgroup, it should pin it before invoking this function.
			
 
				  */
			
 
				-static void cgroup_kn_unlock(struct kernfs_node *kn)
			
 
				+void cgroup_kn_unlock(struct kernfs_node *kn)
			
 
				 {
			
 
				 	struct cgroup *cgrp;
			
 
				 
			
@@ -1447,8 +1328,7 @@ static void cgroup_kn_unlock(struct kernfs_node *kn)
 
				  * locking under kernfs active protection and allows all kernfs operations
			
 
				  * including self-removal.
			
 
				  */
			
 
				-static struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn,
			
 
				-					  bool drain_offline)
			
 
				+struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn, bool drain_offline)
			
 
				 {
			
 
				 	struct cgroup *cgrp;
			
 
				 
			
@@ -1532,9 +1412,9 @@ static int css_populate_dir(struct cgroup_subsys_state *css)
 
				 
			
 
				 	if (!css->ss) {
			
 
				 		if (cgroup_on_dfl(cgrp))
			
 
				-			cfts = cgroup_dfl_base_files;
			
 
				+			cfts = cgroup_base_files;
			
 
				 		else
			
 
				-			cfts = cgroup_legacy_base_files;
			
 
				+			cfts = cgroup1_base_files;
			
 
				 
			
 
				 		return cgroup_addrm_files(&cgrp->self, cgrp, cfts, true);
			
 
				 	}
			
@@ -1559,7 +1439,7 @@ err:
 
				 	return ret;
			
 
				 }
			
 
				 
			
 
				-static int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask)
			
 
				+int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask)
			
 
				 {
			
 
				 	struct cgroup *dcgrp = &dst_root->cgrp;
			
 
				 	struct cgroup_subsys *ss;
			
@@ -1629,8 +1509,8 @@ static int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask)
 
				 	return 0;
			
 
				 }
			
 
				 
			
 
				-static int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node,
			
 
				-			    struct kernfs_root *kf_root)
			
 
				+int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node,
			
 
				+		     struct kernfs_root *kf_root)
			
 
				 {
			
 
				 	int len = 0;
			
 
				 	char *buf = NULL;
			
@@ -1656,237 +1536,10 @@ static int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node,
 
				 	return len;
			
 
				 }
			
 
				 
			
 
				-static int cgroup_show_options(struct seq_file *seq,
			
 
				-			       struct kernfs_root *kf_root)
			
 
				-{
			
 
				-	struct cgroup_root *root = cgroup_root_from_kf(kf_root);
			
 
				-	struct cgroup_subsys *ss;
			
 
				-	int ssid;
			
 
				-
			
 
				-	if (root != &cgrp_dfl_root)
			
 
				-		for_each_subsys(ss, ssid)
			
 
				-			if (root->subsys_mask & (1 << ssid))
			
 
				-				seq_show_option(seq, ss->legacy_name, NULL);
			
 
				-	if (root->flags & CGRP_ROOT_NOPREFIX)
			
 
				-		seq_puts(seq, ",noprefix");
			
 
				-	if (root->flags & CGRP_ROOT_XATTR)
			
 
				-		seq_puts(seq, ",xattr");
			
 
				-
			
 
				-	spin_lock(&release_agent_path_lock);
			
 
				-	if (strlen(root->release_agent_path))
			
 
				-		seq_show_option(seq, "release_agent",
			
 
				-				root->release_agent_path);
			
 
				-	spin_unlock(&release_agent_path_lock);
			
 
				-
			
 
				-	if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags))
			
 
				-		seq_puts(seq, ",clone_children");
			
 
				-	if (strlen(root->name))
			
 
				-		seq_show_option(seq, "name", root->name);
			
 
				-	return 0;
			
 
				-}
			
 
				-
			
 
				-struct cgroup_sb_opts {
			
 
				-	u16 subsys_mask;
			
 
				-	unsigned int flags;
			
 
				-	char *release_agent;
			
 
				-	bool cpuset_clone_children;
			
 
				-	char *name;
			
 
				-	/* User explicitly requested empty subsystem */
			
 
				-	bool none;
			
 
				-};
			
 
				-
			
 
				-static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
			
 
				-{
			
 
				-	char *token, *o = data;
			
 
				-	bool all_ss = false, one_ss = false;
			
 
				-	u16 mask = U16_MAX;
			
 
				-	struct cgroup_subsys *ss;
			
 
				-	int nr_opts = 0;
			
 
				-	int i;
			
 
				-
			
 
				-#ifdef CONFIG_CPUSETS
			
 
				-	mask = ~((u16)1 << cpuset_cgrp_id);
			
 
				-#endif
			
 
				-
			
 
				-	memset(opts, 0, sizeof(*opts));
			
 
				-
			
 
				-	while ((token = strsep(&o, ",")) != NULL) {
			
 
				-		nr_opts++;
			
 
				-
			
 
				-		if (!*token)
			
 
				-			return -EINVAL;
			
 
				-		if (!strcmp(token, "none")) {
			
 
				-			/* Explicitly have no subsystems */
			
 
				-			opts->none = true;
			
 
				-			continue;
			
 
				-		}
			
 
				-		if (!strcmp(token, "all")) {
			
 
				-			/* Mutually exclusive option 'all' + subsystem name */
			
 
				-			if (one_ss)
			
 
				-				return -EINVAL;
			
 
				-			all_ss = true;
			
 
				-			continue;
			
 
				-		}
			
 
				-		if (!strcmp(token, "noprefix")) {
			
 
				-			opts->flags |= CGRP_ROOT_NOPREFIX;
			
 
				-			continue;
			
 
				-		}
			
 
				-		if (!strcmp(token, "clone_children")) {
			
 
				-			opts->cpuset_clone_children = true;
			
 
				-			continue;
			
 
				-		}
			
 
				-		if (!strcmp(token, "xattr")) {
			
 
				-			opts->flags |= CGRP_ROOT_XATTR;
			
 
				-			continue;
			
 
				-		}
			
 
				-		if (!strncmp(token, "release_agent=", 14)) {
			
 
				-			/* Specifying two release agents is forbidden */
			
 
				-			if (opts->release_agent)
			
 
				-				return -EINVAL;
			
 
				-			opts->release_agent =
			
 
				-				kstrndup(token + 14, PATH_MAX - 1, GFP_KERNEL);
			
 
				-			if (!opts->release_agent)
			
 
				-				return -ENOMEM;
			
 
				-			continue;
			
 
				-		}
			
 
				-		if (!strncmp(token, "name=", 5)) {
			
 
				-			const char *name = token + 5;
			
 
				-			/* Can't specify an empty name */
			
 
				-			if (!strlen(name))
			
 
				-				return -EINVAL;
			
 
				-			/* Must match [\w.-]+ */
			
 
				-			for (i = 0; i < strlen(name); i++) {
			
 
				-				char c = name[i];
			
 
				-				if (isalnum(c))
			
 
				-					continue;
			
 
				-				if ((c == '.') || (c == '-') || (c == '_'))
			
 
				-					continue;
			
 
				-				return -EINVAL;
			
 
				-			}
			
 
				-			/* Specifying two names is forbidden */
			
 
				-			if (opts->name)
			
 
				-				return -EINVAL;
			
 
				-			opts->name = kstrndup(name,
			
 
				-					      MAX_CGROUP_ROOT_NAMELEN - 1,
			
 
				-					      GFP_KERNEL);
			
 
				-			if (!opts->name)
			
 
				-				return -ENOMEM;
			
 
				-
			
 
				-			continue;
			
 
				-		}
			
 
				-
			
 
				-		for_each_subsys(ss, i) {
			
 
				-			if (strcmp(token, ss->legacy_name))
			
 
				-				continue;
			
 
				-			if (!cgroup_ssid_enabled(i))
			
 
				-				continue;
			
 
				-			if (cgroup_ssid_no_v1(i))
			
 
				-				continue;
			
 
				-
			
 
				-			/* Mutually exclusive option 'all' + subsystem name */
			
 
				-			if (all_ss)
			
 
				-				return -EINVAL;
			
 
				-			opts->subsys_mask |= (1 << i);
			
 
				-			one_ss = true;
			
 
				-
			
 
				-			break;
			
 
				-		}
			
 
				-		if (i == CGROUP_SUBSYS_COUNT)
			
 
				-			return -ENOENT;
			
 
				-	}
			
 
				-
			
 
				-	/*
			
 
				-	 * If the 'all' option was specified select all the subsystems,
			
 
				-	 * otherwise if 'none', 'name=' and a subsystem name options were
			
 
				-	 * not specified, let's default to 'all'
			
 
				-	 */
			
 
				-	if (all_ss || (!one_ss && !opts->none && !opts->name))
			
 
				-		for_each_subsys(ss, i)
			
 
				-			if (cgroup_ssid_enabled(i) && !cgroup_ssid_no_v1(i))
			
 
				-				opts->subsys_mask |= (1 << i);
			
 
				-
			
 
				-	/*
			
 
				-	 * We either have to specify by name or by subsystems. (So all
			
 
				-	 * empty hierarchies must have a name).
			
 
				-	 */
			
 
				-	if (!opts->subsys_mask && !opts->name)
			
 
				-		return -EINVAL;
			
 
				-
			
 
				-	/*
			
 
				-	 * Option noprefix was introduced just for backward compatibility
			
 
				-	 * with the old cpuset, so we allow noprefix only if mounting just
			
 
				-	 * the cpuset subsystem.
			
 
				-	 */
			
 
				-	if ((opts->flags & CGRP_ROOT_NOPREFIX) && (opts->subsys_mask & mask))
			
 
				-		return -EINVAL;
			
 
				-
			
 
				-	/* Can't specify "none" and some subsystems */
			
 
				-	if (opts->subsys_mask && opts->none)
			
 
				-		return -EINVAL;
			
 
				-
			
 
				-	return 0;
			
 
				-}
			
 
				-
			
 
				 static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data)
			
 
				 {
			
 
				-	int ret = 0;
			
 
				-	struct cgroup_root *root = cgroup_root_from_kf(kf_root);
			
 
				-	struct cgroup_sb_opts opts;
			
 
				-	u16 added_mask, removed_mask;
			
 
				-
			
 
				-	if (root == &cgrp_dfl_root) {
			
 
				-		pr_err("remount is not allowed\n");
			
 
				-		return -EINVAL;
			
 
				-	}
			
 
				-
			
 
				-	cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp);
			
 
				-
			
 
				-	/* See what subsystems are wanted */
			
 
				-	ret = parse_cgroupfs_options(data, &opts);
			
 
				-	if (ret)
			
 
				-		goto out_unlock;
			
 
				-
			
 
				-	if (opts.subsys_mask != root->subsys_mask || opts.release_agent)
			
 
				-		pr_warn("option changes via remount are deprecated (pid=%d comm=%s)\n",
			
 
				-			task_tgid_nr(current), current->comm);
			
 
				-
			
 
				-	added_mask = opts.subsys_mask & ~root->subsys_mask;
			
 
				-	removed_mask = root->subsys_mask & ~opts.subsys_mask;
			
 
				-
			
 
				-	/* Don't allow flags or name to change at remount */
			
 
				-	if ((opts.flags ^ root->flags) ||
			
 
				-	    (opts.name && strcmp(opts.name, root->name))) {
			
 
				-		pr_err("option or name mismatch, new: 0x%x \"%s\", old: 0x%x \"%s\"\n",
			
 
				-		       opts.flags, opts.name ?: "", root->flags, root->name);
			
 
				-		ret = -EINVAL;
			
 
				-		goto out_unlock;
			
 
				-	}
			
 
				-
			
 
				-	/* remounting is not allowed for populated hierarchies */
			
 
				-	if (!list_empty(&root->cgrp.self.children)) {
			
 
				-		ret = -EBUSY;
			
 
				-		goto out_unlock;
			
 
				-	}
			
 
				-
			
 
				-	ret = rebind_subsystems(root, added_mask);
			
 
				-	if (ret)
			
 
				-		goto out_unlock;
			
 
				-
			
 
				-	WARN_ON(rebind_subsystems(&cgrp_dfl_root, removed_mask));
			
 
				-
			
 
				-	if (opts.release_agent) {
			
 
				-		spin_lock(&release_agent_path_lock);
			
 
				-		strcpy(root->release_agent_path, opts.release_agent);
			
 
				-		spin_unlock(&release_agent_path_lock);
			
 
				-	}
			
 
				-
			
 
				-	trace_cgroup_remount(root);
			
 
				-
			
 
				- out_unlock:
			
 
				-	kfree(opts.release_agent);
			
 
				-	kfree(opts.name);
			
 
				-	mutex_unlock(&cgroup_mutex);
			
 
				-	return ret;
			
 
				+	pr_err("remount is not allowed\n");
			
 
				+	return -EINVAL;
			
 
				 }
			
 
				 
			
 
				 /*
			
@@ -1964,11 +1617,10 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
 
				 		INIT_LIST_HEAD(&cgrp->e_csets[ssid]);
			
 
				 
			
 
				 	init_waitqueue_head(&cgrp->offline_waitq);
			
 
				-	INIT_WORK(&cgrp->release_agent_work, cgroup_release_agent);
			
 
				+	INIT_WORK(&cgrp->release_agent_work, cgroup1_release_agent);
			
 
				 }
			
 
				 
			
 
				-static void init_cgroup_root(struct cgroup_root *root,
			
 
				-			     struct cgroup_sb_opts *opts)
			
 
				+void init_cgroup_root(struct cgroup_root *root, struct cgroup_sb_opts *opts)
			
 
				 {
			
 
				 	struct cgroup *cgrp = &root->cgrp;
			
 
				 
			
@@ -1987,10 +1639,11 @@ static void init_cgroup_root(struct cgroup_root *root,
 
				 		set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags);
			
 
				 }
			
 
				 
			
 
				-static int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask)
			
 
				+int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask)
			
 
				 {
			
 
				 	LIST_HEAD(tmp_links);
			
 
				 	struct cgroup *root_cgrp = &root->cgrp;
			
 
				+	struct kernfs_syscall_ops *kf_sops;
			
 
				 	struct css_set *cset;
			
 
				 	int i, ret;
			
 
				 
			
@@ -2022,7 +1675,10 @@ static int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask)
 
				 	if (ret)
			
 
				 		goto cancel_ref;
			
 
				 
			
 
				-	root->kf_root = kernfs_create_root(&cgroup_kf_syscall_ops,
			
 
				+	kf_sops = root == &cgrp_dfl_root ?
			
 
				+		&cgroup_kf_syscall_ops : &cgroup1_kf_syscall_ops;
			
 
				+
			
 
				+	root->kf_root = kernfs_create_root(kf_sops,
			
 
				 					   KERNFS_ROOT_CREATE_DEACTIVATED,
			
 
				 					   root_cgrp);
			
 
				 	if (IS_ERR(root->kf_root)) {
			
@@ -2080,20 +1736,48 @@ out:
 
				 	return ret;
			
 
				 }
			
 
				 
			
 
				+struct dentry *cgroup_do_mount(struct file_system_type *fs_type, int flags,
			
 
				+			       struct cgroup_root *root, unsigned long magic,
			
 
				+			       struct cgroup_namespace *ns)
			
 
				+{
			
 
				+	struct dentry *dentry;
			
 
				+	bool new_sb;
			
 
				+
			
 
				+	dentry = kernfs_mount(fs_type, flags, root->kf_root, magic, &new_sb);
			
 
				+
			
 
				+	/*
			
 
				+	 * In non-init cgroup namespace, instead of root cgroup's dentry,
			
 
				+	 * we return the dentry corresponding to the cgroupns->root_cgrp.
			
 
				+	 */
			
 
				+	if (!IS_ERR(dentry) && ns != &init_cgroup_ns) {
			
 
				+		struct dentry *nsdentry;
			
 
				+		struct cgroup *cgrp;
			
 
				+
			
 
				+		mutex_lock(&cgroup_mutex);
			
 
				+		spin_lock_irq(&css_set_lock);
			
 
				+
			
 
				+		cgrp = cset_cgroup_from_root(ns->root_cset, root);
			
 
				+
			
 
				+		spin_unlock_irq(&css_set_lock);
			
 
				+		mutex_unlock(&cgroup_mutex);
			
 
				+
			
 
				+		nsdentry = kernfs_node_dentry(cgrp->kn, dentry->d_sb);
			
 
				+		dput(dentry);
			
 
				+		dentry = nsdentry;
			
 
				+	}
			
 
				+
			
 
				+	if (IS_ERR(dentry) || !new_sb)
			
 
				+		cgroup_put(&root->cgrp);
			
 
				+
			
 
				+	return dentry;
			
 
				+}
			
 
				+
			
 
				 static struct dentry *cgroup_mount(struct file_system_type *fs_type,
			
 
				 			 int flags, const char *unused_dev_name,
			
 
				 			 void *data)
			
 
				 {
			
 
				-	bool is_v2 = fs_type == &cgroup2_fs_type;
			
 
				-	struct super_block *pinned_sb = NULL;
			
 
				 	struct cgroup_namespace *ns = current->nsproxy->cgroup_ns;
			
 
				-	struct cgroup_subsys *ss;
			
 
				-	struct cgroup_root *root;
			
 
				-	struct cgroup_sb_opts opts;
			
 
				 	struct dentry *dentry;
			
 
				-	int ret;
			
 
				-	int i;
			
 
				-	bool new_sb;
			
 
				 
			
 
				 	get_cgroup_ns(ns);
			
 
				 
			
@@ -2110,225 +1794,65 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
 
				 	if (!use_task_css_set_links)
			
 
				 		cgroup_enable_task_cg_lists();
			
 
				 
			
 
				-	if (is_v2) {
			
 
				+	if (fs_type == &cgroup2_fs_type) {
			
 
				 		if (data) {
			
 
				 			pr_err("cgroup2: unknown option \"%s\"\n", (char *)data);
			
 
				 			put_cgroup_ns(ns);
			
 
				 			return ERR_PTR(-EINVAL);
			
 
				 		}
			
 
				 		cgrp_dfl_visible = true;
			
 
				-		root = &cgrp_dfl_root;
			
 
				-		cgroup_get(&root->cgrp);
			
 
				-		goto out_mount;
			
 
				+		cgroup_get(&cgrp_dfl_root.cgrp);
			
 
				+
			
 
				+		dentry = cgroup_do_mount(&cgroup2_fs_type, flags, &cgrp_dfl_root,
			
 
				+					 CGROUP2_SUPER_MAGIC, ns);
			
 
				+	} else {
			
 
				+		dentry = cgroup1_mount(&cgroup_fs_type, flags, data,
			
 
				+				       CGROUP_SUPER_MAGIC, ns);
			
 
				 	}
			
 
				 
			
 
				-	cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp);
			
 
				+	put_cgroup_ns(ns);
			
 
				+	return dentry;
			
 
				+}
			
 
				 
			
 
				-	/* First find the desired set of subsystems */
			
 
				-	ret = parse_cgroupfs_options(data, &opts);
			
 
				-	if (ret)
			
 
				-		goto out_unlock;
			
 
				+static void cgroup_kill_sb(struct super_block *sb)
			
 
				+{
			
 
				+	struct kernfs_root *kf_root = kernfs_root_from_sb(sb);
			
 
				+	struct cgroup_root *root = cgroup_root_from_kf(kf_root);
			
 
				 
			
 
				 	/*
			
 
				-	 * Destruction of cgroup root is asynchronous, so subsystems may
			
 
				-	 * still be dying after the previous unmount.  Let's drain the
			
 
				-	 * dying subsystems.  We just need to ensure that the ones
			
 
				-	 * unmounted previously finish dying and don't care about new ones
			
 
				-	 * starting.  Testing ref liveliness is good enough.
			
 
				+	 * If @root doesn't have any mounts or children, start killing it.
			
 
				+	 * This prevents new mounts by disabling percpu_ref_tryget_live().
			
 
				+	 * cgroup_mount() may wait for @root's release.
			
 
				+	 *
			
 
				+	 * And don't kill the default root.
			
 
				 	 */
			
 
				-	for_each_subsys(ss, i) {
			
 
				-		if (!(opts.subsys_mask & (1 << i)) ||
			
 
				-		    ss->root == &cgrp_dfl_root)
			
 
				-			continue;
			
 
				+	if (!list_empty(&root->cgrp.self.children) ||
			
 
				+	    root == &cgrp_dfl_root)
			
 
				+		cgroup_put(&root->cgrp);
			
 
				+	else
			
 
				+		percpu_ref_kill(&root->cgrp.self.refcnt);
			
 
				 
			
 
				-		if (!percpu_ref_tryget_live(&ss->root->cgrp.self.refcnt)) {
			
 
				-			mutex_unlock(&cgroup_mutex);
			
 
				-			msleep(10);
			
 
				-			ret = restart_syscall();
			
 
				-			goto out_free;
			
 
				-		}
			
 
				-		cgroup_put(&ss->root->cgrp);
			
 
				-	}
			
 
				+	kernfs_kill_sb(sb);
			
 
				+}
			
 
				 
			
 
				-	for_each_root(root) {
			
 
				-		bool name_match = false;
			
 
				+struct file_system_type cgroup_fs_type = {
			
 
				+	.name = "cgroup",
			
 
				+	.mount = cgroup_mount,
			
 
				+	.kill_sb = cgroup_kill_sb,
			
 
				+	.fs_flags = FS_USERNS_MOUNT,
			
 
				+};
			
 
				 
			
 
				-		if (root == &cgrp_dfl_root)
			
 
				-			continue;
			
 
				+static struct file_system_type cgroup2_fs_type = {
			
 
				+	.name = "cgroup2",
			
 
				+	.mount = cgroup_mount,
			
 
				+	.kill_sb = cgroup_kill_sb,
			
 
				+	.fs_flags = FS_USERNS_MOUNT,
			
 
				+};
			
 
				 
			
 
				-		/*
			
 
				-		 * If we asked for a name then it must match.  Also, if
			
 
				-		 * name matches but sybsys_mask doesn't, we should fail.
			
 
				-		 * Remember whether name matched.
			
 
				-		 */
			
 
				-		if (opts.name) {
			
 
				-			if (strcmp(opts.name, root->name))
			
 
				-				continue;
			
 
				-			name_match = true;
			
 
				-		}
			
 
				-
			
 
				-		/*
			
 
				-		 * If we asked for subsystems (or explicitly for no
			
 
				-		 * subsystems) then they must match.
			
 
				-		 */
			
 
				-		if ((opts.subsys_mask || opts.none) &&
			
 
				-		    (opts.subsys_mask != root->subsys_mask)) {
			
 
				-			if (!name_match)
			
 
				-				continue;
			
 
				-			ret = -EBUSY;
			
 
				-			goto out_unlock;
			
 
				-		}
			
 
				-
			
 
				-		if (root->flags ^ opts.flags)
			
 
				-			pr_warn("new mount options do not match the existing superblock, will be ignored\n");
			
 
				-
			
 
				-		/*
			
 
				-		 * We want to reuse @root whose lifetime is governed by its
			
 
				-		 * ->cgrp.  Let's check whether @root is alive and keep it
			
 
				-		 * that way.  As cgroup_kill_sb() can happen anytime, we
			
 
				-		 * want to block it by pinning the sb so that @root doesn't
			
 
				-		 * get killed before mount is complete.
			
 
				-		 *
			
 
				-		 * With the sb pinned, tryget_live can reliably indicate
			
 
				-		 * whether @root can be reused.  If it's being killed,
			
 
				-		 * drain it.  We can use wait_queue for the wait but this
			
 
				-		 * path is super cold.  Let's just sleep a bit and retry.
			
 
				-		 */
			
 
				-		pinned_sb = kernfs_pin_sb(root->kf_root, NULL);
			
 
				-		if (IS_ERR(pinned_sb) ||
			
 
				-		    !percpu_ref_tryget_live(&root->cgrp.self.refcnt)) {
			
 
				-			mutex_unlock(&cgroup_mutex);
			
 
				-			if (!IS_ERR_OR_NULL(pinned_sb))
			
 
				-				deactivate_super(pinned_sb);
			
 
				-			msleep(10);
			
 
				-			ret = restart_syscall();
			
 
				-			goto out_free;
			
 
				-		}
			
 
				-
			
 
				-		ret = 0;
			
 
				-		goto out_unlock;
			
 
				-	}
			
 
				-
			
 
				-	/*
			
 
				-	 * No such thing, create a new one.  name= matching without subsys
			
 
				-	 * specification is allowed for already existing hierarchies but we
			
 
				-	 * can't create new one without subsys specification.
			
 
				-	 */
			
 
				-	if (!opts.subsys_mask && !opts.none) {
			
 
				-		ret = -EINVAL;
			
 
				-		goto out_unlock;
			
 
				-	}
			
 
				-
			
 
				-	/* Hierarchies may only be created in the initial cgroup namespace. */
			
 
				-	if (ns != &init_cgroup_ns) {
			
 
				-		ret = -EPERM;
			
 
				-		goto out_unlock;
			
 
				-	}
			
 
				-
			
 
				-	root = kzalloc(sizeof(*root), GFP_KERNEL);
			
 
				-	if (!root) {
			
 
				-		ret = -ENOMEM;
			
 
				-		goto out_unlock;
			
 
				-	}
			
 
				-
			
 
				-	init_cgroup_root(root, &opts);
			
 
				-
			
 
				-	ret = cgroup_setup_root(root, opts.subsys_mask);
			
 
				-	if (ret)
			
 
				-		cgroup_free_root(root);
			
 
				-
			
 
				-out_unlock:
			
 
				-	mutex_unlock(&cgroup_mutex);
			
 
				-out_free:
			
 
				-	kfree(opts.release_agent);
			
 
				-	kfree(opts.name);
			
 
				-
			
 
				-	if (ret) {
			
 
				-		put_cgroup_ns(ns);
			
 
				-		return ERR_PTR(ret);
			
 
				-	}
			
 
				-out_mount:
			
 
				-	dentry = kernfs_mount(fs_type, flags, root->kf_root,
			
 
				-			      is_v2 ? CGROUP2_SUPER_MAGIC : CGROUP_SUPER_MAGIC,
			
 
				-			      &new_sb);
			
 
				-
			
 
				-	/*
			
 
				-	 * In non-init cgroup namespace, instead of root cgroup's
			
 
				-	 * dentry, we return the dentry corresponding to the
			
 
				-	 * cgroupns->root_cgrp.
			
 
				-	 */
			
 
				-	if (!IS_ERR(dentry) && ns != &init_cgroup_ns) {
			
 
				-		struct dentry *nsdentry;
			
 
				-		struct cgroup *cgrp;
			
 
				-
			
 
				-		mutex_lock(&cgroup_mutex);
			
 
				-		spin_lock_irq(&css_set_lock);
			
 
				-
			
 
				-		cgrp = cset_cgroup_from_root(ns->root_cset, root);
			
 
				-
			
 
				-		spin_unlock_irq(&css_set_lock);
			
 
				-		mutex_unlock(&cgroup_mutex);
			
 
				-
			
 
				-		nsdentry = kernfs_node_dentry(cgrp->kn, dentry->d_sb);
			
 
				-		dput(dentry);
			
 
				-		dentry = nsdentry;
			
 
				-	}
			
 
				-
			
 
				-	if (IS_ERR(dentry) || !new_sb)
			
 
				-		cgroup_put(&root->cgrp);
			
 
				-
			
 
				-	/*
			
 
				-	 * If @pinned_sb, we're reusing an existing root and holding an
			
 
				-	 * extra ref on its sb.  Mount is complete.  Put the extra ref.
			
 
				-	 */
			
 
				-	if (pinned_sb) {
			
 
				-		WARN_ON(new_sb);
			
 
				-		deactivate_super(pinned_sb);
			
 
				-	}
			
 
				-
			
 
				-	put_cgroup_ns(ns);
			
 
				-	return dentry;
			
 
				-}
			
 
				-
			
 
				-static void cgroup_kill_sb(struct super_block *sb)
			
 
				-{
			
 
				-	struct kernfs_root *kf_root = kernfs_root_from_sb(sb);
			
 
				-	struct cgroup_root *root = cgroup_root_from_kf(kf_root);
			
 
				-
			
 
				-	/*
			
 
				-	 * If @root doesn't have any mounts or children, start killing it.
			
 
				-	 * This prevents new mounts by disabling percpu_ref_tryget_live().
			
 
				-	 * cgroup_mount() may wait for @root's release.
			
 
				-	 *
			
 
				-	 * And don't kill the default root.
			
 
				-	 */
			
 
				-	if (!list_empty(&root->cgrp.self.children) ||
			
 
				-	    root == &cgrp_dfl_root)
			
 
				-		cgroup_put(&root->cgrp);
			
 
				-	else
			
 
				-		percpu_ref_kill(&root->cgrp.self.refcnt);
			
 
				-
			
 
				-	kernfs_kill_sb(sb);
			
 
				-}
			
 
				-
			
 
				-static struct file_system_type cgroup_fs_type = {
			
 
				-	.name = "cgroup",
			
 
				-	.mount = cgroup_mount,
			
 
				-	.kill_sb = cgroup_kill_sb,
			
 
				-	.fs_flags = FS_USERNS_MOUNT,
			
 
				-};
			
 
				-
			
 
				-static struct file_system_type cgroup2_fs_type = {
			
 
				-	.name = "cgroup2",
			
 
				-	.mount = cgroup_mount,
			
 
				-	.kill_sb = cgroup_kill_sb,
			
 
				-	.fs_flags = FS_USERNS_MOUNT,
			
 
				-};
			
 
				-
			
 
				-static int cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen,
			
 
				-				 struct cgroup_namespace *ns)
			
 
				-{
			
 
				-	struct cgroup *root = cset_cgroup_from_root(ns->root_cset, cgrp->root);
			
 
				+int cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen,
			
 
				+			  struct cgroup_namespace *ns)
			
 
				+{
			
 
				+	struct cgroup *root = cset_cgroup_from_root(ns->root_cset, cgrp->root);
			
 
				 
			
 
				 	return kernfs_path_from_node(cgrp->kn, root->kn, buf, buflen);
			
 
				 }
			
@@ -2389,49 +1913,18 @@ int task_cgroup_path(struct task_struct *task, char *buf, size_t buflen)
 
				 }
			
 
				 EXPORT_SYMBOL_GPL(task_cgroup_path);
			
 
				 
			
 
				-/* used to track tasks and other necessary states during migration */
			
 
				-struct cgroup_taskset {
			
 
				-	/* the src and dst cset list running through cset->mg_node */
			
 
				-	struct list_head	src_csets;
			
 
				-	struct list_head	dst_csets;
			
 
				-
			
 
				-	/* the subsys currently being processed */
			
 
				-	int			ssid;
			
 
				-
			
 
				-	/*
			
 
				-	 * Fields for cgroup_taskset_*() iteration.
			
 
				-	 *
			
 
				-	 * Before migration is committed, the target migration tasks are on
			
 
				-	 * ->mg_tasks of the csets on ->src_csets.  After, on ->mg_tasks of
			
 
				-	 * the csets on ->dst_csets.  ->csets point to either ->src_csets
			
 
				-	 * or ->dst_csets depending on whether migration is committed.
			
 
				-	 *
			
 
				-	 * ->cur_csets and ->cur_task point to the current task position
			
 
				-	 * during iteration.
			
 
				-	 */
			
 
				-	struct list_head	*csets;
			
 
				-	struct css_set		*cur_cset;
			
 
				-	struct task_struct	*cur_task;
			
 
				-};
			
 
				-
			
 
				-#define CGROUP_TASKSET_INIT(tset)	(struct cgroup_taskset){	\
			
 
				-	.src_csets		= LIST_HEAD_INIT(tset.src_csets),	\
			
 
				-	.dst_csets		= LIST_HEAD_INIT(tset.dst_csets),	\
			
 
				-	.csets			= &tset.src_csets,			\
			
 
				-}
			
 
				-
			
 
				 /**
			
 
				- * cgroup_taskset_add - try to add a migration target task to a taskset
			
 
				+ * cgroup_migrate_add_task - add a migration target task to a migration context
			
 
				  * @task: target task
			
 
				- * @tset: target taskset
			
 
				+ * @mgctx: target migration context
			
 
				  *
			
 
				- * Add @task, which is a migration target, to @tset.  This function becomes
			
 
				- * noop if @task doesn't need to be migrated.  @task's css_set should have
			
 
				- * been added as a migration source and @task->cg_list will be moved from
			
 
				- * the css_set's tasks list to mg_tasks one.
			
 
				+ * Add @task, which is a migration target, to @mgctx->tset.  This function
			
 
				+ * becomes noop if @task doesn't need to be migrated.  @task's css_set
			
 
				+ * should have been added as a migration source and @task->cg_list will be
			
 
				+ * moved from the css_set's tasks list to mg_tasks one.
			
 
				  */
			
 
				-static void cgroup_taskset_add(struct task_struct *task,
			
 
				-			       struct cgroup_taskset *tset)
			
 
				+static void cgroup_migrate_add_task(struct task_struct *task,
			
 
				+				    struct cgroup_mgctx *mgctx)
			
 
				 {
			
 
				 	struct css_set *cset;
			
 
				 
			
@@ -2451,10 +1944,11 @@ static void cgroup_taskset_add(struct task_struct *task,
 
				 
			
 
				 	list_move_tail(&task->cg_list, &cset->mg_tasks);
			
 
				 	if (list_empty(&cset->mg_node))
			
 
				-		list_add_tail(&cset->mg_node, &tset->src_csets);
			
 
				+		list_add_tail(&cset->mg_node,
			
 
				+			      &mgctx->tset.src_csets);
			
 
				 	if (list_empty(&cset->mg_dst_cset->mg_node))
			
 
				-		list_move_tail(&cset->mg_dst_cset->mg_node,
			
 
				-			       &tset->dst_csets);
			
 
				+		list_add_tail(&cset->mg_dst_cset->mg_node,
			
 
				+			      &mgctx->tset.dst_csets);
			
 
				 }
			
 
				 
			
 
				 /**
			
@@ -2521,17 +2015,16 @@ struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset,
 
				 
			
 
				 /**
			
 
				  * cgroup_migrate_execute - migrate a taskset
			
 
				- * @tset: taget taskset
			
 
				- * @root: cgroup root the migration is taking place on
			
 
				+ * @mgctx: migration context
			
 
				  *
			
 
				- * Migrate tasks in @tset as setup by migration preparation functions.
			
 
				+ * Migrate tasks in @mgctx as setup by migration preparation functions.
			
 
				  * This function fails iff one of the ->can_attach callbacks fails and
			
 
				- * guarantees that either all or none of the tasks in @tset are migrated.
			
 
				- * @tset is consumed regardless of success.
			
 
				+ * guarantees that either all or none of the tasks in @mgctx are migrated.
			
 
				+ * @mgctx is consumed regardless of success.
			
 
				  */
			
 
				-static int cgroup_taskset_migrate(struct cgroup_taskset *tset,
			
 
				-				  struct cgroup_root *root)
			
 
				+static int cgroup_migrate_execute(struct cgroup_mgctx *mgctx)
			
 
				 {
			
 
				+	struct cgroup_taskset *tset = &mgctx->tset;
			
 
				 	struct cgroup_subsys *ss;
			
 
				 	struct task_struct *task, *tmp_task;
			
 
				 	struct css_set *cset, *tmp_cset;
			
@@ -2542,7 +2035,7 @@ static int cgroup_taskset_migrate(struct cgroup_taskset *tset,
 
				 		return 0;
			
 
				 
			
 
				 	/* check that we can legitimately attach to the cgroup */
			
 
				-	do_each_subsys_mask(ss, ssid, root->subsys_mask) {
			
 
				+	do_each_subsys_mask(ss, ssid, mgctx->ss_mask) {
			
 
				 		if (ss->can_attach) {
			
 
				 			tset->ssid = ssid;
			
 
				 			ret = ss->can_attach(tset);
			
@@ -2578,7 +2071,7 @@ static int cgroup_taskset_migrate(struct cgroup_taskset *tset,
 
				 	 */
			
 
				 	tset->csets = &tset->dst_csets;
			
 
				 
			
 
				-	do_each_subsys_mask(ss, ssid, root->subsys_mask) {
			
 
				+	do_each_subsys_mask(ss, ssid, mgctx->ss_mask) {
			
 
				 		if (ss->attach) {
			
 
				 			tset->ssid = ssid;
			
 
				 			ss->attach(tset);
			
@@ -2589,7 +2082,7 @@ static int cgroup_taskset_migrate(struct cgroup_taskset *tset,
 
				 	goto out_release_tset;
			
 
				 
			
 
				 out_cancel_attach:
			
 
				-	do_each_subsys_mask(ss, ssid, root->subsys_mask) {
			
 
				+	do_each_subsys_mask(ss, ssid, mgctx->ss_mask) {
			
 
				 		if (ssid == failed_ssid)
			
 
				 			break;
			
 
				 		if (ss->cancel_attach) {
			
@@ -2616,7 +2109,7 @@ out_release_tset:
 
				  * zero for migration destination cgroups with tasks so that child cgroups
			
 
				  * don't compete against tasks.
			
 
				  */
			
 
				-static bool cgroup_may_migrate_to(struct cgroup *dst_cgrp)
			
 
				+bool cgroup_may_migrate_to(struct cgroup *dst_cgrp)
			
 
				 {
			
 
				 	return !cgroup_on_dfl(dst_cgrp) || !cgroup_parent(dst_cgrp) ||
			
 
				 		!dst_cgrp->subtree_control;
			
@@ -2624,25 +2117,31 @@ static bool cgroup_may_migrate_to(struct cgroup *dst_cgrp)
 
				 
			
 
				 /**
			
 
				  * cgroup_migrate_finish - cleanup after attach
			
 
				- * @preloaded_csets: list of preloaded css_sets
			
 
				+ * @mgctx: migration context
			
 
				  *
			
 
				  * Undo cgroup_migrate_add_src() and cgroup_migrate_prepare_dst().  See
			
 
				  * those functions for details.
			
 
				  */
			
 
				-static void cgroup_migrate_finish(struct list_head *preloaded_csets)
			
 
				+void cgroup_migrate_finish(struct cgroup_mgctx *mgctx)
			
 
				 {
			
 
				+	LIST_HEAD(preloaded);
			
 
				 	struct css_set *cset, *tmp_cset;
			
 
				 
			
 
				 	lockdep_assert_held(&cgroup_mutex);
			
 
				 
			
 
				 	spin_lock_irq(&css_set_lock);
			
 
				-	list_for_each_entry_safe(cset, tmp_cset, preloaded_csets, mg_preload_node) {
			
 
				+
			
 
				+	list_splice_tail_init(&mgctx->preloaded_src_csets, &preloaded);
			
 
				+	list_splice_tail_init(&mgctx->preloaded_dst_csets, &preloaded);
			
 
				+
			
 
				+	list_for_each_entry_safe(cset, tmp_cset, &preloaded, mg_preload_node) {
			
 
				 		cset->mg_src_cgrp = NULL;
			
 
				 		cset->mg_dst_cgrp = NULL;
			
 
				 		cset->mg_dst_cset = NULL;
			
 
				 		list_del_init(&cset->mg_preload_node);
			
 
				 		put_css_set_locked(cset);
			
 
				 	}
			
 
				+
			
 
				 	spin_unlock_irq(&css_set_lock);
			
 
				 }
			
 
				 
			
@@ -2650,10 +2149,10 @@ static void cgroup_migrate_finish(struct list_head *preloaded_csets)
 
				  * cgroup_migrate_add_src - add a migration source css_set
			
 
				  * @src_cset: the source css_set to add
			
 
				  * @dst_cgrp: the destination cgroup
			
 
				- * @preloaded_csets: list of preloaded css_sets
			
 
				+ * @mgctx: migration context
			
 
				  *
			
 
				  * Tasks belonging to @src_cset are about to be migrated to @dst_cgrp.  Pin
			
 
				- * @src_cset and add it to @preloaded_csets, which should later be cleaned
			
 
				+ * @src_cset and add it to @mgctx->preloaded_src_csets, which should later be cleaned
			
 
				  * up by cgroup_migrate_finish().
			
 
				  *
			
 
				  * This function may be called without holding cgroup_threadgroup_rwsem
			
@@ -2662,9 +2161,9 @@ static void cgroup_migrate_finish(struct list_head *preloaded_csets)
 
				  * into play and the preloaded css_sets are guaranteed to cover all
			
 
				  * migrations.
			
 
				  */
			
 
				-static void cgroup_migrate_add_src(struct css_set *src_cset,
			
 
				-				   struct cgroup *dst_cgrp,
			
 
				-				   struct list_head *preloaded_csets)
			
 
				+void cgroup_migrate_add_src(struct css_set *src_cset,
			
 
				+			    struct cgroup *dst_cgrp,
			
 
				+			    struct cgroup_mgctx *mgctx)
			
 
				 {
			
 
				 	struct cgroup *src_cgrp;
			
 
				 
			
@@ -2692,33 +2191,35 @@ static void cgroup_migrate_add_src(struct css_set *src_cset,
 
				 	src_cset->mg_src_cgrp = src_cgrp;
			
 
				 	src_cset->mg_dst_cgrp = dst_cgrp;
			
 
				 	get_css_set(src_cset);
			
 
				-	list_add(&src_cset->mg_preload_node, preloaded_csets);
			
 
				+	list_add_tail(&src_cset->mg_preload_node, &mgctx->preloaded_src_csets);
			
 
				 }
			
 
				 
			
 
				 /**
			
 
				  * cgroup_migrate_prepare_dst - prepare destination css_sets for migration
			
 
				- * @preloaded_csets: list of preloaded source css_sets
			
 
				+ * @mgctx: migration context
			
 
				  *
			
 
				  * Tasks are about to be moved and all the source css_sets have been
			
 
				- * preloaded to @preloaded_csets.  This function looks up and pins all
			
 
				- * destination css_sets, links each to its source, and append them to
			
 
				- * @preloaded_csets.
			
 
				+ * preloaded to @mgctx->preloaded_src_csets.  This function looks up and
			
 
				+ * pins all destination css_sets, links each to its source, and appends them
			
 
				+ * to @mgctx->preloaded_dst_csets.
			
 
				  *
			
 
				  * This function must be called after cgroup_migrate_add_src() has been
			
 
				  * called on each migration source css_set.  After migration is performed
			
 
				  * using cgroup_migrate(), cgroup_migrate_finish() must be called on
			
 
				- * @preloaded_csets.
			
 
				+ * @mgctx.
			
 
				  */
			
 
				-static int cgroup_migrate_prepare_dst(struct list_head *preloaded_csets)
			
 
				+int cgroup_migrate_prepare_dst(struct cgroup_mgctx *mgctx)
			
 
				 {
			
 
				-	LIST_HEAD(csets);
			
 
				 	struct css_set *src_cset, *tmp_cset;
			
 
				 
			
 
				 	lockdep_assert_held(&cgroup_mutex);
			
 
				 
			
 
				 	/* look up the dst cset for each src cset and link it to src */
			
 
				-	list_for_each_entry_safe(src_cset, tmp_cset, preloaded_csets, mg_preload_node) {
			
 
				+	list_for_each_entry_safe(src_cset, tmp_cset, &mgctx->preloaded_src_csets,
			
 
				+				 mg_preload_node) {
			
 
				 		struct css_set *dst_cset;
			
 
				+		struct cgroup_subsys *ss;
			
 
				+		int ssid;
			
 
				 
			
 
				 		dst_cset = find_css_set(src_cset, src_cset->mg_dst_cgrp);
			
 
				 		if (!dst_cset)
			
@@ -2743,15 +2244,19 @@ static int cgroup_migrate_prepare_dst(struct list_head *preloaded_csets)
 
				 		src_cset->mg_dst_cset = dst_cset;
			
 
				 
			
 
				 		if (list_empty(&dst_cset->mg_preload_node))
			
 
				-			list_add(&dst_cset->mg_preload_node, &csets);
			
 
				+			list_add_tail(&dst_cset->mg_preload_node,
			
 
				+				      &mgctx->preloaded_dst_csets);
			
 
				 		else
			
 
				 			put_css_set(dst_cset);
			
 
				+
			
 
				+		for_each_subsys(ss, ssid)
			
 
				+			if (src_cset->subsys[ssid] != dst_cset->subsys[ssid])
			
 
				+				mgctx->ss_mask |= 1 << ssid;
			
 
				 	}
			
 
				 
			
 
				-	list_splice_tail(&csets, preloaded_csets);
			
 
				 	return 0;
			
 
				 err:
			
 
				-	cgroup_migrate_finish(&csets);
			
 
				+	cgroup_migrate_finish(mgctx);
			
 
				 	return -ENOMEM;
			
 
				 }
			
 
				 
			
@@ -2759,7 +2264,7 @@ err:
 
				  * cgroup_migrate - migrate a process or task to a cgroup
			
 
				  * @leader: the leader of the process or the task to migrate
			
 
				  * @threadgroup: whether @leader points to the whole process or a single task
			
 
				- * @root: cgroup root migration is taking place on
			
 
				+ * @mgctx: migration context
			
 
				  *
			
 
				  * Migrate a process or task denoted by @leader.  If migrating a process,
			
 
				  * the caller must be holding cgroup_threadgroup_rwsem.  The caller is also
			
@@ -2773,10 +2278,9 @@ err:
 
				  * decided for all targets by invoking cgroup_migrate_prepare_dst() before
			
 
				  * actually starting migrating.
			
 
				  */
			
 
				-static int cgroup_migrate(struct task_struct *leader, bool threadgroup,
			
 
				-			  struct cgroup_root *root)
			
 
				+int cgroup_migrate(struct task_struct *leader, bool threadgroup,
			
 
				+		   struct cgroup_mgctx *mgctx)
			
 
				 {
			
 
				-	struct cgroup_taskset tset = CGROUP_TASKSET_INIT(tset);
			
 
				 	struct task_struct *task;
			
 
				 
			
 
				 	/*
			
@@ -2788,14 +2292,14 @@ static int cgroup_migrate(struct task_struct *leader, bool threadgroup,
 
				 	rcu_read_lock();
			
 
				 	task = leader;
			
 
				 	do {
			
 
				-		cgroup_taskset_add(task, &tset);
			
 
				+		cgroup_migrate_add_task(task, mgctx);
			
 
				 		if (!threadgroup)
			
 
				 			break;
			
 
				 	} while_each_thread(leader, task);
			
 
				 	rcu_read_unlock();
			
 
				 	spin_unlock_irq(&css_set_lock);
			
 
				 
			
 
				-	return cgroup_taskset_migrate(&tset, root);
			
 
				+	return cgroup_migrate_execute(mgctx);
			
 
				 }
			
 
				 
			
 
				 /**
			
@@ -2806,10 +2310,10 @@ static int cgroup_migrate(struct task_struct *leader, bool threadgroup,
 
				  *
			
 
				  * Call holding cgroup_mutex and cgroup_threadgroup_rwsem.
			
 
				  */
			
 
				-static int cgroup_attach_task(struct cgroup *dst_cgrp,
			
 
				-			      struct task_struct *leader, bool threadgroup)
			
 
				+int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader,
			
 
				+		       bool threadgroup)
			
 
				 {
			
 
				-	LIST_HEAD(preloaded_csets);
			
 
				+	DEFINE_CGROUP_MGCTX(mgctx);
			
 
				 	struct task_struct *task;
			
 
				 	int ret;
			
 
				 
			
@@ -2821,8 +2325,7 @@ static int cgroup_attach_task(struct cgroup *dst_cgrp,
 
				 	rcu_read_lock();
			
 
				 	task = leader;
			
 
				 	do {
			
 
				-		cgroup_migrate_add_src(task_css_set(task), dst_cgrp,
			
 
				-				       &preloaded_csets);
			
 
				+		cgroup_migrate_add_src(task_css_set(task), dst_cgrp, &mgctx);
			
 
				 		if (!threadgroup)
			
 
				 			break;
			
 
				 	} while_each_thread(leader, task);
			
@@ -2830,11 +2333,11 @@ static int cgroup_attach_task(struct cgroup *dst_cgrp,
 
				 	spin_unlock_irq(&css_set_lock);
			
 
				 
			
 
				 	/* prepare dst csets and commit */
			
 
				-	ret = cgroup_migrate_prepare_dst(&preloaded_csets);
			
 
				+	ret = cgroup_migrate_prepare_dst(&mgctx);
			
 
				 	if (!ret)
			
 
				-		ret = cgroup_migrate(leader, threadgroup, dst_cgrp->root);
			
 
				+		ret = cgroup_migrate(leader, threadgroup, &mgctx);
			
 
				 
			
 
				-	cgroup_migrate_finish(&preloaded_csets);
			
 
				+	cgroup_migrate_finish(&mgctx);
			
 
				 
			
 
				 	if (!ret)
			
 
				 		trace_cgroup_attach_task(dst_cgrp, leader, threadgroup);
			
@@ -2846,20 +2349,9 @@ static int cgroup_procs_write_permission(struct task_struct *task,
 
				 					 struct cgroup *dst_cgrp,
			
 
				 					 struct kernfs_open_file *of)
			
 
				 {
			
 
				-	const struct cred *cred = current_cred();
			
 
				-	const struct cred *tcred = get_task_cred(task);
			
 
				 	int ret = 0;
			
 
				 
			
 
				-	/*
			
 
				-	 * even if we're attaching all tasks in the thread group, we only
			
 
				-	 * need to check permissions on one of them.
			
 
				-	 */
			
 
				-	if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) &&
			
 
				-	    !uid_eq(cred->euid, tcred->uid) &&
			
 
				-	    !uid_eq(cred->euid, tcred->suid))
			
 
				-		ret = -EACCES;
			
 
				-
			
 
				-	if (!ret && cgroup_on_dfl(dst_cgrp)) {
			
 
				+	if (cgroup_on_dfl(dst_cgrp)) {
			
 
				 		struct super_block *sb = of->file->f_path.dentry->d_sb;
			
 
				 		struct cgroup *cgrp;
			
 
				 		struct inode *inode;
			
@@ -2877,9 +2369,21 @@ static int cgroup_procs_write_permission(struct task_struct *task,
 
				 			ret = inode_permission(inode, MAY_WRITE);
			
 
				 			iput(inode);
			
 
				 		}
			
 
				+	} else {
			
 
				+		const struct cred *cred = current_cred();
			
 
				+		const struct cred *tcred = get_task_cred(task);
			
 
				+
			
 
				+		/*
			
 
				+		 * even if we're attaching all tasks in the thread group,
			
 
				+		 * we only need to check permissions on one of them.
			
 
				+		 */
			
 
				+		if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) &&
			
 
				+		    !uid_eq(cred->euid, tcred->uid) &&
			
 
				+		    !uid_eq(cred->euid, tcred->suid))
			
 
				+			ret = -EACCES;
			
 
				+		put_cred(tcred);
			
 
				 	}
			
 
				 
			
 
				-	put_cred(tcred);
			
 
				 	return ret;
			
 
				 }
			
 
				 
			
@@ -2888,8 +2392,8 @@ static int cgroup_procs_write_permission(struct task_struct *task,
 
				  * function to attach either it or all tasks in its threadgroup. Will lock
			
 
				  * cgroup_mutex and threadgroup.
			
 
				  */
			
 
				-static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
			
 
				-				    size_t nbytes, loff_t off, bool threadgroup)
			
 
				+ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
			
 
				+			     size_t nbytes, loff_t off, bool threadgroup)
			
 
				 {
			
 
				 	struct task_struct *tsk;
			
 
				 	struct cgroup_subsys *ss;
			
@@ -2950,86 +2454,12 @@ out_unlock_threadgroup:
 
				 	return ret ?: nbytes;
			
 
				 }
			
 
				 
			
 
				-/**
			
 
				- * cgroup_attach_task_all - attach task 'tsk' to all cgroups of task 'from'
			
 
				- * @from: attach to all cgroups of a given task
			
 
				- * @tsk: the task to be attached
			
 
				- */
			
 
				-int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
			
 
				-{
			
 
				-	struct cgroup_root *root;
			
 
				-	int retval = 0;
			
 
				-
			
 
				-	mutex_lock(&cgroup_mutex);
			
 
				-	percpu_down_write(&cgroup_threadgroup_rwsem);
			
 
				-	for_each_root(root) {
			
 
				-		struct cgroup *from_cgrp;
			
 
				-
			
 
				-		if (root == &cgrp_dfl_root)
			
 
				-			continue;
			
 
				-
			
 
				-		spin_lock_irq(&css_set_lock);
			
 
				-		from_cgrp = task_cgroup_from_root(from, root);
			
 
				-		spin_unlock_irq(&css_set_lock);
			
 
				-
			
 
				-		retval = cgroup_attach_task(from_cgrp, tsk, false);
			
 
				-		if (retval)
			
 
				-			break;
			
 
				-	}
			
 
				-	percpu_up_write(&cgroup_threadgroup_rwsem);
			
 
				-	mutex_unlock(&cgroup_mutex);
			
 
				-
			
 
				-	return retval;
			
 
				-}
			
 
				-EXPORT_SYMBOL_GPL(cgroup_attach_task_all);
			
 
				-
			
 
				-static ssize_t cgroup_tasks_write(struct kernfs_open_file *of,
			
 
				-				  char *buf, size_t nbytes, loff_t off)
			
 
				-{
			
 
				-	return __cgroup_procs_write(of, buf, nbytes, off, false);
			
 
				-}
			
 
				-
			
 
				-static ssize_t cgroup_procs_write(struct kernfs_open_file *of,
			
 
				-				  char *buf, size_t nbytes, loff_t off)
			
 
				+ssize_t cgroup_procs_write(struct kernfs_open_file *of, char *buf, size_t nbytes,
			
 
				+			   loff_t off)
			
 
				 {
			
 
				 	return __cgroup_procs_write(of, buf, nbytes, off, true);
			
 
				 }
			
 
				 
			
 
				-static ssize_t cgroup_release_agent_write(struct kernfs_open_file *of,
			
 
				-					  char *buf, size_t nbytes, loff_t off)
			
 
				-{
			
 
				-	struct cgroup *cgrp;
			
 
				-
			
 
				-	BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX);
			
 
				-
			
 
				-	cgrp = cgroup_kn_lock_live(of->kn, false);
			
 
				-	if (!cgrp)
			
 
				-		return -ENODEV;
			
 
				-	spin_lock(&release_agent_path_lock);
			
 
				-	strlcpy(cgrp->root->release_agent_path, strstrip(buf),
			
 
				-		sizeof(cgrp->root->release_agent_path));
			
 
				-	spin_unlock(&release_agent_path_lock);
			
 
				-	cgroup_kn_unlock(of->kn);
			
 
				-	return nbytes;
			
 
				-}
			
 
				-
			
 
				-static int cgroup_release_agent_show(struct seq_file *seq, void *v)
			
 
				-{
			
 
				-	struct cgroup *cgrp = seq_css(seq)->cgroup;
			
 
				-
			
 
				-	spin_lock(&release_agent_path_lock);
			
 
				-	seq_puts(seq, cgrp->root->release_agent_path);
			
 
				-	spin_unlock(&release_agent_path_lock);
			
 
				-	seq_putc(seq, '\n');
			
 
				-	return 0;
			
 
				-}
			
 
				-
			
 
				-static int cgroup_sane_behavior_show(struct seq_file *seq, void *v)
			
 
				-{
			
 
				-	seq_puts(seq, "0\n");
			
 
				-	return 0;
			
 
				-}
			
 
				-
			
 
				 static void cgroup_print_ss_mask(struct seq_file *seq, u16 ss_mask)
			
 
				 {
			
 
				 	struct cgroup_subsys *ss;
			
@@ -3075,8 +2505,7 @@ static int cgroup_subtree_control_show(struct seq_file *seq, void *v)
 
				  */
			
 
				 static int cgroup_update_dfl_csses(struct cgroup *cgrp)
			
 
				 {
			
 
				-	LIST_HEAD(preloaded_csets);
			
 
				-	struct cgroup_taskset tset = CGROUP_TASKSET_INIT(tset);
			
 
				+	DEFINE_CGROUP_MGCTX(mgctx);
			
 
				 	struct cgroup_subsys_state *d_css;
			
 
				 	struct cgroup *dsct;
			
 
				 	struct css_set *src_cset;
			
@@ -3092,33 +2521,28 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp)
 
				 		struct cgrp_cset_link *link;
			
 
				 
			
 
				 		list_for_each_entry(link, &dsct->cset_links, cset_link)
			
 
				-			cgroup_migrate_add_src(link->cset, dsct,
			
 
				-					       &preloaded_csets);
			
 
				+			cgroup_migrate_add_src(link->cset, dsct, &mgctx);
			
 
				 	}
			
 
				 	spin_unlock_irq(&css_set_lock);
			
 
				 
			
 
				 	/* NULL dst indicates self on default hierarchy */
			
 
				-	ret = cgroup_migrate_prepare_dst(&preloaded_csets);
			
 
				+	ret = cgroup_migrate_prepare_dst(&mgctx);
			
 
				 	if (ret)
			
 
				 		goto out_finish;
			
 
				 
			
 
				 	spin_lock_irq(&css_set_lock);
			
 
				-	list_for_each_entry(src_cset, &preloaded_csets, mg_preload_node) {
			
 
				+	list_for_each_entry(src_cset, &mgctx.preloaded_src_csets, mg_preload_node) {
			
 
				 		struct task_struct *task, *ntask;
			
 
				 
			
 
				-		/* src_csets precede dst_csets, break on the first dst_cset */
			
 
				-		if (!src_cset->mg_src_cgrp)
			
 
				-			break;
			
 
				-
			
 
				 		/* all tasks in src_csets need to be migrated */
			
 
				 		list_for_each_entry_safe(task, ntask, &src_cset->tasks, cg_list)
			
 
				-			cgroup_taskset_add(task, &tset);
			
 
				+			cgroup_migrate_add_task(task, &mgctx);
			
 
				 	}
			
 
				 	spin_unlock_irq(&css_set_lock);
			
 
				 
			
 
				-	ret = cgroup_taskset_migrate(&tset, cgrp->root);
			
 
				+	ret = cgroup_migrate_execute(&mgctx);
			
 
				 out_finish:
			
 
				-	cgroup_migrate_finish(&preloaded_csets);
			
 
				+	cgroup_migrate_finish(&mgctx);
			
 
				 	percpu_up_write(&cgroup_threadgroup_rwsem);
			
 
				 	return ret;
			
 
				 }
			
@@ -3131,7 +2555,7 @@ out_finish:
 
				  * controller while the previous css is still around.  This function grabs
			
 
				  * cgroup_mutex and drains the previous css instances of @cgrp's subtree.
			
 
				  */
			
 
				-static void cgroup_lock_and_drain_offline(struct cgroup *cgrp)
			
 
				+void cgroup_lock_and_drain_offline(struct cgroup *cgrp)
			
 
				 	__acquires(&cgroup_mutex)
			
 
				 {
			
 
				 	struct cgroup *dsct;
			
@@ -3503,6 +2927,23 @@ static int cgroup_events_show(struct seq_file *seq, void *v)
 
				 	return 0;
			
 
				 }
			
 
				 
			
 
				+static int cgroup_file_open(struct kernfs_open_file *of)
			
 
				+{
			
 
				+	struct cftype *cft = of->kn->priv;
			
 
				+
			
 
				+	if (cft->open)
			
 
				+		return cft->open(of);
			
 
				+	return 0;
			
 
				+}
			
 
				+
			
 
				+static void cgroup_file_release(struct kernfs_open_file *of)
			
 
				+{
			
 
				+	struct cftype *cft = of->kn->priv;
			
 
				+
			
 
				+	if (cft->release)
			
 
				+		cft->release(of);
			
 
				+}
			
 
				+
			
 
				 static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf,
			
 
				 				 size_t nbytes, loff_t off)
			
 
				 {
			
@@ -3553,7 +2994,8 @@ static void *cgroup_seqfile_next(struct seq_file *seq, void *v, loff_t *ppos)
 
				 
			
 
				 static void cgroup_seqfile_stop(struct seq_file *seq, void *v)
			
 
				 {
			
 
				-	seq_cft(seq)->seq_stop(seq, v);
			
 
				+	if (seq_cft(seq)->seq_stop)
			
 
				+		seq_cft(seq)->seq_stop(seq, v);
			
 
				 }
			
 
				 
			
 
				 static int cgroup_seqfile_show(struct seq_file *m, void *arg)
			
@@ -3575,12 +3017,16 @@ static int cgroup_seqfile_show(struct seq_file *m, void *arg)
 
				 
			
 
				 static struct kernfs_ops cgroup_kf_single_ops = {
			
 
				 	.atomic_write_len	= PAGE_SIZE,
			
 
				+	.open			= cgroup_file_open,
			
 
				+	.release		= cgroup_file_release,
			
 
				 	.write			= cgroup_file_write,
			
 
				 	.seq_show		= cgroup_seqfile_show,
			
 
				 };
			
 
				 
			
 
				 static struct kernfs_ops cgroup_kf_ops = {
			
 
				 	.atomic_write_len	= PAGE_SIZE,
			
 
				+	.open			= cgroup_file_open,
			
 
				+	.release		= cgroup_file_release,
			
 
				 	.write			= cgroup_file_write,
			
 
				 	.seq_start		= cgroup_seqfile_start,
			
 
				 	.seq_next		= cgroup_seqfile_next,
			
@@ -3588,48 +3034,6 @@ static struct kernfs_ops cgroup_kf_ops = {
 
				 	.seq_show		= cgroup_seqfile_show,
			
 
				 };
			
 
				 
			
 
				-/*
			
 
				- * cgroup_rename - Only allow simple rename of directories in place.
			
 
				- */
			
 
				-static int cgroup_rename(struct kernfs_node *kn, struct kernfs_node *new_parent,
			
 
				-			 const char *new_name_str)
			
 
				-{
			
 
				-	struct cgroup *cgrp = kn->priv;
			
 
				-	int ret;
			
 
				-
			
 
				-	if (kernfs_type(kn) != KERNFS_DIR)
			
 
				-		return -ENOTDIR;
			
 
				-	if (kn->parent != new_parent)
			
 
				-		return -EIO;
			
 
				-
			
 
				-	/*
			
 
				-	 * This isn't a proper migration and its usefulness is very
			
 
				-	 * limited.  Disallow on the default hierarchy.
			
 
				-	 */
			
 
				-	if (cgroup_on_dfl(cgrp))
			
 
				-		return -EPERM;
			
 
				-
			
 
				-	/*
			
 
				-	 * We're gonna grab cgroup_mutex which nests outside kernfs
			
 
				-	 * active_ref.  kernfs_rename() doesn't require active_ref
			
 
				-	 * protection.  Break them before grabbing cgroup_mutex.
			
 
				-	 */
			
 
				-	kernfs_break_active_protection(new_parent);
			
 
				-	kernfs_break_active_protection(kn);
			
 
				-
			
 
				-	mutex_lock(&cgroup_mutex);
			
 
				-
			
 
				-	ret = kernfs_rename(kn, new_parent, new_name_str);
			
 
				-	if (!ret)
			
 
				-		trace_cgroup_rename(cgrp);
			
 
				-
			
 
				-	mutex_unlock(&cgroup_mutex);
			
 
				-
			
 
				-	kernfs_unbreak_active_protection(kn);
			
 
				-	kernfs_unbreak_active_protection(new_parent);
			
 
				-	return ret;
			
 
				-}
			
 
				-
			
 
				 /* set uid and gid of cgroup dirs and files to that of the creator */
			
 
				 static int cgroup_kn_set_ugid(struct kernfs_node *kn)
			
 
				 {
			
@@ -3925,26 +3329,6 @@ void cgroup_file_notify(struct cgroup_file *cfile)
 
				 	spin_unlock_irqrestore(&cgroup_file_kn_lock, flags);
			
 
				 }
			
 
				 
			
 
				-/**
			
 
				- * cgroup_task_count - count the number of tasks in a cgroup.
			
 
				- * @cgrp: the cgroup in question
			
 
				- *
			
 
				- * Return the number of tasks in the cgroup.  The returned number can be
			
 
				- * higher than the actual number of tasks due to css_set references from
			
 
				- * namespace roots and temporary usages.
			
 
				- */
			
 
				-static int cgroup_task_count(const struct cgroup *cgrp)
			
 
				-{
			
 
				-	int count = 0;
			
 
				-	struct cgrp_cset_link *link;
			
 
				-
			
 
				-	spin_lock_irq(&css_set_lock);
			
 
				-	list_for_each_entry(link, &cgrp->cset_links, cset_link)
			
 
				-		count += atomic_read(&link->cset->refcount);
			
 
				-	spin_unlock_irq(&css_set_lock);
			
 
				-	return count;
			
 
				-}
			
 
				-
			
 
				 /**
			
 
				  * css_next_child - find the next child of a given css
			
 
				  * @pos: the current position (%NULL to initiate traversal)
			
@@ -4343,560 +3727,69 @@ void css_task_iter_end(struct css_task_iter *it)
 
				 		put_task_struct(it->cur_task);
			
 
				 }
			
 
				 
			
 
				-/**
			
 
				- * cgroup_trasnsfer_tasks - move tasks from one cgroup to another
			
 
				- * @to: cgroup to which the tasks will be moved
			
 
				- * @from: cgroup in which the tasks currently reside
			
 
				- *
			
 
				- * Locking rules between cgroup_post_fork() and the migration path
			
 
				- * guarantee that, if a task is forking while being migrated, the new child
			
 
				- * is guaranteed to be either visible in the source cgroup after the
			
 
				- * parent's migration is complete or put into the target cgroup.  No task
			
 
				- * can slip out of migration through forking.
			
 
				- */
			
 
				-int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
			
 
				+static void cgroup_procs_release(struct kernfs_open_file *of)
			
 
				 {
			
 
				-	LIST_HEAD(preloaded_csets);
			
 
				-	struct cgrp_cset_link *link;
			
 
				-	struct css_task_iter it;
			
 
				-	struct task_struct *task;
			
 
				-	int ret;
			
 
				-
			
 
				-	if (!cgroup_may_migrate_to(to))
			
 
				-		return -EBUSY;
			
 
				+	if (of->priv) {
			
 
				+		css_task_iter_end(of->priv);
			
 
				+		kfree(of->priv);
			
 
				+	}
			
 
				+}
			
 
				 
			
 
				-	mutex_lock(&cgroup_mutex);
			
 
				+static void *cgroup_procs_next(struct seq_file *s, void *v, loff_t *pos)
			
 
				+{
			
 
				+	struct kernfs_open_file *of = s->private;
			
 
				+	struct css_task_iter *it = of->priv;
			
 
				+	struct task_struct *task;
			
 
				 
			
 
				-	percpu_down_write(&cgroup_threadgroup_rwsem);
			
 
				+	do {
			
 
				+		task = css_task_iter_next(it);
			
 
				+	} while (task && !thread_group_leader(task));
			
 
				 
			
 
				-	/* all tasks in @from are being moved, all csets are source */
			
 
				-	spin_lock_irq(&css_set_lock);
			
 
				-	list_for_each_entry(link, &from->cset_links, cset_link)
			
 
				-		cgroup_migrate_add_src(link->cset, to, &preloaded_csets);
			
 
				-	spin_unlock_irq(&css_set_lock);
			
 
				+	return task;
			
 
				+}
			
 
				 
			
 
				-	ret = cgroup_migrate_prepare_dst(&preloaded_csets);
			
 
				-	if (ret)
			
 
				-		goto out_err;
			
 
				+static void *cgroup_procs_start(struct seq_file *s, loff_t *pos)
			
 
				+{
			
 
				+	struct kernfs_open_file *of = s->private;
			
 
				+	struct cgroup *cgrp = seq_css(s)->cgroup;
			
 
				+	struct css_task_iter *it = of->priv;
			
 
				 
			
 
				 	/*
			
 
				-	 * Migrate tasks one-by-one until @from is empty.  This fails iff
			
 
				-	 * ->can_attach() fails.
			
 
				+	 * When a seq_file is seeked, it's always traversed sequentially
			
 
				+	 * from position 0, so we can simply keep iterating on !0 *pos.
			
 
				 	 */
			
 
				-	do {
			
 
				-		css_task_iter_start(&from->self, &it);
			
 
				-		task = css_task_iter_next(&it);
			
 
				-		if (task)
			
 
				-			get_task_struct(task);
			
 
				-		css_task_iter_end(&it);
			
 
				-
			
 
				-		if (task) {
			
 
				-			ret = cgroup_migrate(task, false, to->root);
			
 
				-			if (!ret)
			
 
				-				trace_cgroup_transfer_tasks(to, task, false);
			
 
				-			put_task_struct(task);
			
 
				-		}
			
 
				-	} while (task && !ret);
			
 
				-out_err:
			
 
				-	cgroup_migrate_finish(&preloaded_csets);
			
 
				-	percpu_up_write(&cgroup_threadgroup_rwsem);
			
 
				-	mutex_unlock(&cgroup_mutex);
			
 
				-	return ret;
			
 
				-}
			
 
				+	if (!it) {
			
 
				+		if (WARN_ON_ONCE((*pos)++))
			
 
				+			return ERR_PTR(-EINVAL);
			
 
				 
			
 
				-/*
			
 
				- * Stuff for reading the 'tasks'/'procs' files.
			
 
				- *
			
 
				- * Reading this file can return large amounts of data if a cgroup has
			
 
				- * *lots* of attached tasks. So it may need several calls to read(),
			
 
				- * but we cannot guarantee that the information we produce is correct
			
 
				- * unless we produce it entirely atomically.
			
 
				- *
			
 
				- */
			
 
				+		it = kzalloc(sizeof(*it), GFP_KERNEL);
			
 
				+		if (!it)
			
 
				+			return ERR_PTR(-ENOMEM);
			
 
				+		of->priv = it;
			
 
				+		css_task_iter_start(&cgrp->self, it);
			
 
				+	} else if (!(*pos)++) {
			
 
				+		css_task_iter_end(it);
			
 
				+		css_task_iter_start(&cgrp->self, it);
			
 
				+	}
			
 
				 
			
 
				-/* which pidlist file are we talking about? */
			
 
				-enum cgroup_filetype {
			
 
				-	CGROUP_FILE_PROCS,
			
 
				-	CGROUP_FILE_TASKS,
			
 
				-};
			
 
				+	return cgroup_procs_next(s, NULL, NULL);
			
 
				+}
			
 
				 
			
 
				-/*
			
 
				- * A pidlist is a list of pids that virtually represents the contents of one
			
 
				- * of the cgroup files ("procs" or "tasks"). We keep a list of such pidlists,
			
 
				- * a pair (one each for procs, tasks) for each pid namespace that's relevant
			
 
				- * to the cgroup.
			
 
				- */
			
 
				-struct cgroup_pidlist {
			
 
				-	/*
			
 
				-	 * used to find which pidlist is wanted. doesn't change as long as
			
 
				-	 * this particular list stays in the list.
			
 
				-	*/
			
 
				-	struct { enum cgroup_filetype type; struct pid_namespace *ns; } key;
			
 
				-	/* array of xids */
			
 
				-	pid_t *list;
			
 
				-	/* how many elements the above list has */
			
 
				-	int length;
			
 
				-	/* each of these stored in a list by its cgroup */
			
 
				-	struct list_head links;
			
 
				-	/* pointer to the cgroup we belong to, for list removal purposes */
			
 
				-	struct cgroup *owner;
			
 
				-	/* for delayed destruction */
			
 
				-	struct delayed_work destroy_dwork;
			
 
				-};
			
 
				-
			
 
				-/*
			
 
				- * The following two functions "fix" the issue where there are more pids
			
 
				- * than kmalloc will give memory for; in such cases, we use vmalloc/vfree.
			
 
				- * TODO: replace with a kernel-wide solution to this problem
			
 
				- */
			
 
				-#define PIDLIST_TOO_LARGE(c) ((c) * sizeof(pid_t) > (PAGE_SIZE * 2))
			
 
				-static void *pidlist_allocate(int count)
			
 
				-{
			
 
				-	if (PIDLIST_TOO_LARGE(count))
			
 
				-		return vmalloc(count * sizeof(pid_t));
			
 
				-	else
			
 
				-		return kmalloc(count * sizeof(pid_t), GFP_KERNEL);
			
 
				-}
			
 
				-
			
 
				-static void pidlist_free(void *p)
			
 
				-{
			
 
				-	kvfree(p);
			
 
				-}
			
 
				-
			
 
				-/*
			
 
				- * Used to destroy all pidlists lingering waiting for destroy timer.  None
			
 
				- * should be left afterwards.
			
 
				- */
			
 
				-static void cgroup_pidlist_destroy_all(struct cgroup *cgrp)
			
 
				-{
			
 
				-	struct cgroup_pidlist *l, *tmp_l;
			
 
				-
			
 
				-	mutex_lock(&cgrp->pidlist_mutex);
			
 
				-	list_for_each_entry_safe(l, tmp_l, &cgrp->pidlists, links)
			
 
				-		mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork, 0);
			
 
				-	mutex_unlock(&cgrp->pidlist_mutex);
			
 
				-
			
 
				-	flush_workqueue(cgroup_pidlist_destroy_wq);
			
 
				-	BUG_ON(!list_empty(&cgrp->pidlists));
			
 
				-}
			
 
				-
			
 
				-static void cgroup_pidlist_destroy_work_fn(struct work_struct *work)
			
 
				-{
			
 
				-	struct delayed_work *dwork = to_delayed_work(work);
			
 
				-	struct cgroup_pidlist *l = container_of(dwork, struct cgroup_pidlist,
			
 
				-						destroy_dwork);
			
 
				-	struct cgroup_pidlist *tofree = NULL;
			
 
				-
			
 
				-	mutex_lock(&l->owner->pidlist_mutex);
			
 
				-
			
 
				-	/*
			
 
				-	 * Destroy iff we didn't get queued again.  The state won't change
			
 
				-	 * as destroy_dwork can only be queued while locked.
			
 
				-	 */
			
 
				-	if (!delayed_work_pending(dwork)) {
			
 
				-		list_del(&l->links);
			
 
				-		pidlist_free(l->list);
			
 
				-		put_pid_ns(l->key.ns);
			
 
				-		tofree = l;
			
 
				-	}
			
 
				-
			
 
				-	mutex_unlock(&l->owner->pidlist_mutex);
			
 
				-	kfree(tofree);
			
 
				-}
			
 
				-
			
 
				-/*
			
 
				- * pidlist_uniq - given a kmalloc()ed list, strip out all duplicate entries
			
 
				- * Returns the number of unique elements.
			
 
				- */
			
 
				-static int pidlist_uniq(pid_t *list, int length)
			
 
				-{
			
 
				-	int src, dest = 1;
			
 
				-
			
 
				-	/*
			
 
				-	 * we presume the 0th element is unique, so i starts at 1. trivial
			
 
				-	 * edge cases first; no work needs to be done for either
			
 
				-	 */
			
 
				-	if (length == 0 || length == 1)
			
 
				-		return length;
			
 
				-	/* src and dest walk down the list; dest counts unique elements */
			
 
				-	for (src = 1; src < length; src++) {
			
 
				-		/* find next unique element */
			
 
				-		while (list[src] == list[src-1]) {
			
 
				-			src++;
			
 
				-			if (src == length)
			
 
				-				goto after;
			
 
				-		}
			
 
				-		/* dest always points to where the next unique element goes */
			
 
				-		list[dest] = list[src];
			
 
				-		dest++;
			
 
				-	}
			
 
				-after:
			
 
				-	return dest;
			
 
				-}
			
 
				-
			
 
				-/*
			
 
				- * The two pid files - task and cgroup.procs - guaranteed that the result
			
 
				- * is sorted, which forced this whole pidlist fiasco.  As pid order is
			
 
				- * different per namespace, each namespace needs differently sorted list,
			
 
				- * making it impossible to use, for example, single rbtree of member tasks
			
 
				- * sorted by task pointer.  As pidlists can be fairly large, allocating one
			
 
				- * per open file is dangerous, so cgroup had to implement shared pool of
			
 
				- * pidlists keyed by cgroup and namespace.
			
 
				- *
			
 
				- * All this extra complexity was caused by the original implementation
			
 
				- * committing to an entirely unnecessary property.  In the long term, we
			
 
				- * want to do away with it.  Explicitly scramble sort order if on the
			
 
				- * default hierarchy so that no such expectation exists in the new
			
 
				- * interface.
			
 
				- *
			
 
				- * Scrambling is done by swapping every two consecutive bits, which is
			
 
				- * non-identity one-to-one mapping which disturbs sort order sufficiently.
			
 
				- */
			
 
				-static pid_t pid_fry(pid_t pid)
			
 
				-{
			
 
				-	unsigned a = pid & 0x55555555;
			
 
				-	unsigned b = pid & 0xAAAAAAAA;
			
 
				-
			
 
				-	return (a << 1) | (b >> 1);
			
 
				-}
			
 
				-
			
 
				-static pid_t cgroup_pid_fry(struct cgroup *cgrp, pid_t pid)
			
 
				-{
			
 
				-	if (cgroup_on_dfl(cgrp))
			
 
				-		return pid_fry(pid);
			
 
				-	else
			
 
				-		return pid;
			
 
				-}
			
 
				-
			
 
				-static int cmppid(const void *a, const void *b)
			
 
				-{
			
 
				-	return *(pid_t *)a - *(pid_t *)b;
			
 
				-}
			
 
				-
			
 
				-static int fried_cmppid(const void *a, const void *b)
			
 
				-{
			
 
				-	return pid_fry(*(pid_t *)a) - pid_fry(*(pid_t *)b);
			
 
				-}
			
 
				-
			
 
				-static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
			
 
				-						  enum cgroup_filetype type)
			
 
				-{
			
 
				-	struct cgroup_pidlist *l;
			
 
				-	/* don't need task_nsproxy() if we're looking at ourself */
			
 
				-	struct pid_namespace *ns = task_active_pid_ns(current);
			
 
				-
			
 
				-	lockdep_assert_held(&cgrp->pidlist_mutex);
			
 
				-
			
 
				-	list_for_each_entry(l, &cgrp->pidlists, links)
			
 
				-		if (l->key.type == type && l->key.ns == ns)
			
 
				-			return l;
			
 
				-	return NULL;
			
 
				-}
			
 
				-
			
 
				-/*
			
 
				- * find the appropriate pidlist for our purpose (given procs vs tasks)
			
 
				- * returns with the lock on that pidlist already held, and takes care
			
 
				- * of the use count, or returns NULL with no locks held if we're out of
			
 
				- * memory.
			
 
				- */
			
 
				-static struct cgroup_pidlist *cgroup_pidlist_find_create(struct cgroup *cgrp,
			
 
				-						enum cgroup_filetype type)
			
 
				-{
			
 
				-	struct cgroup_pidlist *l;
			
 
				-
			
 
				-	lockdep_assert_held(&cgrp->pidlist_mutex);
			
 
				-
			
 
				-	l = cgroup_pidlist_find(cgrp, type);
			
 
				-	if (l)
			
 
				-		return l;
			
 
				-
			
 
				-	/* entry not found; create a new one */
			
 
				-	l = kzalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL);
			
 
				-	if (!l)
			
 
				-		return l;
			
 
				-
			
 
				-	INIT_DELAYED_WORK(&l->destroy_dwork, cgroup_pidlist_destroy_work_fn);
			
 
				-	l->key.type = type;
			
 
				-	/* don't need task_nsproxy() if we're looking at ourself */
			
 
				-	l->key.ns = get_pid_ns(task_active_pid_ns(current));
			
 
				-	l->owner = cgrp;
			
 
				-	list_add(&l->links, &cgrp->pidlists);
			
 
				-	return l;
			
 
				-}
			
 
				-
			
 
				-/*
			
 
				- * Load a cgroup's pidarray with either procs' tgids or tasks' pids
			
 
				- */
			
 
				-static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
			
 
				-			      struct cgroup_pidlist **lp)
			
 
				-{
			
 
				-	pid_t *array;
			
 
				-	int length;
			
 
				-	int pid, n = 0; /* used for populating the array */
			
 
				-	struct css_task_iter it;
			
 
				-	struct task_struct *tsk;
			
 
				-	struct cgroup_pidlist *l;
			
 
				-
			
 
				-	lockdep_assert_held(&cgrp->pidlist_mutex);
			
 
				-
			
 
				-	/*
			
 
				-	 * If cgroup gets more users after we read count, we won't have
			
 
				-	 * enough space - tough.  This race is indistinguishable to the
			
 
				-	 * caller from the case that the additional cgroup users didn't
			
 
				-	 * show up until sometime later on.
			
 
				-	 */
			
 
				-	length = cgroup_task_count(cgrp);
			
 
				-	array = pidlist_allocate(length);
			
 
				-	if (!array)
			
 
				-		return -ENOMEM;
			
 
				-	/* now, populate the array */
			
 
				-	css_task_iter_start(&cgrp->self, &it);
			
 
				-	while ((tsk = css_task_iter_next(&it))) {
			
 
				-		if (unlikely(n == length))
			
 
				-			break;
			
 
				-		/* get tgid or pid for procs or tasks file respectively */
			
 
				-		if (type == CGROUP_FILE_PROCS)
			
 
				-			pid = task_tgid_vnr(tsk);
			
 
				-		else
			
 
				-			pid = task_pid_vnr(tsk);
			
 
				-		if (pid > 0) /* make sure to only use valid results */
			
 
				-			array[n++] = pid;
			
 
				-	}
			
 
				-	css_task_iter_end(&it);
			
 
				-	length = n;
			
 
				-	/* now sort & (if procs) strip out duplicates */
			
 
				-	if (cgroup_on_dfl(cgrp))
			
 
				-		sort(array, length, sizeof(pid_t), fried_cmppid, NULL);
			
 
				-	else
			
 
				-		sort(array, length, sizeof(pid_t), cmppid, NULL);
			
 
				-	if (type == CGROUP_FILE_PROCS)
			
 
				-		length = pidlist_uniq(array, length);
			
 
				-
			
 
				-	l = cgroup_pidlist_find_create(cgrp, type);
			
 
				-	if (!l) {
			
 
				-		pidlist_free(array);
			
 
				-		return -ENOMEM;
			
 
				-	}
			
 
				-
			
 
				-	/* store array, freeing old if necessary */
			
 
				-	pidlist_free(l->list);
			
 
				-	l->list = array;
			
 
				-	l->length = length;
			
 
				-	*lp = l;
			
 
				-	return 0;
			
 
				-}
			
 
				-
			
 
				-/**
			
 
				- * cgroupstats_build - build and fill cgroupstats
			
 
				- * @stats: cgroupstats to fill information into
			
 
				- * @dentry: A dentry entry belonging to the cgroup for which stats have
			
 
				- * been requested.
			
 
				- *
			
 
				- * Build and fill cgroupstats so that taskstats can export it to user
			
 
				- * space.
			
 
				- */
			
 
				-int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
			
 
				-{
			
 
				-	struct kernfs_node *kn = kernfs_node_from_dentry(dentry);
			
 
				-	struct cgroup *cgrp;
			
 
				-	struct css_task_iter it;
			
 
				-	struct task_struct *tsk;
			
 
				-
			
 
				-	/* it should be kernfs_node belonging to cgroupfs and is a directory */
			
 
				-	if (dentry->d_sb->s_type != &cgroup_fs_type || !kn ||
			
 
				-	    kernfs_type(kn) != KERNFS_DIR)
			
 
				-		return -EINVAL;
			
 
				-
			
 
				-	mutex_lock(&cgroup_mutex);
			
 
				-
			
 
				-	/*
			
 
				-	 * We aren't being called from kernfs and there's no guarantee on
			
 
				-	 * @kn->priv's validity.  For this and css_tryget_online_from_dir(),
			
 
				-	 * @kn->priv is RCU safe.  Let's do the RCU dancing.
			
 
				-	 */
			
 
				-	rcu_read_lock();
			
 
				-	cgrp = rcu_dereference(kn->priv);
			
 
				-	if (!cgrp || cgroup_is_dead(cgrp)) {
			
 
				-		rcu_read_unlock();
			
 
				-		mutex_unlock(&cgroup_mutex);
			
 
				-		return -ENOENT;
			
 
				-	}
			
 
				-	rcu_read_unlock();
			
 
				-
			
 
				-	css_task_iter_start(&cgrp->self, &it);
			
 
				-	while ((tsk = css_task_iter_next(&it))) {
			
 
				-		switch (tsk->state) {
			
 
				-		case TASK_RUNNING:
			
 
				-			stats->nr_running++;
			
 
				-			break;
			
 
				-		case TASK_INTERRUPTIBLE:
			
 
				-			stats->nr_sleeping++;
			
 
				-			break;
			
 
				-		case TASK_UNINTERRUPTIBLE:
			
 
				-			stats->nr_uninterruptible++;
			
 
				-			break;
			
 
				-		case TASK_STOPPED:
			
 
				-			stats->nr_stopped++;
			
 
				-			break;
			
 
				-		default:
			
 
				-			if (delayacct_is_task_waiting_on_io(tsk))
			
 
				-				stats->nr_io_wait++;
			
 
				-			break;
			
 
				-		}
			
 
				-	}
			
 
				-	css_task_iter_end(&it);
			
 
				-
			
 
				-	mutex_unlock(&cgroup_mutex);
			
 
				-	return 0;
			
 
				-}
			
 
				-
			
 
				-
			
 
				-/*
			
 
				- * seq_file methods for the tasks/procs files. The seq_file position is the
			
 
				- * next pid to display; the seq_file iterator is a pointer to the pid
			
 
				- * in the cgroup->l->list array.
			
 
				- */
			
 
				-
			
 
				-static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos)
			
 
				+static int cgroup_procs_show(struct seq_file *s, void *v)
			
 
				 {
			
 
				-	/*
			
 
				-	 * Initially we receive a position value that corresponds to
			
 
				-	 * one more than the last pid shown (or 0 on the first call or
			
 
				-	 * after a seek to the start). Use a binary-search to find the
			
 
				-	 * next pid to display, if any
			
 
				-	 */
			
 
				-	struct kernfs_open_file *of = s->private;
			
 
				-	struct cgroup *cgrp = seq_css(s)->cgroup;
			
 
				-	struct cgroup_pidlist *l;
			
 
				-	enum cgroup_filetype type = seq_cft(s)->private;
			
 
				-	int index = 0, pid = *pos;
			
 
				-	int *iter, ret;
			
 
				-
			
 
				-	mutex_lock(&cgrp->pidlist_mutex);
			
 
				-
			
 
				-	/*
			
 
				-	 * !NULL @of->priv indicates that this isn't the first start()
			
 
				-	 * after open.  If the matching pidlist is around, we can use that.
			
 
				-	 * Look for it.  Note that @of->priv can't be used directly.  It
			
 
				-	 * could already have been destroyed.
			
 
				-	 */
			
 
				-	if (of->priv)
			
 
				-		of->priv = cgroup_pidlist_find(cgrp, type);
			
 
				-
			
 
				-	/*
			
 
				-	 * Either this is the first start() after open or the matching
			
 
				-	 * pidlist has been destroyed inbetween.  Create a new one.
			
 
				-	 */
			
 
				-	if (!of->priv) {
			
 
				-		ret = pidlist_array_load(cgrp, type,
			
 
				-					 (struct cgroup_pidlist **)&of->priv);
			
 
				-		if (ret)
			
 
				-			return ERR_PTR(ret);
			
 
				-	}
			
 
				-	l = of->priv;
			
 
				-
			
 
				-	if (pid) {
			
 
				-		int end = l->length;
			
 
				-
			
 
				-		while (index < end) {
			
 
				-			int mid = (index + end) / 2;
			
 
				-			if (cgroup_pid_fry(cgrp, l->list[mid]) == pid) {
			
 
				-				index = mid;
			
 
				-				break;
			
 
				-			} else if (cgroup_pid_fry(cgrp, l->list[mid]) <= pid)
			
 
				-				index = mid + 1;
			
 
				-			else
			
 
				-				end = mid;
			
 
				-		}
			
 
				-	}
			
 
				-	/* If we're off the end of the array, we're done */
			
 
				-	if (index >= l->length)
			
 
				-		return NULL;
			
 
				-	/* Update the abstract position to be the actual pid that we found */
			
 
				-	iter = l->list + index;
			
 
				-	*pos = cgroup_pid_fry(cgrp, *iter);
			
 
				-	return iter;
			
 
				-}
			
 
				-
			
 
				-static void cgroup_pidlist_stop(struct seq_file *s, void *v)
			
 
				-{
			
 
				-	struct kernfs_open_file *of = s->private;
			
 
				-	struct cgroup_pidlist *l = of->priv;
			
 
				-
			
 
				-	if (l)
			
 
				-		mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork,
			
 
				-				 CGROUP_PIDLIST_DESTROY_DELAY);
			
 
				-	mutex_unlock(&seq_css(s)->cgroup->pidlist_mutex);
			
 
				-}
			
 
				-
			
 
				-static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos)
			
 
				-{
			
 
				-	struct kernfs_open_file *of = s->private;
			
 
				-	struct cgroup_pidlist *l = of->priv;
			
 
				-	pid_t *p = v;
			
 
				-	pid_t *end = l->list + l->length;
			
 
				-	/*
			
 
				-	 * Advance to the next pid in the array. If this goes off the
			
 
				-	 * end, we're done
			
 
				-	 */
			
 
				-	p++;
			
 
				-	if (p >= end) {
			
 
				-		return NULL;
			
 
				-	} else {
			
 
				-		*pos = cgroup_pid_fry(seq_css(s)->cgroup, *p);
			
 
				-		return p;
			
 
				-	}
			
 
				-}
			
 
				-
			
 
				-static int cgroup_pidlist_show(struct seq_file *s, void *v)
			
 
				-{
			
 
				-	seq_printf(s, "%d\n", *(int *)v);
			
 
				-
			
 
				-	return 0;
			
 
				-}
			
 
				-
			
 
				-static u64 cgroup_read_notify_on_release(struct cgroup_subsys_state *css,
			
 
				-					 struct cftype *cft)
			
 
				-{
			
 
				-	return notify_on_release(css->cgroup);
			
 
				-}
			
 
				-
			
 
				-static int cgroup_write_notify_on_release(struct cgroup_subsys_state *css,
			
 
				-					  struct cftype *cft, u64 val)
			
 
				-{
			
 
				-	if (val)
			
 
				-		set_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags);
			
 
				-	else
			
 
				-		clear_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags);
			
 
				-	return 0;
			
 
				-}
			
 
				-
			
 
				-static u64 cgroup_clone_children_read(struct cgroup_subsys_state *css,
			
 
				-				      struct cftype *cft)
			
 
				-{
			
 
				-	return test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
			
 
				-}
			
 
				-
			
 
				-static int cgroup_clone_children_write(struct cgroup_subsys_state *css,
			
 
				-				       struct cftype *cft, u64 val)
			
 
				-{
			
 
				-	if (val)
			
 
				-		set_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
			
 
				-	else
			
 
				-		clear_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
			
 
				+	seq_printf(s, "%d\n", task_tgid_vnr(v));
			
 
				 	return 0;
			
 
				 }
			
 
				 
			
 
				 /* cgroup core interface files for the default hierarchy */
			
 
				-static struct cftype cgroup_dfl_base_files[] = {
			
 
				+static struct cftype cgroup_base_files[] = {
			
 
				 	{
			
 
				 		.name = "cgroup.procs",
			
 
				 		.file_offset = offsetof(struct cgroup, procs_file),
			
 
				-		.seq_start = cgroup_pidlist_start,
			
 
				-		.seq_next = cgroup_pidlist_next,
			
 
				-		.seq_stop = cgroup_pidlist_stop,
			
 
				-		.seq_show = cgroup_pidlist_show,
			
 
				-		.private = CGROUP_FILE_PROCS,
			
 
				+		.release = cgroup_procs_release,
			
 
				+		.seq_start = cgroup_procs_start,
			
 
				+		.seq_next = cgroup_procs_next,
			
 
				+		.seq_show = cgroup_procs_show,
			
 
				 		.write = cgroup_procs_write,
			
 
				 	},
			
 
				 	{
			
@@ -4917,51 +3810,6 @@ static struct cftype cgroup_dfl_base_files[] = {
 
				 	{ }	/* terminate */
			
 
				 };
			
 
				 
			
 
				-/* cgroup core interface files for the legacy hierarchies */
			
 
				-static struct cftype cgroup_legacy_base_files[] = {
			
 
				-	{
			
 
				-		.name = "cgroup.procs",
			
 
				-		.seq_start = cgroup_pidlist_start,
			
 
				-		.seq_next = cgroup_pidlist_next,
			
 
				-		.seq_stop = cgroup_pidlist_stop,
			
 
				-		.seq_show = cgroup_pidlist_show,
			
 
				-		.private = CGROUP_FILE_PROCS,
			
 
				-		.write = cgroup_procs_write,
			
 
				-	},
			
 
				-	{
			
 
				-		.name = "cgroup.clone_children",
			
 
				-		.read_u64 = cgroup_clone_children_read,
			
 
				-		.write_u64 = cgroup_clone_children_write,
			
 
				-	},
			
 
				-	{
			
 
				-		.name = "cgroup.sane_behavior",
			
 
				-		.flags = CFTYPE_ONLY_ON_ROOT,
			
 
				-		.seq_show = cgroup_sane_behavior_show,
			
 
				-	},
			
 
				-	{
			
 
				-		.name = "tasks",
			
 
				-		.seq_start = cgroup_pidlist_start,
			
 
				-		.seq_next = cgroup_pidlist_next,
			
 
				-		.seq_stop = cgroup_pidlist_stop,
			
 
				-		.seq_show = cgroup_pidlist_show,
			
 
				-		.private = CGROUP_FILE_TASKS,
			
 
				-		.write = cgroup_tasks_write,
			
 
				-	},
			
 
				-	{
			
 
				-		.name = "notify_on_release",
			
 
				-		.read_u64 = cgroup_read_notify_on_release,
			
 
				-		.write_u64 = cgroup_write_notify_on_release,
			
 
				-	},
			
 
				-	{
			
 
				-		.name = "release_agent",
			
 
				-		.flags = CFTYPE_ONLY_ON_ROOT,
			
 
				-		.seq_show = cgroup_release_agent_show,
			
 
				-		.write = cgroup_release_agent_write,
			
 
				-		.max_write_len = PATH_MAX - 1,
			
 
				-	},
			
 
				-	{ }	/* terminate */
			
 
				-};
			
 
				-
			
 
				 /*
			
 
				  * css destruction is four-stage process.
			
 
				  *
			
@@ -5007,7 +3855,7 @@ static void css_free_work_fn(struct work_struct *work)
 
				 	} else {
			
 
				 		/* cgroup free path */
			
 
				 		atomic_dec(&cgrp->root->nr_cgrps);
			
 
				-		cgroup_pidlist_destroy_all(cgrp);
			
 
				+		cgroup1_pidlist_destroy_all(cgrp);
			
 
				 		cancel_work_sync(&cgrp->release_agent_work);
			
 
				 
			
 
				 		if (cgroup_parent(cgrp)) {
			
@@ -5302,8 +4150,7 @@ out_free_cgrp:
 
				 	return ERR_PTR(ret);
			
 
				 }
			
 
				 
			
 
				-static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
			
 
				-			umode_t mode)
			
 
				+int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode)
			
 
				 {
			
 
				 	struct cgroup *parent, *cgrp;
			
 
				 	struct kernfs_node *kn;
			
@@ -5507,7 +4354,7 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
 
				 	 */
			
 
				 	kernfs_remove(cgrp->kn);
			
 
				 
			
 
				-	check_for_release(cgroup_parent(cgrp));
			
 
				+	cgroup1_check_for_release(cgroup_parent(cgrp));
			
 
				 
			
 
				 	/* put the base reference */
			
 
				 	percpu_ref_kill(&cgrp->self.refcnt);
			
@@ -5515,7 +4362,7 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
 
				 	return 0;
			
 
				 };
			
 
				 
			
 
				-static int cgroup_rmdir(struct kernfs_node *kn)
			
 
				+int cgroup_rmdir(struct kernfs_node *kn)
			
 
				 {
			
 
				 	struct cgroup *cgrp;
			
 
				 	int ret = 0;
			
@@ -5535,10 +4382,8 @@ static int cgroup_rmdir(struct kernfs_node *kn)
 
				 
			
 
				 static struct kernfs_syscall_ops cgroup_kf_syscall_ops = {
			
 
				 	.remount_fs		= cgroup_remount,
			
 
				-	.show_options		= cgroup_show_options,
			
 
				 	.mkdir			= cgroup_mkdir,
			
 
				 	.rmdir			= cgroup_rmdir,
			
 
				-	.rename			= cgroup_rename,
			
 
				 	.show_path		= cgroup_show_path,
			
 
				 };
			
 
				 
			
@@ -5646,8 +4491,8 @@ int __init cgroup_init(void)
 
				 
			
 
				 	BUILD_BUG_ON(CGROUP_SUBSYS_COUNT > 16);
			
 
				 	BUG_ON(percpu_init_rwsem(&cgroup_threadgroup_rwsem));
			
 
				-	BUG_ON(cgroup_init_cftypes(NULL, cgroup_dfl_base_files));
			
 
				-	BUG_ON(cgroup_init_cftypes(NULL, cgroup_legacy_base_files));
			
 
				+	BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files));
			
 
				+	BUG_ON(cgroup_init_cftypes(NULL, cgroup1_base_files));
			
 
				 
			
 
				 	/*
			
 
				 	 * The latency of the synchronize_sched() is too high for cgroups,
			
@@ -5697,7 +4542,7 @@ int __init cgroup_init(void)
 
				 			continue;
			
 
				 		}
			
 
				 
			
 
				-		if (cgroup_ssid_no_v1(ssid))
			
 
				+		if (cgroup1_ssid_disabled(ssid))
			
 
				 			printk(KERN_INFO "Disabling %s control group subsystem in v1 mounts\n",
			
 
				 			       ss->name);
			
 
				 
			
@@ -5744,15 +4589,6 @@ static int __init cgroup_wq_init(void)
 
				 	 */
			
 
				 	cgroup_destroy_wq = alloc_workqueue("cgroup_destroy", 0, 1);
			
 
				 	BUG_ON(!cgroup_destroy_wq);
			
 
				-
			
 
				-	/*
			
 
				-	 * Used to destroy pidlists and separate to serve as flush domain.
			
 
				-	 * Cap @max_active to 1 too.
			
 
				-	 */
			
 
				-	cgroup_pidlist_destroy_wq = alloc_workqueue("cgroup_pidlist_destroy",
			
 
				-						    0, 1);
			
 
				-	BUG_ON(!cgroup_pidlist_destroy_wq);
			
 
				-
			
 
				 	return 0;
			
 
				 }
			
 
				 core_initcall(cgroup_wq_init);
			
@@ -5835,42 +4671,6 @@ out:
 
				 	return retval;
			
 
				 }
			
 
				 
			
 
				-/* Display information about each subsystem and each hierarchy */
			
 
				-static int proc_cgroupstats_show(struct seq_file *m, void *v)
			
 
				-{
			
 
				-	struct cgroup_subsys *ss;
			
 
				-	int i;
			
 
				-
			
 
				-	seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n");
			
 
				-	/*
			
 
				-	 * ideally we don't want subsystems moving around while we do this.
			
 
				-	 * cgroup_mutex is also necessary to guarantee an atomic snapshot of
			
 
				-	 * subsys/hierarchy state.
			
 
				-	 */
			
 
				-	mutex_lock(&cgroup_mutex);
			
 
				-
			
 
				-	for_each_subsys(ss, i)
			
 
				-		seq_printf(m, "%s\t%d\t%d\t%d\n",
			
 
				-			   ss->legacy_name, ss->root->hierarchy_id,
			
 
				-			   atomic_read(&ss->root->nr_cgrps),
			
 
				-			   cgroup_ssid_enabled(i));
			
 
				-
			
 
				-	mutex_unlock(&cgroup_mutex);
			
 
				-	return 0;
			
 
				-}
			
 
				-
			
 
				-static int cgroupstats_open(struct inode *inode, struct file *file)
			
 
				-{
			
 
				-	return single_open(file, proc_cgroupstats_show, NULL);
			
 
				-}
			
 
				-
			
 
				-static const struct file_operations proc_cgroupstats_operations = {
			
 
				-	.open = cgroupstats_open,
			
 
				-	.read = seq_read,
			
 
				-	.llseek = seq_lseek,
			
 
				-	.release = single_release,
			
 
				-};
			
 
				-
			
 
				 /**
			
 
				  * cgroup_fork - initialize cgroup related fields during copy_process()
			
 
				  * @child: pointer to task_struct of forking parent process.
			
@@ -6050,76 +4850,6 @@ void cgroup_free(struct task_struct *task)
 
				 	put_css_set(cset);
			
 
				 }
			
 
				 
			
 
				-static void check_for_release(struct cgroup *cgrp)
			
 
				-{
			
 
				-	if (notify_on_release(cgrp) && !cgroup_is_populated(cgrp) &&
			
 
				-	    !css_has_online_children(&cgrp->self) && !cgroup_is_dead(cgrp))
			
 
				-		schedule_work(&cgrp->release_agent_work);
			
 
				-}
			
 
				-
			
 
				-/*
			
 
				- * Notify userspace when a cgroup is released, by running the
			
 
				- * configured release agent with the name of the cgroup (path
			
 
				- * relative to the root of cgroup file system) as the argument.
			
 
				- *
			
 
				- * Most likely, this user command will try to rmdir this cgroup.
			
 
				- *
			
 
				- * This races with the possibility that some other task will be
			
 
				- * attached to this cgroup before it is removed, or that some other
			
 
				- * user task will 'mkdir' a child cgroup of this cgroup.  That's ok.
			
 
				- * The presumed 'rmdir' will fail quietly if this cgroup is no longer
			
 
				- * unused, and this cgroup will be reprieved from its death sentence,
			
 
				- * to continue to serve a useful existence.  Next time it's released,
			
 
				- * we will get notified again, if it still has 'notify_on_release' set.
			
 
				- *
			
 
				- * The final arg to call_usermodehelper() is UMH_WAIT_EXEC, which
			
 
				- * means only wait until the task is successfully execve()'d.  The
			
 
				- * separate release agent task is forked by call_usermodehelper(),
			
 
				- * then control in this thread returns here, without waiting for the
			
 
				- * release agent task.  We don't bother to wait because the caller of
			
 
				- * this routine has no use for the exit status of the release agent
			
 
				- * task, so no sense holding our caller up for that.
			
 
				- */
			
 
				-static void cgroup_release_agent(struct work_struct *work)
			
 
				-{
			
 
				-	struct cgroup *cgrp =
			
 
				-		container_of(work, struct cgroup, release_agent_work);
			
 
				-	char *pathbuf = NULL, *agentbuf = NULL;
			
 
				-	char *argv[3], *envp[3];
			
 
				-	int ret;
			
 
				-
			
 
				-	mutex_lock(&cgroup_mutex);
			
 
				-
			
 
				-	pathbuf = kmalloc(PATH_MAX, GFP_KERNEL);
			
 
				-	agentbuf = kstrdup(cgrp->root->release_agent_path, GFP_KERNEL);
			
 
				-	if (!pathbuf || !agentbuf)
			
 
				-		goto out;
			
 
				-
			
 
				-	spin_lock_irq(&css_set_lock);
			
 
				-	ret = cgroup_path_ns_locked(cgrp, pathbuf, PATH_MAX, &init_cgroup_ns);
			
 
				-	spin_unlock_irq(&css_set_lock);
			
 
				-	if (ret < 0 || ret >= PATH_MAX)
			
 
				-		goto out;
			
 
				-
			
 
				-	argv[0] = agentbuf;
			
 
				-	argv[1] = pathbuf;
			
 
				-	argv[2] = NULL;
			
 
				-
			
 
				-	/* minimal command environment */
			
 
				-	envp[0] = "HOME=/";
			
 
				-	envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
			
 
				-	envp[2] = NULL;
			
 
				-
			
 
				-	mutex_unlock(&cgroup_mutex);
			
 
				-	call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
			
 
				-	goto out_free;
			
 
				-out:
			
 
				-	mutex_unlock(&cgroup_mutex);
			
 
				-out_free:
			
 
				-	kfree(agentbuf);
			
 
				-	kfree(pathbuf);
			
 
				-}
			
 
				-
			
 
				 static int __init cgroup_disable(char *str)
			
 
				 {
			
 
				 	struct cgroup_subsys *ss;
			
@@ -6141,33 +4871,6 @@ static int __init cgroup_disable(char *str)
 
				 }
			
 
				 __setup("cgroup_disable=", cgroup_disable);
			
 
				 
			
 
				-static int __init cgroup_no_v1(char *str)
			
 
				-{
			
 
				-	struct cgroup_subsys *ss;
			
 
				-	char *token;
			
 
				-	int i;
			
 
				-
			
 
				-	while ((token = strsep(&str, ",")) != NULL) {
			
 
				-		if (!*token)
			
 
				-			continue;
			
 
				-
			
 
				-		if (!strcmp(token, "all")) {
			
 
				-			cgroup_no_v1_mask = U16_MAX;
			
 
				-			break;
			
 
				-		}
			
 
				-
			
 
				-		for_each_subsys(ss, i) {
			
 
				-			if (strcmp(token, ss->name) &&
			
 
				-			    strcmp(token, ss->legacy_name))
			
 
				-				continue;
			
 
				-
			
 
				-			cgroup_no_v1_mask |= 1 << i;
			
 
				-		}
			
 
				-	}
			
 
				-	return 1;
			
 
				-}
			
 
				-__setup("cgroup_no_v1=", cgroup_no_v1);
			
 
				-
			
 
				 /**
			
 
				  * css_tryget_online_from_dir - get corresponding css from a cgroup dentry
			
 
				  * @dentry: directory dentry of interest
			
@@ -6197,7 +4900,7 @@ struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry,
 
				 	 * have been or be removed at any point.  @kn->priv is RCU
			
 
				 	 * protected for this access.  See css_release_work_fn() for details.
			
 
				 	 */
			
 
				-	cgrp = rcu_dereference(kn->priv);
			
 
				+	cgrp = rcu_dereference(*(void __rcu __force **)&kn->priv);
			
 
				 	if (cgrp)
			
 
				 		css = cgroup_css(cgrp, ss);
			
 
				 
			
@@ -6349,154 +5052,6 @@ void cgroup_sk_free(struct sock_cgroup_data *skcd)
 
				 
			
 
				 #endif	/* CONFIG_SOCK_CGROUP_DATA */
			
 
				 
			
 
				-/* cgroup namespaces */
			
 
				-
			
 
				-static struct ucounts *inc_cgroup_namespaces(struct user_namespace *ns)
			
 
				-{
			
 
				-	return inc_ucount(ns, current_euid(), UCOUNT_CGROUP_NAMESPACES);
			
 
				-}
			
 
				-
			
 
				-static void dec_cgroup_namespaces(struct ucounts *ucounts)
			
 
				-{
			
 
				-	dec_ucount(ucounts, UCOUNT_CGROUP_NAMESPACES);
			
 
				-}
			
 
				-
			
 
				-static struct cgroup_namespace *alloc_cgroup_ns(void)
			
 
				-{
			
 
				-	struct cgroup_namespace *new_ns;
			
 
				-	int ret;
			
 
				-
			
 
				-	new_ns = kzalloc(sizeof(struct cgroup_namespace), GFP_KERNEL);
			
 
				-	if (!new_ns)
			
 
				-		return ERR_PTR(-ENOMEM);
			
 
				-	ret = ns_alloc_inum(&new_ns->ns);
			
 
				-	if (ret) {
			
 
				-		kfree(new_ns);
			
 
				-		return ERR_PTR(ret);
			
 
				-	}
			
 
				-	atomic_set(&new_ns->count, 1);
			
 
				-	new_ns->ns.ops = &cgroupns_operations;
			
 
				-	return new_ns;
			
 
				-}
			
 
				-
			
 
				-void free_cgroup_ns(struct cgroup_namespace *ns)
			
 
				-{
			
 
				-	put_css_set(ns->root_cset);
			
 
				-	dec_cgroup_namespaces(ns->ucounts);
			
 
				-	put_user_ns(ns->user_ns);
			
 
				-	ns_free_inum(&ns->ns);
			
 
				-	kfree(ns);
			
 
				-}
			
 
				-EXPORT_SYMBOL(free_cgroup_ns);
			
 
				-
			
 
				-struct cgroup_namespace *copy_cgroup_ns(unsigned long flags,
			
 
				-					struct user_namespace *user_ns,
			
 
				-					struct cgroup_namespace *old_ns)
			
 
				-{
			
 
				-	struct cgroup_namespace *new_ns;
			
 
				-	struct ucounts *ucounts;
			
 
				-	struct css_set *cset;
			
 
				-
			
 
				-	BUG_ON(!old_ns);
			
 
				-
			
 
				-	if (!(flags & CLONE_NEWCGROUP)) {
			
 
				-		get_cgroup_ns(old_ns);
			
 
				-		return old_ns;
			
 
				-	}
			
 
				-
			
 
				-	/* Allow only sysadmin to create cgroup namespace. */
			
 
				-	if (!ns_capable(user_ns, CAP_SYS_ADMIN))
			
 
				-		return ERR_PTR(-EPERM);
			
 
				-
			
 
				-	ucounts = inc_cgroup_namespaces(user_ns);
			
 
				-	if (!ucounts)
			
 
				-		return ERR_PTR(-ENOSPC);
			
 
				-
			
 
				-	/* It is not safe to take cgroup_mutex here */
			
 
				-	spin_lock_irq(&css_set_lock);
			
 
				-	cset = task_css_set(current);
			
 
				-	get_css_set(cset);
			
 
				-	spin_unlock_irq(&css_set_lock);
			
 
				-
			
 
				-	new_ns = alloc_cgroup_ns();
			
 
				-	if (IS_ERR(new_ns)) {
			
 
				-		put_css_set(cset);
			
 
				-		dec_cgroup_namespaces(ucounts);
			
 
				-		return new_ns;
			
 
				-	}
			
 
				-
			
 
				-	new_ns->user_ns = get_user_ns(user_ns);
			
 
				-	new_ns->ucounts = ucounts;
			
 
				-	new_ns->root_cset = cset;
			
 
				-
			
 
				-	return new_ns;
			
 
				-}
			
 
				-
			
 
				-static inline struct cgroup_namespace *to_cg_ns(struct ns_common *ns)
			
 
				-{
			
 
				-	return container_of(ns, struct cgroup_namespace, ns);
			
 
				-}
			
 
				-
			
 
				-static int cgroupns_install(struct nsproxy *nsproxy, struct ns_common *ns)
			
 
				-{
			
 
				-	struct cgroup_namespace *cgroup_ns = to_cg_ns(ns);
			
 
				-
			
 
				-	if (!ns_capable(current_user_ns(), CAP_SYS_ADMIN) ||
			
 
				-	    !ns_capable(cgroup_ns->user_ns, CAP_SYS_ADMIN))
			
 
				-		return -EPERM;
			
 
				-
			
 
				-	/* Don't need to do anything if we are attaching to our own cgroupns. */
			
 
				-	if (cgroup_ns == nsproxy->cgroup_ns)
			
 
				-		return 0;
			
 
				-
			
 
				-	get_cgroup_ns(cgroup_ns);
			
 
				-	put_cgroup_ns(nsproxy->cgroup_ns);
			
 
				-	nsproxy->cgroup_ns = cgroup_ns;
			
 
				-
			
 
				-	return 0;
			
 
				-}
			
 
				-
			
 
				-static struct ns_common *cgroupns_get(struct task_struct *task)
			
 
				-{
			
 
				-	struct cgroup_namespace *ns = NULL;
			
 
				-	struct nsproxy *nsproxy;
			
 
				-
			
 
				-	task_lock(task);
			
 
				-	nsproxy = task->nsproxy;
			
 
				-	if (nsproxy) {
			
 
				-		ns = nsproxy->cgroup_ns;
			
 
				-		get_cgroup_ns(ns);
			
 
				-	}
			
 
				-	task_unlock(task);
			
 
				-
			
 
				-	return ns ? &ns->ns : NULL;
			
 
				-}
			
 
				-
			
 
				-static void cgroupns_put(struct ns_common *ns)
			
 
				-{
			
 
				-	put_cgroup_ns(to_cg_ns(ns));
			
 
				-}
			
 
				-
			
 
				-static struct user_namespace *cgroupns_owner(struct ns_common *ns)
			
 
				-{
			
 
				-	return to_cg_ns(ns)->user_ns;
			
 
				-}
			
 
				-
			
 
				-const struct proc_ns_operations cgroupns_operations = {
			
 
				-	.name		= "cgroup",
			
 
				-	.type		= CLONE_NEWCGROUP,
			
 
				-	.get		= cgroupns_get,
			
 
				-	.put		= cgroupns_put,
			
 
				-	.install	= cgroupns_install,
			
 
				-	.owner		= cgroupns_owner,
			
 
				-};
			
 
				-
			
 
				-static __init int cgroup_namespaces_init(void)
			
 
				-{
			
 
				-	return 0;
			
 
				-}
			
 
				-subsys_initcall(cgroup_namespaces_init);
			
 
				-
			
 
				 #ifdef CONFIG_CGROUP_BPF
			
 
				 int cgroup_bpf_update(struct cgroup *cgrp, struct bpf_prog *prog,
			
 
				 		      enum bpf_attach_type type, bool overridable)
			
@@ -6510,149 +5065,3 @@ int cgroup_bpf_update(struct cgroup *cgrp, struct bpf_prog *prog,
 
				 	return ret;
			
 
				 }
			
 
				 #endif /* CONFIG_CGROUP_BPF */
			
 
				-
			
 
				-#ifdef CONFIG_CGROUP_DEBUG
			
 
				-static struct cgroup_subsys_state *
			
 
				-debug_css_alloc(struct cgroup_subsys_state *parent_css)
			
 
				-{
			
 
				-	struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL);
			
 
				-
			
 
				-	if (!css)
			
 
				-		return ERR_PTR(-ENOMEM);
			
 
				-
			
 
				-	return css;
			
 
				-}
			
 
				-
			
 
				-static void debug_css_free(struct cgroup_subsys_state *css)
			
 
				-{
			
 
				-	kfree(css);
			
 
				-}
			
 
				-
			
 
				-static u64 debug_taskcount_read(struct cgroup_subsys_state *css,
			
 
				-				struct cftype *cft)
			
 
				-{
			
 
				-	return cgroup_task_count(css->cgroup);
			
 
				-}
			
 
				-
			
 
				-static u64 current_css_set_read(struct cgroup_subsys_state *css,
			
 
				-				struct cftype *cft)
			
 
				-{
			
 
				-	return (u64)(unsigned long)current->cgroups;
			
 
				-}
			
 
				-
			
 
				-static u64 current_css_set_refcount_read(struct cgroup_subsys_state *css,
			
 
				-					 struct cftype *cft)
			
 
				-{
			
 
				-	u64 count;
			
 
				-
			
 
				-	rcu_read_lock();
			
 
				-	count = atomic_read(&task_css_set(current)->refcount);
			
 
				-	rcu_read_unlock();
			
 
				-	return count;
			
 
				-}
			
 
				-
			
 
				-static int current_css_set_cg_links_read(struct seq_file *seq, void *v)
			
 
				-{
			
 
				-	struct cgrp_cset_link *link;
			
 
				-	struct css_set *cset;
			
 
				-	char *name_buf;
			
 
				-
			
 
				-	name_buf = kmalloc(NAME_MAX + 1, GFP_KERNEL);
			
 
				-	if (!name_buf)
			
 
				-		return -ENOMEM;
			
 
				-
			
 
				-	spin_lock_irq(&css_set_lock);
			
 
				-	rcu_read_lock();
			
 
				-	cset = rcu_dereference(current->cgroups);
			
 
				-	list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
			
 
				-		struct cgroup *c = link->cgrp;
			
 
				-
			
 
				-		cgroup_name(c, name_buf, NAME_MAX + 1);
			
 
				-		seq_printf(seq, "Root %d group %s\n",
			
 
				-			   c->root->hierarchy_id, name_buf);
			
 
				-	}
			
 
				-	rcu_read_unlock();
			
 
				-	spin_unlock_irq(&css_set_lock);
			
 
				-	kfree(name_buf);
			
 
				-	return 0;
			
 
				-}
			
 
				-
			
 
				-#define MAX_TASKS_SHOWN_PER_CSS 25
			
 
				-static int cgroup_css_links_read(struct seq_file *seq, void *v)
			
 
				-{
			
 
				-	struct cgroup_subsys_state *css = seq_css(seq);
			
 
				-	struct cgrp_cset_link *link;
			
 
				-
			
 
				-	spin_lock_irq(&css_set_lock);
			
 
				-	list_for_each_entry(link, &css->cgroup->cset_links, cset_link) {
			
 
				-		struct css_set *cset = link->cset;
			
 
				-		struct task_struct *task;
			
 
				-		int count = 0;
			
 
				-
			
 
				-		seq_printf(seq, "css_set %p\n", cset);
			
 
				-
			
 
				-		list_for_each_entry(task, &cset->tasks, cg_list) {
			
 
				-			if (count++ > MAX_TASKS_SHOWN_PER_CSS)
			
 
				-				goto overflow;
			
 
				-			seq_printf(seq, "  task %d\n", task_pid_vnr(task));
			
 
				-		}
			
 
				-
			
 
				-		list_for_each_entry(task, &cset->mg_tasks, cg_list) {
			
 
				-			if (count++ > MAX_TASKS_SHOWN_PER_CSS)
			
 
				-				goto overflow;
			
 
				-			seq_printf(seq, "  task %d\n", task_pid_vnr(task));
			
 
				-		}
			
 
				-		continue;
			
 
				-	overflow:
			
 
				-		seq_puts(seq, "  ...\n");
			
 
				-	}
			
 
				-	spin_unlock_irq(&css_set_lock);
			
 
				-	return 0;
			
 
				-}
			
 
				-
			
 
				-static u64 releasable_read(struct cgroup_subsys_state *css, struct cftype *cft)
			
 
				-{
			
 
				-	return (!cgroup_is_populated(css->cgroup) &&
			
 
				-		!css_has_online_children(&css->cgroup->self));
			
 
				-}
			
 
				-
			
 
				-static struct cftype debug_files[] =  {
			
 
				-	{
			
 
				-		.name = "taskcount",
			
 
				-		.read_u64 = debug_taskcount_read,
			
 
				-	},
			
 
				-
			
 
				-	{
			
 
				-		.name = "current_css_set",
			
 
				-		.read_u64 = current_css_set_read,
			
 
				-	},
			
 
				-
			
 
				-	{
			
 
				-		.name = "current_css_set_refcount",
			
 
				-		.read_u64 = current_css_set_refcount_read,
			
 
				-	},
			
 
				-
			
 
				-	{
			
 
				-		.name = "current_css_set_cg_links",
			
 
				-		.seq_show = current_css_set_cg_links_read,
			
 
				-	},
			
 
				-
			
 
				-	{
			
 
				-		.name = "cgroup_css_links",
			
 
				-		.seq_show = cgroup_css_links_read,
			
 
				-	},
			
 
				-
			
 
				-	{
			
 
				-		.name = "releasable",
			
 
				-		.read_u64 = releasable_read,
			
 
				-	},
			
 
				-
			
 
				-	{ }	/* terminate */
			
 
				-};
			
 
				-
			
 
				-struct cgroup_subsys debug_cgrp_subsys = {
			
 
				-	.css_alloc = debug_css_alloc,
			
 
				-	.css_free = debug_css_free,
			
 
				-	.legacy_cftypes = debug_files,
			
 
				-};
			
 
				-#endif /* CONFIG_CGROUP_DEBUG */
			
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
--- a/kernel/cgroup/freezer.c
+++ b/kernel/cgroup/freezer.c
--- a/kernel/cgroup/namespace.c
+++ b/kernel/cgroup/namespace.c
@@ -0,0 +1,155 @@
 
				+#include "cgroup-internal.h"
			
 
				+
			
 
				+#include <linux/sched.h>
			
 
				+#include <linux/slab.h>
			
 
				+#include <linux/nsproxy.h>
			
 
				+#include <linux/proc_ns.h>
			
 
				+
			
 
				+
			
 
				+/* cgroup namespaces */
			
 
				+
			
 
				+static struct ucounts *inc_cgroup_namespaces(struct user_namespace *ns)
			
 
				+{
			
 
				+	return inc_ucount(ns, current_euid(), UCOUNT_CGROUP_NAMESPACES);
			
 
				+}
			
 
				+
			
 
				+static void dec_cgroup_namespaces(struct ucounts *ucounts)
			
 
				+{
			
 
				+	dec_ucount(ucounts, UCOUNT_CGROUP_NAMESPACES);
			
 
				+}
			
 
				+
			
 
				+static struct cgroup_namespace *alloc_cgroup_ns(void)
			
 
				+{
			
 
				+	struct cgroup_namespace *new_ns;
			
 
				+	int ret;
			
 
				+
			
 
				+	new_ns = kzalloc(sizeof(struct cgroup_namespace), GFP_KERNEL);
			
 
				+	if (!new_ns)
			
 
				+		return ERR_PTR(-ENOMEM);
			
 
				+	ret = ns_alloc_inum(&new_ns->ns);
			
 
				+	if (ret) {
			
 
				+		kfree(new_ns);
			
 
				+		return ERR_PTR(ret);
			
 
				+	}
			
 
				+	atomic_set(&new_ns->count, 1);
			
 
				+	new_ns->ns.ops = &cgroupns_operations;
			
 
				+	return new_ns;
			
 
				+}
			
 
				+
			
 
				+void free_cgroup_ns(struct cgroup_namespace *ns)
			
 
				+{
			
 
				+	put_css_set(ns->root_cset);
			
 
				+	dec_cgroup_namespaces(ns->ucounts);
			
 
				+	put_user_ns(ns->user_ns);
			
 
				+	ns_free_inum(&ns->ns);
			
 
				+	kfree(ns);
			
 
				+}
			
 
				+EXPORT_SYMBOL(free_cgroup_ns);
			
 
				+
			
 
				+struct cgroup_namespace *copy_cgroup_ns(unsigned long flags,
			
 
				+					struct user_namespace *user_ns,
			
 
				+					struct cgroup_namespace *old_ns)
			
 
				+{
			
 
				+	struct cgroup_namespace *new_ns;
			
 
				+	struct ucounts *ucounts;
			
 
				+	struct css_set *cset;
			
 
				+
			
 
				+	BUG_ON(!old_ns);
			
 
				+
			
 
				+	if (!(flags & CLONE_NEWCGROUP)) {
			
 
				+		get_cgroup_ns(old_ns);
			
 
				+		return old_ns;
			
 
				+	}
			
 
				+
			
 
				+	/* Allow only sysadmin to create cgroup namespace. */
			
 
				+	if (!ns_capable(user_ns, CAP_SYS_ADMIN))
			
 
				+		return ERR_PTR(-EPERM);
			
 
				+
			
 
				+	ucounts = inc_cgroup_namespaces(user_ns);
			
 
				+	if (!ucounts)
			
 
				+		return ERR_PTR(-ENOSPC);
			
 
				+
			
 
				+	/* It is not safe to take cgroup_mutex here */
			
 
				+	spin_lock_irq(&css_set_lock);
			
 
				+	cset = task_css_set(current);
			
 
				+	get_css_set(cset);
			
 
				+	spin_unlock_irq(&css_set_lock);
			
 
				+
			
 
				+	new_ns = alloc_cgroup_ns();
			
 
				+	if (IS_ERR(new_ns)) {
			
 
				+		put_css_set(cset);
			
 
				+		dec_cgroup_namespaces(ucounts);
			
 
				+		return new_ns;
			
 
				+	}
			
 
				+
			
 
				+	new_ns->user_ns = get_user_ns(user_ns);
			
 
				+	new_ns->ucounts = ucounts;
			
 
				+	new_ns->root_cset = cset;
			
 
				+
			
 
				+	return new_ns;
			
 
				+}
			
 
				+
			
 
				+static inline struct cgroup_namespace *to_cg_ns(struct ns_common *ns)
			
 
				+{
			
 
				+	return container_of(ns, struct cgroup_namespace, ns);
			
 
				+}
			
 
				+
			
 
				+static int cgroupns_install(struct nsproxy *nsproxy, struct ns_common *ns)
			
 
				+{
			
 
				+	struct cgroup_namespace *cgroup_ns = to_cg_ns(ns);
			
 
				+
			
 
				+	if (!ns_capable(current_user_ns(), CAP_SYS_ADMIN) ||
			
 
				+	    !ns_capable(cgroup_ns->user_ns, CAP_SYS_ADMIN))
			
 
				+		return -EPERM;
			
 
				+
			
 
				+	/* Don't need to do anything if we are attaching to our own cgroupns. */
			
 
				+	if (cgroup_ns == nsproxy->cgroup_ns)
			
 
				+		return 0;
			
 
				+
			
 
				+	get_cgroup_ns(cgroup_ns);
			
 
				+	put_cgroup_ns(nsproxy->cgroup_ns);
			
 
				+	nsproxy->cgroup_ns = cgroup_ns;
			
 
				+
			
 
				+	return 0;
			
 
				+}
			
 
				+
			
 
				+static struct ns_common *cgroupns_get(struct task_struct *task)
			
 
				+{
			
 
				+	struct cgroup_namespace *ns = NULL;
			
 
				+	struct nsproxy *nsproxy;
			
 
				+
			
 
				+	task_lock(task);
			
 
				+	nsproxy = task->nsproxy;
			
 
				+	if (nsproxy) {
			
 
				+		ns = nsproxy->cgroup_ns;
			
 
				+		get_cgroup_ns(ns);
			
 
				+	}
			
 
				+	task_unlock(task);
			
 
				+
			
 
				+	return ns ? &ns->ns : NULL;
			
 
				+}
			
 
				+
			
 
				+static void cgroupns_put(struct ns_common *ns)
			
 
				+{
			
 
				+	put_cgroup_ns(to_cg_ns(ns));
			
 
				+}
			
 
				+
			
 
				+static struct user_namespace *cgroupns_owner(struct ns_common *ns)
			
 
				+{
			
 
				+	return to_cg_ns(ns)->user_ns;
			
 
				+}
			
 
				+
			
 
				+const struct proc_ns_operations cgroupns_operations = {
			
 
				+	.name		= "cgroup",
			
 
				+	.type		= CLONE_NEWCGROUP,
			
 
				+	.get		= cgroupns_get,
			
 
				+	.put		= cgroupns_put,
			
 
				+	.install	= cgroupns_install,
			
 
				+	.owner		= cgroupns_owner,
			
 
				+};
			
 
				+
			
 
				+static __init int cgroup_namespaces_init(void)
			
 
				+{
			
 
				+	return 0;
			
 
				+}
			
 
				+subsys_initcall(cgroup_namespaces_init);
			
--- a/kernel/cgroup/pids.c
+++ b/kernel/cgroup/pids.c
--- a/kernel/cgroup/rdma.c
+++ b/kernel/cgroup/rdma.c
@@ -0,0 +1,619 @@
 
				+/*
			
 
				+ * RDMA resource limiting controller for cgroups.
			
 
				+ *
			
 
				+ * Used to allow a cgroup hierarchy to stop processes from consuming
			
 
				+ * additional RDMA resources after a certain limit is reached.
			
 
				+ *
			
 
				+ * Copyright (C) 2016 Parav Pandit <pandit.parav@gmail.com>
			
 
				+ *
			
 
				+ * This file is subject to the terms and conditions of version 2 of the GNU
			
 
				+ * General Public License. See the file COPYING in the main directory of the
			
 
				+ * Linux distribution for more details.
			
 
				+ */
			
 
				+
			
 
				+#include <linux/bitops.h>
			
 
				+#include <linux/slab.h>
			
 
				+#include <linux/seq_file.h>
			
 
				+#include <linux/cgroup.h>
			
 
				+#include <linux/parser.h>
			
 
				+#include <linux/cgroup_rdma.h>
			
 
				+
			
 
				+#define RDMACG_MAX_STR "max"
			
 
				+
			
 
				+/*
			
 
				+ * Protects list of resource pools maintained on per cgroup basis
			
 
				+ * and rdma device list.
			
 
				+ */
			
 
				+static DEFINE_MUTEX(rdmacg_mutex);
			
 
				+static LIST_HEAD(rdmacg_devices);
			
 
				+
			
 
				+/*
				+ * Selects which per-resource field a cgroup file reports. The value is
				+ * stored in cftype->private and read back by print_rpool_values().
				+ */
				+enum rdmacg_file_type {
				+	RDMACG_RESOURCE_TYPE_MAX,	/* report the configured limits */
				+	RDMACG_RESOURCE_TYPE_STAT,	/* report the current usage */
				+};
			
 
				+
			
 
				+/*
				+ * Resource names as seen by the user in the cgroup files, one entry
				+ * per resource index. Entries need to be added here when more
				+ * resources are added/defined at IB verb/core layer.
				+ */
				+static char const *rdmacg_resource_names[] = {
				+	[RDMACG_RESOURCE_HCA_HANDLE]	= "hca_handle",
				+	[RDMACG_RESOURCE_HCA_OBJECT]	= "hca_object",
				+};
			
 
				+
			
 
				+/*
				+ * Resource tracker for each resource of rdma cgroup.
				+ * max:   configured limit; S32_MAX stands for "max" (no limit set)
				+ * usage: number of charges currently accounted for this resource
				+ */
				+struct rdmacg_resource {
				+	int max;
				+	int usage;
				+};
			
 
				+
			
 
				+/*
				+ * Resource pool object which represents per cgroup, per device
				+ * resources. There are multiple instances of this object per cgroup,
				+ * therefore it cannot be embedded within rdma_cgroup structure. It
				+ * is maintained as a list.
				+ */
				+struct rdmacg_resource_pool {
				+	struct rdmacg_device	*device;
				+	struct rdmacg_resource	resources[RDMACG_RESOURCE_MAX];
				+
				+	/* link in the owning cgroup's rpools list */
				+	struct list_head	cg_node;
				+	/* link in the device's rpools list */
				+	struct list_head	dev_node;
				+
				+	/* total number of charges held in this pool, over all resources */
				+	u64			usage_sum;
				+	/* number of resources whose limit is currently max (S32_MAX) */
				+	int			num_max_cnt;
				+};
			
 
				+
			
 
				+/* Map a cgroup_subsys_state to the rdma_cgroup that embeds it. */
				+static struct rdma_cgroup *css_rdmacg(struct cgroup_subsys_state *css)
				+{
				+	return container_of(css, struct rdma_cgroup, css);
				+}
			
 
				+
			
 
				+/*
				+ * Return the rdma cgroup of @cg's parent css.
				+ *
				+ * NOTE(review): the hierarchy walks in this file stop when this returns
				+ * NULL, which presumes container_of() on a NULL css.parent yields NULL,
				+ * i.e. that css is the first member of struct rdma_cgroup - confirm.
				+ */
				+static struct rdma_cgroup *parent_rdmacg(struct rdma_cgroup *cg)
				+{
				+	return css_rdmacg(cg->css.parent);
				+}
			
 
				+
			
 
				+/*
				+ * Return the rdma cgroup of the current task, looked up through
				+ * task_get_css(). Presumably this takes a css reference; the matching
				+ * css_put() is in rdmacg_uncharge_hierarchy().
				+ */
				+static inline struct rdma_cgroup *get_current_rdmacg(void)
				+{
				+	return css_rdmacg(task_get_css(current, rdma_cgrp_id));
				+}
			
 
				+
			
 
				+/*
				+ * Store @new_max as the limit of resource @index in @rpool and keep
				+ * num_max_cnt (the count of resources sitting at S32_MAX) in step with
				+ * the transition.
				+ */
				+static void set_resource_limit(struct rdmacg_resource_pool *rpool,
				+			       int index, int new_max)
				+{
				+	struct rdmacg_resource *res = &rpool->resources[index];
				+	bool was_max = res->max == S32_MAX;
				+	bool now_max = new_max == S32_MAX;
				+
				+	if (now_max && !was_max)
				+		rpool->num_max_cnt++;
				+	else if (!now_max && was_max)
				+		rpool->num_max_cnt--;
				+
				+	res->max = new_max;
				+}
			
 
				+
			
 
				+/* Reset the limit of every resource in @rpool to max (S32_MAX). */
				+static void set_all_resource_max_limit(struct rdmacg_resource_pool *rpool)
				+{
				+	int res;
				+
				+	for (res = 0; res < RDMACG_RESOURCE_MAX; res++)
				+		set_resource_limit(rpool, res, S32_MAX);
				+}
			
 
				+
			
 
				+/*
				+ * Unlink @rpool from both its cgroup list and its device list, then
				+ * free it. Must be called with rdmacg_mutex held.
				+ */
				+static void free_cg_rpool_locked(struct rdmacg_resource_pool *rpool)
				+{
				+	lockdep_assert_held(&rdmacg_mutex);
				+
				+	list_del(&rpool->cg_node);
				+	list_del(&rpool->dev_node);
				+	kfree(rpool);
				+}
			
 
				+
			
 
				+/*
				+ * Look up the resource pool that @cg holds for @device.
				+ * Returns the pool, or NULL when @cg has none for this device.
				+ * Must be called with rdmacg_mutex held.
				+ */
				+static struct rdmacg_resource_pool *
				+find_cg_rpool_locked(struct rdma_cgroup *cg,
				+		     struct rdmacg_device *device)
				+{
				+	struct rdmacg_resource_pool *cur;
				+	struct rdmacg_resource_pool *match = NULL;
				+
				+	lockdep_assert_held(&rdmacg_mutex);
				+
				+	list_for_each_entry(cur, &cg->rpools, cg_node) {
				+		if (cur->device == device) {
				+			match = cur;
				+			break;
				+		}
				+	}
				+
				+	return match;
				+}
			
 
				+
			
 
				+/*
				+ * Return the resource pool of @cg for @device, creating it when it
				+ * does not exist yet. A new pool starts with every limit at max and is
				+ * linked into both the cgroup's and the device's pool lists.
				+ * Returns ERR_PTR(-ENOMEM) when the allocation fails.
				+ * The lookup asserts that rdmacg_mutex is held.
				+ */
				+static struct rdmacg_resource_pool *
				+get_cg_rpool_locked(struct rdma_cgroup *cg, struct rdmacg_device *device)
				+{
				+	struct rdmacg_resource_pool *rpool;
				+
				+	rpool = find_cg_rpool_locked(cg, device);
				+	if (rpool)
				+		return rpool;
				+
				+	rpool = kzalloc(sizeof(*rpool), GFP_KERNEL);
				+	if (!rpool)
				+		return ERR_PTR(-ENOMEM);
				+
				+	rpool->device = device;
				+	/* zeroed limits differ from S32_MAX, so this also sets num_max_cnt */
				+	set_all_resource_max_limit(rpool);
				+
				+	INIT_LIST_HEAD(&rpool->cg_node);
				+	INIT_LIST_HEAD(&rpool->dev_node);
				+	list_add_tail(&rpool->cg_node, &cg->rpools);
				+	list_add_tail(&rpool->dev_node, &device->rpools);
				+	return rpool;
				+}
			
 
				+
			
 
				+/**
				+ * uncharge_cg_locked - uncharge resource for rdma cgroup
				+ * @cg: pointer to cg to uncharge (this level only; the caller walks
				+ *      the hierarchy)
				+ * @device: pointer to rdmacg device
				+ * @index: index of the resource to uncharge in cg (resource pool)
				+ *
				+ * It also frees the resource pool which was created as part of
				+ * charging operation when there are no resources attached to
				+ * resource pool.
				+ *
				+ * Must be called with rdmacg_mutex held.
				+ */
				+static void
				+uncharge_cg_locked(struct rdma_cgroup *cg,
				+		   struct rdmacg_device *device,
				+		   enum rdmacg_resource_type index)
				+{
				+	struct rdmacg_resource_pool *rpool;
				+
				+	rpool = find_cg_rpool_locked(cg, device);
				+
				+	/*
				+	 * rpool cannot be null at this stage. Let kernel operate in case
				+	 * if there a bug in IB stack or rdma controller, instead of crashing
				+	 * the system.
				+	 */
				+	if (unlikely(!rpool)) {
				+		/* argument order follows the message: device first, then cgroup */
				+		pr_warn("Invalid device %p or rdma cgroup %p\n", device, cg);
				+		return;
				+	}
				+
				+	rpool->resources[index].usage--;
				+
				+	/*
				+	 * A negative count (or overflow) is invalid,
				+	 * it indicates a bug in the rdma controller.
				+	 */
				+	WARN_ON_ONCE(rpool->resources[index].usage < 0);
				+	rpool->usage_sum--;
				+	if (rpool->usage_sum == 0 &&
				+	    rpool->num_max_cnt == RDMACG_RESOURCE_MAX) {
				+		/*
				+		 * No user of the rpool and all entries are set to max, so
				+		 * safe to delete this rpool.
				+		 */
				+		free_cg_rpool_locked(rpool);
				+	}
				+}
			
 
				+
			
 
				+/**
				+ * rdmacg_uncharge_hierarchy - hierarchically uncharge rdma resource count
				+ * @cg: pointer to cg to start uncharging from
				+ * @device: pointer to rdmacg device
				+ * @stop_cg: while traversing hierarchy, stop uncharging when this cgroup
				+ *           is reached (it is not uncharged itself); NULL walks to the
				+ *           top of the hierarchy
				+ * @index: index of the resource to uncharge in cg in given resource pool
				+ *
				+ * Also drops one css reference on @cg via css_put().
				+ */
				+static void rdmacg_uncharge_hierarchy(struct rdma_cgroup *cg,
				+				     struct rdmacg_device *device,
				+				     struct rdma_cgroup *stop_cg,
				+				     enum rdmacg_resource_type index)
				+{
				+	struct rdma_cgroup *p;
				+
				+	mutex_lock(&rdmacg_mutex);
				+
				+	for (p = cg; p != stop_cg; p = parent_rdmacg(p))
				+		uncharge_cg_locked(p, device, index);
				+
				+	mutex_unlock(&rdmacg_mutex);
				+
				+	css_put(&cg->css);
				+}
			
 
				+
			
 
				+/**
				+ * rdmacg_uncharge - hierarchically uncharge rdma resource count
				+ * @cg: pointer to the rdma cgroup the resource was charged to
				+ * @device: pointer to rdmacg device
				+ * @index: index of the resource to uncharge in cgroup in given resource pool
				+ *
				+ * Does nothing when @index is not a valid resource index.
				+ */
				+void rdmacg_uncharge(struct rdma_cgroup *cg,
				+		     struct rdmacg_device *device,
				+		     enum rdmacg_resource_type index)
				+{
				+	if (index >= RDMACG_RESOURCE_MAX)
				+		return;
				+
				+	/* NULL stop_cg: uncharge @cg and every ancestor */
				+	rdmacg_uncharge_hierarchy(cg, device, NULL, index);
				+}
				+EXPORT_SYMBOL(rdmacg_uncharge);
			
 
				+
			
 
				+/**
				+ * rdmacg_try_charge - hierarchically try to charge the rdma resource
				+ * @rdmacg: on success, set to the rdma cgroup which owns this resource
				+ * @device: pointer to rdmacg device
				+ * @index: index of the resource to charge in cgroup (resource pool)
				+ *
				+ * Charges the resource to the current task's rdma cgroup and to each of
				+ * its ancestors. It fails if the charge would cause the new value to
				+ * exceed the limit at any level; levels already charged are then
				+ * uncharged again.
				+ * Returns 0 if the charge succeeded, otherwise -EAGAIN, -ENOMEM or
				+ * -EINVAL. @rdmacg is written only on success.
				+ *
				+ * Charger needs to account resources on two criteria.
				+ * (a) per cgroup & (b) per device resource usage.
				+ * Per cgroup resource usage ensures that tasks of cgroup doesn't cross
				+ * the configured limits. Per device provides granular configuration
				+ * in multi device usage. It allocates resource pool in the hierarchy
				+ * for each parent it come across for first resource. Later on resource
				+ * pool will be available. Therefore it will be much faster thereon
				+ * to charge/uncharge.
				+ */
				+int rdmacg_try_charge(struct rdma_cgroup **rdmacg,
				+		      struct rdmacg_device *device,
				+		      enum rdmacg_resource_type index)
				+{
				+	struct rdma_cgroup *cg, *p;
				+	struct rdmacg_resource_pool *rpool;
				+	s64 new;
				+	int ret = 0;
				+
				+	if (index >= RDMACG_RESOURCE_MAX)
				+		return -EINVAL;
				+
				+	/*
				+	 * hold on to css, as cgroup can be removed but resource
				+	 * accounting happens on css.
				+	 */
				+	cg = get_current_rdmacg();
				+
				+	mutex_lock(&rdmacg_mutex);
				+	for (p = cg; p; p = parent_rdmacg(p)) {
				+		rpool = get_cg_rpool_locked(p, device);
				+		if (IS_ERR(rpool)) {
				+			ret = PTR_ERR(rpool);
				+			goto err;
				+		}
				+
				+		new = rpool->resources[index].usage + 1;
				+		if (new > rpool->resources[index].max) {
				+			ret = -EAGAIN;
				+			goto err;
				+		}
				+
				+		rpool->resources[index].usage = new;
				+		rpool->usage_sum++;
				+	}
				+	mutex_unlock(&rdmacg_mutex);
				+
				+	*rdmacg = cg;
				+	return 0;
				+
				+err:
				+	mutex_unlock(&rdmacg_mutex);
				+	/* roll back the levels below p, which were already charged */
				+	rdmacg_uncharge_hierarchy(cg, device, p, index);
				+	return ret;
				+}
				+EXPORT_SYMBOL(rdmacg_try_charge);
			
 
				+
			
 
				+/**
				+ * rdmacg_register_device - register rdmacg device to rdma controller.
				+ * @device: pointer to rdmacg device whose resources need to be accounted.
				+ *
				+ * If IB stack wish a device to participate in rdma cgroup resource
				+ * tracking, it must invoke this API to register with rdma cgroup before
				+ * any user space application can start using the RDMA resources.
				+ * Always returns 0.
				+ */
				+int rdmacg_register_device(struct rdmacg_device *device)
				+{
				+	INIT_LIST_HEAD(&device->dev_node);
				+	INIT_LIST_HEAD(&device->rpools);
				+
				+	mutex_lock(&rdmacg_mutex);
				+	list_add_tail(&device->dev_node, &rdmacg_devices);
				+	mutex_unlock(&rdmacg_mutex);
				+	return 0;
				+}
				+EXPORT_SYMBOL(rdmacg_register_device);
			
 
				+
			
 
				+/**
				+ * rdmacg_unregister_device - unregister rdmacg device from rdma controller.
				+ * @device: pointer to rdmacg device which was previously registered with rdma
				+ *          controller using rdmacg_register_device().
				+ *
				+ * IB stack must invoke this after all the resources of the IB device
				+ * are destroyed and after ensuring that no more resources will be created
				+ * when this API is invoked.
				+ */
				+void rdmacg_unregister_device(struct rdmacg_device *device)
				+{
				+	struct rdmacg_resource_pool *rpool, *tmp;
				+
				+	/*
				+	 * Synchronize with any active resource settings or
				+	 * usage query happening via the cgroup interface files.
				+	 */
				+	mutex_lock(&rdmacg_mutex);
				+	list_del_init(&device->dev_node);
				+
				+	/*
				+	 * Now that this device is off the cgroup list, it's safe to free
				+	 * all the rpool resources.
				+	 */
				+	list_for_each_entry_safe(rpool, tmp, &device->rpools, dev_node)
				+		free_cg_rpool_locked(rpool);
				+
				+	mutex_unlock(&rdmacg_mutex);
				+}
				+EXPORT_SYMBOL(rdmacg_unregister_device);
			
 
				+
			
 
				+/*
				+ * parse_resource - parse one "<name>=<value>" token
				+ * @c: token to parse; modified in place (strsep() overwrites the '=')
				+ * @intval: receives the parsed limit, or S32_MAX when the value is "max"
				+ *
				+ * Returns the index of the matching entry in rdmacg_resource_names on
				+ * success. Returns -EINVAL when the token has no '=', names an unknown
				+ * resource, or carries a negative or otherwise unparsable value.
				+ */
				+static int parse_resource(char *c, int *intval)
				+{
				+	substring_t argstr;
				+	const char **table = &rdmacg_resource_names[0];
				+	char *name, *value = c;
				+	size_t len;
				+	int ret, i = 0;
				+
				+	name = strsep(&value, "=");
				+	if (!name || !value)
				+		return -EINVAL;
				+
				+	len = strlen(value);
				+
				+	for (i = 0; i < RDMACG_RESOURCE_MAX; i++) {
				+		if (strcmp(table[i], name))
				+			continue;
				+
				+		argstr.from = value;
				+		argstr.to = value + len;
				+
				+		ret = match_int(&argstr, intval);
				+		if (ret >= 0) {
				+			if (*intval < 0)
				+				break;
				+			return i;
				+		}
				+		/*
				+		 * Compare the whole value. A compare limited to
				+		 * strlen(value) bytes would also accept "", "m" and "ma"
				+		 * as "max".
				+		 */
				+		if (strcmp(value, RDMACG_MAX_STR) == 0) {
				+			*intval = S32_MAX;
				+			return i;
				+		}
				+		break;
				+	}
				+	return -EINVAL;
				+}
			
 
				+
			
 
				+/*
				+ * rdmacg_parse_limits - parse a space separated list of "<name>=<value>"
				+ * @options: string to parse, consumed by strsep(); may be NULL
				+ * @new_limits: array indexed by resource; parsed limits are stored here
				+ * @enables: bitmask; bit N is set for every resource N that was parsed
				+ *
				+ * Returns 0 on success or -EINVAL as soon as one token fails to parse.
				+ * Entries stored before the failing token are left in place.
				+ */
				+static int rdmacg_parse_limits(char *options,
				+			       int *new_limits, unsigned long *enables)
				+{
				+	char *token;
				+
				+	/* parse resource options */
				+	for (token = strsep(&options, " "); token;
				+	     token = strsep(&options, " ")) {
				+		int intval;
				+		int index = parse_resource(token, &intval);
				+
				+		if (index < 0)
				+			return -EINVAL;
				+
				+		new_limits[index] = intval;
				+		*enables |= BIT(index);
				+	}
				+
				+	return 0;
				+}
			
 
				+
			
 
				+/*
				+ * Find a registered device by exact name match.
				+ * Returns the device, or NULL when no registered device has that name.
				+ * Must be called with rdmacg_mutex held.
				+ */
				+static struct rdmacg_device *rdmacg_get_device_locked(const char *name)
				+{
				+	struct rdmacg_device *device;
				+
				+	lockdep_assert_held(&rdmacg_mutex);
				+
				+	list_for_each_entry(device, &rdmacg_devices, dev_node)
				+		if (!strcmp(name, device->name))
				+			return device;
				+
				+	return NULL;
				+}
			
 
				+
			
 
				+/*
				+ * Write handler of the "max" file.
				+ *
				+ * Expects "<device> <name>=<value> ..." in @buf: the first space
				+ * separated word is the device name, the rest is handed to
				+ * rdmacg_parse_limits(). Only the resources named in the write are
				+ * changed.
				+ *
				+ * Returns @nbytes on success, -EINVAL on a parse error, -ENOMEM on
				+ * allocation failure, or -ENODEV when no registered device has the
				+ * given name.
				+ */
				+static ssize_t rdmacg_resource_set_max(struct kernfs_open_file *of,
				+				       char *buf, size_t nbytes, loff_t off)
				+{
				+	struct rdma_cgroup *cg = css_rdmacg(of_css(of));
				+	const char *dev_name;
				+	struct rdmacg_resource_pool *rpool;
				+	struct rdmacg_device *device;
				+	char *options = strstrip(buf);
				+	int *new_limits;
				+	unsigned long enables = 0;
				+	int i = 0, ret = 0;
				+
				+	/* extract the device name first */
				+	dev_name = strsep(&options, " ");
				+	if (!dev_name) {
				+		ret = -EINVAL;
				+		goto err;
				+	}
				+
				+	new_limits = kcalloc(RDMACG_RESOURCE_MAX, sizeof(int), GFP_KERNEL);
				+	if (!new_limits) {
				+		ret = -ENOMEM;
				+		goto err;
				+	}
				+
				+	/* parse everything before touching any pool */
				+	ret = rdmacg_parse_limits(options, new_limits, &enables);
				+	if (ret)
				+		goto parse_err;
				+
				+	/* acquire lock to synchronize with hot plug devices */
				+	mutex_lock(&rdmacg_mutex);
				+
				+	device = rdmacg_get_device_locked(dev_name);
				+	if (!device) {
				+		ret = -ENODEV;
				+		goto dev_err;
				+	}
				+
				+	rpool = get_cg_rpool_locked(cg, device);
				+	if (IS_ERR(rpool)) {
				+		ret = PTR_ERR(rpool);
				+		goto dev_err;
				+	}
				+
				+	/* now set the new limits of the rpool */
				+	for_each_set_bit(i, &enables, RDMACG_RESOURCE_MAX)
				+		set_resource_limit(rpool, i, new_limits[i]);
				+
				+	if (rpool->usage_sum == 0 &&
				+	    rpool->num_max_cnt == RDMACG_RESOURCE_MAX) {
				+		/*
				+		 * No user of the rpool and all entries are set to max, so
				+		 * safe to delete this rpool.
				+		 */
				+		free_cg_rpool_locked(rpool);
				+	}
				+
				+	/* the success path falls through the labels below as well */
				+dev_err:
				+	mutex_unlock(&rdmacg_mutex);
				+
				+parse_err:
				+	kfree(new_limits);
				+
				+err:
				+	return ret ?: nbytes;
				+}
			
 
				+
			
 
				+/*
				+ * Print "<name>=<value> " for every resource into @sf.
				+ *
				+ * The file type stored in the cftype's private field selects whether
				+ * the limit or the usage is printed. A NULL @rpool is reported as
				+ * limit max and usage 0. The value S32_MAX is printed as "max".
				+ */
				+static void print_rpool_values(struct seq_file *sf,
				+			       struct rdmacg_resource_pool *rpool)
				+{
				+	enum rdmacg_file_type sf_type;
				+	int i;
				+	/* int, matching the max/usage fields and the "%d" format below */
				+	int value;
				+
				+	sf_type = seq_cft(sf)->private;
				+
				+	for (i = 0; i < RDMACG_RESOURCE_MAX; i++) {
				+		seq_puts(sf, rdmacg_resource_names[i]);
				+		seq_putc(sf, '=');
				+		if (sf_type == RDMACG_RESOURCE_TYPE_MAX) {
				+			if (rpool)
				+				value = rpool->resources[i].max;
				+			else
				+				value = S32_MAX;
				+		} else {
				+			if (rpool)
				+				value = rpool->resources[i].usage;
				+			else
				+				value = 0;
				+		}
				+
				+		if (value == S32_MAX)
				+			seq_puts(sf, RDMACG_MAX_STR);
				+		else
				+			seq_printf(sf, "%d", value);
				+		seq_putc(sf, ' ');
				+	}
				+}
			
 
				+
			
 
				+/*
				+ * seq_show handler shared by the "max" and "current" files.
				+ *
				+ * Emits one line per registered device: the device name followed by
				+ * the values printed by print_rpool_values(). Devices for which the
				+ * cgroup has no resource pool are still listed.
				+ * Always returns 0.
				+ */
				+static int rdmacg_resource_read(struct seq_file *sf, void *v)
				+{
				+	struct rdmacg_device *device;
				+	struct rdmacg_resource_pool *rpool;
				+	struct rdma_cgroup *cg = css_rdmacg(seq_css(sf));
				+
				+	mutex_lock(&rdmacg_mutex);
				+
				+	list_for_each_entry(device, &rdmacg_devices, dev_node) {
				+		seq_printf(sf, "%s ", device->name);
				+
				+		/* may be NULL; print_rpool_values() handles that */
				+		rpool = find_cg_rpool_locked(cg, device);
				+		print_rpool_values(sf, rpool);
				+
				+		seq_putc(sf, '\n');
				+	}
				+
				+	mutex_unlock(&rdmacg_mutex);
				+	return 0;
				+}
			
 
				+
			
 
				+/*
				+ * Control files of the rdma controller. Both share one read handler;
				+ * .private tells it which value to print.
				+ */
				+static struct cftype rdmacg_files[] = {
				+	{
				+		/* read/write: per device resource limits */
				+		.name = "max",
				+		.write = rdmacg_resource_set_max,
				+		.seq_show = rdmacg_resource_read,
				+		.private = RDMACG_RESOURCE_TYPE_MAX,
				+		.flags = CFTYPE_NOT_ON_ROOT,
				+	},
				+	{
				+		/* read only: per device resource usage */
				+		.name = "current",
				+		.seq_show = rdmacg_resource_read,
				+		.private = RDMACG_RESOURCE_TYPE_STAT,
				+		.flags = CFTYPE_NOT_ON_ROOT,
				+	},
				+	{ }	/* terminate */
				+};
			
 
				+
			
 
				+/*
				+ * css_alloc callback: allocate a zeroed rdma_cgroup with an empty
				+ * resource pool list. @parent is not used.
				+ * Returns the embedded css, or ERR_PTR(-ENOMEM) on allocation failure.
				+ */
				+static struct cgroup_subsys_state *
				+rdmacg_css_alloc(struct cgroup_subsys_state *parent)
				+{
				+	struct rdma_cgroup *cg;
				+
				+	cg = kzalloc(sizeof(*cg), GFP_KERNEL);
				+	if (!cg)
				+		return ERR_PTR(-ENOMEM);
				+
				+	INIT_LIST_HEAD(&cg->rpools);
				+	return &cg->css;
				+}
			
 
				+
			
 
				+/* css_free callback: release the rdma_cgroup that embeds @css. */
				+static void rdmacg_css_free(struct cgroup_subsys_state *css)
				+{
				+	kfree(css_rdmacg(css));
				+}
			
 
				+
			
 
				+/**
				+ * rdmacg_css_offline - cgroup css_offline callback
				+ * @css: css of interest
				+ *
				+ * This function is called when @css is about to go away. It resets every
				+ * limit of every resource pool of the cgroup to max, so that when the
				+ * remaining resources are uncharged, the associated resource pool can be
				+ * freed as well. Pools are not freed here.
				+ */
				+static void rdmacg_css_offline(struct cgroup_subsys_state *css)
				+{
				+	struct rdma_cgroup *cg = css_rdmacg(css);
				+	struct rdmacg_resource_pool *rpool;
				+
				+	mutex_lock(&rdmacg_mutex);
				+
				+	list_for_each_entry(rpool, &cg->rpools, cg_node)
				+		set_all_resource_max_limit(rpool);
				+
				+	mutex_unlock(&rdmacg_mutex);
				+}
			
 
				+
			
 
				+/*
				+ * rdma controller definition. The same file set is used for the
				+ * legacy and the default hierarchy.
				+ */
				+struct cgroup_subsys rdma_cgrp_subsys = {
				+	.css_alloc	= rdmacg_css_alloc,
				+	.css_free	= rdmacg_css_free,
				+	.css_offline	= rdmacg_css_offline,
				+	.legacy_cftypes	= rdmacg_files,
				+	.dfl_cftypes	= rdmacg_files,
				+};
			
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -10959,5 +10959,11 @@ struct cgroup_subsys perf_event_cgrp_subsys = {
 
				 	.css_alloc	= perf_cgroup_css_alloc,
			
 
				 	.css_free	= perf_cgroup_css_free,
			
 
				 	.attach		= perf_cgroup_attach,
			
 
				+	/*
			
 
				+	 * Implicitly enable on dfl hierarchy so that perf events can
			
 
				+	 * always be filtered by cgroup2 path as long as perf_event
			
 
				+	 * controller is not mounted on a legacy hierarchy.
			
 
				+	 */
			
 
				+	.implicit_on_dfl = true,
			
 
				 };
			
 
				 #endif /* CONFIG_CGROUP_PERF */
			
--- a/tools/perf/util/cgroup.c
+++ b/tools/perf/util/cgroup.c
@@ -12,8 +12,8 @@ cgroupfs_find_mountpoint(char *buf, size_t maxlen)
 
				 {
			
 
				 	FILE *fp;
			
 
				 	char mountpoint[PATH_MAX + 1], tokens[PATH_MAX + 1], type[PATH_MAX + 1];
			
 
				+	char path_v1[PATH_MAX + 1], path_v2[PATH_MAX + 2], *path;
			
 
				 	char *token, *saved_ptr = NULL;
			
 
				-	int found = 0;
			
 
				 
			
 
				 	fp = fopen("/proc/mounts", "r");
			
 
				 	if (!fp)
			
@@ -24,31 +24,43 @@ cgroupfs_find_mountpoint(char *buf, size_t maxlen)
 
				 	 * and inspect every cgroupfs mount point to find one that has
			
 
				 	 * perf_event subsystem
			
 
				 	 */
			
 
				+	path_v1[0] = '\0';
			
 
				+	path_v2[0] = '\0';
			
 
				+
			
 
				 	while (fscanf(fp, "%*s %"STR(PATH_MAX)"s %"STR(PATH_MAX)"s %"
			
 
				 				STR(PATH_MAX)"s %*d %*d\n",
			
 
				 				mountpoint, type, tokens) == 3) {
			
 
				 
			
 
				-		if (!strcmp(type, "cgroup")) {
			
 
				+		if (!path_v1[0] && !strcmp(type, "cgroup")) {
			
 
				 
			
 
				 			token = strtok_r(tokens, ",", &saved_ptr);
			
 
				 
			
 
				 			while (token != NULL) {
			
 
				 				if (!strcmp(token, "perf_event")) {
			
 
				-					found = 1;
			
 
				+					strcpy(path_v1, mountpoint);
			
 
				 					break;
			
 
				 				}
			
 
				 				token = strtok_r(NULL, ",", &saved_ptr);
			
 
				 			}
			
 
				 		}
			
 
				-		if (found)
			
 
				+
			
 
				+		if (!path_v2[0] && !strcmp(type, "cgroup2"))
			
 
				+			strcpy(path_v2, mountpoint);
			
 
				+
			
 
				+		if (path_v1[0] && path_v2[0])
			
 
				 			break;
			
 
				 	}
			
 
				 	fclose(fp);
			
 
				-	if (!found)
			
 
				+
			
 
				+	if (path_v1[0])
			
 
				+		path = path_v1;
			
 
				+	else if (path_v2[0])
			
 
				+		path = path_v2;
			
 
				+	else
			
 
				 		return -1;
			
 
				 
			
 
				-	if (strlen(mountpoint) < maxlen) {
			
 
				-		strcpy(buf, mountpoint);
			
 
				+	if (strlen(path) < maxlen) {
			
 
				+		strcpy(buf, path);
			
 
				 		return 0;
			
 
				 	}
			
 
				 	return -1;