9 лет назад · a10c38a4f3
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -350,12 +350,12 @@ struct rbd_device {
 
				 	struct rbd_spec		*spec;
			
 
				 	struct rbd_options	*opts;
			
 
				 
			
 
				-	char			*header_name;
			
 
				+	struct ceph_object_id	header_oid;
			
 
				+	struct ceph_object_locator header_oloc;
			
 
				 
			
 
				 	struct ceph_file_layout	layout;
			
 
				 
			
 
				-	struct ceph_osd_event   *watch_event;
			
 
				-	struct rbd_obj_request	*watch_request;
			
 
				+	struct ceph_osd_linger_request *watch_handle;
			
 
				 
			
 
				 	struct rbd_spec		*parent_spec;
			
 
				 	u64			parent_overlap;
			
@@ -1596,12 +1596,6 @@ static int rbd_obj_request_wait(struct rbd_obj_request *obj_request)
 
				 	return __rbd_obj_request_wait(obj_request, 0);
			
 
				 }
			
 
				 
			
 
				-static int rbd_obj_request_wait_timeout(struct rbd_obj_request *obj_request,
			
 
				-					unsigned long timeout)
			
 
				-{
			
 
				-	return __rbd_obj_request_wait(obj_request, timeout);
			
 
				-}
			
 
				-
			
 
				 static void rbd_img_request_complete(struct rbd_img_request *img_request)
			
 
				 {
			
 
				 
			
@@ -1751,12 +1745,6 @@ static void rbd_obj_request_complete(struct rbd_obj_request *obj_request)
 
				 		complete_all(&obj_request->completion);
			
 
				 }
			
 
				 
			
 
				-static void rbd_osd_trivial_callback(struct rbd_obj_request *obj_request)
			
 
				-{
			
 
				-	dout("%s: obj %p\n", __func__, obj_request);
			
 
				-	obj_request_done_set(obj_request);
			
 
				-}
			
 
				-
			
 
				 static void rbd_osd_read_callback(struct rbd_obj_request *obj_request)
			
 
				 {
			
 
				 	struct rbd_img_request *img_request = NULL;
			
@@ -1828,13 +1816,12 @@ static void rbd_osd_call_callback(struct rbd_obj_request *obj_request)
 
				 		obj_request_done_set(obj_request);
			
 
				 }
			
 
				 
			
 
				-static void rbd_osd_req_callback(struct ceph_osd_request *osd_req,
			
 
				-				struct ceph_msg *msg)
			
 
				+static void rbd_osd_req_callback(struct ceph_osd_request *osd_req)
			
 
				 {
			
 
				 	struct rbd_obj_request *obj_request = osd_req->r_priv;
			
 
				 	u16 opcode;
			
 
				 
			
 
				-	dout("%s: osd_req %p msg %p\n", __func__, osd_req, msg);
			
 
				+	dout("%s: osd_req %p\n", __func__, osd_req);
			
 
				 	rbd_assert(osd_req == obj_request->osd_req);
			
 
				 	if (obj_request_img_data_test(obj_request)) {
			
 
				 		rbd_assert(obj_request->img_request);
			
@@ -1878,10 +1865,6 @@ static void rbd_osd_req_callback(struct ceph_osd_request *osd_req,
 
				 	case CEPH_OSD_OP_CALL:
			
 
				 		rbd_osd_call_callback(obj_request);
			
 
				 		break;
			
 
				-	case CEPH_OSD_OP_NOTIFY_ACK:
			
 
				-	case CEPH_OSD_OP_WATCH:
			
 
				-		rbd_osd_trivial_callback(obj_request);
			
 
				-		break;
			
 
				 	default:
			
 
				 		rbd_warn(NULL, "%s: unsupported op %hu",
			
 
				 			obj_request->object_name, (unsigned short) opcode);
			
@@ -1896,27 +1879,17 @@ static void rbd_osd_req_format_read(struct rbd_obj_request *obj_request)
 
				 {
			
 
				 	struct rbd_img_request *img_request = obj_request->img_request;
			
 
				 	struct ceph_osd_request *osd_req = obj_request->osd_req;
			
 
				-	u64 snap_id;
			
 
				 
			
 
				-	rbd_assert(osd_req != NULL);
			
 
				-
			
 
				-	snap_id = img_request ? img_request->snap_id : CEPH_NOSNAP;
			
 
				-	ceph_osdc_build_request(osd_req, obj_request->offset,
			
 
				-			NULL, snap_id, NULL);
			
 
				+	if (img_request)
			
 
				+		osd_req->r_snapid = img_request->snap_id;
			
 
				 }
			
 
				 
			
 
				 static void rbd_osd_req_format_write(struct rbd_obj_request *obj_request)
			
 
				 {
			
 
				-	struct rbd_img_request *img_request = obj_request->img_request;
			
 
				 	struct ceph_osd_request *osd_req = obj_request->osd_req;
			
 
				-	struct ceph_snap_context *snapc;
			
 
				-	struct timespec mtime = CURRENT_TIME;
			
 
				 
			
 
				-	rbd_assert(osd_req != NULL);
			
 
				-
			
 
				-	snapc = img_request ? img_request->snapc : NULL;
			
 
				-	ceph_osdc_build_request(osd_req, obj_request->offset,
			
 
				-			snapc, CEPH_NOSNAP, &mtime);
			
 
				+	osd_req->r_mtime = CURRENT_TIME;
			
 
				+	osd_req->r_data_offset = obj_request->offset;
			
 
				 }
			
 
				 
			
 
				 /*
			
@@ -1954,7 +1927,7 @@ static struct ceph_osd_request *rbd_osd_req_create(
 
				 	osd_req = ceph_osdc_alloc_request(osdc, snapc, num_ops, false,
			
 
				 					  GFP_NOIO);
			
 
				 	if (!osd_req)
			
 
				-		return NULL;	/* ENOMEM */
			
 
				+		goto fail;
			
 
				 
			
 
				 	if (op_type == OBJ_OP_WRITE || op_type == OBJ_OP_DISCARD)
			
 
				 		osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK;
			
@@ -1965,9 +1938,18 @@ static struct ceph_osd_request *rbd_osd_req_create(
 
				 	osd_req->r_priv = obj_request;
			
 
				 
			
 
				 	osd_req->r_base_oloc.pool = ceph_file_layout_pg_pool(rbd_dev->layout);
			
 
				-	ceph_oid_set_name(&osd_req->r_base_oid, obj_request->object_name);
			
 
				+	if (ceph_oid_aprintf(&osd_req->r_base_oid, GFP_NOIO, "%s",
			
 
				+			     obj_request->object_name))
			
 
				+		goto fail;
			
 
				+
			
 
				+	if (ceph_osdc_alloc_messages(osd_req, GFP_NOIO))
			
 
				+		goto fail;
			
 
				 
			
 
				 	return osd_req;
			
 
				+
			
 
				+fail:
			
 
				+	ceph_osdc_put_request(osd_req);
			
 
				+	return NULL;
			
 
				 }
			
 
				 
			
 
				 /*
			
@@ -2003,16 +1985,25 @@ rbd_osd_req_create_copyup(struct rbd_obj_request *obj_request)
 
				 	osd_req = ceph_osdc_alloc_request(osdc, snapc, num_osd_ops,
			
 
				 						false, GFP_NOIO);
			
 
				 	if (!osd_req)
			
 
				-		return NULL;	/* ENOMEM */
			
 
				+		goto fail;
			
 
				 
			
 
				 	osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK;
			
 
				 	osd_req->r_callback = rbd_osd_req_callback;
			
 
				 	osd_req->r_priv = obj_request;
			
 
				 
			
 
				 	osd_req->r_base_oloc.pool = ceph_file_layout_pg_pool(rbd_dev->layout);
			
 
				-	ceph_oid_set_name(&osd_req->r_base_oid, obj_request->object_name);
			
 
				+	if (ceph_oid_aprintf(&osd_req->r_base_oid, GFP_NOIO, "%s",
			
 
				+			     obj_request->object_name))
			
 
				+		goto fail;
			
 
				+
			
 
				+	if (ceph_osdc_alloc_messages(osd_req, GFP_NOIO))
			
 
				+		goto fail;
			
 
				 
			
 
				 	return osd_req;
			
 
				+
			
 
				+fail:
			
 
				+	ceph_osdc_put_request(osd_req);
			
 
				+	return NULL;
			
 
				 }
			
 
				 
			
 
				 
			
@@ -2973,17 +2964,20 @@ static int rbd_img_request_submit(struct rbd_img_request *img_request)
 
				 {
			
 
				 	struct rbd_obj_request *obj_request;
			
 
				 	struct rbd_obj_request *next_obj_request;
			
 
				+	int ret = 0;
			
 
				 
			
 
				 	dout("%s: img %p\n", __func__, img_request);
			
 
				-	for_each_obj_request_safe(img_request, obj_request, next_obj_request) {
			
 
				-		int ret;
			
 
				 
			
 
				+	rbd_img_request_get(img_request);
			
 
				+	for_each_obj_request_safe(img_request, obj_request, next_obj_request) {
			
 
				 		ret = rbd_img_obj_request_submit(obj_request);
			
 
				 		if (ret)
			
 
				-			return ret;
			
 
				+			goto out_put_ireq;
			
 
				 	}
			
 
				 
			
 
				-	return 0;
			
 
				+out_put_ireq:
			
 
				+	rbd_img_request_put(img_request);
			
 
				+	return ret;
			
 
				 }
			
 
				 
			
 
				 static void rbd_img_parent_read_callback(struct rbd_img_request *img_request)
			
@@ -3090,45 +3084,18 @@ static void rbd_img_parent_read(struct rbd_obj_request *obj_request)
 
				 	obj_request_done_set(obj_request);
			
 
				 }
			
 
				 
			
 
				-static int rbd_obj_notify_ack_sync(struct rbd_device *rbd_dev, u64 notify_id)
			
 
				-{
			
 
				-	struct rbd_obj_request *obj_request;
			
 
				-	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
			
 
				-	int ret;
			
 
				-
			
 
				-	obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0,
			
 
				-							OBJ_REQUEST_NODATA);
			
 
				-	if (!obj_request)
			
 
				-		return -ENOMEM;
			
 
				-
			
 
				-	ret = -ENOMEM;
			
 
				-	obj_request->osd_req = rbd_osd_req_create(rbd_dev, OBJ_OP_READ, 1,
			
 
				-						  obj_request);
			
 
				-	if (!obj_request->osd_req)
			
 
				-		goto out;
			
 
				-
			
 
				-	osd_req_op_watch_init(obj_request->osd_req, 0, CEPH_OSD_OP_NOTIFY_ACK,
			
 
				-					notify_id, 0, 0);
			
 
				-	rbd_osd_req_format_read(obj_request);
			
 
				+static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev);
			
 
				+static void __rbd_dev_header_unwatch_sync(struct rbd_device *rbd_dev);
			
 
				 
			
 
				-	ret = rbd_obj_request_submit(osdc, obj_request);
			
 
				-	if (ret)
			
 
				-		goto out;
			
 
				-	ret = rbd_obj_request_wait(obj_request);
			
 
				-out:
			
 
				-	rbd_obj_request_put(obj_request);
			
 
				-
			
 
				-	return ret;
			
 
				-}
			
 
				-
			
 
				-static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
			
 
				+static void rbd_watch_cb(void *arg, u64 notify_id, u64 cookie,
			
 
				+			 u64 notifier_id, void *data, size_t data_len)
			
 
				 {
			
 
				-	struct rbd_device *rbd_dev = (struct rbd_device *)data;
			
 
				+	struct rbd_device *rbd_dev = arg;
			
 
				+	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
			
 
				 	int ret;
			
 
				 
			
 
				-	dout("%s: \"%s\" notify_id %llu opcode %u\n", __func__,
			
 
				-		rbd_dev->header_name, (unsigned long long)notify_id,
			
 
				-		(unsigned int)opcode);
			
 
				+	dout("%s rbd_dev %p cookie %llu notify_id %llu\n", __func__, rbd_dev,
			
 
				+	     cookie, notify_id);
			
 
				 
			
 
				 	/*
			
 
				 	 * Until adequate refresh error handling is in place, there is
			
@@ -3140,63 +3107,31 @@ static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
 
				 	if (ret)
			
 
				 		rbd_warn(rbd_dev, "refresh failed: %d", ret);
			
 
				 
			
 
				-	ret = rbd_obj_notify_ack_sync(rbd_dev, notify_id);
			
 
				+	ret = ceph_osdc_notify_ack(osdc, &rbd_dev->header_oid,
			
 
				+				   &rbd_dev->header_oloc, notify_id, cookie,
			
 
				+				   NULL, 0);
			
 
				 	if (ret)
			
 
				 		rbd_warn(rbd_dev, "notify_ack ret %d", ret);
			
 
				 }
			
 
				 
			
 
				-/*
			
 
				- * Send a (un)watch request and wait for the ack.  Return a request
			
 
				- * with a ref held on success or error.
			
 
				- */
			
 
				-static struct rbd_obj_request *rbd_obj_watch_request_helper(
			
 
				-						struct rbd_device *rbd_dev,
			
 
				-						bool watch)
			
 
				+static void rbd_watch_errcb(void *arg, u64 cookie, int err)
			
 
				 {
			
 
				-	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
			
 
				-	struct ceph_options *opts = osdc->client->options;
			
 
				-	struct rbd_obj_request *obj_request;
			
 
				+	struct rbd_device *rbd_dev = arg;
			
 
				 	int ret;
			
 
				 
			
 
				-	obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0,
			
 
				-					     OBJ_REQUEST_NODATA);
			
 
				-	if (!obj_request)
			
 
				-		return ERR_PTR(-ENOMEM);
			
 
				-
			
 
				-	obj_request->osd_req = rbd_osd_req_create(rbd_dev, OBJ_OP_WRITE, 1,
			
 
				-						  obj_request);
			
 
				-	if (!obj_request->osd_req) {
			
 
				-		ret = -ENOMEM;
			
 
				-		goto out;
			
 
				-	}
			
 
				-
			
 
				-	osd_req_op_watch_init(obj_request->osd_req, 0, CEPH_OSD_OP_WATCH,
			
 
				-			      rbd_dev->watch_event->cookie, 0, watch);
			
 
				-	rbd_osd_req_format_write(obj_request);
			
 
				+	rbd_warn(rbd_dev, "encountered watch error: %d", err);
			
 
				 
			
 
				-	if (watch)
			
 
				-		ceph_osdc_set_request_linger(osdc, obj_request->osd_req);
			
 
				-
			
 
				-	ret = rbd_obj_request_submit(osdc, obj_request);
			
 
				-	if (ret)
			
 
				-		goto out;
			
 
				+	__rbd_dev_header_unwatch_sync(rbd_dev);
			
 
				 
			
 
				-	ret = rbd_obj_request_wait_timeout(obj_request, opts->mount_timeout);
			
 
				-	if (ret)
			
 
				-		goto out;
			
 
				-
			
 
				-	ret = obj_request->result;
			
 
				+	ret = rbd_dev_header_watch_sync(rbd_dev);
			
 
				 	if (ret) {
			
 
				-		if (watch)
			
 
				-			rbd_obj_request_end(obj_request);
			
 
				-		goto out;
			
 
				+		rbd_warn(rbd_dev, "failed to reregister watch: %d", ret);
			
 
				+		return;
			
 
				 	}
			
 
				 
			
 
				-	return obj_request;
			
 
				-
			
 
				-out:
			
 
				-	rbd_obj_request_put(obj_request);
			
 
				-	return ERR_PTR(ret);
			
 
				+	ret = rbd_dev_refresh(rbd_dev);
			
 
				+	if (ret)
			
 
				+		rbd_warn(rbd_dev, "reregisteration refresh failed: %d", ret);
			
 
				 }
			
 
				 
			
 
				 /*
			
@@ -3205,35 +3140,33 @@ static struct rbd_obj_request *rbd_obj_watch_request_helper(
 
				 static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev)
			
 
				 {
			
 
				 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
			
 
				-	struct rbd_obj_request *obj_request;
			
 
				-	int ret;
			
 
				+	struct ceph_osd_linger_request *handle;
			
 
				 
			
 
				-	rbd_assert(!rbd_dev->watch_event);
			
 
				-	rbd_assert(!rbd_dev->watch_request);
			
 
				+	rbd_assert(!rbd_dev->watch_handle);
			
 
				 
			
 
				-	ret = ceph_osdc_create_event(osdc, rbd_watch_cb, rbd_dev,
			
 
				-				     &rbd_dev->watch_event);
			
 
				-	if (ret < 0)
			
 
				-		return ret;
			
 
				+	handle = ceph_osdc_watch(osdc, &rbd_dev->header_oid,
			
 
				+				 &rbd_dev->header_oloc, rbd_watch_cb,
			
 
				+				 rbd_watch_errcb, rbd_dev);
			
 
				+	if (IS_ERR(handle))
			
 
				+		return PTR_ERR(handle);
			
 
				 
			
 
				-	obj_request = rbd_obj_watch_request_helper(rbd_dev, true);
			
 
				-	if (IS_ERR(obj_request)) {
			
 
				-		ceph_osdc_cancel_event(rbd_dev->watch_event);
			
 
				-		rbd_dev->watch_event = NULL;
			
 
				-		return PTR_ERR(obj_request);
			
 
				-	}
			
 
				+	rbd_dev->watch_handle = handle;
			
 
				+	return 0;
			
 
				+}
			
 
				 
			
 
				-	/*
			
 
				-	 * A watch request is set to linger, so the underlying osd
			
 
				-	 * request won't go away until we unregister it.  We retain
			
 
				-	 * a pointer to the object request during that time (in
			
 
				-	 * rbd_dev->watch_request), so we'll keep a reference to it.
			
 
				-	 * We'll drop that reference after we've unregistered it in
			
 
				-	 * rbd_dev_header_unwatch_sync().
			
 
				-	 */
			
 
				-	rbd_dev->watch_request = obj_request;
			
 
				+static void __rbd_dev_header_unwatch_sync(struct rbd_device *rbd_dev)
			
 
				+{
			
 
				+	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
			
 
				+	int ret;
			
 
				 
			
 
				-	return 0;
			
 
				+	if (!rbd_dev->watch_handle)
			
 
				+		return;
			
 
				+
			
 
				+	ret = ceph_osdc_unwatch(osdc, rbd_dev->watch_handle);
			
 
				+	if (ret)
			
 
				+		rbd_warn(rbd_dev, "failed to unwatch: %d", ret);
			
 
				+
			
 
				+	rbd_dev->watch_handle = NULL;
			
 
				 }
			
 
				 
			
 
				 /*
			
@@ -3241,24 +3174,7 @@ static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev)
 
				  */
			
 
				 static void rbd_dev_header_unwatch_sync(struct rbd_device *rbd_dev)
			
 
				 {
			
 
				-	struct rbd_obj_request *obj_request;
			
 
				-
			
 
				-	rbd_assert(rbd_dev->watch_event);
			
 
				-	rbd_assert(rbd_dev->watch_request);
			
 
				-
			
 
				-	rbd_obj_request_end(rbd_dev->watch_request);
			
 
				-	rbd_obj_request_put(rbd_dev->watch_request);
			
 
				-	rbd_dev->watch_request = NULL;
			
 
				-
			
 
				-	obj_request = rbd_obj_watch_request_helper(rbd_dev, false);
			
 
				-	if (!IS_ERR(obj_request))
			
 
				-		rbd_obj_request_put(obj_request);
			
 
				-	else
			
 
				-		rbd_warn(rbd_dev, "unable to tear down watch request (%ld)",
			
 
				-			 PTR_ERR(obj_request));
			
 
				-
			
 
				-	ceph_osdc_cancel_event(rbd_dev->watch_event);
			
 
				-	rbd_dev->watch_event = NULL;
			
 
				+	__rbd_dev_header_unwatch_sync(rbd_dev);
			
 
				 
			
 
				 	dout("%s flushing notifies\n", __func__);
			
 
				 	ceph_osdc_flush_notifies(&rbd_dev->rbd_client->client->osdc);
			
@@ -3591,7 +3507,7 @@ static int rbd_dev_v1_header_info(struct rbd_device *rbd_dev)
 
				 		if (!ondisk)
			
 
				 			return -ENOMEM;
			
 
				 
			
 
				-		ret = rbd_obj_read_sync(rbd_dev, rbd_dev->header_name,
			
 
				+		ret = rbd_obj_read_sync(rbd_dev, rbd_dev->header_oid.name,
			
 
				 				       0, size, ondisk);
			
 
				 		if (ret < 0)
			
 
				 			goto out;
			
@@ -4033,6 +3949,8 @@ static void rbd_dev_release(struct device *dev)
 
				 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
			
 
				 	bool need_put = !!rbd_dev->opts;
			
 
				 
			
 
				+	ceph_oid_destroy(&rbd_dev->header_oid);
			
 
				+
			
 
				 	rbd_put_client(rbd_dev->rbd_client);
			
 
				 	rbd_spec_put(rbd_dev->spec);
			
 
				 	kfree(rbd_dev->opts);
			
@@ -4063,6 +3981,9 @@ static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
 
				 	INIT_LIST_HEAD(&rbd_dev->node);
			
 
				 	init_rwsem(&rbd_dev->header_rwsem);
			
 
				 
			
 
				+	ceph_oid_init(&rbd_dev->header_oid);
			
 
				+	ceph_oloc_init(&rbd_dev->header_oloc);
			
 
				+
			
 
				 	rbd_dev->dev.bus = &rbd_bus_type;
			
 
				 	rbd_dev->dev.type = &rbd_device_type;
			
 
				 	rbd_dev->dev.parent = &rbd_root_dev;
			
@@ -4111,7 +4032,7 @@ static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
 
				 		__le64 size;
			
 
				 	} __attribute__ ((packed)) size_buf = { 0 };
			
 
				 
			
 
				-	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
			
 
				+	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_oid.name,
			
 
				 				"rbd", "get_size",
			
 
				 				&snapid, sizeof (snapid),
			
 
				 				&size_buf, sizeof (size_buf));
			
@@ -4151,7 +4072,7 @@ static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev)
 
				 	if (!reply_buf)
			
 
				 		return -ENOMEM;
			
 
				 
			
 
				-	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
			
 
				+	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_oid.name,
			
 
				 				"rbd", "get_object_prefix", NULL, 0,
			
 
				 				reply_buf, RBD_OBJ_PREFIX_LEN_MAX);
			
 
				 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
			
@@ -4186,7 +4107,7 @@ static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
 
				 	u64 unsup;
			
 
				 	int ret;
			
 
				 
			
 
				-	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
			
 
				+	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_oid.name,
			
 
				 				"rbd", "get_features",
			
 
				 				&snapid, sizeof (snapid),
			
 
				 				&features_buf, sizeof (features_buf));
			
@@ -4248,7 +4169,7 @@ static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev)
 
				 	}
			
 
				 
			
 
				 	snapid = cpu_to_le64(rbd_dev->spec->snap_id);
			
 
				-	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
			
 
				+	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_oid.name,
			
 
				 				"rbd", "get_parent",
			
 
				 				&snapid, sizeof (snapid),
			
 
				 				reply_buf, size);
			
@@ -4351,7 +4272,7 @@ static int rbd_dev_v2_striping_info(struct rbd_device *rbd_dev)
 
				 	u64 stripe_count;
			
 
				 	int ret;
			
 
				 
			
 
				-	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
			
 
				+	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_oid.name,
			
 
				 				"rbd", "get_stripe_unit_count", NULL, 0,
			
 
				 				(char *)&striping_info_buf, size);
			
 
				 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
			
@@ -4599,7 +4520,7 @@ static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev)
 
				 	if (!reply_buf)
			
 
				 		return -ENOMEM;
			
 
				 
			
 
				-	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
			
 
				+	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_oid.name,
			
 
				 				"rbd", "get_snapcontext", NULL, 0,
			
 
				 				reply_buf, size);
			
 
				 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
			
@@ -4664,7 +4585,7 @@ static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev,
 
				 		return ERR_PTR(-ENOMEM);
			
 
				 
			
 
				 	snapid = cpu_to_le64(snap_id);
			
 
				-	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
			
 
				+	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_oid.name,
			
 
				 				"rbd", "get_snapshot_name",
			
 
				 				&snapid, sizeof (snapid),
			
 
				 				reply_buf, size);
			
@@ -4975,13 +4896,13 @@ static int rbd_add_get_pool_id(struct rbd_client *rbdc, const char *pool_name)
 
				 again:
			
 
				 	ret = ceph_pg_poolid_by_name(rbdc->client->osdc.osdmap, pool_name);
			
 
				 	if (ret == -ENOENT && tries++ < 1) {
			
 
				-		ret = ceph_monc_do_get_version(&rbdc->client->monc, "osdmap",
			
 
				-					       &newest_epoch);
			
 
				+		ret = ceph_monc_get_version(&rbdc->client->monc, "osdmap",
			
 
				+					    &newest_epoch);
			
 
				 		if (ret < 0)
			
 
				 			return ret;
			
 
				 
			
 
				 		if (rbdc->client->osdc.osdmap->epoch < newest_epoch) {
			
 
				-			ceph_monc_request_next_osdmap(&rbdc->client->monc);
			
 
				+			ceph_osdc_maybe_request_map(&rbdc->client->osdc);
			
 
				 			(void) ceph_monc_wait_osdmap(&rbdc->client->monc,
			
 
				 						     newest_epoch,
			
 
				 						     opts->mount_timeout);
			
@@ -5260,35 +5181,26 @@ static int rbd_dev_device_setup(struct rbd_device *rbd_dev)
 
				 static int rbd_dev_header_name(struct rbd_device *rbd_dev)
			
 
				 {
			
 
				 	struct rbd_spec *spec = rbd_dev->spec;
			
 
				-	size_t size;
			
 
				+	int ret;
			
 
				 
			
 
				 	/* Record the header object name for this rbd image. */
			
 
				 
			
 
				 	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
			
 
				 
			
 
				+	rbd_dev->header_oloc.pool = ceph_file_layout_pg_pool(rbd_dev->layout);
			
 
				 	if (rbd_dev->image_format == 1)
			
 
				-		size = strlen(spec->image_name) + sizeof (RBD_SUFFIX);
			
 
				+		ret = ceph_oid_aprintf(&rbd_dev->header_oid, GFP_KERNEL, "%s%s",
			
 
				+				       spec->image_name, RBD_SUFFIX);
			
 
				 	else
			
 
				-		size = sizeof (RBD_HEADER_PREFIX) + strlen(spec->image_id);
			
 
				-
			
 
				-	rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
			
 
				-	if (!rbd_dev->header_name)
			
 
				-		return -ENOMEM;
			
 
				+		ret = ceph_oid_aprintf(&rbd_dev->header_oid, GFP_KERNEL, "%s%s",
			
 
				+				       RBD_HEADER_PREFIX, spec->image_id);
			
 
				 
			
 
				-	if (rbd_dev->image_format == 1)
			
 
				-		sprintf(rbd_dev->header_name, "%s%s",
			
 
				-			spec->image_name, RBD_SUFFIX);
			
 
				-	else
			
 
				-		sprintf(rbd_dev->header_name, "%s%s",
			
 
				-			RBD_HEADER_PREFIX, spec->image_id);
			
 
				-	return 0;
			
 
				+	return ret;
			
 
				 }
			
 
				 
			
 
				 static void rbd_dev_image_release(struct rbd_device *rbd_dev)
			
 
				 {
			
 
				 	rbd_dev_unprobe(rbd_dev);
			
 
				-	kfree(rbd_dev->header_name);
			
 
				-	rbd_dev->header_name = NULL;
			
 
				 	rbd_dev->image_format = 0;
			
 
				 	kfree(rbd_dev->spec->image_id);
			
 
				 	rbd_dev->spec->image_id = NULL;
			
@@ -5327,7 +5239,7 @@ static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth)
 
				 				pr_info("image %s/%s does not exist\n",
			
 
				 					rbd_dev->spec->pool_name,
			
 
				 					rbd_dev->spec->image_name);
			
 
				-			goto out_header_name;
			
 
				+			goto err_out_format;
			
 
				 		}
			
 
				 	}
			
 
				 
			
@@ -5373,7 +5285,7 @@ static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth)
 
				 		goto err_out_probe;
			
 
				 
			
 
				 	dout("discovered format %u image, header name is %s\n",
			
 
				-		rbd_dev->image_format, rbd_dev->header_name);
			
 
				+		rbd_dev->image_format, rbd_dev->header_oid.name);
			
 
				 	return 0;
			
 
				 
			
 
				 err_out_probe:
			
@@ -5381,9 +5293,6 @@ static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth)
 
				 err_out_watch:
			
 
				 	if (!depth)
			
 
				 		rbd_dev_header_unwatch_sync(rbd_dev);
			
 
				-out_header_name:
			
 
				-	kfree(rbd_dev->header_name);
			
 
				-	rbd_dev->header_name = NULL;
			
 
				 err_out_format:
			
 
				 	rbd_dev->image_format = 0;
			
 
				 	kfree(rbd_dev->spec->image_id);
			
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -257,12 +257,12 @@ static int ceph_readpage(struct file *filp, struct page *page)
 
				 /*
			
 
				  * Finish an async read(ahead) op.
			
 
				  */
			
 
				-static void finish_read(struct ceph_osd_request *req, struct ceph_msg *msg)
			
 
				+static void finish_read(struct ceph_osd_request *req)
			
 
				 {
			
 
				 	struct inode *inode = req->r_inode;
			
 
				 	struct ceph_osd_data *osd_data;
			
 
				-	int rc = req->r_result;
			
 
				-	int bytes = le32_to_cpu(msg->hdr.data_len);
			
 
				+	int rc = req->r_result <= 0 ? req->r_result : 0;
			
 
				+	int bytes = req->r_result >= 0 ? req->r_result : 0;
			
 
				 	int num_pages;
			
 
				 	int i;
			
 
				 
			
@@ -376,8 +376,6 @@ static int start_read(struct inode *inode, struct list_head *page_list, int max)
 
				 	req->r_callback = finish_read;
			
 
				 	req->r_inode = inode;
			
 
				 
			
 
				-	ceph_osdc_build_request(req, off, NULL, vino.snap, NULL);
			
 
				-
			
 
				 	dout("start_read %p starting %p %lld~%lld\n", inode, req, off, len);
			
 
				 	ret = ceph_osdc_start_request(osdc, req, false);
			
 
				 	if (ret < 0)
			
@@ -546,11 +544,21 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
 
				 				   truncate_seq, truncate_size,
			
 
				 				   &inode->i_mtime, &page, 1);
			
 
				 	if (err < 0) {
			
 
				-		dout("writepage setting page/mapping error %d %p\n", err, page);
			
 
				+		struct writeback_control tmp_wbc;
			
 
				+		if (!wbc)
			
 
				+			wbc = &tmp_wbc;
			
 
				+		if (err == -ERESTARTSYS) {
			
 
				+			/* killed by SIGKILL */
			
 
				+			dout("writepage interrupted page %p\n", page);
			
 
				+			redirty_page_for_writepage(wbc, page);
			
 
				+			end_page_writeback(page);
			
 
				+			goto out;
			
 
				+		}
			
 
				+		dout("writepage setting page/mapping error %d %p\n",
			
 
				+		     err, page);
			
 
				 		SetPageError(page);
			
 
				 		mapping_set_error(&inode->i_data, err);
			
 
				-		if (wbc)
			
 
				-			wbc->pages_skipped++;
			
 
				+		wbc->pages_skipped++;
			
 
				 	} else {
			
 
				 		dout("writepage cleaned page %p\n", page);
			
 
				 		err = 0;  /* vfs expects us to return 0 */
			
@@ -571,12 +579,16 @@ static int ceph_writepage(struct page *page, struct writeback_control *wbc)
 
				 	BUG_ON(!inode);
			
 
				 	ihold(inode);
			
 
				 	err = writepage_nounlock(page, wbc);
			
 
				+	if (err == -ERESTARTSYS) {
			
 
				+		/* direct memory reclaimer was killed by SIGKILL. return 0
			
 
				+		 * to prevent caller from setting mapping/page error */
			
 
				+		err = 0;
			
 
				+	}
			
 
				 	unlock_page(page);
			
 
				 	iput(inode);
			
 
				 	return err;
			
 
				 }
			
 
				 
			
 
				-
			
 
				 /*
			
 
				  * lame release_pages helper.  release_pages() isn't exported to
			
 
				  * modules.
			
@@ -600,8 +612,7 @@ static void ceph_release_pages(struct page **pages, int num)
 
				  * If we get an error, set the mapping error bit, but not the individual
			
 
				  * page error bits.
			
 
				  */
			
 
				-static void writepages_finish(struct ceph_osd_request *req,
			
 
				-			      struct ceph_msg *msg)
			
 
				+static void writepages_finish(struct ceph_osd_request *req)
			
 
				 {
			
 
				 	struct inode *inode = req->r_inode;
			
 
				 	struct ceph_inode_info *ci = ceph_inode(inode);
			
@@ -615,7 +626,6 @@ static void writepages_finish(struct ceph_osd_request *req,
 
				 	struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
			
 
				 	bool remove_page;
			
 
				 
			
 
				-
			
 
				 	dout("writepages_finish %p rc %d\n", inode, rc);
			
 
				 	if (rc < 0)
			
 
				 		mapping_set_error(mapping, rc);
			
@@ -650,6 +660,9 @@ static void writepages_finish(struct ceph_osd_request *req,
 
				 				clear_bdi_congested(&fsc->backing_dev_info,
			
 
				 						    BLK_RW_ASYNC);
			
 
				 
			
 
				+			if (rc < 0)
			
 
				+				SetPageError(page);
			
 
				+
			
 
				 			ceph_put_snap_context(page_snap_context(page));
			
 
				 			page->private = 0;
			
 
				 			ClearPagePrivate(page);
			
@@ -718,8 +731,11 @@ static int ceph_writepages_start(struct address_space *mapping,
 
				 	     (wbc->sync_mode == WB_SYNC_ALL ? "ALL" : "HOLD"));
			
 
				 
			
 
				 	if (ACCESS_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) {
			
 
				-		pr_warn("writepage_start %p on forced umount\n", inode);
			
 
				-		truncate_pagecache(inode, 0);
			
 
				+		if (ci->i_wrbuffer_ref > 0) {
			
 
				+			pr_warn_ratelimited(
			
 
				+				"writepage_start %p %lld forced umount\n",
			
 
				+				inode, ceph_ino(inode));
			
 
				+		}
			
 
				 		mapping_set_error(mapping, -EIO);
			
 
				 		return -EIO; /* we're in a forced umount, don't write! */
			
 
				 	}
			
@@ -1063,10 +1079,7 @@ static int ceph_writepages_start(struct address_space *mapping,
 
				 			pages = NULL;
			
 
				 		}
			
 
				 
			
 
				-		vino = ceph_vino(inode);
			
 
				-		ceph_osdc_build_request(req, offset, snapc, vino.snap,
			
 
				-					&inode->i_mtime);
			
 
				-
			
 
				+		req->r_mtime = inode->i_mtime;
			
 
				 		rc = ceph_osdc_start_request(&fsc->client->osdc, req, true);
			
 
				 		BUG_ON(rc);
			
 
				 		req = NULL;
			
@@ -1099,8 +1112,7 @@ static int ceph_writepages_start(struct address_space *mapping,
 
				 		mapping->writeback_index = index;
			
 
				 
			
 
				 out:
			
 
				-	if (req)
			
 
				-		ceph_osdc_put_request(req);
			
 
				+	ceph_osdc_put_request(req);
			
 
				 	ceph_put_snap_context(snapc);
			
 
				 	dout("writepages done, rc = %d\n", rc);
			
 
				 	return rc;
			
@@ -1134,6 +1146,7 @@ static int ceph_update_writeable_page(struct file *file,
 
				 			    struct page *page)
			
 
				 {
			
 
				 	struct inode *inode = file_inode(file);
			
 
				+	struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
			
 
				 	struct ceph_inode_info *ci = ceph_inode(inode);
			
 
				 	loff_t page_off = pos & PAGE_MASK;
			
 
				 	int pos_in_page = pos & ~PAGE_MASK;
			
@@ -1142,6 +1155,12 @@ static int ceph_update_writeable_page(struct file *file,
 
				 	int r;
			
 
				 	struct ceph_snap_context *snapc, *oldest;
			
 
				 
			
 
				+	if (ACCESS_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) {
			
 
				+		dout(" page %p forced umount\n", page);
			
 
				+		unlock_page(page);
			
 
				+		return -EIO;
			
 
				+	}
			
 
				+
			
 
				 retry_locked:
			
 
				 	/* writepages currently holds page lock, but if we change that later, */
			
 
				 	wait_on_page_writeback(page);
			
@@ -1165,7 +1184,7 @@ static int ceph_update_writeable_page(struct file *file,
 
				 			snapc = ceph_get_snap_context(snapc);
			
 
				 			unlock_page(page);
			
 
				 			ceph_queue_writeback(inode);
			
 
				-			r = wait_event_interruptible(ci->i_cap_wq,
			
 
				+			r = wait_event_killable(ci->i_cap_wq,
			
 
				 			       context_is_writeable_or_written(inode, snapc));
			
 
				 			ceph_put_snap_context(snapc);
			
 
				 			if (r == -ERESTARTSYS)
			
@@ -1311,6 +1330,17 @@ const struct address_space_operations ceph_aops = {
 
				 	.direct_IO = ceph_direct_io,
			
 
				 };
			
 
				 
			
 
				+static void ceph_block_sigs(sigset_t *oldset)
			
 
				+{
			
 
				+	sigset_t mask;
			
 
				+	siginitsetinv(&mask, sigmask(SIGKILL));
			
 
				+	sigprocmask(SIG_BLOCK, &mask, oldset);
			
 
				+}
			
 
				+
			
 
				+static void ceph_restore_sigs(sigset_t *oldset)
			
 
				+{
			
 
				+	sigprocmask(SIG_SETMASK, oldset, NULL);
			
 
				+}
			
 
				 
			
 
				 /*
			
 
				  * vm ops
			
@@ -1323,6 +1353,9 @@ static int ceph_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 
				 	struct page *pinned_page = NULL;
			
 
				 	loff_t off = vmf->pgoff << PAGE_SHIFT;
			
 
				 	int want, got, ret;
			
 
				+	sigset_t oldset;
			
 
				+
			
 
				+	ceph_block_sigs(&oldset);
			
 
				 
			
 
				 	dout("filemap_fault %p %llx.%llx %llu~%zd trying to get caps\n",
			
 
				 	     inode, ceph_vinop(inode), off, (size_t)PAGE_SIZE);
			
@@ -1330,17 +1363,12 @@ static int ceph_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 
				 		want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO;
			
 
				 	else
			
 
				 		want = CEPH_CAP_FILE_CACHE;
			
 
				-	while (1) {
			
 
				-		got = 0;
			
 
				-		ret = ceph_get_caps(ci, CEPH_CAP_FILE_RD, want,
			
 
				-				    -1, &got, &pinned_page);
			
 
				-		if (ret == 0)
			
 
				-			break;
			
 
				-		if (ret != -ERESTARTSYS) {
			
 
				-			WARN_ON(1);
			
 
				-			return VM_FAULT_SIGBUS;
			
 
				-		}
			
 
				-	}
			
 
				+
			
 
				+	got = 0;
			
 
				+	ret = ceph_get_caps(ci, CEPH_CAP_FILE_RD, want, -1, &got, &pinned_page);
			
 
				+	if (ret < 0)
			
 
				+		goto out_restore;
			
 
				+
			
 
				 	dout("filemap_fault %p %llu~%zd got cap refs on %s\n",
			
 
				 	     inode, off, (size_t)PAGE_SIZE, ceph_cap_string(got));
			
 
				 
			
@@ -1357,7 +1385,7 @@ static int ceph_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 
				 	ceph_put_cap_refs(ci, got);
			
 
				 
			
 
				 	if (ret != -EAGAIN)
			
 
				-		return ret;
			
 
				+		goto out_restore;
			
 
				 
			
 
				 	/* read inline data */
			
 
				 	if (off >= PAGE_SIZE) {
			
@@ -1371,15 +1399,18 @@ static int ceph_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 
				 						~__GFP_FS));
			
 
				 		if (!page) {
			
 
				 			ret = VM_FAULT_OOM;
			
 
				-			goto out;
			
 
				+			goto out_inline;
			
 
				 		}
			
 
				 		ret1 = __ceph_do_getattr(inode, page,
			
 
				 					 CEPH_STAT_CAP_INLINE_DATA, true);
			
 
				 		if (ret1 < 0 || off >= i_size_read(inode)) {
			
 
				 			unlock_page(page);
			
 
				 			put_page(page);
			
 
				-			ret = VM_FAULT_SIGBUS;
			
 
				-			goto out;
			
 
				+			if (ret1 < 0)
			
 
				+				ret = ret1;
			
 
				+			else
			
 
				+				ret = VM_FAULT_SIGBUS;
			
 
				+			goto out_inline;
			
 
				 		}
			
 
				 		if (ret1 < PAGE_SIZE)
			
 
				 			zero_user_segment(page, ret1, PAGE_SIZE);
			
@@ -1388,10 +1419,15 @@ static int ceph_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 
				 		SetPageUptodate(page);
			
 
				 		vmf->page = page;
			
 
				 		ret = VM_FAULT_MAJOR | VM_FAULT_LOCKED;
			
 
				+out_inline:
			
 
				+		dout("filemap_fault %p %llu~%zd read inline data ret %d\n",
			
 
				+		     inode, off, (size_t)PAGE_SIZE, ret);
			
 
				 	}
			
 
				-out:
			
 
				-	dout("filemap_fault %p %llu~%zd read inline data ret %d\n",
			
 
				-	     inode, off, (size_t)PAGE_SIZE, ret);
			
 
				+out_restore:
			
 
				+	ceph_restore_sigs(&oldset);
			
 
				+	if (ret < 0)
			
 
				+		ret = (ret == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS;
			
 
				+
			
 
				 	return ret;
			
 
				 }
			
 
				 
			
@@ -1409,10 +1445,13 @@ static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 
				 	loff_t size = i_size_read(inode);
			
 
				 	size_t len;
			
 
				 	int want, got, ret;
			
 
				+	sigset_t oldset;
			
 
				 
			
 
				 	prealloc_cf = ceph_alloc_cap_flush();
			
 
				 	if (!prealloc_cf)
			
 
				-		return VM_FAULT_SIGBUS;
			
 
				+		return VM_FAULT_OOM;
			
 
				+
			
 
				+	ceph_block_sigs(&oldset);
			
 
				 
			
 
				 	if (ci->i_inline_version != CEPH_INLINE_NONE) {
			
 
				 		struct page *locked_page = NULL;
			
@@ -1423,10 +1462,8 @@ static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 
				 		ret = ceph_uninline_data(vma->vm_file, locked_page);
			
 
				 		if (locked_page)
			
 
				 			unlock_page(locked_page);
			
 
				-		if (ret < 0) {
			
 
				-			ret = VM_FAULT_SIGBUS;
			
 
				+		if (ret < 0)
			
 
				 			goto out_free;
			
 
				-		}
			
 
				 	}
			
 
				 
			
 
				 	if (off + PAGE_SIZE <= size)
			
@@ -1440,45 +1477,36 @@ static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 
				 		want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO;
			
 
				 	else
			
 
				 		want = CEPH_CAP_FILE_BUFFER;
			
 
				-	while (1) {
			
 
				-		got = 0;
			
 
				-		ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, off + len,
			
 
				-				    &got, NULL);
			
 
				-		if (ret == 0)
			
 
				-			break;
			
 
				-		if (ret != -ERESTARTSYS) {
			
 
				-			WARN_ON(1);
			
 
				-			ret = VM_FAULT_SIGBUS;
			
 
				-			goto out_free;
			
 
				-		}
			
 
				-	}
			
 
				+
			
 
				+	got = 0;
			
 
				+	ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, off + len,
			
 
				+			    &got, NULL);
			
 
				+	if (ret < 0)
			
 
				+		goto out_free;
			
 
				+
			
 
				 	dout("page_mkwrite %p %llu~%zd got cap refs on %s\n",
			
 
				 	     inode, off, len, ceph_cap_string(got));
			
 
				 
			
 
				 	/* Update time before taking page lock */
			
 
				 	file_update_time(vma->vm_file);
			
 
				 
			
 
				-	lock_page(page);
			
 
				+	do {
			
 
				+		lock_page(page);
			
 
				 
			
 
				-	ret = VM_FAULT_NOPAGE;
			
 
				-	if ((off > size) ||
			
 
				-	    (page->mapping != inode->i_mapping)) {
			
 
				-		unlock_page(page);
			
 
				-		goto out;
			
 
				-	}
			
 
				+		if ((off > size) || (page->mapping != inode->i_mapping)) {
			
 
				+			unlock_page(page);
			
 
				+			ret = VM_FAULT_NOPAGE;
			
 
				+			break;
			
 
				+		}
			
 
				+
			
 
				+		ret = ceph_update_writeable_page(vma->vm_file, off, len, page);
			
 
				+		if (ret >= 0) {
			
 
				+			/* success.  we'll keep the page locked. */
			
 
				+			set_page_dirty(page);
			
 
				+			ret = VM_FAULT_LOCKED;
			
 
				+		}
			
 
				+	} while (ret == -EAGAIN);
			
 
				 
			
 
				-	ret = ceph_update_writeable_page(vma->vm_file, off, len, page);
			
 
				-	if (ret >= 0) {
			
 
				-		/* success.  we'll keep the page locked. */
			
 
				-		set_page_dirty(page);
			
 
				-		ret = VM_FAULT_LOCKED;
			
 
				-	} else {
			
 
				-		if (ret == -ENOMEM)
			
 
				-			ret = VM_FAULT_OOM;
			
 
				-		else
			
 
				-			ret = VM_FAULT_SIGBUS;
			
 
				-	}
			
 
				-out:
			
 
				 	if (ret == VM_FAULT_LOCKED ||
			
 
				 	    ci->i_inline_version != CEPH_INLINE_NONE) {
			
 
				 		int dirty;
			
@@ -1495,8 +1523,10 @@ static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 
				 	     inode, off, len, ceph_cap_string(got), ret);
			
 
				 	ceph_put_cap_refs(ci, got);
			
 
				 out_free:
			
 
				+	ceph_restore_sigs(&oldset);
			
 
				 	ceph_free_cap_flush(prealloc_cf);
			
 
				-
			
 
				+	if (ret < 0)
			
 
				+		ret = (ret == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS;
			
 
				 	return ret;
			
 
				 }
			
 
				 
			
@@ -1614,7 +1644,7 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page)
 
				 		goto out;
			
 
				 	}
			
 
				 
			
 
				-	ceph_osdc_build_request(req, 0, NULL, CEPH_NOSNAP, &inode->i_mtime);
			
 
				+	req->r_mtime = inode->i_mtime;
			
 
				 	err = ceph_osdc_start_request(&fsc->client->osdc, req, false);
			
 
				 	if (!err)
			
 
				 		err = ceph_osdc_wait_request(&fsc->client->osdc, req);
			
@@ -1657,7 +1687,7 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page)
 
				 			goto out_put;
			
 
				 	}
			
 
				 
			
 
				-	ceph_osdc_build_request(req, 0, NULL, CEPH_NOSNAP, &inode->i_mtime);
			
 
				+	req->r_mtime = inode->i_mtime;
			
 
				 	err = ceph_osdc_start_request(&fsc->client->osdc, req, false);
			
 
				 	if (!err)
			
 
				 		err = ceph_osdc_wait_request(&fsc->client->osdc, req);
			
@@ -1758,9 +1788,11 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, u32 pool)
 
				 	rd_req->r_flags = CEPH_OSD_FLAG_READ;
			
 
				 	osd_req_op_init(rd_req, 0, CEPH_OSD_OP_STAT, 0);
			
 
				 	rd_req->r_base_oloc.pool = pool;
			
 
				-	snprintf(rd_req->r_base_oid.name, sizeof(rd_req->r_base_oid.name),
			
 
				-		 "%llx.00000000", ci->i_vino.ino);
			
 
				-	rd_req->r_base_oid.name_len = strlen(rd_req->r_base_oid.name);
			
 
				+	ceph_oid_printf(&rd_req->r_base_oid, "%llx.00000000", ci->i_vino.ino);
			
 
				+
			
 
				+	err = ceph_osdc_alloc_messages(rd_req, GFP_NOFS);
			
 
				+	if (err)
			
 
				+		goto out_unlock;
			
 
				 
			
 
				 	wr_req = ceph_osdc_alloc_request(&fsc->client->osdc, NULL,
			
 
				 					 1, false, GFP_NOFS);
			
@@ -1769,11 +1801,14 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, u32 pool)
 
				 		goto out_unlock;
			
 
				 	}
			
 
				 
			
 
				-	wr_req->r_flags = CEPH_OSD_FLAG_WRITE |
			
 
				-			  CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK;
			
 
				+	wr_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ACK;
			
 
				 	osd_req_op_init(wr_req, 0, CEPH_OSD_OP_CREATE, CEPH_OSD_OP_FLAG_EXCL);
			
 
				-	wr_req->r_base_oloc.pool = pool;
			
 
				-	wr_req->r_base_oid = rd_req->r_base_oid;
			
 
				+	ceph_oloc_copy(&wr_req->r_base_oloc, &rd_req->r_base_oloc);
			
 
				+	ceph_oid_copy(&wr_req->r_base_oid, &rd_req->r_base_oid);
			
 
				+
			
 
				+	err = ceph_osdc_alloc_messages(wr_req, GFP_NOFS);
			
 
				+	if (err)
			
 
				+		goto out_unlock;
			
 
				 
			
 
				 	/* one page should be large enough for STAT data */
			
 
				 	pages = ceph_alloc_page_vector(1, GFP_KERNEL);
			
@@ -1784,12 +1819,9 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, u32 pool)
 
				 
			
 
				 	osd_req_op_raw_data_in_pages(rd_req, 0, pages, PAGE_SIZE,
			
 
				 				     0, false, true);
			
 
				-	ceph_osdc_build_request(rd_req, 0, NULL, CEPH_NOSNAP,
			
 
				-				&ci->vfs_inode.i_mtime);
			
 
				 	err = ceph_osdc_start_request(&fsc->client->osdc, rd_req, false);
			
 
				 
			
 
				-	ceph_osdc_build_request(wr_req, 0, NULL, CEPH_NOSNAP,
			
 
				-				&ci->vfs_inode.i_mtime);
			
 
				+	wr_req->r_mtime = ci->vfs_inode.i_mtime;
			
 
				 	err2 = ceph_osdc_start_request(&fsc->client->osdc, wr_req, false);
			
 
				 
			
 
				 	if (!err)
			
@@ -1823,10 +1855,8 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, u32 pool)
 
				 out_unlock:
			
 
				 	up_write(&mdsc->pool_perm_rwsem);
			
 
				 
			
 
				-	if (rd_req)
			
 
				-		ceph_osdc_put_request(rd_req);
			
 
				-	if (wr_req)
			
 
				-		ceph_osdc_put_request(wr_req);
			
 
				+	ceph_osdc_put_request(rd_req);
			
 
				+	ceph_osdc_put_request(wr_req);
			
 
				 out:
			
 
				 	if (!err)
			
 
				 		err = have;
			
--- a/fs/ceph/cache.c
+++ b/fs/ceph/cache.c
@@ -236,7 +236,7 @@ static void ceph_vfs_readpage_complete_unlock(struct page *page, void *data, int
 
				 	unlock_page(page);
			
 
				 }
			
 
				 
			
 
				-static inline int cache_valid(struct ceph_inode_info *ci)
			
 
				+static inline bool cache_valid(struct ceph_inode_info *ci)
			
 
				 {
			
 
				 	return ((ceph_caps_issued(ci) & CEPH_CAP_FILE_CACHE) &&
			
 
				 		(ci->i_fscache_gen == ci->i_rdcache_gen));
			
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -1656,7 +1656,7 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags,
 
				 	 */
			
 
				 	if ((!is_delayed || mdsc->stopping) &&
			
 
				 	    !S_ISDIR(inode->i_mode) &&		/* ignore readdir cache */
			
 
				-	    ci->i_wrbuffer_ref == 0 &&		/* no dirty pages... */
			
 
				+	    !(ci->i_wb_ref || ci->i_wrbuffer_ref) &&   /* no dirty pages... */
			
 
				 	    inode->i_data.nrpages &&		/* have cached pages */
			
 
				 	    (revoking & (CEPH_CAP_FILE_CACHE|
			
 
				 			 CEPH_CAP_FILE_LAZYIO)) && /*  or revoking cache */
			
@@ -1698,8 +1698,8 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags,
 
				 
			
 
				 		revoking = cap->implemented & ~cap->issued;
			
 
				 		dout(" mds%d cap %p used %s issued %s implemented %s revoking %s\n",
			
 
				-		     cap->mds, cap, ceph_cap_string(cap->issued),
			
 
				-		     ceph_cap_string(cap_used),
			
 
				+		     cap->mds, cap, ceph_cap_string(cap_used),
			
 
				+		     ceph_cap_string(cap->issued),
			
 
				 		     ceph_cap_string(cap->implemented),
			
 
				 		     ceph_cap_string(revoking));
			
 
				 
			
@@ -2317,7 +2317,7 @@ static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want,
 
				 
			
 
				 	/* make sure file is actually open */
			
 
				 	file_wanted = __ceph_caps_file_wanted(ci);
			
 
				-	if ((file_wanted & need) == 0) {
			
 
				+	if ((file_wanted & need) != need) {
			
 
				 		dout("try_get_cap_refs need %s file_wanted %s, EBADF\n",
			
 
				 		     ceph_cap_string(need), ceph_cap_string(file_wanted));
			
 
				 		*err = -EBADF;
			
@@ -2412,12 +2412,26 @@ static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want,
 
				 			goto out_unlock;
			
 
				 		}
			
 
				 
			
 
				-		if (!__ceph_is_any_caps(ci) &&
			
 
				-		    ACCESS_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) {
			
 
				-			dout("get_cap_refs %p forced umount\n", inode);
			
 
				-			*err = -EIO;
			
 
				-			ret = 1;
			
 
				-			goto out_unlock;
			
 
				+		if (ci->i_ceph_flags & CEPH_I_CAP_DROPPED) {
			
 
				+			int mds_wanted;
			
 
				+			if (ACCESS_ONCE(mdsc->fsc->mount_state) ==
			
 
				+			    CEPH_MOUNT_SHUTDOWN) {
			
 
				+				dout("get_cap_refs %p forced umount\n", inode);
			
 
				+				*err = -EIO;
			
 
				+				ret = 1;
			
 
				+				goto out_unlock;
			
 
				+			}
			
 
				+			mds_wanted = __ceph_caps_mds_wanted(ci);
			
 
				+			if ((mds_wanted & need) != need) {
			
 
				+				dout("get_cap_refs %p caps were dropped"
			
 
				+				     " (session killed?)\n", inode);
			
 
				+				*err = -ESTALE;
			
 
				+				ret = 1;
			
 
				+				goto out_unlock;
			
 
				+			}
			
 
				+			if ((mds_wanted & file_wanted) ==
			
 
				+			    (file_wanted & (CEPH_CAP_FILE_RD|CEPH_CAP_FILE_WR)))
			
 
				+				ci->i_ceph_flags &= ~CEPH_I_CAP_DROPPED;
			
 
				 		}
			
 
				 
			
 
				 		dout("get_cap_refs %p have %s needed %s\n", inode,
			
@@ -2487,7 +2501,7 @@ int ceph_get_caps(struct ceph_inode_info *ci, int need, int want,
 
				 			if (err == -EAGAIN)
			
 
				 				continue;
			
 
				 			if (err < 0)
			
 
				-				return err;
			
 
				+				ret = err;
			
 
				 		} else {
			
 
				 			ret = wait_event_interruptible(ci->i_cap_wq,
			
 
				 					try_get_cap_refs(ci, need, want, endoff,
			
@@ -2496,8 +2510,15 @@ int ceph_get_caps(struct ceph_inode_info *ci, int need, int want,
 
				 				continue;
			
 
				 			if (err < 0)
			
 
				 				ret = err;
			
 
				-			if (ret < 0)
			
 
				-				return ret;
			
 
				+		}
			
 
				+		if (ret < 0) {
			
 
				+			if (err == -ESTALE) {
			
 
				+				/* session was killed, try renew caps */
			
 
				+				ret = ceph_renew_caps(&ci->vfs_inode);
			
 
				+				if (ret == 0)
			
 
				+					continue;
			
 
				+			}
			
 
				+			return ret;
			
 
				 		}
			
 
				 
			
 
				 		if (ci->i_inline_version != CEPH_INLINE_NONE &&
			
@@ -2807,7 +2828,7 @@ static void handle_cap_grant(struct ceph_mds_client *mdsc,
 
				 	if (!S_ISDIR(inode->i_mode) && /* don't invalidate readdir cache */
			
 
				 	    ((cap->issued & ~newcaps) & CEPH_CAP_FILE_CACHE) &&
			
 
				 	    (newcaps & CEPH_CAP_FILE_LAZYIO) == 0 &&
			
 
				-	    !ci->i_wrbuffer_ref) {
			
 
				+	    !(ci->i_wrbuffer_ref || ci->i_wb_ref)) {
			
 
				 		if (try_nonblocking_invalidate(inode)) {
			
 
				 			/* there were locked pages.. invalidate later
			
 
				 			   in a separate thread. */
			
@@ -3226,6 +3247,8 @@ static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex,
 
				 
			
 
				 	if (target < 0) {
			
 
				 		__ceph_remove_cap(cap, false);
			
 
				+		if (!ci->i_auth_cap)
			
 
				+			ci->i_ceph_flags |= CEPH_I_CAP_DROPPED;
			
 
				 		goto out_unlock;
			
 
				 	}
			
 
				 
			
--- a/fs/ceph/debugfs.c
+++ b/fs/ceph/debugfs.c
@@ -109,7 +109,7 @@ static int mdsc_show(struct seq_file *s, void *p)
 
				 				   path ? path : "");
			
 
				 			spin_unlock(&req->r_old_dentry->d_lock);
			
 
				 			kfree(path);
			
 
				-		} else if (req->r_path2) {
			
 
				+		} else if (req->r_path2 && req->r_op != CEPH_MDS_OP_SYMLINK) {
			
 
				 			if (req->r_ino2.ino)
			
 
				 				seq_printf(s, " #%llx/%s", req->r_ino2.ino,
			
 
				 					   req->r_path2);
			
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -70,16 +70,42 @@ int ceph_init_dentry(struct dentry *dentry)
 
				 }
			
 
				 
			
 
				 /*
			
 
				- * for readdir, we encode the directory frag and offset within that
			
 
				- * frag into f_pos.
			
 
				+ * for f_pos for readdir:
			
 
				+ * - hash order:
			
 
				+ *	(0xff << 52) | ((24 bits hash) << 28) |
			
 
				+ *	(the nth entry has hash collision);
			
 
				+ * - frag+name order;
			
 
				+ *	((frag value) << 28) | (the nth entry in frag);
			
 
				  */
			
 
				+#define OFFSET_BITS	28
			
 
				+#define OFFSET_MASK	((1 << OFFSET_BITS) - 1)
			
 
				+#define HASH_ORDER	(0xffull << (OFFSET_BITS + 24))
			
 
				+loff_t ceph_make_fpos(unsigned high, unsigned off, bool hash_order)
			
 
				+{
			
 
				+	loff_t fpos = ((loff_t)high << 28) | (loff_t)off;
			
 
				+	if (hash_order)
			
 
				+		fpos |= HASH_ORDER;
			
 
				+	return fpos;
			
 
				+}
			
 
				+
			
 
				+static bool is_hash_order(loff_t p)
			
 
				+{
			
 
				+	return (p & HASH_ORDER) == HASH_ORDER;
			
 
				+}
			
 
				+
			
 
				 static unsigned fpos_frag(loff_t p)
			
 
				 {
			
 
				-	return p >> 32;
			
 
				+	return p >> OFFSET_BITS;
			
 
				 }
			
 
				+
			
 
				+static unsigned fpos_hash(loff_t p)
			
 
				+{
			
 
				+	return ceph_frag_value(fpos_frag(p));
			
 
				+}
			
 
				+
			
 
				 static unsigned fpos_off(loff_t p)
			
 
				 {
			
 
				-	return p & 0xffffffff;
			
 
				+	return p & OFFSET_MASK;
			
 
				 }
			
 
				 
			
 
				 static int fpos_cmp(loff_t l, loff_t r)
			
@@ -111,6 +137,50 @@ static int note_last_dentry(struct ceph_file_info *fi, const char *name,
 
				 	return 0;
			
 
				 }
			
 
				 
			
 
				+
			
 
				+static struct dentry *
			
 
				+__dcache_find_get_entry(struct dentry *parent, u64 idx,
			
 
				+			struct ceph_readdir_cache_control *cache_ctl)
			
 
				+{
			
 
				+	struct inode *dir = d_inode(parent);
			
 
				+	struct dentry *dentry;
			
 
				+	unsigned idx_mask = (PAGE_SIZE / sizeof(struct dentry *)) - 1;
			
 
				+	loff_t ptr_pos = idx * sizeof(struct dentry *);
			
 
				+	pgoff_t ptr_pgoff = ptr_pos >> PAGE_SHIFT;
			
 
				+
			
 
				+	if (ptr_pos >= i_size_read(dir))
			
 
				+		return NULL;
			
 
				+
			
 
				+	if (!cache_ctl->page || ptr_pgoff != page_index(cache_ctl->page)) {
			
 
				+		ceph_readdir_cache_release(cache_ctl);
			
 
				+		cache_ctl->page = find_lock_page(&dir->i_data, ptr_pgoff);
			
 
				+		if (!cache_ctl->page) {
			
 
				+			dout(" page %lu not found\n", ptr_pgoff);
			
 
				+			return ERR_PTR(-EAGAIN);
			
 
				+		}
			
 
				+		/* reading/filling the cache are serialized by
			
 
				+		   i_mutex, no need to use page lock */
			
 
				+		unlock_page(cache_ctl->page);
			
 
				+		cache_ctl->dentries = kmap(cache_ctl->page);
			
 
				+	}
			
 
				+
			
 
				+	cache_ctl->index = idx & idx_mask;
			
 
				+
			
 
				+	rcu_read_lock();
			
 
				+	spin_lock(&parent->d_lock);
			
 
				+	/* check i_size again here, because empty directory can be
			
 
				+	 * marked as complete while not holding the i_mutex. */
			
 
				+	if (ceph_dir_is_complete_ordered(dir) && ptr_pos < i_size_read(dir))
			
 
				+		dentry = cache_ctl->dentries[cache_ctl->index];
			
 
				+	else
			
 
				+		dentry = NULL;
			
 
				+	spin_unlock(&parent->d_lock);
			
 
				+	if (dentry && !lockref_get_not_dead(&dentry->d_lockref))
			
 
				+		dentry = NULL;
			
 
				+	rcu_read_unlock();
			
 
				+	return dentry ? : ERR_PTR(-EAGAIN);
			
 
				+}
			
 
				+
			
 
				 /*
			
 
				  * When possible, we try to satisfy a readdir by peeking at the
			
 
				  * dcache.  We make this work by carefully ordering dentries on
			
@@ -130,75 +200,68 @@ static int __dcache_readdir(struct file *file,  struct dir_context *ctx,
 
				 	struct inode *dir = d_inode(parent);
			
 
				 	struct dentry *dentry, *last = NULL;
			
 
				 	struct ceph_dentry_info *di;
			
 
				-	unsigned nsize = PAGE_SIZE / sizeof(struct dentry *);
			
 
				-	int err = 0;
			
 
				-	loff_t ptr_pos = 0;
			
 
				 	struct ceph_readdir_cache_control cache_ctl = {};
			
 
				+	u64 idx = 0;
			
 
				+	int err = 0;
			
 
				 
			
 
				-	dout("__dcache_readdir %p v%u at %llu\n", dir, shared_gen, ctx->pos);
			
 
				+	dout("__dcache_readdir %p v%u at %llx\n", dir, shared_gen, ctx->pos);
			
 
				+
			
 
				+	/* search start position */
			
 
				+	if (ctx->pos > 2) {
			
 
				+		u64 count = div_u64(i_size_read(dir), sizeof(struct dentry *));
			
 
				+		while (count > 0) {
			
 
				+			u64 step = count >> 1;
			
 
				+			dentry = __dcache_find_get_entry(parent, idx + step,
			
 
				+							 &cache_ctl);
			
 
				+			if (!dentry) {
			
 
				+				/* use linear search */
			
 
				+				idx = 0;
			
 
				+				break;
			
 
				+			}
			
 
				+			if (IS_ERR(dentry)) {
			
 
				+				err = PTR_ERR(dentry);
			
 
				+				goto out;
			
 
				+			}
			
 
				+			di = ceph_dentry(dentry);
			
 
				+			spin_lock(&dentry->d_lock);
			
 
				+			if (fpos_cmp(di->offset, ctx->pos) < 0) {
			
 
				+				idx += step + 1;
			
 
				+				count -= step + 1;
			
 
				+			} else {
			
 
				+				count = step;
			
 
				+			}
			
 
				+			spin_unlock(&dentry->d_lock);
			
 
				+			dput(dentry);
			
 
				+		}
			
 
				 
			
 
				-	/* we can calculate cache index for the first dirfrag */
			
 
				-	if (ceph_frag_is_leftmost(fpos_frag(ctx->pos))) {
			
 
				-		cache_ctl.index = fpos_off(ctx->pos) - 2;
			
 
				-		BUG_ON(cache_ctl.index < 0);
			
 
				-		ptr_pos = cache_ctl.index * sizeof(struct dentry *);
			
 
				+		dout("__dcache_readdir %p cache idx %llu\n", dir, idx);
			
 
				 	}
			
 
				 
			
 
				-	while (true) {
			
 
				-		pgoff_t pgoff;
			
 
				-		bool emit_dentry;
			
 
				 
			
 
				-		if (ptr_pos >= i_size_read(dir)) {
			
 
				+	for (;;) {
			
 
				+		bool emit_dentry = false;
			
 
				+		dentry = __dcache_find_get_entry(parent, idx++, &cache_ctl);
			
 
				+		if (!dentry) {
			
 
				 			fi->flags |= CEPH_F_ATEND;
			
 
				 			err = 0;
			
 
				 			break;
			
 
				 		}
			
 
				-
			
 
				-		err = -EAGAIN;
			
 
				-		pgoff = ptr_pos >> PAGE_SHIFT;
			
 
				-		if (!cache_ctl.page || pgoff != page_index(cache_ctl.page)) {
			
 
				-			ceph_readdir_cache_release(&cache_ctl);
			
 
				-			cache_ctl.page = find_lock_page(&dir->i_data, pgoff);
			
 
				-			if (!cache_ctl.page) {
			
 
				-				dout(" page %lu not found\n", pgoff);
			
 
				-				break;
			
 
				-			}
			
 
				-			/* reading/filling the cache are serialized by
			
 
				-			 * i_mutex, no need to use page lock */
			
 
				-			unlock_page(cache_ctl.page);
			
 
				-			cache_ctl.dentries = kmap(cache_ctl.page);
			
 
				+		if (IS_ERR(dentry)) {
			
 
				+			err = PTR_ERR(dentry);
			
 
				+			goto out;
			
 
				 		}
			
 
				 
			
 
				-		rcu_read_lock();
			
 
				-		spin_lock(&parent->d_lock);
			
 
				-		/* check i_size again here, because empty directory can be
			
 
				-		 * marked as complete while not holding the i_mutex. */
			
 
				-		if (ceph_dir_is_complete_ordered(dir) &&
			
 
				-		    ptr_pos < i_size_read(dir))
			
 
				-			dentry = cache_ctl.dentries[cache_ctl.index % nsize];
			
 
				-		else
			
 
				-			dentry = NULL;
			
 
				-		spin_unlock(&parent->d_lock);
			
 
				-		if (dentry && !lockref_get_not_dead(&dentry->d_lockref))
			
 
				-			dentry = NULL;
			
 
				-		rcu_read_unlock();
			
 
				-		if (!dentry)
			
 
				-			break;
			
 
				-
			
 
				-		emit_dentry = false;
			
 
				 		di = ceph_dentry(dentry);
			
 
				 		spin_lock(&dentry->d_lock);
			
 
				 		if (di->lease_shared_gen == shared_gen &&
			
 
				 		    d_really_is_positive(dentry) &&
			
 
				-		    ceph_snap(d_inode(dentry)) != CEPH_SNAPDIR &&
			
 
				-		    ceph_ino(d_inode(dentry)) != CEPH_INO_CEPH &&
			
 
				 		    fpos_cmp(ctx->pos, di->offset) <= 0) {
			
 
				 			emit_dentry = true;
			
 
				 		}
			
 
				 		spin_unlock(&dentry->d_lock);
			
 
				 
			
 
				 		if (emit_dentry) {
			
 
				-			dout(" %llu (%llu) dentry %p %pd %p\n", di->offset, ctx->pos,
			
 
				+			dout(" %llx dentry %p %pd %p\n", di->offset,
			
 
				 			     dentry, dentry, d_inode(dentry));
			
 
				 			ctx->pos = di->offset;
			
 
				 			if (!dir_emit(ctx, dentry->d_name.name,
			
@@ -218,10 +281,8 @@ static int __dcache_readdir(struct file *file,  struct dir_context *ctx,
 
				 		} else {
			
 
				 			dput(dentry);
			
 
				 		}
			
 
				-
			
 
				-		cache_ctl.index++;
			
 
				-		ptr_pos += sizeof(struct dentry *);
			
 
				 	}
			
 
				+out:
			
 
				 	ceph_readdir_cache_release(&cache_ctl);
			
 
				 	if (last) {
			
 
				 		int ret;
			
@@ -235,6 +296,16 @@ static int __dcache_readdir(struct file *file,  struct dir_context *ctx,
 
				 	return err;
			
 
				 }
			
 
				 
			
 
				+static bool need_send_readdir(struct ceph_file_info *fi, loff_t pos)
			
 
				+{
			
 
				+	if (!fi->last_readdir)
			
 
				+		return true;
			
 
				+	if (is_hash_order(pos))
			
 
				+		return !ceph_frag_contains_value(fi->frag, fpos_hash(pos));
			
 
				+	else
			
 
				+		return fi->frag != fpos_frag(pos);
			
 
				+}
			
 
				+
			
 
				 static int ceph_readdir(struct file *file, struct dir_context *ctx)
			
 
				 {
			
 
				 	struct ceph_file_info *fi = file->private_data;
			
@@ -242,13 +313,12 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
 
				 	struct ceph_inode_info *ci = ceph_inode(inode);
			
 
				 	struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
			
 
				 	struct ceph_mds_client *mdsc = fsc->mdsc;
			
 
				-	unsigned frag = fpos_frag(ctx->pos);
			
 
				-	int off = fpos_off(ctx->pos);
			
 
				+	int i;
			
 
				 	int err;
			
 
				 	u32 ftype;
			
 
				 	struct ceph_mds_reply_info_parsed *rinfo;
			
 
				 
			
 
				-	dout("readdir %p file %p frag %u off %u\n", inode, file, frag, off);
			
 
				+	dout("readdir %p file %p pos %llx\n", inode, file, ctx->pos);
			
 
				 	if (fi->flags & CEPH_F_ATEND)
			
 
				 		return 0;
			
 
				 
			
@@ -260,7 +330,6 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
 
				 			    inode->i_mode >> 12))
			
 
				 			return 0;
			
 
				 		ctx->pos = 1;
			
 
				-		off = 1;
			
 
				 	}
			
 
				 	if (ctx->pos == 1) {
			
 
				 		ino_t ino = parent_ino(file->f_path.dentry);
			
@@ -270,7 +339,6 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
 
				 			    inode->i_mode >> 12))
			
 
				 			return 0;
			
 
				 		ctx->pos = 2;
			
 
				-		off = 2;
			
 
				 	}
			
 
				 
			
 
				 	/* can we use the dcache? */
			
@@ -285,8 +353,6 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
 
				 		err = __dcache_readdir(file, ctx, shared_gen);
			
 
				 		if (err != -EAGAIN)
			
 
				 			return err;
			
 
				-		frag = fpos_frag(ctx->pos);
			
 
				-		off = fpos_off(ctx->pos);
			
 
				 	} else {
			
 
				 		spin_unlock(&ci->i_ceph_lock);
			
 
				 	}
			
@@ -294,8 +360,9 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
 
				 	/* proceed with a normal readdir */
			
 
				 more:
			
 
				 	/* do we have the correct frag content buffered? */
			
 
				-	if (fi->frag != frag || fi->last_readdir == NULL) {
			
 
				+	if (need_send_readdir(fi, ctx->pos)) {
			
 
				 		struct ceph_mds_request *req;
			
 
				+		unsigned frag;
			
 
				 		int op = ceph_snap(inode) == CEPH_SNAPDIR ?
			
 
				 			CEPH_MDS_OP_LSSNAP : CEPH_MDS_OP_READDIR;
			
 
				 
			
@@ -305,6 +372,13 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
 
				 			fi->last_readdir = NULL;
			
 
				 		}
			
 
				 
			
 
				+		if (is_hash_order(ctx->pos)) {
			
 
				+			frag = ceph_choose_frag(ci, fpos_hash(ctx->pos),
			
 
				+						NULL, NULL);
			
 
				+		} else {
			
 
				+			frag = fpos_frag(ctx->pos);
			
 
				+		}
			
 
				+
			
 
				 		dout("readdir fetching %llx.%llx frag %x offset '%s'\n",
			
 
				 		     ceph_vinop(inode), frag, fi->last_name);
			
 
				 		req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS);
			
@@ -331,6 +405,8 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
 
				 		req->r_readdir_cache_idx = fi->readdir_cache_idx;
			
 
				 		req->r_readdir_offset = fi->next_offset;
			
 
				 		req->r_args.readdir.frag = cpu_to_le32(frag);
			
 
				+		req->r_args.readdir.flags =
			
 
				+				cpu_to_le16(CEPH_READDIR_REPLY_BITFLAGS);
			
 
				 
			
 
				 		req->r_inode = inode;
			
 
				 		ihold(inode);
			
@@ -340,22 +416,26 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
 
				 			ceph_mdsc_put_request(req);
			
 
				 			return err;
			
 
				 		}
			
 
				-		dout("readdir got and parsed readdir result=%d"
			
 
				-		     " on frag %x, end=%d, complete=%d\n", err, frag,
			
 
				+		dout("readdir got and parsed readdir result=%d on "
			
 
				+		     "frag %x, end=%d, complete=%d, hash_order=%d\n",
			
 
				+		     err, frag,
			
 
				 		     (int)req->r_reply_info.dir_end,
			
 
				-		     (int)req->r_reply_info.dir_complete);
			
 
				-
			
 
				+		     (int)req->r_reply_info.dir_complete,
			
 
				+		     (int)req->r_reply_info.hash_order);
			
 
				 
			
 
				-		/* note next offset and last dentry name */
			
 
				 		rinfo = &req->r_reply_info;
			
 
				 		if (le32_to_cpu(rinfo->dir_dir->frag) != frag) {
			
 
				 			frag = le32_to_cpu(rinfo->dir_dir->frag);
			
 
				-			off = req->r_readdir_offset;
			
 
				-			fi->next_offset = off;
			
 
				+			if (!rinfo->hash_order) {
			
 
				+				fi->next_offset = req->r_readdir_offset;
			
 
				+				/* adjust ctx->pos to beginning of frag */
			
 
				+				ctx->pos = ceph_make_fpos(frag,
			
 
				+							  fi->next_offset,
			
 
				+							  false);
			
 
				+			}
			
 
				 		}
			
 
				 
			
 
				 		fi->frag = frag;
			
 
				-		fi->offset = fi->next_offset;
			
 
				 		fi->last_readdir = req;
			
 
				 
			
 
				 		if (req->r_did_prepopulate) {
			
@@ -363,7 +443,8 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
 
				 			if (fi->readdir_cache_idx < 0) {
			
 
				 				/* preclude from marking dir ordered */
			
 
				 				fi->dir_ordered_count = 0;
			
 
				-			} else if (ceph_frag_is_leftmost(frag) && off == 2) {
			
 
				+			} else if (ceph_frag_is_leftmost(frag) &&
			
 
				+				   fi->next_offset == 2) {
			
 
				 				/* note dir version at start of readdir so
			
 
				 				 * we can tell if any dentries get dropped */
			
 
				 				fi->dir_release_count = req->r_dir_release_cnt;
			
@@ -377,65 +458,87 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
 
				 			fi->dir_release_count = 0;
			
 
				 		}
			
 
				 
			
 
				-		if (req->r_reply_info.dir_end) {
			
 
				-			kfree(fi->last_name);
			
 
				-			fi->last_name = NULL;
			
 
				-			if (ceph_frag_is_rightmost(frag))
			
 
				-				fi->next_offset = 2;
			
 
				-			else
			
 
				-				fi->next_offset = 0;
			
 
				-		} else {
			
 
				-			err = note_last_dentry(fi,
			
 
				-				       rinfo->dir_dname[rinfo->dir_nr-1],
			
 
				-				       rinfo->dir_dname_len[rinfo->dir_nr-1],
			
 
				-				       fi->next_offset + rinfo->dir_nr);
			
 
				+		/* note next offset and last dentry name */
			
 
				+		if (rinfo->dir_nr > 0) {
			
 
				+			struct ceph_mds_reply_dir_entry *rde =
			
 
				+					rinfo->dir_entries + (rinfo->dir_nr-1);
			
 
				+			unsigned next_offset = req->r_reply_info.dir_end ?
			
 
				+					2 : (fpos_off(rde->offset) + 1);
			
 
				+			err = note_last_dentry(fi, rde->name, rde->name_len,
			
 
				+					       next_offset);
			
 
				 			if (err)
			
 
				 				return err;
			
 
				+		} else if (req->r_reply_info.dir_end) {
			
 
				+			fi->next_offset = 2;
			
 
				+			/* keep last name */
			
 
				 		}
			
 
				 	}
			
 
				 
			
 
				 	rinfo = &fi->last_readdir->r_reply_info;
			
 
				-	dout("readdir frag %x num %d off %d chunkoff %d\n", frag,
			
 
				-	     rinfo->dir_nr, off, fi->offset);
			
 
				-
			
 
				-	ctx->pos = ceph_make_fpos(frag, off);
			
 
				-	while (off >= fi->offset && off - fi->offset < rinfo->dir_nr) {
			
 
				-		struct ceph_mds_reply_inode *in =
			
 
				-			rinfo->dir_in[off - fi->offset].in;
			
 
				+	dout("readdir frag %x num %d pos %llx chunk first %llx\n",
			
 
				+	     fi->frag, rinfo->dir_nr, ctx->pos,
			
 
				+	     rinfo->dir_nr ? rinfo->dir_entries[0].offset : 0LL);
			
 
				+
			
 
				+	i = 0;
			
 
				+	/* search start position */
			
 
				+	if (rinfo->dir_nr > 0) {
			
 
				+		int step, nr = rinfo->dir_nr;
			
 
				+		while (nr > 0) {
			
 
				+			step = nr >> 1;
			
 
				+			if (rinfo->dir_entries[i + step].offset < ctx->pos) {
			
 
				+				i +=  step + 1;
			
 
				+				nr -= step + 1;
			
 
				+			} else {
			
 
				+				nr = step;
			
 
				+			}
			
 
				+		}
			
 
				+	}
			
 
				+	for (; i < rinfo->dir_nr; i++) {
			
 
				+		struct ceph_mds_reply_dir_entry *rde = rinfo->dir_entries + i;
			
 
				 		struct ceph_vino vino;
			
 
				 		ino_t ino;
			
 
				 
			
 
				-		dout("readdir off %d (%d/%d) -> %lld '%.*s' %p\n",
			
 
				-		     off, off - fi->offset, rinfo->dir_nr, ctx->pos,
			
 
				-		     rinfo->dir_dname_len[off - fi->offset],
			
 
				-		     rinfo->dir_dname[off - fi->offset], in);
			
 
				-		BUG_ON(!in);
			
 
				-		ftype = le32_to_cpu(in->mode) >> 12;
			
 
				-		vino.ino = le64_to_cpu(in->ino);
			
 
				-		vino.snap = le64_to_cpu(in->snapid);
			
 
				+		BUG_ON(rde->offset < ctx->pos);
			
 
				+
			
 
				+		ctx->pos = rde->offset;
			
 
				+		dout("readdir (%d/%d) -> %llx '%.*s' %p\n",
			
 
				+		     i, rinfo->dir_nr, ctx->pos,
			
 
				+		     rde->name_len, rde->name, &rde->inode.in);
			
 
				+
			
 
				+		BUG_ON(!rde->inode.in);
			
 
				+		ftype = le32_to_cpu(rde->inode.in->mode) >> 12;
			
 
				+		vino.ino = le64_to_cpu(rde->inode.in->ino);
			
 
				+		vino.snap = le64_to_cpu(rde->inode.in->snapid);
			
 
				 		ino = ceph_vino_to_ino(vino);
			
 
				-		if (!dir_emit(ctx,
			
 
				-			    rinfo->dir_dname[off - fi->offset],
			
 
				-			    rinfo->dir_dname_len[off - fi->offset],
			
 
				-			    ceph_translate_ino(inode->i_sb, ino), ftype)) {
			
 
				+
			
 
				+		if (!dir_emit(ctx, rde->name, rde->name_len,
			
 
				+			      ceph_translate_ino(inode->i_sb, ino), ftype)) {
			
 
				 			dout("filldir stopping us...\n");
			
 
				 			return 0;
			
 
				 		}
			
 
				-		off++;
			
 
				 		ctx->pos++;
			
 
				 	}
			
 
				 
			
 
				-	if (fi->last_name) {
			
 
				+	if (fi->next_offset > 2) {
			
 
				 		ceph_mdsc_put_request(fi->last_readdir);
			
 
				 		fi->last_readdir = NULL;
			
 
				 		goto more;
			
 
				 	}
			
 
				 
			
 
				 	/* more frags? */
			
 
				-	if (!ceph_frag_is_rightmost(frag)) {
			
 
				-		frag = ceph_frag_next(frag);
			
 
				-		off = 0;
			
 
				-		ctx->pos = ceph_make_fpos(frag, off);
			
 
				+	if (!ceph_frag_is_rightmost(fi->frag)) {
			
 
				+		unsigned frag = ceph_frag_next(fi->frag);
			
 
				+		if (is_hash_order(ctx->pos)) {
			
 
				+			loff_t new_pos = ceph_make_fpos(ceph_frag_value(frag),
			
 
				+							fi->next_offset, true);
			
 
				+			if (new_pos > ctx->pos)
			
 
				+				ctx->pos = new_pos;
			
 
				+			/* keep last_name */
			
 
				+		} else {
			
 
				+			ctx->pos = ceph_make_fpos(frag, fi->next_offset, false);
			
 
				+			kfree(fi->last_name);
			
 
				+			fi->last_name = NULL;
			
 
				+		}
			
 
				 		dout("readdir next frag is %x\n", frag);
			
 
				 		goto more;
			
 
				 	}
			
@@ -467,7 +570,7 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
 
				 	return 0;
			
 
				 }
			
 
				 
			
 
				-static void reset_readdir(struct ceph_file_info *fi, unsigned frag)
			
 
				+static void reset_readdir(struct ceph_file_info *fi)
			
 
				 {
			
 
				 	if (fi->last_readdir) {
			
 
				 		ceph_mdsc_put_request(fi->last_readdir);
			
@@ -477,18 +580,38 @@ static void reset_readdir(struct ceph_file_info *fi, unsigned frag)
 
				 	fi->last_name = NULL;
			
 
				 	fi->dir_release_count = 0;
			
 
				 	fi->readdir_cache_idx = -1;
			
 
				-	if (ceph_frag_is_leftmost(frag))
			
 
				-		fi->next_offset = 2;  /* compensate for . and .. */
			
 
				-	else
			
 
				-		fi->next_offset = 0;
			
 
				+	fi->next_offset = 2;  /* compensate for . and .. */
			
 
				 	fi->flags &= ~CEPH_F_ATEND;
			
 
				 }
			
 
				 
			
 
				+/*
			
 
				+ * discard buffered readdir content on seekdir(0), or seek to new frag,
			
 
				+ * or seek prior to current chunk
			
 
				+ */
			
 
				+static bool need_reset_readdir(struct ceph_file_info *fi, loff_t new_pos)
			
 
				+{
			
 
				+	struct ceph_mds_reply_info_parsed *rinfo;
			
 
				+	loff_t chunk_offset;
			
 
				+	if (new_pos == 0)
			
 
				+		return true;
			
 
				+	if (is_hash_order(new_pos)) {
			
 
				+		/* no need to reset last_name for a forward seek when
			
 
				+		 * dentries are sorted in hash order */
			
 
				+	} else if (fi->frag |= fpos_frag(new_pos)) {
			
 
				+		return true;
			
 
				+	}
			
 
				+	rinfo = fi->last_readdir ? &fi->last_readdir->r_reply_info : NULL;
			
 
				+	if (!rinfo || !rinfo->dir_nr)
			
 
				+		return true;
			
 
				+	chunk_offset = rinfo->dir_entries[0].offset;
			
 
				+	return new_pos < chunk_offset ||
			
 
				+	       is_hash_order(new_pos) != is_hash_order(chunk_offset);
			
 
				+}
			
 
				+
			
 
				 static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int whence)
			
 
				 {
			
 
				 	struct ceph_file_info *fi = file->private_data;
			
 
				 	struct inode *inode = file->f_mapping->host;
			
 
				-	loff_t old_offset = ceph_make_fpos(fi->frag, fi->next_offset);
			
 
				 	loff_t retval;
			
 
				 
			
 
				 	inode_lock(inode);
			
@@ -505,25 +628,22 @@ static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int whence)
 
				 	}
			
 
				 
			
 
				 	if (offset >= 0) {
			
 
				+		if (need_reset_readdir(fi, offset)) {
			
 
				+			dout("dir_llseek dropping %p content\n", file);
			
 
				+			reset_readdir(fi);
			
 
				+		} else if (is_hash_order(offset) && offset > file->f_pos) {
			
 
				+			/* for hash offset, we don't know if a forward seek
			
 
				+			 * is within same frag */
			
 
				+			fi->dir_release_count = 0;
			
 
				+			fi->readdir_cache_idx = -1;
			
 
				+		}
			
 
				+
			
 
				 		if (offset != file->f_pos) {
			
 
				 			file->f_pos = offset;
			
 
				 			file->f_version = 0;
			
 
				 			fi->flags &= ~CEPH_F_ATEND;
			
 
				 		}
			
 
				 		retval = offset;
			
 
				-
			
 
				-		if (offset == 0 ||
			
 
				-		    fpos_frag(offset) != fi->frag ||
			
 
				-		    fpos_off(offset) < fi->offset) {
			
 
				-			/* discard buffered readdir content on seekdir(0), or
			
 
				-			 * seek to new frag, or seek prior to current chunk */
			
 
				-			dout("dir_llseek dropping %p content\n", file);
			
 
				-			reset_readdir(fi, fpos_frag(offset));
			
 
				-		} else if (fpos_cmp(offset, old_offset) > 0) {
			
 
				-			/* reset dir_release_count if we did a forward seek */
			
 
				-			fi->dir_release_count = 0;
			
 
				-			fi->readdir_cache_idx = -1;
			
 
				-		}
			
 
				 	}
			
 
				 out:
			
 
				 	inode_unlock(inode);
			
@@ -591,7 +711,7 @@ struct dentry *ceph_finish_lookup(struct ceph_mds_request *req,
 
				 	return dentry;
			
 
				 }
			
 
				 
			
 
				-static int is_root_ceph_dentry(struct inode *inode, struct dentry *dentry)
			
 
				+static bool is_root_ceph_dentry(struct inode *inode, struct dentry *dentry)
			
 
				 {
			
 
				 	return ceph_ino(inode) == CEPH_INO_ROOT &&
			
 
				 		strncmp(dentry->d_name.name, ".ceph", 5) == 0;
			
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -191,6 +191,59 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode)
 
				 	return ret;
			
 
				 }
			
 
				 
			
 
				+/*
			
 
				+ * try to renew caps after the session gets killed.
			
 
				+ */
			
 
				+int ceph_renew_caps(struct inode *inode)
			
 
				+{
			
 
				+	struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
			
 
				+	struct ceph_inode_info *ci = ceph_inode(inode);
			
 
				+	struct ceph_mds_request *req;
			
 
				+	int err, flags, wanted;
			
 
				+
			
 
				+	spin_lock(&ci->i_ceph_lock);
			
 
				+	wanted = __ceph_caps_file_wanted(ci);
			
 
				+	if (__ceph_is_any_real_caps(ci) &&
			
 
				+	    (!(wanted & CEPH_CAP_ANY_WR) == 0 || ci->i_auth_cap)) {
			
 
				+		int issued = __ceph_caps_issued(ci, NULL);
			
 
				+		spin_unlock(&ci->i_ceph_lock);
			
 
				+		dout("renew caps %p want %s issued %s updating mds_wanted\n",
			
 
				+		     inode, ceph_cap_string(wanted), ceph_cap_string(issued));
			
 
				+		ceph_check_caps(ci, 0, NULL);
			
 
				+		return 0;
			
 
				+	}
			
 
				+	spin_unlock(&ci->i_ceph_lock);
			
 
				+
			
 
				+	flags = 0;
			
 
				+	if ((wanted & CEPH_CAP_FILE_RD) && (wanted & CEPH_CAP_FILE_WR))
			
 
				+		flags = O_RDWR;
			
 
				+	else if (wanted & CEPH_CAP_FILE_RD)
			
 
				+		flags = O_RDONLY;
			
 
				+	else if (wanted & CEPH_CAP_FILE_WR)
			
 
				+		flags = O_WRONLY;
			
 
				+#ifdef O_LAZY
			
 
				+	if (wanted & CEPH_CAP_FILE_LAZYIO)
			
 
				+		flags |= O_LAZY;
			
 
				+#endif
			
 
				+
			
 
				+	req = prepare_open_request(inode->i_sb, flags, 0);
			
 
				+	if (IS_ERR(req)) {
			
 
				+		err = PTR_ERR(req);
			
 
				+		goto out;
			
 
				+	}
			
 
				+
			
 
				+	req->r_inode = inode;
			
 
				+	ihold(inode);
			
 
				+	req->r_num_caps = 1;
			
 
				+	req->r_fmode = -1;
			
 
				+
			
 
				+	err = ceph_mdsc_do_request(mdsc, NULL, req);
			
 
				+	ceph_mdsc_put_request(req);
			
 
				+out:
			
 
				+	dout("renew caps %p open result=%d\n", inode, err);
			
 
				+	return err < 0 ? err : 0;
			
 
				+}
			
 
				+
			
 
				 /*
			
 
				  * If we already have the requisite capabilities, we can satisfy
			
 
				  * the open request locally (no need to request new caps from the
			
@@ -616,8 +669,7 @@ static void ceph_aio_complete(struct inode *inode,
 
				 	kfree(aio_req);
			
 
				 }
			
 
				 
			
 
				-static void ceph_aio_complete_req(struct ceph_osd_request *req,
			
 
				-				  struct ceph_msg *msg)
			
 
				+static void ceph_aio_complete_req(struct ceph_osd_request *req)
			
 
				 {
			
 
				 	int rc = req->r_result;
			
 
				 	struct inode *inode = req->r_inode;
			
@@ -714,14 +766,21 @@ static void ceph_aio_retry_work(struct work_struct *work)
 
				 	req->r_flags =	CEPH_OSD_FLAG_ORDERSNAP |
			
 
				 			CEPH_OSD_FLAG_ONDISK |
			
 
				 			CEPH_OSD_FLAG_WRITE;
			
 
				-	req->r_base_oloc = orig_req->r_base_oloc;
			
 
				-	req->r_base_oid = orig_req->r_base_oid;
			
 
				+	ceph_oloc_copy(&req->r_base_oloc, &orig_req->r_base_oloc);
			
 
				+	ceph_oid_copy(&req->r_base_oid, &orig_req->r_base_oid);
			
 
				+
			
 
				+	ret = ceph_osdc_alloc_messages(req, GFP_NOFS);
			
 
				+	if (ret) {
			
 
				+		ceph_osdc_put_request(req);
			
 
				+		req = orig_req;
			
 
				+		goto out;
			
 
				+	}
			
 
				 
			
 
				 	req->r_ops[0] = orig_req->r_ops[0];
			
 
				 	osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC, 0);
			
 
				 
			
 
				-	ceph_osdc_build_request(req, req->r_ops[0].extent.offset,
			
 
				-				snapc, CEPH_NOSNAP, &aio_req->mtime);
			
 
				+	req->r_mtime = aio_req->mtime;
			
 
				+	req->r_data_offset = req->r_ops[0].extent.offset;
			
 
				 
			
 
				 	ceph_osdc_put_request(orig_req);
			
 
				 
			
@@ -733,7 +792,7 @@ static void ceph_aio_retry_work(struct work_struct *work)
 
				 out:
			
 
				 	if (ret < 0) {
			
 
				 		req->r_result = ret;
			
 
				-		ceph_aio_complete_req(req, NULL);
			
 
				+		ceph_aio_complete_req(req);
			
 
				 	}
			
 
				 
			
 
				 	ceph_put_snap_context(snapc);
			
@@ -764,6 +823,8 @@ static void ceph_sync_write_unsafe(struct ceph_osd_request *req, bool unsafe)
 
				 		list_add_tail(&req->r_unsafe_item,
			
 
				 			      &ci->i_unsafe_writes);
			
 
				 		spin_unlock(&ci->i_unsafe_lock);
			
 
				+
			
 
				+		complete_all(&req->r_completion);
			
 
				 	} else {
			
 
				 		spin_lock(&ci->i_unsafe_lock);
			
 
				 		list_del_init(&req->r_unsafe_item);
			
@@ -875,14 +936,12 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
 
				 					(pos+len) | (PAGE_SIZE - 1));
			
 
				 
			
 
				 			osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC, 0);
			
 
				+			req->r_mtime = mtime;
			
 
				 		}
			
 
				 
			
 
				-
			
 
				 		osd_req_op_extent_osd_data_pages(req, 0, pages, len, start,
			
 
				 						 false, false);
			
 
				 
			
 
				-		ceph_osdc_build_request(req, pos, snapc, vino.snap, &mtime);
			
 
				-
			
 
				 		if (aio_req) {
			
 
				 			aio_req->total_len += len;
			
 
				 			aio_req->num_reqs++;
			
@@ -956,7 +1015,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
 
				 							      req, false);
			
 
				 			if (ret < 0) {
			
 
				 				req->r_result = ret;
			
 
				-				ceph_aio_complete_req(req, NULL);
			
 
				+				ceph_aio_complete_req(req);
			
 
				 			}
			
 
				 		}
			
 
				 		return -EIOCBQUEUED;
			
@@ -1067,9 +1126,7 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos,
 
				 		osd_req_op_extent_osd_data_pages(req, 0, pages, len, 0,
			
 
				 						false, true);
			
 
				 
			
 
				-		/* BUG_ON(vino.snap != CEPH_NOSNAP); */
			
 
				-		ceph_osdc_build_request(req, pos, snapc, vino.snap, &mtime);
			
 
				-
			
 
				+		req->r_mtime = mtime;
			
 
				 		ret = ceph_osdc_start_request(&fsc->client->osdc, req, false);
			
 
				 		if (!ret)
			
 
				 			ret = ceph_osdc_wait_request(&fsc->client->osdc, req);
			
@@ -1524,9 +1581,7 @@ static int ceph_zero_partial_object(struct inode *inode,
 
				 		goto out;
			
 
				 	}
			
 
				 
			
 
				-	ceph_osdc_build_request(req, offset, NULL, ceph_vino(inode).snap,
			
 
				-				&inode->i_mtime);
			
 
				-
			
 
				+	req->r_mtime = inode->i_mtime;
			
 
				 	ret = ceph_osdc_start_request(&fsc->client->osdc, req, false);
			
 
				 	if (!ret) {
			
 
				 		ret = ceph_osdc_wait_request(&fsc->client->osdc, req);
			
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -11,6 +11,7 @@
 
				 #include <linux/xattr.h>
			
 
				 #include <linux/posix_acl.h>
			
 
				 #include <linux/random.h>
			
 
				+#include <linux/sort.h>
			
 
				 
			
 
				 #include "super.h"
			
 
				 #include "mds_client.h"
			
@@ -254,6 +255,9 @@ static int ceph_fill_dirfrag(struct inode *inode,
 
				 		diri_auth = ci->i_auth_cap->mds;
			
 
				 	spin_unlock(&ci->i_ceph_lock);
			
 
				 
			
 
				+	if (mds == -1) /* CDIR_AUTH_PARENT */
			
 
				+		mds = diri_auth;
			
 
				+
			
 
				 	mutex_lock(&ci->i_fragtree_mutex);
			
 
				 	if (ndist == 0 && mds == diri_auth) {
			
 
				 		/* no delegation info needed. */
			
@@ -300,20 +304,38 @@ static int ceph_fill_dirfrag(struct inode *inode,
 
				 	return err;
			
 
				 }
			
 
				 
			
 
				+static int frag_tree_split_cmp(const void *l, const void *r)
			
 
				+{
			
 
				+	struct ceph_frag_tree_split *ls = (struct ceph_frag_tree_split*)l;
			
 
				+	struct ceph_frag_tree_split *rs = (struct ceph_frag_tree_split*)r;
			
 
				+	return ceph_frag_compare(ls->frag, rs->frag);
			
 
				+}
			
 
				+
			
 
				+static bool is_frag_child(u32 f, struct ceph_inode_frag *frag)
			
 
				+{
			
 
				+	if (!frag)
			
 
				+		return f == ceph_frag_make(0, 0);
			
 
				+	if (ceph_frag_bits(f) != ceph_frag_bits(frag->frag) + frag->split_by)
			
 
				+		return false;
			
 
				+	return ceph_frag_contains_value(frag->frag, ceph_frag_value(f));
			
 
				+}
			
 
				+
			
 
				 static int ceph_fill_fragtree(struct inode *inode,
			
 
				 			      struct ceph_frag_tree_head *fragtree,
			
 
				 			      struct ceph_mds_reply_dirfrag *dirinfo)
			
 
				 {
			
 
				 	struct ceph_inode_info *ci = ceph_inode(inode);
			
 
				-	struct ceph_inode_frag *frag;
			
 
				+	struct ceph_inode_frag *frag, *prev_frag = NULL;
			
 
				 	struct rb_node *rb_node;
			
 
				-	int i;
			
 
				-	u32 id, nsplits;
			
 
				+	unsigned i, split_by, nsplits;
			
 
				+	u32 id;
			
 
				 	bool update = false;
			
 
				 
			
 
				 	mutex_lock(&ci->i_fragtree_mutex);
			
 
				 	nsplits = le32_to_cpu(fragtree->nsplits);
			
 
				-	if (nsplits) {
			
 
				+	if (nsplits != ci->i_fragtree_nsplits) {
			
 
				+		update = true;
			
 
				+	} else if (nsplits) {
			
 
				 		i = prandom_u32() % nsplits;
			
 
				 		id = le32_to_cpu(fragtree->splits[i].frag);
			
 
				 		if (!__ceph_find_frag(ci, id))
			
@@ -332,10 +354,22 @@ static int ceph_fill_fragtree(struct inode *inode,
 
				 	if (!update)
			
 
				 		goto out_unlock;
			
 
				 
			
 
				+	if (nsplits > 1) {
			
 
				+		sort(fragtree->splits, nsplits, sizeof(fragtree->splits[0]),
			
 
				+		     frag_tree_split_cmp, NULL);
			
 
				+	}
			
 
				+
			
 
				 	dout("fill_fragtree %llx.%llx\n", ceph_vinop(inode));
			
 
				 	rb_node = rb_first(&ci->i_fragtree);
			
 
				 	for (i = 0; i < nsplits; i++) {
			
 
				 		id = le32_to_cpu(fragtree->splits[i].frag);
			
 
				+		split_by = le32_to_cpu(fragtree->splits[i].by);
			
 
				+		if (split_by == 0 || ceph_frag_bits(id) + split_by > 24) {
			
 
				+			pr_err("fill_fragtree %llx.%llx invalid split %d/%u, "
			
 
				+			       "frag %x split by %d\n", ceph_vinop(inode),
			
 
				+			       i, nsplits, id, split_by);
			
 
				+			continue;
			
 
				+		}
			
 
				 		frag = NULL;
			
 
				 		while (rb_node) {
			
 
				 			frag = rb_entry(rb_node, struct ceph_inode_frag, node);
			
@@ -347,8 +381,14 @@ static int ceph_fill_fragtree(struct inode *inode,
 
				 				break;
			
 
				 			}
			
 
				 			rb_node = rb_next(rb_node);
			
 
				-			rb_erase(&frag->node, &ci->i_fragtree);
			
 
				-			kfree(frag);
			
 
				+			/* delete stale split/leaf node */
			
 
				+			if (frag->split_by > 0 ||
			
 
				+			    !is_frag_child(frag->frag, prev_frag)) {
			
 
				+				rb_erase(&frag->node, &ci->i_fragtree);
			
 
				+				if (frag->split_by > 0)
			
 
				+					ci->i_fragtree_nsplits--;
			
 
				+				kfree(frag);
			
 
				+			}
			
 
				 			frag = NULL;
			
 
				 		}
			
 
				 		if (!frag) {
			
@@ -356,14 +396,23 @@ static int ceph_fill_fragtree(struct inode *inode,
 
				 			if (IS_ERR(frag))
			
 
				 				continue;
			
 
				 		}
			
 
				-		frag->split_by = le32_to_cpu(fragtree->splits[i].by);
			
 
				+		if (frag->split_by == 0)
			
 
				+			ci->i_fragtree_nsplits++;
			
 
				+		frag->split_by = split_by;
			
 
				 		dout(" frag %x split by %d\n", frag->frag, frag->split_by);
			
 
				+		prev_frag = frag;
			
 
				 	}
			
 
				 	while (rb_node) {
			
 
				 		frag = rb_entry(rb_node, struct ceph_inode_frag, node);
			
 
				 		rb_node = rb_next(rb_node);
			
 
				-		rb_erase(&frag->node, &ci->i_fragtree);
			
 
				-		kfree(frag);
			
 
				+		/* delete stale split/leaf node */
			
 
				+		if (frag->split_by > 0 ||
			
 
				+		    !is_frag_child(frag->frag, prev_frag)) {
			
 
				+			rb_erase(&frag->node, &ci->i_fragtree);
			
 
				+			if (frag->split_by > 0)
			
 
				+				ci->i_fragtree_nsplits--;
			
 
				+			kfree(frag);
			
 
				+		}
			
 
				 	}
			
 
				 out_unlock:
			
 
				 	mutex_unlock(&ci->i_fragtree_mutex);
			
@@ -513,6 +562,7 @@ void ceph_destroy_inode(struct inode *inode)
 
				 		rb_erase(n, &ci->i_fragtree);
			
 
				 		kfree(frag);
			
 
				 	}
			
 
				+	ci->i_fragtree_nsplits = 0;
			
 
				 
			
 
				 	__ceph_destroy_xattrs(ci);
			
 
				 	if (ci->i_xattrs.blob)
			
@@ -533,6 +583,11 @@ int ceph_drop_inode(struct inode *inode)
 
				 	return 1;
			
 
				 }
			
 
				 
			
 
				+static inline blkcnt_t calc_inode_blocks(u64 size)
			
 
				+{
			
 
				+	return (size + (1<<9) - 1) >> 9;
			
 
				+}
			
 
				+
			
 
				 /*
			
 
				  * Helpers to fill in size, ctime, mtime, and atime.  We have to be
			
 
				  * careful because either the client or MDS may have more up to date
			
@@ -555,7 +610,7 @@ int ceph_fill_file_size(struct inode *inode, int issued,
 
				 			size = 0;
			
 
				 		}
			
 
				 		i_size_write(inode, size);
			
 
				-		inode->i_blocks = (size + (1<<9) - 1) >> 9;
			
 
				+		inode->i_blocks = calc_inode_blocks(size);
			
 
				 		ci->i_reported_size = size;
			
 
				 		if (truncate_seq != ci->i_truncate_seq) {
			
 
				 			dout("truncate_seq %u -> %u\n",
			
@@ -814,9 +869,13 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
 
				 
			
 
				 			spin_unlock(&ci->i_ceph_lock);
			
 
				 
			
 
				-			err = -EINVAL;
			
 
				-			if (WARN_ON(symlen != i_size_read(inode)))
			
 
				-				goto out;
			
 
				+			if (symlen != i_size_read(inode)) {
			
 
				+				pr_err("fill_inode %llx.%llx BAD symlink "
			
 
				+					"size %lld\n", ceph_vinop(inode),
			
 
				+					i_size_read(inode));
			
 
				+				i_size_write(inode, symlen);
			
 
				+				inode->i_blocks = calc_inode_blocks(symlen);
			
 
				+			}
			
 
				 
			
 
				 			err = -ENOMEM;
			
 
				 			sym = kstrndup(iinfo->symlink, symlen, GFP_NOFS);
			
@@ -1309,12 +1368,13 @@ static int readdir_prepopulate_inodes_only(struct ceph_mds_request *req,
 
				 	int i, err = 0;
			
 
				 
			
 
				 	for (i = 0; i < rinfo->dir_nr; i++) {
			
 
				+		struct ceph_mds_reply_dir_entry *rde = rinfo->dir_entries + i;
			
 
				 		struct ceph_vino vino;
			
 
				 		struct inode *in;
			
 
				 		int rc;
			
 
				 
			
 
				-		vino.ino = le64_to_cpu(rinfo->dir_in[i].in->ino);
			
 
				-		vino.snap = le64_to_cpu(rinfo->dir_in[i].in->snapid);
			
 
				+		vino.ino = le64_to_cpu(rde->inode.in->ino);
			
 
				+		vino.snap = le64_to_cpu(rde->inode.in->snapid);
			
 
				 
			
 
				 		in = ceph_get_inode(req->r_dentry->d_sb, vino);
			
 
				 		if (IS_ERR(in)) {
			
@@ -1322,14 +1382,14 @@ static int readdir_prepopulate_inodes_only(struct ceph_mds_request *req,
 
				 			dout("new_inode badness got %d\n", err);
			
 
				 			continue;
			
 
				 		}
			
 
				-		rc = fill_inode(in, NULL, &rinfo->dir_in[i], NULL, session,
			
 
				+		rc = fill_inode(in, NULL, &rde->inode, NULL, session,
			
 
				 				req->r_request_started, -1,
			
 
				 				&req->r_caps_reservation);
			
 
				 		if (rc < 0) {
			
 
				 			pr_err("fill_inode badness on %p got %d\n", in, rc);
			
 
				 			err = rc;
			
 
				-			continue;
			
 
				 		}
			
 
				+		iput(in);
			
 
				 	}
			
 
				 
			
 
				 	return err;
			
@@ -1387,6 +1447,7 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req,
 
				 			     struct ceph_mds_session *session)
			
 
				 {
			
 
				 	struct dentry *parent = req->r_dentry;
			
 
				+	struct ceph_inode_info *ci = ceph_inode(d_inode(parent));
			
 
				 	struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info;
			
 
				 	struct qstr dname;
			
 
				 	struct dentry *dn;
			
@@ -1394,22 +1455,27 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req,
 
				 	int err = 0, skipped = 0, ret, i;
			
 
				 	struct inode *snapdir = NULL;
			
 
				 	struct ceph_mds_request_head *rhead = req->r_request->front.iov_base;
			
 
				-	struct ceph_dentry_info *di;
			
 
				 	u32 frag = le32_to_cpu(rhead->args.readdir.frag);
			
 
				+	u32 last_hash = 0;
			
 
				+	u32 fpos_offset;
			
 
				 	struct ceph_readdir_cache_control cache_ctl = {};
			
 
				 
			
 
				 	if (req->r_aborted)
			
 
				 		return readdir_prepopulate_inodes_only(req, session);
			
 
				 
			
 
				+	if (rinfo->hash_order && req->r_path2) {
			
 
				+		last_hash = ceph_str_hash(ci->i_dir_layout.dl_dir_hash,
			
 
				+					  req->r_path2, strlen(req->r_path2));
			
 
				+		last_hash = ceph_frag_value(last_hash);
			
 
				+	}
			
 
				+
			
 
				 	if (rinfo->dir_dir &&
			
 
				 	    le32_to_cpu(rinfo->dir_dir->frag) != frag) {
			
 
				 		dout("readdir_prepopulate got new frag %x -> %x\n",
			
 
				 		     frag, le32_to_cpu(rinfo->dir_dir->frag));
			
 
				 		frag = le32_to_cpu(rinfo->dir_dir->frag);
			
 
				-		if (ceph_frag_is_leftmost(frag))
			
 
				+		if (!rinfo->hash_order)
			
 
				 			req->r_readdir_offset = 2;
			
 
				-		else
			
 
				-			req->r_readdir_offset = 0;
			
 
				 	}
			
 
				 
			
 
				 	if (le32_to_cpu(rinfo->head->op) == CEPH_MDS_OP_LSSNAP) {
			
@@ -1427,24 +1493,37 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req,
 
				 	if (ceph_frag_is_leftmost(frag) && req->r_readdir_offset == 2) {
			
 
				 		/* note dir version at start of readdir so we can tell
			
 
				 		 * if any dentries get dropped */
			
 
				-		struct ceph_inode_info *ci = ceph_inode(d_inode(parent));
			
 
				 		req->r_dir_release_cnt = atomic64_read(&ci->i_release_count);
			
 
				 		req->r_dir_ordered_cnt = atomic64_read(&ci->i_ordered_count);
			
 
				 		req->r_readdir_cache_idx = 0;
			
 
				 	}
			
 
				 
			
 
				 	cache_ctl.index = req->r_readdir_cache_idx;
			
 
				+	fpos_offset = req->r_readdir_offset;
			
 
				 
			
 
				 	/* FIXME: release caps/leases if error occurs */
			
 
				 	for (i = 0; i < rinfo->dir_nr; i++) {
			
 
				+		struct ceph_mds_reply_dir_entry *rde = rinfo->dir_entries + i;
			
 
				 		struct ceph_vino vino;
			
 
				 
			
 
				-		dname.name = rinfo->dir_dname[i];
			
 
				-		dname.len = rinfo->dir_dname_len[i];
			
 
				+		dname.name = rde->name;
			
 
				+		dname.len = rde->name_len;
			
 
				 		dname.hash = full_name_hash(dname.name, dname.len);
			
 
				 
			
 
				-		vino.ino = le64_to_cpu(rinfo->dir_in[i].in->ino);
			
 
				-		vino.snap = le64_to_cpu(rinfo->dir_in[i].in->snapid);
			
 
				+		vino.ino = le64_to_cpu(rde->inode.in->ino);
			
 
				+		vino.snap = le64_to_cpu(rde->inode.in->snapid);
			
 
				+
			
 
				+		if (rinfo->hash_order) {
			
 
				+			u32 hash = ceph_str_hash(ci->i_dir_layout.dl_dir_hash,
			
 
				+						 rde->name, rde->name_len);
			
 
				+			hash = ceph_frag_value(hash);
			
 
				+			if (hash != last_hash)
			
 
				+				fpos_offset = 2;
			
 
				+			last_hash = hash;
			
 
				+			rde->offset = ceph_make_fpos(hash, fpos_offset++, true);
			
 
				+		} else {
			
 
				+			rde->offset = ceph_make_fpos(frag, fpos_offset++, false);
			
 
				+		}
			
 
				 
			
 
				 retry_lookup:
			
 
				 		dn = d_lookup(parent, &dname);
			
@@ -1490,7 +1569,7 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req,
 
				 			}
			
 
				 		}
			
 
				 
			
 
				-		ret = fill_inode(in, NULL, &rinfo->dir_in[i], NULL, session,
			
 
				+		ret = fill_inode(in, NULL, &rde->inode, NULL, session,
			
 
				 				 req->r_request_started, -1,
			
 
				 				 &req->r_caps_reservation);
			
 
				 		if (ret < 0) {
			
@@ -1523,11 +1602,9 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req,
 
				 			dn = realdn;
			
 
				 		}
			
 
				 
			
 
				-		di = dn->d_fsdata;
			
 
				-		di->offset = ceph_make_fpos(frag, i + req->r_readdir_offset);
			
 
				+		ceph_dentry(dn)->offset = rde->offset;
			
 
				 
			
 
				-		update_dentry_lease(dn, rinfo->dir_dlease[i],
			
 
				-				    req->r_session,
			
 
				+		update_dentry_lease(dn, rde->lease, req->r_session,
			
 
				 				    req->r_request_started);
			
 
				 
			
 
				 		if (err == 0 && skipped == 0 && cache_ctl.index >= 0) {
			
@@ -1562,7 +1639,7 @@ int ceph_inode_set_size(struct inode *inode, loff_t size)
 
				 	spin_lock(&ci->i_ceph_lock);
			
 
				 	dout("set_size %p %llu -> %llu\n", inode, inode->i_size, size);
			
 
				 	i_size_write(inode, size);
			
 
				-	inode->i_blocks = (size + (1 << 9) - 1) >> 9;
			
 
				+	inode->i_blocks = calc_inode_blocks(size);
			
 
				 
			
 
				 	/* tell the MDS if we are approaching max_size */
			
 
				 	if ((size << 1) >= ci->i_max_size &&
			
@@ -1624,10 +1701,21 @@ static void ceph_invalidate_work(struct work_struct *work)
 
				 	struct ceph_inode_info *ci = container_of(work, struct ceph_inode_info,
			
 
				 						  i_pg_inv_work);
			
 
				 	struct inode *inode = &ci->vfs_inode;
			
 
				+	struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
			
 
				 	u32 orig_gen;
			
 
				 	int check = 0;
			
 
				 
			
 
				 	mutex_lock(&ci->i_truncate_mutex);
			
 
				+
			
 
				+	if (ACCESS_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) {
			
 
				+		pr_warn_ratelimited("invalidate_pages %p %lld forced umount\n",
			
 
				+				    inode, ceph_ino(inode));
			
 
				+		mapping_set_error(inode->i_mapping, -EIO);
			
 
				+		truncate_pagecache(inode, 0);
			
 
				+		mutex_unlock(&ci->i_truncate_mutex);
			
 
				+		goto out;
			
 
				+	}
			
 
				+
			
 
				 	spin_lock(&ci->i_ceph_lock);
			
 
				 	dout("invalidate_pages %p gen %d revoking %d\n", inode,
			
 
				 	     ci->i_rdcache_gen, ci->i_rdcache_revoking);
			
@@ -1641,7 +1729,9 @@ static void ceph_invalidate_work(struct work_struct *work)
 
				 	orig_gen = ci->i_rdcache_gen;
			
 
				 	spin_unlock(&ci->i_ceph_lock);
			
 
				 
			
 
				-	truncate_pagecache(inode, 0);
			
 
				+	if (invalidate_inode_pages2(inode->i_mapping) < 0) {
			
 
				+		pr_err("invalidate_pages %p fails\n", inode);
			
 
				+	}
			
 
				 
			
 
				 	spin_lock(&ci->i_ceph_lock);
			
 
				 	if (orig_gen == ci->i_rdcache_gen &&
			
@@ -1920,8 +2010,7 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr)
 
				 		if ((issued & CEPH_CAP_FILE_EXCL) &&
			
 
				 		    attr->ia_size > inode->i_size) {
			
 
				 			i_size_write(inode, attr->ia_size);
			
 
				-			inode->i_blocks =
			
 
				-				(attr->ia_size + (1 << 9) - 1) >> 9;
			
 
				+			inode->i_blocks = calc_inode_blocks(attr->ia_size);
			
 
				 			inode->i_ctime = attr->ia_ctime;
			
 
				 			ci->i_reported_size = attr->ia_size;
			
 
				 			dirtied |= CEPH_CAP_FILE_EXCL;
			
--- a/fs/ceph/ioctl.c
+++ b/fs/ceph/ioctl.c
@@ -193,12 +193,12 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg)
 
				 	if (copy_from_user(&dl, arg, sizeof(dl)))
			
 
				 		return -EFAULT;
			
 
				 
			
 
				-	down_read(&osdc->map_sem);
			
 
				+	down_read(&osdc->lock);
			
 
				 	r = ceph_calc_file_object_mapping(&ci->i_layout, dl.file_offset, len,
			
 
				 					  &dl.object_no, &dl.object_offset,
			
 
				 					  &olen);
			
 
				 	if (r < 0) {
			
 
				-		up_read(&osdc->map_sem);
			
 
				+		up_read(&osdc->lock);
			
 
				 		return -EIO;
			
 
				 	}
			
 
				 	dl.file_offset -= dl.object_offset;
			
@@ -213,15 +213,15 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg)
 
				 		 ceph_ino(inode), dl.object_no);
			
 
				 
			
 
				 	oloc.pool = ceph_file_layout_pg_pool(ci->i_layout);
			
 
				-	ceph_oid_set_name(&oid, dl.object_name);
			
 
				+	ceph_oid_printf(&oid, "%s", dl.object_name);
			
 
				 
			
 
				-	r = ceph_oloc_oid_to_pg(osdc->osdmap, &oloc, &oid, &pgid);
			
 
				+	r = ceph_object_locator_to_pg(osdc->osdmap, &oid, &oloc, &pgid);
			
 
				 	if (r < 0) {
			
 
				-		up_read(&osdc->map_sem);
			
 
				+		up_read(&osdc->lock);
			
 
				 		return r;
			
 
				 	}
			
 
				 
			
 
				-	dl.osd = ceph_calc_pg_primary(osdc->osdmap, pgid);
			
 
				+	dl.osd = ceph_pg_to_acting_primary(osdc->osdmap, &pgid);
			
 
				 	if (dl.osd >= 0) {
			
 
				 		struct ceph_entity_addr *a =
			
 
				 			ceph_osd_addr(osdc->osdmap, dl.osd);
			
@@ -230,7 +230,7 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg)
 
				 	} else {
			
 
				 		memset(&dl.osd_addr, 0, sizeof(dl.osd_addr));
			
 
				 	}
			
 
				-	up_read(&osdc->map_sem);
			
 
				+	up_read(&osdc->lock);
			
 
				 
			
 
				 	/* send result back to user */
			
 
				 	if (copy_to_user(arg, &dl, sizeof(dl)))
			
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -181,17 +181,18 @@ static int parse_reply_info_dir(void **p, void *end,
 
				 
			
 
				 	ceph_decode_need(p, end, sizeof(num) + 2, bad);
			
 
				 	num = ceph_decode_32(p);
			
 
				-	info->dir_end = ceph_decode_8(p);
			
 
				-	info->dir_complete = ceph_decode_8(p);
			
 
				+	{
			
 
				+		u16 flags = ceph_decode_16(p);
			
 
				+		info->dir_end = !!(flags & CEPH_READDIR_FRAG_END);
			
 
				+		info->dir_complete = !!(flags & CEPH_READDIR_FRAG_COMPLETE);
			
 
				+		info->hash_order = !!(flags & CEPH_READDIR_HASH_ORDER);
			
 
				+	}
			
 
				 	if (num == 0)
			
 
				 		goto done;
			
 
				 
			
 
				-	BUG_ON(!info->dir_in);
			
 
				-	info->dir_dname = (void *)(info->dir_in + num);
			
 
				-	info->dir_dname_len = (void *)(info->dir_dname + num);
			
 
				-	info->dir_dlease = (void *)(info->dir_dname_len + num);
			
 
				-	if ((unsigned long)(info->dir_dlease + num) >
			
 
				-	    (unsigned long)info->dir_in + info->dir_buf_size) {
			
 
				+	BUG_ON(!info->dir_entries);
			
 
				+	if ((unsigned long)(info->dir_entries + num) >
			
 
				+	    (unsigned long)info->dir_entries + info->dir_buf_size) {
			
 
				 		pr_err("dir contents are larger than expected\n");
			
 
				 		WARN_ON(1);
			
 
				 		goto bad;
			
@@ -199,21 +200,23 @@ static int parse_reply_info_dir(void **p, void *end,
 
				 
			
 
				 	info->dir_nr = num;
			
 
				 	while (num) {
			
 
				+		struct ceph_mds_reply_dir_entry *rde = info->dir_entries + i;
			
 
				 		/* dentry */
			
 
				 		ceph_decode_need(p, end, sizeof(u32)*2, bad);
			
 
				-		info->dir_dname_len[i] = ceph_decode_32(p);
			
 
				-		ceph_decode_need(p, end, info->dir_dname_len[i], bad);
			
 
				-		info->dir_dname[i] = *p;
			
 
				-		*p += info->dir_dname_len[i];
			
 
				-		dout("parsed dir dname '%.*s'\n", info->dir_dname_len[i],
			
 
				-		     info->dir_dname[i]);
			
 
				-		info->dir_dlease[i] = *p;
			
 
				+		rde->name_len = ceph_decode_32(p);
			
 
				+		ceph_decode_need(p, end, rde->name_len, bad);
			
 
				+		rde->name = *p;
			
 
				+		*p += rde->name_len;
			
 
				+		dout("parsed dir dname '%.*s'\n", rde->name_len, rde->name);
			
 
				+		rde->lease = *p;
			
 
				 		*p += sizeof(struct ceph_mds_reply_lease);
			
 
				 
			
 
				 		/* inode */
			
 
				-		err = parse_reply_info_in(p, end, &info->dir_in[i], features);
			
 
				+		err = parse_reply_info_in(p, end, &rde->inode, features);
			
 
				 		if (err < 0)
			
 
				 			goto out_bad;
			
 
				+		/* ceph_readdir_prepopulate() will update it */
			
 
				+		rde->offset = 0;
			
 
				 		i++;
			
 
				 		num--;
			
 
				 	}
			
@@ -345,9 +348,9 @@ static int parse_reply_info(struct ceph_msg *msg,
 
				 
			
 
				 static void destroy_reply_info(struct ceph_mds_reply_info_parsed *info)
			
 
				 {
			
 
				-	if (!info->dir_in)
			
 
				+	if (!info->dir_entries)
			
 
				 		return;
			
 
				-	free_pages((unsigned long)info->dir_in, get_order(info->dir_buf_size));
			
 
				+	free_pages((unsigned long)info->dir_entries, get_order(info->dir_buf_size));
			
 
				 }
			
 
				 
			
 
				 
			
@@ -567,51 +570,23 @@ void ceph_mdsc_release_request(struct kref *kref)
 
				 	kfree(req);
			
 
				 }
			
 
				 
			
 
				+DEFINE_RB_FUNCS(request, struct ceph_mds_request, r_tid, r_node)
			
 
				+
			
 
				 /*
			
 
				  * lookup session, bump ref if found.
			
 
				  *
			
 
				  * called under mdsc->mutex.
			
 
				  */
			
 
				-static struct ceph_mds_request *__lookup_request(struct ceph_mds_client *mdsc,
			
 
				-					     u64 tid)
			
 
				+static struct ceph_mds_request *
			
 
				+lookup_get_request(struct ceph_mds_client *mdsc, u64 tid)
			
 
				 {
			
 
				 	struct ceph_mds_request *req;
			
 
				-	struct rb_node *n = mdsc->request_tree.rb_node;
			
 
				-
			
 
				-	while (n) {
			
 
				-		req = rb_entry(n, struct ceph_mds_request, r_node);
			
 
				-		if (tid < req->r_tid)
			
 
				-			n = n->rb_left;
			
 
				-		else if (tid > req->r_tid)
			
 
				-			n = n->rb_right;
			
 
				-		else {
			
 
				-			ceph_mdsc_get_request(req);
			
 
				-			return req;
			
 
				-		}
			
 
				-	}
			
 
				-	return NULL;
			
 
				-}
			
 
				 
			
 
				-static void __insert_request(struct ceph_mds_client *mdsc,
			
 
				-			     struct ceph_mds_request *new)
			
 
				-{
			
 
				-	struct rb_node **p = &mdsc->request_tree.rb_node;
			
 
				-	struct rb_node *parent = NULL;
			
 
				-	struct ceph_mds_request *req = NULL;
			
 
				+	req = lookup_request(&mdsc->request_tree, tid);
			
 
				+	if (req)
			
 
				+		ceph_mdsc_get_request(req);
			
 
				 
			
 
				-	while (*p) {
			
 
				-		parent = *p;
			
 
				-		req = rb_entry(parent, struct ceph_mds_request, r_node);
			
 
				-		if (new->r_tid < req->r_tid)
			
 
				-			p = &(*p)->rb_left;
			
 
				-		else if (new->r_tid > req->r_tid)
			
 
				-			p = &(*p)->rb_right;
			
 
				-		else
			
 
				-			BUG();
			
 
				-	}
			
 
				-
			
 
				-	rb_link_node(&new->r_node, parent, p);
			
 
				-	rb_insert_color(&new->r_node, &mdsc->request_tree);
			
 
				+	return req;
			
 
				 }
			
 
				 
			
 
				 /*
			
@@ -630,7 +605,7 @@ static void __register_request(struct ceph_mds_client *mdsc,
 
				 				  req->r_num_caps);
			
 
				 	dout("__register_request %p tid %lld\n", req, req->r_tid);
			
 
				 	ceph_mdsc_get_request(req);
			
 
				-	__insert_request(mdsc, req);
			
 
				+	insert_request(&mdsc->request_tree, req);
			
 
				 
			
 
				 	req->r_uid = current_fsuid();
			
 
				 	req->r_gid = current_fsgid();
			
@@ -663,8 +638,7 @@ static void __unregister_request(struct ceph_mds_client *mdsc,
 
				 		}
			
 
				 	}
			
 
				 
			
 
				-	rb_erase(&req->r_node, &mdsc->request_tree);
			
 
				-	RB_CLEAR_NODE(&req->r_node);
			
 
				+	erase_request(&mdsc->request_tree, req);
			
 
				 
			
 
				 	if (req->r_unsafe_dir && req->r_got_unsafe) {
			
 
				 		struct ceph_inode_info *ci = ceph_inode(req->r_unsafe_dir);
			
@@ -868,12 +842,14 @@ static struct ceph_msg *create_session_open_msg(struct ceph_mds_client *mdsc, u6
 
				 	int metadata_bytes = 0;
			
 
				 	int metadata_key_count = 0;
			
 
				 	struct ceph_options *opt = mdsc->fsc->client->options;
			
 
				+	struct ceph_mount_options *fsopt = mdsc->fsc->mount_options;
			
 
				 	void *p;
			
 
				 
			
 
				 	const char* metadata[][2] = {
			
 
				 		{"hostname", utsname()->nodename},
			
 
				 		{"kernel_version", utsname()->release},
			
 
				-		{"entity_id", opt->name ? opt->name : ""},
			
 
				+		{"entity_id", opt->name ? : ""},
			
 
				+		{"root", fsopt->server_path ? : "/"},
			
 
				 		{NULL, NULL}
			
 
				 	};
			
 
				 
			
@@ -1149,9 +1125,11 @@ static int iterate_session_caps(struct ceph_mds_session *session,
 
				 static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
			
 
				 				  void *arg)
			
 
				 {
			
 
				+	struct ceph_fs_client *fsc = (struct ceph_fs_client *)arg;
			
 
				 	struct ceph_inode_info *ci = ceph_inode(inode);
			
 
				 	LIST_HEAD(to_remove);
			
 
				-	int drop = 0;
			
 
				+	bool drop = false;
			
 
				+	bool invalidate = false;
			
 
				 
			
 
				 	dout("removing cap %p, ci is %p, inode is %p\n",
			
 
				 	     cap, ci, &ci->vfs_inode);
			
@@ -1159,8 +1137,13 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
 
				 	__ceph_remove_cap(cap, false);
			
 
				 	if (!ci->i_auth_cap) {
			
 
				 		struct ceph_cap_flush *cf;
			
 
				-		struct ceph_mds_client *mdsc =
			
 
				-			ceph_sb_to_client(inode->i_sb)->mdsc;
			
 
				+		struct ceph_mds_client *mdsc = fsc->mdsc;
			
 
				+
			
 
				+		ci->i_ceph_flags |= CEPH_I_CAP_DROPPED;
			
 
				+
			
 
				+		if (ci->i_wrbuffer_ref > 0 &&
			
 
				+		    ACCESS_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN)
			
 
				+			invalidate = true;
			
 
				 
			
 
				 		while (true) {
			
 
				 			struct rb_node *n = rb_first(&ci->i_cap_flush_tree);
			
@@ -1183,7 +1166,7 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
 
				 				inode, ceph_ino(inode));
			
 
				 			ci->i_dirty_caps = 0;
			
 
				 			list_del_init(&ci->i_dirty_item);
			
 
				-			drop = 1;
			
 
				+			drop = true;
			
 
				 		}
			
 
				 		if (!list_empty(&ci->i_flushing_item)) {
			
 
				 			pr_warn_ratelimited(
			
@@ -1193,7 +1176,7 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
 
				 			ci->i_flushing_caps = 0;
			
 
				 			list_del_init(&ci->i_flushing_item);
			
 
				 			mdsc->num_cap_flushing--;
			
 
				-			drop = 1;
			
 
				+			drop = true;
			
 
				 		}
			
 
				 		spin_unlock(&mdsc->cap_dirty_lock);
			
 
				 
			
@@ -1210,7 +1193,11 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
 
				 		list_del(&cf->list);
			
 
				 		ceph_free_cap_flush(cf);
			
 
				 	}
			
 
				-	while (drop--)
			
 
				+
			
 
				+	wake_up_all(&ci->i_cap_wq);
			
 
				+	if (invalidate)
			
 
				+		ceph_queue_invalidate(inode);
			
 
				+	if (drop)
			
 
				 		iput(inode);
			
 
				 	return 0;
			
 
				 }
			
@@ -1220,12 +1207,13 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
 
				  */
			
 
				 static void remove_session_caps(struct ceph_mds_session *session)
			
 
				 {
			
 
				+	struct ceph_fs_client *fsc = session->s_mdsc->fsc;
			
 
				+	struct super_block *sb = fsc->sb;
			
 
				 	dout("remove_session_caps on %p\n", session);
			
 
				-	iterate_session_caps(session, remove_session_caps_cb, NULL);
			
 
				+	iterate_session_caps(session, remove_session_caps_cb, fsc);
			
 
				 
			
 
				 	spin_lock(&session->s_cap_lock);
			
 
				 	if (session->s_nr_caps > 0) {
			
 
				-		struct super_block *sb = session->s_mdsc->fsc->sb;
			
 
				 		struct inode *inode;
			
 
				 		struct ceph_cap *cap, *prev = NULL;
			
 
				 		struct ceph_vino vino;
			
@@ -1270,13 +1258,13 @@ static int wake_up_session_cb(struct inode *inode, struct ceph_cap *cap,
 
				 {
			
 
				 	struct ceph_inode_info *ci = ceph_inode(inode);
			
 
				 
			
 
				-	wake_up_all(&ci->i_cap_wq);
			
 
				 	if (arg) {
			
 
				 		spin_lock(&ci->i_ceph_lock);
			
 
				 		ci->i_wanted_max_size = 0;
			
 
				 		ci->i_requested_max_size = 0;
			
 
				 		spin_unlock(&ci->i_ceph_lock);
			
 
				 	}
			
 
				+	wake_up_all(&ci->i_cap_wq);
			
 
				 	return 0;
			
 
				 }
			
 
				 
			
@@ -1671,8 +1659,7 @@ int ceph_alloc_readdir_reply_buffer(struct ceph_mds_request *req,
 
				 	struct ceph_inode_info *ci = ceph_inode(dir);
			
 
				 	struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info;
			
 
				 	struct ceph_mount_options *opt = req->r_mdsc->fsc->mount_options;
			
 
				-	size_t size = sizeof(*rinfo->dir_in) + sizeof(*rinfo->dir_dname_len) +
			
 
				-		      sizeof(*rinfo->dir_dname) + sizeof(*rinfo->dir_dlease);
			
 
				+	size_t size = sizeof(struct ceph_mds_reply_dir_entry);
			
 
				 	int order, num_entries;
			
 
				 
			
 
				 	spin_lock(&ci->i_ceph_lock);
			
@@ -1683,14 +1670,14 @@ int ceph_alloc_readdir_reply_buffer(struct ceph_mds_request *req,
 
				 
			
 
				 	order = get_order(size * num_entries);
			
 
				 	while (order >= 0) {
			
 
				-		rinfo->dir_in = (void*)__get_free_pages(GFP_KERNEL |
			
 
				-							__GFP_NOWARN,
			
 
				-							order);
			
 
				-		if (rinfo->dir_in)
			
 
				+		rinfo->dir_entries = (void*)__get_free_pages(GFP_KERNEL |
			
 
				+							     __GFP_NOWARN,
			
 
				+							     order);
			
 
				+		if (rinfo->dir_entries)
			
 
				 			break;
			
 
				 		order--;
			
 
				 	}
			
 
				-	if (!rinfo->dir_in)
			
 
				+	if (!rinfo->dir_entries)
			
 
				 		return -ENOMEM;
			
 
				 
			
 
				 	num_entries = (PAGE_SIZE << order) / size;
			
@@ -1722,6 +1709,7 @@ ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode)
 
				 	INIT_LIST_HEAD(&req->r_unsafe_target_item);
			
 
				 	req->r_fmode = -1;
			
 
				 	kref_init(&req->r_kref);
			
 
				+	RB_CLEAR_NODE(&req->r_node);
			
 
				 	INIT_LIST_HEAD(&req->r_wait);
			
 
				 	init_completion(&req->r_completion);
			
 
				 	init_completion(&req->r_safe_completion);
			
@@ -2414,7 +2402,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
 
				 	/* get request, session */
			
 
				 	tid = le64_to_cpu(msg->hdr.tid);
			
 
				 	mutex_lock(&mdsc->mutex);
			
 
				-	req = __lookup_request(mdsc, tid);
			
 
				+	req = lookup_get_request(mdsc, tid);
			
 
				 	if (!req) {
			
 
				 		dout("handle_reply on unknown tid %llu\n", tid);
			
 
				 		mutex_unlock(&mdsc->mutex);
			
@@ -2604,7 +2592,7 @@ static void handle_forward(struct ceph_mds_client *mdsc,
 
				 	fwd_seq = ceph_decode_32(&p);
			
 
				 
			
 
				 	mutex_lock(&mdsc->mutex);
			
 
				-	req = __lookup_request(mdsc, tid);
			
 
				+	req = lookup_get_request(mdsc, tid);
			
 
				 	if (!req) {
			
 
				 		dout("forward tid %llu to mds%d - req dne\n", tid, next_mds);
			
 
				 		goto out;  /* dup reply? */
			
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -47,6 +47,14 @@ struct ceph_mds_reply_info_in {
 
				 	u32 pool_ns_len;
			
 
				 };
			
 
				 
			
 
				+struct ceph_mds_reply_dir_entry {
			
 
				+	char                          *name;
			
 
				+	u32                           name_len;
			
 
				+	struct ceph_mds_reply_lease   *lease;
			
 
				+	struct ceph_mds_reply_info_in inode;
			
 
				+	loff_t			      offset;
			
 
				+};
			
 
				+
			
 
				 /*
			
 
				  * parsed info about an mds reply, including information about
			
 
				  * either: 1) the target inode and/or its parent directory and dentry,
			
@@ -73,11 +81,10 @@ struct ceph_mds_reply_info_parsed {
 
				 			struct ceph_mds_reply_dirfrag *dir_dir;
			
 
				 			size_t			      dir_buf_size;
			
 
				 			int                           dir_nr;
			
 
				-			char                          **dir_dname;
			
 
				-			u32                           *dir_dname_len;
			
 
				-			struct ceph_mds_reply_lease   **dir_dlease;
			
 
				-			struct ceph_mds_reply_info_in *dir_in;
			
 
				-			u8                            dir_complete, dir_end;
			
 
				+			bool			      dir_complete;
			
 
				+			bool			      dir_end;
			
 
				+			bool			      hash_order;
			
 
				+			struct ceph_mds_reply_dir_entry  *dir_entries;
			
 
				 		};
			
 
				 
			
 
				 		/* for create results */
			
--- a/fs/ceph/mdsmap.c
+++ b/fs/ceph/mdsmap.c
@@ -54,16 +54,21 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
 
				 	const void *start = *p;
			
 
				 	int i, j, n;
			
 
				 	int err = -EINVAL;
			
 
				-	u16 version;
			
 
				+	u8 mdsmap_v, mdsmap_cv;
			
 
				 
			
 
				 	m = kzalloc(sizeof(*m), GFP_NOFS);
			
 
				 	if (m == NULL)
			
 
				 		return ERR_PTR(-ENOMEM);
			
 
				 
			
 
				-	ceph_decode_16_safe(p, end, version, bad);
			
 
				-	if (version > 3) {
			
 
				-		pr_warn("got mdsmap version %d > 3, failing", version);
			
 
				-		goto bad;
			
 
				+	ceph_decode_need(p, end, 1 + 1, bad);
			
 
				+	mdsmap_v = ceph_decode_8(p);
			
 
				+	mdsmap_cv = ceph_decode_8(p);
			
 
				+	if (mdsmap_v >= 4) {
			
 
				+	       u32 mdsmap_len;
			
 
				+	       ceph_decode_32_safe(p, end, mdsmap_len, bad);
			
 
				+	       if (end < *p + mdsmap_len)
			
 
				+		       goto bad;
			
 
				+	       end = *p + mdsmap_len;
			
 
				 	}
			
 
				 
			
 
				 	ceph_decode_need(p, end, 8*sizeof(u32) + sizeof(u64), bad);
			
@@ -87,16 +92,29 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
 
				 		u32 namelen;
			
 
				 		s32 mds, inc, state;
			
 
				 		u64 state_seq;
			
 
				-		u8 infoversion;
			
 
				+		u8 info_v;
			
 
				+		void *info_end = NULL;
			
 
				 		struct ceph_entity_addr addr;
			
 
				 		u32 num_export_targets;
			
 
				 		void *pexport_targets = NULL;
			
 
				 		struct ceph_timespec laggy_since;
			
 
				 		struct ceph_mds_info *info;
			
 
				 
			
 
				-		ceph_decode_need(p, end, sizeof(u64)*2 + 1 + sizeof(u32), bad);
			
 
				+		ceph_decode_need(p, end, sizeof(u64) + 1, bad);
			
 
				 		global_id = ceph_decode_64(p);
			
 
				-		infoversion = ceph_decode_8(p);
			
 
				+		info_v= ceph_decode_8(p);
			
 
				+		if (info_v >= 4) {
			
 
				+			u32 info_len;
			
 
				+			u8 info_cv;
			
 
				+			ceph_decode_need(p, end, 1 + sizeof(u32), bad);
			
 
				+			info_cv = ceph_decode_8(p);
			
 
				+			info_len = ceph_decode_32(p);
			
 
				+			info_end = *p + info_len;
			
 
				+			if (info_end > end)
			
 
				+				goto bad;
			
 
				+		}
			
 
				+
			
 
				+		ceph_decode_need(p, end, sizeof(u64) + sizeof(u32), bad);
			
 
				 		*p += sizeof(u64);
			
 
				 		namelen = ceph_decode_32(p);  /* skip mds name */
			
 
				 		*p += namelen;
			
@@ -115,7 +133,7 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
 
				 		*p += sizeof(u32);
			
 
				 		ceph_decode_32_safe(p, end, namelen, bad);
			
 
				 		*p += namelen;
			
 
				-		if (infoversion >= 2) {
			
 
				+		if (info_v >= 2) {
			
 
				 			ceph_decode_32_safe(p, end, num_export_targets, bad);
			
 
				 			pexport_targets = *p;
			
 
				 			*p += num_export_targets * sizeof(u32);
			
@@ -123,6 +141,12 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
 
				 			num_export_targets = 0;
			
 
				 		}
			
 
				 
			
 
				+		if (info_end && *p != info_end) {
			
 
				+			if (*p > info_end)
			
 
				+				goto bad;
			
 
				+			*p = info_end;
			
 
				+		}
			
 
				+
			
 
				 		dout("mdsmap_decode %d/%d %lld mds%d.%d %s %s\n",
			
 
				 		     i+1, n, global_id, mds, inc,
			
 
				 		     ceph_pr_addr(&addr.in_addr),
			
@@ -163,6 +187,7 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
 
				 	m->m_cas_pg_pool = ceph_decode_64(p);
			
 
				 
			
 
				 	/* ok, we don't care about the rest. */
			
 
				+	*p = end;
			
 
				 	dout("mdsmap_decode success epoch %u\n", m->m_epoch);
			
 
				 	return m;
			
 
				 
			
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -108,6 +108,7 @@ static int ceph_sync_fs(struct super_block *sb, int wait)
 
				  * mount options
			
 
				  */
			
 
				 enum {
			
 
				+	Opt_mds_namespace,
			
 
				 	Opt_wsize,
			
 
				 	Opt_rsize,
			
 
				 	Opt_rasize,
			
@@ -143,6 +144,7 @@ enum {
 
				 };
			
 
				 
			
 
				 static match_table_t fsopt_tokens = {
			
 
				+	{Opt_mds_namespace, "mds_namespace=%d"},
			
 
				 	{Opt_wsize, "wsize=%d"},
			
 
				 	{Opt_rsize, "rsize=%d"},
			
 
				 	{Opt_rasize, "rasize=%d"},
			
@@ -212,6 +214,9 @@ static int parse_fsopt_token(char *c, void *private)
 
				 		break;
			
 
				 
			
 
				 		/* misc */
			
 
				+	case Opt_mds_namespace:
			
 
				+		fsopt->mds_namespace = intval;
			
 
				+		break;
			
 
				 	case Opt_wsize:
			
 
				 		fsopt->wsize = intval;
			
 
				 		break;
			
@@ -297,6 +302,7 @@ static void destroy_mount_options(struct ceph_mount_options *args)
 
				 {
			
 
				 	dout("destroy_mount_options %p\n", args);
			
 
				 	kfree(args->snapdir_name);
			
 
				+	kfree(args->server_path);
			
 
				 	kfree(args);
			
 
				 }
			
 
				 
			
@@ -328,14 +334,17 @@ static int compare_mount_options(struct ceph_mount_options *new_fsopt,
 
				 	if (ret)
			
 
				 		return ret;
			
 
				 
			
 
				+	ret = strcmp_null(fsopt1->server_path, fsopt2->server_path);
			
 
				+	if (ret)
			
 
				+		return ret;
			
 
				+
			
 
				 	return ceph_compare_options(new_opt, fsc->client);
			
 
				 }
			
 
				 
			
 
				 static int parse_mount_options(struct ceph_mount_options **pfsopt,
			
 
				 			       struct ceph_options **popt,
			
 
				 			       int flags, char *options,
			
 
				-			       const char *dev_name,
			
 
				-			       const char **path)
			
 
				+			       const char *dev_name)
			
 
				 {
			
 
				 	struct ceph_mount_options *fsopt;
			
 
				 	const char *dev_name_end;
			
@@ -367,6 +376,7 @@ static int parse_mount_options(struct ceph_mount_options **pfsopt,
 
				 	fsopt->max_readdir = CEPH_MAX_READDIR_DEFAULT;
			
 
				 	fsopt->max_readdir_bytes = CEPH_MAX_READDIR_BYTES_DEFAULT;
			
 
				 	fsopt->congestion_kb = default_congestion_kb();
			
 
				+	fsopt->mds_namespace = CEPH_FS_CLUSTER_ID_NONE;
			
 
				 
			
 
				 	/*
			
 
				 	 * Distinguish the server list from the path in "dev_name".
			
@@ -380,12 +390,13 @@ static int parse_mount_options(struct ceph_mount_options **pfsopt,
 
				 	 */
			
 
				 	dev_name_end = strchr(dev_name, '/');
			
 
				 	if (dev_name_end) {
			
 
				-		/* skip over leading '/' for path */
			
 
				-		*path = dev_name_end + 1;
			
 
				+		fsopt->server_path = kstrdup(dev_name_end, GFP_KERNEL);
			
 
				+		if (!fsopt->server_path) {
			
 
				+			err = -ENOMEM;
			
 
				+			goto out;
			
 
				+		}
			
 
				 	} else {
			
 
				-		/* path is empty */
			
 
				 		dev_name_end = dev_name + strlen(dev_name);
			
 
				-		*path = dev_name_end;
			
 
				 	}
			
 
				 	err = -EINVAL;
			
 
				 	dev_name_end--;		/* back up to ':' separator */
			
@@ -395,7 +406,8 @@ static int parse_mount_options(struct ceph_mount_options **pfsopt,
 
				 		goto out;
			
 
				 	}
			
 
				 	dout("device name '%.*s'\n", (int)(dev_name_end - dev_name), dev_name);
			
 
				-	dout("server path '%s'\n", *path);
			
 
				+	if (fsopt->server_path)
			
 
				+		dout("server path '%s'\n", fsopt->server_path);
			
 
				 
			
 
				 	*popt = ceph_parse_options(options, dev_name, dev_name_end,
			
 
				 				 parse_fsopt_token, (void *)fsopt);
			
@@ -457,6 +469,8 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root)
 
				 		seq_puts(m, ",noacl");
			
 
				 #endif
			
 
				 
			
 
				+	if (fsopt->mds_namespace != CEPH_FS_CLUSTER_ID_NONE)
			
 
				+		seq_printf(m, ",mds_namespace=%d", fsopt->mds_namespace);
			
 
				 	if (fsopt->wsize)
			
 
				 		seq_printf(m, ",wsize=%d", fsopt->wsize);
			
 
				 	if (fsopt->rsize != CEPH_RSIZE_DEFAULT)
			
@@ -511,9 +525,8 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt,
 
				 {
			
 
				 	struct ceph_fs_client *fsc;
			
 
				 	const u64 supported_features =
			
 
				-		CEPH_FEATURE_FLOCK |
			
 
				-		CEPH_FEATURE_DIRLAYOUTHASH |
			
 
				-		CEPH_FEATURE_MDS_INLINE_DATA;
			
 
				+		CEPH_FEATURE_FLOCK | CEPH_FEATURE_DIRLAYOUTHASH |
			
 
				+		CEPH_FEATURE_MDSENC | CEPH_FEATURE_MDS_INLINE_DATA;
			
 
				 	const u64 required_features = 0;
			
 
				 	int page_count;
			
 
				 	size_t size;
			
@@ -530,6 +543,7 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt,
 
				 		goto fail;
			
 
				 	}
			
 
				 	fsc->client->extra_mon_dispatch = extra_mon_dispatch;
			
 
				+	fsc->client->monc.fs_cluster_id = fsopt->mds_namespace;
			
 
				 	ceph_monc_want_map(&fsc->client->monc, CEPH_SUB_MDSMAP, 0, true);
			
 
				 
			
 
				 	fsc->mount_options = fsopt;
			
@@ -785,8 +799,7 @@ static struct dentry *open_root_dentry(struct ceph_fs_client *fsc,
 
				 /*
			
 
				  * mount: join the ceph cluster, and open root directory.
			
 
				  */
			
 
				-static struct dentry *ceph_real_mount(struct ceph_fs_client *fsc,
			
 
				-		      const char *path)
			
 
				+static struct dentry *ceph_real_mount(struct ceph_fs_client *fsc)
			
 
				 {
			
 
				 	int err;
			
 
				 	unsigned long started = jiffies;  /* note the start time */
			
@@ -815,11 +828,12 @@ static struct dentry *ceph_real_mount(struct ceph_fs_client *fsc,
 
				 			goto fail;
			
 
				 	}
			
 
				 
			
 
				-	if (path[0] == 0) {
			
 
				+	if (!fsc->mount_options->server_path) {
			
 
				 		root = fsc->sb->s_root;
			
 
				 		dget(root);
			
 
				 	} else {
			
 
				-		dout("mount opening base mountpoint\n");
			
 
				+		const char *path = fsc->mount_options->server_path + 1;
			
 
				+		dout("mount opening path %s\n", path);
			
 
				 		root = open_root_dentry(fsc, path, started);
			
 
				 		if (IS_ERR(root)) {
			
 
				 			err = PTR_ERR(root);
			
@@ -935,7 +949,6 @@ static struct dentry *ceph_mount(struct file_system_type *fs_type,
 
				 	struct dentry *res;
			
 
				 	int err;
			
 
				 	int (*compare_super)(struct super_block *, void *) = ceph_compare_super;
			
 
				-	const char *path = NULL;
			
 
				 	struct ceph_mount_options *fsopt = NULL;
			
 
				 	struct ceph_options *opt = NULL;
			
 
				 
			
@@ -944,7 +957,7 @@ static struct dentry *ceph_mount(struct file_system_type *fs_type,
 
				 #ifdef CONFIG_CEPH_FS_POSIX_ACL
			
 
				 	flags |= MS_POSIXACL;
			
 
				 #endif
			
 
				-	err = parse_mount_options(&fsopt, &opt, flags, data, dev_name, &path);
			
 
				+	err = parse_mount_options(&fsopt, &opt, flags, data, dev_name);
			
 
				 	if (err < 0) {
			
 
				 		res = ERR_PTR(err);
			
 
				 		goto out_final;
			
@@ -987,7 +1000,7 @@ static struct dentry *ceph_mount(struct file_system_type *fs_type,
 
				 		}
			
 
				 	}
			
 
				 
			
 
				-	res = ceph_real_mount(fsc, path);
			
 
				+	res = ceph_real_mount(fsc);
			
 
				 	if (IS_ERR(res))
			
 
				 		goto out_splat;
			
 
				 	dout("root %p inode %p ino %llx.%llx\n", res,
			
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -62,6 +62,7 @@ struct ceph_mount_options {
 
				 	int cap_release_safety;
			
 
				 	int max_readdir;       /* max readdir result (entires) */
			
 
				 	int max_readdir_bytes; /* max readdir result (bytes) */
			
 
				+	int mds_namespace;
			
 
				 
			
 
				 	/*
			
 
				 	 * everything above this point can be memcmp'd; everything below
			
@@ -69,6 +70,7 @@ struct ceph_mount_options {
 
				 	 */
			
 
				 
			
 
				 	char *snapdir_name;   /* default ".snap" */
			
 
				+	char *server_path;    /* default  "/" */
			
 
				 };
			
 
				 
			
 
				 struct ceph_fs_client {
			
@@ -295,6 +297,7 @@ struct ceph_inode_info {
 
				 	u64 i_files, i_subdirs;
			
 
				 
			
 
				 	struct rb_root i_fragtree;
			
 
				+	int i_fragtree_nsplits;
			
 
				 	struct mutex i_fragtree_mutex;
			
 
				 
			
 
				 	struct ceph_inode_xattrs_info i_xattrs;
			
@@ -469,6 +472,7 @@ static inline struct inode *ceph_find_inode(struct super_block *sb,
 
				 #define CEPH_I_POOL_RD		(1 << 5)  /* can read from pool */
			
 
				 #define CEPH_I_POOL_WR		(1 << 6)  /* can write to pool */
			
 
				 #define CEPH_I_SEC_INITED	(1 << 7)  /* security initialized */
			
 
				+#define CEPH_I_CAP_DROPPED	(1 << 8)  /* caps were forcibly dropped */
			
 
				 
			
 
				 static inline void __ceph_dir_set_complete(struct ceph_inode_info *ci,
			
 
				 					   long long release_count,
			
@@ -537,11 +541,6 @@ static inline struct ceph_dentry_info *ceph_dentry(struct dentry *dentry)
 
				 	return (struct ceph_dentry_info *)dentry->d_fsdata;
			
 
				 }
			
 
				 
			
 
				-static inline loff_t ceph_make_fpos(unsigned frag, unsigned off)
			
 
				-{
			
 
				-	return ((loff_t)frag << 32) | (loff_t)off;
			
 
				-}
			
 
				-
			
 
				 /*
			
 
				  * caps helpers
			
 
				  */
			
@@ -632,7 +631,6 @@ struct ceph_file_info {
 
				 	struct ceph_mds_request *last_readdir;
			
 
				 
			
 
				 	/* readdir: position within a frag */
			
 
				-	unsigned offset;       /* offset of last chunk, adjusted for . and .. */
			
 
				 	unsigned next_offset;  /* offset of next chunk (last_name's + 1) */
			
 
				 	char *last_name;       /* last entry in previous chunk */
			
 
				 	long long dir_release_count;
			
@@ -927,6 +925,7 @@ extern void ceph_pool_perm_destroy(struct ceph_mds_client* mdsc);
 
				 /* file.c */
			
 
				 extern const struct file_operations ceph_file_fops;
			
 
				 
			
 
				+extern int ceph_renew_caps(struct inode *inode);
			
 
				 extern int ceph_open(struct inode *inode, struct file *file);
			
 
				 extern int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
			
 
				 			    struct file *file, unsigned flags, umode_t mode,
			
@@ -942,6 +941,7 @@ extern const struct inode_operations ceph_snapdir_iops;
 
				 extern const struct dentry_operations ceph_dentry_ops, ceph_snap_dentry_ops,
			
 
				 	ceph_snapdir_dentry_ops;
			
 
				 
			
 
				+extern loff_t ceph_make_fpos(unsigned high, unsigned off, bool hash_order);
			
 
				 extern int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry);
			
 
				 extern int ceph_handle_snapdir(struct ceph_mds_request *req,
			
 
				 			       struct dentry *dentry, int err);
			
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -77,7 +77,7 @@ static size_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val,
 
				 	char buf[128];
			
 
				 
			
 
				 	dout("ceph_vxattrcb_layout %p\n", &ci->vfs_inode);
			
 
				-	down_read(&osdc->map_sem);
			
 
				+	down_read(&osdc->lock);
			
 
				 	pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, pool);
			
 
				 	if (pool_name) {
			
 
				 		size_t len = strlen(pool_name);
			
@@ -109,7 +109,7 @@ static size_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val,
 
				 				ret = -ERANGE;
			
 
				 		}
			
 
				 	}
			
 
				-	up_read(&osdc->map_sem);
			
 
				+	up_read(&osdc->lock);
			
 
				 	return ret;
			
 
				 }
			
 
				 
			
@@ -143,13 +143,13 @@ static size_t ceph_vxattrcb_layout_pool(struct ceph_inode_info *ci,
 
				 	s64 pool = ceph_file_layout_pg_pool(ci->i_layout);
			
 
				 	const char *pool_name;
			
 
				 
			
 
				-	down_read(&osdc->map_sem);
			
 
				+	down_read(&osdc->lock);
			
 
				 	pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, pool);
			
 
				 	if (pool_name)
			
 
				 		ret = snprintf(val, size, "%s", pool_name);
			
 
				 	else
			
 
				 		ret = snprintf(val, size, "%lld", (unsigned long long)pool);
			
 
				-	up_read(&osdc->map_sem);
			
 
				+	up_read(&osdc->lock);
			
 
				 	return ret;
			
 
				 }
			
 
				 
			
@@ -862,6 +862,7 @@ static int ceph_sync_setxattr(struct inode *inode, const char *name,
 
				 	struct ceph_mds_request *req;
			
 
				 	struct ceph_mds_client *mdsc = fsc->mdsc;
			
 
				 	struct ceph_pagelist *pagelist = NULL;
			
 
				+	int op = CEPH_MDS_OP_SETXATTR;
			
 
				 	int err;
			
 
				 
			
 
				 	if (size > 0) {
			
@@ -875,20 +876,21 @@ static int ceph_sync_setxattr(struct inode *inode, const char *name,
 
				 		if (err)
			
 
				 			goto out;
			
 
				 	} else if (!value) {
			
 
				-		flags |= CEPH_XATTR_REMOVE;
			
 
				+		if (flags & CEPH_XATTR_REPLACE)
			
 
				+			op = CEPH_MDS_OP_RMXATTR;
			
 
				+		else
			
 
				+			flags |= CEPH_XATTR_REMOVE;
			
 
				 	}
			
 
				 
			
 
				 	dout("setxattr value=%.*s\n", (int)size, value);
			
 
				 
			
 
				 	/* do request */
			
 
				-	req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETXATTR,
			
 
				-				       USE_AUTH_MDS);
			
 
				+	req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS);
			
 
				 	if (IS_ERR(req)) {
			
 
				 		err = PTR_ERR(req);
			
 
				 		goto out;
			
 
				 	}
			
 
				 
			
 
				-	req->r_args.setxattr.flags = cpu_to_le32(flags);
			
 
				 	req->r_path2 = kstrdup(name, GFP_NOFS);
			
 
				 	if (!req->r_path2) {
			
 
				 		ceph_mdsc_put_request(req);
			
@@ -896,8 +898,11 @@ static int ceph_sync_setxattr(struct inode *inode, const char *name,
 
				 		goto out;
			
 
				 	}
			
 
				 
			
 
				-	req->r_pagelist = pagelist;
			
 
				-	pagelist = NULL;
			
 
				+	if (op == CEPH_MDS_OP_SETXATTR) {
			
 
				+		req->r_args.setxattr.flags = cpu_to_le32(flags);
			
 
				+		req->r_pagelist = pagelist;
			
 
				+		pagelist = NULL;
			
 
				+	}
			
 
				 
			
 
				 	req->r_inode = inode;
			
 
				 	ihold(inode);
			
--- a/include/linux/ceph/ceph_frag.h
+++ b/include/linux/ceph/ceph_frag.h
@@ -51,11 +51,11 @@ static inline __u32 ceph_frag_make_child(__u32 f, int by, int i)
 
				 	return ceph_frag_make(newbits,
			
 
				 			 ceph_frag_value(f) | (i << (24 - newbits)));
			
 
				 }
			
 
				-static inline int ceph_frag_is_leftmost(__u32 f)
			
 
				+static inline bool ceph_frag_is_leftmost(__u32 f)
			
 
				 {
			
 
				 	return ceph_frag_value(f) == 0;
			
 
				 }
			
 
				-static inline int ceph_frag_is_rightmost(__u32 f)
			
 
				+static inline bool ceph_frag_is_rightmost(__u32 f)
			
 
				 {
			
 
				 	return ceph_frag_value(f) == ceph_frag_mask(f);
			
 
				 }
			
--- a/include/linux/ceph/ceph_fs.h
+++ b/include/linux/ceph/ceph_fs.h
@@ -153,8 +153,9 @@ struct ceph_dir_layout {
 
				 
			
 
				 /* watch-notify operations */
			
 
				 enum {
			
 
				-  WATCH_NOTIFY				= 1, /* notifying watcher */
			
 
				-  WATCH_NOTIFY_COMPLETE			= 2, /* notifier notified when done */
			
 
				+	CEPH_WATCH_EVENT_NOTIFY		  = 1, /* notifying watcher */
			
 
				+	CEPH_WATCH_EVENT_NOTIFY_COMPLETE  = 2, /* notifier notified when done */
			
 
				+	CEPH_WATCH_EVENT_DISCONNECT       = 3, /* we were disconnected */
			
 
				 };
			
 
				 
			
 
				 
			
@@ -207,6 +208,8 @@ struct ceph_mon_subscribe_ack {
 
				 	struct ceph_fsid fsid;
			
 
				 } __attribute__ ((packed));
			
 
				 
			
 
				+#define CEPH_FS_CLUSTER_ID_NONE  -1
			
 
				+
			
 
				 /*
			
 
				  * mdsmap flags
			
 
				  */
			
@@ -344,6 +347,18 @@ extern const char *ceph_mds_op_name(int op);
 
				 #define CEPH_XATTR_REPLACE (1 << 1)
			
 
				 #define CEPH_XATTR_REMOVE  (1 << 31)
			
 
				 
			
 
				+/*
			
 
				+ * readdir request flags;
			
 
				+ */
			
 
				+#define CEPH_READDIR_REPLY_BITFLAGS	(1<<0)
			
 
				+
			
 
				+/*
			
 
				+ * readdir reply flags.
			
 
				+ */
			
 
				+#define CEPH_READDIR_FRAG_END		(1<<0)
			
 
				+#define CEPH_READDIR_FRAG_COMPLETE	(1<<8)
			
 
				+#define CEPH_READDIR_HASH_ORDER		(1<<9)
			
 
				+
			
 
				 union ceph_mds_request_args {
			
 
				 	struct {
			
 
				 		__le32 mask;                 /* CEPH_CAP_* */
			
@@ -361,6 +376,7 @@ union ceph_mds_request_args {
 
				 		__le32 frag;                 /* which dir fragment */
			
 
				 		__le32 max_entries;          /* how many dentries to grab */
			
 
				 		__le32 max_bytes;
			
 
				+		__le16 flags;
			
 
				 	} __attribute__ ((packed)) readdir;
			
 
				 	struct {
			
 
				 		__le32 mode;
			
--- a/include/linux/ceph/decode.h
+++ b/include/linux/ceph/decode.h
@@ -47,7 +47,7 @@ static inline void ceph_decode_copy(void **p, void *pv, size_t n)
 
				 /*
			
 
				  * bounds check input.
			
 
				  */
			
 
				-static inline int ceph_has_room(void **p, void *end, size_t n)
			
 
				+static inline bool ceph_has_room(void **p, void *end, size_t n)
			
 
				 {
			
 
				 	return end >= *p && n <= end - *p;
			
 
				 }
			
--- a/include/linux/ceph/libceph.h
+++ b/include/linux/ceph/libceph.h
@@ -180,6 +180,63 @@ static inline int calc_pages_for(u64 off, u64 len)
 
				 		(off >> PAGE_SHIFT);
			
 
				 }
			
 
				 
			
 
				+/*
			
 
				+ * These are not meant to be generic - an integer key is assumed.
			
 
				+ */
			
 
				+#define DEFINE_RB_INSDEL_FUNCS(name, type, keyfld, nodefld)		\
			
 
				+static void insert_##name(struct rb_root *root, type *t)		\
			
 
				+{									\
			
 
				+	struct rb_node **n = &root->rb_node;				\
			
 
				+	struct rb_node *parent = NULL;					\
			
 
				+									\
			
 
				+	BUG_ON(!RB_EMPTY_NODE(&t->nodefld));				\
			
 
				+									\
			
 
				+	while (*n) {							\
			
 
				+		type *cur = rb_entry(*n, type, nodefld);		\
			
 
				+									\
			
 
				+		parent = *n;						\
			
 
				+		if (t->keyfld < cur->keyfld)				\
			
 
				+			n = &(*n)->rb_left;				\
			
 
				+		else if (t->keyfld > cur->keyfld)			\
			
 
				+			n = &(*n)->rb_right;				\
			
 
				+		else							\
			
 
				+			BUG();						\
			
 
				+	}								\
			
 
				+									\
			
 
				+	rb_link_node(&t->nodefld, parent, n);				\
			
 
				+	rb_insert_color(&t->nodefld, root);				\
			
 
				+}									\
			
 
				+static void erase_##name(struct rb_root *root, type *t)			\
			
 
				+{									\
			
 
				+	BUG_ON(RB_EMPTY_NODE(&t->nodefld));				\
			
 
				+	rb_erase(&t->nodefld, root);					\
			
 
				+	RB_CLEAR_NODE(&t->nodefld);					\
			
 
				+}
			
 
				+
			
 
				+#define DEFINE_RB_LOOKUP_FUNC(name, type, keyfld, nodefld)		\
			
 
				+static type *lookup_##name(struct rb_root *root,			\
			
 
				+			   typeof(((type *)0)->keyfld) key)		\
			
 
				+{									\
			
 
				+	struct rb_node *n = root->rb_node;				\
			
 
				+									\
			
 
				+	while (n) {							\
			
 
				+		type *cur = rb_entry(n, type, nodefld);			\
			
 
				+									\
			
 
				+		if (key < cur->keyfld)					\
			
 
				+			n = n->rb_left;					\
			
 
				+		else if (key > cur->keyfld)				\
			
 
				+			n = n->rb_right;				\
			
 
				+		else							\
			
 
				+			return cur;					\
			
 
				+	}								\
			
 
				+									\
			
 
				+	return NULL;							\
			
 
				+}
			
 
				+
			
 
				+#define DEFINE_RB_FUNCS(name, type, keyfld, nodefld)			\
			
 
				+DEFINE_RB_INSDEL_FUNCS(name, type, keyfld, nodefld)			\
			
 
				+DEFINE_RB_LOOKUP_FUNC(name, type, keyfld, nodefld)
			
 
				+
			
 
				 extern struct kmem_cache *ceph_inode_cachep;
			
 
				 extern struct kmem_cache *ceph_cap_cachep;
			
 
				 extern struct kmem_cache *ceph_cap_flush_cachep;
			
--- a/include/linux/ceph/mon_client.h
+++ b/include/linux/ceph/mon_client.h
@@ -39,20 +39,31 @@ struct ceph_mon_request {
 
				 	ceph_monc_request_func_t do_request;
			
 
				 };
			
 
				 
			
 
				+typedef void (*ceph_monc_callback_t)(struct ceph_mon_generic_request *);
			
 
				+
			
 
				 /*
			
 
				  * ceph_mon_generic_request is being used for the statfs and
			
 
				  * mon_get_version requests which are being done a bit differently
			
 
				  * because we need to get data back to the caller
			
 
				  */
			
 
				 struct ceph_mon_generic_request {
			
 
				+	struct ceph_mon_client *monc;
			
 
				 	struct kref kref;
			
 
				 	u64 tid;
			
 
				 	struct rb_node node;
			
 
				 	int result;
			
 
				-	void *buf;
			
 
				+
			
 
				 	struct completion completion;
			
 
				+	ceph_monc_callback_t complete_cb;
			
 
				+	u64 private_data;          /* r_tid/linger_id */
			
 
				+
			
 
				 	struct ceph_msg *request;  /* original request */
			
 
				 	struct ceph_msg *reply;    /* and reply */
			
 
				+
			
 
				+	union {
			
 
				+		struct ceph_statfs *st;
			
 
				+		u64 newest;
			
 
				+	} u;
			
 
				 };
			
 
				 
			
 
				 struct ceph_mon_client {
			
@@ -77,7 +88,6 @@ struct ceph_mon_client {
 
				 
			
 
				 	/* pending generic requests */
			
 
				 	struct rb_root generic_request_tree;
			
 
				-	int num_generic_requests;
			
 
				 	u64 last_tid;
			
 
				 
			
 
				 	/* subs, indexed with CEPH_SUB_* */
			
@@ -86,6 +96,7 @@ struct ceph_mon_client {
 
				 		bool want;
			
 
				 		u32 have; /* epoch */
			
 
				 	} subs[3];
			
 
				+	int fs_cluster_id; /* "mdsmap.<id>" sub */
			
 
				 
			
 
				 #ifdef CONFIG_DEBUG_FS
			
 
				 	struct dentry *debugfs_file;
			
@@ -116,16 +127,18 @@ extern const char *ceph_sub_str[];
 
				 bool ceph_monc_want_map(struct ceph_mon_client *monc, int sub, u32 epoch,
			
 
				 			bool continuous);
			
 
				 void ceph_monc_got_map(struct ceph_mon_client *monc, int sub, u32 epoch);
			
 
				+void ceph_monc_renew_subs(struct ceph_mon_client *monc);
			
 
				 
			
 
				-extern void ceph_monc_request_next_osdmap(struct ceph_mon_client *monc);
			
 
				 extern int ceph_monc_wait_osdmap(struct ceph_mon_client *monc, u32 epoch,
			
 
				 				 unsigned long timeout);
			
 
				 
			
 
				 extern int ceph_monc_do_statfs(struct ceph_mon_client *monc,
			
 
				 			       struct ceph_statfs *buf);
			
 
				 
			
 
				-extern int ceph_monc_do_get_version(struct ceph_mon_client *monc,
			
 
				-				    const char *what, u64 *newest);
			
 
				+int ceph_monc_get_version(struct ceph_mon_client *monc, const char *what,
			
 
				+			  u64 *newest);
			
 
				+int ceph_monc_get_version_async(struct ceph_mon_client *monc, const char *what,
			
 
				+				ceph_monc_callback_t cb, u64 private_data);
			
 
				 
			
 
				 extern int ceph_monc_open_session(struct ceph_mon_client *monc);
			
 
				 
			
--- a/include/linux/ceph/osd_client.h
+++ b/include/linux/ceph/osd_client.h
@@ -20,10 +20,11 @@ struct ceph_osd_client;
 
				 /*
			
 
				  * completion callback for async writepages
			
 
				  */
			
 
				-typedef void (*ceph_osdc_callback_t)(struct ceph_osd_request *,
			
 
				-				     struct ceph_msg *);
			
 
				+typedef void (*ceph_osdc_callback_t)(struct ceph_osd_request *);
			
 
				 typedef void (*ceph_osdc_unsafe_callback_t)(struct ceph_osd_request *, bool);
			
 
				 
			
 
				+#define CEPH_HOMELESS_OSD	-1
			
 
				+
			
 
				 /* a given osd we're communicating with */
			
 
				 struct ceph_osd {
			
 
				 	atomic_t o_ref;
			
@@ -32,16 +33,15 @@ struct ceph_osd {
 
				 	int o_incarnation;
			
 
				 	struct rb_node o_node;
			
 
				 	struct ceph_connection o_con;
			
 
				-	struct list_head o_requests;
			
 
				-	struct list_head o_linger_requests;
			
 
				+	struct rb_root o_requests;
			
 
				+	struct rb_root o_linger_requests;
			
 
				 	struct list_head o_osd_lru;
			
 
				 	struct ceph_auth_handshake o_auth;
			
 
				 	unsigned long lru_ttl;
			
 
				-	int o_marked_for_keepalive;
			
 
				 	struct list_head o_keepalive_item;
			
 
				+	struct mutex lock;
			
 
				 };
			
 
				 
			
 
				-
			
 
				 #define CEPH_OSD_SLAB_OPS	2
			
 
				 #define CEPH_OSD_MAX_OPS	16
			
 
				 
			
@@ -104,15 +104,21 @@ struct ceph_osd_req_op {
 
				 			struct ceph_osd_data response_data;
			
 
				 			__u8 class_len;
			
 
				 			__u8 method_len;
			
 
				-			__u8 argc;
			
 
				+			u32 indata_len;
			
 
				 		} cls;
			
 
				 		struct {
			
 
				 			u64 cookie;
			
 
				-			u64 ver;
			
 
				-			u32 prot_ver;
			
 
				-			u32 timeout;
			
 
				-			__u8 flag;
			
 
				+			__u8 op;           /* CEPH_OSD_WATCH_OP_ */
			
 
				+			u32 gen;
			
 
				 		} watch;
			
 
				+		struct {
			
 
				+			struct ceph_osd_data request_data;
			
 
				+		} notify_ack;
			
 
				+		struct {
			
 
				+			u64 cookie;
			
 
				+			struct ceph_osd_data request_data;
			
 
				+			struct ceph_osd_data response_data;
			
 
				+		} notify;
			
 
				 		struct {
			
 
				 			u64 expected_object_size;
			
 
				 			u64 expected_write_size;
			
@@ -120,60 +126,73 @@ struct ceph_osd_req_op {
 
				 	};
			
 
				 };
			
 
				 
			
 
				+struct ceph_osd_request_target {
			
 
				+	struct ceph_object_id base_oid;
			
 
				+	struct ceph_object_locator base_oloc;
			
 
				+	struct ceph_object_id target_oid;
			
 
				+	struct ceph_object_locator target_oloc;
			
 
				+
			
 
				+	struct ceph_pg pgid;
			
 
				+	u32 pg_num;
			
 
				+	u32 pg_num_mask;
			
 
				+	struct ceph_osds acting;
			
 
				+	struct ceph_osds up;
			
 
				+	int size;
			
 
				+	int min_size;
			
 
				+	bool sort_bitwise;
			
 
				+
			
 
				+	unsigned int flags;                /* CEPH_OSD_FLAG_* */
			
 
				+	bool paused;
			
 
				+
			
 
				+	int osd;
			
 
				+};
			
 
				+
			
 
				 /* an in-flight request */
			
 
				 struct ceph_osd_request {
			
 
				 	u64             r_tid;              /* unique for this client */
			
 
				 	struct rb_node  r_node;
			
 
				-	struct list_head r_req_lru_item;
			
 
				-	struct list_head r_osd_item;
			
 
				-	struct list_head r_linger_item;
			
 
				-	struct list_head r_linger_osd_item;
			
 
				+	struct rb_node  r_mc_node;          /* map check */
			
 
				 	struct ceph_osd *r_osd;
			
 
				-	struct ceph_pg   r_pgid;
			
 
				-	int              r_pg_osds[CEPH_PG_MAX_SIZE];
			
 
				-	int              r_num_pg_osds;
			
 
				+
			
 
				+	struct ceph_osd_request_target r_t;
			
 
				+#define r_base_oid	r_t.base_oid
			
 
				+#define r_base_oloc	r_t.base_oloc
			
 
				+#define r_flags		r_t.flags
			
 
				 
			
 
				 	struct ceph_msg  *r_request, *r_reply;
			
 
				-	int               r_flags;     /* any additional flags for the osd */
			
 
				 	u32               r_sent;      /* >0 if r_request is sending/sent */
			
 
				 
			
 
				 	/* request osd ops array  */
			
 
				 	unsigned int		r_num_ops;
			
 
				 
			
 
				-	/* these are updated on each send */
			
 
				-	__le32           *r_request_osdmap_epoch;
			
 
				-	__le32           *r_request_flags;
			
 
				-	__le64           *r_request_pool;
			
 
				-	void             *r_request_pgid;
			
 
				-	__le32           *r_request_attempts;
			
 
				-	bool              r_paused;
			
 
				-	struct ceph_eversion *r_request_reassert_version;
			
 
				-
			
 
				 	int               r_result;
			
 
				-	int               r_got_reply;
			
 
				-	int		  r_linger;
			
 
				+	bool              r_got_reply;
			
 
				 
			
 
				 	struct ceph_osd_client *r_osdc;
			
 
				 	struct kref       r_kref;
			
 
				 	bool              r_mempool;
			
 
				-	struct completion r_completion, r_safe_completion;
			
 
				+	struct completion r_completion;
			
 
				+	struct completion r_safe_completion;  /* fsync waiter */
			
 
				 	ceph_osdc_callback_t r_callback;
			
 
				 	ceph_osdc_unsafe_callback_t r_unsafe_callback;
			
 
				-	struct ceph_eversion r_reassert_version;
			
 
				 	struct list_head  r_unsafe_item;
			
 
				 
			
 
				 	struct inode *r_inode;         	      /* for use by callbacks */
			
 
				 	void *r_priv;			      /* ditto */
			
 
				 
			
 
				-	struct ceph_object_locator r_base_oloc;
			
 
				-	struct ceph_object_id r_base_oid;
			
 
				-	struct ceph_object_locator r_target_oloc;
			
 
				-	struct ceph_object_id r_target_oid;
			
 
				-
			
 
				-	u64               r_snapid;
			
 
				-	unsigned long     r_stamp;            /* send OR check time */
			
 
				+	/* set by submitter */
			
 
				+	u64 r_snapid;                         /* for reads, CEPH_NOSNAP o/w */
			
 
				+	struct ceph_snap_context *r_snapc;    /* for writes */
			
 
				+	struct timespec r_mtime;              /* ditto */
			
 
				+	u64 r_data_offset;                    /* ditto */
			
 
				+	bool r_linger;                        /* don't resend on failure */
			
 
				 
			
 
				-	struct ceph_snap_context *r_snapc;    /* snap context for writes */
			
 
				+	/* internal */
			
 
				+	unsigned long r_stamp;                /* jiffies, send or check time */
			
 
				+	int r_attempts;
			
 
				+	struct ceph_eversion r_replay_version; /* aka reassert_version */
			
 
				+	u32 r_last_force_resend;
			
 
				+	u32 r_map_dne_bound;
			
 
				 
			
 
				 	struct ceph_osd_req_op r_ops[];
			
 
				 };
			
@@ -182,44 +201,70 @@ struct ceph_request_redirect {
 
				 	struct ceph_object_locator oloc;
			
 
				 };
			
 
				 
			
 
				-struct ceph_osd_event {
			
 
				-	u64 cookie;
			
 
				-	int one_shot;
			
 
				+typedef void (*rados_watchcb2_t)(void *arg, u64 notify_id, u64 cookie,
			
 
				+				 u64 notifier_id, void *data, size_t data_len);
			
 
				+typedef void (*rados_watcherrcb_t)(void *arg, u64 cookie, int err);
			
 
				+
			
 
				+struct ceph_osd_linger_request {
			
 
				 	struct ceph_osd_client *osdc;
			
 
				-	void (*cb)(u64, u64, u8, void *);
			
 
				-	void *data;
			
 
				-	struct rb_node node;
			
 
				-	struct list_head osd_node;
			
 
				+	u64 linger_id;
			
 
				+	bool committed;
			
 
				+	bool is_watch;                  /* watch or notify */
			
 
				+
			
 
				+	struct ceph_osd *osd;
			
 
				+	struct ceph_osd_request *reg_req;
			
 
				+	struct ceph_osd_request *ping_req;
			
 
				+	unsigned long ping_sent;
			
 
				+	unsigned long watch_valid_thru;
			
 
				+	struct list_head pending_lworks;
			
 
				+
			
 
				+	struct ceph_osd_request_target t;
			
 
				+	u32 last_force_resend;
			
 
				+	u32 map_dne_bound;
			
 
				+
			
 
				+	struct timespec mtime;
			
 
				+
			
 
				 	struct kref kref;
			
 
				-};
			
 
				+	struct mutex lock;
			
 
				+	struct rb_node node;            /* osd */
			
 
				+	struct rb_node osdc_node;       /* osdc */
			
 
				+	struct rb_node mc_node;         /* map check */
			
 
				+	struct list_head scan_item;
			
 
				+
			
 
				+	struct completion reg_commit_wait;
			
 
				+	struct completion notify_finish_wait;
			
 
				+	int reg_commit_error;
			
 
				+	int notify_finish_error;
			
 
				+	int last_error;
			
 
				+
			
 
				+	u32 register_gen;
			
 
				+	u64 notify_id;
			
 
				+
			
 
				+	rados_watchcb2_t wcb;
			
 
				+	rados_watcherrcb_t errcb;
			
 
				+	void *data;
			
 
				 
			
 
				-struct ceph_osd_event_work {
			
 
				-	struct work_struct work;
			
 
				-	struct ceph_osd_event *event;
			
 
				-        u64 ver;
			
 
				-        u64 notify_id;
			
 
				-        u8 opcode;
			
 
				+	struct page ***preply_pages;
			
 
				+	size_t *preply_len;
			
 
				 };
			
 
				 
			
 
				 struct ceph_osd_client {
			
 
				 	struct ceph_client     *client;
			
 
				 
			
 
				 	struct ceph_osdmap     *osdmap;       /* current map */
			
 
				-	struct rw_semaphore    map_sem;
			
 
				-	struct completion      map_waiters;
			
 
				-	u64                    last_requested_map;
			
 
				+	struct rw_semaphore    lock;
			
 
				 
			
 
				-	struct mutex           request_mutex;
			
 
				 	struct rb_root         osds;          /* osds */
			
 
				 	struct list_head       osd_lru;       /* idle osds */
			
 
				-	u64                    timeout_tid;   /* tid of timeout triggering rq */
			
 
				-	u64                    last_tid;      /* tid of last request */
			
 
				-	struct rb_root         requests;      /* pending requests */
			
 
				-	struct list_head       req_lru;	      /* in-flight lru */
			
 
				-	struct list_head       req_unsent;    /* unsent/need-resend queue */
			
 
				-	struct list_head       req_notarget;  /* map to no osd */
			
 
				-	struct list_head       req_linger;    /* lingering requests */
			
 
				-	int                    num_requests;
			
 
				+	spinlock_t             osd_lru_lock;
			
 
				+	struct ceph_osd        homeless_osd;
			
 
				+	atomic64_t             last_tid;      /* tid of last request */
			
 
				+	u64                    last_linger_id;
			
 
				+	struct rb_root         linger_requests; /* lingering requests */
			
 
				+	struct rb_root         map_checks;
			
 
				+	struct rb_root         linger_map_checks;
			
 
				+	atomic_t               num_requests;
			
 
				+	atomic_t               num_homeless;
			
 
				 	struct delayed_work    timeout_work;
			
 
				 	struct delayed_work    osds_timeout_work;
			
 
				 #ifdef CONFIG_DEBUG_FS
			
@@ -231,10 +276,6 @@ struct ceph_osd_client {
 
				 	struct ceph_msgpool	msgpool_op;
			
 
				 	struct ceph_msgpool	msgpool_op_reply;
			
 
				 
			
 
				-	spinlock_t		event_lock;
			
 
				-	struct rb_root		event_tree;
			
 
				-	u64			event_count;
			
 
				-
			
 
				 	struct workqueue_struct	*notify_wq;
			
 
				 };
			
 
				 
			
@@ -271,9 +312,6 @@ extern void osd_req_op_extent_dup_last(struct ceph_osd_request *osd_req,
 
				 extern struct ceph_osd_data *osd_req_op_extent_osd_data(
			
 
				 					struct ceph_osd_request *osd_req,
			
 
				 					unsigned int which);
			
 
				-extern struct ceph_osd_data *osd_req_op_cls_response_data(
			
 
				-					struct ceph_osd_request *osd_req,
			
 
				-					unsigned int which);
			
 
				 
			
 
				 extern void osd_req_op_extent_osd_data_pages(struct ceph_osd_request *,
			
 
				 					unsigned int which,
			
@@ -309,9 +347,6 @@ extern void osd_req_op_cls_init(struct ceph_osd_request *osd_req,
 
				 extern int osd_req_op_xattr_init(struct ceph_osd_request *osd_req, unsigned int which,
			
 
				 				 u16 opcode, const char *name, const void *value,
			
 
				 				 size_t size, u8 cmp_op, u8 cmp_mode);
			
 
				-extern void osd_req_op_watch_init(struct ceph_osd_request *osd_req,
			
 
				-					unsigned int which, u16 opcode,
			
 
				-					u64 cookie, u64 version, int flag);
			
 
				 extern void osd_req_op_alloc_hint_init(struct ceph_osd_request *osd_req,
			
 
				 				       unsigned int which,
			
 
				 				       u64 expected_object_size,
			
@@ -322,11 +357,7 @@ extern struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *
 
				 					       unsigned int num_ops,
			
 
				 					       bool use_mempool,
			
 
				 					       gfp_t gfp_flags);
			
 
				-
			
 
				-extern void ceph_osdc_build_request(struct ceph_osd_request *req, u64 off,
			
 
				-				    struct ceph_snap_context *snapc,
			
 
				-				    u64 snap_id,
			
 
				-				    struct timespec *mtime);
			
 
				+int ceph_osdc_alloc_messages(struct ceph_osd_request *req, gfp_t gfp);
			
 
				 
			
 
				 extern struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *,
			
 
				 				      struct ceph_file_layout *layout,
			
@@ -338,9 +369,6 @@ extern struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *,
 
				 				      u32 truncate_seq, u64 truncate_size,
			
 
				 				      bool use_mempool);
			
 
				 
			
 
				-extern void ceph_osdc_set_request_linger(struct ceph_osd_client *osdc,
			
 
				-					 struct ceph_osd_request *req);
			
 
				-
			
 
				 extern void ceph_osdc_get_request(struct ceph_osd_request *req);
			
 
				 extern void ceph_osdc_put_request(struct ceph_osd_request *req);
			
 
				 
			
@@ -353,6 +381,7 @@ extern int ceph_osdc_wait_request(struct ceph_osd_client *osdc,
 
				 extern void ceph_osdc_sync(struct ceph_osd_client *osdc);
			
 
				 
			
 
				 extern void ceph_osdc_flush_notifies(struct ceph_osd_client *osdc);
			
 
				+void ceph_osdc_maybe_request_map(struct ceph_osd_client *osdc);
			
 
				 
			
 
				 extern int ceph_osdc_readpages(struct ceph_osd_client *osdc,
			
 
				 			       struct ceph_vino vino,
			
@@ -371,11 +400,33 @@ extern int ceph_osdc_writepages(struct ceph_osd_client *osdc,
 
				 				struct timespec *mtime,
			
 
				 				struct page **pages, int nr_pages);
			
 
				 
			
 
				-/* watch/notify events */
			
 
				-extern int ceph_osdc_create_event(struct ceph_osd_client *osdc,
			
 
				-				  void (*event_cb)(u64, u64, u8, void *),
			
 
				-				  void *data, struct ceph_osd_event **pevent);
			
 
				-extern void ceph_osdc_cancel_event(struct ceph_osd_event *event);
			
 
				-extern void ceph_osdc_put_event(struct ceph_osd_event *event);
			
 
				+/* watch/notify */
			
 
				+struct ceph_osd_linger_request *
			
 
				+ceph_osdc_watch(struct ceph_osd_client *osdc,
			
 
				+		struct ceph_object_id *oid,
			
 
				+		struct ceph_object_locator *oloc,
			
 
				+		rados_watchcb2_t wcb,
			
 
				+		rados_watcherrcb_t errcb,
			
 
				+		void *data);
			
 
				+int ceph_osdc_unwatch(struct ceph_osd_client *osdc,
			
 
				+		      struct ceph_osd_linger_request *lreq);
			
 
				+
			
 
				+int ceph_osdc_notify_ack(struct ceph_osd_client *osdc,
			
 
				+			 struct ceph_object_id *oid,
			
 
				+			 struct ceph_object_locator *oloc,
			
 
				+			 u64 notify_id,
			
 
				+			 u64 cookie,
			
 
				+			 void *payload,
			
 
				+			 size_t payload_len);
			
 
				+int ceph_osdc_notify(struct ceph_osd_client *osdc,
			
 
				+		     struct ceph_object_id *oid,
			
 
				+		     struct ceph_object_locator *oloc,
			
 
				+		     void *payload,
			
 
				+		     size_t payload_len,
			
 
				+		     u32 timeout,
			
 
				+		     struct page ***preply_pages,
			
 
				+		     size_t *preply_len);
			
 
				+int ceph_osdc_watch_check(struct ceph_osd_client *osdc,
			
 
				+			  struct ceph_osd_linger_request *lreq);
			
 
				 #endif
			
 
				 
			
--- a/include/linux/ceph/osdmap.h
+++ b/include/linux/ceph/osdmap.h
@@ -24,21 +24,29 @@ struct ceph_pg {
 
				 	uint32_t seed;
			
 
				 };
			
 
				 
			
 
				-#define CEPH_POOL_FLAG_HASHPSPOOL  1
			
 
				+int ceph_pg_compare(const struct ceph_pg *lhs, const struct ceph_pg *rhs);
			
 
				+
			
 
				+#define CEPH_POOL_FLAG_HASHPSPOOL	(1ULL << 0) /* hash pg seed and pool id
			
 
				+						       together */
			
 
				+#define CEPH_POOL_FLAG_FULL		(1ULL << 1) /* pool is full */
			
 
				 
			
 
				 struct ceph_pg_pool_info {
			
 
				 	struct rb_node node;
			
 
				 	s64 id;
			
 
				-	u8 type;
			
 
				+	u8 type; /* CEPH_POOL_TYPE_* */
			
 
				 	u8 size;
			
 
				+	u8 min_size;
			
 
				 	u8 crush_ruleset;
			
 
				 	u8 object_hash;
			
 
				+	u32 last_force_request_resend;
			
 
				 	u32 pg_num, pgp_num;
			
 
				 	int pg_num_mask, pgp_num_mask;
			
 
				 	s64 read_tier;
			
 
				 	s64 write_tier; /* wins for read+write ops */
			
 
				-	u64 flags;
			
 
				+	u64 flags; /* CEPH_POOL_FLAG_* */
			
 
				 	char *name;
			
 
				+
			
 
				+	bool was_full;  /* for handle_one_map() */
			
 
				 };
			
 
				 
			
 
				 static inline bool ceph_can_shift_osds(struct ceph_pg_pool_info *pool)
			
@@ -57,6 +65,22 @@ struct ceph_object_locator {
 
				 	s64 pool;
			
 
				 };
			
 
				 
			
 
				+static inline void ceph_oloc_init(struct ceph_object_locator *oloc)
			
 
				+{
			
 
				+	oloc->pool = -1;
			
 
				+}
			
 
				+
			
 
				+static inline bool ceph_oloc_empty(const struct ceph_object_locator *oloc)
			
 
				+{
			
 
				+	return oloc->pool == -1;
			
 
				+}
			
 
				+
			
 
				+static inline void ceph_oloc_copy(struct ceph_object_locator *dest,
			
 
				+				  const struct ceph_object_locator *src)
			
 
				+{
			
 
				+	dest->pool = src->pool;
			
 
				+}
			
 
				+
			
 
				 /*
			
 
				  * Maximum supported by kernel client object name length
			
 
				  *
			
@@ -64,11 +88,47 @@ struct ceph_object_locator {
 
				  */
			
 
				 #define CEPH_MAX_OID_NAME_LEN 100
			
 
				 
			
 
				+/*
			
 
				+ * 51-char inline_name is long enough for all cephfs and all but one
			
 
				+ * rbd requests: <imgname> in "<imgname>.rbd"/"rbd_id.<imgname>" can be
			
 
				+ * arbitrarily long (~PAGE_SIZE).  It's done once during rbd map; all
			
 
				+ * other rbd requests fit into inline_name.
			
 
				+ *
			
 
				+ * Makes ceph_object_id 64 bytes on 64-bit.
			
 
				+ */
			
 
				+#define CEPH_OID_INLINE_LEN 52
			
 
				+
			
 
				+/*
			
 
				+ * Both inline and external buffers have space for a NUL-terminator,
			
 
				+ * which is carried around.  It's not required though - RADOS object
			
 
				+ * names don't have to be NUL-terminated and may contain NULs.
			
 
				+ */
			
 
				 struct ceph_object_id {
			
 
				-	char name[CEPH_MAX_OID_NAME_LEN];
			
 
				+	char *name;
			
 
				+	char inline_name[CEPH_OID_INLINE_LEN];
			
 
				 	int name_len;
			
 
				 };
			
 
				 
			
 
				+static inline void ceph_oid_init(struct ceph_object_id *oid)
			
 
				+{
			
 
				+	oid->name = oid->inline_name;
			
 
				+	oid->name_len = 0;
			
 
				+}
			
 
				+
			
 
				+static inline bool ceph_oid_empty(const struct ceph_object_id *oid)
			
 
				+{
			
 
				+	return oid->name == oid->inline_name && !oid->name_len;
			
 
				+}
			
 
				+
			
 
				+void ceph_oid_copy(struct ceph_object_id *dest,
			
 
				+		   const struct ceph_object_id *src);
			
 
				+__printf(2, 3)
			
 
				+void ceph_oid_printf(struct ceph_object_id *oid, const char *fmt, ...);
			
 
				+__printf(3, 4)
			
 
				+int ceph_oid_aprintf(struct ceph_object_id *oid, gfp_t gfp,
			
 
				+		     const char *fmt, ...);
			
 
				+void ceph_oid_destroy(struct ceph_object_id *oid);
			
 
				+
			
 
				 struct ceph_pg_mapping {
			
 
				 	struct rb_node node;
			
 
				 	struct ceph_pg pgid;
			
@@ -87,7 +147,6 @@ struct ceph_pg_mapping {
 
				 struct ceph_osdmap {
			
 
				 	struct ceph_fsid fsid;
			
 
				 	u32 epoch;
			
 
				-	u32 mkfs_epoch;
			
 
				 	struct ceph_timespec created, modified;
			
 
				 
			
 
				 	u32 flags;         /* CEPH_OSDMAP_* */
			
@@ -113,43 +172,19 @@ struct ceph_osdmap {
 
				 	int crush_scratch_ary[CEPH_PG_MAX_SIZE * 3];
			
 
				 };
			
 
				 
			
 
				-static inline void ceph_oid_set_name(struct ceph_object_id *oid,
			
 
				-				     const char *name)
			
 
				-{
			
 
				-	int len;
			
 
				-
			
 
				-	len = strlen(name);
			
 
				-	if (len > sizeof(oid->name)) {
			
 
				-		WARN(1, "ceph_oid_set_name '%s' len %d vs %zu, truncating\n",
			
 
				-		     name, len, sizeof(oid->name));
			
 
				-		len = sizeof(oid->name);
			
 
				-	}
			
 
				-
			
 
				-	memcpy(oid->name, name, len);
			
 
				-	oid->name_len = len;
			
 
				-}
			
 
				-
			
 
				-static inline void ceph_oid_copy(struct ceph_object_id *dest,
			
 
				-				 struct ceph_object_id *src)
			
 
				-{
			
 
				-	BUG_ON(src->name_len > sizeof(dest->name));
			
 
				-	memcpy(dest->name, src->name, src->name_len);
			
 
				-	dest->name_len = src->name_len;
			
 
				-}
			
 
				-
			
 
				-static inline int ceph_osd_exists(struct ceph_osdmap *map, int osd)
			
 
				+static inline bool ceph_osd_exists(struct ceph_osdmap *map, int osd)
			
 
				 {
			
 
				 	return osd >= 0 && osd < map->max_osd &&
			
 
				 	       (map->osd_state[osd] & CEPH_OSD_EXISTS);
			
 
				 }
			
 
				 
			
 
				-static inline int ceph_osd_is_up(struct ceph_osdmap *map, int osd)
			
 
				+static inline bool ceph_osd_is_up(struct ceph_osdmap *map, int osd)
			
 
				 {
			
 
				 	return ceph_osd_exists(map, osd) &&
			
 
				 	       (map->osd_state[osd] & CEPH_OSD_UP);
			
 
				 }
			
 
				 
			
 
				-static inline int ceph_osd_is_down(struct ceph_osdmap *map, int osd)
			
 
				+static inline bool ceph_osd_is_down(struct ceph_osdmap *map, int osd)
			
 
				 {
			
 
				 	return !ceph_osd_is_up(map, osd);
			
 
				 }
			
@@ -192,28 +227,59 @@ static inline int ceph_decode_pgid(void **p, void *end, struct ceph_pg *pgid)
 
				 	return 0;
			
 
				 }
			
 
				 
			
 
				+struct ceph_osdmap *ceph_osdmap_alloc(void);
			
 
				 extern struct ceph_osdmap *ceph_osdmap_decode(void **p, void *end);
			
 
				-extern struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
			
 
				-					    struct ceph_osdmap *map,
			
 
				-					    struct ceph_messenger *msgr);
			
 
				+struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
			
 
				+					     struct ceph_osdmap *map);
			
 
				 extern void ceph_osdmap_destroy(struct ceph_osdmap *map);
			
 
				 
			
 
				+struct ceph_osds {
			
 
				+	int osds[CEPH_PG_MAX_SIZE];
			
 
				+	int size;
			
 
				+	int primary; /* id, NOT index */
			
 
				+};
			
 
				+
			
 
				+static inline void ceph_osds_init(struct ceph_osds *set)
			
 
				+{
			
 
				+	set->size = 0;
			
 
				+	set->primary = -1;
			
 
				+}
			
 
				+
			
 
				+void ceph_osds_copy(struct ceph_osds *dest, const struct ceph_osds *src);
			
 
				+
			
 
				+bool ceph_is_new_interval(const struct ceph_osds *old_acting,
			
 
				+			  const struct ceph_osds *new_acting,
			
 
				+			  const struct ceph_osds *old_up,
			
 
				+			  const struct ceph_osds *new_up,
			
 
				+			  int old_size,
			
 
				+			  int new_size,
			
 
				+			  int old_min_size,
			
 
				+			  int new_min_size,
			
 
				+			  u32 old_pg_num,
			
 
				+			  u32 new_pg_num,
			
 
				+			  bool old_sort_bitwise,
			
 
				+			  bool new_sort_bitwise,
			
 
				+			  const struct ceph_pg *pgid);
			
 
				+bool ceph_osds_changed(const struct ceph_osds *old_acting,
			
 
				+		       const struct ceph_osds *new_acting,
			
 
				+		       bool any_change);
			
 
				+
			
 
				 /* calculate mapping of a file extent to an object */
			
 
				 extern int ceph_calc_file_object_mapping(struct ceph_file_layout *layout,
			
 
				 					 u64 off, u64 len,
			
 
				 					 u64 *bno, u64 *oxoff, u64 *oxlen);
			
 
				 
			
 
				-/* calculate mapping of object to a placement group */
			
 
				-extern int ceph_oloc_oid_to_pg(struct ceph_osdmap *osdmap,
			
 
				-			       struct ceph_object_locator *oloc,
			
 
				-			       struct ceph_object_id *oid,
			
 
				-			       struct ceph_pg *pg_out);
			
 
				-
			
 
				-extern int ceph_calc_pg_acting(struct ceph_osdmap *osdmap,
			
 
				-			       struct ceph_pg pgid,
			
 
				-			       int *osds, int *primary);
			
 
				-extern int ceph_calc_pg_primary(struct ceph_osdmap *osdmap,
			
 
				-				struct ceph_pg pgid);
			
 
				+int ceph_object_locator_to_pg(struct ceph_osdmap *osdmap,
			
 
				+			      struct ceph_object_id *oid,
			
 
				+			      struct ceph_object_locator *oloc,
			
 
				+			      struct ceph_pg *raw_pgid);
			
 
				+
			
 
				+void ceph_pg_to_up_acting_osds(struct ceph_osdmap *osdmap,
			
 
				+			       const struct ceph_pg *raw_pgid,
			
 
				+			       struct ceph_osds *up,
			
 
				+			       struct ceph_osds *acting);
			
 
				+int ceph_pg_to_acting_primary(struct ceph_osdmap *osdmap,
			
 
				+			      const struct ceph_pg *raw_pgid);
			
 
				 
			
 
				 extern struct ceph_pg_pool_info *ceph_pg_pool_by_id(struct ceph_osdmap *map,
			
 
				 						    u64 id);
			
--- a/include/linux/ceph/rados.h
+++ b/include/linux/ceph/rados.h
@@ -114,8 +114,8 @@ struct ceph_object_layout {
 
				  * compound epoch+version, used by storage layer to serialize mutations
			
 
				  */
			
 
				 struct ceph_eversion {
			
 
				-	__le32 epoch;
			
 
				 	__le64 version;
			
 
				+	__le32 epoch;
			
 
				 } __attribute__ ((packed));
			
 
				 
			
 
				 /*
			
@@ -153,6 +153,11 @@ extern const char *ceph_osd_state_name(int s);
 
				 #define CEPH_OSDMAP_NOIN     (1<<8)  /* block osd auto mark-in */
			
 
				 #define CEPH_OSDMAP_NOBACKFILL (1<<9) /* block osd backfill */
			
 
				 #define CEPH_OSDMAP_NORECOVER (1<<10) /* block osd recovery and backfill */
			
 
				+#define CEPH_OSDMAP_NOSCRUB  (1<<11) /* block periodic scrub */
			
 
				+#define CEPH_OSDMAP_NODEEP_SCRUB (1<<12) /* block periodic deep-scrub */
			
 
				+#define CEPH_OSDMAP_NOTIERAGENT (1<<13) /* disable tiering agent */
			
 
				+#define CEPH_OSDMAP_NOREBALANCE (1<<14) /* block osd backfill unless pg is degraded */
			
 
				+#define CEPH_OSDMAP_SORTBITWISE (1<<15) /* use bitwise hobject_t sort */
			
 
				 
			
 
				 /*
			
 
				  * The error code to return when an OSD can't handle a write
			
@@ -389,6 +394,13 @@ enum {
 
				 	CEPH_OSD_FLAG_SKIPRWLOCKS =   0x10000,  /* skip rw locks */
			
 
				 	CEPH_OSD_FLAG_IGNORE_OVERLAY = 0x20000, /* ignore pool overlay */
			
 
				 	CEPH_OSD_FLAG_FLUSH =         0x40000,  /* this is part of flush */
			
 
				+	CEPH_OSD_FLAG_MAP_SNAP_CLONE = 0x80000,  /* map snap direct to clone id */
			
 
				+	CEPH_OSD_FLAG_ENFORCE_SNAPC   = 0x100000,  /* use snapc provided even if
			
 
				+						      pool uses pool snaps */
			
 
				+	CEPH_OSD_FLAG_REDIRECTED   = 0x200000,  /* op has been redirected */
			
 
				+	CEPH_OSD_FLAG_KNOWN_REDIR = 0x400000,  /* redirect bit is authoritative */
			
 
				+	CEPH_OSD_FLAG_FULL_TRY =    0x800000,  /* try op despite full flag */
			
 
				+	CEPH_OSD_FLAG_FULL_FORCE = 0x1000000,  /* force op despite full flag */
			
 
				 };
			
 
				 
			
 
				 enum {
			
@@ -415,7 +427,17 @@ enum {
 
				 	CEPH_OSD_CMPXATTR_MODE_U64    = 2
			
 
				 };
			
 
				 
			
 
				-#define RADOS_NOTIFY_VER	1
			
 
				+enum {
			
 
				+	CEPH_OSD_WATCH_OP_UNWATCH = 0,
			
 
				+	CEPH_OSD_WATCH_OP_LEGACY_WATCH = 1,
			
 
				+	/* note: use only ODD ids to prevent pre-giant code from
			
 
				+	   interpreting the op as UNWATCH */
			
 
				+	CEPH_OSD_WATCH_OP_WATCH = 3,
			
 
				+	CEPH_OSD_WATCH_OP_RECONNECT = 5,
			
 
				+	CEPH_OSD_WATCH_OP_PING = 7,
			
 
				+};
			
 
				+
			
 
				+const char *ceph_osd_watch_op_name(int o);
			
 
				 
			
 
				 /*
			
 
				  * an individual object operation.  each may be accompanied by some data
			
@@ -450,9 +472,13 @@ struct ceph_osd_op {
 
				 	        } __attribute__ ((packed)) snap;
			
 
				 		struct {
			
 
				 			__le64 cookie;
			
 
				-			__le64 ver;
			
 
				-			__u8 flag;	/* 0 = unwatch, 1 = watch */
			
 
				+			__le64 ver;     /* no longer used */
			
 
				+			__u8 op;	/* CEPH_OSD_WATCH_OP_* */
			
 
				+			__le32 gen;     /* registration generation */
			
 
				 		} __attribute__ ((packed)) watch;
			
 
				+		struct {
			
 
				+			__le64 cookie;
			
 
				+		} __attribute__ ((packed)) notify;
			
 
				 		struct {
			
 
				 			__le64 offset, length;
			
 
				 			__le64 src_offset;
			
--- a/net/ceph/ceph_common.c
+++ b/net/ceph/ceph_common.c
@@ -651,7 +651,7 @@ EXPORT_SYMBOL(ceph_destroy_client);
 
				 /*
			
 
				  * true if we have the mon map (and have thus joined the cluster)
			
 
				  */
			
 
				-static int have_mon_and_osd_map(struct ceph_client *client)
			
 
				+static bool have_mon_and_osd_map(struct ceph_client *client)
			
 
				 {
			
 
				 	return client->monc.monmap && client->monc.monmap->epoch &&
			
 
				 	       client->osdc.osdmap && client->osdc.osdmap->epoch;
			
--- a/net/ceph/ceph_strings.c
+++ b/net/ceph/ceph_strings.c
@@ -27,6 +27,22 @@ __CEPH_FORALL_OSD_OPS(GENERATE_CASE)
 
				 	}
			
 
				 }
			
 
				 
			
 
				+const char *ceph_osd_watch_op_name(int o)
			
 
				+{
			
 
				+	switch (o) {
			
 
				+	case CEPH_OSD_WATCH_OP_UNWATCH:
			
 
				+		return "unwatch";
			
 
				+	case CEPH_OSD_WATCH_OP_WATCH:
			
 
				+		return "watch";
			
 
				+	case CEPH_OSD_WATCH_OP_RECONNECT:
			
 
				+		return "reconnect";
			
 
				+	case CEPH_OSD_WATCH_OP_PING:
			
 
				+		return "ping";
			
 
				+	default:
			
 
				+		return "???";
			
 
				+	}
			
 
				+}
			
 
				+
			
 
				 const char *ceph_osd_state_name(int s)
			
 
				 {
			
 
				 	switch (s) {
			
--- a/net/ceph/debugfs.c
+++ b/net/ceph/debugfs.c
@@ -54,24 +54,25 @@ static int osdmap_show(struct seq_file *s, void *p)
 
				 {
			
 
				 	int i;
			
 
				 	struct ceph_client *client = s->private;
			
 
				-	struct ceph_osdmap *map = client->osdc.osdmap;
			
 
				+	struct ceph_osd_client *osdc = &client->osdc;
			
 
				+	struct ceph_osdmap *map = osdc->osdmap;
			
 
				 	struct rb_node *n;
			
 
				 
			
 
				 	if (map == NULL)
			
 
				 		return 0;
			
 
				 
			
 
				-	seq_printf(s, "epoch %d\n", map->epoch);
			
 
				-	seq_printf(s, "flags%s%s\n",
			
 
				-		   (map->flags & CEPH_OSDMAP_NEARFULL) ?  " NEARFULL" : "",
			
 
				-		   (map->flags & CEPH_OSDMAP_FULL) ?  " FULL" : "");
			
 
				+	down_read(&osdc->lock);
			
 
				+	seq_printf(s, "epoch %d flags 0x%x\n", map->epoch, map->flags);
			
 
				 
			
 
				 	for (n = rb_first(&map->pg_pools); n; n = rb_next(n)) {
			
 
				-		struct ceph_pg_pool_info *pool =
			
 
				+		struct ceph_pg_pool_info *pi =
			
 
				 			rb_entry(n, struct ceph_pg_pool_info, node);
			
 
				 
			
 
				-		seq_printf(s, "pool %lld pg_num %u (%d) read_tier %lld write_tier %lld\n",
			
 
				-			   pool->id, pool->pg_num, pool->pg_num_mask,
			
 
				-			   pool->read_tier, pool->write_tier);
			
 
				+		seq_printf(s, "pool %lld '%s' type %d size %d min_size %d pg_num %u pg_num_mask %d flags 0x%llx lfor %u read_tier %lld write_tier %lld\n",
			
 
				+			   pi->id, pi->name, pi->type, pi->size, pi->min_size,
			
 
				+			   pi->pg_num, pi->pg_num_mask, pi->flags,
			
 
				+			   pi->last_force_request_resend, pi->read_tier,
			
 
				+			   pi->write_tier);
			
 
				 	}
			
 
				 	for (i = 0; i < map->max_osd; i++) {
			
 
				 		struct ceph_entity_addr *addr = &map->osd_addr[i];
			
@@ -103,6 +104,7 @@ static int osdmap_show(struct seq_file *s, void *p)
 
				 			   pg->pgid.seed, pg->primary_temp.osd);
			
 
				 	}
			
 
				 
			
 
				+	up_read(&osdc->lock);
			
 
				 	return 0;
			
 
				 }
			
 
				 
			
@@ -126,6 +128,7 @@ static int monc_show(struct seq_file *s, void *p)
 
				 					CEPH_SUBSCRIBE_ONETIME ?  "" : "+"));
			
 
				 		seq_putc(s, '\n');
			
 
				 	}
			
 
				+	seq_printf(s, "fs_cluster_id %d\n", monc->fs_cluster_id);
			
 
				 
			
 
				 	for (rp = rb_first(&monc->generic_request_tree); rp; rp = rb_next(rp)) {
			
 
				 		__u16 op;
			
@@ -143,43 +146,113 @@ static int monc_show(struct seq_file *s, void *p)
 
				 	return 0;
			
 
				 }
			
 
				 
			
 
				-static int osdc_show(struct seq_file *s, void *pp)
			
 
				+static void dump_target(struct seq_file *s, struct ceph_osd_request_target *t)
			
 
				 {
			
 
				-	struct ceph_client *client = s->private;
			
 
				-	struct ceph_osd_client *osdc = &client->osdc;
			
 
				-	struct rb_node *p;
			
 
				+	int i;
			
 
				 
			
 
				-	mutex_lock(&osdc->request_mutex);
			
 
				-	for (p = rb_first(&osdc->requests); p; p = rb_next(p)) {
			
 
				-		struct ceph_osd_request *req;
			
 
				-		unsigned int i;
			
 
				-		int opcode;
			
 
				+	seq_printf(s, "osd%d\t%llu.%x\t[", t->osd, t->pgid.pool, t->pgid.seed);
			
 
				+	for (i = 0; i < t->up.size; i++)
			
 
				+		seq_printf(s, "%s%d", (!i ? "" : ","), t->up.osds[i]);
			
 
				+	seq_printf(s, "]/%d\t[", t->up.primary);
			
 
				+	for (i = 0; i < t->acting.size; i++)
			
 
				+		seq_printf(s, "%s%d", (!i ? "" : ","), t->acting.osds[i]);
			
 
				+	seq_printf(s, "]/%d\t%*pE\t0x%x", t->acting.primary,
			
 
				+		   t->target_oid.name_len, t->target_oid.name, t->flags);
			
 
				+	if (t->paused)
			
 
				+		seq_puts(s, "\tP");
			
 
				+}
			
 
				 
			
 
				-		req = rb_entry(p, struct ceph_osd_request, r_node);
			
 
				+static void dump_request(struct seq_file *s, struct ceph_osd_request *req)
			
 
				+{
			
 
				+	int i;
			
 
				 
			
 
				-		seq_printf(s, "%lld\tosd%d\t%lld.%x\t", req->r_tid,
			
 
				-			   req->r_osd ? req->r_osd->o_osd : -1,
			
 
				-			   req->r_pgid.pool, req->r_pgid.seed);
			
 
				+	seq_printf(s, "%llu\t", req->r_tid);
			
 
				+	dump_target(s, &req->r_t);
			
 
				 
			
 
				-		seq_printf(s, "%.*s", req->r_base_oid.name_len,
			
 
				-			   req->r_base_oid.name);
			
 
				+	seq_printf(s, "\t%d\t%u'%llu", req->r_attempts,
			
 
				+		   le32_to_cpu(req->r_replay_version.epoch),
			
 
				+		   le64_to_cpu(req->r_replay_version.version));
			
 
				 
			
 
				-		if (req->r_reassert_version.epoch)
			
 
				-			seq_printf(s, "\t%u'%llu",
			
 
				-			   (unsigned int)le32_to_cpu(req->r_reassert_version.epoch),
			
 
				-			   le64_to_cpu(req->r_reassert_version.version));
			
 
				-		else
			
 
				-			seq_printf(s, "\t");
			
 
				+	for (i = 0; i < req->r_num_ops; i++) {
			
 
				+		struct ceph_osd_req_op *op = &req->r_ops[i];
			
 
				+
			
 
				+		seq_printf(s, "%s%s", (i == 0 ? "\t" : ","),
			
 
				+			   ceph_osd_op_name(op->op));
			
 
				+		if (op->op == CEPH_OSD_OP_WATCH)
			
 
				+			seq_printf(s, "-%s",
			
 
				+				   ceph_osd_watch_op_name(op->watch.op));
			
 
				+	}
			
 
				+
			
 
				+	seq_putc(s, '\n');
			
 
				+}
			
 
				+
			
 
				+static void dump_requests(struct seq_file *s, struct ceph_osd *osd)
			
 
				+{
			
 
				+	struct rb_node *n;
			
 
				+
			
 
				+	mutex_lock(&osd->lock);
			
 
				+	for (n = rb_first(&osd->o_requests); n; n = rb_next(n)) {
			
 
				+		struct ceph_osd_request *req =
			
 
				+		    rb_entry(n, struct ceph_osd_request, r_node);
			
 
				+
			
 
				+		dump_request(s, req);
			
 
				+	}
			
 
				+
			
 
				+	mutex_unlock(&osd->lock);
			
 
				+}
			
 
				 
			
 
				-		for (i = 0; i < req->r_num_ops; i++) {
			
 
				-			opcode = req->r_ops[i].op;
			
 
				-			seq_printf(s, "%s%s", (i == 0 ? "\t" : ","),
			
 
				-				   ceph_osd_op_name(opcode));
			
 
				-		}
			
 
				+static void dump_linger_request(struct seq_file *s,
			
 
				+				struct ceph_osd_linger_request *lreq)
			
 
				+{
			
 
				+	seq_printf(s, "%llu\t", lreq->linger_id);
			
 
				+	dump_target(s, &lreq->t);
			
 
				+
			
 
				+	seq_printf(s, "\t%u\t%s%s/%d\n", lreq->register_gen,
			
 
				+		   lreq->is_watch ? "W" : "N", lreq->committed ? "C" : "",
			
 
				+		   lreq->last_error);
			
 
				+}
			
 
				+
			
 
				+static void dump_linger_requests(struct seq_file *s, struct ceph_osd *osd)
			
 
				+{
			
 
				+	struct rb_node *n;
			
 
				+
			
 
				+	mutex_lock(&osd->lock);
			
 
				+	for (n = rb_first(&osd->o_linger_requests); n; n = rb_next(n)) {
			
 
				+		struct ceph_osd_linger_request *lreq =
			
 
				+		    rb_entry(n, struct ceph_osd_linger_request, node);
			
 
				+
			
 
				+		dump_linger_request(s, lreq);
			
 
				+	}
			
 
				+
			
 
				+	mutex_unlock(&osd->lock);
			
 
				+}
			
 
				 
			
 
				-		seq_printf(s, "\n");
			
 
				+static int osdc_show(struct seq_file *s, void *pp)
			
 
				+{
			
 
				+	struct ceph_client *client = s->private;
			
 
				+	struct ceph_osd_client *osdc = &client->osdc;
			
 
				+	struct rb_node *n;
			
 
				+
			
 
				+	down_read(&osdc->lock);
			
 
				+	seq_printf(s, "REQUESTS %d homeless %d\n",
			
 
				+		   atomic_read(&osdc->num_requests),
			
 
				+		   atomic_read(&osdc->num_homeless));
			
 
				+	for (n = rb_first(&osdc->osds); n; n = rb_next(n)) {
			
 
				+		struct ceph_osd *osd = rb_entry(n, struct ceph_osd, o_node);
			
 
				+
			
 
				+		dump_requests(s, osd);
			
 
				 	}
			
 
				-	mutex_unlock(&osdc->request_mutex);
			
 
				+	dump_requests(s, &osdc->homeless_osd);
			
 
				+
			
 
				+	seq_puts(s, "LINGER REQUESTS\n");
			
 
				+	for (n = rb_first(&osdc->osds); n; n = rb_next(n)) {
			
 
				+		struct ceph_osd *osd = rb_entry(n, struct ceph_osd, o_node);
			
 
				+
			
 
				+		dump_linger_requests(s, osd);
			
 
				+	}
			
 
				+	dump_linger_requests(s, &osdc->homeless_osd);
			
 
				+
			
 
				+	up_read(&osdc->lock);
			
 
				 	return 0;
			
 
				 }
			
 
				 
			
--- a/net/ceph/mon_client.c
+++ b/net/ceph/mon_client.c
@@ -260,20 +260,26 @@ static void __send_subscribe(struct ceph_mon_client *monc)
 
				 	BUG_ON(num < 1); /* monmap sub is always there */
			
 
				 	ceph_encode_32(&p, num);
			
 
				 	for (i = 0; i < ARRAY_SIZE(monc->subs); i++) {
			
 
				-		const char *s = ceph_sub_str[i];
			
 
				+		char buf[32];
			
 
				+		int len;
			
 
				 
			
 
				 		if (!monc->subs[i].want)
			
 
				 			continue;
			
 
				 
			
 
				-		dout("%s %s start %llu flags 0x%x\n", __func__, s,
			
 
				+		len = sprintf(buf, "%s", ceph_sub_str[i]);
			
 
				+		if (i == CEPH_SUB_MDSMAP &&
			
 
				+		    monc->fs_cluster_id != CEPH_FS_CLUSTER_ID_NONE)
			
 
				+			len += sprintf(buf + len, ".%d", monc->fs_cluster_id);
			
 
				+
			
 
				+		dout("%s %s start %llu flags 0x%x\n", __func__, buf,
			
 
				 		     le64_to_cpu(monc->subs[i].item.start),
			
 
				 		     monc->subs[i].item.flags);
			
 
				-		ceph_encode_string(&p, end, s, strlen(s));
			
 
				+		ceph_encode_string(&p, end, buf, len);
			
 
				 		memcpy(p, &monc->subs[i].item, sizeof(monc->subs[i].item));
			
 
				 		p += sizeof(monc->subs[i].item);
			
 
				 	}
			
 
				 
			
 
				-	BUG_ON(p != (end - 35 - (ARRAY_SIZE(monc->subs) - num) * 19));
			
 
				+	BUG_ON(p > end);
			
 
				 	msg->front.iov_len = p - msg->front.iov_base;
			
 
				 	msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
			
 
				 	ceph_msg_revoke(msg);
			
@@ -376,19 +382,13 @@ void ceph_monc_got_map(struct ceph_mon_client *monc, int sub, u32 epoch)
 
				 }
			
 
				 EXPORT_SYMBOL(ceph_monc_got_map);
			
 
				 
			
 
				-/*
			
 
				- * Register interest in the next osdmap
			
 
				- */
			
 
				-void ceph_monc_request_next_osdmap(struct ceph_mon_client *monc)
			
 
				+void ceph_monc_renew_subs(struct ceph_mon_client *monc)
			
 
				 {
			
 
				-	dout("%s have %u\n", __func__, monc->subs[CEPH_SUB_OSDMAP].have);
			
 
				 	mutex_lock(&monc->mutex);
			
 
				-	if (__ceph_monc_want_map(monc, CEPH_SUB_OSDMAP,
			
 
				-				 monc->subs[CEPH_SUB_OSDMAP].have + 1, false))
			
 
				-		__send_subscribe(monc);
			
 
				+	__send_subscribe(monc);
			
 
				 	mutex_unlock(&monc->mutex);
			
 
				 }
			
 
				-EXPORT_SYMBOL(ceph_monc_request_next_osdmap);
			
 
				+EXPORT_SYMBOL(ceph_monc_renew_subs);
			
 
				 
			
 
				 /*
			
 
				  * Wait for an osdmap with a given epoch.
			
@@ -478,51 +478,17 @@ static void ceph_monc_handle_map(struct ceph_mon_client *monc,
 
				 /*
			
 
				  * generic requests (currently statfs, mon_get_version)
			
 
				  */
			
 
				-static struct ceph_mon_generic_request *__lookup_generic_req(
			
 
				-	struct ceph_mon_client *monc, u64 tid)
			
 
				-{
			
 
				-	struct ceph_mon_generic_request *req;
			
 
				-	struct rb_node *n = monc->generic_request_tree.rb_node;
			
 
				-
			
 
				-	while (n) {
			
 
				-		req = rb_entry(n, struct ceph_mon_generic_request, node);
			
 
				-		if (tid < req->tid)
			
 
				-			n = n->rb_left;
			
 
				-		else if (tid > req->tid)
			
 
				-			n = n->rb_right;
			
 
				-		else
			
 
				-			return req;
			
 
				-	}
			
 
				-	return NULL;
			
 
				-}
			
 
				-
			
 
				-static void __insert_generic_request(struct ceph_mon_client *monc,
			
 
				-			    struct ceph_mon_generic_request *new)
			
 
				-{
			
 
				-	struct rb_node **p = &monc->generic_request_tree.rb_node;
			
 
				-	struct rb_node *parent = NULL;
			
 
				-	struct ceph_mon_generic_request *req = NULL;
			
 
				-
			
 
				-	while (*p) {
			
 
				-		parent = *p;
			
 
				-		req = rb_entry(parent, struct ceph_mon_generic_request, node);
			
 
				-		if (new->tid < req->tid)
			
 
				-			p = &(*p)->rb_left;
			
 
				-		else if (new->tid > req->tid)
			
 
				-			p = &(*p)->rb_right;
			
 
				-		else
			
 
				-			BUG();
			
 
				-	}
			
 
				-
			
 
				-	rb_link_node(&new->node, parent, p);
			
 
				-	rb_insert_color(&new->node, &monc->generic_request_tree);
			
 
				-}
			
 
				+DEFINE_RB_FUNCS(generic_request, struct ceph_mon_generic_request, tid, node)
			
 
				 
			
 
				 static void release_generic_request(struct kref *kref)
			
 
				 {
			
 
				 	struct ceph_mon_generic_request *req =
			
 
				 		container_of(kref, struct ceph_mon_generic_request, kref);
			
 
				 
			
 
				+	dout("%s greq %p request %p reply %p\n", __func__, req, req->request,
			
 
				+	     req->reply);
			
 
				+	WARN_ON(!RB_EMPTY_NODE(&req->node));
			
 
				+
			
 
				 	if (req->reply)
			
 
				 		ceph_msg_put(req->reply);
			
 
				 	if (req->request)
			
@@ -533,7 +499,8 @@ static void release_generic_request(struct kref *kref)
 
				 
			
 
				 static void put_generic_request(struct ceph_mon_generic_request *req)
			
 
				 {
			
 
				-	kref_put(&req->kref, release_generic_request);
			
 
				+	if (req)
			
 
				+		kref_put(&req->kref, release_generic_request);
			
 
				 }
			
 
				 
			
 
				 static void get_generic_request(struct ceph_mon_generic_request *req)
			
@@ -541,6 +508,103 @@ static void get_generic_request(struct ceph_mon_generic_request *req)
 
				 	kref_get(&req->kref);
			
 
				 }
			
 
				 
			
 
				+static struct ceph_mon_generic_request *
			
 
				+alloc_generic_request(struct ceph_mon_client *monc, gfp_t gfp)
			
 
				+{
			
 
				+	struct ceph_mon_generic_request *req;
			
 
				+
			
 
				+	req = kzalloc(sizeof(*req), gfp);
			
 
				+	if (!req)
			
 
				+		return NULL;
			
 
				+
			
 
				+	req->monc = monc;
			
 
				+	kref_init(&req->kref);
			
 
				+	RB_CLEAR_NODE(&req->node);
			
 
				+	init_completion(&req->completion);
			
 
				+
			
 
				+	dout("%s greq %p\n", __func__, req);
			
 
				+	return req;
			
 
				+}
			
 
				+
			
 
				+static void register_generic_request(struct ceph_mon_generic_request *req)
			
 
				+{
			
 
				+	struct ceph_mon_client *monc = req->monc;
			
 
				+
			
 
				+	WARN_ON(req->tid);
			
 
				+
			
 
				+	get_generic_request(req);
			
 
				+	req->tid = ++monc->last_tid;
			
 
				+	insert_generic_request(&monc->generic_request_tree, req);
			
 
				+}
			
 
				+
			
 
				+static void send_generic_request(struct ceph_mon_client *monc,
			
 
				+				 struct ceph_mon_generic_request *req)
			
 
				+{
			
 
				+	WARN_ON(!req->tid);
			
 
				+
			
 
				+	dout("%s greq %p tid %llu\n", __func__, req, req->tid);
			
 
				+	req->request->hdr.tid = cpu_to_le64(req->tid);
			
 
				+	ceph_con_send(&monc->con, ceph_msg_get(req->request));
			
 
				+}
			
 
				+
			
 
				+static void __finish_generic_request(struct ceph_mon_generic_request *req)
			
 
				+{
			
 
				+	struct ceph_mon_client *monc = req->monc;
			
 
				+
			
 
				+	dout("%s greq %p tid %llu\n", __func__, req, req->tid);
			
 
				+	erase_generic_request(&monc->generic_request_tree, req);
			
 
				+
			
 
				+	ceph_msg_revoke(req->request);
			
 
				+	ceph_msg_revoke_incoming(req->reply);
			
 
				+}
			
 
				+
			
 
				+static void finish_generic_request(struct ceph_mon_generic_request *req)
			
 
				+{
			
 
				+	__finish_generic_request(req);
			
 
				+	put_generic_request(req);
			
 
				+}
			
 
				+
			
 
				+static void complete_generic_request(struct ceph_mon_generic_request *req)
			
 
				+{
			
 
				+	if (req->complete_cb)
			
 
				+		req->complete_cb(req);
			
 
				+	else
			
 
				+		complete_all(&req->completion);
			
 
				+	put_generic_request(req);
			
 
				+}
			
 
				+
			
 
				+void cancel_generic_request(struct ceph_mon_generic_request *req)
			
 
				+{
			
 
				+	struct ceph_mon_client *monc = req->monc;
			
 
				+	struct ceph_mon_generic_request *lookup_req;
			
 
				+
			
 
				+	dout("%s greq %p tid %llu\n", __func__, req, req->tid);
			
 
				+
			
 
				+	mutex_lock(&monc->mutex);
			
 
				+	lookup_req = lookup_generic_request(&monc->generic_request_tree,
			
 
				+					    req->tid);
			
 
				+	if (lookup_req) {
			
 
				+		WARN_ON(lookup_req != req);
			
 
				+		finish_generic_request(req);
			
 
				+	}
			
 
				+
			
 
				+	mutex_unlock(&monc->mutex);
			
 
				+}
			
 
				+
			
 
				+static int wait_generic_request(struct ceph_mon_generic_request *req)
			
 
				+{
			
 
				+	int ret;
			
 
				+
			
 
				+	dout("%s greq %p tid %llu\n", __func__, req, req->tid);
			
 
				+	ret = wait_for_completion_interruptible(&req->completion);
			
 
				+	if (ret)
			
 
				+		cancel_generic_request(req);
			
 
				+	else
			
 
				+		ret = req->result; /* completed */
			
 
				+
			
 
				+	return ret;
			
 
				+}
			
 
				+
			
 
				 static struct ceph_msg *get_generic_reply(struct ceph_connection *con,
			
 
				 					 struct ceph_msg_header *hdr,
			
 
				 					 int *skip)
			
@@ -551,7 +615,7 @@ static struct ceph_msg *get_generic_reply(struct ceph_connection *con,
 
				 	struct ceph_msg *m;
			
 
				 
			
 
				 	mutex_lock(&monc->mutex);
			
 
				-	req = __lookup_generic_req(monc, tid);
			
 
				+	req = lookup_generic_request(&monc->generic_request_tree, tid);
			
 
				 	if (!req) {
			
 
				 		dout("get_generic_reply %lld dne\n", tid);
			
 
				 		*skip = 1;
			
@@ -570,42 +634,6 @@ static struct ceph_msg *get_generic_reply(struct ceph_connection *con,
 
				 	return m;
			
 
				 }
			
 
				 
			
 
				-static int __do_generic_request(struct ceph_mon_client *monc, u64 tid,
			
 
				-				struct ceph_mon_generic_request *req)
			
 
				-{
			
 
				-	int err;
			
 
				-
			
 
				-	/* register request */
			
 
				-	req->tid = tid != 0 ? tid : ++monc->last_tid;
			
 
				-	req->request->hdr.tid = cpu_to_le64(req->tid);
			
 
				-	__insert_generic_request(monc, req);
			
 
				-	monc->num_generic_requests++;
			
 
				-	ceph_con_send(&monc->con, ceph_msg_get(req->request));
			
 
				-	mutex_unlock(&monc->mutex);
			
 
				-
			
 
				-	err = wait_for_completion_interruptible(&req->completion);
			
 
				-
			
 
				-	mutex_lock(&monc->mutex);
			
 
				-	rb_erase(&req->node, &monc->generic_request_tree);
			
 
				-	monc->num_generic_requests--;
			
 
				-
			
 
				-	if (!err)
			
 
				-		err = req->result;
			
 
				-	return err;
			
 
				-}
			
 
				-
			
 
				-static int do_generic_request(struct ceph_mon_client *monc,
			
 
				-			      struct ceph_mon_generic_request *req)
			
 
				-{
			
 
				-	int err;
			
 
				-
			
 
				-	mutex_lock(&monc->mutex);
			
 
				-	err = __do_generic_request(monc, 0, req);
			
 
				-	mutex_unlock(&monc->mutex);
			
 
				-
			
 
				-	return err;
			
 
				-}
			
 
				-
			
 
				 /*
			
 
				  * statfs
			
 
				  */
			
@@ -616,22 +644,24 @@ static void handle_statfs_reply(struct ceph_mon_client *monc,
 
				 	struct ceph_mon_statfs_reply *reply = msg->front.iov_base;
			
 
				 	u64 tid = le64_to_cpu(msg->hdr.tid);
			
 
				 
			
 
				+	dout("%s msg %p tid %llu\n", __func__, msg, tid);
			
 
				+
			
 
				 	if (msg->front.iov_len != sizeof(*reply))
			
 
				 		goto bad;
			
 
				-	dout("handle_statfs_reply %p tid %llu\n", msg, tid);
			
 
				 
			
 
				 	mutex_lock(&monc->mutex);
			
 
				-	req = __lookup_generic_req(monc, tid);
			
 
				-	if (req) {
			
 
				-		*(struct ceph_statfs *)req->buf = reply->st;
			
 
				-		req->result = 0;
			
 
				-		get_generic_request(req);
			
 
				+	req = lookup_generic_request(&monc->generic_request_tree, tid);
			
 
				+	if (!req) {
			
 
				+		mutex_unlock(&monc->mutex);
			
 
				+		return;
			
 
				 	}
			
 
				+
			
 
				+	req->result = 0;
			
 
				+	*req->u.st = reply->st; /* struct */
			
 
				+	__finish_generic_request(req);
			
 
				 	mutex_unlock(&monc->mutex);
			
 
				-	if (req) {
			
 
				-		complete_all(&req->completion);
			
 
				-		put_generic_request(req);
			
 
				-	}
			
 
				+
			
 
				+	complete_generic_request(req);
			
 
				 	return;
			
 
				 
			
 
				 bad:
			
@@ -646,38 +676,38 @@ int ceph_monc_do_statfs(struct ceph_mon_client *monc, struct ceph_statfs *buf)
 
				 {
			
 
				 	struct ceph_mon_generic_request *req;
			
 
				 	struct ceph_mon_statfs *h;
			
 
				-	int err;
			
 
				+	int ret = -ENOMEM;
			
 
				 
			
 
				-	req = kzalloc(sizeof(*req), GFP_NOFS);
			
 
				+	req = alloc_generic_request(monc, GFP_NOFS);
			
 
				 	if (!req)
			
 
				-		return -ENOMEM;
			
 
				-
			
 
				-	kref_init(&req->kref);
			
 
				-	req->buf = buf;
			
 
				-	init_completion(&req->completion);
			
 
				+		goto out;
			
 
				 
			
 
				-	err = -ENOMEM;
			
 
				 	req->request = ceph_msg_new(CEPH_MSG_STATFS, sizeof(*h), GFP_NOFS,
			
 
				 				    true);
			
 
				 	if (!req->request)
			
 
				 		goto out;
			
 
				-	req->reply = ceph_msg_new(CEPH_MSG_STATFS_REPLY, 1024, GFP_NOFS,
			
 
				-				  true);
			
 
				+
			
 
				+	req->reply = ceph_msg_new(CEPH_MSG_STATFS_REPLY, 64, GFP_NOFS, true);
			
 
				 	if (!req->reply)
			
 
				 		goto out;
			
 
				 
			
 
				+	req->u.st = buf;
			
 
				+
			
 
				+	mutex_lock(&monc->mutex);
			
 
				+	register_generic_request(req);
			
 
				 	/* fill out request */
			
 
				 	h = req->request->front.iov_base;
			
 
				 	h->monhdr.have_version = 0;
			
 
				 	h->monhdr.session_mon = cpu_to_le16(-1);
			
 
				 	h->monhdr.session_mon_tid = 0;
			
 
				 	h->fsid = monc->monmap->fsid;
			
 
				+	send_generic_request(monc, req);
			
 
				+	mutex_unlock(&monc->mutex);
			
 
				 
			
 
				-	err = do_generic_request(monc, req);
			
 
				-
			
 
				+	ret = wait_generic_request(req);
			
 
				 out:
			
 
				 	put_generic_request(req);
			
 
				-	return err;
			
 
				+	return ret;
			
 
				 }
			
 
				 EXPORT_SYMBOL(ceph_monc_do_statfs);
			
 
				 
			
@@ -690,7 +720,7 @@ static void handle_get_version_reply(struct ceph_mon_client *monc,
 
				 	void *end = p + msg->front_alloc_len;
			
 
				 	u64 handle;
			
 
				 
			
 
				-	dout("%s %p tid %llu\n", __func__, msg, tid);
			
 
				+	dout("%s msg %p tid %llu\n", __func__, msg, tid);
			
 
				 
			
 
				 	ceph_decode_need(&p, end, 2*sizeof(u64), bad);
			
 
				 	handle = ceph_decode_64(&p);
			
@@ -698,77 +728,111 @@ static void handle_get_version_reply(struct ceph_mon_client *monc,
 
				 		goto bad;
			
 
				 
			
 
				 	mutex_lock(&monc->mutex);
			
 
				-	req = __lookup_generic_req(monc, handle);
			
 
				-	if (req) {
			
 
				-		*(u64 *)req->buf = ceph_decode_64(&p);
			
 
				-		req->result = 0;
			
 
				-		get_generic_request(req);
			
 
				+	req = lookup_generic_request(&monc->generic_request_tree, handle);
			
 
				+	if (!req) {
			
 
				+		mutex_unlock(&monc->mutex);
			
 
				+		return;
			
 
				 	}
			
 
				+
			
 
				+	req->result = 0;
			
 
				+	req->u.newest = ceph_decode_64(&p);
			
 
				+	__finish_generic_request(req);
			
 
				 	mutex_unlock(&monc->mutex);
			
 
				-	if (req) {
			
 
				-		complete_all(&req->completion);
			
 
				-		put_generic_request(req);
			
 
				-	}
			
 
				 
			
 
				+	complete_generic_request(req);
			
 
				 	return;
			
 
				+
			
 
				 bad:
			
 
				 	pr_err("corrupt mon_get_version reply, tid %llu\n", tid);
			
 
				 	ceph_msg_dump(msg);
			
 
				 }
			
 
				 
			
 
				-/*
			
 
				- * Send MMonGetVersion and wait for the reply.
			
 
				- *
			
 
				- * @what: one of "mdsmap", "osdmap" or "monmap"
			
 
				- */
			
 
				-int ceph_monc_do_get_version(struct ceph_mon_client *monc, const char *what,
			
 
				-			     u64 *newest)
			
 
				+static struct ceph_mon_generic_request *
			
 
				+__ceph_monc_get_version(struct ceph_mon_client *monc, const char *what,
			
 
				+			ceph_monc_callback_t cb, u64 private_data)
			
 
				 {
			
 
				 	struct ceph_mon_generic_request *req;
			
 
				-	void *p, *end;
			
 
				-	u64 tid;
			
 
				-	int err;
			
 
				 
			
 
				-	req = kzalloc(sizeof(*req), GFP_NOFS);
			
 
				+	req = alloc_generic_request(monc, GFP_NOIO);
			
 
				 	if (!req)
			
 
				-		return -ENOMEM;
			
 
				-
			
 
				-	kref_init(&req->kref);
			
 
				-	req->buf = newest;
			
 
				-	init_completion(&req->completion);
			
 
				+		goto err_put_req;
			
 
				 
			
 
				 	req->request = ceph_msg_new(CEPH_MSG_MON_GET_VERSION,
			
 
				 				    sizeof(u64) + sizeof(u32) + strlen(what),
			
 
				-				    GFP_NOFS, true);
			
 
				-	if (!req->request) {
			
 
				-		err = -ENOMEM;
			
 
				-		goto out;
			
 
				-	}
			
 
				+				    GFP_NOIO, true);
			
 
				+	if (!req->request)
			
 
				+		goto err_put_req;
			
 
				 
			
 
				-	req->reply = ceph_msg_new(CEPH_MSG_MON_GET_VERSION_REPLY, 1024,
			
 
				-				  GFP_NOFS, true);
			
 
				-	if (!req->reply) {
			
 
				-		err = -ENOMEM;
			
 
				-		goto out;
			
 
				-	}
			
 
				+	req->reply = ceph_msg_new(CEPH_MSG_MON_GET_VERSION_REPLY, 32, GFP_NOIO,
			
 
				+				  true);
			
 
				+	if (!req->reply)
			
 
				+		goto err_put_req;
			
 
				 
			
 
				-	p = req->request->front.iov_base;
			
 
				-	end = p + req->request->front_alloc_len;
			
 
				+	req->complete_cb = cb;
			
 
				+	req->private_data = private_data;
			
 
				 
			
 
				-	/* fill out request */
			
 
				 	mutex_lock(&monc->mutex);
			
 
				-	tid = ++monc->last_tid;
			
 
				-	ceph_encode_64(&p, tid); /* handle */
			
 
				-	ceph_encode_string(&p, end, what, strlen(what));
			
 
				+	register_generic_request(req);
			
 
				+	{
			
 
				+		void *p = req->request->front.iov_base;
			
 
				+		void *const end = p + req->request->front_alloc_len;
			
 
				+
			
 
				+		ceph_encode_64(&p, req->tid); /* handle */
			
 
				+		ceph_encode_string(&p, end, what, strlen(what));
			
 
				+		WARN_ON(p != end);
			
 
				+	}
			
 
				+	send_generic_request(monc, req);
			
 
				+	mutex_unlock(&monc->mutex);
			
 
				 
			
 
				-	err = __do_generic_request(monc, tid, req);
			
 
				+	return req;
			
 
				 
			
 
				-	mutex_unlock(&monc->mutex);
			
 
				-out:
			
 
				+err_put_req:
			
 
				 	put_generic_request(req);
			
 
				-	return err;
			
 
				+	return ERR_PTR(-ENOMEM);
			
 
				+}
			
 
				+
			
 
				+/*
			
 
				+ * Send MMonGetVersion and wait for the reply.
			
 
				+ *
			
 
				+ * @what: one of "mdsmap", "osdmap" or "monmap"
			
 
				+ */
			
 
				+int ceph_monc_get_version(struct ceph_mon_client *monc, const char *what,
			
 
				+			  u64 *newest)
			
 
				+{
			
 
				+	struct ceph_mon_generic_request *req;
			
 
				+	int ret;
			
 
				+
			
 
				+	req = __ceph_monc_get_version(monc, what, NULL, 0);
			
 
				+	if (IS_ERR(req))
			
 
				+		return PTR_ERR(req);
			
 
				+
			
 
				+	ret = wait_generic_request(req);
			
 
				+	if (!ret)
			
 
				+		*newest = req->u.newest;
			
 
				+
			
 
				+	put_generic_request(req);
			
 
				+	return ret;
			
 
				 }
			
 
				-EXPORT_SYMBOL(ceph_monc_do_get_version);
			
 
				+EXPORT_SYMBOL(ceph_monc_get_version);
			
 
				+
			
 
				+/*
			
 
				+ * Send MMonGetVersion and return without waiting; the reply is handed to @cb.
			
 
				+ *
			
 
				+ * @what: one of "mdsmap", "osdmap" or "monmap"
			
 
				+ */
			
 
				+int ceph_monc_get_version_async(struct ceph_mon_client *monc, const char *what,
			
 
				+				ceph_monc_callback_t cb, u64 private_data)
			
 
				+{
			
 
				+	struct ceph_mon_generic_request *req;
			
 
				+
			
 
				+	req = __ceph_monc_get_version(monc, what, cb, private_data);
			
 
				+	if (IS_ERR(req))
			
 
				+		return PTR_ERR(req);
			
 
				+
			
 
				+	put_generic_request(req);
			
 
				+	return 0;
			
 
				+}
			
 
				+EXPORT_SYMBOL(ceph_monc_get_version_async);
			
 
				 
			
 
				 /*
			
 
				  * Resend pending generic requests.
			
@@ -890,7 +954,7 @@ int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl)
 
				 	if (!monc->m_subscribe_ack)
			
 
				 		goto out_auth;
			
 
				 
			
 
				-	monc->m_subscribe = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE, 96, GFP_NOFS,
			
 
				+	monc->m_subscribe = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE, 128, GFP_NOFS,
			
 
				 					 true);
			
 
				 	if (!monc->m_subscribe)
			
 
				 		goto out_subscribe_ack;
			
@@ -914,9 +978,10 @@ int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl)
 
				 
			
 
				 	INIT_DELAYED_WORK(&monc->delayed_work, delayed_work);
			
 
				 	monc->generic_request_tree = RB_ROOT;
			
 
				-	monc->num_generic_requests = 0;
			
 
				 	monc->last_tid = 0;
			
 
				 
			
 
				+	monc->fs_cluster_id = CEPH_FS_CLUSTER_ID_NONE;
			
 
				+
			
 
				 	return 0;
			
 
				 
			
 
				 out_auth_reply:
			
@@ -954,6 +1019,8 @@ void ceph_monc_stop(struct ceph_mon_client *monc)
 
				 
			
 
				 	ceph_auth_destroy(monc->auth);
			
 
				 
			
 
				+	WARN_ON(!RB_EMPTY_ROOT(&monc->generic_request_tree));
			
 
				+
			
 
				 	ceph_msg_put(monc->m_auth);
			
 
				 	ceph_msg_put(monc->m_auth_reply);
			
 
				 	ceph_msg_put(monc->m_subscribe);
			
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -19,25 +19,12 @@
 
				 #include <linux/ceph/auth.h>
			
 
				 #include <linux/ceph/pagelist.h>
			
 
				 
			
 
				-#define OSD_OP_FRONT_LEN	4096
			
 
				 #define OSD_OPREPLY_FRONT_LEN	512
			
 
				 
			
 
				 static struct kmem_cache	*ceph_osd_request_cache;
			
 
				 
			
 
				 static const struct ceph_connection_operations osd_con_ops;
			
 
				 
			
 
				-static void __send_queued(struct ceph_osd_client *osdc);
			
 
				-static int __reset_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd);
			
 
				-static void __register_request(struct ceph_osd_client *osdc,
			
 
				-			       struct ceph_osd_request *req);
			
 
				-static void __unregister_request(struct ceph_osd_client *osdc,
			
 
				-				 struct ceph_osd_request *req);
			
 
				-static void __unregister_linger_request(struct ceph_osd_client *osdc,
			
 
				-					struct ceph_osd_request *req);
			
 
				-static void __enqueue_request(struct ceph_osd_request *req);
			
 
				-static void __send_request(struct ceph_osd_client *osdc,
			
 
				-			   struct ceph_osd_request *req);
			
 
				-
			
 
				 /*
			
 
				  * Implement client access to distributed object storage cluster.
			
 
				  *
			
@@ -56,6 +43,52 @@ static void __send_request(struct ceph_osd_client *osdc,
 
				  * channel with an OSD is reset.
			
 
				  */
			
 
				 
			
 
				+static void link_request(struct ceph_osd *osd, struct ceph_osd_request *req);
			
 
				+static void unlink_request(struct ceph_osd *osd, struct ceph_osd_request *req);
			
 
				+static void link_linger(struct ceph_osd *osd,
			
 
				+			struct ceph_osd_linger_request *lreq);
			
 
				+static void unlink_linger(struct ceph_osd *osd,
			
 
				+			  struct ceph_osd_linger_request *lreq);
			
 
				+
			
 
				+#if 1
			
 
				+static inline bool rwsem_is_wrlocked(struct rw_semaphore *sem)
			
 
				+{
			
 
				+	bool wrlocked = true;
			
 
				+
			
 
				+	if (unlikely(down_read_trylock(sem))) {
			
 
				+		wrlocked = false;
			
 
				+		up_read(sem);
			
 
				+	}
			
 
				+
			
 
				+	return wrlocked;
			
 
				+}
			
 
				+static inline void verify_osdc_locked(struct ceph_osd_client *osdc)
			
 
				+{
			
 
				+	WARN_ON(!rwsem_is_locked(&osdc->lock));
			
 
				+}
			
 
				+static inline void verify_osdc_wrlocked(struct ceph_osd_client *osdc)
			
 
				+{
			
 
				+	WARN_ON(!rwsem_is_wrlocked(&osdc->lock));
			
 
				+}
			
 
				+static inline void verify_osd_locked(struct ceph_osd *osd)
			
 
				+{
			
 
				+	struct ceph_osd_client *osdc = osd->o_osdc;
			
 
				+
			
 
				+	WARN_ON(!(mutex_is_locked(&osd->lock) &&
			
 
				+		  rwsem_is_locked(&osdc->lock)) &&
			
 
				+		!rwsem_is_wrlocked(&osdc->lock));
			
 
				+}
			
 
				+static inline void verify_lreq_locked(struct ceph_osd_linger_request *lreq)
			
 
				+{
			
 
				+	WARN_ON(!mutex_is_locked(&lreq->lock));
			
 
				+}
			
 
				+#else
			
 
				+static inline void verify_osdc_locked(struct ceph_osd_client *osdc) { }
			
 
				+static inline void verify_osdc_wrlocked(struct ceph_osd_client *osdc) { }
			
 
				+static inline void verify_osd_locked(struct ceph_osd *osd) { }
			
 
				+static inline void verify_lreq_locked(struct ceph_osd_linger_request *lreq) { }
			
 
				+#endif
			
 
				+
			
 
				 /*
			
 
				  * calculate the mapping of a file extent onto an object, and fill out the
			
 
				  * request accordingly.  shorten extent as necessary if it crosses an
			
@@ -144,14 +177,6 @@ osd_req_op_extent_osd_data(struct ceph_osd_request *osd_req,
 
				 }
			
 
				 EXPORT_SYMBOL(osd_req_op_extent_osd_data);
			
 
				 
			
 
				-struct ceph_osd_data *
			
 
				-osd_req_op_cls_response_data(struct ceph_osd_request *osd_req,
			
 
				-			unsigned int which)
			
 
				-{
			
 
				-	return osd_req_op_data(osd_req, which, cls, response_data);
			
 
				-}
			
 
				-EXPORT_SYMBOL(osd_req_op_cls_response_data);	/* ??? */
			
 
				-
			
 
				 void osd_req_op_raw_data_in_pages(struct ceph_osd_request *osd_req,
			
 
				 			unsigned int which, struct page **pages,
			
 
				 			u64 length, u32 alignment,
			
@@ -218,6 +243,8 @@ void osd_req_op_cls_request_data_pagelist(
 
				 
			
 
				 	osd_data = osd_req_op_data(osd_req, which, cls, request_data);
			
 
				 	ceph_osd_data_pagelist_init(osd_data, pagelist);
			
 
				+	osd_req->r_ops[which].cls.indata_len += pagelist->length;
			
 
				+	osd_req->r_ops[which].indata_len += pagelist->length;
			
 
				 }
			
 
				 EXPORT_SYMBOL(osd_req_op_cls_request_data_pagelist);
			
 
				 
			
@@ -230,6 +257,8 @@ void osd_req_op_cls_request_data_pages(struct ceph_osd_request *osd_req,
 
				 	osd_data = osd_req_op_data(osd_req, which, cls, request_data);
			
 
				 	ceph_osd_data_pages_init(osd_data, pages, length, alignment,
			
 
				 				pages_from_pool, own_pages);
			
 
				+	osd_req->r_ops[which].cls.indata_len += length;
			
 
				+	osd_req->r_ops[which].indata_len += length;
			
 
				 }
			
 
				 EXPORT_SYMBOL(osd_req_op_cls_request_data_pages);
			
 
				 
			
@@ -302,14 +331,76 @@ static void osd_req_op_data_release(struct ceph_osd_request *osd_req,
 
				 	case CEPH_OSD_OP_STAT:
			
 
				 		ceph_osd_data_release(&op->raw_data_in);
			
 
				 		break;
			
 
				+	case CEPH_OSD_OP_NOTIFY_ACK:
			
 
				+		ceph_osd_data_release(&op->notify_ack.request_data);
			
 
				+		break;
			
 
				+	case CEPH_OSD_OP_NOTIFY:
			
 
				+		ceph_osd_data_release(&op->notify.request_data);
			
 
				+		ceph_osd_data_release(&op->notify.response_data);
			
 
				+		break;
			
 
				 	default:
			
 
				 		break;
			
 
				 	}
			
 
				 }
			
 
				 
			
 
				+/*
			
 
				+ * Assumes @t is zero-initialized.
			
 
				+ */
			
 
				+static void target_init(struct ceph_osd_request_target *t)
			
 
				+{
			
 
				+	ceph_oid_init(&t->base_oid);
			
 
				+	ceph_oloc_init(&t->base_oloc);
			
 
				+	ceph_oid_init(&t->target_oid);
			
 
				+	ceph_oloc_init(&t->target_oloc);
			
 
				+
			
 
				+	ceph_osds_init(&t->acting);
			
 
				+	ceph_osds_init(&t->up);
			
 
				+	t->size = -1;
			
 
				+	t->min_size = -1;
			
 
				+
			
 
				+	t->osd = CEPH_HOMELESS_OSD;
			
 
				+}
			
 
				+
			
 
				+static void target_copy(struct ceph_osd_request_target *dest,
			
 
				+			const struct ceph_osd_request_target *src)
			
 
				+{
			
 
				+	ceph_oid_copy(&dest->base_oid, &src->base_oid);
			
 
				+	ceph_oloc_copy(&dest->base_oloc, &src->base_oloc);
			
 
				+	ceph_oid_copy(&dest->target_oid, &src->target_oid);
			
 
				+	ceph_oloc_copy(&dest->target_oloc, &src->target_oloc);
			
 
				+
			
 
				+	dest->pgid = src->pgid; /* struct */
			
 
				+	dest->pg_num = src->pg_num;
			
 
				+	dest->pg_num_mask = src->pg_num_mask;
			
 
				+	ceph_osds_copy(&dest->acting, &src->acting);
			
 
				+	ceph_osds_copy(&dest->up, &src->up);
			
 
				+	dest->size = src->size;
			
 
				+	dest->min_size = src->min_size;
			
 
				+	dest->sort_bitwise = src->sort_bitwise;
			
 
				+
			
 
				+	dest->flags = src->flags;
			
 
				+	dest->paused = src->paused;
			
 
				+
			
 
				+	dest->osd = src->osd;
			
 
				+}
			
 
				+
			
 
				+static void target_destroy(struct ceph_osd_request_target *t)
			
 
				+{
			
 
				+	ceph_oid_destroy(&t->base_oid);
			
 
				+	ceph_oid_destroy(&t->target_oid);
			
 
				+}
			
 
				+
			
 
				 /*
			
 
				  * requests
			
 
				  */
			
 
				+static void request_release_checks(struct ceph_osd_request *req)
			
 
				+{
			
 
				+	WARN_ON(!RB_EMPTY_NODE(&req->r_node));
			
 
				+	WARN_ON(!RB_EMPTY_NODE(&req->r_mc_node));
			
 
				+	WARN_ON(!list_empty(&req->r_unsafe_item));
			
 
				+	WARN_ON(req->r_osd);
			
 
				+}
			
 
				+
			
 
				 static void ceph_osdc_release_request(struct kref *kref)
			
 
				 {
			
 
				 	struct ceph_osd_request *req = container_of(kref,
			
@@ -318,24 +409,19 @@ static void ceph_osdc_release_request(struct kref *kref)
 
				 
			
 
				 	dout("%s %p (r_request %p r_reply %p)\n", __func__, req,
			
 
				 	     req->r_request, req->r_reply);
			
 
				-	WARN_ON(!RB_EMPTY_NODE(&req->r_node));
			
 
				-	WARN_ON(!list_empty(&req->r_req_lru_item));
			
 
				-	WARN_ON(!list_empty(&req->r_osd_item));
			
 
				-	WARN_ON(!list_empty(&req->r_linger_item));
			
 
				-	WARN_ON(!list_empty(&req->r_linger_osd_item));
			
 
				-	WARN_ON(req->r_osd);
			
 
				+	request_release_checks(req);
			
 
				 
			
 
				 	if (req->r_request)
			
 
				 		ceph_msg_put(req->r_request);
			
 
				-	if (req->r_reply) {
			
 
				-		ceph_msg_revoke_incoming(req->r_reply);
			
 
				+	if (req->r_reply)
			
 
				 		ceph_msg_put(req->r_reply);
			
 
				-	}
			
 
				 
			
 
				 	for (which = 0; which < req->r_num_ops; which++)
			
 
				 		osd_req_op_data_release(req, which);
			
 
				 
			
 
				+	target_destroy(&req->r_t);
			
 
				 	ceph_put_snap_context(req->r_snapc);
			
 
				+
			
 
				 	if (req->r_mempool)
			
 
				 		mempool_free(req, req->r_osdc->req_mempool);
			
 
				 	else if (req->r_num_ops <= CEPH_OSD_SLAB_OPS)
			
@@ -354,12 +440,66 @@ EXPORT_SYMBOL(ceph_osdc_get_request);
 
				 
			
 
				 void ceph_osdc_put_request(struct ceph_osd_request *req)
			
 
				 {
			
 
				-	dout("%s %p (was %d)\n", __func__, req,
			
 
				-	     atomic_read(&req->r_kref.refcount));
			
 
				-	kref_put(&req->r_kref, ceph_osdc_release_request);
			
 
				+	if (req) {
			
 
				+		dout("%s %p (was %d)\n", __func__, req,
			
 
				+		     atomic_read(&req->r_kref.refcount));
			
 
				+		kref_put(&req->r_kref, ceph_osdc_release_request);
			
 
				+	}
			
 
				 }
			
 
				 EXPORT_SYMBOL(ceph_osdc_put_request);
			
 
				 
			
 
				+static void request_init(struct ceph_osd_request *req)
			
 
				+{
			
 
				+	/* req only, each op is zeroed in _osd_req_op_init() */
			
 
				+	memset(req, 0, sizeof(*req));
			
 
				+
			
 
				+	kref_init(&req->r_kref);
			
 
				+	init_completion(&req->r_completion);
			
 
				+	init_completion(&req->r_safe_completion);
			
 
				+	RB_CLEAR_NODE(&req->r_node);
			
 
				+	RB_CLEAR_NODE(&req->r_mc_node);
			
 
				+	INIT_LIST_HEAD(&req->r_unsafe_item);
			
 
				+
			
 
				+	target_init(&req->r_t);
			
 
				+}
			
 
				+
			
 
				+/*
			
 
				+ * This is ugly, but it allows us to reuse linger registration and ping
			
 
				+ * requests, keeping the structure of the code around send_linger{_ping}()
			
 
				+ * reasonable.  Setting up a min_nr=2 mempool for each linger request
			
 
				+ * and dealing with copying ops (this blasts req only, watch op remains
			
 
				+ * intact) isn't any better.
			
 
				+ */
			
 
				+static void request_reinit(struct ceph_osd_request *req)
			
 
				+{
			
 
				+	struct ceph_osd_client *osdc = req->r_osdc;
			
 
				+	bool mempool = req->r_mempool;
			
 
				+	unsigned int num_ops = req->r_num_ops;
			
 
				+	u64 snapid = req->r_snapid;
			
 
				+	struct ceph_snap_context *snapc = req->r_snapc;
			
 
				+	bool linger = req->r_linger;
			
 
				+	struct ceph_msg *request_msg = req->r_request;
			
 
				+	struct ceph_msg *reply_msg = req->r_reply;
			
 
				+
			
 
				+	dout("%s req %p\n", __func__, req);
			
 
				+	WARN_ON(atomic_read(&req->r_kref.refcount) != 1);
			
 
				+	request_release_checks(req);
			
 
				+
			
 
				+	WARN_ON(atomic_read(&request_msg->kref.refcount) != 1);
			
 
				+	WARN_ON(atomic_read(&reply_msg->kref.refcount) != 1);
			
 
				+	target_destroy(&req->r_t);
			
 
				+
			
 
				+	request_init(req);
			
 
				+	req->r_osdc = osdc;
			
 
				+	req->r_mempool = mempool;
			
 
				+	req->r_num_ops = num_ops;
			
 
				+	req->r_snapid = snapid;
			
 
				+	req->r_snapc = snapc;
			
 
				+	req->r_linger = linger;
			
 
				+	req->r_request = request_msg;
			
 
				+	req->r_reply = reply_msg;
			
 
				+}
			
 
				+
			
 
				 struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
			
 
				 					       struct ceph_snap_context *snapc,
			
 
				 					       unsigned int num_ops,
			
@@ -367,8 +507,6 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
 
				 					       gfp_t gfp_flags)
			
 
				 {
			
 
				 	struct ceph_osd_request *req;
			
 
				-	struct ceph_msg *msg;
			
 
				-	size_t msg_size;
			
 
				 
			
 
				 	if (use_mempool) {
			
 
				 		BUG_ON(num_ops > CEPH_OSD_SLAB_OPS);
			
@@ -383,73 +521,65 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
 
				 	if (unlikely(!req))
			
 
				 		return NULL;
			
 
				 
			
 
				-	/* req only, each op is zeroed in _osd_req_op_init() */
			
 
				-	memset(req, 0, sizeof(*req));
			
 
				-
			
 
				+	request_init(req);
			
 
				 	req->r_osdc = osdc;
			
 
				 	req->r_mempool = use_mempool;
			
 
				 	req->r_num_ops = num_ops;
			
 
				+	req->r_snapid = CEPH_NOSNAP;
			
 
				+	req->r_snapc = ceph_get_snap_context(snapc);
			
 
				 
			
 
				-	kref_init(&req->r_kref);
			
 
				-	init_completion(&req->r_completion);
			
 
				-	init_completion(&req->r_safe_completion);
			
 
				-	RB_CLEAR_NODE(&req->r_node);
			
 
				-	INIT_LIST_HEAD(&req->r_unsafe_item);
			
 
				-	INIT_LIST_HEAD(&req->r_linger_item);
			
 
				-	INIT_LIST_HEAD(&req->r_linger_osd_item);
			
 
				-	INIT_LIST_HEAD(&req->r_req_lru_item);
			
 
				-	INIT_LIST_HEAD(&req->r_osd_item);
			
 
				-
			
 
				-	req->r_base_oloc.pool = -1;
			
 
				-	req->r_target_oloc.pool = -1;
			
 
				+	dout("%s req %p\n", __func__, req);
			
 
				+	return req;
			
 
				+}
			
 
				+EXPORT_SYMBOL(ceph_osdc_alloc_request);
			
 
				 
			
 
				-	msg_size = OSD_OPREPLY_FRONT_LEN;
			
 
				-	if (num_ops > CEPH_OSD_SLAB_OPS) {
			
 
				-		/* ceph_osd_op and rval */
			
 
				-		msg_size += (num_ops - CEPH_OSD_SLAB_OPS) *
			
 
				-			    (sizeof(struct ceph_osd_op) + 4);
			
 
				-	}
			
 
				+int ceph_osdc_alloc_messages(struct ceph_osd_request *req, gfp_t gfp)
			
 
				+{
			
 
				+	struct ceph_osd_client *osdc = req->r_osdc;
			
 
				+	struct ceph_msg *msg;
			
 
				+	int msg_size;
			
 
				 
			
 
				-	/* create reply message */
			
 
				-	if (use_mempool)
			
 
				-		msg = ceph_msgpool_get(&osdc->msgpool_op_reply, 0);
			
 
				-	else
			
 
				-		msg = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, msg_size,
			
 
				-				   gfp_flags, true);
			
 
				-	if (!msg) {
			
 
				-		ceph_osdc_put_request(req);
			
 
				-		return NULL;
			
 
				-	}
			
 
				-	req->r_reply = msg;
			
 
				+	WARN_ON(ceph_oid_empty(&req->r_base_oid));
			
 
				 
			
 
				+	/* create request message */
			
 
				 	msg_size = 4 + 4 + 4; /* client_inc, osdmap_epoch, flags */
			
 
				 	msg_size += 4 + 4 + 4 + 8; /* mtime, reassert_version */
			
 
				 	msg_size += 2 + 4 + 8 + 4 + 4; /* oloc */
			
 
				 	msg_size += 1 + 8 + 4 + 4; /* pgid */
			
 
				-	msg_size += 4 + CEPH_MAX_OID_NAME_LEN; /* oid */
			
 
				-	msg_size += 2 + num_ops * sizeof(struct ceph_osd_op);
			
 
				+	msg_size += 4 + req->r_base_oid.name_len; /* oid */
			
 
				+	msg_size += 2 + req->r_num_ops * sizeof(struct ceph_osd_op);
			
 
				 	msg_size += 8; /* snapid */
			
 
				 	msg_size += 8; /* snap_seq */
			
 
				-	msg_size += 4 + 8 * (snapc ? snapc->num_snaps : 0); /* snaps */
			
 
				+	msg_size += 4 + 8 * (req->r_snapc ? req->r_snapc->num_snaps : 0);
			
 
				 	msg_size += 4; /* retry_attempt */
			
 
				 
			
 
				-	/* create request message; allow space for oid */
			
 
				-	if (use_mempool)
			
 
				+	if (req->r_mempool)
			
 
				 		msg = ceph_msgpool_get(&osdc->msgpool_op, 0);
			
 
				 	else
			
 
				-		msg = ceph_msg_new(CEPH_MSG_OSD_OP, msg_size, gfp_flags, true);
			
 
				-	if (!msg) {
			
 
				-		ceph_osdc_put_request(req);
			
 
				-		return NULL;
			
 
				-	}
			
 
				+		msg = ceph_msg_new(CEPH_MSG_OSD_OP, msg_size, gfp, true);
			
 
				+	if (!msg)
			
 
				+		return -ENOMEM;
			
 
				 
			
 
				 	memset(msg->front.iov_base, 0, msg->front.iov_len);
			
 
				-
			
 
				 	req->r_request = msg;
			
 
				 
			
 
				-	return req;
			
 
				+	/* create reply message */
			
 
				+	msg_size = OSD_OPREPLY_FRONT_LEN;
			
 
				+	msg_size += req->r_base_oid.name_len;
			
 
				+	msg_size += req->r_num_ops * sizeof(struct ceph_osd_op);
			
 
				+
			
 
				+	if (req->r_mempool)
			
 
				+		msg = ceph_msgpool_get(&osdc->msgpool_op_reply, 0);
			
 
				+	else
			
 
				+		msg = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, msg_size, gfp, true);
			
 
				+	if (!msg)
			
 
				+		return -ENOMEM;
			
 
				+
			
 
				+	req->r_reply = msg;
			
 
				+
			
 
				+	return 0;
			
 
				 }
			
 
				-EXPORT_SYMBOL(ceph_osdc_alloc_request);
			
 
				+EXPORT_SYMBOL(ceph_osdc_alloc_messages);
			
 
				 
			
 
				 static bool osd_req_opcode_valid(u16 opcode)
			
 
				 {
			
@@ -587,8 +717,6 @@ void osd_req_op_cls_init(struct ceph_osd_request *osd_req, unsigned int which,
 
				 
			
 
				 	osd_req_op_cls_request_info_pagelist(osd_req, which, pagelist);
			
 
				 
			
 
				-	op->cls.argc = 0;	/* currently unused */
			
 
				-
			
 
				 	op->indata_len = payload_len;
			
 
				 }
			
 
				 EXPORT_SYMBOL(osd_req_op_cls_init);
			
@@ -627,21 +755,19 @@ int osd_req_op_xattr_init(struct ceph_osd_request *osd_req, unsigned int which,
 
				 }
			
 
				 EXPORT_SYMBOL(osd_req_op_xattr_init);
			
 
				 
			
 
				-void osd_req_op_watch_init(struct ceph_osd_request *osd_req,
			
 
				-				unsigned int which, u16 opcode,
			
 
				-				u64 cookie, u64 version, int flag)
			
 
				+/*
			
 
				+ * @watch_opcode: CEPH_OSD_WATCH_OP_*
			
 
				+ */
			
 
				+static void osd_req_op_watch_init(struct ceph_osd_request *req, int which,
			
 
				+				  u64 cookie, u8 watch_opcode)
			
 
				 {
			
 
				-	struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which,
			
 
				-						      opcode, 0);
			
 
				-
			
 
				-	BUG_ON(opcode != CEPH_OSD_OP_NOTIFY_ACK && opcode != CEPH_OSD_OP_WATCH);
			
 
				+	struct ceph_osd_req_op *op;
			
 
				 
			
 
				+	op = _osd_req_op_init(req, which, CEPH_OSD_OP_WATCH, 0);
			
 
				 	op->watch.cookie = cookie;
			
 
				-	op->watch.ver = version;
			
 
				-	if (opcode == CEPH_OSD_OP_WATCH && flag)
			
 
				-		op->watch.flag = (u8)1;
			
 
				+	op->watch.op = watch_opcode;
			
 
				+	op->watch.gen = 0;
			
 
				 }
			
 
				-EXPORT_SYMBOL(osd_req_op_watch_init);
			
 
				 
			
 
				 void osd_req_op_alloc_hint_init(struct ceph_osd_request *osd_req,
			
 
				 				unsigned int which,
			
@@ -686,16 +812,9 @@ static void ceph_osdc_msg_data_add(struct ceph_msg *msg,
 
				 	}
			
 
				 }
			
 
				 
			
 
				-static u64 osd_req_encode_op(struct ceph_osd_request *req,
			
 
				-			      struct ceph_osd_op *dst, unsigned int which)
			
 
				+static u32 osd_req_encode_op(struct ceph_osd_op *dst,
			
 
				+			     const struct ceph_osd_req_op *src)
			
 
				 {
			
 
				-	struct ceph_osd_req_op *src;
			
 
				-	struct ceph_osd_data *osd_data;
			
 
				-	u64 request_data_len = 0;
			
 
				-	u64 data_length;
			
 
				-
			
 
				-	BUG_ON(which >= req->r_num_ops);
			
 
				-	src = &req->r_ops[which];
			
 
				 	if (WARN_ON(!osd_req_opcode_valid(src->op))) {
			
 
				 		pr_err("unrecognized osd opcode %d\n", src->op);
			
 
				 
			
@@ -704,57 +823,36 @@ static u64 osd_req_encode_op(struct ceph_osd_request *req,
 
				 
			
 
				 	switch (src->op) {
			
 
				 	case CEPH_OSD_OP_STAT:
			
 
				-		osd_data = &src->raw_data_in;
			
 
				-		ceph_osdc_msg_data_add(req->r_reply, osd_data);
			
 
				 		break;
			
 
				 	case CEPH_OSD_OP_READ:
			
 
				 	case CEPH_OSD_OP_WRITE:
			
 
				 	case CEPH_OSD_OP_WRITEFULL:
			
 
				 	case CEPH_OSD_OP_ZERO:
			
 
				 	case CEPH_OSD_OP_TRUNCATE:
			
 
				-		if (src->op == CEPH_OSD_OP_WRITE ||
			
 
				-		    src->op == CEPH_OSD_OP_WRITEFULL)
			
 
				-			request_data_len = src->extent.length;
			
 
				 		dst->extent.offset = cpu_to_le64(src->extent.offset);
			
 
				 		dst->extent.length = cpu_to_le64(src->extent.length);
			
 
				 		dst->extent.truncate_size =
			
 
				 			cpu_to_le64(src->extent.truncate_size);
			
 
				 		dst->extent.truncate_seq =
			
 
				 			cpu_to_le32(src->extent.truncate_seq);
			
 
				-		osd_data = &src->extent.osd_data;
			
 
				-		if (src->op == CEPH_OSD_OP_WRITE ||
			
 
				-		    src->op == CEPH_OSD_OP_WRITEFULL)
			
 
				-			ceph_osdc_msg_data_add(req->r_request, osd_data);
			
 
				-		else
			
 
				-			ceph_osdc_msg_data_add(req->r_reply, osd_data);
			
 
				 		break;
			
 
				 	case CEPH_OSD_OP_CALL:
			
 
				 		dst->cls.class_len = src->cls.class_len;
			
 
				 		dst->cls.method_len = src->cls.method_len;
			
 
				-		osd_data = &src->cls.request_info;
			
 
				-		ceph_osdc_msg_data_add(req->r_request, osd_data);
			
 
				-		BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_PAGELIST);
			
 
				-		request_data_len = osd_data->pagelist->length;
			
 
				-
			
 
				-		osd_data = &src->cls.request_data;
			
 
				-		data_length = ceph_osd_data_length(osd_data);
			
 
				-		if (data_length) {
			
 
				-			BUG_ON(osd_data->type == CEPH_OSD_DATA_TYPE_NONE);
			
 
				-			dst->cls.indata_len = cpu_to_le32(data_length);
			
 
				-			ceph_osdc_msg_data_add(req->r_request, osd_data);
			
 
				-			src->indata_len += data_length;
			
 
				-			request_data_len += data_length;
			
 
				-		}
			
 
				-		osd_data = &src->cls.response_data;
			
 
				-		ceph_osdc_msg_data_add(req->r_reply, osd_data);
			
 
				+		dst->cls.indata_len = cpu_to_le32(src->cls.indata_len);
			
 
				 		break;
			
 
				 	case CEPH_OSD_OP_STARTSYNC:
			
 
				 		break;
			
 
				-	case CEPH_OSD_OP_NOTIFY_ACK:
			
 
				 	case CEPH_OSD_OP_WATCH:
			
 
				 		dst->watch.cookie = cpu_to_le64(src->watch.cookie);
			
 
				-		dst->watch.ver = cpu_to_le64(src->watch.ver);
			
 
				-		dst->watch.flag = src->watch.flag;
			
 
				+		dst->watch.ver = cpu_to_le64(0);
			
 
				+		dst->watch.op = src->watch.op;
			
 
				+		dst->watch.gen = cpu_to_le32(src->watch.gen);
			
 
				+		break;
			
 
				+	case CEPH_OSD_OP_NOTIFY_ACK:
			
 
				+		break;
			
 
				+	case CEPH_OSD_OP_NOTIFY:
			
 
				+		dst->notify.cookie = cpu_to_le64(src->notify.cookie);
			
 
				 		break;
			
 
				 	case CEPH_OSD_OP_SETALLOCHINT:
			
 
				 		dst->alloc_hint.expected_object_size =
			
@@ -768,9 +866,6 @@ static u64 osd_req_encode_op(struct ceph_osd_request *req,
 
				 		dst->xattr.value_len = cpu_to_le32(src->xattr.value_len);
			
 
				 		dst->xattr.cmp_op = src->xattr.cmp_op;
			
 
				 		dst->xattr.cmp_mode = src->xattr.cmp_mode;
			
 
				-		osd_data = &src->xattr.osd_data;
			
 
				-		ceph_osdc_msg_data_add(req->r_request, osd_data);
			
 
				-		request_data_len = osd_data->pagelist->length;
			
 
				 		break;
			
 
				 	case CEPH_OSD_OP_CREATE:
			
 
				 	case CEPH_OSD_OP_DELETE:
			
@@ -787,7 +882,7 @@ static u64 osd_req_encode_op(struct ceph_osd_request *req,
 
				 	dst->flags = cpu_to_le32(src->flags);
			
 
				 	dst->payload_len = cpu_to_le32(src->indata_len);
			
 
				 
			
 
				-	return request_data_len;
			
 
				+	return src->indata_len;
			
 
				 }
			
 
				 
			
 
				 /*
			
@@ -824,17 +919,15 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
 
				 
			
 
				 	req = ceph_osdc_alloc_request(osdc, snapc, num_ops, use_mempool,
			
 
				 					GFP_NOFS);
			
 
				-	if (!req)
			
 
				-		return ERR_PTR(-ENOMEM);
			
 
				-
			
 
				-	req->r_flags = flags;
			
 
				+	if (!req) {
			
 
				+		r = -ENOMEM;
			
 
				+		goto fail;
			
 
				+	}
			
 
				 
			
 
				 	/* calculate max write size */
			
 
				 	r = calc_layout(layout, off, plen, &objnum, &objoff, &objlen);
			
 
				-	if (r < 0) {
			
 
				-		ceph_osdc_put_request(req);
			
 
				-		return ERR_PTR(r);
			
 
				-	}
			
 
				+	if (r)
			
 
				+		goto fail;
			
 
				 
			
 
				 	if (opcode == CEPH_OSD_OP_CREATE || opcode == CEPH_OSD_OP_DELETE) {
			
 
				 		osd_req_op_init(req, which, opcode, 0);
			
@@ -854,194 +947,71 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
 
				 				       truncate_size, truncate_seq);
			
 
				 	}
			
 
				 
			
 
				+	req->r_flags = flags;
			
 
				 	req->r_base_oloc.pool = ceph_file_layout_pg_pool(*layout);
			
 
				+	ceph_oid_printf(&req->r_base_oid, "%llx.%08llx", vino.ino, objnum);
			
 
				 
			
 
				-	snprintf(req->r_base_oid.name, sizeof(req->r_base_oid.name),
			
 
				-		 "%llx.%08llx", vino.ino, objnum);
			
 
				-	req->r_base_oid.name_len = strlen(req->r_base_oid.name);
			
 
				+	req->r_snapid = vino.snap;
			
 
				+	if (flags & CEPH_OSD_FLAG_WRITE)
			
 
				+		req->r_data_offset = off;
			
 
				+
			
 
				+	r = ceph_osdc_alloc_messages(req, GFP_NOFS);
			
 
				+	if (r)
			
 
				+		goto fail;
			
 
				 
			
 
				 	return req;
			
 
				+
			
 
				+fail:
			
 
				+	ceph_osdc_put_request(req);
			
 
				+	return ERR_PTR(r);
			
 
				 }
			
 
				 EXPORT_SYMBOL(ceph_osdc_new_request);
			
 
				 
			
 
				 /*
			
 
				  * We keep osd requests in an rbtree, sorted by ->r_tid.
			
 
				  */
			
 
				-static void __insert_request(struct ceph_osd_client *osdc,
			
 
				-			     struct ceph_osd_request *new)
			
 
				-{
			
 
				-	struct rb_node **p = &osdc->requests.rb_node;
			
 
				-	struct rb_node *parent = NULL;
			
 
				-	struct ceph_osd_request *req = NULL;
			
 
				-
			
 
				-	while (*p) {
			
 
				-		parent = *p;
			
 
				-		req = rb_entry(parent, struct ceph_osd_request, r_node);
			
 
				-		if (new->r_tid < req->r_tid)
			
 
				-			p = &(*p)->rb_left;
			
 
				-		else if (new->r_tid > req->r_tid)
			
 
				-			p = &(*p)->rb_right;
			
 
				-		else
			
 
				-			BUG();
			
 
				-	}
			
 
				-
			
 
				-	rb_link_node(&new->r_node, parent, p);
			
 
				-	rb_insert_color(&new->r_node, &osdc->requests);
			
 
				-}
			
 
				-
			
 
				-static struct ceph_osd_request *__lookup_request(struct ceph_osd_client *osdc,
			
 
				-						 u64 tid)
			
 
				-{
			
 
				-	struct ceph_osd_request *req;
			
 
				-	struct rb_node *n = osdc->requests.rb_node;
			
 
				-
			
 
				-	while (n) {
			
 
				-		req = rb_entry(n, struct ceph_osd_request, r_node);
			
 
				-		if (tid < req->r_tid)
			
 
				-			n = n->rb_left;
			
 
				-		else if (tid > req->r_tid)
			
 
				-			n = n->rb_right;
			
 
				-		else
			
 
				-			return req;
			
 
				-	}
			
 
				-	return NULL;
			
 
				-}
			
 
				+DEFINE_RB_FUNCS(request, struct ceph_osd_request, r_tid, r_node)
			
 
				+DEFINE_RB_FUNCS(request_mc, struct ceph_osd_request, r_tid, r_mc_node)
			
 
				 
			
 
				-static struct ceph_osd_request *
			
 
				-__lookup_request_ge(struct ceph_osd_client *osdc,
			
 
				-		    u64 tid)
			
 
				+static bool osd_homeless(struct ceph_osd *osd)
			
 
				 {
			
 
				-	struct ceph_osd_request *req;
			
 
				-	struct rb_node *n = osdc->requests.rb_node;
			
 
				-
			
 
				-	while (n) {
			
 
				-		req = rb_entry(n, struct ceph_osd_request, r_node);
			
 
				-		if (tid < req->r_tid) {
			
 
				-			if (!n->rb_left)
			
 
				-				return req;
			
 
				-			n = n->rb_left;
			
 
				-		} else if (tid > req->r_tid) {
			
 
				-			n = n->rb_right;
			
 
				-		} else {
			
 
				-			return req;
			
 
				-		}
			
 
				-	}
			
 
				-	return NULL;
			
 
				+	return osd->o_osd == CEPH_HOMELESS_OSD;
			
 
				 }
			
 
				 
			
 
				-static void __kick_linger_request(struct ceph_osd_request *req)
			
 
				+static bool osd_registered(struct ceph_osd *osd)
			
 
				 {
			
 
				-	struct ceph_osd_client *osdc = req->r_osdc;
			
 
				-	struct ceph_osd *osd = req->r_osd;
			
 
				-
			
 
				-	/*
			
 
				-	 * Linger requests need to be resent with a new tid to avoid
			
 
				-	 * the dup op detection logic on the OSDs.  Achieve this with
			
 
				-	 * a re-register dance instead of open-coding.
			
 
				-	 */
			
 
				-	ceph_osdc_get_request(req);
			
 
				-	if (!list_empty(&req->r_linger_item))
			
 
				-		__unregister_linger_request(osdc, req);
			
 
				-	else
			
 
				-		__unregister_request(osdc, req);
			
 
				-	__register_request(osdc, req);
			
 
				-	ceph_osdc_put_request(req);
			
 
				-
			
 
				-	/*
			
 
				-	 * Unless request has been registered as both normal and
			
 
				-	 * lingering, __unregister{,_linger}_request clears r_osd.
			
 
				-	 * However, here we need to preserve r_osd to make sure we
			
 
				-	 * requeue on the same OSD.
			
 
				-	 */
			
 
				-	WARN_ON(req->r_osd || !osd);
			
 
				-	req->r_osd = osd;
			
 
				+	verify_osdc_locked(osd->o_osdc);
			
 
				 
			
 
				-	dout("%s requeueing %p tid %llu\n", __func__, req, req->r_tid);
			
 
				-	__enqueue_request(req);
			
 
				+	return !RB_EMPTY_NODE(&osd->o_node);
			
 
				 }
			
 
				 
			
 
				 /*
			
 
				- * Resubmit requests pending on the given osd.
			
 
				+ * Assumes @osd is zero-initialized.
			
 
				  */
			
 
				-static void __kick_osd_requests(struct ceph_osd_client *osdc,
			
 
				-				struct ceph_osd *osd)
			
 
				+static void osd_init(struct ceph_osd *osd)
			
 
				 {
			
 
				-	struct ceph_osd_request *req, *nreq;
			
 
				-	LIST_HEAD(resend);
			
 
				-	LIST_HEAD(resend_linger);
			
 
				-	int err;
			
 
				-
			
 
				-	dout("%s osd%d\n", __func__, osd->o_osd);
			
 
				-	err = __reset_osd(osdc, osd);
			
 
				-	if (err)
			
 
				-		return;
			
 
				-
			
 
				-	/*
			
 
				-	 * Build up a list of requests to resend by traversing the
			
 
				-	 * osd's list of requests.  Requests for a given object are
			
 
				-	 * sent in tid order, and that is also the order they're
			
 
				-	 * kept on this list.  Therefore all requests that are in
			
 
				-	 * flight will be found first, followed by all requests that
			
 
				-	 * have not yet been sent.  And to resend requests while
			
 
				-	 * preserving this order we will want to put any sent
			
 
				-	 * requests back on the front of the osd client's unsent
			
 
				-	 * list.
			
 
				-	 *
			
 
				-	 * So we build a separate ordered list of already-sent
			
 
				-	 * requests for the affected osd and splice it onto the
			
 
				-	 * front of the osd client's unsent list.  Once we've seen a
			
 
				-	 * request that has not yet been sent we're done.  Those
			
 
				-	 * requests are already sitting right where they belong.
			
 
				-	 */
			
 
				-	list_for_each_entry(req, &osd->o_requests, r_osd_item) {
			
 
				-		if (!req->r_sent)
			
 
				-			break;
			
 
				-
			
 
				-		if (!req->r_linger) {
			
 
				-			dout("%s requeueing %p tid %llu\n", __func__, req,
			
 
				-			     req->r_tid);
			
 
				-			list_move_tail(&req->r_req_lru_item, &resend);
			
 
				-			req->r_flags |= CEPH_OSD_FLAG_RETRY;
			
 
				-		} else {
			
 
				-			list_move_tail(&req->r_req_lru_item, &resend_linger);
			
 
				-		}
			
 
				-	}
			
 
				-	list_splice(&resend, &osdc->req_unsent);
			
 
				-
			
 
				-	/*
			
 
				-	 * Both registered and not yet registered linger requests are
			
 
				-	 * enqueued with a new tid on the same OSD.  We add/move them
			
 
				-	 * to req_unsent/o_requests at the end to keep things in tid
			
 
				-	 * order.
			
 
				-	 */
			
 
				-	list_for_each_entry_safe(req, nreq, &osd->o_linger_requests,
			
 
				-				 r_linger_osd_item) {
			
 
				-		WARN_ON(!list_empty(&req->r_req_lru_item));
			
 
				-		__kick_linger_request(req);
			
 
				-	}
			
 
				-
			
 
				-	list_for_each_entry_safe(req, nreq, &resend_linger, r_req_lru_item)
			
 
				-		__kick_linger_request(req);
			
 
				+	atomic_set(&osd->o_ref, 1);
			
 
				+	RB_CLEAR_NODE(&osd->o_node);
			
 
				+	osd->o_requests = RB_ROOT;
			
 
				+	osd->o_linger_requests = RB_ROOT;
			
 
				+	INIT_LIST_HEAD(&osd->o_osd_lru);
			
 
				+	INIT_LIST_HEAD(&osd->o_keepalive_item);
			
 
				+	osd->o_incarnation = 1;
			
 
				+	mutex_init(&osd->lock);
			
 
				 }
			
 
				 
			
 
				-/*
			
 
				- * If the osd connection drops, we need to resubmit all requests.
			
 
				- */
			
 
				-static void osd_reset(struct ceph_connection *con)
			
 
				+static void osd_cleanup(struct ceph_osd *osd)
			
 
				 {
			
 
				-	struct ceph_osd *osd = con->private;
			
 
				-	struct ceph_osd_client *osdc;
			
 
				-
			
 
				-	if (!osd)
			
 
				-		return;
			
 
				-	dout("osd_reset osd%d\n", osd->o_osd);
			
 
				-	osdc = osd->o_osdc;
			
 
				-	down_read(&osdc->map_sem);
			
 
				-	mutex_lock(&osdc->request_mutex);
			
 
				-	__kick_osd_requests(osdc, osd);
			
 
				-	__send_queued(osdc);
			
 
				-	mutex_unlock(&osdc->request_mutex);
			
 
				-	up_read(&osdc->map_sem);
			
 
				+	WARN_ON(!RB_EMPTY_NODE(&osd->o_node));
			
 
				+	WARN_ON(!RB_EMPTY_ROOT(&osd->o_requests));
			
 
				+	WARN_ON(!RB_EMPTY_ROOT(&osd->o_linger_requests));
			
 
				+	WARN_ON(!list_empty(&osd->o_osd_lru));
			
 
				+	WARN_ON(!list_empty(&osd->o_keepalive_item));
			
 
				+
			
 
				+	if (osd->o_auth.authorizer) {
			
 
				+		WARN_ON(osd_homeless(osd));
			
 
				+		ceph_auth_destroy_authorizer(osd->o_auth.authorizer);
			
 
				+	}
			
 
				 }
			
 
				 
			
 
				 /*
			
@@ -1051,22 +1021,15 @@ static struct ceph_osd *create_osd(struct ceph_osd_client *osdc, int onum)
 
				 {
			
 
				 	struct ceph_osd *osd;
			
 
				 
			
 
				-	osd = kzalloc(sizeof(*osd), GFP_NOFS);
			
 
				-	if (!osd)
			
 
				-		return NULL;
			
 
				+	WARN_ON(onum == CEPH_HOMELESS_OSD);
			
 
				 
			
 
				-	atomic_set(&osd->o_ref, 1);
			
 
				+	osd = kzalloc(sizeof(*osd), GFP_NOIO | __GFP_NOFAIL);
			
 
				+	osd_init(osd);
			
 
				 	osd->o_osdc = osdc;
			
 
				 	osd->o_osd = onum;
			
 
				-	RB_CLEAR_NODE(&osd->o_node);
			
 
				-	INIT_LIST_HEAD(&osd->o_requests);
			
 
				-	INIT_LIST_HEAD(&osd->o_linger_requests);
			
 
				-	INIT_LIST_HEAD(&osd->o_osd_lru);
			
 
				-	osd->o_incarnation = 1;
			
 
				 
			
 
				 	ceph_con_init(&osd->o_con, osd, &osd_con_ops, &osdc->client->msgr);
			
 
				 
			
 
				-	INIT_LIST_HEAD(&osd->o_keepalive_item);
			
 
				 	return osd;
			
 
				 }
			
 
				 
			
@@ -1087,114 +1050,115 @@ static void put_osd(struct ceph_osd *osd)
 
				 	dout("put_osd %p %d -> %d\n", osd, atomic_read(&osd->o_ref),
			
 
				 	     atomic_read(&osd->o_ref) - 1);
			
 
				 	if (atomic_dec_and_test(&osd->o_ref)) {
			
 
				-		if (osd->o_auth.authorizer)
			
 
				-			ceph_auth_destroy_authorizer(osd->o_auth.authorizer);
			
 
				+		osd_cleanup(osd);
			
 
				 		kfree(osd);
			
 
				 	}
			
 
				 }
			
 
				 
			
 
				-/*
			
 
				- * remove an osd from our map
			
 
				- */
			
 
				-static void __remove_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd)
			
 
				-{
			
 
				-	dout("%s %p osd%d\n", __func__, osd, osd->o_osd);
			
 
				-	WARN_ON(!list_empty(&osd->o_requests));
			
 
				-	WARN_ON(!list_empty(&osd->o_linger_requests));
			
 
				-
			
 
				-	list_del_init(&osd->o_osd_lru);
			
 
				-	rb_erase(&osd->o_node, &osdc->osds);
			
 
				-	RB_CLEAR_NODE(&osd->o_node);
			
 
				-}
			
 
				-
			
 
				-static void remove_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd)
			
 
				-{
			
 
				-	dout("%s %p osd%d\n", __func__, osd, osd->o_osd);
			
 
				-
			
 
				-	if (!RB_EMPTY_NODE(&osd->o_node)) {
			
 
				-		ceph_con_close(&osd->o_con);
			
 
				-		__remove_osd(osdc, osd);
			
 
				-		put_osd(osd);
			
 
				-	}
			
 
				-}
			
 
				+DEFINE_RB_FUNCS(osd, struct ceph_osd, o_osd, o_node)
			
 
				 
			
 
				-static void remove_all_osds(struct ceph_osd_client *osdc)
			
 
				+static void __move_osd_to_lru(struct ceph_osd *osd)
			
 
				 {
			
 
				-	dout("%s %p\n", __func__, osdc);
			
 
				-	mutex_lock(&osdc->request_mutex);
			
 
				-	while (!RB_EMPTY_ROOT(&osdc->osds)) {
			
 
				-		struct ceph_osd *osd = rb_entry(rb_first(&osdc->osds),
			
 
				-						struct ceph_osd, o_node);
			
 
				-		remove_osd(osdc, osd);
			
 
				-	}
			
 
				-	mutex_unlock(&osdc->request_mutex);
			
 
				-}
			
 
				+	struct ceph_osd_client *osdc = osd->o_osdc;
			
 
				 
			
 
				-static void __move_osd_to_lru(struct ceph_osd_client *osdc,
			
 
				-			      struct ceph_osd *osd)
			
 
				-{
			
 
				-	dout("%s %p\n", __func__, osd);
			
 
				+	dout("%s osd %p osd%d\n", __func__, osd, osd->o_osd);
			
 
				 	BUG_ON(!list_empty(&osd->o_osd_lru));
			
 
				 
			
 
				+	spin_lock(&osdc->osd_lru_lock);
			
 
				 	list_add_tail(&osd->o_osd_lru, &osdc->osd_lru);
			
 
				+	spin_unlock(&osdc->osd_lru_lock);
			
 
				+
			
 
				 	osd->lru_ttl = jiffies + osdc->client->options->osd_idle_ttl;
			
 
				 }
			
 
				 
			
 
				-static void maybe_move_osd_to_lru(struct ceph_osd_client *osdc,
			
 
				-				  struct ceph_osd *osd)
			
 
				+static void maybe_move_osd_to_lru(struct ceph_osd *osd)
			
 
				 {
			
 
				-	dout("%s %p\n", __func__, osd);
			
 
				-
			
 
				-	if (list_empty(&osd->o_requests) &&
			
 
				-	    list_empty(&osd->o_linger_requests))
			
 
				-		__move_osd_to_lru(osdc, osd);
			
 
				+	if (RB_EMPTY_ROOT(&osd->o_requests) &&
			
 
				+	    RB_EMPTY_ROOT(&osd->o_linger_requests))
			
 
				+		__move_osd_to_lru(osd);
			
 
				 }
			
 
				 
			
 
				 static void __remove_osd_from_lru(struct ceph_osd *osd)
			
 
				 {
			
 
				-	dout("__remove_osd_from_lru %p\n", osd);
			
 
				+	struct ceph_osd_client *osdc = osd->o_osdc;
			
 
				+
			
 
				+	dout("%s osd %p osd%d\n", __func__, osd, osd->o_osd);
			
 
				+
			
 
				+	spin_lock(&osdc->osd_lru_lock);
			
 
				 	if (!list_empty(&osd->o_osd_lru))
			
 
				 		list_del_init(&osd->o_osd_lru);
			
 
				+	spin_unlock(&osdc->osd_lru_lock);
			
 
				 }
			
 
				 
			
 
				-static void remove_old_osds(struct ceph_osd_client *osdc)
			
 
				+/*
			
 
				+ * Close the connection and assign any leftover requests to the
			
 
				+ * homeless session.
			
 
				+ */
			
 
				+static void close_osd(struct ceph_osd *osd)
			
 
				 {
			
 
				-	struct ceph_osd *osd, *nosd;
			
 
				+	struct ceph_osd_client *osdc = osd->o_osdc;
			
 
				+	struct rb_node *n;
			
 
				 
			
 
				-	dout("__remove_old_osds %p\n", osdc);
			
 
				-	mutex_lock(&osdc->request_mutex);
			
 
				-	list_for_each_entry_safe(osd, nosd, &osdc->osd_lru, o_osd_lru) {
			
 
				-		if (time_before(jiffies, osd->lru_ttl))
			
 
				-			break;
			
 
				-		remove_osd(osdc, osd);
			
 
				+	verify_osdc_wrlocked(osdc);
			
 
				+	dout("%s osd %p osd%d\n", __func__, osd, osd->o_osd);
			
 
				+
			
 
				+	ceph_con_close(&osd->o_con);
			
 
				+
			
 
				+	for (n = rb_first(&osd->o_requests); n; ) {
			
 
				+		struct ceph_osd_request *req =
			
 
				+		    rb_entry(n, struct ceph_osd_request, r_node);
			
 
				+
			
 
				+		n = rb_next(n); /* unlink_request() */
			
 
				+
			
 
				+		dout(" reassigning req %p tid %llu\n", req, req->r_tid);
			
 
				+		unlink_request(osd, req);
			
 
				+		link_request(&osdc->homeless_osd, req);
			
 
				+	}
			
 
				+	for (n = rb_first(&osd->o_linger_requests); n; ) {
			
 
				+		struct ceph_osd_linger_request *lreq =
			
 
				+		    rb_entry(n, struct ceph_osd_linger_request, node);
			
 
				+
			
 
				+		n = rb_next(n); /* unlink_linger() */
			
 
				+
			
 
				+		dout(" reassigning lreq %p linger_id %llu\n", lreq,
			
 
				+		     lreq->linger_id);
			
 
				+		unlink_linger(osd, lreq);
			
 
				+		link_linger(&osdc->homeless_osd, lreq);
			
 
				 	}
			
 
				-	mutex_unlock(&osdc->request_mutex);
			
 
				+
			
 
				+	__remove_osd_from_lru(osd);
			
 
				+	erase_osd(&osdc->osds, osd);
			
 
				+	put_osd(osd);
			
 
				 }
			
 
				 
			
 
				 /*
			
 
				  * reset osd connect
			
 
				  */
			
 
				-static int __reset_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd)
			
 
				+static int reopen_osd(struct ceph_osd *osd)
			
 
				 {
			
 
				 	struct ceph_entity_addr *peer_addr;
			
 
				 
			
 
				-	dout("__reset_osd %p osd%d\n", osd, osd->o_osd);
			
 
				-	if (list_empty(&osd->o_requests) &&
			
 
				-	    list_empty(&osd->o_linger_requests)) {
			
 
				-		remove_osd(osdc, osd);
			
 
				+	dout("%s osd %p osd%d\n", __func__, osd, osd->o_osd);
			
 
				+
			
 
				+	if (RB_EMPTY_ROOT(&osd->o_requests) &&
			
 
				+	    RB_EMPTY_ROOT(&osd->o_linger_requests)) {
			
 
				+		close_osd(osd);
			
 
				 		return -ENODEV;
			
 
				 	}
			
 
				 
			
 
				-	peer_addr = &osdc->osdmap->osd_addr[osd->o_osd];
			
 
				+	peer_addr = &osd->o_osdc->osdmap->osd_addr[osd->o_osd];
			
 
				 	if (!memcmp(peer_addr, &osd->o_con.peer_addr, sizeof (*peer_addr)) &&
			
 
				 			!ceph_con_opened(&osd->o_con)) {
			
 
				-		struct ceph_osd_request *req;
			
 
				+		struct rb_node *n;
			
 
				 
			
 
				 		dout("osd addr hasn't changed and connection never opened, "
			
 
				 		     "letting msgr retry\n");
			
 
				 		/* touch each r_stamp for handle_timeout()'s benfit */
			
 
				-		list_for_each_entry(req, &osd->o_requests, r_osd_item)
			
 
				+		for (n = rb_first(&osd->o_requests); n; n = rb_next(n)) {
			
 
				+			struct ceph_osd_request *req =
			
 
				+			    rb_entry(n, struct ceph_osd_request, r_node);
			
 
				 			req->r_stamp = jiffies;
			
 
				+		}
			
 
				 
			
 
				 		return -EAGAIN;
			
 
				 	}
			
@@ -1206,455 +1170,1370 @@ static int __reset_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd)
 
				 	return 0;
			
 
				 }
			
 
				 
			
 
				-static void __insert_osd(struct ceph_osd_client *osdc, struct ceph_osd *new)
			
 
				+static struct ceph_osd *lookup_create_osd(struct ceph_osd_client *osdc, int o,
			
 
				+					  bool wrlocked)
			
 
				 {
			
 
				-	struct rb_node **p = &osdc->osds.rb_node;
			
 
				-	struct rb_node *parent = NULL;
			
 
				-	struct ceph_osd *osd = NULL;
			
 
				+	struct ceph_osd *osd;
			
 
				 
			
 
				-	dout("__insert_osd %p osd%d\n", new, new->o_osd);
			
 
				-	while (*p) {
			
 
				-		parent = *p;
			
 
				-		osd = rb_entry(parent, struct ceph_osd, o_node);
			
 
				-		if (new->o_osd < osd->o_osd)
			
 
				-			p = &(*p)->rb_left;
			
 
				-		else if (new->o_osd > osd->o_osd)
			
 
				-			p = &(*p)->rb_right;
			
 
				-		else
			
 
				-			BUG();
			
 
				-	}
			
 
				+	if (wrlocked)
			
 
				+		verify_osdc_wrlocked(osdc);
			
 
				+	else
			
 
				+		verify_osdc_locked(osdc);
			
 
				 
			
 
				-	rb_link_node(&new->o_node, parent, p);
			
 
				-	rb_insert_color(&new->o_node, &osdc->osds);
			
 
				+	if (o != CEPH_HOMELESS_OSD)
			
 
				+		osd = lookup_osd(&osdc->osds, o);
			
 
				+	else
			
 
				+		osd = &osdc->homeless_osd;
			
 
				+	if (!osd) {
			
 
				+		if (!wrlocked)
			
 
				+			return ERR_PTR(-EAGAIN);
			
 
				+
			
 
				+		osd = create_osd(osdc, o);
			
 
				+		insert_osd(&osdc->osds, osd);
			
 
				+		ceph_con_open(&osd->o_con, CEPH_ENTITY_TYPE_OSD, osd->o_osd,
			
 
				+			      &osdc->osdmap->osd_addr[osd->o_osd]);
			
 
				+	}
			
 
				+
			
 
				+	dout("%s osdc %p osd%d -> osd %p\n", __func__, osdc, o, osd);
			
 
				+	return osd;
			
 
				 }
			
 
				 
			
 
				-static struct ceph_osd *__lookup_osd(struct ceph_osd_client *osdc, int o)
			
 
				+/*
			
 
				+ * Create request <-> OSD session relation.
			
 
				+ *
			
 
				+ * @req has to be assigned a tid, @osd may be homeless.
			
 
				+ */
			
 
				+static void link_request(struct ceph_osd *osd, struct ceph_osd_request *req)
			
 
				 {
			
 
				-	struct ceph_osd *osd;
			
 
				-	struct rb_node *n = osdc->osds.rb_node;
			
 
				-
			
 
				-	while (n) {
			
 
				-		osd = rb_entry(n, struct ceph_osd, o_node);
			
 
				-		if (o < osd->o_osd)
			
 
				-			n = n->rb_left;
			
 
				-		else if (o > osd->o_osd)
			
 
				-			n = n->rb_right;
			
 
				-		else
			
 
				-			return osd;
			
 
				-	}
			
 
				-	return NULL;
			
 
				+	verify_osd_locked(osd);
			
 
				+	WARN_ON(!req->r_tid || req->r_osd);
			
 
				+	dout("%s osd %p osd%d req %p tid %llu\n", __func__, osd, osd->o_osd,
			
 
				+	     req, req->r_tid);
			
 
				+
			
 
				+	if (!osd_homeless(osd))
			
 
				+		__remove_osd_from_lru(osd);
			
 
				+	else
			
 
				+		atomic_inc(&osd->o_osdc->num_homeless);
			
 
				+
			
 
				+	get_osd(osd);
			
 
				+	insert_request(&osd->o_requests, req);
			
 
				+	req->r_osd = osd;
			
 
				 }
			
 
				 
			
 
				-static void __schedule_osd_timeout(struct ceph_osd_client *osdc)
			
 
				+static void unlink_request(struct ceph_osd *osd, struct ceph_osd_request *req)
			
 
				 {
			
 
				-	schedule_delayed_work(&osdc->timeout_work,
			
 
				-			      osdc->client->options->osd_keepalive_timeout);
			
 
				+	verify_osd_locked(osd);
			
 
				+	WARN_ON(req->r_osd != osd);
			
 
				+	dout("%s osd %p osd%d req %p tid %llu\n", __func__, osd, osd->o_osd,
			
 
				+	     req, req->r_tid);
			
 
				+
			
 
				+	req->r_osd = NULL;
			
 
				+	erase_request(&osd->o_requests, req);
			
 
				+	put_osd(osd);
			
 
				+
			
 
				+	if (!osd_homeless(osd))
			
 
				+		maybe_move_osd_to_lru(osd);
			
 
				+	else
			
 
				+		atomic_dec(&osd->o_osdc->num_homeless);
			
 
				 }
			
 
				 
			
 
				-static void __cancel_osd_timeout(struct ceph_osd_client *osdc)
			
 
				+static bool __pool_full(struct ceph_pg_pool_info *pi)
			
 
				 {
			
 
				-	cancel_delayed_work(&osdc->timeout_work);
			
 
				+	return pi->flags & CEPH_POOL_FLAG_FULL;
			
 
				 }
			
 
				 
			
 
				-/*
			
 
				- * Register request, assign tid.  If this is the first request, set up
			
 
				- * the timeout event.
			
 
				- */
			
 
				-static void __register_request(struct ceph_osd_client *osdc,
			
 
				-			       struct ceph_osd_request *req)
			
 
				+static bool have_pool_full(struct ceph_osd_client *osdc)
			
 
				 {
			
 
				-	req->r_tid = ++osdc->last_tid;
			
 
				-	req->r_request->hdr.tid = cpu_to_le64(req->r_tid);
			
 
				-	dout("__register_request %p tid %lld\n", req, req->r_tid);
			
 
				-	__insert_request(osdc, req);
			
 
				-	ceph_osdc_get_request(req);
			
 
				-	osdc->num_requests++;
			
 
				-	if (osdc->num_requests == 1) {
			
 
				-		dout(" first request, scheduling timeout\n");
			
 
				-		__schedule_osd_timeout(osdc);
			
 
				+	struct rb_node *n;
			
 
				+
			
 
				+	for (n = rb_first(&osdc->osdmap->pg_pools); n; n = rb_next(n)) {
			
 
				+		struct ceph_pg_pool_info *pi =
			
 
				+		    rb_entry(n, struct ceph_pg_pool_info, node);
			
 
				+
			
 
				+		if (__pool_full(pi))
			
 
				+			return true;
			
 
				 	}
			
 
				+
			
 
				+	return false;
			
 
				+}
			
 
				+
			
 
				+static bool pool_full(struct ceph_osd_client *osdc, s64 pool_id)
			
 
				+{
			
 
				+	struct ceph_pg_pool_info *pi;
			
 
				+
			
 
				+	pi = ceph_pg_pool_by_id(osdc->osdmap, pool_id);
			
 
				+	if (!pi)
			
 
				+		return false;
			
 
				+
			
 
				+	return __pool_full(pi);
			
 
				 }
			
 
				 
			
 
				 /*
			
 
				- * called under osdc->request_mutex
			
 
				+ * Returns whether a request should be blocked from being sent
			
 
				+ * based on the current osdmap and osd_client settings.
			
 
				  */
			
 
				-static void __unregister_request(struct ceph_osd_client *osdc,
			
 
				-				 struct ceph_osd_request *req)
			
 
				+static bool target_should_be_paused(struct ceph_osd_client *osdc,
			
 
				+				    const struct ceph_osd_request_target *t,
			
 
				+				    struct ceph_pg_pool_info *pi)
			
 
				 {
			
 
				-	if (RB_EMPTY_NODE(&req->r_node)) {
			
 
				-		dout("__unregister_request %p tid %lld not registered\n",
			
 
				-			req, req->r_tid);
			
 
				-		return;
			
 
				+	bool pauserd = ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSERD);
			
 
				+	bool pausewr = ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSEWR) ||
			
 
				+		       ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL) ||
			
 
				+		       __pool_full(pi);
			
 
				+
			
 
				+	WARN_ON(pi->id != t->base_oloc.pool);
			
 
				+	return (t->flags & CEPH_OSD_FLAG_READ && pauserd) ||
			
 
				+	       (t->flags & CEPH_OSD_FLAG_WRITE && pausewr);
			
 
				+}
			
 
				+
			
 
				+enum calc_target_result {
			
 
				+	CALC_TARGET_NO_ACTION = 0,
			
 
				+	CALC_TARGET_NEED_RESEND,
			
 
				+	CALC_TARGET_POOL_DNE,
			
 
				+};
			
 
				+
			
 
				+static enum calc_target_result calc_target(struct ceph_osd_client *osdc,
			
 
				+					   struct ceph_osd_request_target *t,
			
 
				+					   u32 *last_force_resend,
			
 
				+					   bool any_change)
			
 
				+{
			
 
				+	struct ceph_pg_pool_info *pi;
			
 
				+	struct ceph_pg pgid, last_pgid;
			
 
				+	struct ceph_osds up, acting;
			
 
				+	bool force_resend = false;
			
 
				+	bool need_check_tiering = false;
			
 
				+	bool need_resend = false;
			
 
				+	bool sort_bitwise = ceph_osdmap_flag(osdc->osdmap,
			
 
				+					     CEPH_OSDMAP_SORTBITWISE);
			
 
				+	enum calc_target_result ct_res;
			
 
				+	int ret;
			
 
				+
			
 
				+	pi = ceph_pg_pool_by_id(osdc->osdmap, t->base_oloc.pool);
			
 
				+	if (!pi) {
			
 
				+		t->osd = CEPH_HOMELESS_OSD;
			
 
				+		ct_res = CALC_TARGET_POOL_DNE;
			
 
				+		goto out;
			
 
				 	}
			
 
				 
			
 
				-	dout("__unregister_request %p tid %lld\n", req, req->r_tid);
			
 
				-	rb_erase(&req->r_node, &osdc->requests);
			
 
				-	RB_CLEAR_NODE(&req->r_node);
			
 
				-	osdc->num_requests--;
			
 
				+	if (osdc->osdmap->epoch == pi->last_force_request_resend) {
			
 
				+		if (last_force_resend &&
			
 
				+		    *last_force_resend < pi->last_force_request_resend) {
			
 
				+			*last_force_resend = pi->last_force_request_resend;
			
 
				+			force_resend = true;
			
 
				+		} else if (!last_force_resend) {
			
 
				+			force_resend = true;
			
 
				+		}
			
 
				+	}
			
 
				+	if (ceph_oid_empty(&t->target_oid) || force_resend) {
			
 
				+		ceph_oid_copy(&t->target_oid, &t->base_oid);
			
 
				+		need_check_tiering = true;
			
 
				+	}
			
 
				+	if (ceph_oloc_empty(&t->target_oloc) || force_resend) {
			
 
				+		ceph_oloc_copy(&t->target_oloc, &t->base_oloc);
			
 
				+		need_check_tiering = true;
			
 
				+	}
			
 
				 
			
 
				-	if (req->r_osd) {
			
 
				-		/* make sure the original request isn't in flight. */
			
 
				-		ceph_msg_revoke(req->r_request);
			
 
				+	if (need_check_tiering &&
			
 
				+	    (t->flags & CEPH_OSD_FLAG_IGNORE_OVERLAY) == 0) {
			
 
				+		if (t->flags & CEPH_OSD_FLAG_READ && pi->read_tier >= 0)
			
 
				+			t->target_oloc.pool = pi->read_tier;
			
 
				+		if (t->flags & CEPH_OSD_FLAG_WRITE && pi->write_tier >= 0)
			
 
				+			t->target_oloc.pool = pi->write_tier;
			
 
				+	}
			
 
				 
			
 
				-		list_del_init(&req->r_osd_item);
			
 
				-		maybe_move_osd_to_lru(osdc, req->r_osd);
			
 
				-		if (list_empty(&req->r_linger_osd_item))
			
 
				-			req->r_osd = NULL;
			
 
				+	ret = ceph_object_locator_to_pg(osdc->osdmap, &t->target_oid,
			
 
				+					&t->target_oloc, &pgid);
			
 
				+	if (ret) {
			
 
				+		WARN_ON(ret != -ENOENT);
			
 
				+		t->osd = CEPH_HOMELESS_OSD;
			
 
				+		ct_res = CALC_TARGET_POOL_DNE;
			
 
				+		goto out;
			
 
				+	}
			
 
				+	last_pgid.pool = pgid.pool;
			
 
				+	last_pgid.seed = ceph_stable_mod(pgid.seed, t->pg_num, t->pg_num_mask);
			
 
				+
			
 
				+	ceph_pg_to_up_acting_osds(osdc->osdmap, &pgid, &up, &acting);
			
 
				+	if (any_change &&
			
 
				+	    ceph_is_new_interval(&t->acting,
			
 
				+				 &acting,
			
 
				+				 &t->up,
			
 
				+				 &up,
			
 
				+				 t->size,
			
 
				+				 pi->size,
			
 
				+				 t->min_size,
			
 
				+				 pi->min_size,
			
 
				+				 t->pg_num,
			
 
				+				 pi->pg_num,
			
 
				+				 t->sort_bitwise,
			
 
				+				 sort_bitwise,
			
 
				+				 &last_pgid))
			
 
				+		force_resend = true;
			
 
				+
			
 
				+	if (t->paused && !target_should_be_paused(osdc, t, pi)) {
			
 
				+		t->paused = false;
			
 
				+		need_resend = true;
			
 
				 	}
			
 
				 
			
 
				-	list_del_init(&req->r_req_lru_item);
			
 
				-	ceph_osdc_put_request(req);
			
 
				+	if (ceph_pg_compare(&t->pgid, &pgid) ||
			
 
				+	    ceph_osds_changed(&t->acting, &acting, any_change) ||
			
 
				+	    force_resend) {
			
 
				+		t->pgid = pgid; /* struct */
			
 
				+		ceph_osds_copy(&t->acting, &acting);
			
 
				+		ceph_osds_copy(&t->up, &up);
			
 
				+		t->size = pi->size;
			
 
				+		t->min_size = pi->min_size;
			
 
				+		t->pg_num = pi->pg_num;
			
 
				+		t->pg_num_mask = pi->pg_num_mask;
			
 
				+		t->sort_bitwise = sort_bitwise;
			
 
				+
			
 
				+		t->osd = acting.primary;
			
 
				+		need_resend = true;
			
 
				+	}
			
 
				+
			
 
				+	ct_res = need_resend ? CALC_TARGET_NEED_RESEND : CALC_TARGET_NO_ACTION;
			
 
				+out:
			
 
				+	dout("%s t %p -> ct_res %d osd %d\n", __func__, t, ct_res, t->osd);
			
 
				+	return ct_res;
			
 
				+}
			
 
				+
			
 
				+static void setup_request_data(struct ceph_osd_request *req,
			
 
				+			       struct ceph_msg *msg)
			
 
				+{
			
 
				+	u32 data_len = 0;
			
 
				+	int i;
			
 
				+
			
 
				+	if (!list_empty(&msg->data))
			
 
				+		return;
			
 
				+
			
 
				+	WARN_ON(msg->data_length);
			
 
				+	for (i = 0; i < req->r_num_ops; i++) {
			
 
				+		struct ceph_osd_req_op *op = &req->r_ops[i];
			
 
				+
			
 
				+		switch (op->op) {
			
 
				+		/* request */
			
 
				+		case CEPH_OSD_OP_WRITE:
			
 
				+		case CEPH_OSD_OP_WRITEFULL:
			
 
				+			WARN_ON(op->indata_len != op->extent.length);
			
 
				+			ceph_osdc_msg_data_add(msg, &op->extent.osd_data);
			
 
				+			break;
			
 
				+		case CEPH_OSD_OP_SETXATTR:
			
 
				+		case CEPH_OSD_OP_CMPXATTR:
			
 
				+			WARN_ON(op->indata_len != op->xattr.name_len +
			
 
				+						  op->xattr.value_len);
			
 
				+			ceph_osdc_msg_data_add(msg, &op->xattr.osd_data);
			
 
				+			break;
			
 
				+		case CEPH_OSD_OP_NOTIFY_ACK:
			
 
				+			ceph_osdc_msg_data_add(msg,
			
 
				+					       &op->notify_ack.request_data);
			
 
				+			break;
			
 
				+
			
 
				+		/* reply */
			
 
				+		case CEPH_OSD_OP_STAT:
			
 
				+			ceph_osdc_msg_data_add(req->r_reply,
			
 
				+					       &op->raw_data_in);
			
 
				+			break;
			
 
				+		case CEPH_OSD_OP_READ:
			
 
				+			ceph_osdc_msg_data_add(req->r_reply,
			
 
				+					       &op->extent.osd_data);
			
 
				+			break;
			
 
				 
			
 
				-	if (osdc->num_requests == 0) {
			
 
				-		dout(" no requests, canceling timeout\n");
			
 
				-		__cancel_osd_timeout(osdc);
			
 
				+		/* both */
			
 
				+		case CEPH_OSD_OP_CALL:
			
 
				+			WARN_ON(op->indata_len != op->cls.class_len +
			
 
				+						  op->cls.method_len +
			
 
				+						  op->cls.indata_len);
			
 
				+			ceph_osdc_msg_data_add(msg, &op->cls.request_info);
			
 
				+			/* optional, can be NONE */
			
 
				+			ceph_osdc_msg_data_add(msg, &op->cls.request_data);
			
 
				+			/* optional, can be NONE */
			
 
				+			ceph_osdc_msg_data_add(req->r_reply,
			
 
				+					       &op->cls.response_data);
			
 
				+			break;
			
 
				+		case CEPH_OSD_OP_NOTIFY:
			
 
				+			ceph_osdc_msg_data_add(msg,
			
 
				+					       &op->notify.request_data);
			
 
				+			ceph_osdc_msg_data_add(req->r_reply,
			
 
				+					       &op->notify.response_data);
			
 
				+			break;
			
 
				+		}
			
 
				+
			
 
				+		data_len += op->indata_len;
			
 
				 	}
			
 
				+
			
 
				+	WARN_ON(data_len != msg->data_length);
			
 
				+}
			
 
				+
			
 
				+static void encode_request(struct ceph_osd_request *req, struct ceph_msg *msg)
			
 
				+{
			
 
				+	void *p = msg->front.iov_base;
			
 
				+	void *const end = p + msg->front_alloc_len;
			
 
				+	u32 data_len = 0;
			
 
				+	int i;
			
 
				+
			
 
				+	if (req->r_flags & CEPH_OSD_FLAG_WRITE) {
			
 
				+		/* snapshots aren't writeable */
			
 
				+		WARN_ON(req->r_snapid != CEPH_NOSNAP);
			
 
				+	} else {
			
 
				+		WARN_ON(req->r_mtime.tv_sec || req->r_mtime.tv_nsec ||
			
 
				+			req->r_data_offset || req->r_snapc);
			
 
				+	}
			
 
				+
			
 
				+	setup_request_data(req, msg);
			
 
				+
			
 
				+	ceph_encode_32(&p, 1); /* client_inc, always 1 */
			
 
				+	ceph_encode_32(&p, req->r_osdc->osdmap->epoch);
			
 
				+	ceph_encode_32(&p, req->r_flags);
			
 
				+	ceph_encode_timespec(p, &req->r_mtime);
			
 
				+	p += sizeof(struct ceph_timespec);
			
 
				+	/* aka reassert_version */
			
 
				+	memcpy(p, &req->r_replay_version, sizeof(req->r_replay_version));
			
 
				+	p += sizeof(req->r_replay_version);
			
 
				+
			
 
				+	/* oloc */
			
 
				+	ceph_encode_8(&p, 4);
			
 
				+	ceph_encode_8(&p, 4);
			
 
				+	ceph_encode_32(&p, 8 + 4 + 4);
			
 
				+	ceph_encode_64(&p, req->r_t.target_oloc.pool);
			
 
				+	ceph_encode_32(&p, -1); /* preferred */
			
 
				+	ceph_encode_32(&p, 0); /* key len */
			
 
				+
			
 
				+	/* pgid */
			
 
				+	ceph_encode_8(&p, 1);
			
 
				+	ceph_encode_64(&p, req->r_t.pgid.pool);
			
 
				+	ceph_encode_32(&p, req->r_t.pgid.seed);
			
 
				+	ceph_encode_32(&p, -1); /* preferred */
			
 
				+
			
 
				+	/* oid */
			
 
				+	ceph_encode_32(&p, req->r_t.target_oid.name_len);
			
 
				+	memcpy(p, req->r_t.target_oid.name, req->r_t.target_oid.name_len);
			
 
				+	p += req->r_t.target_oid.name_len;
			
 
				+
			
 
				+	/* ops, can imply data */
			
 
				+	ceph_encode_16(&p, req->r_num_ops);
			
 
				+	for (i = 0; i < req->r_num_ops; i++) {
			
 
				+		data_len += osd_req_encode_op(p, &req->r_ops[i]);
			
 
				+		p += sizeof(struct ceph_osd_op);
			
 
				+	}
			
 
				+
			
 
				+	ceph_encode_64(&p, req->r_snapid); /* snapid */
			
 
				+	if (req->r_snapc) {
			
 
				+		ceph_encode_64(&p, req->r_snapc->seq);
			
 
				+		ceph_encode_32(&p, req->r_snapc->num_snaps);
			
 
				+		for (i = 0; i < req->r_snapc->num_snaps; i++)
			
 
				+			ceph_encode_64(&p, req->r_snapc->snaps[i]);
			
 
				+	} else {
			
 
				+		ceph_encode_64(&p, 0); /* snap_seq */
			
 
				+		ceph_encode_32(&p, 0); /* snaps len */
			
 
				+	}
			
 
				+
			
 
				+	ceph_encode_32(&p, req->r_attempts); /* retry_attempt */
			
 
				+
			
 
				+	BUG_ON(p > end);
			
 
				+	msg->front.iov_len = p - msg->front.iov_base;
			
 
				+	msg->hdr.version = cpu_to_le16(4); /* MOSDOp v4 */
			
 
				+	msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
			
 
				+	msg->hdr.data_len = cpu_to_le32(data_len);
			
 
				+	/*
			
 
				+	 * The header "data_off" is a hint to the receiver allowing it
			
 
				+	 * to align received data into its buffers such that there's no
			
 
				+	 * need to re-copy it before writing it to disk (direct I/O).
			
 
				+	 */
			
 
				+	msg->hdr.data_off = cpu_to_le16(req->r_data_offset);
			
 
				+
			
 
				+	dout("%s req %p oid %*pE oid_len %d front %zu data %u\n", __func__,
			
 
				+	     req, req->r_t.target_oid.name_len, req->r_t.target_oid.name,
			
 
				+	     req->r_t.target_oid.name_len, msg->front.iov_len, data_len);
			
 
				 }
			
 
				 
			
 
				 /*
			
 
				- * Cancel a previously queued request message
			
 
				+ * @req has to be assigned a tid and registered.
			
 
				  */
			
 
				-static void __cancel_request(struct ceph_osd_request *req)
			
 
				+static void send_request(struct ceph_osd_request *req)
			
 
				 {
			
 
				-	if (req->r_sent && req->r_osd) {
			
 
				+	struct ceph_osd *osd = req->r_osd;
			
 
				+
			
 
				+	verify_osd_locked(osd);
			
 
				+	WARN_ON(osd->o_osd != req->r_t.osd);
			
 
				+
			
 
				+	/*
			
 
				+	 * We may have a previously queued request message hanging
			
 
				+	 * around.  Cancel it to avoid corrupting the msgr.
			
 
				+	 */
			
 
				+	if (req->r_sent)
			
 
				 		ceph_msg_revoke(req->r_request);
			
 
				-		req->r_sent = 0;
			
 
				+
			
 
				+	req->r_flags |= CEPH_OSD_FLAG_KNOWN_REDIR;
			
 
				+	if (req->r_attempts)
			
 
				+		req->r_flags |= CEPH_OSD_FLAG_RETRY;
			
 
				+	else
			
 
				+		WARN_ON(req->r_flags & CEPH_OSD_FLAG_RETRY);
			
 
				+
			
 
				+	encode_request(req, req->r_request);
			
 
				+
			
 
				+	dout("%s req %p tid %llu to pg %llu.%x osd%d flags 0x%x attempt %d\n",
			
 
				+	     __func__, req, req->r_tid, req->r_t.pgid.pool, req->r_t.pgid.seed,
			
 
				+	     req->r_t.osd, req->r_flags, req->r_attempts);
			
 
				+
			
 
				+	req->r_t.paused = false;
			
 
				+	req->r_stamp = jiffies;
			
 
				+	req->r_attempts++;
			
 
				+
			
 
				+	req->r_sent = osd->o_incarnation;
			
 
				+	req->r_request->hdr.tid = cpu_to_le64(req->r_tid);
			
 
				+	ceph_con_send(&osd->o_con, ceph_msg_get(req->r_request));
			
 
				+}
			
 
				+
			
 
				+static void maybe_request_map(struct ceph_osd_client *osdc)
			
 
				+{
			
 
				+	bool continuous = false;
			
 
				+
			
 
				+	verify_osdc_locked(osdc);
			
 
				+	WARN_ON(!osdc->osdmap->epoch);
			
 
				+
			
 
				+	if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL) ||
			
 
				+	    ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSERD) ||
			
 
				+	    ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSEWR)) {
			
 
				+		dout("%s osdc %p continuous\n", __func__, osdc);
			
 
				+		continuous = true;
			
 
				+	} else {
			
 
				+		dout("%s osdc %p onetime\n", __func__, osdc);
			
 
				 	}
			
 
				+
			
 
				+	if (ceph_monc_want_map(&osdc->client->monc, CEPH_SUB_OSDMAP,
			
 
				+			       osdc->osdmap->epoch + 1, continuous))
			
 
				+		ceph_monc_renew_subs(&osdc->client->monc);
			
 
				 }
			
 
				 
			
 
				-static void __register_linger_request(struct ceph_osd_client *osdc,
			
 
				-				    struct ceph_osd_request *req)
			
 
				+static void send_map_check(struct ceph_osd_request *req);
			
 
				+
			
 
				+static void __submit_request(struct ceph_osd_request *req, bool wrlocked)
			
 
				 {
			
 
				-	dout("%s %p tid %llu\n", __func__, req, req->r_tid);
			
 
				-	WARN_ON(!req->r_linger);
			
 
				+	struct ceph_osd_client *osdc = req->r_osdc;
			
 
				+	struct ceph_osd *osd;
			
 
				+	enum calc_target_result ct_res;
			
 
				+	bool need_send = false;
			
 
				+	bool promoted = false;
			
 
				+
			
 
				+	WARN_ON(req->r_tid || req->r_got_reply);
			
 
				+	dout("%s req %p wrlocked %d\n", __func__, req, wrlocked);
			
 
				+
			
 
				+again:
			
 
				+	ct_res = calc_target(osdc, &req->r_t, &req->r_last_force_resend, false);
			
 
				+	if (ct_res == CALC_TARGET_POOL_DNE && !wrlocked)
			
 
				+		goto promote;
			
 
				+
			
 
				+	osd = lookup_create_osd(osdc, req->r_t.osd, wrlocked);
			
 
				+	if (IS_ERR(osd)) {
			
 
				+		WARN_ON(PTR_ERR(osd) != -EAGAIN || wrlocked);
			
 
				+		goto promote;
			
 
				+	}
			
 
				 
			
 
				+	if ((req->r_flags & CEPH_OSD_FLAG_WRITE) &&
			
 
				+	    ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSEWR)) {
			
 
				+		dout("req %p pausewr\n", req);
			
 
				+		req->r_t.paused = true;
			
 
				+		maybe_request_map(osdc);
			
 
				+	} else if ((req->r_flags & CEPH_OSD_FLAG_READ) &&
			
 
				+		   ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSERD)) {
			
 
				+		dout("req %p pauserd\n", req);
			
 
				+		req->r_t.paused = true;
			
 
				+		maybe_request_map(osdc);
			
 
				+	} else if ((req->r_flags & CEPH_OSD_FLAG_WRITE) &&
			
 
				+		   !(req->r_flags & (CEPH_OSD_FLAG_FULL_TRY |
			
 
				+				     CEPH_OSD_FLAG_FULL_FORCE)) &&
			
 
				+		   (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL) ||
			
 
				+		    pool_full(osdc, req->r_t.base_oloc.pool))) {
			
 
				+		dout("req %p full/pool_full\n", req);
			
 
				+		pr_warn_ratelimited("FULL or reached pool quota\n");
			
 
				+		req->r_t.paused = true;
			
 
				+		maybe_request_map(osdc);
			
 
				+	} else if (!osd_homeless(osd)) {
			
 
				+		need_send = true;
			
 
				+	} else {
			
 
				+		maybe_request_map(osdc);
			
 
				+	}
			
 
				+
			
 
				+	mutex_lock(&osd->lock);
			
 
				+	/*
			
 
				+	 * Assign the tid atomically with send_request() to protect
			
 
				+	 * multiple writes to the same object from racing with each
			
 
				+	 * other, resulting in out of order ops on the OSDs.
			
 
				+	 */
			
 
				+	req->r_tid = atomic64_inc_return(&osdc->last_tid);
			
 
				+	link_request(osd, req);
			
 
				+	if (need_send)
			
 
				+		send_request(req);
			
 
				+	mutex_unlock(&osd->lock);
			
 
				+
			
 
				+	if (ct_res == CALC_TARGET_POOL_DNE)
			
 
				+		send_map_check(req);
			
 
				+
			
 
				+	if (promoted)
			
 
				+		downgrade_write(&osdc->lock);
			
 
				+	return;
			
 
				+
			
 
				+promote:
			
 
				+	up_read(&osdc->lock);
			
 
				+	down_write(&osdc->lock);
			
 
				+	wrlocked = true;
			
 
				+	promoted = true;
			
 
				+	goto again;
			
 
				+}
			
 
				+
			
 
				+static void account_request(struct ceph_osd_request *req)
			
 
				+{
			
 
				+	unsigned int mask = CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK;
			
 
				+
			
 
				+	if (req->r_flags & CEPH_OSD_FLAG_READ) {
			
 
				+		WARN_ON(req->r_flags & mask);
			
 
				+		req->r_flags |= CEPH_OSD_FLAG_ACK;
			
 
				+	} else if (req->r_flags & CEPH_OSD_FLAG_WRITE)
			
 
				+		WARN_ON(!(req->r_flags & mask));
			
 
				+	else
			
 
				+		WARN_ON(1);
			
 
				+
			
 
				+	WARN_ON(req->r_unsafe_callback && (req->r_flags & mask) != mask);
			
 
				+	atomic_inc(&req->r_osdc->num_requests);
			
 
				+}
			
 
				+
			
 
				+static void submit_request(struct ceph_osd_request *req, bool wrlocked)
			
 
				+{
			
 
				 	ceph_osdc_get_request(req);
			
 
				-	list_add_tail(&req->r_linger_item, &osdc->req_linger);
			
 
				-	if (req->r_osd)
			
 
				-		list_add_tail(&req->r_linger_osd_item,
			
 
				-			      &req->r_osd->o_linger_requests);
			
 
				+	account_request(req);
			
 
				+	__submit_request(req, wrlocked);
			
 
				 }
			
 
				 
			
 
				-static void __unregister_linger_request(struct ceph_osd_client *osdc,
			
 
				-					struct ceph_osd_request *req)
			
 
				+static void __finish_request(struct ceph_osd_request *req)
			
 
				 {
			
 
				-	WARN_ON(!req->r_linger);
			
 
				+	struct ceph_osd_client *osdc = req->r_osdc;
			
 
				+	struct ceph_osd *osd = req->r_osd;
			
 
				 
			
 
				-	if (list_empty(&req->r_linger_item)) {
			
 
				-		dout("%s %p tid %llu not registered\n", __func__, req,
			
 
				-		     req->r_tid);
			
 
				+	verify_osd_locked(osd);
			
 
				+	dout("%s req %p tid %llu\n", __func__, req, req->r_tid);
			
 
				+
			
 
				+	WARN_ON(lookup_request_mc(&osdc->map_checks, req->r_tid));
			
 
				+	unlink_request(osd, req);
			
 
				+	atomic_dec(&osdc->num_requests);
			
 
				+
			
 
				+	/*
			
 
				+	 * If an OSD has failed or returned and a request has been sent
			
 
				+	 * twice, it's possible to get a reply and end up here while the
			
 
				+	 * request message is queued for delivery.  We will ignore the
			
 
				+	 * reply, so not a big deal, but better to try and catch it.
			
 
				+	 */
			
 
				+	ceph_msg_revoke(req->r_request);
			
 
				+	ceph_msg_revoke_incoming(req->r_reply);
			
 
				+}
			
 
				+
			
 
				+static void finish_request(struct ceph_osd_request *req)
			
 
				+{
			
 
				+	__finish_request(req);
			
 
				+	ceph_osdc_put_request(req);
			
 
				+}
			
 
				+
			
 
				+static void __complete_request(struct ceph_osd_request *req)
			
 
				+{
			
 
				+	if (req->r_callback)
			
 
				+		req->r_callback(req);
			
 
				+	else
			
 
				+		complete_all(&req->r_completion);
			
 
				+}
			
 
				+
			
 
				+/*
			
 
				+ * Note that this is open-coded in handle_reply(), which has to deal
			
 
				+ * with ack vs commit, dup acks, etc.
			
 
				+ */
			
 
				+static void complete_request(struct ceph_osd_request *req, int err)
			
 
				+{
			
 
				+	dout("%s req %p tid %llu err %d\n", __func__, req, req->r_tid, err);
			
 
				+
			
 
				+	req->r_result = err;
			
 
				+	__finish_request(req);
			
 
				+	__complete_request(req);
			
 
				+	complete_all(&req->r_safe_completion);
			
 
				+	ceph_osdc_put_request(req);
			
 
				+}
			
 
				+
			
 
				+static void cancel_map_check(struct ceph_osd_request *req)
			
 
				+{
			
 
				+	struct ceph_osd_client *osdc = req->r_osdc;
			
 
				+	struct ceph_osd_request *lookup_req;
			
 
				+
			
 
				+	verify_osdc_wrlocked(osdc);
			
 
				+
			
 
				+	lookup_req = lookup_request_mc(&osdc->map_checks, req->r_tid);
			
 
				+	if (!lookup_req)
			
 
				 		return;
			
 
				+
			
 
				+	WARN_ON(lookup_req != req);
			
 
				+	erase_request_mc(&osdc->map_checks, req);
			
 
				+	ceph_osdc_put_request(req);
			
 
				+}
			
 
				+
			
 
				+static void cancel_request(struct ceph_osd_request *req)
			
 
				+{
			
 
				+	dout("%s req %p tid %llu\n", __func__, req, req->r_tid);
			
 
				+
			
 
				+	cancel_map_check(req);
			
 
				+	finish_request(req);
			
 
				+}
			
 
				+
			
 
				+static void check_pool_dne(struct ceph_osd_request *req)
			
 
				+{
			
 
				+	struct ceph_osd_client *osdc = req->r_osdc;
			
 
				+	struct ceph_osdmap *map = osdc->osdmap;
			
 
				+
			
 
				+	verify_osdc_wrlocked(osdc);
			
 
				+	WARN_ON(!map->epoch);
			
 
				+
			
 
				+	if (req->r_attempts) {
			
 
				+		/*
			
 
				+		 * We sent a request earlier, which means that
			
 
				+		 * previously the pool existed, and now it does not
			
 
				+		 * (i.e., it was deleted).
			
 
				+		 */
			
 
				+		req->r_map_dne_bound = map->epoch;
			
 
				+		dout("%s req %p tid %llu pool disappeared\n", __func__, req,
			
 
				+		     req->r_tid);
			
 
				+	} else {
			
 
				+		dout("%s req %p tid %llu map_dne_bound %u have %u\n", __func__,
			
 
				+		     req, req->r_tid, req->r_map_dne_bound, map->epoch);
			
 
				 	}
			
 
				 
			
 
				-	dout("%s %p tid %llu\n", __func__, req, req->r_tid);
			
 
				-	list_del_init(&req->r_linger_item);
			
 
				+	if (req->r_map_dne_bound) {
			
 
				+		if (map->epoch >= req->r_map_dne_bound) {
			
 
				+			/* we had a new enough map */
			
 
				+			pr_info_ratelimited("tid %llu pool does not exist\n",
			
 
				+					    req->r_tid);
			
 
				+			complete_request(req, -ENOENT);
			
 
				+		}
			
 
				+	} else {
			
 
				+		send_map_check(req);
			
 
				+	}
			
 
				+}
			
 
				+
			
 
				+static void map_check_cb(struct ceph_mon_generic_request *greq)
			
 
				+{
			
 
				+	struct ceph_osd_client *osdc = &greq->monc->client->osdc;
			
 
				+	struct ceph_osd_request *req;
			
 
				+	u64 tid = greq->private_data;
			
 
				+
			
 
				+	WARN_ON(greq->result || !greq->u.newest);
			
 
				 
			
 
				-	if (req->r_osd) {
			
 
				-		list_del_init(&req->r_linger_osd_item);
			
 
				-		maybe_move_osd_to_lru(osdc, req->r_osd);
			
 
				-		if (list_empty(&req->r_osd_item))
			
 
				-			req->r_osd = NULL;
			
 
				+	down_write(&osdc->lock);
			
 
				+	req = lookup_request_mc(&osdc->map_checks, tid);
			
 
				+	if (!req) {
			
 
				+		dout("%s tid %llu dne\n", __func__, tid);
			
 
				+		goto out_unlock;
			
 
				 	}
			
 
				+
			
 
				+	dout("%s req %p tid %llu map_dne_bound %u newest %llu\n", __func__,
			
 
				+	     req, req->r_tid, req->r_map_dne_bound, greq->u.newest);
			
 
				+	if (!req->r_map_dne_bound)
			
 
				+		req->r_map_dne_bound = greq->u.newest;
			
 
				+	erase_request_mc(&osdc->map_checks, req);
			
 
				+	check_pool_dne(req);
			
 
				+
			
 
				 	ceph_osdc_put_request(req);
			
 
				+out_unlock:
			
 
				+	up_write(&osdc->lock);
			
 
				 }
			
 
				 
			
 
				-void ceph_osdc_set_request_linger(struct ceph_osd_client *osdc,
			
 
				-				  struct ceph_osd_request *req)
			
 
				+static void send_map_check(struct ceph_osd_request *req)
			
 
				 {
			
 
				-	if (!req->r_linger) {
			
 
				-		dout("set_request_linger %p\n", req);
			
 
				-		req->r_linger = 1;
			
 
				+	struct ceph_osd_client *osdc = req->r_osdc;
			
 
				+	struct ceph_osd_request *lookup_req;
			
 
				+	int ret;
			
 
				+
			
 
				+	verify_osdc_wrlocked(osdc);
			
 
				+
			
 
				+	lookup_req = lookup_request_mc(&osdc->map_checks, req->r_tid);
			
 
				+	if (lookup_req) {
			
 
				+		WARN_ON(lookup_req != req);
			
 
				+		return;
			
 
				 	}
			
 
				+
			
 
				+	ceph_osdc_get_request(req);
			
 
				+	insert_request_mc(&osdc->map_checks, req);
			
 
				+	ret = ceph_monc_get_version_async(&osdc->client->monc, "osdmap",
			
 
				+					  map_check_cb, req->r_tid);
			
 
				+	WARN_ON(ret);
			
 
				 }
			
 
				-EXPORT_SYMBOL(ceph_osdc_set_request_linger);
			
 
				 
			
 
				 /*
			
 
				- * Returns whether a request should be blocked from being sent
			
 
				- * based on the current osdmap and osd_client settings.
			
 
				- *
			
 
				- * Caller should hold map_sem for read.
			
 
				+ * lingering requests, watch/notify v2 infrastructure
			
 
				  */
			
 
				-static bool __req_should_be_paused(struct ceph_osd_client *osdc,
			
 
				-				   struct ceph_osd_request *req)
			
 
				+static void linger_release(struct kref *kref)
			
 
				 {
			
 
				-	bool pauserd = ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSERD);
			
 
				-	bool pausewr = ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSEWR) ||
			
 
				-		ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL);
			
 
				-	return (req->r_flags & CEPH_OSD_FLAG_READ && pauserd) ||
			
 
				-		(req->r_flags & CEPH_OSD_FLAG_WRITE && pausewr);
			
 
				+	struct ceph_osd_linger_request *lreq =
			
 
				+	    container_of(kref, struct ceph_osd_linger_request, kref);
			
 
				+
			
 
				+	dout("%s lreq %p reg_req %p ping_req %p\n", __func__, lreq,
			
 
				+	     lreq->reg_req, lreq->ping_req);
			
 
				+	WARN_ON(!RB_EMPTY_NODE(&lreq->node));
			
 
				+	WARN_ON(!RB_EMPTY_NODE(&lreq->osdc_node));
			
 
				+	WARN_ON(!RB_EMPTY_NODE(&lreq->mc_node));
			
 
				+	WARN_ON(!list_empty(&lreq->scan_item));
			
 
				+	WARN_ON(!list_empty(&lreq->pending_lworks));
			
 
				+	WARN_ON(lreq->osd);
			
 
				+
			
 
				+	if (lreq->reg_req)
			
 
				+		ceph_osdc_put_request(lreq->reg_req);
			
 
				+	if (lreq->ping_req)
			
 
				+		ceph_osdc_put_request(lreq->ping_req);
			
 
				+	target_destroy(&lreq->t);
			
 
				+	kfree(lreq);
			
 
				 }
			
 
				 
			
 
				+static void linger_put(struct ceph_osd_linger_request *lreq)
			
 
				+{
			
 
				+	if (lreq)
			
 
				+		kref_put(&lreq->kref, linger_release);
			
 
				+}
			
 
				+
			
 
				+static struct ceph_osd_linger_request *
			
 
				+linger_get(struct ceph_osd_linger_request *lreq)
			
 
				+{
			
 
				+	kref_get(&lreq->kref);
			
 
				+	return lreq;
			
 
				+}
			
 
				+
			
 
				+static struct ceph_osd_linger_request *
			
 
				+linger_alloc(struct ceph_osd_client *osdc)
			
 
				+{
			
 
				+	struct ceph_osd_linger_request *lreq;
			
 
				+
			
 
				+	lreq = kzalloc(sizeof(*lreq), GFP_NOIO);
			
 
				+	if (!lreq)
			
 
				+		return NULL;
			
 
				+
			
 
				+	kref_init(&lreq->kref);
			
 
				+	mutex_init(&lreq->lock);
			
 
				+	RB_CLEAR_NODE(&lreq->node);
			
 
				+	RB_CLEAR_NODE(&lreq->osdc_node);
			
 
				+	RB_CLEAR_NODE(&lreq->mc_node);
			
 
				+	INIT_LIST_HEAD(&lreq->scan_item);
			
 
				+	INIT_LIST_HEAD(&lreq->pending_lworks);
			
 
				+	init_completion(&lreq->reg_commit_wait);
			
 
				+	init_completion(&lreq->notify_finish_wait);
			
 
				+
			
 
				+	lreq->osdc = osdc;
			
 
				+	target_init(&lreq->t);
			
 
				+
			
 
				+	dout("%s lreq %p\n", __func__, lreq);
			
 
				+	return lreq;
			
 
				+}
			
 
				+
			
 
				+DEFINE_RB_INSDEL_FUNCS(linger, struct ceph_osd_linger_request, linger_id, node)
			
 
				+DEFINE_RB_FUNCS(linger_osdc, struct ceph_osd_linger_request, linger_id, osdc_node)
			
 
				+DEFINE_RB_FUNCS(linger_mc, struct ceph_osd_linger_request, linger_id, mc_node)
			
 
				+
			
 
				 /*
			
 
				- * Calculate mapping of a request to a PG.  Takes tiering into account.
			
 
				+ * Create linger request <-> OSD session relation.
			
 
				+ *
			
 
				+ * @lreq has to be registered, @osd may be homeless.
			
 
				  */
			
 
				-static int __calc_request_pg(struct ceph_osdmap *osdmap,
			
 
				-			     struct ceph_osd_request *req,
			
 
				-			     struct ceph_pg *pg_out)
			
 
				+static void link_linger(struct ceph_osd *osd,
			
 
				+			struct ceph_osd_linger_request *lreq)
			
 
				 {
			
 
				-	bool need_check_tiering;
			
 
				+	verify_osd_locked(osd);
			
 
				+	WARN_ON(!lreq->linger_id || lreq->osd);
			
 
				+	dout("%s osd %p osd%d lreq %p linger_id %llu\n", __func__, osd,
			
 
				+	     osd->o_osd, lreq, lreq->linger_id);
			
 
				 
			
 
				-	need_check_tiering = false;
			
 
				-	if (req->r_target_oloc.pool == -1) {
			
 
				-		req->r_target_oloc = req->r_base_oloc; /* struct */
			
 
				-		need_check_tiering = true;
			
 
				+	if (!osd_homeless(osd))
			
 
				+		__remove_osd_from_lru(osd);
			
 
				+	else
			
 
				+		atomic_inc(&osd->o_osdc->num_homeless);
			
 
				+
			
 
				+	get_osd(osd);
			
 
				+	insert_linger(&osd->o_linger_requests, lreq);
			
 
				+	lreq->osd = osd;
			
 
				+}
			
 
				+
			
 
				+static void unlink_linger(struct ceph_osd *osd,
			
 
				+			  struct ceph_osd_linger_request *lreq)
			
 
				+{
			
 
				+	verify_osd_locked(osd);
			
 
				+	WARN_ON(lreq->osd != osd);
			
 
				+	dout("%s osd %p osd%d lreq %p linger_id %llu\n", __func__, osd,
			
 
				+	     osd->o_osd, lreq, lreq->linger_id);
			
 
				+
			
 
				+	lreq->osd = NULL;
			
 
				+	erase_linger(&osd->o_linger_requests, lreq);
			
 
				+	put_osd(osd);
			
 
				+
			
 
				+	if (!osd_homeless(osd))
			
 
				+		maybe_move_osd_to_lru(osd);
			
 
				+	else
			
 
				+		atomic_dec(&osd->o_osdc->num_homeless);
			
 
				+}
			
 
				+
			
 
				+static bool __linger_registered(struct ceph_osd_linger_request *lreq)
			
 
				+{
			
 
				+	verify_osdc_locked(lreq->osdc);
			
 
				+
			
 
				+	return !RB_EMPTY_NODE(&lreq->osdc_node);
			
 
				+}
			
 
				+
			
 
				+static bool linger_registered(struct ceph_osd_linger_request *lreq)
			
 
				+{
			
 
				+	struct ceph_osd_client *osdc = lreq->osdc;
			
 
				+	bool registered;
			
 
				+
			
 
				+	down_read(&osdc->lock);
			
 
				+	registered = __linger_registered(lreq);
			
 
				+	up_read(&osdc->lock);
			
 
				+
			
 
				+	return registered;
			
 
				+}
			
 
				+
			
 
				+static void linger_register(struct ceph_osd_linger_request *lreq)
			
 
				+{
			
 
				+	struct ceph_osd_client *osdc = lreq->osdc;
			
 
				+
			
 
				+	verify_osdc_wrlocked(osdc);
			
 
				+	WARN_ON(lreq->linger_id);
			
 
				+
			
 
				+	linger_get(lreq);
			
 
				+	lreq->linger_id = ++osdc->last_linger_id;
			
 
				+	insert_linger_osdc(&osdc->linger_requests, lreq);
			
 
				+}
			
 
				+
			
 
				+static void linger_unregister(struct ceph_osd_linger_request *lreq)
			
 
				+{
			
 
				+	struct ceph_osd_client *osdc = lreq->osdc;
			
 
				+
			
 
				+	verify_osdc_wrlocked(osdc);
			
 
				+
			
 
				+	erase_linger_osdc(&osdc->linger_requests, lreq);
			
 
				+	linger_put(lreq);
			
 
				+}
			
 
				+
			
 
				+static void cancel_linger_request(struct ceph_osd_request *req)
			
 
				+{
			
 
				+	struct ceph_osd_linger_request *lreq = req->r_priv;
			
 
				+
			
 
				+	WARN_ON(!req->r_linger);
			
 
				+	cancel_request(req);
			
 
				+	linger_put(lreq);
			
 
				+}
			
 
				+
			
 
				+struct linger_work {
			
 
				+	struct work_struct work;
			
 
				+	struct ceph_osd_linger_request *lreq;
			
 
				+	struct list_head pending_item;
			
 
				+	unsigned long queued_stamp;
			
 
				+
			
 
				+	union {
			
 
				+		struct {
			
 
				+			u64 notify_id;
			
 
				+			u64 notifier_id;
			
 
				+			void *payload; /* points into @msg front */
			
 
				+			size_t payload_len;
			
 
				+
			
 
				+			struct ceph_msg *msg; /* for ceph_msg_put() */
			
 
				+		} notify;
			
 
				+		struct {
			
 
				+			int err;
			
 
				+		} error;
			
 
				+	};
			
 
				+};
			
 
				+
			
 
				+static struct linger_work *lwork_alloc(struct ceph_osd_linger_request *lreq,
			
 
				+				       work_func_t workfn)
			
 
				+{
			
 
				+	struct linger_work *lwork;
			
 
				+
			
 
				+	lwork = kzalloc(sizeof(*lwork), GFP_NOIO);
			
 
				+	if (!lwork)
			
 
				+		return NULL;
			
 
				+
			
 
				+	INIT_WORK(&lwork->work, workfn);
			
 
				+	INIT_LIST_HEAD(&lwork->pending_item);
			
 
				+	lwork->lreq = linger_get(lreq);
			
 
				+
			
 
				+	return lwork;
			
 
				+}
			
 
				+
			
 
				+static void lwork_free(struct linger_work *lwork)
			
 
				+{
			
 
				+	struct ceph_osd_linger_request *lreq = lwork->lreq;
			
 
				+
			
 
				+	mutex_lock(&lreq->lock);
			
 
				+	list_del(&lwork->pending_item);
			
 
				+	mutex_unlock(&lreq->lock);
			
 
				+
			
 
				+	linger_put(lreq);
			
 
				+	kfree(lwork);
			
 
				+}
			
 
				+
			
 
				+static void lwork_queue(struct linger_work *lwork)
			
 
				+{
			
 
				+	struct ceph_osd_linger_request *lreq = lwork->lreq;
			
 
				+	struct ceph_osd_client *osdc = lreq->osdc;
			
 
				+
			
 
				+	verify_lreq_locked(lreq);
			
 
				+	WARN_ON(!list_empty(&lwork->pending_item));
			
 
				+
			
 
				+	lwork->queued_stamp = jiffies;
			
 
				+	list_add_tail(&lwork->pending_item, &lreq->pending_lworks);
			
 
				+	queue_work(osdc->notify_wq, &lwork->work);
			
 
				+}
			
 
				+
			
 
				+static void do_watch_notify(struct work_struct *w)
			
 
				+{
			
 
				+	struct linger_work *lwork = container_of(w, struct linger_work, work);
			
 
				+	struct ceph_osd_linger_request *lreq = lwork->lreq;
			
 
				+
			
 
				+	if (!linger_registered(lreq)) {
			
 
				+		dout("%s lreq %p not registered\n", __func__, lreq);
			
 
				+		goto out;
			
 
				 	}
			
 
				-	if (req->r_target_oid.name_len == 0) {
			
 
				-		ceph_oid_copy(&req->r_target_oid, &req->r_base_oid);
			
 
				-		need_check_tiering = true;
			
 
				+
			
 
				+	WARN_ON(!lreq->is_watch);
			
 
				+	dout("%s lreq %p notify_id %llu notifier_id %llu payload_len %zu\n",
			
 
				+	     __func__, lreq, lwork->notify.notify_id, lwork->notify.notifier_id,
			
 
				+	     lwork->notify.payload_len);
			
 
				+	lreq->wcb(lreq->data, lwork->notify.notify_id, lreq->linger_id,
			
 
				+		  lwork->notify.notifier_id, lwork->notify.payload,
			
 
				+		  lwork->notify.payload_len);
			
 
				+
			
 
				+out:
			
 
				+	ceph_msg_put(lwork->notify.msg);
			
 
				+	lwork_free(lwork);
			
 
				+}
			
 
				+
			
 
				+static void do_watch_error(struct work_struct *w)
			
 
				+{
			
 
				+	struct linger_work *lwork = container_of(w, struct linger_work, work);
			
 
				+	struct ceph_osd_linger_request *lreq = lwork->lreq;
			
 
				+
			
 
				+	if (!linger_registered(lreq)) {
			
 
				+		dout("%s lreq %p not registered\n", __func__, lreq);
			
 
				+		goto out;
			
 
				 	}
			
 
				 
			
 
				-	if (need_check_tiering &&
			
 
				-	    (req->r_flags & CEPH_OSD_FLAG_IGNORE_OVERLAY) == 0) {
			
 
				-		struct ceph_pg_pool_info *pi;
			
 
				-
			
 
				-		pi = ceph_pg_pool_by_id(osdmap, req->r_target_oloc.pool);
			
 
				-		if (pi) {
			
 
				-			if ((req->r_flags & CEPH_OSD_FLAG_READ) &&
			
 
				-			    pi->read_tier >= 0)
			
 
				-				req->r_target_oloc.pool = pi->read_tier;
			
 
				-			if ((req->r_flags & CEPH_OSD_FLAG_WRITE) &&
			
 
				-			    pi->write_tier >= 0)
			
 
				-				req->r_target_oloc.pool = pi->write_tier;
			
 
				+	dout("%s lreq %p err %d\n", __func__, lreq, lwork->error.err);
			
 
				+	lreq->errcb(lreq->data, lreq->linger_id, lwork->error.err);
			
 
				+
			
 
				+out:
			
 
				+	lwork_free(lwork);
			
 
				+}
			
 
				+
			
 
				+static void queue_watch_error(struct ceph_osd_linger_request *lreq)
			
 
				+{
			
 
				+	struct linger_work *lwork;
			
 
				+
			
 
				+	lwork = lwork_alloc(lreq, do_watch_error);
			
 
				+	if (!lwork) {
			
 
				+		pr_err("failed to allocate error-lwork\n");
			
 
				+		return;
			
 
				+	}
			
 
				+
			
 
				+	lwork->error.err = lreq->last_error;
			
 
				+	lwork_queue(lwork);
			
 
				+}
			
 
				+
			
 
				+static void linger_reg_commit_complete(struct ceph_osd_linger_request *lreq,
			
 
				+				       int result)
			
 
				+{
			
 
				+	if (!completion_done(&lreq->reg_commit_wait)) {
			
 
				+		lreq->reg_commit_error = (result <= 0 ? result : 0);
			
 
				+		complete_all(&lreq->reg_commit_wait);
			
 
				+	}
			
 
				+}
			
 
				+
			
 
				+static void linger_commit_cb(struct ceph_osd_request *req)
			
 
				+{
			
 
				+	struct ceph_osd_linger_request *lreq = req->r_priv;
			
 
				+
			
 
				+	mutex_lock(&lreq->lock);
			
 
				+	dout("%s lreq %p linger_id %llu result %d\n", __func__, lreq,
			
 
				+	     lreq->linger_id, req->r_result);
			
 
				+	WARN_ON(!__linger_registered(lreq));
			
 
				+	linger_reg_commit_complete(lreq, req->r_result);
			
 
				+	lreq->committed = true;
			
 
				+
			
 
				+	if (!lreq->is_watch) {
			
 
				+		struct ceph_osd_data *osd_data =
			
 
				+		    osd_req_op_data(req, 0, notify, response_data);
			
 
				+		void *p = page_address(osd_data->pages[0]);
			
 
				+
			
 
				+		WARN_ON(req->r_ops[0].op != CEPH_OSD_OP_NOTIFY ||
			
 
				+			osd_data->type != CEPH_OSD_DATA_TYPE_PAGES);
			
 
				+
			
 
				+		/* make note of the notify_id */
			
 
				+		if (req->r_ops[0].outdata_len >= sizeof(u64)) {
			
 
				+			lreq->notify_id = ceph_decode_64(&p);
			
 
				+			dout("lreq %p notify_id %llu\n", lreq,
			
 
				+			     lreq->notify_id);
			
 
				+		} else {
			
 
				+			dout("lreq %p no notify_id\n", lreq);
			
 
				 		}
			
 
				-		/* !pi is caught in ceph_oloc_oid_to_pg() */
			
 
				 	}
			
 
				 
			
 
				-	return ceph_oloc_oid_to_pg(osdmap, &req->r_target_oloc,
			
 
				-				   &req->r_target_oid, pg_out);
			
 
				+	mutex_unlock(&lreq->lock);
			
 
				+	linger_put(lreq);
			
 
				 }
			
 
				 
			
 
				-static void __enqueue_request(struct ceph_osd_request *req)
			
 
				+static int normalize_watch_error(int err)
			
 
				 {
			
 
				-	struct ceph_osd_client *osdc = req->r_osdc;
			
 
				+	/*
			
 
				+	 * Translate ENOENT -> ENOTCONN so that a delete->disconnection
			
 
				+	 * notification and a failure to reconnect because we raced with
			
 
				+	 * the delete appear the same to the user.
			
 
				+	 */
			
 
				+	if (err == -ENOENT)
			
 
				+		err = -ENOTCONN;
			
 
				 
			
 
				-	dout("%s %p tid %llu to osd%d\n", __func__, req, req->r_tid,
			
 
				-	     req->r_osd ? req->r_osd->o_osd : -1);
			
 
				+	return err;
			
 
				+}
			
 
				 
			
 
				-	if (req->r_osd) {
			
 
				-		__remove_osd_from_lru(req->r_osd);
			
 
				-		list_add_tail(&req->r_osd_item, &req->r_osd->o_requests);
			
 
				-		list_move_tail(&req->r_req_lru_item, &osdc->req_unsent);
			
 
				+static void linger_reconnect_cb(struct ceph_osd_request *req)
			
 
				+{
			
 
				+	struct ceph_osd_linger_request *lreq = req->r_priv;
			
 
				+
			
 
				+	mutex_lock(&lreq->lock);
			
 
				+	dout("%s lreq %p linger_id %llu result %d last_error %d\n", __func__,
			
 
				+	     lreq, lreq->linger_id, req->r_result, lreq->last_error);
			
 
				+	if (req->r_result < 0) {
			
 
				+		if (!lreq->last_error) {
			
 
				+			lreq->last_error = normalize_watch_error(req->r_result);
			
 
				+			queue_watch_error(lreq);
			
 
				+		}
			
 
				+	}
			
 
				+
			
 
				+	mutex_unlock(&lreq->lock);
			
 
				+	linger_put(lreq);
			
 
				+}
			
 
				+
			
 
				+static void send_linger(struct ceph_osd_linger_request *lreq)
			
 
				+{
			
 
				+	struct ceph_osd_request *req = lreq->reg_req;
			
 
				+	struct ceph_osd_req_op *op = &req->r_ops[0];
			
 
				+
			
 
				+	verify_osdc_wrlocked(req->r_osdc);
			
 
				+	dout("%s lreq %p linger_id %llu\n", __func__, lreq, lreq->linger_id);
			
 
				+
			
 
				+	if (req->r_osd)
			
 
				+		cancel_linger_request(req);
			
 
				+
			
 
				+	request_reinit(req);
			
 
				+	ceph_oid_copy(&req->r_base_oid, &lreq->t.base_oid);
			
 
				+	ceph_oloc_copy(&req->r_base_oloc, &lreq->t.base_oloc);
			
 
				+	req->r_flags = lreq->t.flags;
			
 
				+	req->r_mtime = lreq->mtime;
			
 
				+
			
 
				+	mutex_lock(&lreq->lock);
			
 
				+	if (lreq->is_watch && lreq->committed) {
			
 
				+		WARN_ON(op->op != CEPH_OSD_OP_WATCH ||
			
 
				+			op->watch.cookie != lreq->linger_id);
			
 
				+		op->watch.op = CEPH_OSD_WATCH_OP_RECONNECT;
			
 
				+		op->watch.gen = ++lreq->register_gen;
			
 
				+		dout("lreq %p reconnect register_gen %u\n", lreq,
			
 
				+		     op->watch.gen);
			
 
				+		req->r_callback = linger_reconnect_cb;
			
 
				 	} else {
			
 
				-		list_move_tail(&req->r_req_lru_item, &osdc->req_notarget);
			
 
				+		if (!lreq->is_watch)
			
 
				+			lreq->notify_id = 0;
			
 
				+		else
			
 
				+			WARN_ON(op->watch.op != CEPH_OSD_WATCH_OP_WATCH);
			
 
				+		dout("lreq %p register\n", lreq);
			
 
				+		req->r_callback = linger_commit_cb;
			
 
				 	}
			
 
				+	mutex_unlock(&lreq->lock);
			
 
				+
			
 
				+	req->r_priv = linger_get(lreq);
			
 
				+	req->r_linger = true;
			
 
				+
			
 
				+	submit_request(req, true);
			
 
				 }
			
 
				 
			
 
				-/*
			
 
				- * Pick an osd (the first 'up' osd in the pg), allocate the osd struct
			
 
				- * (as needed), and set the request r_osd appropriately.  If there is
			
 
				- * no up osd, set r_osd to NULL.  Move the request to the appropriate list
			
 
				- * (unsent, homeless) or leave on in-flight lru.
			
 
				- *
			
 
				- * Return 0 if unchanged, 1 if changed, or negative on error.
			
 
				- *
			
 
				- * Caller should hold map_sem for read and request_mutex.
			
 
				- */
			
 
				-static int __map_request(struct ceph_osd_client *osdc,
			
 
				-			 struct ceph_osd_request *req, int force_resend)
			
 
				+static void linger_ping_cb(struct ceph_osd_request *req)
			
 
				 {
			
 
				-	struct ceph_pg pgid;
			
 
				-	int acting[CEPH_PG_MAX_SIZE];
			
 
				-	int num, o;
			
 
				-	int err;
			
 
				-	bool was_paused;
			
 
				-
			
 
				-	dout("map_request %p tid %lld\n", req, req->r_tid);
			
 
				-
			
 
				-	err = __calc_request_pg(osdc->osdmap, req, &pgid);
			
 
				-	if (err) {
			
 
				-		list_move(&req->r_req_lru_item, &osdc->req_notarget);
			
 
				-		return err;
			
 
				-	}
			
 
				-	req->r_pgid = pgid;
			
 
				-
			
 
				-	num = ceph_calc_pg_acting(osdc->osdmap, pgid, acting, &o);
			
 
				-	if (num < 0)
			
 
				-		num = 0;
			
 
				-
			
 
				-	was_paused = req->r_paused;
			
 
				-	req->r_paused = __req_should_be_paused(osdc, req);
			
 
				-	if (was_paused && !req->r_paused)
			
 
				-		force_resend = 1;
			
 
				-
			
 
				-	if ((!force_resend &&
			
 
				-	     req->r_osd && req->r_osd->o_osd == o &&
			
 
				-	     req->r_sent >= req->r_osd->o_incarnation &&
			
 
				-	     req->r_num_pg_osds == num &&
			
 
				-	     memcmp(req->r_pg_osds, acting, sizeof(acting[0])*num) == 0) ||
			
 
				-	    (req->r_osd == NULL && o == -1) ||
			
 
				-	    req->r_paused)
			
 
				-		return 0;  /* no change */
			
 
				-
			
 
				-	dout("map_request tid %llu pgid %lld.%x osd%d (was osd%d)\n",
			
 
				-	     req->r_tid, pgid.pool, pgid.seed, o,
			
 
				-	     req->r_osd ? req->r_osd->o_osd : -1);
			
 
				-
			
 
				-	/* record full pg acting set */
			
 
				-	memcpy(req->r_pg_osds, acting, sizeof(acting[0]) * num);
			
 
				-	req->r_num_pg_osds = num;
			
 
				-
			
 
				-	if (req->r_osd) {
			
 
				-		__cancel_request(req);
			
 
				-		list_del_init(&req->r_osd_item);
			
 
				-		list_del_init(&req->r_linger_osd_item);
			
 
				-		req->r_osd = NULL;
			
 
				-	}
			
 
				-
			
 
				-	req->r_osd = __lookup_osd(osdc, o);
			
 
				-	if (!req->r_osd && o >= 0) {
			
 
				-		err = -ENOMEM;
			
 
				-		req->r_osd = create_osd(osdc, o);
			
 
				-		if (!req->r_osd) {
			
 
				-			list_move(&req->r_req_lru_item, &osdc->req_notarget);
			
 
				-			goto out;
			
 
				+	struct ceph_osd_linger_request *lreq = req->r_priv;
			
 
				+
			
 
				+	mutex_lock(&lreq->lock);
			
 
				+	dout("%s lreq %p linger_id %llu result %d ping_sent %lu last_error %d\n",
			
 
				+	     __func__, lreq, lreq->linger_id, req->r_result, lreq->ping_sent,
			
 
				+	     lreq->last_error);
			
 
				+	if (lreq->register_gen == req->r_ops[0].watch.gen) {
			
 
				+		if (!req->r_result) {
			
 
				+			lreq->watch_valid_thru = lreq->ping_sent;
			
 
				+		} else if (!lreq->last_error) {
			
 
				+			lreq->last_error = normalize_watch_error(req->r_result);
			
 
				+			queue_watch_error(lreq);
			
 
				 		}
			
 
				+	} else {
			
 
				+		dout("lreq %p register_gen %u ignoring old pong %u\n", lreq,
			
 
				+		     lreq->register_gen, req->r_ops[0].watch.gen);
			
 
				+	}
			
 
				 
			
 
				-		dout("map_request osd %p is osd%d\n", req->r_osd, o);
			
 
				-		__insert_osd(osdc, req->r_osd);
			
 
				+	mutex_unlock(&lreq->lock);
			
 
				+	linger_put(lreq);
			
 
				+}
			
 
				 
			
 
				-		ceph_con_open(&req->r_osd->o_con,
			
 
				-			      CEPH_ENTITY_TYPE_OSD, o,
			
 
				-			      &osdc->osdmap->osd_addr[o]);
			
 
				+static void send_linger_ping(struct ceph_osd_linger_request *lreq)
			
 
				+{
			
 
				+	struct ceph_osd_client *osdc = lreq->osdc;
			
 
				+	struct ceph_osd_request *req = lreq->ping_req;
			
 
				+	struct ceph_osd_req_op *op = &req->r_ops[0];
			
 
				+
			
 
				+	if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSERD)) {
			
 
				+		dout("%s PAUSERD\n", __func__);
			
 
				+		return;
			
 
				 	}
			
 
				 
			
 
				-	__enqueue_request(req);
			
 
				-	err = 1;   /* osd or pg changed */
			
 
				+	lreq->ping_sent = jiffies;
			
 
				+	dout("%s lreq %p linger_id %llu ping_sent %lu register_gen %u\n",
			
 
				+	     __func__, lreq, lreq->linger_id, lreq->ping_sent,
			
 
				+	     lreq->register_gen);
			
 
				 
			
 
				-out:
			
 
				-	return err;
			
 
				+	if (req->r_osd)
			
 
				+		cancel_linger_request(req);
			
 
				+
			
 
				+	request_reinit(req);
			
 
				+	target_copy(&req->r_t, &lreq->t);
			
 
				+
			
 
				+	WARN_ON(op->op != CEPH_OSD_OP_WATCH ||
			
 
				+		op->watch.cookie != lreq->linger_id ||
			
 
				+		op->watch.op != CEPH_OSD_WATCH_OP_PING);
			
 
				+	op->watch.gen = lreq->register_gen;
			
 
				+	req->r_callback = linger_ping_cb;
			
 
				+	req->r_priv = linger_get(lreq);
			
 
				+	req->r_linger = true;
			
 
				+
			
 
				+	ceph_osdc_get_request(req);
			
 
				+	account_request(req);
			
 
				+	req->r_tid = atomic64_inc_return(&osdc->last_tid);
			
 
				+	link_request(lreq->osd, req);
			
 
				+	send_request(req);
			
 
				+}
			
 
				+
			
 
				+static void linger_submit(struct ceph_osd_linger_request *lreq)
			
 
				+{
			
 
				+	struct ceph_osd_client *osdc = lreq->osdc;
			
 
				+	struct ceph_osd *osd;
			
 
				+
			
 
				+	calc_target(osdc, &lreq->t, &lreq->last_force_resend, false);
			
 
				+	osd = lookup_create_osd(osdc, lreq->t.osd, true);
			
 
				+	link_linger(osd, lreq);
			
 
				+
			
 
				+	send_linger(lreq);
			
 
				+}
			
 
				+
			
 
				+static void cancel_linger_map_check(struct ceph_osd_linger_request *lreq)
			
 
				+{
			
 
				+	struct ceph_osd_client *osdc = lreq->osdc;
			
 
				+	struct ceph_osd_linger_request *lookup_lreq;
			
 
				+
			
 
				+	verify_osdc_wrlocked(osdc);
			
 
				+
			
 
				+	lookup_lreq = lookup_linger_mc(&osdc->linger_map_checks,
			
 
				+				       lreq->linger_id);
			
 
				+	if (!lookup_lreq)
			
 
				+		return;
			
 
				+
			
 
				+	WARN_ON(lookup_lreq != lreq);
			
 
				+	erase_linger_mc(&osdc->linger_map_checks, lreq);
			
 
				+	linger_put(lreq);
			
 
				 }
			
 
				 
			
 
				 /*
			
 
				- * caller should hold map_sem (for read) and request_mutex
			
 
				+ * @lreq has to be both registered and linked.
			
 
				  */
			
 
				-static void __send_request(struct ceph_osd_client *osdc,
			
 
				-			   struct ceph_osd_request *req)
			
 
				+static void __linger_cancel(struct ceph_osd_linger_request *lreq)
			
 
				 {
			
 
				-	void *p;
			
 
				+	if (lreq->is_watch && lreq->ping_req->r_osd)
			
 
				+		cancel_linger_request(lreq->ping_req);
			
 
				+	if (lreq->reg_req->r_osd)
			
 
				+		cancel_linger_request(lreq->reg_req);
			
 
				+	cancel_linger_map_check(lreq);
			
 
				+	unlink_linger(lreq->osd, lreq);
			
 
				+	linger_unregister(lreq);
			
 
				+}
			
 
				+
			
 
				+static void linger_cancel(struct ceph_osd_linger_request *lreq)
			
 
				+{
			
 
				+	struct ceph_osd_client *osdc = lreq->osdc;
			
 
				+
			
 
				+	down_write(&osdc->lock);
			
 
				+	if (__linger_registered(lreq))
			
 
				+		__linger_cancel(lreq);
			
 
				+	up_write(&osdc->lock);
			
 
				+}
			
 
				+
			
 
				+static void send_linger_map_check(struct ceph_osd_linger_request *lreq);
			
 
				+
			
 
				+static void check_linger_pool_dne(struct ceph_osd_linger_request *lreq)
			
 
				+{
			
 
				+	struct ceph_osd_client *osdc = lreq->osdc;
			
 
				+	struct ceph_osdmap *map = osdc->osdmap;
			
 
				+
			
 
				+	verify_osdc_wrlocked(osdc);
			
 
				+	WARN_ON(!map->epoch);
			
 
				+
			
 
				+	if (lreq->register_gen) {
			
 
				+		lreq->map_dne_bound = map->epoch;
			
 
				+		dout("%s lreq %p linger_id %llu pool disappeared\n", __func__,
			
 
				+		     lreq, lreq->linger_id);
			
 
				+	} else {
			
 
				+		dout("%s lreq %p linger_id %llu map_dne_bound %u have %u\n",
			
 
				+		     __func__, lreq, lreq->linger_id, lreq->map_dne_bound,
			
 
				+		     map->epoch);
			
 
				+	}
			
 
				+
			
 
				+	if (lreq->map_dne_bound) {
			
 
				+		if (map->epoch >= lreq->map_dne_bound) {
			
 
				+			/* we had a new enough map */
			
 
				+			pr_info("linger_id %llu pool does not exist\n",
			
 
				+				lreq->linger_id);
			
 
				+			linger_reg_commit_complete(lreq, -ENOENT);
			
 
				+			__linger_cancel(lreq);
			
 
				+		}
			
 
				+	} else {
			
 
				+		send_linger_map_check(lreq);
			
 
				+	}
			
 
				+}
			
 
				+
			
 
				+static void linger_map_check_cb(struct ceph_mon_generic_request *greq)
			
 
				+{
			
 
				+	struct ceph_osd_client *osdc = &greq->monc->client->osdc;
			
 
				+	struct ceph_osd_linger_request *lreq;
			
 
				+	u64 linger_id = greq->private_data;
			
 
				+
			
 
				+	WARN_ON(greq->result || !greq->u.newest);
			
 
				+
			
 
				+	down_write(&osdc->lock);
			
 
				+	lreq = lookup_linger_mc(&osdc->linger_map_checks, linger_id);
			
 
				+	if (!lreq) {
			
 
				+		dout("%s linger_id %llu dne\n", __func__, linger_id);
			
 
				+		goto out_unlock;
			
 
				+	}
			
 
				 
			
 
				-	dout("send_request %p tid %llu to osd%d flags %d pg %lld.%x\n",
			
 
				-	     req, req->r_tid, req->r_osd->o_osd, req->r_flags,
			
 
				-	     (unsigned long long)req->r_pgid.pool, req->r_pgid.seed);
			
 
				+	dout("%s lreq %p linger_id %llu map_dne_bound %u newest %llu\n",
			
 
				+	     __func__, lreq, lreq->linger_id, lreq->map_dne_bound,
			
 
				+	     greq->u.newest);
			
 
				+	if (!lreq->map_dne_bound)
			
 
				+		lreq->map_dne_bound = greq->u.newest;
			
 
				+	erase_linger_mc(&osdc->linger_map_checks, lreq);
			
 
				+	check_linger_pool_dne(lreq);
			
 
				 
			
 
				-	/* fill in message content that changes each time we send it */
			
 
				-	put_unaligned_le32(osdc->osdmap->epoch, req->r_request_osdmap_epoch);
			
 
				-	put_unaligned_le32(req->r_flags, req->r_request_flags);
			
 
				-	put_unaligned_le64(req->r_target_oloc.pool, req->r_request_pool);
			
 
				-	p = req->r_request_pgid;
			
 
				-	ceph_encode_64(&p, req->r_pgid.pool);
			
 
				-	ceph_encode_32(&p, req->r_pgid.seed);
			
 
				-	put_unaligned_le64(1, req->r_request_attempts);  /* FIXME */
			
 
				-	memcpy(req->r_request_reassert_version, &req->r_reassert_version,
			
 
				-	       sizeof(req->r_reassert_version));
			
 
				+	linger_put(lreq);
			
 
				+out_unlock:
			
 
				+	up_write(&osdc->lock);
			
 
				+}
			
 
				 
			
 
				-	req->r_stamp = jiffies;
			
 
				-	list_move_tail(&req->r_req_lru_item, &osdc->req_lru);
			
 
				+static void send_linger_map_check(struct ceph_osd_linger_request *lreq)
			
 
				+{
			
 
				+	struct ceph_osd_client *osdc = lreq->osdc;
			
 
				+	struct ceph_osd_linger_request *lookup_lreq;
			
 
				+	int ret;
			
 
				 
			
 
				-	ceph_msg_get(req->r_request); /* send consumes a ref */
			
 
				+	verify_osdc_wrlocked(osdc);
			
 
				 
			
 
				-	req->r_sent = req->r_osd->o_incarnation;
			
 
				+	lookup_lreq = lookup_linger_mc(&osdc->linger_map_checks,
			
 
				+				       lreq->linger_id);
			
 
				+	if (lookup_lreq) {
			
 
				+		WARN_ON(lookup_lreq != lreq);
			
 
				+		return;
			
 
				+	}
			
 
				 
			
 
				-	ceph_con_send(&req->r_osd->o_con, req->r_request);
			
 
				+	linger_get(lreq);
			
 
				+	insert_linger_mc(&osdc->linger_map_checks, lreq);
			
 
				+	ret = ceph_monc_get_version_async(&osdc->client->monc, "osdmap",
			
 
				+					  linger_map_check_cb, lreq->linger_id);
			
 
				+	WARN_ON(ret);
			
 
				 }
			
 
				 
			
 
				-/*
			
 
				- * Send any requests in the queue (req_unsent).
			
 
				- */
			
 
				-static void __send_queued(struct ceph_osd_client *osdc)
			
 
				+static int linger_reg_commit_wait(struct ceph_osd_linger_request *lreq)
			
 
				 {
			
 
				-	struct ceph_osd_request *req, *tmp;
			
 
				+	int ret;
			
 
				 
			
 
				-	dout("__send_queued\n");
			
 
				-	list_for_each_entry_safe(req, tmp, &osdc->req_unsent, r_req_lru_item)
			
 
				-		__send_request(osdc, req);
			
 
				+	dout("%s lreq %p linger_id %llu\n", __func__, lreq, lreq->linger_id);
			
 
				+	ret = wait_for_completion_interruptible(&lreq->reg_commit_wait);
			
 
				+	return ret ?: lreq->reg_commit_error;
			
 
				 }
			
 
				 
			
 
				-/*
			
 
				- * Caller should hold map_sem for read and request_mutex.
			
 
				- */
			
 
				-static int __ceph_osdc_start_request(struct ceph_osd_client *osdc,
			
 
				-				     struct ceph_osd_request *req,
			
 
				-				     bool nofail)
			
 
				-{
			
 
				-	int rc;
			
 
				-
			
 
				-	__register_request(osdc, req);
			
 
				-	req->r_sent = 0;
			
 
				-	req->r_got_reply = 0;
			
 
				-	rc = __map_request(osdc, req, 0);
			
 
				-	if (rc < 0) {
			
 
				-		if (nofail) {
			
 
				-			dout("osdc_start_request failed map, "
			
 
				-				" will retry %lld\n", req->r_tid);
			
 
				-			rc = 0;
			
 
				-		} else {
			
 
				-			__unregister_request(osdc, req);
			
 
				-		}
			
 
				-		return rc;
			
 
				-	}
			
 
				-
			
 
				-	if (req->r_osd == NULL) {
			
 
				-		dout("send_request %p no up osds in pg\n", req);
			
 
				-		ceph_monc_request_next_osdmap(&osdc->client->monc);
			
 
				-	} else {
			
 
				-		__send_queued(osdc);
			
 
				-	}
			
 
				+static int linger_notify_finish_wait(struct ceph_osd_linger_request *lreq)
			
 
				+{
			
 
				+	int ret;
			
 
				 
			
 
				-	return 0;
			
 
				+	dout("%s lreq %p linger_id %llu\n", __func__, lreq, lreq->linger_id);
			
 
				+	ret = wait_for_completion_interruptible(&lreq->notify_finish_wait);
			
 
				+	return ret ?: lreq->notify_finish_error;
			
 
				 }
			
 
				 
			
 
				 /*
			
 
				- * Timeout callback, called every N seconds when 1 or more osd
			
 
				- * requests has been active for more than N seconds.  When this
			
 
				- * happens, we ping all OSDs with requests who have timed out to
			
 
				- * ensure any communications channel reset is detected.  Reset the
			
 
				- * request timeouts another N seconds in the future as we go.
			
 
				- * Reschedule the timeout event another N seconds in future (unless
			
 
				- * there are no open requests).
			
 
				+ * Timeout callback, called every N seconds.  When 1 or more OSD
			
 
				+ * requests has been active for more than N seconds, we send a keepalive
			
 
				+ * (tag + timestamp) to its OSD to ensure any communications channel
			
 
				+ * reset is detected.
			
 
				  */
			
 
				 static void handle_timeout(struct work_struct *work)
			
 
				 {
			
 
				 	struct ceph_osd_client *osdc =
			
 
				 		container_of(work, struct ceph_osd_client, timeout_work.work);
			
 
				 	struct ceph_options *opts = osdc->client->options;
			
 
				-	struct ceph_osd_request *req;
			
 
				-	struct ceph_osd *osd;
			
 
				-	struct list_head slow_osds;
			
 
				-	dout("timeout\n");
			
 
				-	down_read(&osdc->map_sem);
			
 
				-
			
 
				-	ceph_monc_request_next_osdmap(&osdc->client->monc);
			
 
				+	unsigned long cutoff = jiffies - opts->osd_keepalive_timeout;
			
 
				+	LIST_HEAD(slow_osds);
			
 
				+	struct rb_node *n, *p;
			
 
				 
			
 
				-	mutex_lock(&osdc->request_mutex);
			
 
				+	dout("%s osdc %p\n", __func__, osdc);
			
 
				+	down_write(&osdc->lock);
			
 
				 
			
 
				 	/*
			
 
				 	 * ping osds that are a bit slow.  this ensures that if there
			
 
				 	 * is a break in the TCP connection we will notice, and reopen
			
 
				 	 * a connection with that osd (from the fault callback).
			
 
				 	 */
			
 
				-	INIT_LIST_HEAD(&slow_osds);
			
 
				-	list_for_each_entry(req, &osdc->req_lru, r_req_lru_item) {
			
 
				-		if (time_before(jiffies,
			
 
				-				req->r_stamp + opts->osd_keepalive_timeout))
			
 
				-			break;
			
 
				+	for (n = rb_first(&osdc->osds); n; n = rb_next(n)) {
			
 
				+		struct ceph_osd *osd = rb_entry(n, struct ceph_osd, o_node);
			
 
				+		bool found = false;
			
 
				+
			
 
				+		for (p = rb_first(&osd->o_requests); p; p = rb_next(p)) {
			
 
				+			struct ceph_osd_request *req =
			
 
				+			    rb_entry(p, struct ceph_osd_request, r_node);
			
 
				+
			
 
				+			if (time_before(req->r_stamp, cutoff)) {
			
 
				+				dout(" req %p tid %llu on osd%d is laggy\n",
			
 
				+				     req, req->r_tid, osd->o_osd);
			
 
				+				found = true;
			
 
				+			}
			
 
				+		}
			
 
				+		for (p = rb_first(&osd->o_linger_requests); p; p = rb_next(p)) {
			
 
				+			struct ceph_osd_linger_request *lreq =
			
 
				+			    rb_entry(p, struct ceph_osd_linger_request, node);
			
 
				+
			
 
				+			dout(" lreq %p linger_id %llu is served by osd%d\n",
			
 
				+			     lreq, lreq->linger_id, osd->o_osd);
			
 
				+			found = true;
			
 
				+
			
 
				+			mutex_lock(&lreq->lock);
			
 
				+			if (lreq->is_watch && lreq->committed && !lreq->last_error)
			
 
				+				send_linger_ping(lreq);
			
 
				+			mutex_unlock(&lreq->lock);
			
 
				+		}
			
 
				 
			
 
				-		osd = req->r_osd;
			
 
				-		BUG_ON(!osd);
			
 
				-		dout(" tid %llu is slow, will send keepalive on osd%d\n",
			
 
				-		     req->r_tid, osd->o_osd);
			
 
				-		list_move_tail(&osd->o_keepalive_item, &slow_osds);
			
 
				+		if (found)
			
 
				+			list_move_tail(&osd->o_keepalive_item, &slow_osds);
			
 
				 	}
			
 
				+
			
 
				+	if (atomic_read(&osdc->num_homeless) || !list_empty(&slow_osds))
			
 
				+		maybe_request_map(osdc);
			
 
				+
			
 
				 	while (!list_empty(&slow_osds)) {
			
 
				-		osd = list_entry(slow_osds.next, struct ceph_osd,
			
 
				-				 o_keepalive_item);
			
 
				+		struct ceph_osd *osd = list_first_entry(&slow_osds,
			
 
				+							struct ceph_osd,
			
 
				+							o_keepalive_item);
			
 
				 		list_del_init(&osd->o_keepalive_item);
			
 
				 		ceph_con_keepalive(&osd->o_con);
			
 
				 	}
			
 
				 
			
 
				-	__schedule_osd_timeout(osdc);
			
 
				-	__send_queued(osdc);
			
 
				-	mutex_unlock(&osdc->request_mutex);
			
 
				-	up_read(&osdc->map_sem);
			
 
				+	up_write(&osdc->lock);
			
 
				+	schedule_delayed_work(&osdc->timeout_work,
			
 
				+			      osdc->client->options->osd_keepalive_timeout);
			
 
				 }
			
 
				 
			
 
				 static void handle_osds_timeout(struct work_struct *work)
			
@@ -1663,12 +2542,20 @@ static void handle_osds_timeout(struct work_struct *work)
 
				 		container_of(work, struct ceph_osd_client,
			
 
				 			     osds_timeout_work.work);
			
 
				 	unsigned long delay = osdc->client->options->osd_idle_ttl / 4;
			
 
				+	struct ceph_osd *osd, *nosd;
			
 
				+
			
 
				+	dout("%s osdc %p\n", __func__, osdc);
			
 
				+	down_write(&osdc->lock);
			
 
				+	list_for_each_entry_safe(osd, nosd, &osdc->osd_lru, o_osd_lru) {
			
 
				+		if (time_before(jiffies, osd->lru_ttl))
			
 
				+			break;
			
 
				 
			
 
				-	dout("osds timeout\n");
			
 
				-	down_read(&osdc->map_sem);
			
 
				-	remove_old_osds(osdc);
			
 
				-	up_read(&osdc->map_sem);
			
 
				+		WARN_ON(!RB_EMPTY_ROOT(&osd->o_requests));
			
 
				+		WARN_ON(!RB_EMPTY_ROOT(&osd->o_linger_requests));
			
 
				+		close_osd(osd);
			
 
				+	}
			
 
				 
			
 
				+	up_write(&osdc->lock);
			
 
				 	schedule_delayed_work(&osdc->osds_timeout_work,
			
 
				 			      round_jiffies_relative(delay));
			
 
				 }
			
@@ -1776,107 +2663,76 @@ static int ceph_redirect_decode(void **p, void *end,
 
				 	goto out;
			
 
				 }
			
 
				 
			
 
				-static void complete_request(struct ceph_osd_request *req)
			
 
				-{
			
 
				-	complete_all(&req->r_safe_completion);  /* fsync waiter */
			
 
				-}
			
 
				+struct MOSDOpReply {
			
 
				+	struct ceph_pg pgid;
			
 
				+	u64 flags;
			
 
				+	int result;
			
 
				+	u32 epoch;
			
 
				+	int num_ops;
			
 
				+	u32 outdata_len[CEPH_OSD_MAX_OPS];
			
 
				+	s32 rval[CEPH_OSD_MAX_OPS];
			
 
				+	int retry_attempt;
			
 
				+	struct ceph_eversion replay_version;
			
 
				+	u64 user_version;
			
 
				+	struct ceph_request_redirect redirect;
			
 
				+};
			
 
				 
			
 
				-/*
			
 
				- * handle osd op reply.  either call the callback if it is specified,
			
 
				- * or do the completion to wake up the waiting thread.
			
 
				- */
			
 
				-static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg)
			
 
				+static int decode_MOSDOpReply(const struct ceph_msg *msg, struct MOSDOpReply *m)
			
 
				 {
			
 
				-	void *p, *end;
			
 
				-	struct ceph_osd_request *req;
			
 
				-	struct ceph_request_redirect redir;
			
 
				-	u64 tid;
			
 
				-	int object_len;
			
 
				-	unsigned int numops;
			
 
				-	int payload_len, flags;
			
 
				-	s32 result;
			
 
				-	s32 retry_attempt;
			
 
				-	struct ceph_pg pg;
			
 
				-	int err;
			
 
				-	u32 reassert_epoch;
			
 
				-	u64 reassert_version;
			
 
				-	u32 osdmap_epoch;
			
 
				-	int already_completed;
			
 
				-	u32 bytes;
			
 
				+	void *p = msg->front.iov_base;
			
 
				+	void *const end = p + msg->front.iov_len;
			
 
				+	u16 version = le16_to_cpu(msg->hdr.version);
			
 
				+	struct ceph_eversion bad_replay_version;
			
 
				 	u8 decode_redir;
			
 
				-	unsigned int i;
			
 
				-
			
 
				-	tid = le64_to_cpu(msg->hdr.tid);
			
 
				-	dout("handle_reply %p tid %llu\n", msg, tid);
			
 
				+	u32 len;
			
 
				+	int ret;
			
 
				+	int i;
			
 
				 
			
 
				-	p = msg->front.iov_base;
			
 
				-	end = p + msg->front.iov_len;
			
 
				+	ceph_decode_32_safe(&p, end, len, e_inval);
			
 
				+	ceph_decode_need(&p, end, len, e_inval);
			
 
				+	p += len; /* skip oid */
			
 
				 
			
 
				-	ceph_decode_need(&p, end, 4, bad);
			
 
				-	object_len = ceph_decode_32(&p);
			
 
				-	ceph_decode_need(&p, end, object_len, bad);
			
 
				-	p += object_len;
			
 
				+	ret = ceph_decode_pgid(&p, end, &m->pgid);
			
 
				+	if (ret)
			
 
				+		return ret;
			
 
				 
			
 
				-	err = ceph_decode_pgid(&p, end, &pg);
			
 
				-	if (err)
			
 
				-		goto bad;
			
 
				+	ceph_decode_64_safe(&p, end, m->flags, e_inval);
			
 
				+	ceph_decode_32_safe(&p, end, m->result, e_inval);
			
 
				+	ceph_decode_need(&p, end, sizeof(bad_replay_version), e_inval);
			
 
				+	memcpy(&bad_replay_version, p, sizeof(bad_replay_version));
			
 
				+	p += sizeof(bad_replay_version);
			
 
				+	ceph_decode_32_safe(&p, end, m->epoch, e_inval);
			
 
				 
			
 
				-	ceph_decode_need(&p, end, 8 + 4 + 4 + 8 + 4, bad);
			
 
				-	flags = ceph_decode_64(&p);
			
 
				-	result = ceph_decode_32(&p);
			
 
				-	reassert_epoch = ceph_decode_32(&p);
			
 
				-	reassert_version = ceph_decode_64(&p);
			
 
				-	osdmap_epoch = ceph_decode_32(&p);
			
 
				-
			
 
				-	/* lookup */
			
 
				-	down_read(&osdc->map_sem);
			
 
				-	mutex_lock(&osdc->request_mutex);
			
 
				-	req = __lookup_request(osdc, tid);
			
 
				-	if (req == NULL) {
			
 
				-		dout("handle_reply tid %llu dne\n", tid);
			
 
				-		goto bad_mutex;
			
 
				-	}
			
 
				-	ceph_osdc_get_request(req);
			
 
				+	ceph_decode_32_safe(&p, end, m->num_ops, e_inval);
			
 
				+	if (m->num_ops > ARRAY_SIZE(m->outdata_len))
			
 
				+		goto e_inval;
			
 
				 
			
 
				-	dout("handle_reply %p tid %llu req %p result %d\n", msg, tid,
			
 
				-	     req, result);
			
 
				-
			
 
				-	ceph_decode_need(&p, end, 4, bad_put);
			
 
				-	numops = ceph_decode_32(&p);
			
 
				-	if (numops > CEPH_OSD_MAX_OPS)
			
 
				-		goto bad_put;
			
 
				-	if (numops != req->r_num_ops)
			
 
				-		goto bad_put;
			
 
				-	payload_len = 0;
			
 
				-	ceph_decode_need(&p, end, numops * sizeof(struct ceph_osd_op), bad_put);
			
 
				-	for (i = 0; i < numops; i++) {
			
 
				+	ceph_decode_need(&p, end, m->num_ops * sizeof(struct ceph_osd_op),
			
 
				+			 e_inval);
			
 
				+	for (i = 0; i < m->num_ops; i++) {
			
 
				 		struct ceph_osd_op *op = p;
			
 
				-		int len;
			
 
				 
			
 
				-		len = le32_to_cpu(op->payload_len);
			
 
				-		req->r_ops[i].outdata_len = len;
			
 
				-		dout(" op %d has %d bytes\n", i, len);
			
 
				-		payload_len += len;
			
 
				+		m->outdata_len[i] = le32_to_cpu(op->payload_len);
			
 
				 		p += sizeof(*op);
			
 
				 	}
			
 
				-	bytes = le32_to_cpu(msg->hdr.data_len);
			
 
				-	if (payload_len != bytes) {
			
 
				-		pr_warn("sum of op payload lens %d != data_len %d\n",
			
 
				-			payload_len, bytes);
			
 
				-		goto bad_put;
			
 
				-	}
			
 
				 
			
 
				-	ceph_decode_need(&p, end, 4 + numops * 4, bad_put);
			
 
				-	retry_attempt = ceph_decode_32(&p);
			
 
				-	for (i = 0; i < numops; i++)
			
 
				-		req->r_ops[i].rval = ceph_decode_32(&p);
			
 
				+	ceph_decode_32_safe(&p, end, m->retry_attempt, e_inval);
			
 
				+	for (i = 0; i < m->num_ops; i++)
			
 
				+		ceph_decode_32_safe(&p, end, m->rval[i], e_inval);
			
 
				 
			
 
				-	if (le16_to_cpu(msg->hdr.version) >= 6) {
			
 
				-		p += 8 + 4; /* skip replay_version */
			
 
				-		p += 8; /* skip user_version */
			
 
				+	if (version >= 5) {
			
 
				+		ceph_decode_need(&p, end, sizeof(m->replay_version), e_inval);
			
 
				+		memcpy(&m->replay_version, p, sizeof(m->replay_version));
			
 
				+		p += sizeof(m->replay_version);
			
 
				+		ceph_decode_64_safe(&p, end, m->user_version, e_inval);
			
 
				+	} else {
			
 
				+		m->replay_version = bad_replay_version; /* struct */
			
 
				+		m->user_version = le64_to_cpu(m->replay_version.version);
			
 
				+	}
			
 
				 
			
 
				-		if (le16_to_cpu(msg->hdr.version) >= 7)
			
 
				-			ceph_decode_8_safe(&p, end, decode_redir, bad_put);
			
 
				+	if (version >= 6) {
			
 
				+		if (version >= 7)
			
 
				+			ceph_decode_8_safe(&p, end, decode_redir, e_inval);
			
 
				 		else
			
 
				 			decode_redir = 1;
			
 
				 	} else {
			
@@ -1884,228 +2740,410 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg)
 
				 	}
			
 
				 
			
 
				 	if (decode_redir) {
			
 
				-		err = ceph_redirect_decode(&p, end, &redir);
			
 
				-		if (err)
			
 
				-			goto bad_put;
			
 
				+		ret = ceph_redirect_decode(&p, end, &m->redirect);
			
 
				+		if (ret)
			
 
				+			return ret;
			
 
				 	} else {
			
 
				-		redir.oloc.pool = -1;
			
 
				+		ceph_oloc_init(&m->redirect.oloc);
			
 
				 	}
			
 
				 
			
 
				-	if (redir.oloc.pool != -1) {
			
 
				-		dout("redirect pool %lld\n", redir.oloc.pool);
			
 
				-
			
 
				-		__unregister_request(osdc, req);
			
 
				-
			
 
				-		req->r_target_oloc = redir.oloc; /* struct */
			
 
				+	return 0;
			
 
				 
			
 
				-		/*
			
 
				-		 * Start redirect requests with nofail=true.  If
			
 
				-		 * mapping fails, request will end up on the notarget
			
 
				-		 * list, waiting for the new osdmap (which can take
			
 
				-		 * a while), even though the original request mapped
			
 
				-		 * successfully.  In the future we might want to follow
			
 
				-		 * original request's nofail setting here.
			
 
				-		 */
			
 
				-		err = __ceph_osdc_start_request(osdc, req, true);
			
 
				-		BUG_ON(err);
			
 
				+e_inval:
			
 
				+	return -EINVAL;
			
 
				+}
			
 
				 
			
 
				-		goto out_unlock;
			
 
				-	}
			
 
				+/*
			
 
				+ * We are done with @req if
			
 
				+ *   - @m is a safe reply, or
			
 
				+ *   - @m is an unsafe reply and we didn't want a safe one
			
 
				+ */
			
 
				+static bool done_request(const struct ceph_osd_request *req,
			
 
				+			 const struct MOSDOpReply *m)
			
 
				+{
			
 
				+	return (m->result < 0 ||
			
 
				+		(m->flags & CEPH_OSD_FLAG_ONDISK) ||
			
 
				+		!(req->r_flags & CEPH_OSD_FLAG_ONDISK));
			
 
				+}
			
 
				 
			
 
				-	already_completed = req->r_got_reply;
			
 
				-	if (!req->r_got_reply) {
			
 
				-		req->r_result = result;
			
 
				-		dout("handle_reply result %d bytes %d\n", req->r_result,
			
 
				-		     bytes);
			
 
				-		if (req->r_result == 0)
			
 
				-			req->r_result = bytes;
			
 
				+/*
			
 
				+ * handle osd op reply.  either call the callback if it is specified,
			
 
				+ * or do the completion to wake up the waiting thread.
			
 
				+ *
			
 
				+ * ->r_unsafe_callback is set?	yes			no
			
 
				+ *
			
 
				+ * first reply is OK (needed	r_cb/r_completion,	r_cb/r_completion,
			
 
				+ * any or needed/got safe)	r_safe_completion	r_safe_completion
			
 
				+ *
			
 
				+ * first reply is unsafe	r_unsafe_cb(true)	(nothing)
			
 
				+ *
			
 
				+ * when we get the safe reply	r_unsafe_cb(false),	r_cb/r_completion,
			
 
				+ *				r_safe_completion	r_safe_completion
			
 
				+ */
			
 
				+static void handle_reply(struct ceph_osd *osd, struct ceph_msg *msg)
			
 
				+{
			
 
				+	struct ceph_osd_client *osdc = osd->o_osdc;
			
 
				+	struct ceph_osd_request *req;
			
 
				+	struct MOSDOpReply m;
			
 
				+	u64 tid = le64_to_cpu(msg->hdr.tid);
			
 
				+	u32 data_len = 0;
			
 
				+	bool already_acked;
			
 
				+	int ret;
			
 
				+	int i;
			
 
				 
			
 
				-		/* in case this is a write and we need to replay, */
			
 
				-		req->r_reassert_version.epoch = cpu_to_le32(reassert_epoch);
			
 
				-		req->r_reassert_version.version = cpu_to_le64(reassert_version);
			
 
				+	dout("%s msg %p tid %llu\n", __func__, msg, tid);
			
 
				 
			
 
				-		req->r_got_reply = 1;
			
 
				-	} else if ((flags & CEPH_OSD_FLAG_ONDISK) == 0) {
			
 
				-		dout("handle_reply tid %llu dup ack\n", tid);
			
 
				-		goto out_unlock;
			
 
				+	down_read(&osdc->lock);
			
 
				+	if (!osd_registered(osd)) {
			
 
				+		dout("%s osd%d unknown\n", __func__, osd->o_osd);
			
 
				+		goto out_unlock_osdc;
			
 
				 	}
			
 
				+	WARN_ON(osd->o_osd != le64_to_cpu(msg->hdr.src.num));
			
 
				 
			
 
				-	dout("handle_reply tid %llu flags %d\n", tid, flags);
			
 
				+	mutex_lock(&osd->lock);
			
 
				+	req = lookup_request(&osd->o_requests, tid);
			
 
				+	if (!req) {
			
 
				+		dout("%s osd%d tid %llu unknown\n", __func__, osd->o_osd, tid);
			
 
				+		goto out_unlock_session;
			
 
				+	}
			
 
				 
			
 
				-	if (req->r_linger && (flags & CEPH_OSD_FLAG_ONDISK))
			
 
				-		__register_linger_request(osdc, req);
			
 
				+	ret = decode_MOSDOpReply(msg, &m);
			
 
				+	if (ret) {
			
 
				+		pr_err("failed to decode MOSDOpReply for tid %llu: %d\n",
			
 
				+		       req->r_tid, ret);
			
 
				+		ceph_msg_dump(msg);
			
 
				+		goto fail_request;
			
 
				+	}
			
 
				+	dout("%s req %p tid %llu flags 0x%llx pgid %llu.%x epoch %u attempt %d v %u'%llu uv %llu\n",
			
 
				+	     __func__, req, req->r_tid, m.flags, m.pgid.pool, m.pgid.seed,
			
 
				+	     m.epoch, m.retry_attempt, le32_to_cpu(m.replay_version.epoch),
			
 
				+	     le64_to_cpu(m.replay_version.version), m.user_version);
			
 
				+
			
 
				+	if (m.retry_attempt >= 0) {
			
 
				+		if (m.retry_attempt != req->r_attempts - 1) {
			
 
				+			dout("req %p tid %llu retry_attempt %d != %d, ignoring\n",
			
 
				+			     req, req->r_tid, m.retry_attempt,
			
 
				+			     req->r_attempts - 1);
			
 
				+			goto out_unlock_session;
			
 
				+		}
			
 
				+	} else {
			
 
				+		WARN_ON(1); /* MOSDOpReply v4 is assumed */
			
 
				+	}
			
 
				 
			
 
				-	/* either this is a read, or we got the safe response */
			
 
				-	if (result < 0 ||
			
 
				-	    (flags & CEPH_OSD_FLAG_ONDISK) ||
			
 
				-	    ((flags & CEPH_OSD_FLAG_WRITE) == 0))
			
 
				-		__unregister_request(osdc, req);
			
 
				+	if (!ceph_oloc_empty(&m.redirect.oloc)) {
			
 
				+		dout("req %p tid %llu redirect pool %lld\n", req, req->r_tid,
			
 
				+		     m.redirect.oloc.pool);
			
 
				+		unlink_request(osd, req);
			
 
				+		mutex_unlock(&osd->lock);
			
 
				+
			
 
				+		ceph_oloc_copy(&req->r_t.target_oloc, &m.redirect.oloc);
			
 
				+		req->r_flags |= CEPH_OSD_FLAG_REDIRECTED;
			
 
				+		req->r_tid = 0;
			
 
				+		__submit_request(req, false);
			
 
				+		goto out_unlock_osdc;
			
 
				+	}
			
 
				 
			
 
				-	mutex_unlock(&osdc->request_mutex);
			
 
				-	up_read(&osdc->map_sem);
			
 
				+	if (m.num_ops != req->r_num_ops) {
			
 
				+		pr_err("num_ops %d != %d for tid %llu\n", m.num_ops,
			
 
				+		       req->r_num_ops, req->r_tid);
			
 
				+		goto fail_request;
			
 
				+	}
			
 
				+	for (i = 0; i < req->r_num_ops; i++) {
			
 
				+		dout(" req %p tid %llu op %d rval %d len %u\n", req,
			
 
				+		     req->r_tid, i, m.rval[i], m.outdata_len[i]);
			
 
				+		req->r_ops[i].rval = m.rval[i];
			
 
				+		req->r_ops[i].outdata_len = m.outdata_len[i];
			
 
				+		data_len += m.outdata_len[i];
			
 
				+	}
			
 
				+	if (data_len != le32_to_cpu(msg->hdr.data_len)) {
			
 
				+		pr_err("sum of lens %u != %u for tid %llu\n", data_len,
			
 
				+		       le32_to_cpu(msg->hdr.data_len), req->r_tid);
			
 
				+		goto fail_request;
			
 
				+	}
			
 
				+	dout("%s req %p tid %llu acked %d result %d data_len %u\n", __func__,
			
 
				+	     req, req->r_tid, req->r_got_reply, m.result, data_len);
			
 
				+
			
 
				+	already_acked = req->r_got_reply;
			
 
				+	if (!already_acked) {
			
 
				+		req->r_result = m.result ?: data_len;
			
 
				+		req->r_replay_version = m.replay_version; /* struct */
			
 
				+		req->r_got_reply = true;
			
 
				+	} else if (!(m.flags & CEPH_OSD_FLAG_ONDISK)) {
			
 
				+		dout("req %p tid %llu dup ack\n", req, req->r_tid);
			
 
				+		goto out_unlock_session;
			
 
				+	}
			
 
				 
			
 
				-	if (!already_completed) {
			
 
				-		if (req->r_unsafe_callback &&
			
 
				-		    result >= 0 && !(flags & CEPH_OSD_FLAG_ONDISK))
			
 
				-			req->r_unsafe_callback(req, true);
			
 
				-		if (req->r_callback)
			
 
				-			req->r_callback(req, msg);
			
 
				-		else
			
 
				-			complete_all(&req->r_completion);
			
 
				+	if (done_request(req, &m)) {
			
 
				+		__finish_request(req);
			
 
				+		if (req->r_linger) {
			
 
				+			WARN_ON(req->r_unsafe_callback);
			
 
				+			dout("req %p tid %llu cb (locked)\n", req, req->r_tid);
			
 
				+			__complete_request(req);
			
 
				+		}
			
 
				 	}
			
 
				 
			
 
				-	if (flags & CEPH_OSD_FLAG_ONDISK) {
			
 
				-		if (req->r_unsafe_callback && already_completed)
			
 
				+	mutex_unlock(&osd->lock);
			
 
				+	up_read(&osdc->lock);
			
 
				+
			
 
				+	if (done_request(req, &m)) {
			
 
				+		if (already_acked && req->r_unsafe_callback) {
			
 
				+			dout("req %p tid %llu safe-cb\n", req, req->r_tid);
			
 
				 			req->r_unsafe_callback(req, false);
			
 
				-		complete_request(req);
			
 
				+		} else if (!req->r_linger) {
			
 
				+			dout("req %p tid %llu cb\n", req, req->r_tid);
			
 
				+			__complete_request(req);
			
 
				+		}
			
 
				+	} else {
			
 
				+		if (req->r_unsafe_callback) {
			
 
				+			dout("req %p tid %llu unsafe-cb\n", req, req->r_tid);
			
 
				+			req->r_unsafe_callback(req, true);
			
 
				+		} else {
			
 
				+			WARN_ON(1);
			
 
				+		}
			
 
				 	}
			
 
				+	if (m.flags & CEPH_OSD_FLAG_ONDISK)
			
 
				+		complete_all(&req->r_safe_completion);
			
 
				 
			
 
				-out:
			
 
				-	dout("req=%p req->r_linger=%d\n", req, req->r_linger);
			
 
				 	ceph_osdc_put_request(req);
			
 
				 	return;
			
 
				-out_unlock:
			
 
				-	mutex_unlock(&osdc->request_mutex);
			
 
				-	up_read(&osdc->map_sem);
			
 
				-	goto out;
			
 
				 
			
 
				-bad_put:
			
 
				-	req->r_result = -EIO;
			
 
				-	__unregister_request(osdc, req);
			
 
				-	if (req->r_callback)
			
 
				-		req->r_callback(req, msg);
			
 
				-	else
			
 
				-		complete_all(&req->r_completion);
			
 
				-	complete_request(req);
			
 
				-	ceph_osdc_put_request(req);
			
 
				-bad_mutex:
			
 
				-	mutex_unlock(&osdc->request_mutex);
			
 
				-	up_read(&osdc->map_sem);
			
 
				-bad:
			
 
				-	pr_err("corrupt osd_op_reply got %d %d\n",
			
 
				-	       (int)msg->front.iov_len, le32_to_cpu(msg->hdr.front_len));
			
 
				-	ceph_msg_dump(msg);
			
 
				+fail_request:
			
 
				+	complete_request(req, -EIO);
			
 
				+out_unlock_session:
			
 
				+	mutex_unlock(&osd->lock);
			
 
				+out_unlock_osdc:
			
 
				+	up_read(&osdc->lock);
			
 
				 }
			
 
				 
			
 
				-static void reset_changed_osds(struct ceph_osd_client *osdc)
			
 
				+static void set_pool_was_full(struct ceph_osd_client *osdc)
			
 
				 {
			
 
				-	struct rb_node *p, *n;
			
 
				+	struct rb_node *n;
			
 
				 
			
 
				-	dout("%s %p\n", __func__, osdc);
			
 
				-	for (p = rb_first(&osdc->osds); p; p = n) {
			
 
				-		struct ceph_osd *osd = rb_entry(p, struct ceph_osd, o_node);
			
 
				+	for (n = rb_first(&osdc->osdmap->pg_pools); n; n = rb_next(n)) {
			
 
				+		struct ceph_pg_pool_info *pi =
			
 
				+		    rb_entry(n, struct ceph_pg_pool_info, node);
			
 
				 
			
 
				-		n = rb_next(p);
			
 
				-		if (!ceph_osd_is_up(osdc->osdmap, osd->o_osd) ||
			
 
				-		    memcmp(&osd->o_con.peer_addr,
			
 
				-			   ceph_osd_addr(osdc->osdmap,
			
 
				-					 osd->o_osd),
			
 
				-			   sizeof(struct ceph_entity_addr)) != 0)
			
 
				-			__reset_osd(osdc, osd);
			
 
				+		pi->was_full = __pool_full(pi);
			
 
				+	}
			
 
				+}
			
 
				+
			
 
				+static bool pool_cleared_full(struct ceph_osd_client *osdc, s64 pool_id)
			
 
				+{
			
 
				+	struct ceph_pg_pool_info *pi;
			
 
				+
			
 
				+	pi = ceph_pg_pool_by_id(osdc->osdmap, pool_id);
			
 
				+	if (!pi)
			
 
				+		return false;
			
 
				+
			
 
				+	return pi->was_full && !__pool_full(pi);
			
 
				+}
			
 
				+
			
 
				+static enum calc_target_result
			
 
				+recalc_linger_target(struct ceph_osd_linger_request *lreq)
			
 
				+{
			
 
				+	struct ceph_osd_client *osdc = lreq->osdc;
			
 
				+	enum calc_target_result ct_res;
			
 
				+
			
 
				+	ct_res = calc_target(osdc, &lreq->t, &lreq->last_force_resend, true);
			
 
				+	if (ct_res == CALC_TARGET_NEED_RESEND) {
			
 
				+		struct ceph_osd *osd;
			
 
				+
			
 
				+		osd = lookup_create_osd(osdc, lreq->t.osd, true);
			
 
				+		if (osd != lreq->osd) {
			
 
				+			unlink_linger(lreq->osd, lreq);
			
 
				+			link_linger(osd, lreq);
			
 
				+		}
			
 
				 	}
			
 
				+
			
 
				+	return ct_res;
			
 
				 }
			
 
				 
			
 
				 /*
			
 
				- * Requeue requests whose mapping to an OSD has changed.  If requests map to
			
 
				- * no osd, request a new map.
			
 
				- *
			
 
				- * Caller should hold map_sem for read.
			
 
				+ * Requeue requests whose mapping to an OSD has changed.
			
 
				  */
			
 
				-static void kick_requests(struct ceph_osd_client *osdc, bool force_resend,
			
 
				-			  bool force_resend_writes)
			
 
				+static void scan_requests(struct ceph_osd *osd,
			
 
				+			  bool force_resend,
			
 
				+			  bool cleared_full,
			
 
				+			  bool check_pool_cleared_full,
			
 
				+			  struct rb_root *need_resend,
			
 
				+			  struct list_head *need_resend_linger)
			
 
				 {
			
 
				-	struct ceph_osd_request *req, *nreq;
			
 
				-	struct rb_node *p;
			
 
				-	int needmap = 0;
			
 
				-	int err;
			
 
				-	bool force_resend_req;
			
 
				+	struct ceph_osd_client *osdc = osd->o_osdc;
			
 
				+	struct rb_node *n;
			
 
				+	bool force_resend_writes;
			
 
				+
			
 
				+	for (n = rb_first(&osd->o_linger_requests); n; ) {
			
 
				+		struct ceph_osd_linger_request *lreq =
			
 
				+		    rb_entry(n, struct ceph_osd_linger_request, node);
			
 
				+		enum calc_target_result ct_res;
			
 
				+
			
 
				+		n = rb_next(n); /* recalc_linger_target() */
			
 
				+
			
 
				+		dout("%s lreq %p linger_id %llu\n", __func__, lreq,
			
 
				+		     lreq->linger_id);
			
 
				+		ct_res = recalc_linger_target(lreq);
			
 
				+		switch (ct_res) {
			
 
				+		case CALC_TARGET_NO_ACTION:
			
 
				+			force_resend_writes = cleared_full ||
			
 
				+			    (check_pool_cleared_full &&
			
 
				+			     pool_cleared_full(osdc, lreq->t.base_oloc.pool));
			
 
				+			if (!force_resend && !force_resend_writes)
			
 
				+				break;
			
 
				+
			
 
				+			/* fall through */
			
 
				+		case CALC_TARGET_NEED_RESEND:
			
 
				+			cancel_linger_map_check(lreq);
			
 
				+			/*
			
 
				+			 * scan_requests() for the previous epoch(s)
			
 
				+			 * may have already added it to the list, since
			
 
				+			 * it's not unlinked here.
			
 
				+			 */
			
 
				+			if (list_empty(&lreq->scan_item))
			
 
				+				list_add_tail(&lreq->scan_item, need_resend_linger);
			
 
				+			break;
			
 
				+		case CALC_TARGET_POOL_DNE:
			
 
				+			check_linger_pool_dne(lreq);
			
 
				+			break;
			
 
				+		}
			
 
				+	}
			
 
				+
			
 
				+	for (n = rb_first(&osd->o_requests); n; ) {
			
 
				+		struct ceph_osd_request *req =
			
 
				+		    rb_entry(n, struct ceph_osd_request, r_node);
			
 
				+		enum calc_target_result ct_res;
			
 
				+
			
 
				+		n = rb_next(n); /* unlink_request(), check_pool_dne() */
			
 
				+
			
 
				+		dout("%s req %p tid %llu\n", __func__, req, req->r_tid);
			
 
				+		ct_res = calc_target(osdc, &req->r_t,
			
 
				+				     &req->r_last_force_resend, false);
			
 
				+		switch (ct_res) {
			
 
				+		case CALC_TARGET_NO_ACTION:
			
 
				+			force_resend_writes = cleared_full ||
			
 
				+			    (check_pool_cleared_full &&
			
 
				+			     pool_cleared_full(osdc, req->r_t.base_oloc.pool));
			
 
				+			if (!force_resend &&
			
 
				+			    (!(req->r_flags & CEPH_OSD_FLAG_WRITE) ||
			
 
				+			     !force_resend_writes))
			
 
				+				break;
			
 
				+
			
 
				+			/* fall through */
			
 
				+		case CALC_TARGET_NEED_RESEND:
			
 
				+			cancel_map_check(req);
			
 
				+			unlink_request(osd, req);
			
 
				+			insert_request(need_resend, req);
			
 
				+			break;
			
 
				+		case CALC_TARGET_POOL_DNE:
			
 
				+			check_pool_dne(req);
			
 
				+			break;
			
 
				+		}
			
 
				+	}
			
 
				+}
			
 
				+
			
 
				+static int handle_one_map(struct ceph_osd_client *osdc,
			
 
				+			  void *p, void *end, bool incremental,
			
 
				+			  struct rb_root *need_resend,
			
 
				+			  struct list_head *need_resend_linger)
			
 
				+{
			
 
				+	struct ceph_osdmap *newmap;
			
 
				+	struct rb_node *n;
			
 
				+	bool skipped_map = false;
			
 
				+	bool was_full;
			
 
				+
			
 
				+	was_full = ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL);
			
 
				+	set_pool_was_full(osdc);
			
 
				 
			
 
				-	dout("kick_requests %s %s\n", force_resend ? " (force resend)" : "",
			
 
				-		force_resend_writes ? " (force resend writes)" : "");
			
 
				-	mutex_lock(&osdc->request_mutex);
			
 
				-	for (p = rb_first(&osdc->requests); p; ) {
			
 
				-		req = rb_entry(p, struct ceph_osd_request, r_node);
			
 
				-		p = rb_next(p);
			
 
				+	if (incremental)
			
 
				+		newmap = osdmap_apply_incremental(&p, end, osdc->osdmap);
			
 
				+	else
			
 
				+		newmap = ceph_osdmap_decode(&p, end);
			
 
				+	if (IS_ERR(newmap))
			
 
				+		return PTR_ERR(newmap);
			
 
				 
			
 
				+	if (newmap != osdc->osdmap) {
			
 
				 		/*
			
 
				-		 * For linger requests that have not yet been
			
 
				-		 * registered, move them to the linger list; they'll
			
 
				-		 * be sent to the osd in the loop below.  Unregister
			
 
				-		 * the request before re-registering it as a linger
			
 
				-		 * request to ensure the __map_request() below
			
 
				-		 * will decide it needs to be sent.
			
 
				+		 * Preserve ->was_full before destroying the old map.
			
 
				+		 * For pools that weren't in the old map, ->was_full
			
 
				+		 * should be false.
			
 
				 		 */
			
 
				-		if (req->r_linger && list_empty(&req->r_linger_item)) {
			
 
				-			dout("%p tid %llu restart on osd%d\n",
			
 
				-			     req, req->r_tid,
			
 
				-			     req->r_osd ? req->r_osd->o_osd : -1);
			
 
				-			ceph_osdc_get_request(req);
			
 
				-			__unregister_request(osdc, req);
			
 
				-			__register_linger_request(osdc, req);
			
 
				-			ceph_osdc_put_request(req);
			
 
				-			continue;
			
 
				+		for (n = rb_first(&newmap->pg_pools); n; n = rb_next(n)) {
			
 
				+			struct ceph_pg_pool_info *pi =
			
 
				+			    rb_entry(n, struct ceph_pg_pool_info, node);
			
 
				+			struct ceph_pg_pool_info *old_pi;
			
 
				+
			
 
				+			old_pi = ceph_pg_pool_by_id(osdc->osdmap, pi->id);
			
 
				+			if (old_pi)
			
 
				+				pi->was_full = old_pi->was_full;
			
 
				+			else
			
 
				+				WARN_ON(pi->was_full);
			
 
				 		}
			
 
				 
			
 
				-		force_resend_req = force_resend ||
			
 
				-			(force_resend_writes &&
			
 
				-				req->r_flags & CEPH_OSD_FLAG_WRITE);
			
 
				-		err = __map_request(osdc, req, force_resend_req);
			
 
				-		if (err < 0)
			
 
				-			continue;  /* error */
			
 
				-		if (req->r_osd == NULL) {
			
 
				-			dout("%p tid %llu maps to no osd\n", req, req->r_tid);
			
 
				-			needmap++;  /* request a newer map */
			
 
				-		} else if (err > 0) {
			
 
				-			if (!req->r_linger) {
			
 
				-				dout("%p tid %llu requeued on osd%d\n", req,
			
 
				-				     req->r_tid,
			
 
				-				     req->r_osd ? req->r_osd->o_osd : -1);
			
 
				-				req->r_flags |= CEPH_OSD_FLAG_RETRY;
			
 
				-			}
			
 
				+		if (osdc->osdmap->epoch &&
			
 
				+		    osdc->osdmap->epoch + 1 < newmap->epoch) {
			
 
				+			WARN_ON(incremental);
			
 
				+			skipped_map = true;
			
 
				 		}
			
 
				+
			
 
				+		ceph_osdmap_destroy(osdc->osdmap);
			
 
				+		osdc->osdmap = newmap;
			
 
				 	}
			
 
				 
			
 
				-	list_for_each_entry_safe(req, nreq, &osdc->req_linger,
			
 
				-				 r_linger_item) {
			
 
				-		dout("linger req=%p req->r_osd=%p\n", req, req->r_osd);
			
 
				-
			
 
				-		err = __map_request(osdc, req,
			
 
				-				    force_resend || force_resend_writes);
			
 
				-		dout("__map_request returned %d\n", err);
			
 
				-		if (err < 0)
			
 
				-			continue;  /* hrm! */
			
 
				-		if (req->r_osd == NULL || err > 0) {
			
 
				-			if (req->r_osd == NULL) {
			
 
				-				dout("lingering %p tid %llu maps to no osd\n",
			
 
				-				     req, req->r_tid);
			
 
				-				/*
			
 
				-				 * A homeless lingering request makes
			
 
				-				 * no sense, as it's job is to keep
			
 
				-				 * a particular OSD connection open.
			
 
				-				 * Request a newer map and kick the
			
 
				-				 * request, knowing that it won't be
			
 
				-				 * resent until we actually get a map
			
 
				-				 * that can tell us where to send it.
			
 
				-				 */
			
 
				-				needmap++;
			
 
				-			}
			
 
				+	was_full &= !ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL);
			
 
				+	scan_requests(&osdc->homeless_osd, skipped_map, was_full, true,
			
 
				+		      need_resend, need_resend_linger);
			
 
				+
			
 
				+	for (n = rb_first(&osdc->osds); n; ) {
			
 
				+		struct ceph_osd *osd = rb_entry(n, struct ceph_osd, o_node);
			
 
				+
			
 
				+		n = rb_next(n); /* close_osd() */
			
 
				 
			
 
				-			dout("kicking lingering %p tid %llu osd%d\n", req,
			
 
				-			     req->r_tid, req->r_osd ? req->r_osd->o_osd : -1);
			
 
				-			__register_request(osdc, req);
			
 
				-			__unregister_linger_request(osdc, req);
			
 
				+		scan_requests(osd, skipped_map, was_full, true, need_resend,
			
 
				+			      need_resend_linger);
			
 
				+		if (!ceph_osd_is_up(osdc->osdmap, osd->o_osd) ||
			
 
				+		    memcmp(&osd->o_con.peer_addr,
			
 
				+			   ceph_osd_addr(osdc->osdmap, osd->o_osd),
			
 
				+			   sizeof(struct ceph_entity_addr)))
			
 
				+			close_osd(osd);
			
 
				+	}
			
 
				+
			
 
				+	return 0;
			
 
				+}
			
 
				+
			
 
				+static void kick_requests(struct ceph_osd_client *osdc,
			
 
				+			  struct rb_root *need_resend,
			
 
				+			  struct list_head *need_resend_linger)
			
 
				+{
			
 
				+	struct ceph_osd_linger_request *lreq, *nlreq;
			
 
				+	struct rb_node *n;
			
 
				+
			
 
				+	for (n = rb_first(need_resend); n; ) {
			
 
				+		struct ceph_osd_request *req =
			
 
				+		    rb_entry(n, struct ceph_osd_request, r_node);
			
 
				+		struct ceph_osd *osd;
			
 
				+
			
 
				+		n = rb_next(n);
			
 
				+		erase_request(need_resend, req); /* before link_request() */
			
 
				+
			
 
				+		WARN_ON(req->r_osd);
			
 
				+		calc_target(osdc, &req->r_t, NULL, false);
			
 
				+		osd = lookup_create_osd(osdc, req->r_t.osd, true);
			
 
				+		link_request(osd, req);
			
 
				+		if (!req->r_linger) {
			
 
				+			if (!osd_homeless(osd) && !req->r_t.paused)
			
 
				+				send_request(req);
			
 
				+		} else {
			
 
				+			cancel_linger_request(req);
			
 
				 		}
			
 
				 	}
			
 
				-	reset_changed_osds(osdc);
			
 
				-	mutex_unlock(&osdc->request_mutex);
			
 
				 
			
 
				-	if (needmap) {
			
 
				-		dout("%d requests for down osds, need new map\n", needmap);
			
 
				-		ceph_monc_request_next_osdmap(&osdc->client->monc);
			
 
				+	list_for_each_entry_safe(lreq, nlreq, need_resend_linger, scan_item) {
			
 
				+		if (!osd_homeless(lreq->osd))
			
 
				+			send_linger(lreq);
			
 
				+
			
 
				+		list_del_init(&lreq->scan_item);
			
 
				 	}
			
 
				 }
			
 
				 
			
 
				-
			
 
				 /*
			
 
				  * Process updated osd map.
			
 
				  *
			
@@ -2115,27 +3153,31 @@ static void kick_requests(struct ceph_osd_client *osdc, bool force_resend,
 
				  */
			
 
				 void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg)
			
 
				 {
			
 
				-	void *p, *end, *next;
			
 
				+	void *p = msg->front.iov_base;
			
 
				+	void *const end = p + msg->front.iov_len;
			
 
				 	u32 nr_maps, maplen;
			
 
				 	u32 epoch;
			
 
				-	struct ceph_osdmap *newmap = NULL, *oldmap;
			
 
				-	int err;
			
 
				 	struct ceph_fsid fsid;
			
 
				-	bool was_full;
			
 
				+	struct rb_root need_resend = RB_ROOT;
			
 
				+	LIST_HEAD(need_resend_linger);
			
 
				+	bool handled_incremental = false;
			
 
				+	bool was_pauserd, was_pausewr;
			
 
				+	bool pauserd, pausewr;
			
 
				+	int err;
			
 
				 
			
 
				-	dout("handle_map have %u\n", osdc->osdmap ? osdc->osdmap->epoch : 0);
			
 
				-	p = msg->front.iov_base;
			
 
				-	end = p + msg->front.iov_len;
			
 
				+	dout("%s have %u\n", __func__, osdc->osdmap->epoch);
			
 
				+	down_write(&osdc->lock);
			
 
				 
			
 
				 	/* verify fsid */
			
 
				 	ceph_decode_need(&p, end, sizeof(fsid), bad);
			
 
				 	ceph_decode_copy(&p, &fsid, sizeof(fsid));
			
 
				 	if (ceph_check_fsid(osdc->client, &fsid) < 0)
			
 
				-		return;
			
 
				-
			
 
				-	down_write(&osdc->map_sem);
			
 
				+		goto bad;
			
 
				 
			
 
				-	was_full = ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL);
			
 
				+	was_pauserd = ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSERD);
			
 
				+	was_pausewr = ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSEWR) ||
			
 
				+		      ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL) ||
			
 
				+		      have_pool_full(osdc);
			
 
				 
			
 
				 	/* incremental maps */
			
 
				 	ceph_decode_32_safe(&p, end, nr_maps, bad);
			
@@ -2145,34 +3187,23 @@ void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg)
 
				 		epoch = ceph_decode_32(&p);
			
 
				 		maplen = ceph_decode_32(&p);
			
 
				 		ceph_decode_need(&p, end, maplen, bad);
			
 
				-		next = p + maplen;
			
 
				-		if (osdc->osdmap && osdc->osdmap->epoch+1 == epoch) {
			
 
				+		if (osdc->osdmap->epoch &&
			
 
				+		    osdc->osdmap->epoch + 1 == epoch) {
			
 
				 			dout("applying incremental map %u len %d\n",
			
 
				 			     epoch, maplen);
			
 
				-			newmap = osdmap_apply_incremental(&p, next,
			
 
				-							  osdc->osdmap,
			
 
				-							  &osdc->client->msgr);
			
 
				-			if (IS_ERR(newmap)) {
			
 
				-				err = PTR_ERR(newmap);
			
 
				+			err = handle_one_map(osdc, p, p + maplen, true,
			
 
				+					     &need_resend, &need_resend_linger);
			
 
				+			if (err)
			
 
				 				goto bad;
			
 
				-			}
			
 
				-			BUG_ON(!newmap);
			
 
				-			if (newmap != osdc->osdmap) {
			
 
				-				ceph_osdmap_destroy(osdc->osdmap);
			
 
				-				osdc->osdmap = newmap;
			
 
				-			}
			
 
				-			was_full = was_full ||
			
 
				-				ceph_osdmap_flag(osdc->osdmap,
			
 
				-						 CEPH_OSDMAP_FULL);
			
 
				-			kick_requests(osdc, 0, was_full);
			
 
				+			handled_incremental = true;
			
 
				 		} else {
			
 
				 			dout("ignoring incremental map %u len %d\n",
			
 
				 			     epoch, maplen);
			
 
				 		}
			
 
				-		p = next;
			
 
				+		p += maplen;
			
 
				 		nr_maps--;
			
 
				 	}
			
 
				-	if (newmap)
			
 
				+	if (handled_incremental)
			
 
				 		goto done;
			
 
				 
			
 
				 	/* full maps */
			
@@ -2186,455 +3217,647 @@ void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg)
 
				 		if (nr_maps > 1) {
			
 
				 			dout("skipping non-latest full map %u len %d\n",
			
 
				 			     epoch, maplen);
			
 
				-		} else if (osdc->osdmap && osdc->osdmap->epoch >= epoch) {
			
 
				+		} else if (osdc->osdmap->epoch >= epoch) {
			
 
				 			dout("skipping full map %u len %d, "
			
 
				 			     "older than our %u\n", epoch, maplen,
			
 
				 			     osdc->osdmap->epoch);
			
 
				 		} else {
			
 
				-			int skipped_map = 0;
			
 
				+			dout("taking full map %u len %d\n", epoch, maplen);
			
 
				+			err = handle_one_map(osdc, p, p + maplen, false,
			
 
				+					     &need_resend, &need_resend_linger);
			
 
				+			if (err)
			
 
				+				goto bad;
			
 
				+		}
			
 
				+		p += maplen;
			
 
				+		nr_maps--;
			
 
				+	}
			
 
				+
			
 
				+done:
			
 
				+	/*
			
 
				+	 * subscribe to subsequent osdmap updates if full to ensure
			
 
				+	 * we find out when we are no longer full and stop returning
			
 
				+	 * ENOSPC.
			
 
				+	 */
			
 
				+	pauserd = ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSERD);
			
 
				+	pausewr = ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSEWR) ||
			
 
				+		  ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL) ||
			
 
				+		  have_pool_full(osdc);
			
 
				+	if (was_pauserd || was_pausewr || pauserd || pausewr)
			
 
				+		maybe_request_map(osdc);
			
 
				+
			
 
				+	kick_requests(osdc, &need_resend, &need_resend_linger);
			
 
				+
			
 
				+	ceph_monc_got_map(&osdc->client->monc, CEPH_SUB_OSDMAP,
			
 
				+			  osdc->osdmap->epoch);
			
 
				+	up_write(&osdc->lock);
			
 
				+	wake_up_all(&osdc->client->auth_wq);
			
 
				+	return;
			
 
				+
			
 
				+bad:
			
 
				+	pr_err("osdc handle_map corrupt msg\n");
			
 
				+	ceph_msg_dump(msg);
			
 
				+	up_write(&osdc->lock);
			
 
				+}
			
 
				+
			
 
				+/*
			
 
				+ * Resubmit requests pending on the given osd.
			
 
				+ */
			
 
				+static void kick_osd_requests(struct ceph_osd *osd)
			
 
				+{
			
 
				+	struct rb_node *n;
			
 
				+
			
 
				+	for (n = rb_first(&osd->o_requests); n; ) {
			
 
				+		struct ceph_osd_request *req =
			
 
				+		    rb_entry(n, struct ceph_osd_request, r_node);
			
 
				+
			
 
				+		n = rb_next(n); /* cancel_linger_request() */
			
 
				+
			
 
				+		if (!req->r_linger) {
			
 
				+			if (!req->r_t.paused)
			
 
				+				send_request(req);
			
 
				+		} else {
			
 
				+			cancel_linger_request(req);
			
 
				+		}
			
 
				+	}
			
 
				+	for (n = rb_first(&osd->o_linger_requests); n; n = rb_next(n)) {
			
 
				+		struct ceph_osd_linger_request *lreq =
			
 
				+		    rb_entry(n, struct ceph_osd_linger_request, node);
			
 
				+
			
 
				+		send_linger(lreq);
			
 
				+	}
			
 
				+}
			
 
				+
			
 
				+/*
			
 
				+ * If the osd connection drops, we need to resubmit all requests.
			
 
				+ */
			
 
				+static void osd_fault(struct ceph_connection *con)
			
 
				+{
			
 
				+	struct ceph_osd *osd = con->private;
			
 
				+	struct ceph_osd_client *osdc = osd->o_osdc;
			
 
				+
			
 
				+	dout("%s osd %p osd%d\n", __func__, osd, osd->o_osd);
			
 
				+
			
 
				+	down_write(&osdc->lock);
			
 
				+	if (!osd_registered(osd)) {
			
 
				+		dout("%s osd%d unknown\n", __func__, osd->o_osd);
			
 
				+		goto out_unlock;
			
 
				+	}
			
 
				+
			
 
				+	if (!reopen_osd(osd))
			
 
				+		kick_osd_requests(osd);
			
 
				+	maybe_request_map(osdc);
			
 
				+
			
 
				+out_unlock:
			
 
				+	up_write(&osdc->lock);
			
 
				+}
			
 
				+
			
 
				+/*
			
 
				+ * Process osd watch notifications
			
 
				+ */
			
 
				+static void handle_watch_notify(struct ceph_osd_client *osdc,
			
 
				+				struct ceph_msg *msg)
			
 
				+{
			
 
				+	void *p = msg->front.iov_base;
			
 
				+	void *const end = p + msg->front.iov_len;
			
 
				+	struct ceph_osd_linger_request *lreq;
			
 
				+	struct linger_work *lwork;
			
 
				+	u8 proto_ver, opcode;
			
 
				+	u64 cookie, notify_id;
			
 
				+	u64 notifier_id = 0;
			
 
				+	s32 return_code = 0;
			
 
				+	void *payload = NULL;
			
 
				+	u32 payload_len = 0;
			
 
				+
			
 
				+	ceph_decode_8_safe(&p, end, proto_ver, bad);
			
 
				+	ceph_decode_8_safe(&p, end, opcode, bad);
			
 
				+	ceph_decode_64_safe(&p, end, cookie, bad);
			
 
				+	p += 8; /* skip ver */
			
 
				+	ceph_decode_64_safe(&p, end, notify_id, bad);
			
 
				+
			
 
				+	if (proto_ver >= 1) {
			
 
				+		ceph_decode_32_safe(&p, end, payload_len, bad);
			
 
				+		ceph_decode_need(&p, end, payload_len, bad);
			
 
				+		payload = p;
			
 
				+		p += payload_len;
			
 
				+	}
			
 
				+
			
 
				+	if (le16_to_cpu(msg->hdr.version) >= 2)
			
 
				+		ceph_decode_32_safe(&p, end, return_code, bad);
			
 
				+
			
 
				+	if (le16_to_cpu(msg->hdr.version) >= 3)
			
 
				+		ceph_decode_64_safe(&p, end, notifier_id, bad);
			
 
				+
			
 
				+	down_read(&osdc->lock);
			
 
				+	lreq = lookup_linger_osdc(&osdc->linger_requests, cookie);
			
 
				+	if (!lreq) {
			
 
				+		dout("%s opcode %d cookie %llu dne\n", __func__, opcode,
			
 
				+		     cookie);
			
 
				+		goto out_unlock_osdc;
			
 
				+	}
			
 
				 
			
 
				-			dout("taking full map %u len %d\n", epoch, maplen);
			
 
				-			newmap = ceph_osdmap_decode(&p, p+maplen);
			
 
				-			if (IS_ERR(newmap)) {
			
 
				-				err = PTR_ERR(newmap);
			
 
				-				goto bad;
			
 
				-			}
			
 
				-			BUG_ON(!newmap);
			
 
				-			oldmap = osdc->osdmap;
			
 
				-			osdc->osdmap = newmap;
			
 
				-			if (oldmap) {
			
 
				-				if (oldmap->epoch + 1 < newmap->epoch)
			
 
				-					skipped_map = 1;
			
 
				-				ceph_osdmap_destroy(oldmap);
			
 
				+	mutex_lock(&lreq->lock);
			
 
				+	dout("%s opcode %d cookie %llu lreq %p is_watch %d\n", __func__,
			
 
				+	     opcode, cookie, lreq, lreq->is_watch);
			
 
				+	if (opcode == CEPH_WATCH_EVENT_DISCONNECT) {
			
 
				+		if (!lreq->last_error) {
			
 
				+			lreq->last_error = -ENOTCONN;
			
 
				+			queue_watch_error(lreq);
			
 
				+		}
			
 
				+	} else if (!lreq->is_watch) {
			
 
				+		/* CEPH_WATCH_EVENT_NOTIFY_COMPLETE */
			
 
				+		if (lreq->notify_id && lreq->notify_id != notify_id) {
			
 
				+			dout("lreq %p notify_id %llu != %llu, ignoring\n", lreq,
			
 
				+			     lreq->notify_id, notify_id);
			
 
				+		} else if (!completion_done(&lreq->notify_finish_wait)) {
			
 
				+			struct ceph_msg_data *data =
			
 
				+			    list_first_entry_or_null(&msg->data,
			
 
				+						     struct ceph_msg_data,
			
 
				+						     links);
			
 
				+
			
 
				+			if (data) {
			
 
				+				if (lreq->preply_pages) {
			
 
				+					WARN_ON(data->type !=
			
 
				+							CEPH_MSG_DATA_PAGES);
			
 
				+					*lreq->preply_pages = data->pages;
			
 
				+					*lreq->preply_len = data->length;
			
 
				+				} else {
			
 
				+					ceph_release_page_vector(data->pages,
			
 
				+					       calc_pages_for(0, data->length));
			
 
				+				}
			
 
				 			}
			
 
				-			was_full = was_full ||
			
 
				-				ceph_osdmap_flag(osdc->osdmap,
			
 
				-						 CEPH_OSDMAP_FULL);
			
 
				-			kick_requests(osdc, skipped_map, was_full);
			
 
				+			lreq->notify_finish_error = return_code;
			
 
				+			complete_all(&lreq->notify_finish_wait);
			
 
				+		}
			
 
				+	} else {
			
 
				+		/* CEPH_WATCH_EVENT_NOTIFY */
			
 
				+		lwork = lwork_alloc(lreq, do_watch_notify);
			
 
				+		if (!lwork) {
			
 
				+			pr_err("failed to allocate notify-lwork\n");
			
 
				+			goto out_unlock_lreq;
			
 
				 		}
			
 
				-		p += maplen;
			
 
				-		nr_maps--;
			
 
				-	}
			
 
				 
			
 
				-	if (!osdc->osdmap)
			
 
				-		goto bad;
			
 
				-done:
			
 
				-	downgrade_write(&osdc->map_sem);
			
 
				-	ceph_monc_got_map(&osdc->client->monc, CEPH_SUB_OSDMAP,
			
 
				-			  osdc->osdmap->epoch);
			
 
				+		lwork->notify.notify_id = notify_id;
			
 
				+		lwork->notify.notifier_id = notifier_id;
			
 
				+		lwork->notify.payload = payload;
			
 
				+		lwork->notify.payload_len = payload_len;
			
 
				+		lwork->notify.msg = ceph_msg_get(msg);
			
 
				+		lwork_queue(lwork);
			
 
				+	}
			
 
				 
			
 
				-	/*
			
 
				-	 * subscribe to subsequent osdmap updates if full to ensure
			
 
				-	 * we find out when we are no longer full and stop returning
			
 
				-	 * ENOSPC.
			
 
				-	 */
			
 
				-	if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL) ||
			
 
				-		ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSERD) ||
			
 
				-		ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSEWR))
			
 
				-		ceph_monc_request_next_osdmap(&osdc->client->monc);
			
 
				-
			
 
				-	mutex_lock(&osdc->request_mutex);
			
 
				-	__send_queued(osdc);
			
 
				-	mutex_unlock(&osdc->request_mutex);
			
 
				-	up_read(&osdc->map_sem);
			
 
				-	wake_up_all(&osdc->client->auth_wq);
			
 
				+out_unlock_lreq:
			
 
				+	mutex_unlock(&lreq->lock);
			
 
				+out_unlock_osdc:
			
 
				+	up_read(&osdc->lock);
			
 
				 	return;
			
 
				 
			
 
				 bad:
			
 
				-	pr_err("osdc handle_map corrupt msg\n");
			
 
				-	ceph_msg_dump(msg);
			
 
				-	up_write(&osdc->map_sem);
			
 
				+	pr_err("osdc handle_watch_notify corrupt msg\n");
			
 
				 }
			
 
				 
			
 
				 /*
			
 
				- * watch/notify callback event infrastructure
			
 
				- *
			
 
				- * These callbacks are used both for watch and notify operations.
			
 
				+ * Register request, send initial attempt.
			
 
				  */
			
 
				-static void __release_event(struct kref *kref)
			
 
				+int ceph_osdc_start_request(struct ceph_osd_client *osdc,
			
 
				+			    struct ceph_osd_request *req,
			
 
				+			    bool nofail)
			
 
				 {
			
 
				-	struct ceph_osd_event *event =
			
 
				-		container_of(kref, struct ceph_osd_event, kref);
			
 
				+	down_read(&osdc->lock);
			
 
				+	submit_request(req, false);
			
 
				+	up_read(&osdc->lock);
			
 
				 
			
 
				-	dout("__release_event %p\n", event);
			
 
				-	kfree(event);
			
 
				+	return 0;
			
 
				 }
			
 
				+EXPORT_SYMBOL(ceph_osdc_start_request);
			
 
				 
			
 
				-static void get_event(struct ceph_osd_event *event)
			
 
				+/*
			
 
				+ * Unregister a registered request.  The request is not completed (i.e.
			
 
				+ * no callbacks or wakeups) - higher layers are supposed to know what
			
 
				+ * they are canceling.
			
 
				+ */
			
 
				+void ceph_osdc_cancel_request(struct ceph_osd_request *req)
			
 
				 {
			
 
				-	kref_get(&event->kref);
			
 
				-}
			
 
				+	struct ceph_osd_client *osdc = req->r_osdc;
			
 
				 
			
 
				-void ceph_osdc_put_event(struct ceph_osd_event *event)
			
 
				-{
			
 
				-	kref_put(&event->kref, __release_event);
			
 
				+	down_write(&osdc->lock);
			
 
				+	if (req->r_osd)
			
 
				+		cancel_request(req);
			
 
				+	up_write(&osdc->lock);
			
 
				 }
			
 
				-EXPORT_SYMBOL(ceph_osdc_put_event);
			
 
				+EXPORT_SYMBOL(ceph_osdc_cancel_request);
			
 
				 
			
 
				-static void __insert_event(struct ceph_osd_client *osdc,
			
 
				-			     struct ceph_osd_event *new)
			
 
				+/*
			
 
				+ * @timeout: in jiffies, 0 means "wait forever"
			
 
				+ */
			
 
				+static int wait_request_timeout(struct ceph_osd_request *req,
			
 
				+				unsigned long timeout)
			
 
				 {
			
 
				-	struct rb_node **p = &osdc->event_tree.rb_node;
			
 
				-	struct rb_node *parent = NULL;
			
 
				-	struct ceph_osd_event *event = NULL;
			
 
				+	long left;
			
 
				 
			
 
				-	while (*p) {
			
 
				-		parent = *p;
			
 
				-		event = rb_entry(parent, struct ceph_osd_event, node);
			
 
				-		if (new->cookie < event->cookie)
			
 
				-			p = &(*p)->rb_left;
			
 
				-		else if (new->cookie > event->cookie)
			
 
				-			p = &(*p)->rb_right;
			
 
				-		else
			
 
				-			BUG();
			
 
				+	dout("%s req %p tid %llu\n", __func__, req, req->r_tid);
			
 
				+	left = wait_for_completion_killable_timeout(&req->r_completion,
			
 
				+						ceph_timeout_jiffies(timeout));
			
 
				+	if (left <= 0) {
			
 
				+		left = left ?: -ETIMEDOUT;
			
 
				+		ceph_osdc_cancel_request(req);
			
 
				+
			
 
				+		/* kludge - need to wake ceph_osdc_sync() */
			
 
				+		complete_all(&req->r_safe_completion);
			
 
				+	} else {
			
 
				+		left = req->r_result; /* completed */
			
 
				 	}
			
 
				 
			
 
				-	rb_link_node(&new->node, parent, p);
			
 
				-	rb_insert_color(&new->node, &osdc->event_tree);
			
 
				+	return left;
			
 
				 }
			
 
				 
			
 
				-static struct ceph_osd_event *__find_event(struct ceph_osd_client *osdc,
			
 
				-					        u64 cookie)
			
 
				+/*
			
 
				+ * wait for a request to complete
			
 
				+ */
			
 
				+int ceph_osdc_wait_request(struct ceph_osd_client *osdc,
			
 
				+			   struct ceph_osd_request *req)
			
 
				 {
			
 
				-	struct rb_node **p = &osdc->event_tree.rb_node;
			
 
				-	struct rb_node *parent = NULL;
			
 
				-	struct ceph_osd_event *event = NULL;
			
 
				-
			
 
				-	while (*p) {
			
 
				-		parent = *p;
			
 
				-		event = rb_entry(parent, struct ceph_osd_event, node);
			
 
				-		if (cookie < event->cookie)
			
 
				-			p = &(*p)->rb_left;
			
 
				-		else if (cookie > event->cookie)
			
 
				-			p = &(*p)->rb_right;
			
 
				-		else
			
 
				-			return event;
			
 
				-	}
			
 
				-	return NULL;
			
 
				+	return wait_request_timeout(req, 0);
			
 
				 }
			
 
				+EXPORT_SYMBOL(ceph_osdc_wait_request);
			
 
				 
			
 
				-static void __remove_event(struct ceph_osd_event *event)
			
 
				+/*
			
 
				+ * sync - wait for all in-flight requests to flush.  avoid starvation.
			
 
				+ */
			
 
				+void ceph_osdc_sync(struct ceph_osd_client *osdc)
			
 
				 {
			
 
				-	struct ceph_osd_client *osdc = event->osdc;
			
 
				+	struct rb_node *n, *p;
			
 
				+	u64 last_tid = atomic64_read(&osdc->last_tid);
			
 
				 
			
 
				-	if (!RB_EMPTY_NODE(&event->node)) {
			
 
				-		dout("__remove_event removed %p\n", event);
			
 
				-		rb_erase(&event->node, &osdc->event_tree);
			
 
				-		ceph_osdc_put_event(event);
			
 
				-	} else {
			
 
				-		dout("__remove_event didn't remove %p\n", event);
			
 
				-	}
			
 
				-}
			
 
				+again:
			
 
				+	down_read(&osdc->lock);
			
 
				+	for (n = rb_first(&osdc->osds); n; n = rb_next(n)) {
			
 
				+		struct ceph_osd *osd = rb_entry(n, struct ceph_osd, o_node);
			
 
				 
			
 
				-int ceph_osdc_create_event(struct ceph_osd_client *osdc,
			
 
				-			   void (*event_cb)(u64, u64, u8, void *),
			
 
				-			   void *data, struct ceph_osd_event **pevent)
			
 
				-{
			
 
				-	struct ceph_osd_event *event;
			
 
				+		mutex_lock(&osd->lock);
			
 
				+		for (p = rb_first(&osd->o_requests); p; p = rb_next(p)) {
			
 
				+			struct ceph_osd_request *req =
			
 
				+			    rb_entry(p, struct ceph_osd_request, r_node);
			
 
				 
			
 
				-	event = kmalloc(sizeof(*event), GFP_NOIO);
			
 
				-	if (!event)
			
 
				-		return -ENOMEM;
			
 
				+			if (req->r_tid > last_tid)
			
 
				+				break;
			
 
				 
			
 
				-	dout("create_event %p\n", event);
			
 
				-	event->cb = event_cb;
			
 
				-	event->one_shot = 0;
			
 
				-	event->data = data;
			
 
				-	event->osdc = osdc;
			
 
				-	INIT_LIST_HEAD(&event->osd_node);
			
 
				-	RB_CLEAR_NODE(&event->node);
			
 
				-	kref_init(&event->kref);   /* one ref for us */
			
 
				-	kref_get(&event->kref);    /* one ref for the caller */
			
 
				-
			
 
				-	spin_lock(&osdc->event_lock);
			
 
				-	event->cookie = ++osdc->event_count;
			
 
				-	__insert_event(osdc, event);
			
 
				-	spin_unlock(&osdc->event_lock);
			
 
				-
			
 
				-	*pevent = event;
			
 
				-	return 0;
			
 
				+			if (!(req->r_flags & CEPH_OSD_FLAG_WRITE))
			
 
				+				continue;
			
 
				+
			
 
				+			ceph_osdc_get_request(req);
			
 
				+			mutex_unlock(&osd->lock);
			
 
				+			up_read(&osdc->lock);
			
 
				+			dout("%s waiting on req %p tid %llu last_tid %llu\n",
			
 
				+			     __func__, req, req->r_tid, last_tid);
			
 
				+			wait_for_completion(&req->r_safe_completion);
			
 
				+			ceph_osdc_put_request(req);
			
 
				+			goto again;
			
 
				+		}
			
 
				+
			
 
				+		mutex_unlock(&osd->lock);
			
 
				+	}
			
 
				+
			
 
				+	up_read(&osdc->lock);
			
 
				+	dout("%s done last_tid %llu\n", __func__, last_tid);
			
 
				 }
			
 
				-EXPORT_SYMBOL(ceph_osdc_create_event);
			
 
				+EXPORT_SYMBOL(ceph_osdc_sync);
			
 
				 
			
 
				-void ceph_osdc_cancel_event(struct ceph_osd_event *event)
			
 
				+static struct ceph_osd_request *
			
 
				+alloc_linger_request(struct ceph_osd_linger_request *lreq)
			
 
				 {
			
 
				-	struct ceph_osd_client *osdc = event->osdc;
			
 
				+	struct ceph_osd_request *req;
			
 
				 
			
 
				-	dout("cancel_event %p\n", event);
			
 
				-	spin_lock(&osdc->event_lock);
			
 
				-	__remove_event(event);
			
 
				-	spin_unlock(&osdc->event_lock);
			
 
				-	ceph_osdc_put_event(event); /* caller's */
			
 
				-}
			
 
				-EXPORT_SYMBOL(ceph_osdc_cancel_event);
			
 
				+	req = ceph_osdc_alloc_request(lreq->osdc, NULL, 1, false, GFP_NOIO);
			
 
				+	if (!req)
			
 
				+		return NULL;
			
 
				 
			
 
				+	ceph_oid_copy(&req->r_base_oid, &lreq->t.base_oid);
			
 
				+	ceph_oloc_copy(&req->r_base_oloc, &lreq->t.base_oloc);
			
 
				 
			
 
				-static void do_event_work(struct work_struct *work)
			
 
				-{
			
 
				-	struct ceph_osd_event_work *event_work =
			
 
				-		container_of(work, struct ceph_osd_event_work, work);
			
 
				-	struct ceph_osd_event *event = event_work->event;
			
 
				-	u64 ver = event_work->ver;
			
 
				-	u64 notify_id = event_work->notify_id;
			
 
				-	u8 opcode = event_work->opcode;
			
 
				+	if (ceph_osdc_alloc_messages(req, GFP_NOIO)) {
			
 
				+		ceph_osdc_put_request(req);
			
 
				+		return NULL;
			
 
				+	}
			
 
				 
			
 
				-	dout("do_event_work completing %p\n", event);
			
 
				-	event->cb(ver, notify_id, opcode, event->data);
			
 
				-	dout("do_event_work completed %p\n", event);
			
 
				-	ceph_osdc_put_event(event);
			
 
				-	kfree(event_work);
			
 
				+	return req;
			
 
				 }
			
 
				 
			
 
				-
			
 
				 /*
			
 
				- * Process osd watch notifications
			
 
				+ * Returns a handle, caller owns a ref.
			
 
				  */
			
 
				-static void handle_watch_notify(struct ceph_osd_client *osdc,
			
 
				-				struct ceph_msg *msg)
			
 
				+struct ceph_osd_linger_request *
			
 
				+ceph_osdc_watch(struct ceph_osd_client *osdc,
			
 
				+		struct ceph_object_id *oid,
			
 
				+		struct ceph_object_locator *oloc,
			
 
				+		rados_watchcb2_t wcb,
			
 
				+		rados_watcherrcb_t errcb,
			
 
				+		void *data)
			
 
				 {
			
 
				-	void *p, *end;
			
 
				-	u8 proto_ver;
			
 
				-	u64 cookie, ver, notify_id;
			
 
				-	u8 opcode;
			
 
				-	struct ceph_osd_event *event;
			
 
				-	struct ceph_osd_event_work *event_work;
			
 
				+	struct ceph_osd_linger_request *lreq;
			
 
				+	int ret;
			
 
				 
			
 
				-	p = msg->front.iov_base;
			
 
				-	end = p + msg->front.iov_len;
			
 
				+	lreq = linger_alloc(osdc);
			
 
				+	if (!lreq)
			
 
				+		return ERR_PTR(-ENOMEM);
			
 
				 
			
 
				-	ceph_decode_8_safe(&p, end, proto_ver, bad);
			
 
				-	ceph_decode_8_safe(&p, end, opcode, bad);
			
 
				-	ceph_decode_64_safe(&p, end, cookie, bad);
			
 
				-	ceph_decode_64_safe(&p, end, ver, bad);
			
 
				-	ceph_decode_64_safe(&p, end, notify_id, bad);
			
 
				+	lreq->is_watch = true;
			
 
				+	lreq->wcb = wcb;
			
 
				+	lreq->errcb = errcb;
			
 
				+	lreq->data = data;
			
 
				+	lreq->watch_valid_thru = jiffies;
			
 
				+
			
 
				+	ceph_oid_copy(&lreq->t.base_oid, oid);
			
 
				+	ceph_oloc_copy(&lreq->t.base_oloc, oloc);
			
 
				+	lreq->t.flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK;
			
 
				+	lreq->mtime = CURRENT_TIME;
			
 
				+
			
 
				+	lreq->reg_req = alloc_linger_request(lreq);
			
 
				+	if (!lreq->reg_req) {
			
 
				+		ret = -ENOMEM;
			
 
				+		goto err_put_lreq;
			
 
				+	}
			
 
				 
			
 
				-	spin_lock(&osdc->event_lock);
			
 
				-	event = __find_event(osdc, cookie);
			
 
				-	if (event) {
			
 
				-		BUG_ON(event->one_shot);
			
 
				-		get_event(event);
			
 
				-	}
			
 
				-	spin_unlock(&osdc->event_lock);
			
 
				-	dout("handle_watch_notify cookie %lld ver %lld event %p\n",
			
 
				-	     cookie, ver, event);
			
 
				-	if (event) {
			
 
				-		event_work = kmalloc(sizeof(*event_work), GFP_NOIO);
			
 
				-		if (!event_work) {
			
 
				-			pr_err("couldn't allocate event_work\n");
			
 
				-			ceph_osdc_put_event(event);
			
 
				-			return;
			
 
				-		}
			
 
				-		INIT_WORK(&event_work->work, do_event_work);
			
 
				-		event_work->event = event;
			
 
				-		event_work->ver = ver;
			
 
				-		event_work->notify_id = notify_id;
			
 
				-		event_work->opcode = opcode;
			
 
				+	lreq->ping_req = alloc_linger_request(lreq);
			
 
				+	if (!lreq->ping_req) {
			
 
				+		ret = -ENOMEM;
			
 
				+		goto err_put_lreq;
			
 
				+	}
			
 
				 
			
 
				-		queue_work(osdc->notify_wq, &event_work->work);
			
 
				+	down_write(&osdc->lock);
			
 
				+	linger_register(lreq); /* before osd_req_op_* */
			
 
				+	osd_req_op_watch_init(lreq->reg_req, 0, lreq->linger_id,
			
 
				+			      CEPH_OSD_WATCH_OP_WATCH);
			
 
				+	osd_req_op_watch_init(lreq->ping_req, 0, lreq->linger_id,
			
 
				+			      CEPH_OSD_WATCH_OP_PING);
			
 
				+	linger_submit(lreq);
			
 
				+	up_write(&osdc->lock);
			
 
				+
			
 
				+	ret = linger_reg_commit_wait(lreq);
			
 
				+	if (ret) {
			
 
				+		linger_cancel(lreq);
			
 
				+		goto err_put_lreq;
			
 
				 	}
			
 
				 
			
 
				-	return;
			
 
				+	return lreq;
			
 
				 
			
 
				-bad:
			
 
				-	pr_err("osdc handle_watch_notify corrupt msg\n");
			
 
				+err_put_lreq:
			
 
				+	linger_put(lreq);
			
 
				+	return ERR_PTR(ret);
			
 
				 }
			
 
				+EXPORT_SYMBOL(ceph_osdc_watch);
			
 
				 
			
 
				 /*
			
 
				- * build new request AND message
			
 
				+ * Releases a ref.
			
 
				  *
			
 
				+ * Times out after mount_timeout to preserve rbd unmap behaviour
			
 
				+ * introduced in 2894e1d76974 ("rbd: timeout watch teardown on unmap
			
 
				+ * with mount_timeout").
			
 
				  */
			
 
				-void ceph_osdc_build_request(struct ceph_osd_request *req, u64 off,
			
 
				-				struct ceph_snap_context *snapc, u64 snap_id,
			
 
				-				struct timespec *mtime)
			
 
				-{
			
 
				-	struct ceph_msg *msg = req->r_request;
			
 
				-	void *p;
			
 
				-	size_t msg_size;
			
 
				-	int flags = req->r_flags;
			
 
				-	u64 data_len;
			
 
				-	unsigned int i;
			
 
				-
			
 
				-	req->r_snapid = snap_id;
			
 
				-	req->r_snapc = ceph_get_snap_context(snapc);
			
 
				+int ceph_osdc_unwatch(struct ceph_osd_client *osdc,
			
 
				+		      struct ceph_osd_linger_request *lreq)
			
 
				+{
			
 
				+	struct ceph_options *opts = osdc->client->options;
			
 
				+	struct ceph_osd_request *req;
			
 
				+	int ret;
			
 
				 
			
 
				-	/* encode request */
			
 
				-	msg->hdr.version = cpu_to_le16(4);
			
 
				-
			
 
				-	p = msg->front.iov_base;
			
 
				-	ceph_encode_32(&p, 1);   /* client_inc  is always 1 */
			
 
				-	req->r_request_osdmap_epoch = p;
			
 
				-	p += 4;
			
 
				-	req->r_request_flags = p;
			
 
				-	p += 4;
			
 
				-	if (req->r_flags & CEPH_OSD_FLAG_WRITE)
			
 
				-		ceph_encode_timespec(p, mtime);
			
 
				-	p += sizeof(struct ceph_timespec);
			
 
				-	req->r_request_reassert_version = p;
			
 
				-	p += sizeof(struct ceph_eversion); /* will get filled in */
			
 
				+	req = ceph_osdc_alloc_request(osdc, NULL, 1, false, GFP_NOIO);
			
 
				+	if (!req)
			
 
				+		return -ENOMEM;
			
 
				 
			
 
				-	/* oloc */
			
 
				-	ceph_encode_8(&p, 4);
			
 
				-	ceph_encode_8(&p, 4);
			
 
				-	ceph_encode_32(&p, 8 + 4 + 4);
			
 
				-	req->r_request_pool = p;
			
 
				-	p += 8;
			
 
				-	ceph_encode_32(&p, -1);  /* preferred */
			
 
				-	ceph_encode_32(&p, 0);   /* key len */
			
 
				+	ceph_oid_copy(&req->r_base_oid, &lreq->t.base_oid);
			
 
				+	ceph_oloc_copy(&req->r_base_oloc, &lreq->t.base_oloc);
			
 
				+	req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK;
			
 
				+	req->r_mtime = CURRENT_TIME;
			
 
				+	osd_req_op_watch_init(req, 0, lreq->linger_id,
			
 
				+			      CEPH_OSD_WATCH_OP_UNWATCH);
			
 
				 
			
 
				-	ceph_encode_8(&p, 1);
			
 
				-	req->r_request_pgid = p;
			
 
				-	p += 8 + 4;
			
 
				-	ceph_encode_32(&p, -1);  /* preferred */
			
 
				+	ret = ceph_osdc_alloc_messages(req, GFP_NOIO);
			
 
				+	if (ret)
			
 
				+		goto out_put_req;
			
 
				 
			
 
				-	/* oid */
			
 
				-	ceph_encode_32(&p, req->r_base_oid.name_len);
			
 
				-	memcpy(p, req->r_base_oid.name, req->r_base_oid.name_len);
			
 
				-	dout("oid '%.*s' len %d\n", req->r_base_oid.name_len,
			
 
				-	     req->r_base_oid.name, req->r_base_oid.name_len);
			
 
				-	p += req->r_base_oid.name_len;
			
 
				-
			
 
				-	/* ops--can imply data */
			
 
				-	ceph_encode_16(&p, (u16)req->r_num_ops);
			
 
				-	data_len = 0;
			
 
				-	for (i = 0; i < req->r_num_ops; i++) {
			
 
				-		data_len += osd_req_encode_op(req, p, i);
			
 
				-		p += sizeof(struct ceph_osd_op);
			
 
				-	}
			
 
				+	ceph_osdc_start_request(osdc, req, false);
			
 
				+	linger_cancel(lreq);
			
 
				+	linger_put(lreq);
			
 
				+	ret = wait_request_timeout(req, opts->mount_timeout);
			
 
				 
			
 
				-	/* snaps */
			
 
				-	ceph_encode_64(&p, req->r_snapid);
			
 
				-	ceph_encode_64(&p, req->r_snapc ? req->r_snapc->seq : 0);
			
 
				-	ceph_encode_32(&p, req->r_snapc ? req->r_snapc->num_snaps : 0);
			
 
				-	if (req->r_snapc) {
			
 
				-		for (i = 0; i < snapc->num_snaps; i++) {
			
 
				-			ceph_encode_64(&p, req->r_snapc->snaps[i]);
			
 
				-		}
			
 
				-	}
			
 
				+out_put_req:
			
 
				+	ceph_osdc_put_request(req);
			
 
				+	return ret;
			
 
				+}
			
 
				+EXPORT_SYMBOL(ceph_osdc_unwatch);
			
 
				 
			
 
				-	req->r_request_attempts = p;
			
 
				-	p += 4;
			
 
				+static int osd_req_op_notify_ack_init(struct ceph_osd_request *req, int which,
			
 
				+				      u64 notify_id, u64 cookie, void *payload,
			
 
				+				      size_t payload_len)
			
 
				+{
			
 
				+	struct ceph_osd_req_op *op;
			
 
				+	struct ceph_pagelist *pl;
			
 
				+	int ret;
			
 
				 
			
 
				-	/* data */
			
 
				-	if (flags & CEPH_OSD_FLAG_WRITE) {
			
 
				-		u16 data_off;
			
 
				+	op = _osd_req_op_init(req, which, CEPH_OSD_OP_NOTIFY_ACK, 0);
			
 
				 
			
 
				-		/*
			
 
				-		 * The header "data_off" is a hint to the receiver
			
 
				-		 * allowing it to align received data into its
			
 
				-		 * buffers such that there's no need to re-copy
			
 
				-		 * it before writing it to disk (direct I/O).
			
 
				-		 */
			
 
				-		data_off = (u16) (off & 0xffff);
			
 
				-		req->r_request->hdr.data_off = cpu_to_le16(data_off);
			
 
				-	}
			
 
				-	req->r_request->hdr.data_len = cpu_to_le32(data_len);
			
 
				+	pl = kmalloc(sizeof(*pl), GFP_NOIO);
			
 
				+	if (!pl)
			
 
				+		return -ENOMEM;
			
 
				 
			
 
				-	BUG_ON(p > msg->front.iov_base + msg->front.iov_len);
			
 
				-	msg_size = p - msg->front.iov_base;
			
 
				-	msg->front.iov_len = msg_size;
			
 
				-	msg->hdr.front_len = cpu_to_le32(msg_size);
			
 
				+	ceph_pagelist_init(pl);
			
 
				+	ret = ceph_pagelist_encode_64(pl, notify_id);
			
 
				+	ret |= ceph_pagelist_encode_64(pl, cookie);
			
 
				+	if (payload) {
			
 
				+		ret |= ceph_pagelist_encode_32(pl, payload_len);
			
 
				+		ret |= ceph_pagelist_append(pl, payload, payload_len);
			
 
				+	} else {
			
 
				+		ret |= ceph_pagelist_encode_32(pl, 0);
			
 
				+	}
			
 
				+	if (ret) {
			
 
				+		ceph_pagelist_release(pl);
			
 
				+		return -ENOMEM;
			
 
				+	}
			
 
				 
			
 
				-	dout("build_request msg_size was %d\n", (int)msg_size);
			
 
				+	ceph_osd_data_pagelist_init(&op->notify_ack.request_data, pl);
			
 
				+	op->indata_len = pl->length;
			
 
				+	return 0;
			
 
				 }
			
 
				-EXPORT_SYMBOL(ceph_osdc_build_request);
			
 
				 
			
 
				-/*
			
 
				- * Register request, send initial attempt.
			
 
				- */
			
 
				-int ceph_osdc_start_request(struct ceph_osd_client *osdc,
			
 
				-			    struct ceph_osd_request *req,
			
 
				-			    bool nofail)
			
 
				+int ceph_osdc_notify_ack(struct ceph_osd_client *osdc,
			
 
				+			 struct ceph_object_id *oid,
			
 
				+			 struct ceph_object_locator *oloc,
			
 
				+			 u64 notify_id,
			
 
				+			 u64 cookie,
			
 
				+			 void *payload,
			
 
				+			 size_t payload_len)
			
 
				 {
			
 
				-	int rc;
			
 
				+	struct ceph_osd_request *req;
			
 
				+	int ret;
			
 
				+
			
 
				+	req = ceph_osdc_alloc_request(osdc, NULL, 1, false, GFP_NOIO);
			
 
				+	if (!req)
			
 
				+		return -ENOMEM;
			
 
				 
			
 
				-	down_read(&osdc->map_sem);
			
 
				-	mutex_lock(&osdc->request_mutex);
			
 
				+	ceph_oid_copy(&req->r_base_oid, oid);
			
 
				+	ceph_oloc_copy(&req->r_base_oloc, oloc);
			
 
				+	req->r_flags = CEPH_OSD_FLAG_READ;
			
 
				 
			
 
				-	rc = __ceph_osdc_start_request(osdc, req, nofail);
			
 
				+	ret = ceph_osdc_alloc_messages(req, GFP_NOIO);
			
 
				+	if (ret)
			
 
				+		goto out_put_req;
			
 
				+
			
 
				+	ret = osd_req_op_notify_ack_init(req, 0, notify_id, cookie, payload,
			
 
				+					 payload_len);
			
 
				+	if (ret)
			
 
				+		goto out_put_req;
			
 
				 
			
 
				-	mutex_unlock(&osdc->request_mutex);
			
 
				-	up_read(&osdc->map_sem);
			
 
				+	ceph_osdc_start_request(osdc, req, false);
			
 
				+	ret = ceph_osdc_wait_request(osdc, req);
			
 
				 
			
 
				-	return rc;
			
 
				+out_put_req:
			
 
				+	ceph_osdc_put_request(req);
			
 
				+	return ret;
			
 
				 }
			
 
				-EXPORT_SYMBOL(ceph_osdc_start_request);
			
 
				+EXPORT_SYMBOL(ceph_osdc_notify_ack);
			
 
				 
			
 
				-/*
			
 
				- * Unregister a registered request.  The request is not completed (i.e.
			
 
				- * no callbacks or wakeups) - higher layers are supposed to know what
			
 
				- * they are canceling.
			
 
				- */
			
 
				-void ceph_osdc_cancel_request(struct ceph_osd_request *req)
			
 
				+static int osd_req_op_notify_init(struct ceph_osd_request *req, int which,
			
 
				+				  u64 cookie, u32 prot_ver, u32 timeout,
			
 
				+				  void *payload, size_t payload_len)
			
 
				 {
			
 
				-	struct ceph_osd_client *osdc = req->r_osdc;
			
 
				+	struct ceph_osd_req_op *op;
			
 
				+	struct ceph_pagelist *pl;
			
 
				+	int ret;
			
 
				 
			
 
				-	mutex_lock(&osdc->request_mutex);
			
 
				-	if (req->r_linger)
			
 
				-		__unregister_linger_request(osdc, req);
			
 
				-	__unregister_request(osdc, req);
			
 
				-	mutex_unlock(&osdc->request_mutex);
			
 
				+	op = _osd_req_op_init(req, which, CEPH_OSD_OP_NOTIFY, 0);
			
 
				+	op->notify.cookie = cookie;
			
 
				 
			
 
				-	dout("%s %p tid %llu canceled\n", __func__, req, req->r_tid);
			
 
				+	pl = kmalloc(sizeof(*pl), GFP_NOIO);
			
 
				+	if (!pl)
			
 
				+		return -ENOMEM;
			
 
				+
			
 
				+	ceph_pagelist_init(pl);
			
 
				+	ret = ceph_pagelist_encode_32(pl, 1); /* prot_ver */
			
 
				+	ret |= ceph_pagelist_encode_32(pl, timeout);
			
 
				+	ret |= ceph_pagelist_encode_32(pl, payload_len);
			
 
				+	ret |= ceph_pagelist_append(pl, payload, payload_len);
			
 
				+	if (ret) {
			
 
				+		ceph_pagelist_release(pl);
			
 
				+		return -ENOMEM;
			
 
				+	}
			
 
				+
			
 
				+	ceph_osd_data_pagelist_init(&op->notify.request_data, pl);
			
 
				+	op->indata_len = pl->length;
			
 
				+	return 0;
			
 
				 }
			
 
				-EXPORT_SYMBOL(ceph_osdc_cancel_request);
			
 
				 
			
 
				 /*
			
 
				- * wait for a request to complete
			
 
				+ * @timeout: in seconds
			
 
				+ *
			
 
				+ * @preply_{pages,len} are initialized both on success and error.
			
 
				+ * The caller is responsible for:
			
 
				+ *
			
 
				+ *     ceph_release_page_vector(reply_pages, calc_pages_for(0, reply_len))
			
 
				  */
			
 
				-int ceph_osdc_wait_request(struct ceph_osd_client *osdc,
			
 
				-			   struct ceph_osd_request *req)
			
 
				+int ceph_osdc_notify(struct ceph_osd_client *osdc,
			
 
				+		     struct ceph_object_id *oid,
			
 
				+		     struct ceph_object_locator *oloc,
			
 
				+		     void *payload,
			
 
				+		     size_t payload_len,
			
 
				+		     u32 timeout,
			
 
				+		     struct page ***preply_pages,
			
 
				+		     size_t *preply_len)
			
 
				 {
			
 
				-	int rc;
			
 
				+	struct ceph_osd_linger_request *lreq;
			
 
				+	struct page **pages;
			
 
				+	int ret;
			
 
				 
			
 
				-	dout("%s %p tid %llu\n", __func__, req, req->r_tid);
			
 
				+	WARN_ON(!timeout);
			
 
				+	if (preply_pages) {
			
 
				+		*preply_pages = NULL;
			
 
				+		*preply_len = 0;
			
 
				+	}
			
 
				 
			
 
				-	rc = wait_for_completion_interruptible(&req->r_completion);
			
 
				-	if (rc < 0) {
			
 
				-		dout("%s %p tid %llu interrupted\n", __func__, req, req->r_tid);
			
 
				-		ceph_osdc_cancel_request(req);
			
 
				-		complete_request(req);
			
 
				-		return rc;
			
 
				+	lreq = linger_alloc(osdc);
			
 
				+	if (!lreq)
			
 
				+		return -ENOMEM;
			
 
				+
			
 
				+	lreq->preply_pages = preply_pages;
			
 
				+	lreq->preply_len = preply_len;
			
 
				+
			
 
				+	ceph_oid_copy(&lreq->t.base_oid, oid);
			
 
				+	ceph_oloc_copy(&lreq->t.base_oloc, oloc);
			
 
				+	lreq->t.flags = CEPH_OSD_FLAG_READ;
			
 
				+
			
 
				+	lreq->reg_req = alloc_linger_request(lreq);
			
 
				+	if (!lreq->reg_req) {
			
 
				+		ret = -ENOMEM;
			
 
				+		goto out_put_lreq;
			
 
				+	}
			
 
				+
			
 
				+	/* for notify_id */
			
 
				+	pages = ceph_alloc_page_vector(1, GFP_NOIO);
			
 
				+	if (IS_ERR(pages)) {
			
 
				+		ret = PTR_ERR(pages);
			
 
				+		goto out_put_lreq;
			
 
				+	}
			
 
				+
			
 
				+	down_write(&osdc->lock);
			
 
				+	linger_register(lreq); /* before osd_req_op_* */
			
 
				+	ret = osd_req_op_notify_init(lreq->reg_req, 0, lreq->linger_id, 1,
			
 
				+				     timeout, payload, payload_len);
			
 
				+	if (ret) {
			
 
				+		linger_unregister(lreq);
			
 
				+		up_write(&osdc->lock);
			
 
				+		ceph_release_page_vector(pages, 1);
			
 
				+		goto out_put_lreq;
			
 
				 	}
			
 
				+	ceph_osd_data_pages_init(osd_req_op_data(lreq->reg_req, 0, notify,
			
 
				+						 response_data),
			
 
				+				 pages, PAGE_SIZE, 0, false, true);
			
 
				+	linger_submit(lreq);
			
 
				+	up_write(&osdc->lock);
			
 
				+
			
 
				+	ret = linger_reg_commit_wait(lreq);
			
 
				+	if (!ret)
			
 
				+		ret = linger_notify_finish_wait(lreq);
			
 
				+	else
			
 
				+		dout("lreq %p failed to initiate notify %d\n", lreq, ret);
			
 
				 
			
 
				-	dout("%s %p tid %llu result %d\n", __func__, req, req->r_tid,
			
 
				-	     req->r_result);
			
 
				-	return req->r_result;
			
 
				+	linger_cancel(lreq);
			
 
				+out_put_lreq:
			
 
				+	linger_put(lreq);
			
 
				+	return ret;
			
 
				 }
			
 
				-EXPORT_SYMBOL(ceph_osdc_wait_request);
			
 
				+EXPORT_SYMBOL(ceph_osdc_notify);
			
 
				 
			
 
				 /*
			
 
				- * sync - wait for all in-flight requests to flush.  avoid starvation.
			
 
				+ * Return the number of milliseconds since the watch was last
			
 
				+ * confirmed, or an error.  If there is an error, the watch is no
			
 
				+ * longer valid, and should be destroyed with ceph_osdc_unwatch().
			
 
				  */
			
 
				-void ceph_osdc_sync(struct ceph_osd_client *osdc)
			
 
				+int ceph_osdc_watch_check(struct ceph_osd_client *osdc,
			
 
				+			  struct ceph_osd_linger_request *lreq)
			
 
				 {
			
 
				-	struct ceph_osd_request *req;
			
 
				-	u64 last_tid, next_tid = 0;
			
 
				-
			
 
				-	mutex_lock(&osdc->request_mutex);
			
 
				-	last_tid = osdc->last_tid;
			
 
				-	while (1) {
			
 
				-		req = __lookup_request_ge(osdc, next_tid);
			
 
				-		if (!req)
			
 
				-			break;
			
 
				-		if (req->r_tid > last_tid)
			
 
				-			break;
			
 
				-
			
 
				-		next_tid = req->r_tid + 1;
			
 
				-		if ((req->r_flags & CEPH_OSD_FLAG_WRITE) == 0)
			
 
				-			continue;
			
 
				+	unsigned long stamp, age;
			
 
				+	int ret;
			
 
				 
			
 
				-		ceph_osdc_get_request(req);
			
 
				-		mutex_unlock(&osdc->request_mutex);
			
 
				-		dout("sync waiting on tid %llu (last is %llu)\n",
			
 
				-		     req->r_tid, last_tid);
			
 
				-		wait_for_completion(&req->r_safe_completion);
			
 
				-		mutex_lock(&osdc->request_mutex);
			
 
				-		ceph_osdc_put_request(req);
			
 
				+	down_read(&osdc->lock);
			
 
				+	mutex_lock(&lreq->lock);
			
 
				+	stamp = lreq->watch_valid_thru;
			
 
				+	if (!list_empty(&lreq->pending_lworks)) {
			
 
				+		struct linger_work *lwork =
			
 
				+		    list_first_entry(&lreq->pending_lworks,
			
 
				+				     struct linger_work,
			
 
				+				     pending_item);
			
 
				+
			
 
				+		if (time_before(lwork->queued_stamp, stamp))
			
 
				+			stamp = lwork->queued_stamp;
			
 
				 	}
			
 
				-	mutex_unlock(&osdc->request_mutex);
			
 
				-	dout("sync done (thru tid %llu)\n", last_tid);
			
 
				+	age = jiffies - stamp;
			
 
				+	dout("%s lreq %p linger_id %llu age %lu last_error %d\n", __func__,
			
 
				+	     lreq, lreq->linger_id, age, lreq->last_error);
			
 
				+	/* we are truncating to msecs, so return a safe upper bound */
			
 
				+	ret = lreq->last_error ?: 1 + jiffies_to_msecs(age);
			
 
				+
			
 
				+	mutex_unlock(&lreq->lock);
			
 
				+	up_read(&osdc->lock);
			
 
				+	return ret;
			
 
				 }
			
 
				-EXPORT_SYMBOL(ceph_osdc_sync);
			
 
				 
			
 
				 /*
			
 
				  * Call all pending notify callbacks - for use after a watch is
			
@@ -2646,6 +3869,13 @@ void ceph_osdc_flush_notifies(struct ceph_osd_client *osdc)
 
				 }
			
 
				 EXPORT_SYMBOL(ceph_osdc_flush_notifies);
			
 
				 
			
 
				+void ceph_osdc_maybe_request_map(struct ceph_osd_client *osdc)
			
 
				+{
			
 
				+	down_read(&osdc->lock);
			
 
				+	maybe_request_map(osdc);
			
 
				+	up_read(&osdc->lock);
			
 
				+}
			
 
				+EXPORT_SYMBOL(ceph_osdc_maybe_request_map);
			
 
				 
			
 
				 /*
			
 
				  * init, shutdown
			
@@ -2656,43 +3886,35 @@ int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client)
 
				 
			
 
				 	dout("init\n");
			
 
				 	osdc->client = client;
			
 
				-	osdc->osdmap = NULL;
			
 
				-	init_rwsem(&osdc->map_sem);
			
 
				-	init_completion(&osdc->map_waiters);
			
 
				-	osdc->last_requested_map = 0;
			
 
				-	mutex_init(&osdc->request_mutex);
			
 
				-	osdc->last_tid = 0;
			
 
				+	init_rwsem(&osdc->lock);
			
 
				 	osdc->osds = RB_ROOT;
			
 
				 	INIT_LIST_HEAD(&osdc->osd_lru);
			
 
				-	osdc->requests = RB_ROOT;
			
 
				-	INIT_LIST_HEAD(&osdc->req_lru);
			
 
				-	INIT_LIST_HEAD(&osdc->req_unsent);
			
 
				-	INIT_LIST_HEAD(&osdc->req_notarget);
			
 
				-	INIT_LIST_HEAD(&osdc->req_linger);
			
 
				-	osdc->num_requests = 0;
			
 
				+	spin_lock_init(&osdc->osd_lru_lock);
			
 
				+	osd_init(&osdc->homeless_osd);
			
 
				+	osdc->homeless_osd.o_osdc = osdc;
			
 
				+	osdc->homeless_osd.o_osd = CEPH_HOMELESS_OSD;
			
 
				+	osdc->linger_requests = RB_ROOT;
			
 
				+	osdc->map_checks = RB_ROOT;
			
 
				+	osdc->linger_map_checks = RB_ROOT;
			
 
				 	INIT_DELAYED_WORK(&osdc->timeout_work, handle_timeout);
			
 
				 	INIT_DELAYED_WORK(&osdc->osds_timeout_work, handle_osds_timeout);
			
 
				-	spin_lock_init(&osdc->event_lock);
			
 
				-	osdc->event_tree = RB_ROOT;
			
 
				-	osdc->event_count = 0;
			
 
				-
			
 
				-	schedule_delayed_work(&osdc->osds_timeout_work,
			
 
				-	    round_jiffies_relative(osdc->client->options->osd_idle_ttl));
			
 
				 
			
 
				 	err = -ENOMEM;
			
 
				+	osdc->osdmap = ceph_osdmap_alloc();
			
 
				+	if (!osdc->osdmap)
			
 
				+		goto out;
			
 
				+
			
 
				 	osdc->req_mempool = mempool_create_slab_pool(10,
			
 
				 						     ceph_osd_request_cache);
			
 
				 	if (!osdc->req_mempool)
			
 
				-		goto out;
			
 
				+		goto out_map;
			
 
				 
			
 
				 	err = ceph_msgpool_init(&osdc->msgpool_op, CEPH_MSG_OSD_OP,
			
 
				-				OSD_OP_FRONT_LEN, 10, true,
			
 
				-				"osd_op");
			
 
				+				PAGE_SIZE, 10, true, "osd_op");
			
 
				 	if (err < 0)
			
 
				 		goto out_mempool;
			
 
				 	err = ceph_msgpool_init(&osdc->msgpool_op_reply, CEPH_MSG_OSD_OPREPLY,
			
 
				-				OSD_OPREPLY_FRONT_LEN, 10, true,
			
 
				-				"osd_op_reply");
			
 
				+				PAGE_SIZE, 10, true, "osd_op_reply");
			
 
				 	if (err < 0)
			
 
				 		goto out_msgpool;
			
 
				 
			
@@ -2701,6 +3923,11 @@ int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client)
 
				 	if (!osdc->notify_wq)
			
 
				 		goto out_msgpool_reply;
			
 
				 
			
 
				+	schedule_delayed_work(&osdc->timeout_work,
			
 
				+			      osdc->client->options->osd_keepalive_timeout);
			
 
				+	schedule_delayed_work(&osdc->osds_timeout_work,
			
 
				+	    round_jiffies_relative(osdc->client->options->osd_idle_ttl));
			
 
				+
			
 
				 	return 0;
			
 
				 
			
 
				 out_msgpool_reply:
			
@@ -2709,6 +3936,8 @@ int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client)
 
				 	ceph_msgpool_destroy(&osdc->msgpool_op);
			
 
				 out_mempool:
			
 
				 	mempool_destroy(osdc->req_mempool);
			
 
				+out_map:
			
 
				+	ceph_osdmap_destroy(osdc->osdmap);
			
 
				 out:
			
 
				 	return err;
			
 
				 }
			
@@ -2719,11 +3948,25 @@ void ceph_osdc_stop(struct ceph_osd_client *osdc)
 
				 	destroy_workqueue(osdc->notify_wq);
			
 
				 	cancel_delayed_work_sync(&osdc->timeout_work);
			
 
				 	cancel_delayed_work_sync(&osdc->osds_timeout_work);
			
 
				-	if (osdc->osdmap) {
			
 
				-		ceph_osdmap_destroy(osdc->osdmap);
			
 
				-		osdc->osdmap = NULL;
			
 
				+
			
 
				+	down_write(&osdc->lock);
			
 
				+	while (!RB_EMPTY_ROOT(&osdc->osds)) {
			
 
				+		struct ceph_osd *osd = rb_entry(rb_first(&osdc->osds),
			
 
				+						struct ceph_osd, o_node);
			
 
				+		close_osd(osd);
			
 
				 	}
			
 
				-	remove_all_osds(osdc);
			
 
				+	up_write(&osdc->lock);
			
 
				+	WARN_ON(atomic_read(&osdc->homeless_osd.o_ref) != 1);
			
 
				+	osd_cleanup(&osdc->homeless_osd);
			
 
				+
			
 
				+	WARN_ON(!list_empty(&osdc->osd_lru));
			
 
				+	WARN_ON(!RB_EMPTY_ROOT(&osdc->linger_requests));
			
 
				+	WARN_ON(!RB_EMPTY_ROOT(&osdc->map_checks));
			
 
				+	WARN_ON(!RB_EMPTY_ROOT(&osdc->linger_map_checks));
			
 
				+	WARN_ON(atomic_read(&osdc->num_requests));
			
 
				+	WARN_ON(atomic_read(&osdc->num_homeless));
			
 
				+
			
 
				+	ceph_osdmap_destroy(osdc->osdmap);
			
 
				 	mempool_destroy(osdc->req_mempool);
			
 
				 	ceph_msgpool_destroy(&osdc->msgpool_op);
			
 
				 	ceph_msgpool_destroy(&osdc->msgpool_op_reply);
			
@@ -2752,15 +3995,12 @@ int ceph_osdc_readpages(struct ceph_osd_client *osdc,
 
				 		return PTR_ERR(req);
			
 
				 
			
 
				 	/* it may be a short read due to an object boundary */
			
 
				-
			
 
				 	osd_req_op_extent_osd_data_pages(req, 0,
			
 
				 				pages, *plen, page_align, false, false);
			
 
				 
			
 
				 	dout("readpages  final extent is %llu~%llu (%llu bytes align %d)\n",
			
 
				 	     off, *plen, *plen, page_align);
			
 
				 
			
 
				-	ceph_osdc_build_request(req, off, NULL, vino.snap, NULL);
			
 
				-
			
 
				 	rc = ceph_osdc_start_request(osdc, req, false);
			
 
				 	if (!rc)
			
 
				 		rc = ceph_osdc_wait_request(osdc, req);
			
@@ -2786,7 +4026,6 @@ int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino,
 
				 	int rc = 0;
			
 
				 	int page_align = off & ~PAGE_MASK;
			
 
				 
			
 
				-	BUG_ON(vino.snap != CEPH_NOSNAP);	/* snapshots aren't writeable */
			
 
				 	req = ceph_osdc_new_request(osdc, layout, vino, off, &len, 0, 1,
			
 
				 				    CEPH_OSD_OP_WRITE,
			
 
				 				    CEPH_OSD_FLAG_ONDISK | CEPH_OSD_FLAG_WRITE,
			
@@ -2800,8 +4039,7 @@ int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino,
 
				 				false, false);
			
 
				 	dout("writepages %llu~%llu (%llu bytes)\n", off, len, len);
			
 
				 
			
 
				-	ceph_osdc_build_request(req, off, snapc, CEPH_NOSNAP, mtime);
			
 
				-
			
 
				+	req->r_mtime = *mtime;
			
 
				 	rc = ceph_osdc_start_request(osdc, req, true);
			
 
				 	if (!rc)
			
 
				 		rc = ceph_osdc_wait_request(osdc, req);
			
@@ -2841,19 +4079,15 @@ EXPORT_SYMBOL(ceph_osdc_cleanup);
 
				 static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
			
 
				 {
			
 
				 	struct ceph_osd *osd = con->private;
			
 
				-	struct ceph_osd_client *osdc;
			
 
				+	struct ceph_osd_client *osdc = osd->o_osdc;
			
 
				 	int type = le16_to_cpu(msg->hdr.type);
			
 
				 
			
 
				-	if (!osd)
			
 
				-		goto out;
			
 
				-	osdc = osd->o_osdc;
			
 
				-
			
 
				 	switch (type) {
			
 
				 	case CEPH_MSG_OSD_MAP:
			
 
				 		ceph_osdc_handle_map(osdc, msg);
			
 
				 		break;
			
 
				 	case CEPH_MSG_OSD_OPREPLY:
			
 
				-		handle_reply(osdc, msg);
			
 
				+		handle_reply(osd, msg);
			
 
				 		break;
			
 
				 	case CEPH_MSG_WATCH_NOTIFY:
			
 
				 		handle_watch_notify(osdc, msg);
			
@@ -2863,7 +4097,7 @@ static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
 
				 		pr_err("received unknown message type %d %s\n", type,
			
 
				 		       ceph_msg_type_name(type));
			
 
				 	}
			
 
				-out:
			
 
				+
			
 
				 	ceph_msg_put(msg);
			
 
				 }
			
 
				 
			
@@ -2878,21 +4112,27 @@ static struct ceph_msg *get_reply(struct ceph_connection *con,
 
				 {
			
 
				 	struct ceph_osd *osd = con->private;
			
 
				 	struct ceph_osd_client *osdc = osd->o_osdc;
			
 
				-	struct ceph_msg *m;
			
 
				+	struct ceph_msg *m = NULL;
			
 
				 	struct ceph_osd_request *req;
			
 
				 	int front_len = le32_to_cpu(hdr->front_len);
			
 
				 	int data_len = le32_to_cpu(hdr->data_len);
			
 
				-	u64 tid;
			
 
				+	u64 tid = le64_to_cpu(hdr->tid);
			
 
				+
			
 
				+	down_read(&osdc->lock);
			
 
				+	if (!osd_registered(osd)) {
			
 
				+		dout("%s osd%d unknown, skipping\n", __func__, osd->o_osd);
			
 
				+		*skip = 1;
			
 
				+		goto out_unlock_osdc;
			
 
				+	}
			
 
				+	WARN_ON(osd->o_osd != le64_to_cpu(hdr->src.num));
			
 
				 
			
 
				-	tid = le64_to_cpu(hdr->tid);
			
 
				-	mutex_lock(&osdc->request_mutex);
			
 
				-	req = __lookup_request(osdc, tid);
			
 
				+	mutex_lock(&osd->lock);
			
 
				+	req = lookup_request(&osd->o_requests, tid);
			
 
				 	if (!req) {
			
 
				 		dout("%s osd%d tid %llu unknown, skipping\n", __func__,
			
 
				 		     osd->o_osd, tid);
			
 
				-		m = NULL;
			
 
				 		*skip = 1;
			
 
				-		goto out;
			
 
				+		goto out_unlock_session;
			
 
				 	}
			
 
				 
			
 
				 	ceph_msg_revoke_incoming(req->r_reply);
			
@@ -2904,7 +4144,7 @@ static struct ceph_msg *get_reply(struct ceph_connection *con,
 
				 		m = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, front_len, GFP_NOFS,
			
 
				 				 false);
			
 
				 		if (!m)
			
 
				-			goto out;
			
 
				+			goto out_unlock_session;
			
 
				 		ceph_msg_put(req->r_reply);
			
 
				 		req->r_reply = m;
			
 
				 	}
			
@@ -2915,14 +4155,49 @@ static struct ceph_msg *get_reply(struct ceph_connection *con,
 
				 			req->r_reply->data_length);
			
 
				 		m = NULL;
			
 
				 		*skip = 1;
			
 
				-		goto out;
			
 
				+		goto out_unlock_session;
			
 
				 	}
			
 
				 
			
 
				 	m = ceph_msg_get(req->r_reply);
			
 
				 	dout("get_reply tid %lld %p\n", tid, m);
			
 
				 
			
 
				-out:
			
 
				-	mutex_unlock(&osdc->request_mutex);
			
 
				+out_unlock_session:
			
 
				+	mutex_unlock(&osd->lock);
			
 
				+out_unlock_osdc:
			
 
				+	up_read(&osdc->lock);
			
 
				+	return m;
			
 
				+}
			
 
				+
			
 
				+/*
			
 
				+ * TODO: switch to a msg-owned pagelist
			
 
				+ */
			
 
				+static struct ceph_msg *alloc_msg_with_page_vector(struct ceph_msg_header *hdr)
			
 
				+{
			
 
				+	struct ceph_msg *m;
			
 
				+	int type = le16_to_cpu(hdr->type);
			
 
				+	u32 front_len = le32_to_cpu(hdr->front_len);
			
 
				+	u32 data_len = le32_to_cpu(hdr->data_len);
			
 
				+
			
 
				+	m = ceph_msg_new(type, front_len, GFP_NOIO, false);
			
 
				+	if (!m)
			
 
				+		return NULL;
			
 
				+
			
 
				+	if (data_len) {
			
 
				+		struct page **pages;
			
 
				+		struct ceph_osd_data osd_data;
			
 
				+
			
 
				+		pages = ceph_alloc_page_vector(calc_pages_for(0, data_len),
			
 
				+					       GFP_NOIO);
			
 
				+		if (!pages) {
			
 
				+			ceph_msg_put(m);
			
 
				+			return NULL;
			
 
				+		}
			
 
				+
			
 
				+		ceph_osd_data_pages_init(&osd_data, pages, data_len, 0, false,
			
 
				+					 false);
			
 
				+		ceph_osdc_msg_data_add(m, &osd_data);
			
 
				+	}
			
 
				+
			
 
				 	return m;
			
 
				 }
			
 
				 
			
@@ -2932,18 +4207,17 @@ static struct ceph_msg *alloc_msg(struct ceph_connection *con,
 
				 {
			
 
				 	struct ceph_osd *osd = con->private;
			
 
				 	int type = le16_to_cpu(hdr->type);
			
 
				-	int front = le32_to_cpu(hdr->front_len);
			
 
				 
			
 
				 	*skip = 0;
			
 
				 	switch (type) {
			
 
				 	case CEPH_MSG_OSD_MAP:
			
 
				 	case CEPH_MSG_WATCH_NOTIFY:
			
 
				-		return ceph_msg_new(type, front, GFP_NOFS, false);
			
 
				+		return alloc_msg_with_page_vector(hdr);
			
 
				 	case CEPH_MSG_OSD_OPREPLY:
			
 
				 		return get_reply(con, hdr, skip);
			
 
				 	default:
			
 
				-		pr_info("alloc_msg unexpected msg type %d from osd%d\n", type,
			
 
				-			osd->o_osd);
			
 
				+		pr_warn("%s osd%d unknown msg type %d, skipping\n", __func__,
			
 
				+			osd->o_osd, type);
			
 
				 		*skip = 1;
			
 
				 		return NULL;
			
 
				 	}
			
@@ -3047,5 +4321,5 @@ static const struct ceph_connection_operations osd_con_ops = {
 
				 	.alloc_msg = alloc_msg,
			
 
				 	.sign_message = osd_sign_message,
			
 
				 	.check_message_signature = osd_check_message_signature,
			
 
				-	.fault = osd_reset,
			
 
				+	.fault = osd_fault,
			
 
				 };
			
--- a/net/ceph/osdmap.c
+++ b/net/ceph/osdmap.c
@@ -380,23 +380,24 @@ static struct crush_map *crush_decode(void *pbyval, void *end)
 
				 	return ERR_PTR(err);
			
 
				 }
			
 
				 
			
 
				-/*
			
 
				- * rbtree of pg_mapping for handling pg_temp (explicit mapping of pgid
			
 
				- * to a set of osds) and primary_temp (explicit primary setting)
			
 
				- */
			
 
				-static int pgid_cmp(struct ceph_pg l, struct ceph_pg r)
			
 
				+int ceph_pg_compare(const struct ceph_pg *lhs, const struct ceph_pg *rhs)
			
 
				 {
			
 
				-	if (l.pool < r.pool)
			
 
				+	if (lhs->pool < rhs->pool)
			
 
				 		return -1;
			
 
				-	if (l.pool > r.pool)
			
 
				+	if (lhs->pool > rhs->pool)
			
 
				 		return 1;
			
 
				-	if (l.seed < r.seed)
			
 
				+	if (lhs->seed < rhs->seed)
			
 
				 		return -1;
			
 
				-	if (l.seed > r.seed)
			
 
				+	if (lhs->seed > rhs->seed)
			
 
				 		return 1;
			
 
				+
			
 
				 	return 0;
			
 
				 }
			
 
				 
			
 
				+/*
			
 
				+ * rbtree of pg_mapping for handling pg_temp (explicit mapping of pgid
			
 
				+ * to a set of osds) and primary_temp (explicit primary setting)
			
 
				+ */
			
 
				 static int __insert_pg_mapping(struct ceph_pg_mapping *new,
			
 
				 			       struct rb_root *root)
			
 
				 {
			
@@ -409,7 +410,7 @@ static int __insert_pg_mapping(struct ceph_pg_mapping *new,
 
				 	while (*p) {
			
 
				 		parent = *p;
			
 
				 		pg = rb_entry(parent, struct ceph_pg_mapping, node);
			
 
				-		c = pgid_cmp(new->pgid, pg->pgid);
			
 
				+		c = ceph_pg_compare(&new->pgid, &pg->pgid);
			
 
				 		if (c < 0)
			
 
				 			p = &(*p)->rb_left;
			
 
				 		else if (c > 0)
			
@@ -432,7 +433,7 @@ static struct ceph_pg_mapping *__lookup_pg_mapping(struct rb_root *root,
 
				 
			
 
				 	while (n) {
			
 
				 		pg = rb_entry(n, struct ceph_pg_mapping, node);
			
 
				-		c = pgid_cmp(pgid, pg->pgid);
			
 
				+		c = ceph_pg_compare(&pgid, &pg->pgid);
			
 
				 		if (c < 0) {
			
 
				 			n = n->rb_left;
			
 
				 		} else if (c > 0) {
			
@@ -596,7 +597,9 @@ static int decode_pool(void **p, void *end, struct ceph_pg_pool_info *pi)
 
				 	*p += 4;  /* skip crash_replay_interval */
			
 
				 
			
 
				 	if (ev >= 7)
			
 
				-		*p += 1;  /* skip min_size */
			
 
				+		pi->min_size = ceph_decode_8(p);
			
 
				+	else
			
 
				+		pi->min_size = pi->size - pi->size / 2;
			
 
				 
			
 
				 	if (ev >= 8)
			
 
				 		*p += 8 + 8;  /* skip quota_max_* */
			
@@ -616,6 +619,50 @@ static int decode_pool(void **p, void *end, struct ceph_pg_pool_info *pi)
 
				 		pi->write_tier = -1;
			
 
				 	}
			
 
				 
			
 
				+	if (ev >= 10) {
			
 
				+		/* skip properties */
			
 
				+		num = ceph_decode_32(p);
			
 
				+		while (num--) {
			
 
				+			len = ceph_decode_32(p);
			
 
				+			*p += len; /* key */
			
 
				+			len = ceph_decode_32(p);
			
 
				+			*p += len; /* val */
			
 
				+		}
			
 
				+	}
			
 
				+
			
 
				+	if (ev >= 11) {
			
 
				+		/* skip hit_set_params */
			
 
				+		*p += 1 + 1; /* versions */
			
 
				+		len = ceph_decode_32(p);
			
 
				+		*p += len;
			
 
				+
			
 
				+		*p += 4; /* skip hit_set_period */
			
 
				+		*p += 4; /* skip hit_set_count */
			
 
				+	}
			
 
				+
			
 
				+	if (ev >= 12)
			
 
				+		*p += 4; /* skip stripe_width */
			
 
				+
			
 
				+	if (ev >= 13) {
			
 
				+		*p += 8; /* skip target_max_bytes */
			
 
				+		*p += 8; /* skip target_max_objects */
			
 
				+		*p += 4; /* skip cache_target_dirty_ratio_micro */
			
 
				+		*p += 4; /* skip cache_target_full_ratio_micro */
			
 
				+		*p += 4; /* skip cache_min_flush_age */
			
 
				+		*p += 4; /* skip cache_min_evict_age */
			
 
				+	}
			
 
				+
			
 
				+	if (ev >=  14) {
			
 
				+		/* skip erasure_code_profile */
			
 
				+		len = ceph_decode_32(p);
			
 
				+		*p += len;
			
 
				+	}
			
 
				+
			
 
				+	if (ev >= 15)
			
 
				+		pi->last_force_request_resend = ceph_decode_32(p);
			
 
				+	else
			
 
				+		pi->last_force_request_resend = 0;
			
 
				+
			
 
				 	/* ignore the rest */
			
 
				 
			
 
				 	*p = pool_end;
			
@@ -660,6 +707,23 @@ static int decode_pool_names(void **p, void *end, struct ceph_osdmap *map)
 
				 /*
			
 
				  * osd map
			
 
				  */
			
 
				+struct ceph_osdmap *ceph_osdmap_alloc(void)
			
 
				+{
			
 
				+	struct ceph_osdmap *map;
			
 
				+
			
 
				+	map = kzalloc(sizeof(*map), GFP_NOIO);
			
 
				+	if (!map)
			
 
				+		return NULL;
			
 
				+
			
 
				+	map->pg_pools = RB_ROOT;
			
 
				+	map->pool_max = -1;
			
 
				+	map->pg_temp = RB_ROOT;
			
 
				+	map->primary_temp = RB_ROOT;
			
 
				+	mutex_init(&map->crush_scratch_mutex);
			
 
				+
			
 
				+	return map;
			
 
				+}
			
 
				+
			
 
				 void ceph_osdmap_destroy(struct ceph_osdmap *map)
			
 
				 {
			
 
				 	dout("osdmap_destroy %p\n", map);
			
@@ -1183,14 +1247,10 @@ struct ceph_osdmap *ceph_osdmap_decode(void **p, void *end)
 
				 	struct ceph_osdmap *map;
			
 
				 	int ret;
			
 
				 
			
 
				-	map = kzalloc(sizeof(*map), GFP_NOFS);
			
 
				+	map = ceph_osdmap_alloc();
			
 
				 	if (!map)
			
 
				 		return ERR_PTR(-ENOMEM);
			
 
				 
			
 
				-	map->pg_temp = RB_ROOT;
			
 
				-	map->primary_temp = RB_ROOT;
			
 
				-	mutex_init(&map->crush_scratch_mutex);
			
 
				-
			
 
				 	ret = osdmap_decode(p, end, map);
			
 
				 	if (ret) {
			
 
				 		ceph_osdmap_destroy(map);
			
@@ -1204,8 +1264,7 @@ struct ceph_osdmap *ceph_osdmap_decode(void **p, void *end)
 
				  * decode and apply an incremental map update.
			
 
				  */
			
 
				 struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
			
 
				-					     struct ceph_osdmap *map,
			
 
				-					     struct ceph_messenger *msgr)
			
 
				+					     struct ceph_osdmap *map)
			
 
				 {
			
 
				 	struct crush_map *newcrush = NULL;
			
 
				 	struct ceph_fsid fsid;
			
@@ -1381,8 +1440,252 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
 
				 	return ERR_PTR(err);
			
 
				 }
			
 
				 
			
 
				+void ceph_oid_copy(struct ceph_object_id *dest,
			
 
				+		   const struct ceph_object_id *src)
			
 
				+{
			
 
				+	WARN_ON(!ceph_oid_empty(dest));
			
 
				+
			
 
				+	if (src->name != src->inline_name) {
			
 
				+		/* very rare, see ceph_object_id definition */
			
 
				+		dest->name = kmalloc(src->name_len + 1,
			
 
				+				     GFP_NOIO | __GFP_NOFAIL);
			
 
				+	}
			
 
				+
			
 
				+	memcpy(dest->name, src->name, src->name_len + 1);
			
 
				+	dest->name_len = src->name_len;
			
 
				+}
			
 
				+EXPORT_SYMBOL(ceph_oid_copy);
			
 
				+
			
 
				+static __printf(2, 0)
			
 
				+int oid_printf_vargs(struct ceph_object_id *oid, const char *fmt, va_list ap)
			
 
				+{
			
 
				+	int len;
			
 
				+
			
 
				+	WARN_ON(!ceph_oid_empty(oid));
			
 
				+
			
 
				+	len = vsnprintf(oid->inline_name, sizeof(oid->inline_name), fmt, ap);
			
 
				+	if (len >= sizeof(oid->inline_name))
			
 
				+		return len;
			
 
				+
			
 
				+	oid->name_len = len;
			
 
				+	return 0;
			
 
				+}
			
 
				+
			
 
				+/*
			
 
				+ * If oid doesn't fit into inline buffer, BUG.
			
 
				+ */
			
 
				+void ceph_oid_printf(struct ceph_object_id *oid, const char *fmt, ...)
			
 
				+{
			
 
				+	va_list ap;
			
 
				+
			
 
				+	va_start(ap, fmt);
			
 
				+	BUG_ON(oid_printf_vargs(oid, fmt, ap));
			
 
				+	va_end(ap);
			
 
				+}
			
 
				+EXPORT_SYMBOL(ceph_oid_printf);
			
 
				+
			
 
				+static __printf(3, 0)
			
 
				+int oid_aprintf_vargs(struct ceph_object_id *oid, gfp_t gfp,
			
 
				+		      const char *fmt, va_list ap)
			
 
				+{
			
 
				+	va_list aq;
			
 
				+	int len;
			
 
				+
			
 
				+	va_copy(aq, ap);
			
 
				+	len = oid_printf_vargs(oid, fmt, aq);
			
 
				+	va_end(aq);
			
 
				+
			
 
				+	if (len) {
			
 
				+		char *external_name;
			
 
				+
			
 
				+		external_name = kmalloc(len + 1, gfp);
			
 
				+		if (!external_name)
			
 
				+			return -ENOMEM;
			
 
				+
			
 
				+		oid->name = external_name;
			
 
				+		WARN_ON(vsnprintf(oid->name, len + 1, fmt, ap) != len);
			
 
				+		oid->name_len = len;
			
 
				+	}
			
 
				+
			
 
				+	return 0;
			
 
				+}
			
 
				+
			
 
				+/*
			
 
				+ * If oid doesn't fit into inline buffer, allocate.
			
 
				+ */
			
 
				+int ceph_oid_aprintf(struct ceph_object_id *oid, gfp_t gfp,
			
 
				+		     const char *fmt, ...)
			
 
				+{
			
 
				+	va_list ap;
			
 
				+	int ret;
			
 
				+
			
 
				+	va_start(ap, fmt);
			
 
				+	ret = oid_aprintf_vargs(oid, gfp, fmt, ap);
			
 
				+	va_end(ap);
			
 
				+
			
 
				+	return ret;
			
 
				+}
			
 
				+EXPORT_SYMBOL(ceph_oid_aprintf);
			
 
				+
			
 
				+void ceph_oid_destroy(struct ceph_object_id *oid)
			
 
				+{
			
 
				+	if (oid->name != oid->inline_name)
			
 
				+		kfree(oid->name);
			
 
				+}
			
 
				+EXPORT_SYMBOL(ceph_oid_destroy);
			
 
				+
			
 
				+/*
			
 
				+ * osds only
			
 
				+ */
			
 
				+static bool __osds_equal(const struct ceph_osds *lhs,
			
 
				+			 const struct ceph_osds *rhs)
			
 
				+{
			
 
				+	if (lhs->size == rhs->size &&
			
 
				+	    !memcmp(lhs->osds, rhs->osds, rhs->size * sizeof(rhs->osds[0])))
			
 
				+		return true;
			
 
				+
			
 
				+	return false;
			
 
				+}
			
 
				+
			
 
				+/*
			
 
				+ * osds + primary
			
 
				+ */
			
 
				+static bool osds_equal(const struct ceph_osds *lhs,
			
 
				+		       const struct ceph_osds *rhs)
			
 
				+{
			
 
				+	if (__osds_equal(lhs, rhs) &&
			
 
				+	    lhs->primary == rhs->primary)
			
 
				+		return true;
			
 
				+
			
 
				+	return false;
			
 
				+}
			
 
				+
			
 
				+static bool osds_valid(const struct ceph_osds *set)
			
 
				+{
			
 
				+	/* non-empty set */
			
 
				+	if (set->size > 0 && set->primary >= 0)
			
 
				+		return true;
			
 
				+
			
 
				+	/* empty can_shift_osds set */
			
 
				+	if (!set->size && set->primary == -1)
			
 
				+		return true;
			
 
				+
			
 
				+	/* empty !can_shift_osds set - all NONE */
			
 
				+	if (set->size > 0 && set->primary == -1) {
			
 
				+		int i;
			
 
				+
			
 
				+		for (i = 0; i < set->size; i++) {
			
 
				+			if (set->osds[i] != CRUSH_ITEM_NONE)
			
 
				+				break;
			
 
				+		}
			
 
				+		if (i == set->size)
			
 
				+			return true;
			
 
				+	}
			
 
				+
			
 
				+	return false;
			
 
				+}
			
 
				+
			
 
				+void ceph_osds_copy(struct ceph_osds *dest, const struct ceph_osds *src)
			
 
				+{
			
 
				+	memcpy(dest->osds, src->osds, src->size * sizeof(src->osds[0]));
			
 
				+	dest->size = src->size;
			
 
				+	dest->primary = src->primary;
			
 
				+}
			
 
				+
			
 
				+static bool is_split(const struct ceph_pg *pgid,
			
 
				+		     u32 old_pg_num,
			
 
				+		     u32 new_pg_num)
			
 
				+{
			
 
				+	int old_bits = calc_bits_of(old_pg_num);
			
 
				+	int old_mask = (1 << old_bits) - 1;
			
 
				+	int n;
			
 
				+
			
 
				+	WARN_ON(pgid->seed >= old_pg_num);
			
 
				+	if (new_pg_num <= old_pg_num)
			
 
				+		return false;
			
 
				+
			
 
				+	for (n = 1; ; n++) {
			
 
				+		int next_bit = n << (old_bits - 1);
			
 
				+		u32 s = next_bit | pgid->seed;
			
 
				+
			
 
				+		if (s < old_pg_num || s == pgid->seed)
			
 
				+			continue;
			
 
				+		if (s >= new_pg_num)
			
 
				+			break;
			
 
				+
			
 
				+		s = ceph_stable_mod(s, old_pg_num, old_mask);
			
 
				+		if (s == pgid->seed)
			
 
				+			return true;
			
 
				+	}
			
 
				+
			
 
				+	return false;
			
 
				+}
			
 
				+
			
 
				+bool ceph_is_new_interval(const struct ceph_osds *old_acting,
			
 
				+			  const struct ceph_osds *new_acting,
			
 
				+			  const struct ceph_osds *old_up,
			
 
				+			  const struct ceph_osds *new_up,
			
 
				+			  int old_size,
			
 
				+			  int new_size,
			
 
				+			  int old_min_size,
			
 
				+			  int new_min_size,
			
 
				+			  u32 old_pg_num,
			
 
				+			  u32 new_pg_num,
			
 
				+			  bool old_sort_bitwise,
			
 
				+			  bool new_sort_bitwise,
			
 
				+			  const struct ceph_pg *pgid)
			
 
				+{
			
 
				+	return !osds_equal(old_acting, new_acting) ||
			
 
				+	       !osds_equal(old_up, new_up) ||
			
 
				+	       old_size != new_size ||
			
 
				+	       old_min_size != new_min_size ||
			
 
				+	       is_split(pgid, old_pg_num, new_pg_num) ||
			
 
				+	       old_sort_bitwise != new_sort_bitwise;
			
 
				+}
			
 
				+
			
 
				+static int calc_pg_rank(int osd, const struct ceph_osds *acting)
			
 
				+{
			
 
				+	int i;
			
 
				+
			
 
				+	for (i = 0; i < acting->size; i++) {
			
 
				+		if (acting->osds[i] == osd)
			
 
				+			return i;
			
 
				+	}
			
 
				+
			
 
				+	return -1;
			
 
				+}
			
 
				+
			
 
				+static bool primary_changed(const struct ceph_osds *old_acting,
			
 
				+			    const struct ceph_osds *new_acting)
			
 
				+{
			
 
				+	if (!old_acting->size && !new_acting->size)
			
 
				+		return false; /* both still empty */
			
 
				 
			
 
				+	if (!old_acting->size ^ !new_acting->size)
			
 
				+		return true; /* was empty, now not, or vice versa */
			
 
				 
			
 
				+	if (old_acting->primary != new_acting->primary)
			
 
				+		return true; /* primary changed */
			
 
				+
			
 
				+	if (calc_pg_rank(old_acting->primary, old_acting) !=
			
 
				+	    calc_pg_rank(new_acting->primary, new_acting))
			
 
				+		return true;
			
 
				+
			
 
				+	return false; /* same primary (tho replicas may have changed) */
			
 
				+}
			
 
				+
			
 
				+bool ceph_osds_changed(const struct ceph_osds *old_acting,
			
 
				+		       const struct ceph_osds *new_acting,
			
 
				+		       bool any_change)
			
 
				+{
			
 
				+	if (primary_changed(old_acting, new_acting))
			
 
				+		return true;
			
 
				+
			
 
				+	if (any_change && !__osds_equal(old_acting, new_acting))
			
 
				+		return true;
			
 
				+
			
 
				+	return false;
			
 
				+}
			
 
				 
			
 
				 /*
			
 
				  * calculate file layout from given offset, length.
			
@@ -1455,30 +1758,71 @@ int ceph_calc_file_object_mapping(struct ceph_file_layout *layout,
 
				 EXPORT_SYMBOL(ceph_calc_file_object_mapping);
			
 
				 
			
 
				 /*
			
 
				- * Calculate mapping of a (oloc, oid) pair to a PG.  Should only be
			
 
				- * called with target's (oloc, oid), since tiering isn't taken into
			
 
				- * account.
			
 
				+ * Map an object into a PG.
			
 
				+ *
			
 
				+ * Should only be called with target_oid and target_oloc (as opposed to
			
 
				+ * base_oid and base_oloc), since tiering isn't taken into account.
			
 
				  */
			
 
				-int ceph_oloc_oid_to_pg(struct ceph_osdmap *osdmap,
			
 
				-			struct ceph_object_locator *oloc,
			
 
				-			struct ceph_object_id *oid,
			
 
				-			struct ceph_pg *pg_out)
			
 
				+int ceph_object_locator_to_pg(struct ceph_osdmap *osdmap,
			
 
				+			      struct ceph_object_id *oid,
			
 
				+			      struct ceph_object_locator *oloc,
			
 
				+			      struct ceph_pg *raw_pgid)
			
 
				 {
			
 
				 	struct ceph_pg_pool_info *pi;
			
 
				 
			
 
				-	pi = __lookup_pg_pool(&osdmap->pg_pools, oloc->pool);
			
 
				+	pi = ceph_pg_pool_by_id(osdmap, oloc->pool);
			
 
				 	if (!pi)
			
 
				-		return -EIO;
			
 
				+		return -ENOENT;
			
 
				 
			
 
				-	pg_out->pool = oloc->pool;
			
 
				-	pg_out->seed = ceph_str_hash(pi->object_hash, oid->name,
			
 
				-				     oid->name_len);
			
 
				+	raw_pgid->pool = oloc->pool;
			
 
				+	raw_pgid->seed = ceph_str_hash(pi->object_hash, oid->name,
			
 
				+				       oid->name_len);
			
 
				 
			
 
				-	dout("%s '%.*s' pgid %llu.%x\n", __func__, oid->name_len, oid->name,
			
 
				-	     pg_out->pool, pg_out->seed);
			
 
				+	dout("%s %*pE -> raw_pgid %llu.%x\n", __func__, oid->name_len,
			
 
				+	     oid->name, raw_pgid->pool, raw_pgid->seed);
			
 
				 	return 0;
			
 
				 }
			
 
				-EXPORT_SYMBOL(ceph_oloc_oid_to_pg);
			
 
				+EXPORT_SYMBOL(ceph_object_locator_to_pg);
			
 
				+
			
 
				+/*
			
 
				+ * Map a raw PG (full precision ps) into an actual PG.
			
 
				+ */
			
 
				+static void raw_pg_to_pg(struct ceph_pg_pool_info *pi,
			
 
				+			 const struct ceph_pg *raw_pgid,
			
 
				+			 struct ceph_pg *pgid)
			
 
				+{
			
 
				+	pgid->pool = raw_pgid->pool;
			
 
				+	pgid->seed = ceph_stable_mod(raw_pgid->seed, pi->pg_num,
			
 
				+				     pi->pg_num_mask);
			
 
				+}
			
 
				+
			
 
				+/*
			
 
				+ * Map a raw PG (full precision ps) into a placement ps (placement
			
 
				+ * seed).  Include pool id in that value so that different pools don't
			
 
				+ * use the same seeds.
			
 
				+ */
			
 
				+static u32 raw_pg_to_pps(struct ceph_pg_pool_info *pi,
			
 
				+			 const struct ceph_pg *raw_pgid)
			
 
				+{
			
 
				+	if (pi->flags & CEPH_POOL_FLAG_HASHPSPOOL) {
			
 
				+		/* hash pool id and seed so that pool PGs do not overlap */
			
 
				+		return crush_hash32_2(CRUSH_HASH_RJENKINS1,
			
 
				+				      ceph_stable_mod(raw_pgid->seed,
			
 
				+						      pi->pgp_num,
			
 
				+						      pi->pgp_num_mask),
			
 
				+				      raw_pgid->pool);
			
 
				+	} else {
			
 
				+		/*
			
 
				+		 * legacy behavior: add ps and pool together.  this is
			
 
				+		 * not a great approach because the PGs from each pool
			
 
				+		 * will overlap on top of each other: 0.5 == 1.4 ==
			
 
				+		 * 2.3 == ...
			
 
				+		 */
			
 
				+		return ceph_stable_mod(raw_pgid->seed, pi->pgp_num,
			
 
				+				       pi->pgp_num_mask) +
			
 
				+		       (unsigned)raw_pgid->pool;
			
 
				+	}
			
 
				+}
			
 
				 
			
 
				 static int do_crush(struct ceph_osdmap *map, int ruleno, int x,
			
 
				 		    int *result, int result_max,
			
@@ -1497,84 +1841,92 @@ static int do_crush(struct ceph_osdmap *map, int ruleno, int x,
 
				 }
			
 
				 
			
 
				 /*
			
 
				- * Calculate raw (crush) set for given pgid.
			
 
				+ * Calculate raw set (CRUSH output) for given PG.  The result may
			
 
				+ * contain nonexistent OSDs.  ->primary is undefined for a raw set.
			
 
				  *
			
 
				- * Return raw set length, or error.
			
 
				+ * Placement seed (CRUSH input) is returned through @ppps.
			
 
				  */
			
 
				-static int pg_to_raw_osds(struct ceph_osdmap *osdmap,
			
 
				-			  struct ceph_pg_pool_info *pool,
			
 
				-			  struct ceph_pg pgid, u32 pps, int *osds)
			
 
				+static void pg_to_raw_osds(struct ceph_osdmap *osdmap,
			
 
				+			   struct ceph_pg_pool_info *pi,
			
 
				+			   const struct ceph_pg *raw_pgid,
			
 
				+			   struct ceph_osds *raw,
			
 
				+			   u32 *ppps)
			
 
				 {
			
 
				+	u32 pps = raw_pg_to_pps(pi, raw_pgid);
			
 
				 	int ruleno;
			
 
				 	int len;
			
 
				 
			
 
				-	/* crush */
			
 
				-	ruleno = crush_find_rule(osdmap->crush, pool->crush_ruleset,
			
 
				-				 pool->type, pool->size);
			
 
				+	ceph_osds_init(raw);
			
 
				+	if (ppps)
			
 
				+		*ppps = pps;
			
 
				+
			
 
				+	ruleno = crush_find_rule(osdmap->crush, pi->crush_ruleset, pi->type,
			
 
				+				 pi->size);
			
 
				 	if (ruleno < 0) {
			
 
				 		pr_err("no crush rule: pool %lld ruleset %d type %d size %d\n",
			
 
				-		       pgid.pool, pool->crush_ruleset, pool->type,
			
 
				-		       pool->size);
			
 
				-		return -ENOENT;
			
 
				+		       pi->id, pi->crush_ruleset, pi->type, pi->size);
			
 
				+		return;
			
 
				 	}
			
 
				 
			
 
				-	len = do_crush(osdmap, ruleno, pps, osds,
			
 
				-		       min_t(int, pool->size, CEPH_PG_MAX_SIZE),
			
 
				+	len = do_crush(osdmap, ruleno, pps, raw->osds,
			
 
				+		       min_t(int, pi->size, ARRAY_SIZE(raw->osds)),
			
 
				 		       osdmap->osd_weight, osdmap->max_osd);
			
 
				 	if (len < 0) {
			
 
				 		pr_err("error %d from crush rule %d: pool %lld ruleset %d type %d size %d\n",
			
 
				-		       len, ruleno, pgid.pool, pool->crush_ruleset,
			
 
				-		       pool->type, pool->size);
			
 
				-		return len;
			
 
				+		       len, ruleno, pi->id, pi->crush_ruleset, pi->type,
			
 
				+		       pi->size);
			
 
				+		return;
			
 
				 	}
			
 
				 
			
 
				-	return len;
			
 
				+	raw->size = len;
			
 
				 }
			
 
				 
			
 
				 /*
			
 
				- * Given raw set, calculate up set and up primary.
			
 
				+ * Given raw set, calculate up set and up primary.  By definition of an
			
 
				+ * up set, the result won't contain nonexistent or down OSDs.
			
 
				  *
			
 
				- * Return up set length.  *primary is set to up primary osd id, or -1
			
 
				- * if up set is empty.
			
 
				+ * This is done in-place - on return @set is the up set.  If it's
			
 
				+ * empty, ->primary will remain undefined.
			
 
				  */
			
 
				-static int raw_to_up_osds(struct ceph_osdmap *osdmap,
			
 
				-			  struct ceph_pg_pool_info *pool,
			
 
				-			  int *osds, int len, int *primary)
			
 
				+static void raw_to_up_osds(struct ceph_osdmap *osdmap,
			
 
				+			   struct ceph_pg_pool_info *pi,
			
 
				+			   struct ceph_osds *set)
			
 
				 {
			
 
				-	int up_primary = -1;
			
 
				 	int i;
			
 
				 
			
 
				-	if (ceph_can_shift_osds(pool)) {
			
 
				+	/* ->primary is undefined for a raw set */
			
 
				+	BUG_ON(set->primary != -1);
			
 
				+
			
 
				+	if (ceph_can_shift_osds(pi)) {
			
 
				 		int removed = 0;
			
 
				 
			
 
				-		for (i = 0; i < len; i++) {
			
 
				-			if (ceph_osd_is_down(osdmap, osds[i])) {
			
 
				+		/* shift left */
			
 
				+		for (i = 0; i < set->size; i++) {
			
 
				+			if (ceph_osd_is_down(osdmap, set->osds[i])) {
			
 
				 				removed++;
			
 
				 				continue;
			
 
				 			}
			
 
				 			if (removed)
			
 
				-				osds[i - removed] = osds[i];
			
 
				+				set->osds[i - removed] = set->osds[i];
			
 
				 		}
			
 
				-
			
 
				-		len -= removed;
			
 
				-		if (len > 0)
			
 
				-			up_primary = osds[0];
			
 
				+		set->size -= removed;
			
 
				+		if (set->size > 0)
			
 
				+			set->primary = set->osds[0];
			
 
				 	} else {
			
 
				-		for (i = len - 1; i >= 0; i--) {
			
 
				-			if (ceph_osd_is_down(osdmap, osds[i]))
			
 
				-				osds[i] = CRUSH_ITEM_NONE;
			
 
				+		/* set down/dne devices to NONE */
			
 
				+		for (i = set->size - 1; i >= 0; i--) {
			
 
				+			if (ceph_osd_is_down(osdmap, set->osds[i]))
			
 
				+				set->osds[i] = CRUSH_ITEM_NONE;
			
 
				 			else
			
 
				-				up_primary = osds[i];
			
 
				+				set->primary = set->osds[i];
			
 
				 		}
			
 
				 	}
			
 
				-
			
 
				-	*primary = up_primary;
			
 
				-	return len;
			
 
				 }
			
 
				 
			
 
				-static void apply_primary_affinity(struct ceph_osdmap *osdmap, u32 pps,
			
 
				-				   struct ceph_pg_pool_info *pool,
			
 
				-				   int *osds, int len, int *primary)
			
 
				+static void apply_primary_affinity(struct ceph_osdmap *osdmap,
			
 
				+				   struct ceph_pg_pool_info *pi,
			
 
				+				   u32 pps,
			
 
				+				   struct ceph_osds *up)
			
 
				 {
			
 
				 	int i;
			
 
				 	int pos = -1;
			
@@ -1586,8 +1938,8 @@ static void apply_primary_affinity(struct ceph_osdmap *osdmap, u32 pps,
 
				 	if (!osdmap->osd_primary_affinity)
			
 
				 		return;
			
 
				 
			
 
				-	for (i = 0; i < len; i++) {
			
 
				-		int osd = osds[i];
			
 
				+	for (i = 0; i < up->size; i++) {
			
 
				+		int osd = up->osds[i];
			
 
				 
			
 
				 		if (osd != CRUSH_ITEM_NONE &&
			
 
				 		    osdmap->osd_primary_affinity[osd] !=
			
@@ -1595,7 +1947,7 @@ static void apply_primary_affinity(struct ceph_osdmap *osdmap, u32 pps,
 
				 			break;
			
 
				 		}
			
 
				 	}
			
 
				-	if (i == len)
			
 
				+	if (i == up->size)
			
 
				 		return;
			
 
				 
			
 
				 	/*
			
@@ -1603,8 +1955,8 @@ static void apply_primary_affinity(struct ceph_osdmap *osdmap, u32 pps,
 
				 	 * osd into the hash/rng so that a proportional fraction of an
			
 
				 	 * osd's pgs get rejected as primary.
			
 
				 	 */
			
 
				-	for (i = 0; i < len; i++) {
			
 
				-		int osd = osds[i];
			
 
				+	for (i = 0; i < up->size; i++) {
			
 
				+		int osd = up->osds[i];
			
 
				 		u32 aff;
			
 
				 
			
 
				 		if (osd == CRUSH_ITEM_NONE)
			
@@ -1629,135 +1981,110 @@ static void apply_primary_affinity(struct ceph_osdmap *osdmap, u32 pps,
 
				 	if (pos < 0)
			
 
				 		return;
			
 
				 
			
 
				-	*primary = osds[pos];
			
 
				+	up->primary = up->osds[pos];
			
 
				 
			
 
				-	if (ceph_can_shift_osds(pool) && pos > 0) {
			
 
				+	if (ceph_can_shift_osds(pi) && pos > 0) {
			
 
				 		/* move the new primary to the front */
			
 
				 		for (i = pos; i > 0; i--)
			
 
				-			osds[i] = osds[i - 1];
			
 
				-		osds[0] = *primary;
			
 
				+			up->osds[i] = up->osds[i - 1];
			
 
				+		up->osds[0] = up->primary;
			
 
				 	}
			
 
				 }
			
 
				 
			
 
				 /*
			
 
				- * Given up set, apply pg_temp and primary_temp mappings.
			
 
				+ * Get pg_temp and primary_temp mappings for given PG.
			
 
				  *
			
 
				- * Return acting set length.  *primary is set to acting primary osd id,
			
 
				- * or -1 if acting set is empty.
			
 
				+ * Note that a PG may have none, only pg_temp, only primary_temp or
			
 
				+ * both pg_temp and primary_temp mappings.  This means @temp isn't
			
 
				+ * always a valid OSD set on return: in the "only primary_temp" case,
			
 
				+ * @temp will have its ->primary >= 0 but ->size == 0.
			
 
				  */
			
 
				-static int apply_temps(struct ceph_osdmap *osdmap,
			
 
				-		       struct ceph_pg_pool_info *pool, struct ceph_pg pgid,
			
 
				-		       int *osds, int len, int *primary)
			
 
				+static void get_temp_osds(struct ceph_osdmap *osdmap,
			
 
				+			  struct ceph_pg_pool_info *pi,
			
 
				+			  const struct ceph_pg *raw_pgid,
			
 
				+			  struct ceph_osds *temp)
			
 
				 {
			
 
				+	struct ceph_pg pgid;
			
 
				 	struct ceph_pg_mapping *pg;
			
 
				-	int temp_len;
			
 
				-	int temp_primary;
			
 
				 	int i;
			
 
				 
			
 
				-	/* raw_pg -> pg */
			
 
				-	pgid.seed = ceph_stable_mod(pgid.seed, pool->pg_num,
			
 
				-				    pool->pg_num_mask);
			
 
				+	raw_pg_to_pg(pi, raw_pgid, &pgid);
			
 
				+	ceph_osds_init(temp);
			
 
				 
			
 
				 	/* pg_temp? */
			
 
				 	pg = __lookup_pg_mapping(&osdmap->pg_temp, pgid);
			
 
				 	if (pg) {
			
 
				-		temp_len = 0;
			
 
				-		temp_primary = -1;
			
 
				-
			
 
				 		for (i = 0; i < pg->pg_temp.len; i++) {
			
 
				 			if (ceph_osd_is_down(osdmap, pg->pg_temp.osds[i])) {
			
 
				-				if (ceph_can_shift_osds(pool))
			
 
				+				if (ceph_can_shift_osds(pi))
			
 
				 					continue;
			
 
				-				else
			
 
				-					osds[temp_len++] = CRUSH_ITEM_NONE;
			
 
				+
			
 
				+				temp->osds[temp->size++] = CRUSH_ITEM_NONE;
			
 
				 			} else {
			
 
				-				osds[temp_len++] = pg->pg_temp.osds[i];
			
 
				+				temp->osds[temp->size++] = pg->pg_temp.osds[i];
			
 
				 			}
			
 
				 		}
			
 
				 
			
 
				 		/* apply pg_temp's primary */
			
 
				-		for (i = 0; i < temp_len; i++) {
			
 
				-			if (osds[i] != CRUSH_ITEM_NONE) {
			
 
				-				temp_primary = osds[i];
			
 
				+		for (i = 0; i < temp->size; i++) {
			
 
				+			if (temp->osds[i] != CRUSH_ITEM_NONE) {
			
 
				+				temp->primary = temp->osds[i];
			
 
				 				break;
			
 
				 			}
			
 
				 		}
			
 
				-	} else {
			
 
				-		temp_len = len;
			
 
				-		temp_primary = *primary;
			
 
				 	}
			
 
				 
			
 
				 	/* primary_temp? */
			
 
				 	pg = __lookup_pg_mapping(&osdmap->primary_temp, pgid);
			
 
				 	if (pg)
			
 
				-		temp_primary = pg->primary_temp.osd;
			
 
				-
			
 
				-	*primary = temp_primary;
			
 
				-	return temp_len;
			
 
				+		temp->primary = pg->primary_temp.osd;
			
 
				 }
			
 
				 
			
 
				 /*
			
 
				- * Calculate acting set for given pgid.
			
 
				+ * Map a PG to its acting set as well as its up set.
			
 
				  *
			
 
				- * Return acting set length, or error.  *primary is set to acting
			
 
				- * primary osd id, or -1 if acting set is empty or on error.
			
 
				+ * Acting set is used for data mapping purposes, while up set can be
			
 
				+ * recorded for detecting interval changes and deciding whether to
			
 
				+ * resend a request.
			
 
				  */
			
 
				-int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
			
 
				-			int *osds, int *primary)
			
 
				+void ceph_pg_to_up_acting_osds(struct ceph_osdmap *osdmap,
			
 
				+			       const struct ceph_pg *raw_pgid,
			
 
				+			       struct ceph_osds *up,
			
 
				+			       struct ceph_osds *acting)
			
 
				 {
			
 
				-	struct ceph_pg_pool_info *pool;
			
 
				+	struct ceph_pg_pool_info *pi;
			
 
				 	u32 pps;
			
 
				-	int len;
			
 
				 
			
 
				-	pool = __lookup_pg_pool(&osdmap->pg_pools, pgid.pool);
			
 
				-	if (!pool) {
			
 
				-		*primary = -1;
			
 
				-		return -ENOENT;
			
 
				+	pi = ceph_pg_pool_by_id(osdmap, raw_pgid->pool);
			
 
				+	if (!pi) {
			
 
				+		ceph_osds_init(up);
			
 
				+		ceph_osds_init(acting);
			
 
				+		goto out;
			
 
				 	}
			
 
				 
			
 
				-	if (pool->flags & CEPH_POOL_FLAG_HASHPSPOOL) {
			
 
				-		/* hash pool id and seed so that pool PGs do not overlap */
			
 
				-		pps = crush_hash32_2(CRUSH_HASH_RJENKINS1,
			
 
				-				     ceph_stable_mod(pgid.seed, pool->pgp_num,
			
 
				-						     pool->pgp_num_mask),
			
 
				-				     pgid.pool);
			
 
				-	} else {
			
 
				-		/*
			
 
				-		 * legacy behavior: add ps and pool together.  this is
			
 
				-		 * not a great approach because the PGs from each pool
			
 
				-		 * will overlap on top of each other: 0.5 == 1.4 ==
			
 
				-		 * 2.3 == ...
			
 
				-		 */
			
 
				-		pps = ceph_stable_mod(pgid.seed, pool->pgp_num,
			
 
				-				      pool->pgp_num_mask) +
			
 
				-			(unsigned)pgid.pool;
			
 
				-	}
			
 
				-
			
 
				-	len = pg_to_raw_osds(osdmap, pool, pgid, pps, osds);
			
 
				-	if (len < 0) {
			
 
				-		*primary = -1;
			
 
				-		return len;
			
 
				+	pg_to_raw_osds(osdmap, pi, raw_pgid, up, &pps);
			
 
				+	raw_to_up_osds(osdmap, pi, up);
			
 
				+	apply_primary_affinity(osdmap, pi, pps, up);
			
 
				+	get_temp_osds(osdmap, pi, raw_pgid, acting);
			
 
				+	if (!acting->size) {
			
 
				+		memcpy(acting->osds, up->osds, up->size * sizeof(up->osds[0]));
			
 
				+		acting->size = up->size;
			
 
				+		if (acting->primary == -1)
			
 
				+			acting->primary = up->primary;
			
 
				 	}
			
 
				-
			
 
				-	len = raw_to_up_osds(osdmap, pool, osds, len, primary);
			
 
				-
			
 
				-	apply_primary_affinity(osdmap, pps, pool, osds, len, primary);
			
 
				-
			
 
				-	len = apply_temps(osdmap, pool, pgid, osds, len, primary);
			
 
				-
			
 
				-	return len;
			
 
				+out:
			
 
				+	WARN_ON(!osds_valid(up) || !osds_valid(acting));
			
 
				 }
			
 
				 
			
 
				 /*
			
 
				- * Return primary osd for given pgid, or -1 if none.
			
 
				+ * Return acting primary for given PG, or -1 if none.
			
 
				  */
			
 
				-int ceph_calc_pg_primary(struct ceph_osdmap *osdmap, struct ceph_pg pgid)
			
 
				+int ceph_pg_to_acting_primary(struct ceph_osdmap *osdmap,
			
 
				+			      const struct ceph_pg *raw_pgid)
			
 
				 {
			
 
				-	int osds[CEPH_PG_MAX_SIZE];
			
 
				-	int primary;
			
 
				-
			
 
				-	ceph_calc_pg_acting(osdmap, pgid, osds, &primary);
			
 
				+	struct ceph_osds up, acting;
			
 
				 
			
 
				-	return primary;
			
 
				+	ceph_pg_to_up_acting_osds(osdmap, raw_pgid, &up, &acting);
			
 
				+	return acting.primary;
			
 
				 }
			
 
				-EXPORT_SYMBOL(ceph_calc_pg_primary);
			
 
				+EXPORT_SYMBOL(ceph_pg_to_acting_primary);