10 years ago · 2f2ff0ee5e
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -66,7 +66,11 @@ struct btrfs_inode {
 
				 	 */
			
 
				 	struct btrfs_key location;
			
 
				 
			
 
				-	/* Lock for counters */
			
 
				+	/*
			
 
				+	 * Lock for counters and all fields used to determine if the inode is in
			
 
				+	 * the log or not (last_trans, last_sub_trans, last_log_commit,
			
 
				+	 * logged_trans).
			
 
				+	 */
			
 
				 	spinlock_t lock;
			
 
				 
			
 
				 	/* the extent_tree has caches of all the extent mappings to disk */
			
@@ -250,6 +254,9 @@ static inline bool btrfs_is_free_space_inode(struct inode *inode)
 
				 
			
 
				 static inline int btrfs_inode_in_log(struct inode *inode, u64 generation)
			
 
				 {
			
 
				+	int ret = 0;
			
 
				+
			
 
				+	spin_lock(&BTRFS_I(inode)->lock);
			
 
				 	if (BTRFS_I(inode)->logged_trans == generation &&
			
 
				 	    BTRFS_I(inode)->last_sub_trans <=
			
 
				 	    BTRFS_I(inode)->last_log_commit &&
			
@@ -263,9 +270,10 @@ static inline int btrfs_inode_in_log(struct inode *inode, u64 generation)
 
				 		 */
			
 
				 		smp_mb();
			
 
				 		if (list_empty(&BTRFS_I(inode)->extent_tree.modified_extents))
			
 
				-			return 1;
			
 
				+			ret = 1;
			
 
				 	}
			
 
				-	return 0;
			
 
				+	spin_unlock(&BTRFS_I(inode)->lock);
			
 
				+	return ret;
			
 
				 }
			
 
				 
			
 
				 #define BTRFS_DIO_ORIG_BIO_SUBMITTED	0x1
			
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1811,7 +1811,9 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
 
				 	 * otherwise subsequent syncs to a file that's been synced in this
			
 
				 	 * transaction will appear to have already occured.
			
 
				 	 */
			
 
				+	spin_lock(&BTRFS_I(inode)->lock);
			
 
				 	BTRFS_I(inode)->last_sub_trans = root->log_transid;
			
 
				+	spin_unlock(&BTRFS_I(inode)->lock);
			
 
				 	if (num_written > 0) {
			
 
				 		err = generic_write_sync(file, pos, num_written);
			
 
				 		if (err < 0)
			
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -136,9 +136,11 @@ struct btrfs_pending_snapshot {
 
				 static inline void btrfs_set_inode_last_trans(struct btrfs_trans_handle *trans,
			
 
				 					      struct inode *inode)
			
 
				 {
			
 
				+	spin_lock(&BTRFS_I(inode)->lock);
			
 
				 	BTRFS_I(inode)->last_trans = trans->transaction->transid;
			
 
				 	BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid;
			
 
				 	BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->root->last_log_commit;
			
 
				+	spin_unlock(&BTRFS_I(inode)->lock);
			
 
				 }
			
 
				 
			
 
				 int btrfs_end_transaction(struct btrfs_trans_handle *trans,
			
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -492,11 +492,19 @@ static noinline int overwrite_item(struct btrfs_trans_handle *trans,
 
				 
			
 
				 		if (btrfs_inode_generation(eb, src_item) == 0) {
			
 
				 			struct extent_buffer *dst_eb = path->nodes[0];
			
 
				+			const u64 ino_size = btrfs_inode_size(eb, src_item);
			
 
				 
			
 
				+			/*
			
 
				+			 * For regular files an ino_size == 0 is used only when
			
 
				+			 * logging that an inode exists, as part of a directory
			
 
				+			 * fsync, and the inode wasn't fsynced before. In this
			
 
				+			 * case don't set the size of the inode in the fs/subvol
			
 
				+			 * tree, otherwise we would be throwing valid data away.
			
 
				+			 */
			
 
				 			if (S_ISREG(btrfs_inode_mode(eb, src_item)) &&
			
 
				-			    S_ISREG(btrfs_inode_mode(dst_eb, dst_item))) {
			
 
				+			    S_ISREG(btrfs_inode_mode(dst_eb, dst_item)) &&
			
 
				+			    ino_size != 0) {
			
 
				 				struct btrfs_map_token token;
			
 
				-				u64 ino_size = btrfs_inode_size(eb, src_item);
			
 
				 
			
 
				 				btrfs_init_map_token(&token);
			
 
				 				btrfs_set_token_inode_size(dst_eb, dst_item,
			
@@ -3124,6 +3132,7 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
 
				 			  struct btrfs_root *root, struct inode *inode,
			
 
				 			  struct btrfs_path *path,
			
 
				 			  struct btrfs_path *dst_path, int key_type,
			
 
				+			  struct btrfs_log_ctx *ctx,
			
 
				 			  u64 min_offset, u64 *last_offset_ret)
			
 
				 {
			
 
				 	struct btrfs_key min_key;
			
@@ -3208,6 +3217,8 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
 
				 		src = path->nodes[0];
			
 
				 		nritems = btrfs_header_nritems(src);
			
 
				 		for (i = path->slots[0]; i < nritems; i++) {
			
 
				+			struct btrfs_dir_item *di;
			
 
				+
			
 
				 			btrfs_item_key_to_cpu(src, &min_key, i);
			
 
				 
			
 
				 			if (min_key.objectid != ino || min_key.type != key_type)
			
@@ -3218,6 +3229,37 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
 
				 				err = ret;
			
 
				 				goto done;
			
 
				 			}
			
 
				+
			
 
				+			/*
			
 
				+			 * We must make sure that when we log a directory entry,
			
 
				+			 * the corresponding inode, after log replay, has a
			
 
				+			 * matching link count. For example:
			
 
				+			 *
			
 
				+			 * touch foo
			
 
				+			 * mkdir mydir
			
 
				+			 * sync
			
 
				+			 * ln foo mydir/bar
			
 
				+			 * xfs_io -c "fsync" mydir
			
 
				+			 * <crash>
			
 
				+			 * <mount fs and log replay>
			
 
				+			 *
			
 
				+			 * Would result in a fsync log that when replayed, our
			
 
				+			 * file inode would have a link count of 1, but we get
			
 
				+			 * two directory entries pointing to the same inode.
			
 
				+			 * After removing one of the names, it would not be
			
 
				+			 * possible to remove the other name, which resulted
			
 
				+			 * always in stale file handle errors, and would not
			
 
				+			 * be possible to rmdir the parent directory, since
			
 
				+			 * its i_size could never decrement to the value
			
 
				+			 * BTRFS_EMPTY_DIR_SIZE, resulting in -ENOTEMPTY errors.
			
 
				+			 */
			
 
				+			di = btrfs_item_ptr(src, i, struct btrfs_dir_item);
			
 
				+			btrfs_dir_item_key_to_cpu(src, di, &tmp);
			
 
				+			if (ctx &&
			
 
				+			    (btrfs_dir_transid(src, di) == trans->transid ||
			
 
				+			     btrfs_dir_type(src, di) == BTRFS_FT_DIR) &&
			
 
				+			    tmp.type != BTRFS_ROOT_ITEM_KEY)
			
 
				+				ctx->log_new_dentries = true;
			
 
				 		}
			
 
				 		path->slots[0] = nritems;
			
 
				 
			
@@ -3279,7 +3321,8 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
 
				 static noinline int log_directory_changes(struct btrfs_trans_handle *trans,
			
 
				 			  struct btrfs_root *root, struct inode *inode,
			
 
				 			  struct btrfs_path *path,
			
 
				-			  struct btrfs_path *dst_path)
			
 
				+			  struct btrfs_path *dst_path,
			
 
				+			  struct btrfs_log_ctx *ctx)
			
 
				 {
			
 
				 	u64 min_key;
			
 
				 	u64 max_key;
			
@@ -3291,7 +3334,7 @@ static noinline int log_directory_changes(struct btrfs_trans_handle *trans,
 
				 	max_key = 0;
			
 
				 	while (1) {
			
 
				 		ret = log_dir_items(trans, root, inode, path,
			
 
				-				    dst_path, key_type, min_key,
			
 
				+				    dst_path, key_type, ctx, min_key,
			
 
				 				    &max_key);
			
 
				 		if (ret)
			
 
				 			return ret;
			
@@ -4067,7 +4110,7 @@ static int logged_inode_size(struct btrfs_root *log, struct inode *inode,
 
				 	if (ret < 0) {
			
 
				 		return ret;
			
 
				 	} else if (ret > 0) {
			
 
				-		*size_ret = i_size_read(inode);
			
 
				+		*size_ret = 0;
			
 
				 	} else {
			
 
				 		struct btrfs_inode_item *item;
			
 
				 
			
@@ -4374,15 +4417,18 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
 
				 	}
			
 
				 
			
 
				 	if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) {
			
 
				-		ret = log_directory_changes(trans, root, inode, path, dst_path);
			
 
				+		ret = log_directory_changes(trans, root, inode, path, dst_path,
			
 
				+					    ctx);
			
 
				 		if (ret) {
			
 
				 			err = ret;
			
 
				 			goto out_unlock;
			
 
				 		}
			
 
				 	}
			
 
				 
			
 
				+	spin_lock(&BTRFS_I(inode)->lock);
			
 
				 	BTRFS_I(inode)->logged_trans = trans->transid;
			
 
				 	BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->last_sub_trans;
			
 
				+	spin_unlock(&BTRFS_I(inode)->lock);
			
 
				 out_unlock:
			
 
				 	if (unlikely(err))
			
 
				 		btrfs_put_logged_extents(&logged_list);
			
@@ -4469,6 +4515,181 @@ static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans,
 
				 	return ret;
			
 
				 }
			
 
				 
			
 
				+struct btrfs_dir_list {
			
 
				+	u64 ino;
			
 
				+	struct list_head list;
			
 
				+};
			
 
				+
			
 
				+/*
			
 
				+ * Log the inodes of the new dentries of a directory. See log_dir_items() for
			
 
				+ * details about why it is needed.
			
 
				+ * This is a recursive operation - if an existing dentry corresponds to a
			
 
				+ * directory, that directory's new entries are logged too (same behaviour as
			
 
				+ * ext3/4, xfs, f2fs, reiserfs, nilfs2). Note that when logging the inodes
			
 
				+ * the dentries point to we do not lock their i_mutex, otherwise lockdep
			
 
				+ * complains about the following circular lock dependency / possible deadlock:
			
 
				+ *
			
 
				+ *        CPU0                                        CPU1
			
 
				+ *        ----                                        ----
			
 
				+ * lock(&type->i_mutex_dir_key#3/2);
			
 
				+ *                                            lock(sb_internal#2);
			
 
				+ *                                            lock(&type->i_mutex_dir_key#3/2);
			
 
				+ * lock(&sb->s_type->i_mutex_key#14);
			
 
				+ *
			
 
				+ * Where sb_internal is the lock (a counter that works as a lock) acquired by
			
 
				+ * sb_start_intwrite() in btrfs_start_transaction().
			
 
				+ * Not locking i_mutex of the inodes is still safe because:
			
 
				+ *
			
 
				+ * 1) For regular files we log with a mode of LOG_INODE_EXISTS. It's possible
			
 
				+ *    that while logging the inode new references (names) are added or removed
			
 
				+ *    from the inode, leaving the logged inode item with a link count that does
			
 
				+ *    not match the number of logged inode reference items. This is fine because
			
 
				+ *    at log replay time we compute the real number of links and correct the
			
 
				+ *    link count in the inode item (see replay_one_buffer() and
			
 
				+ *    link_to_fixup_dir());
			
 
				+ *
			
 
				+ * 2) For directories we log with a mode of LOG_INODE_ALL. It's possible that
			
 
				+ *    while logging the inode's items new items with keys BTRFS_DIR_ITEM_KEY and
			
 
				+ *    BTRFS_DIR_INDEX_KEY are added to fs/subvol tree and the logged inode item
			
 
				+ *    has a size that doesn't match the sum of the lengths of all the logged
			
 
				+ *    names. This does not result in a problem because if a dir_item key is
			
 
				+ *    logged but its matching dir_index key is not logged, at log replay time we
			
 
				+ *    don't use it to replay the respective name (see replay_one_name()). On the
			
 
				+ *    other hand if only the dir_index key ends up being logged, the respective
			
 
				+ *    name is added to the fs/subvol tree with both the dir_item and dir_index
			
 
				+ *    keys created (see replay_one_name()).
			
 
				+ *    The directory's inode item with a wrong i_size is not a problem either,
			
 
				+ *    since we don't use it at log replay time to set the i_size in the inode
			
 
				+ *    item of the fs/subvol tree (see overwrite_item()).
			
 
				+ */
			
 
				+static int log_new_dir_dentries(struct btrfs_trans_handle *trans,
			
 
				+				struct btrfs_root *root,
			
 
				+				struct inode *start_inode,
			
 
				+				struct btrfs_log_ctx *ctx)
			
 
				+{
			
 
				+	struct btrfs_root *log = root->log_root;
			
 
				+	struct btrfs_path *path;
			
 
				+	LIST_HEAD(dir_list);
			
 
				+	struct btrfs_dir_list *dir_elem;
			
 
				+	int ret = 0;
			
 
				+
			
 
				+	path = btrfs_alloc_path();
			
 
				+	if (!path)
			
 
				+		return -ENOMEM;
			
 
				+
			
 
				+	dir_elem = kmalloc(sizeof(*dir_elem), GFP_NOFS);
			
 
				+	if (!dir_elem) {
			
 
				+		btrfs_free_path(path);
			
 
				+		return -ENOMEM;
			
 
				+	}
			
 
				+	dir_elem->ino = btrfs_ino(start_inode);
			
 
				+	list_add_tail(&dir_elem->list, &dir_list);
			
 
				+
			
 
				+	while (!list_empty(&dir_list)) {
			
 
				+		struct extent_buffer *leaf;
			
 
				+		struct btrfs_key min_key;
			
 
				+		int nritems;
			
 
				+		int i;
			
 
				+
			
 
				+		dir_elem = list_first_entry(&dir_list, struct btrfs_dir_list,
			
 
				+					    list);
			
 
				+		if (ret)
			
 
				+			goto next_dir_inode;
			
 
				+
			
 
				+		min_key.objectid = dir_elem->ino;
			
 
				+		min_key.type = BTRFS_DIR_ITEM_KEY;
			
 
				+		min_key.offset = 0;
			
 
				+again:
			
 
				+		btrfs_release_path(path);
			
 
				+		ret = btrfs_search_forward(log, &min_key, path, trans->transid);
			
 
				+		if (ret < 0) {
			
 
				+			goto next_dir_inode;
			
 
				+		} else if (ret > 0) {
			
 
				+			ret = 0;
			
 
				+			goto next_dir_inode;
			
 
				+		}
			
 
				+
			
 
				+process_leaf:
			
 
				+		leaf = path->nodes[0];
			
 
				+		nritems = btrfs_header_nritems(leaf);
			
 
				+		for (i = path->slots[0]; i < nritems; i++) {
			
 
				+			struct btrfs_dir_item *di;
			
 
				+			struct btrfs_key di_key;
			
 
				+			struct inode *di_inode;
			
 
				+			struct btrfs_dir_list *new_dir_elem;
			
 
				+			int log_mode = LOG_INODE_EXISTS;
			
 
				+			int type;
			
 
				+
			
 
				+			btrfs_item_key_to_cpu(leaf, &min_key, i);
			
 
				+			if (min_key.objectid != dir_elem->ino ||
			
 
				+			    min_key.type != BTRFS_DIR_ITEM_KEY)
			
 
				+				goto next_dir_inode;
			
 
				+
			
 
				+			di = btrfs_item_ptr(leaf, i, struct btrfs_dir_item);
			
 
				+			type = btrfs_dir_type(leaf, di);
			
 
				+			if (btrfs_dir_transid(leaf, di) < trans->transid &&
			
 
				+			    type != BTRFS_FT_DIR)
			
 
				+				continue;
			
 
				+			btrfs_dir_item_key_to_cpu(leaf, di, &di_key);
			
 
				+			if (di_key.type == BTRFS_ROOT_ITEM_KEY)
			
 
				+				continue;
			
 
				+
			
 
				+			di_inode = btrfs_iget(root->fs_info->sb, &di_key,
			
 
				+					      root, NULL);
			
 
				+			if (IS_ERR(di_inode)) {
			
 
				+				ret = PTR_ERR(di_inode);
			
 
				+				goto next_dir_inode;
			
 
				+			}
			
 
				+
			
 
				+			if (btrfs_inode_in_log(di_inode, trans->transid)) {
			
 
				+				iput(di_inode);
			
 
				+				continue;
			
 
				+			}
			
 
				+
			
 
				+			ctx->log_new_dentries = false;
			
 
				+			if (type == BTRFS_FT_DIR)
			
 
				+				log_mode = LOG_INODE_ALL;
			
 
				+			btrfs_release_path(path);
			
 
				+			ret = btrfs_log_inode(trans, root, di_inode,
			
 
				+					      log_mode, 0, LLONG_MAX, ctx);
			
 
				+			iput(di_inode);
			
 
				+			if (ret)
			
 
				+				goto next_dir_inode;
			
 
				+			if (ctx->log_new_dentries) {
			
 
				+				new_dir_elem = kmalloc(sizeof(*new_dir_elem),
			
 
				+						       GFP_NOFS);
			
 
				+				if (!new_dir_elem) {
			
 
				+					ret = -ENOMEM;
			
 
				+					goto next_dir_inode;
			
 
				+				}
			
 
				+				new_dir_elem->ino = di_key.objectid;
			
 
				+				list_add_tail(&new_dir_elem->list, &dir_list);
			
 
				+			}
			
 
				+			break;
			
 
				+		}
			
 
				+		if (i == nritems) {
			
 
				+			ret = btrfs_next_leaf(log, path);
			
 
				+			if (ret < 0) {
			
 
				+				goto next_dir_inode;
			
 
				+			} else if (ret > 0) {
			
 
				+				ret = 0;
			
 
				+				goto next_dir_inode;
			
 
				+			}
			
 
				+			goto process_leaf;
			
 
				+		}
			
 
				+		if (min_key.offset < (u64)-1) {
			
 
				+			min_key.offset++;
			
 
				+			goto again;
			
 
				+		}
			
 
				+next_dir_inode:
			
 
				+		list_del(&dir_elem->list);
			
 
				+		kfree(dir_elem);
			
 
				+	}
			
 
				+
			
 
				+	btrfs_free_path(path);
			
 
				+	return ret;
			
 
				+}
			
 
				+
			
 
				 /*
			
 
				  * helper function around btrfs_log_inode to make sure newly created
			
 
				  * parent directories also end up in the log.  A minimal inode and backref
			
@@ -4491,6 +4712,8 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
 
				 	const struct dentry * const first_parent = parent;
			
 
				 	const bool did_unlink = (BTRFS_I(inode)->last_unlink_trans >
			
 
				 				 last_committed);
			
 
				+	bool log_dentries = false;
			
 
				+	struct inode *orig_inode = inode;
			
 
				 
			
 
				 	sb = inode->i_sb;
			
 
				 
			
@@ -4546,6 +4769,9 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
 
				 		goto end_trans;
			
 
				 	}
			
 
				 
			
 
				+	if (S_ISDIR(inode->i_mode) && ctx && ctx->log_new_dentries)
			
 
				+		log_dentries = true;
			
 
				+
			
 
				 	while (1) {
			
 
				 		if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb)
			
 
				 			break;
			
@@ -4582,7 +4808,10 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
 
				 		dput(old_parent);
			
 
				 		old_parent = parent;
			
 
				 	}
			
 
				-	ret = 0;
			
 
				+	if (log_dentries)
			
 
				+		ret = log_new_dir_dentries(trans, root, orig_inode, ctx);
			
 
				+	else
			
 
				+		ret = 0;
			
 
				 end_trans:
			
 
				 	dput(old_parent);
			
 
				 	if (ret < 0) {
			
--- a/fs/btrfs/tree-log.h
+++ b/fs/btrfs/tree-log.h
@@ -29,6 +29,7 @@ struct btrfs_log_ctx {
 
				 	int log_ret;
			
 
				 	int log_transid;
			
 
				 	int io_err;
			
 
				+	bool log_new_dentries;
			
 
				 	struct list_head list;
			
 
				 };
			
 
				 
			
@@ -37,6 +38,7 @@ static inline void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx)
 
				 	ctx->log_ret = 0;
			
 
				 	ctx->log_transid = 0;
			
 
				 	ctx->io_err = 0;
			
 
				+	ctx->log_new_dentries = false;
			
 
				 	INIT_LIST_HEAD(&ctx->list);
			
 
				 }