
Merge branch 'for-chris-4.5' of git://git.kernel.org/pub/scm/linux/kernel/git/fdmanana/linux into for-linus-4.5

Chris Mason, 9 years ago
commit a53fe25769
5 changed files with 151 additions and 56 deletions
  1. fs/btrfs/extent-tree.c (+17 -2)
  2. fs/btrfs/inode.c (+78 -49)
  3. fs/btrfs/transaction.c (+17 -0)
  4. fs/btrfs/tree-defrag.c (+24 -3)
  5. fs/btrfs/volumes.c (+15 -2)

fs/btrfs/extent-tree.c (+17 -2)

@@ -3684,11 +3684,21 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
 		return -ENOMEM;
 
 	/*
-	 * We don't need the lock here since we are protected by the transaction
-	 * commit.  We want to do the cache_save_setup first and then run the
+	 * Even though we are in the critical section of the transaction commit,
+	 * we can still have concurrent tasks adding elements to this
+	 * transaction's list of dirty block groups. These tasks correspond to
+	 * endio free space workers started when writeback finishes for a
+	 * space cache, which run inode.c:btrfs_finish_ordered_io(), and can
+	 * allocate new block groups as a result of COWing nodes of the root
+	 * tree when updating the free space inode. The writeback for the space
+	 * caches is triggered by an earlier call to
+	 * btrfs_start_dirty_block_groups() and iterations of the following
+	 * loop.
+	 * Also we want to do the cache_save_setup first and then run the
 	 * delayed refs to make sure we have the best chance at doing this all
 	 * in one shot.
 	 */
+	spin_lock(&cur_trans->dirty_bgs_lock);
 	while (!list_empty(&cur_trans->dirty_bgs)) {
 		cache = list_first_entry(&cur_trans->dirty_bgs,
 					 struct btrfs_block_group_cache,
@@ -3700,11 +3710,13 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
 		 * finish and then do it all again
 		 */
 		if (!list_empty(&cache->io_list)) {
+			spin_unlock(&cur_trans->dirty_bgs_lock);
 			list_del_init(&cache->io_list);
 			btrfs_wait_cache_io(root, trans, cache,
 					    &cache->io_ctl, path,
 					    cache->key.objectid);
 			btrfs_put_block_group(cache);
+			spin_lock(&cur_trans->dirty_bgs_lock);
 		}
 
 		/*
@@ -3712,6 +3724,7 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
 		 * on any pending IO
 		 */
 		list_del_init(&cache->dirty_list);
+		spin_unlock(&cur_trans->dirty_bgs_lock);
 		should_put = 1;
 
 		cache_save_setup(cache, trans, path);
@@ -3743,7 +3756,9 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
 		/* if it's not on the io list, we need to put the block group */
 		if (should_put)
 			btrfs_put_block_group(cache);
+		spin_lock(&cur_trans->dirty_bgs_lock);
 	}
+	spin_unlock(&cur_trans->dirty_bgs_lock);
 
 	while (!list_empty(io)) {
 		cache = list_first_entry(io, struct btrfs_block_group_cache,

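A note on the locking pattern in the hunk above: entries are popped from dirty_bgs one at a time under dirty_bgs_lock, and the lock is dropped across anything that can block (waiting for cache IO, cache_save_setup, running delayed refs) and retaken before the list is touched again, so the concurrent producers described in the comment never spin on us. A minimal userspace sketch of the same drain loop, using a pthread mutex and a hypothetical work_item type rather than the kernel's list and spinlock APIs:

#include <pthread.h>
#include <stdlib.h>

struct work_item {
	struct work_item *next;
	void (*fn)(struct work_item *);	/* may block, like btrfs_wait_cache_io() */
};

static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;
static struct work_item *head;		/* producers may keep appending */

static void drain(void)
{
	pthread_mutex_lock(&list_lock);
	while (head) {
		struct work_item *item = head;

		head = item->next;
		/* Never hold the lock across blocking work. */
		pthread_mutex_unlock(&list_lock);
		item->fn(item);
		free(item);
		pthread_mutex_lock(&list_lock);
	}
	pthread_mutex_unlock(&list_lock);
}
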
fs/btrfs/inode.c (+78 -49)

@@ -66,6 +66,13 @@ struct btrfs_iget_args {
 	struct btrfs_root *root;
 };
 
+struct btrfs_dio_data {
+	u64 outstanding_extents;
+	u64 reserve;
+	u64 unsubmitted_oe_range_start;
+	u64 unsubmitted_oe_range_end;
+};
+
 static const struct inode_operations btrfs_dir_inode_operations;
 static const struct inode_operations btrfs_symlink_inode_operations;
 static const struct inode_operations btrfs_dir_ro_inode_operations;
@@ -7408,25 +7415,21 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
 			btrfs_start_ordered_extent(inode, ordered, 1);
 			btrfs_put_ordered_extent(ordered);
 		} else {
-			/* Screw you mmap */
-			ret = btrfs_fdatawrite_range(inode, lockstart, lockend);
-			if (ret)
-				break;
-			ret = filemap_fdatawait_range(inode->i_mapping,
-						      lockstart,
-						      lockend);
-			if (ret)
-				break;
-
 			/*
-			 * If we found a page that couldn't be invalidated just
-			 * fall back to buffered.
+			 * We could trigger writeback for this range (and wait
+			 * for it to complete) and then invalidate the pages for
+			 * this range (through invalidate_inode_pages2_range()),
+			 * but that can lead us to a deadlock with a concurrent
+			 * call to readpages() (a buffered read or a defrag call
+			 * triggered a readahead) on a page lock due to an
+			 * ordered dio extent we created before but have not yet
+			 * submitted a corresponding bio for (so it cannot
+			 * complete), which makes readpages() wait for that
+			 * ordered extent to complete while holding a lock on
+			 * that page.
 			 */
-			ret = invalidate_inode_pages2_range(inode->i_mapping,
-					lockstart >> PAGE_CACHE_SHIFT,
-					lockend >> PAGE_CACHE_SHIFT);
-			if (ret)
-				break;
+			ret = -ENOTBLK;
+			break;
 		}
 
 		cond_resched();
@@ -7482,11 +7485,6 @@ static struct extent_map *create_pinned_em(struct inode *inode, u64 start,
 	return em;
 }
 
-struct btrfs_dio_data {
-	u64 outstanding_extents;
-	u64 reserve;
-};
-
 static void adjust_dio_outstanding_extents(struct inode *inode,
 					   struct btrfs_dio_data *dio_data,
 					   const u64 len)
@@ -7670,6 +7668,7 @@ unlock:
 		btrfs_free_reserved_data_space(inode, start, len);
 		WARN_ON(dio_data->reserve < len);
 		dio_data->reserve -= len;
+		dio_data->unsubmitted_oe_range_end = start + len;
 		current->journal_info = dio_data;
 	}
 
@@ -7992,22 +7991,22 @@ static void btrfs_endio_direct_read(struct bio *bio)
 	bio_put(bio);
 }
 
-static void btrfs_endio_direct_write(struct bio *bio)
+static void btrfs_endio_direct_write_update_ordered(struct inode *inode,
+						    const u64 offset,
+						    const u64 bytes,
+						    const int uptodate)
 {
-	struct btrfs_dio_private *dip = bio->bi_private;
-	struct inode *inode = dip->inode;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct btrfs_ordered_extent *ordered = NULL;
-	u64 ordered_offset = dip->logical_offset;
-	u64 ordered_bytes = dip->bytes;
-	struct bio *dio_bio;
+	u64 ordered_offset = offset;
+	u64 ordered_bytes = bytes;
 	int ret;
 
 again:
 	ret = btrfs_dec_test_first_ordered_pending(inode, &ordered,
 						   &ordered_offset,
 						   ordered_bytes,
-						   !bio->bi_error);
+						   uptodate);
 	if (!ret)
 		goto out_test;
 
@@ -8020,13 +8019,22 @@ out_test:
 	 * our bio might span multiple ordered extents.  If we haven't
 	 * completed the accounting for the whole dio, go back and try again
 	 */
-	if (ordered_offset < dip->logical_offset + dip->bytes) {
-		ordered_bytes = dip->logical_offset + dip->bytes -
-			ordered_offset;
+	if (ordered_offset < offset + bytes) {
+		ordered_bytes = offset + bytes - ordered_offset;
 		ordered = NULL;
 		goto again;
 	}
-	dio_bio = dip->dio_bio;
+}
+
+static void btrfs_endio_direct_write(struct bio *bio)
+{
+	struct btrfs_dio_private *dip = bio->bi_private;
+	struct bio *dio_bio = dip->dio_bio;
+
+	btrfs_endio_direct_write_update_ordered(dip->inode,
+						dip->logical_offset,
+						dip->bytes,
+						!bio->bi_error);
 
 	kfree(dip);
 
@@ -8334,6 +8342,21 @@ static void btrfs_submit_direct(int rw, struct bio *dio_bio,
 		dip->subio_endio = btrfs_subio_endio_read;
 	}
 
+	/*
+	 * Reset the range for unsubmitted ordered extents (to a zero length
+	 * range) even if we fail to submit a bio, because in that case we do
+	 * the corresponding error handling below and it must not be done a
+	 * second time by btrfs_direct_IO().
+	 */
+	if (write) {
+		struct btrfs_dio_data *dio_data = current->journal_info;
+
+		dio_data->unsubmitted_oe_range_end = dip->logical_offset +
+			dip->bytes;
+		dio_data->unsubmitted_oe_range_start =
+			dio_data->unsubmitted_oe_range_end;
+	}
+
 	ret = btrfs_submit_direct_hook(rw, dip, skip_sum);
 	if (!ret)
 		return;
@@ -8362,24 +8385,15 @@ free_ordered:
 		dip = NULL;
 		io_bio = NULL;
 	} else {
-		if (write) {
-			struct btrfs_ordered_extent *ordered;
-
-			ordered = btrfs_lookup_ordered_extent(inode,
-							      file_offset);
-			set_bit(BTRFS_ORDERED_IOERR, &ordered->flags);
-			/*
-			 * Decrements our ref on the ordered extent and removes
-			 * the ordered extent from the inode's ordered tree,
-			 * doing all the proper resource cleanup such as for the
-			 * reserved space and waking up any waiters for this
-			 * ordered extent (through btrfs_remove_ordered_extent).
-			 */
-			btrfs_finish_ordered_io(ordered);
-		} else {
+		if (write)
+			btrfs_endio_direct_write_update_ordered(inode,
+						file_offset,
+						dio_bio->bi_iter.bi_size,
+						0);
+		else
 			unlock_extent(&BTRFS_I(inode)->io_tree, file_offset,
 			      file_offset + dio_bio->bi_iter.bi_size - 1);
-		}
+
 		dio_bio->bi_error = -EIO;
 		/*
 		 * Releases and cleans up our dio_bio, no need to bio_put()
@@ -8479,6 +8493,8 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
 		 * originally calculated.  Abuse current->journal_info for this.
 		 */
 		dio_data.reserve = round_up(count, root->sectorsize);
+		dio_data.unsubmitted_oe_range_start = (u64)offset;
+		dio_data.unsubmitted_oe_range_end = (u64)offset;
 		current->journal_info = &dio_data;
 	} else if (test_bit(BTRFS_INODE_READDIO_NEED_LOCK,
 				     &BTRFS_I(inode)->runtime_flags)) {
@@ -8497,6 +8513,19 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
 			if (dio_data.reserve)
 				btrfs_delalloc_release_space(inode, offset,
 							     dio_data.reserve);
+			/*
+			 * On error we might have left some ordered extents
+			 * without submitting corresponding bios for them, so
+			 * clean them up to avoid other tasks getting them
+			 * and waiting for them to complete forever.
+			 */
+			if (dio_data.unsubmitted_oe_range_start <
+			    dio_data.unsubmitted_oe_range_end)
+				btrfs_endio_direct_write_update_ordered(inode,
+					dio_data.unsubmitted_oe_range_start,
+					dio_data.unsubmitted_oe_range_end -
+					dio_data.unsubmitted_oe_range_start,
+					0);
 		} else if (ret >= 0 && (size_t)ret < count)
 			btrfs_delalloc_release_space(inode, offset,
 						     count - (size_t)ret);

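The dio_data fields added above reduce to a half-open byte range [unsubmitted_oe_range_start, unsubmitted_oe_range_end): the end advances when an ordered extent is created, the start catches up once bios are submitted for it, and on error whatever is still inside the range gets completed (with an error) exactly once. A small sketch of that invariant, with hypothetical names (dio_range, on_extent_created, and so on) standing in for the btrfs machinery:

#include <stdint.h>

struct dio_range {
	uint64_t start;	/* first byte with no bio submitted yet */
	uint64_t end;	/* one past the last byte with an ordered extent */
};

/* An ordered extent was created for [range->end, range->end + len). */
static void on_extent_created(struct dio_range *range, uint64_t len)
{
	range->end += len;
}

/* Bios now cover everything created so far; nothing is left unsubmitted. */
static void on_bios_submitted(struct dio_range *range)
{
	range->start = range->end;
}

/* On error, complete (with an error) whatever never got a bio. */
static void on_error(struct dio_range *range,
		     void (*cleanup)(uint64_t off, uint64_t len))
{
	if (range->start < range->end)
		cleanup(range->start, range->end - range->start);
	range->start = range->end;
}
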
fs/btrfs/transaction.c (+17 -0)

@@ -75,6 +75,23 @@ void btrfs_put_transaction(struct btrfs_transaction *transaction)
 			list_del_init(&em->list);
 			free_extent_map(em);
 		}
+		/*
+		 * If any block groups are found in ->deleted_bgs then it's
+		 * because the transaction was aborted and a commit did not
+		 * happen (things failed before writing the new superblock
+		 * and calling btrfs_finish_extent_commit()), so we cannot
+		 * discard the physical locations of the block groups.
+		 */
+		while (!list_empty(&transaction->deleted_bgs)) {
+			struct btrfs_block_group_cache *cache;
+
+			cache = list_first_entry(&transaction->deleted_bgs,
+						 struct btrfs_block_group_cache,
+						 bg_list);
+			list_del_init(&cache->bg_list);
+			btrfs_put_block_group_trimming(cache);
+			btrfs_put_block_group(cache);
+		}
 		kmem_cache_free(btrfs_transaction_cachep, transaction);
 	}
 }

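The rule encoded by this hunk is that the final reference drop must finish any cleanup the commit path skipped: when a transaction aborts, btrfs_finish_extent_commit() never drains ->deleted_bgs, so btrfs_put_transaction() has to drain it itself. A minimal sketch of such destructor-side cleanup, with a hypothetical txn/item pair standing in for the transaction and block group structures:

#include <stdlib.h>

struct item {
	struct item *next;
};

struct txn {
	struct item *pending;	/* emptied by a successful commit */
};

/*
 * Anything still on ->pending means the commit never ran to completion
 * (it was aborted), so the final reference drop releases those items
 * itself, mirroring btrfs_put_transaction() draining ->deleted_bgs.
 */
static void txn_free(struct txn *t)
{
	while (t->pending) {
		struct item *it = t->pending;

		t->pending = it->next;
		free(it);	/* stands in for the two put calls above */
	}
	free(t);
}
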
fs/btrfs/tree-defrag.c (+24 -3)

@@ -89,6 +89,12 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
 		goto out;
 	}
 	btrfs_release_path(path);
+	/*
+	 * We don't need a lock on a leaf. btrfs_realloc_node() will lock all
+	 * leaves from path->nodes[1], so set lowest_level to 1 to avoid a later
+	 * deadlock (attempting to write lock an already write locked leaf).
+	 */
+	path->lowest_level = 1;
 	wret = btrfs_search_slot(trans, root, &key, path, 0, 1);
 
 	if (wret < 0) {
@@ -99,9 +105,12 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
 		ret = 0;
 		goto out;
 	}
-	path->slots[1] = btrfs_header_nritems(path->nodes[1]);
-	next_key_ret = btrfs_find_next_key(root, path, &key, 1,
-					   min_trans);
+	/*
+	 * The node at level 1 must always be locked when our path has
+	 * keep_locks set and lowest_level is 1, regardless of the value of
+	 * path->slots[1].
+	 */
+	BUG_ON(path->locks[1] == 0);
 	ret = btrfs_realloc_node(trans, root,
 				 path->nodes[1], 0,
 				 &last_ret,
@@ -110,6 +119,18 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
 		WARN_ON(ret == -EAGAIN);
 		goto out;
 	}
+	/*
+	 * Now that we reallocated the node we can find the next key. Note that
+	 * btrfs_find_next_key() can release our path and do another search
+	 * without COWing; this is because even with path->keep_locks = 1,
+	 * btrfs_search_slot() / ctree.c:unlock_up() does not keep a lock on
+	 * a node when path->slots[node_level - 1] does not point to the
+	 * last item or a slot beyond the last item. Therefore we search for
+	 * the next key after reallocating our node.
+	 */
+	path->slots[1] = btrfs_header_nritems(path->nodes[1]);
+	next_key_ret = btrfs_find_next_key(root, path, &key, 1,
+					   min_trans);
 	if (next_key_ret == 0) {
 		memcpy(&root->defrag_progress, &key, sizeof(key));
 		ret = -EAGAIN;

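The deadlock avoided above exists because extent buffer locks, like POSIX rwlocks, are not recursive: write locking a leaf the task already write locked hangs forever. Stopping the search at lowest_level = 1 leaves btrfs_realloc_node() as the only code that locks the leaves. A tiny standalone illustration of the non-recursive behaviour with a pthread rwlock (an analogy, not the kernel's locking API):

#include <pthread.h>
#include <stdio.h>

int main(void)
{
	pthread_rwlock_t lock;

	pthread_rwlock_init(&lock, NULL);
	pthread_rwlock_wrlock(&lock);
	puts("first write lock acquired");
	/*
	 * A second pthread_rwlock_wrlock(&lock) here would self-deadlock
	 * (or fail with EDEADLK), just like write locking an already
	 * write locked extent buffer.
	 */
	pthread_rwlock_unlock(&lock);
	pthread_rwlock_destroy(&lock);
	return 0;
}
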
fs/btrfs/volumes.c (+15 -2)

@@ -4825,20 +4825,32 @@ int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
 		goto out;
 	}
 
+	/*
+	 * Take the device list mutex to prevent races with the final phase of
+	 * a device replace operation that replaces the device object associated
+	 * with the map's stripes, because the device object's id can change
+	 * at any time during that final phase
+	 * (dev-replace.c:btrfs_dev_replace_finishing()).
+	 */
+	mutex_lock(&chunk_root->fs_info->fs_devices->device_list_mutex);
 	for (i = 0; i < map->num_stripes; i++) {
 		device = map->stripes[i].dev;
 		dev_offset = map->stripes[i].physical;
 
 		ret = btrfs_update_device(trans, device);
 		if (ret)
-			goto out;
+			break;
 		ret = btrfs_alloc_dev_extent(trans, device,
 					     chunk_root->root_key.objectid,
 					     BTRFS_FIRST_CHUNK_TREE_OBJECTID,
 					     chunk_offset, dev_offset,
 					     stripe_size);
 		if (ret)
-			goto out;
+			break;
+	}
+	if (ret) {
+		mutex_unlock(&chunk_root->fs_info->fs_devices->device_list_mutex);
+		goto out;
 	}
 
 	stripe = &chunk->stripe;
@@ -4851,6 +4863,7 @@ int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
 		memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE);
 		stripe++;
 	}
+	mutex_unlock(&chunk_root->fs_info->fs_devices->device_list_mutex);
 
 	btrfs_set_stack_chunk_length(chunk, chunk_size);
 	btrfs_set_stack_chunk_owner(chunk, extent_root->root_key.objectid);
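The reworked error handling above follows a common shape: inside the locked loop, break on failure instead of jumping straight to the out label, so both the error path and the success path release device_list_mutex in exactly one place each. A condensed sketch of that shape, with a hypothetical update_one() standing in for the per-stripe btrfs_update_device()/btrfs_alloc_dev_extent() calls:

#include <pthread.h>

static pthread_mutex_t dev_lock = PTHREAD_MUTEX_INITIALIZER;

/* Stand-in for the per-stripe device updates; returns 0 or a negative error. */
static int update_one(int i)
{
	(void)i;
	return 0;
}

static int update_all(int n)
{
	int ret = 0;

	pthread_mutex_lock(&dev_lock);
	for (int i = 0; i < n; i++) {
		ret = update_one(i);
		if (ret)
			break;	/* don't goto past the unlock */
	}
	if (ret) {
		pthread_mutex_unlock(&dev_lock);
		return ret;
	}
	/* ... success path still reads device state under the lock ... */
	pthread_mutex_unlock(&dev_lock);
	return 0;
}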