11 лет назад · 2fe6303e7c
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -1962,25 +1962,6 @@ static void check_page_uptodate(struct extent_io_tree *tree, struct page *page)
 
				 		SetPageUptodate(page);
			
 
				 }
			
 
				 
			
 
				-/*
			
 
				- * When IO fails, either with EIO or csum verification fails, we
			
 
				- * try other mirrors that might have a good copy of the data.  This
			
 
				- * io_failure_record is used to record state as we go through all the
			
 
				- * mirrors.  If another mirror has good data, the page is set up to date
			
 
				- * and things continue.  If a good mirror can't be found, the original
			
 
				- * bio end_io callback is called to indicate things have failed.
			
 
				- */
			
 
				-struct io_failure_record {
			
 
				-	struct page *page;
			
 
				-	u64 start;
			
 
				-	u64 len;
			
 
				-	u64 logical;
			
 
				-	unsigned long bio_flags;
			
 
				-	int this_mirror;
			
 
				-	int failed_mirror;
			
 
				-	int in_validation;
			
 
				-};
			
 
				-
			
 
				 static int free_io_failure(struct inode *inode, struct io_failure_record *rec)
			
 
				 {
			
 
				 	int ret;
			
@@ -2156,40 +2137,24 @@ static int clean_io_failure(u64 start, struct page *page)
 
				 	return 0;
			
 
				 }
			
 
				 
			
 
				-/*
			
 
				- * this is a generic handler for readpage errors (default
			
 
				- * readpage_io_failed_hook). if other copies exist, read those and write back
			
 
				- * good data to the failed position. does not investigate in remapping the
			
 
				- * failed extent elsewhere, hoping the device will be smart enough to do this as
			
 
				- * needed
			
 
				- */
			
 
				-
			
 
				-static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset,
			
 
				-			      struct page *page, u64 start, u64 end,
			
 
				-			      int failed_mirror)
			
 
				+int btrfs_get_io_failure_record(struct inode *inode, u64 start, u64 end,
			
 
				+				struct io_failure_record **failrec_ret)
			
 
				 {
			
 
				-	struct io_failure_record *failrec = NULL;
			
 
				+	struct io_failure_record *failrec;
			
 
				 	u64 private;
			
 
				 	struct extent_map *em;
			
 
				-	struct inode *inode = page->mapping->host;
			
 
				 	struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
			
 
				 	struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
			
 
				 	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
			
 
				-	struct bio *bio;
			
 
				-	struct btrfs_io_bio *btrfs_failed_bio;
			
 
				-	struct btrfs_io_bio *btrfs_bio;
			
 
				-	int num_copies;
			
 
				 	int ret;
			
 
				-	int read_mode;
			
 
				 	u64 logical;
			
 
				 
			
 
				-	BUG_ON(failed_bio->bi_rw & REQ_WRITE);
			
 
				-
			
 
				 	ret = get_state_private(failure_tree, start, &private);
			
 
				 	if (ret) {
			
 
				 		failrec = kzalloc(sizeof(*failrec), GFP_NOFS);
			
 
				 		if (!failrec)
			
 
				 			return -ENOMEM;
			
 
				+
			
 
				 		failrec->start = start;
			
 
				 		failrec->len = end - start + 1;
			
 
				 		failrec->this_mirror = 0;
			
@@ -2209,11 +2174,11 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset,
 
				 			em = NULL;
			
 
				 		}
			
 
				 		read_unlock(&em_tree->lock);
			
 
				-
			
 
				 		if (!em) {
			
 
				 			kfree(failrec);
			
 
				 			return -EIO;
			
 
				 		}
			
 
				+
			
 
				 		logical = start - em->start;
			
 
				 		logical = em->block_start + logical;
			
 
				 		if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
			
@@ -2222,8 +2187,10 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset,
 
				 			extent_set_compress_type(&failrec->bio_flags,
			
 
				 						 em->compress_type);
			
 
				 		}
			
 
				-		pr_debug("bio_readpage_error: (new) logical=%llu, start=%llu, "
			
 
				-			 "len=%llu\n", logical, start, failrec->len);
			
 
				+
			
 
				+		pr_debug("Get IO Failure Record: (new) logical=%llu, start=%llu, len=%llu\n",
			
 
				+			 logical, start, failrec->len);
			
 
				+
			
 
				 		failrec->logical = logical;
			
 
				 		free_extent_map(em);
			
 
				 
			
@@ -2243,8 +2210,7 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset,
 
				 		}
			
 
				 	} else {
			
 
				 		failrec = (struct io_failure_record *)(unsigned long)private;
			
 
				-		pr_debug("bio_readpage_error: (found) logical=%llu, "
			
 
				-			 "start=%llu, len=%llu, validation=%d\n",
			
 
				+		pr_debug("Get IO Failure Record: (found) logical=%llu, start=%llu, len=%llu, validation=%d\n",
			
 
				 			 failrec->logical, failrec->start, failrec->len,
			
 
				 			 failrec->in_validation);
			
 
				 		/*
			
@@ -2253,6 +2219,17 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset,
 
				 		 * clean_io_failure() clean all those errors at once.
			
 
				 		 */
			
 
				 	}
			
 
				+
			
 
				+	*failrec_ret = failrec;
			
 
				+
			
 
				+	return 0;
			
 
				+}
			
 
				+
			
 
				+int btrfs_check_repairable(struct inode *inode, struct bio *failed_bio,
			
 
				+			   struct io_failure_record *failrec, int failed_mirror)
			
 
				+{
			
 
				+	int num_copies;
			
 
				+
			
 
				 	num_copies = btrfs_num_copies(BTRFS_I(inode)->root->fs_info,
			
 
				 				      failrec->logical, failrec->len);
			
 
				 	if (num_copies == 1) {
			
@@ -2261,10 +2238,9 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset,
 
				 		 * all the retry and error correction code that follows. no
			
 
				 		 * matter what the error is, it is very likely to persist.
			
 
				 		 */
			
 
				-		pr_debug("bio_readpage_error: cannot repair, num_copies=%d, next_mirror %d, failed_mirror %d\n",
			
 
				+		pr_debug("Check Repairable: cannot repair, num_copies=%d, next_mirror %d, failed_mirror %d\n",
			
 
				 			 num_copies, failrec->this_mirror, failed_mirror);
			
 
				-		free_io_failure(inode, failrec);
			
 
				-		return -EIO;
			
 
				+		return 0;
			
 
				 	}
			
 
				 
			
 
				 	/*
			
@@ -2284,7 +2260,6 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset,
 
				 		BUG_ON(failrec->in_validation);
			
 
				 		failrec->in_validation = 1;
			
 
				 		failrec->this_mirror = failed_mirror;
			
 
				-		read_mode = READ_SYNC | REQ_FAILFAST_DEV;
			
 
				 	} else {
			
 
				 		/*
			
 
				 		 * we're ready to fulfill a) and b) alongside. get a good copy
			
@@ -2300,22 +2275,32 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset,
 
				 		failrec->this_mirror++;
			
 
				 		if (failrec->this_mirror == failed_mirror)
			
 
				 			failrec->this_mirror++;
			
 
				-		read_mode = READ_SYNC;
			
 
				 	}
			
 
				 
			
 
				 	if (failrec->this_mirror > num_copies) {
			
 
				-		pr_debug("bio_readpage_error: (fail) num_copies=%d, next_mirror %d, failed_mirror %d\n",
			
 
				+		pr_debug("Check Repairable: (fail) num_copies=%d, next_mirror %d, failed_mirror %d\n",
			
 
				 			 num_copies, failrec->this_mirror, failed_mirror);
			
 
				-		free_io_failure(inode, failrec);
			
 
				-		return -EIO;
			
 
				+		return 0;
			
 
				 	}
			
 
				 
			
 
				+	return 1;
			
 
				+}
			
 
				+
			
 
				+
			
 
				+struct bio *btrfs_create_repair_bio(struct inode *inode, struct bio *failed_bio,
			
 
				+				    struct io_failure_record *failrec,
			
 
				+				    struct page *page, int pg_offset, int icsum,
			
 
				+				    bio_end_io_t *endio_func)
			
 
				+{
			
 
				+	struct bio *bio;
			
 
				+	struct btrfs_io_bio *btrfs_failed_bio;
			
 
				+	struct btrfs_io_bio *btrfs_bio;
			
 
				+
			
 
				 	bio = btrfs_io_bio_alloc(GFP_NOFS, 1);
			
 
				-	if (!bio) {
			
 
				-		free_io_failure(inode, failrec);
			
 
				-		return -EIO;
			
 
				-	}
			
 
				-	bio->bi_end_io = failed_bio->bi_end_io;
			
 
				+	if (!bio)
			
 
				+		return NULL;
			
 
				+
			
 
				+	bio->bi_end_io = endio_func;
			
 
				 	bio->bi_iter.bi_sector = failrec->logical >> 9;
			
 
				 	bio->bi_bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
			
 
				 	bio->bi_iter.bi_size = 0;
			
@@ -2327,17 +2312,63 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset,
 
				 
			
 
				 		btrfs_bio = btrfs_io_bio(bio);
			
 
				 		btrfs_bio->csum = btrfs_bio->csum_inline;
			
 
				-		phy_offset >>= inode->i_sb->s_blocksize_bits;
			
 
				-		phy_offset *= csum_size;
			
 
				-		memcpy(btrfs_bio->csum, btrfs_failed_bio->csum + phy_offset,
			
 
				+		icsum *= csum_size;
			
 
				+		memcpy(btrfs_bio->csum, btrfs_failed_bio->csum + icsum,
			
 
				 		       csum_size);
			
 
				 	}
			
 
				 
			
 
				-	bio_add_page(bio, page, failrec->len, start - page_offset(page));
			
 
				+	bio_add_page(bio, page, failrec->len, pg_offset);
			
 
				+
			
 
				+	return bio;
			
 
				+}
			
 
				+
			
 
				+/*
			
 
				+ * this is a generic handler for readpage errors (default
			
 
				+ * readpage_io_failed_hook). if other copies exist, read those and write back
			
 
				+ * good data to the failed position. does not investigate in remapping the
			
 
				+ * failed extent elsewhere, hoping the device will be smart enough to do this as
			
 
				+ * needed
			
 
				+ */
			
 
				+
			
 
				+static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset,
			
 
				+			      struct page *page, u64 start, u64 end,
			
 
				+			      int failed_mirror)
			
 
				+{
			
 
				+	struct io_failure_record *failrec;
			
 
				+	struct inode *inode = page->mapping->host;
			
 
				+	struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
			
 
				+	struct bio *bio;
			
 
				+	int read_mode;
			
 
				+	int ret;
			
 
				+
			
 
				+	BUG_ON(failed_bio->bi_rw & REQ_WRITE);
			
 
				+
			
 
				+	ret = btrfs_get_io_failure_record(inode, start, end, &failrec);
			
 
				+	if (ret)
			
 
				+		return ret;
			
 
				+
			
 
				+	ret = btrfs_check_repairable(inode, failed_bio, failrec, failed_mirror);
			
 
				+	if (!ret) {
			
 
				+		free_io_failure(inode, failrec);
			
 
				+		return -EIO;
			
 
				+	}
			
 
				+
			
 
				+	if (failed_bio->bi_vcnt > 1)
			
 
				+		read_mode = READ_SYNC | REQ_FAILFAST_DEV;
			
 
				+	else
			
 
				+		read_mode = READ_SYNC;
			
 
				+
			
 
				+	phy_offset >>= inode->i_sb->s_blocksize_bits;
			
 
				+	bio = btrfs_create_repair_bio(inode, failed_bio, failrec, page,
			
 
				+				      start - page_offset(page),
			
 
				+				      (int)phy_offset, failed_bio->bi_end_io);
			
 
				+	if (!bio) {
			
 
				+		free_io_failure(inode, failrec);
			
 
				+		return -EIO;
			
 
				+	}
			
 
				 
			
 
				-	pr_debug("bio_readpage_error: submitting new read[%#x] to "
			
 
				-		 "this_mirror=%d, num_copies=%d, in_validation=%d\n", read_mode,
			
 
				-		 failrec->this_mirror, num_copies, failrec->in_validation);
			
 
				+	pr_debug("Repair Read Error: submitting new read[%#x] to this_mirror=%d, in_validation=%d\n",
			
 
				+		 read_mode, failrec->this_mirror, failrec->in_validation);
			
 
				 
			
 
				 	ret = tree->ops->submit_bio_hook(inode, read_mode, bio,
			
 
				 					 failrec->this_mirror,
			
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -344,6 +344,34 @@ int repair_io_failure(struct btrfs_fs_info *fs_info, u64 start,
 
				 int end_extent_writepage(struct page *page, int err, u64 start, u64 end);
			
 
				 int repair_eb_io_failure(struct btrfs_root *root, struct extent_buffer *eb,
			
 
				 			 int mirror_num);
			
 
				+
			
 
				+/*
			
 
				+ * When IO fails, either with EIO or csum verification fails, we
			
 
				+ * try other mirrors that might have a good copy of the data.  This
			
 
				+ * io_failure_record is used to record state as we go through all the
			
 
				+ * mirrors.  If another mirror has good data, the page is set up to date
			
 
				+ * and things continue.  If a good mirror can't be found, the original
			
 
				+ * bio end_io callback is called to indicate things have failed.
			
 
				+ */
			
 
				+struct io_failure_record {
			
 
				+	struct page *page;
			
 
				+	u64 start;
			
 
				+	u64 len;
			
 
				+	u64 logical;
			
 
				+	unsigned long bio_flags;
			
 
				+	int this_mirror;
			
 
				+	int failed_mirror;
			
 
				+	int in_validation;
			
 
				+};
			
 
				+
			
 
				+int btrfs_get_io_failure_record(struct inode *inode, u64 start, u64 end,
			
 
				+				struct io_failure_record **failrec_ret);
			
 
				+int btrfs_check_repairable(struct inode *inode, struct bio *failed_bio,
			
 
				+			   struct io_failure_record *failrec, int fail_mirror);
			
 
				+struct bio *btrfs_create_repair_bio(struct inode *inode, struct bio *failed_bio,
			
 
				+				    struct io_failure_record *failrec,
			
 
				+				    struct page *page, int pg_offset, int icsum,
			
 
				+				    bio_end_io_t *endio_func);
			
 
				 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
			
 
				 noinline u64 find_lock_delalloc_range(struct inode *inode,
			
 
				 				      struct extent_io_tree *tree,