@@ -61,6 +61,9 @@ xlog_recover_check_summary(
 #else
 #define xlog_recover_check_summary(log)
 #endif
+STATIC int
+xlog_do_recovery_pass(
+	struct xlog *, xfs_daddr_t, xfs_daddr_t, int, xfs_daddr_t *);
 
 /*
  * This structure is used during recovery to record the buf log items which
@@ -940,6 +943,278 @@ out_error:
 	return error;
 }
 
+/*
+ * Seek forward in the log for log record headers.
+ *
+ * Given head and tail blocks, walk forward from the tail block until we find
+ * the provided number of records or hit the head block. The return value is
+ * the number of records encountered or a negative error code. The log block
+ * and buffer pointer of the last record seen are returned in rblk and rhead
+ * respectively.
+ */
+STATIC int
+xlog_seek_logrec_hdr(
+	struct xlog		*log,
+	xfs_daddr_t		head_blk,
+	xfs_daddr_t		tail_blk,
+	int			count,
+	struct xfs_buf		*bp,
+	xfs_daddr_t		*rblk,
+	struct xlog_rec_header	**rhead,
+	bool			*wrapped)
+{
+	int			i;
+	int			error;
+	int			found = 0;
+	char			*offset = NULL;
+	xfs_daddr_t		end_blk;
+
+	*wrapped = false;
+
+	/*
+	 * Walk forward from the tail block until we hit the head or the last
+	 * block in the log.
+	 */
+	end_blk = head_blk > tail_blk ? head_blk : log->l_logBBsize - 1;
+	for (i = (int) tail_blk; i <= end_blk; i++) {
+		error = xlog_bread(log, i, 1, bp, &offset);
+		if (error)
+			goto out_error;
+
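+		/*
+		 * Record headers are identified by the magic in the first word
+		 * of the block; log data cannot alias it because the cycle
+		 * number is stamped over the first word of every data block
+		 * when a record is written out.
+		 */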
+		if (*(__be32 *)offset == cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) {
+			*rblk = i;
+			*rhead = (struct xlog_rec_header *) offset;
+			if (++found == count)
+				break;
+		}
+	}
+
+	/*
+	 * If we haven't hit the head block or the log record header count,
+	 * start looking again from the start of the physical log.
+	 */
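+	/* tail_blk > head_blk means the active range wraps the physical log */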
+	if (tail_blk > head_blk && found != count) {
+		for (i = 0; i < (int) head_blk; i++) {
+			error = xlog_bread(log, i, 1, bp, &offset);
+			if (error)
+				goto out_error;
+
+			if (*(__be32 *)offset ==
+			    cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) {
+				*wrapped = true;
+				*rblk = i;
+				*rhead = (struct xlog_rec_header *) offset;
+				if (++found == count)
+					break;
+			}
+		}
+	}
+
+	return found;
+
+out_error:
+	return error;
+}
+
+/*
+ * Check the log tail for torn writes. This is required when torn writes are
+ * detected at the head and the head had to be walked back to a previous
+ * record. The tail of the previous record must now be verified to ensure the
+ * torn writes didn't corrupt the previous tail.
+ *
+ * Return an error if CRC verification fails as recovery cannot proceed.
+ */
+STATIC int
+xlog_verify_tail(
+	struct xlog		*log,
+	xfs_daddr_t		head_blk,
+	xfs_daddr_t		tail_blk)
+{
+	struct xlog_rec_header	*thead;
+	struct xfs_buf		*bp;
+	xfs_daddr_t		first_bad;
+	int			count;
+	int			error = 0;
+	bool			wrapped;
+	xfs_daddr_t		tmp_head;
+
+	bp = xlog_get_bp(log, 1);
+	if (!bp)
+		return -ENOMEM;
+
+	/*
+	 * Seek XLOG_MAX_ICLOGS + 1 records past the current tail record to get
+	 * a temporary head block that points after the last possible
+	 * concurrently written record of the tail.
+	 */
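+	/*
+	 * (XLOG_MAX_ICLOGS bounds how many record I/Os can be in flight at
+	 * once, so any record that could have torn alongside the tail record
+	 * lies within that many records of it.)
+	 */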
+	count = xlog_seek_logrec_hdr(log, head_blk, tail_blk,
+				     XLOG_MAX_ICLOGS + 1, bp, &tmp_head, &thead,
+				     &wrapped);
+	if (count < 0) {
+		error = count;
+		goto out;
+	}
+
+	/*
+	 * If the call above didn't find XLOG_MAX_ICLOGS + 1 records, we ran
+	 * into the actual log head. tmp_head points to the start of the record
+	 * so update it to the actual head block.
+	 */
+	if (count < XLOG_MAX_ICLOGS + 1)
+		tmp_head = head_blk;
+
+	/*
+	 * We now have a tail and temporary head block that covers at least
+	 * XLOG_MAX_ICLOGS records from the tail. We need to verify that these
+	 * records were completely written. Run a CRC verification pass from
+	 * tail to head and return the result.
+	 */
+	error = xlog_do_recovery_pass(log, tmp_head, tail_blk,
+				      XLOG_RECOVER_CRCPASS, &first_bad);
+
+out:
+	xlog_put_bp(bp);
+	return error;
+}
+
+/*
+ * Detect and trim torn writes from the head of the log.
+ *
+ * Storage without sector atomicity guarantees can result in torn writes in the
+ * log in the event of a crash. Our only means to detect this scenario is via
+ * CRC verification. While we can't always be certain that CRC verification
+ * failure is due to a torn write vs. an unrelated corruption, we do know that
+ * only a certain number (XLOG_MAX_ICLOGS) of log records can be written out at
+ * one time. Therefore, CRC verify up to XLOG_MAX_ICLOGS records at the head of
+ * the log and treat failures in this range as torn writes as a matter of
+ * policy. In the event of CRC failure, the head is walked back to the last
+ * good record in the log and the tail is updated from that record and
+ * verified.
+ */
+STATIC int
+xlog_verify_head(
+	struct xlog		*log,
+	xfs_daddr_t		*head_blk,	/* in/out: unverified head */
+	xfs_daddr_t		*tail_blk,	/* out: tail block */
+	struct xfs_buf		*bp,
+	xfs_daddr_t		*rhead_blk,	/* start blk of last record */
+	struct xlog_rec_header	**rhead,	/* ptr to last record */
+	bool			*wrapped)	/* last rec. wraps phys. log */
+{
+	struct xlog_rec_header	*tmp_rhead;
+	struct xfs_buf		*tmp_bp;
+	xfs_daddr_t		first_bad;
+	xfs_daddr_t		tmp_rhead_blk;
+	int			found;
+	int			error;
+	bool			tmp_wrapped;
+
+	/*
+	 * Search backwards through the log looking for the log record header
+	 * block. This wraps all the way back around to the head so something
+	 * is seriously wrong if we can't find it.
+	 */
+	found = xlog_rseek_logrec_hdr(log, *head_blk, *head_blk, 1, bp,
+				      rhead_blk, rhead, wrapped);
+	if (found < 0)
+		return found;
+	if (!found) {
+		xfs_warn(log->l_mp, "%s: couldn't find sync record", __func__);
+		return -EIO;
+	}
+
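+	/*
+	 * h_tail_lsn records where the log tail sat when this record was
+	 * written; BLOCK_LSN() extracts the block number from that LSN.
+	 */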
+	*tail_blk = BLOCK_LSN(be64_to_cpu((*rhead)->h_tail_lsn));
+
+	/*
+	 * Now that we have a tail block, check the head of the log for torn
+	 * writes. Search again until we hit the tail or the maximum number of
+	 * log record I/Os that could have been in flight at one time. Use a
+	 * temporary buffer so we don't trash the rhead/bp pointers from the
+	 * call above.
+	 */
+	tmp_bp = xlog_get_bp(log, 1);
+	if (!tmp_bp)
+		return -ENOMEM;
+	error = xlog_rseek_logrec_hdr(log, *head_blk, *tail_blk,
+				      XLOG_MAX_ICLOGS, tmp_bp, &tmp_rhead_blk,
+				      &tmp_rhead, &tmp_wrapped);
+	xlog_put_bp(tmp_bp);
+	if (error < 0)
+		return error;
+
+	/*
+	 * Now run a CRC verification pass over the records starting at the
+	 * block found above to the current head. If a CRC failure occurs, the
+	 * log block of the first bad record is saved in first_bad.
+	 */
+	error = xlog_do_recovery_pass(log, *head_blk, tmp_rhead_blk,
+				      XLOG_RECOVER_CRCPASS, &first_bad);
+	if (error == -EFSBADCRC) {
+		/*
+		 * We've hit a potential torn write. Reset the error and warn
+		 * about it.
+		 */
+		error = 0;
+		xfs_warn(log->l_mp,
+"Torn write (CRC failure) detected at log block 0x%llx. Truncating head block from 0x%llx.",
+			 first_bad, *head_blk);
+
+		/*
+		 * Get the header block and buffer pointer for the last good
+		 * record before the bad record.
+		 *
+		 * Note that xlog_find_tail() clears the blocks at the new head
+		 * (i.e., the records with invalid CRC) if the cycle number
+		 * matches the current cycle.
+		 */
+		found = xlog_rseek_logrec_hdr(log, first_bad, *tail_blk, 1,
+					      bp, rhead_blk, rhead, wrapped);
+		if (found < 0)
+			return found;
+		if (found == 0)		/* XXX: right thing to do here? */
+			return -EIO;
+
+		/*
+		 * Reset the head block to the starting block of the first bad
+		 * log record and set the tail block based on the last good
+		 * record.
+		 *
+		 * Bail out if the updated head/tail match as this indicates
+		 * possible corruption outside of the acceptable
+		 * (XLOG_MAX_ICLOGS) range. This is a job for xfs_repair...
+		 */
+		*head_blk = first_bad;
+		*tail_blk = BLOCK_LSN(be64_to_cpu((*rhead)->h_tail_lsn));
+		if (*head_blk == *tail_blk) {
+			ASSERT(0);
+			return 0;
+		}
+
+		/*
+		 * Now verify the tail based on the updated head. This is
+		 * required because the torn writes trimmed from the head could
+		 * have been written over the tail of a previous record. Return
+		 * any errors since recovery cannot proceed if the tail is
+		 * corrupt.
+		 *
+		 * XXX: This leaves a gap in truly robust protection from torn
+		 * writes in the log. If the head is behind the tail, the tail
+		 * pushes forward to create some space; if a crash then tears
+		 * the writes into the previous record's tail region, log
+		 * recovery isn't able to recover.
+		 *
+		 * How likely is this to occur? If possible, can we do something
+		 * more intelligent here? Is it safe to push the tail forward if
+		 * we can determine that the tail is within the range of the
+		 * torn write (e.g., the kernel can only overwrite the tail if
+		 * it has actually been pushed forward)? Alternatively, could we
+		 * somehow prevent this condition at runtime?
+		 */
+		error = xlog_verify_tail(log, *head_blk, *tail_blk);
+	}
+
+	return error;
+}
+
 /*
  * Find the sync block number or the tail of the log.
  *
@@ -966,9 +1241,10 @@ xlog_find_tail(
 	xlog_op_header_t	*op_head;
 	char			*offset = NULL;
 	xfs_buf_t		*bp;
-	int			error, i, found;
+	int			error;
 	xfs_daddr_t		umount_data_blk;
 	xfs_daddr_t		after_umount_blk;
+	xfs_daddr_t		rhead_blk;
 	xfs_lsn_t		tail_lsn;
 	int			hblks;
 	bool			wrapped = false;
@@ -995,24 +1271,16 @@ xlog_find_tail(
 	}
 
 	/*
-	 * Search backwards through the log looking for the log record header
-	 * block. This wraps all the way back around to the head so something is
-	 * seriously wrong if we can't find it.
+	 * Trim the head block back to skip over torn records. We can have
+	 * multiple log I/Os in flight at any time, so we assume CRC failures
+	 * back through the previous several records are torn writes and skip
+	 * them.
 	 */
 	ASSERT(*head_blk < INT_MAX);
-	found = xlog_rseek_logrec_hdr(log, *head_blk, *head_blk, 1, bp, &i,
-				      &rhead, &wrapped);
-	if (found < 0) {
-		error = found;
+	error = xlog_verify_head(log, head_blk, tail_blk, bp, &rhead_blk,
+				 &rhead, &wrapped);
+	if (error)
 		goto done;
-	}
-	if (!found) {
-		xfs_warn(log->l_mp, "%s: couldn't find sync record", __func__);
-		xlog_put_bp(bp);
-		ASSERT(0);
-		return -EIO;
-	}
-	*tail_blk = BLOCK_LSN(be64_to_cpu(rhead->h_tail_lsn));
 
 	/*
 	 * Reset log values according to the state of the log when we
@@ -1024,7 +1292,7 @@ xlog_find_tail(
 	 * written was complete and ended exactly on the end boundary
 	 * of the physical log.
 	 */
-	log->l_prev_block = i;
+	log->l_prev_block = rhead_blk;
 	log->l_curr_block = (int)*head_blk;
 	log->l_curr_cycle = be32_to_cpu(rhead->h_cycle);
 	if (wrapped)
@@ -1062,12 +1330,13 @@ xlog_find_tail(
 	} else {
 		hblks = 1;
 	}
-	after_umount_blk = (i + hblks + (int)
-		BTOBB(be32_to_cpu(rhead->h_len))) % log->l_logBBsize;
+	after_umount_blk = rhead_blk + hblks + BTOBB(be32_to_cpu(rhead->h_len));
+	after_umount_blk = do_mod(after_umount_blk, log->l_logBBsize);
 	tail_lsn = atomic64_read(&log->l_tail_lsn);
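+	/*
+	 * A clean unmount writes a single-op unmount record as the very last
+	 * record in the log. If the head sits just past such a record, read
+	 * its payload to confirm the unmount.
+	 */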
 	if (*head_blk == after_umount_blk &&
 	    be32_to_cpu(rhead->h_num_logops) == 1) {
-		umount_data_blk = (i + hblks) % log->l_logBBsize;
+		umount_data_blk = rhead_blk + hblks;
+		umount_data_blk = do_mod(umount_data_blk, log->l_logBBsize);
 		error = xlog_bread(log, umount_data_blk, 1, bp, &offset);
 		if (error)
 			goto done;