@@ -61,6 +61,9 @@ xlog_recover_check_summary(
 #else
 #define xlog_recover_check_summary(log)
 #endif
+STATIC int
+xlog_do_recovery_pass(
+	struct xlog *, xfs_daddr_t, xfs_daddr_t, int, xfs_daddr_t *);
 
 /*
  * This structure is used during recovery to record the buf log items which
@@ -940,6 +943,278 @@ out_error:
 	return error;
 }
 
+/*
+ * Seek forward in the log for log record headers.
+ *
+ * Given head and tail blocks, walk forward from the tail block until we find
+ * the provided number of records or hit the head block. The return value is
+ * the number of records encountered or a negative error code. The log block
+ * and buffer pointer of the last record seen are returned in rblk and rhead
+ * respectively.
+ */
+STATIC int
+xlog_seek_logrec_hdr(
+	struct xlog		*log,
+	xfs_daddr_t		head_blk,
+	xfs_daddr_t		tail_blk,
+	int			count,
+	struct xfs_buf		*bp,
+	xfs_daddr_t		*rblk,
+	struct xlog_rec_header	**rhead,
+	bool			*wrapped)
+{
+	int			i;
+	int			error;
+	int			found = 0;
+	char			*offset = NULL;
+	xfs_daddr_t		end_blk;
+
+	*wrapped = false;
+
+	/*
+	 * Walk forward from the tail block until we hit the head or the last
+	 * block in the log.
+	 */
+	end_blk = head_blk > tail_blk ? head_blk : log->l_logBBsize - 1;
+	for (i = (int) tail_blk; i <= end_blk; i++) {
+		error = xlog_bread(log, i, 1, bp, &offset);
+		if (error)
+			goto out_error;
+
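+		/*
+		 * Record headers are identified by the magic in the first word
+		 * of the block; log data cannot alias it because the cycle
+		 * number is stamped over the first word of every data block
+		 * when a record is written out.
+		 */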
+		if (*(__be32 *)offset == cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) {
+			*rblk = i;
+			*rhead = (struct xlog_rec_header *) offset;
+			if (++found == count)
+				break;
+		}
+	}
+
+	/*
+	 * If we haven't hit the head block or the log record header count,
+	 * start looking again from the start of the physical log.
+	 */
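+	/* tail_blk > head_blk means the active range wraps the physical log */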
+	if (tail_blk > head_blk && found != count) {
+		for (i = 0; i < (int) head_blk; i++) {
+			error = xlog_bread(log, i, 1, bp, &offset);
+			if (error)
+				goto out_error;
+
+			if (*(__be32 *)offset ==
+			    cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) {
+				*wrapped = true;
+				*rblk = i;
+				*rhead = (struct xlog_rec_header *) offset;
+				if (++found == count)
+					break;
+			}
+		}
+	}
+
+	return found;
+
+out_error:
+	return error;
+}
+
+/*
+ * Check the log tail for torn writes. This is required when torn writes are
+ * detected at the head and the head had to be walked back to a previous
+ * record. The tail of the previous record must now be verified to ensure the
+ * torn writes didn't corrupt the previous tail.
+ *
+ * Return an error if CRC verification fails as recovery cannot proceed.
+ */
+STATIC int
+xlog_verify_tail(
+	struct xlog		*log,
+	xfs_daddr_t		head_blk,
+	xfs_daddr_t		tail_blk)
+{
+	struct xlog_rec_header	*thead;
+	struct xfs_buf		*bp;
+	xfs_daddr_t		first_bad;
+	int			count;
+	int			error = 0;
+	bool			wrapped;
+	xfs_daddr_t		tmp_head;
+
+	bp = xlog_get_bp(log, 1);
+	if (!bp)
+		return -ENOMEM;
+
+	/*
+	 * Seek XLOG_MAX_ICLOGS + 1 records past the current tail record to get
+	 * a temporary head block that points after the last possible
+	 * concurrently written record of the tail.
+	 */
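+	/*
+	 * (XLOG_MAX_ICLOGS bounds how many record I/Os can be in flight at
+	 * once, so any record that could have torn alongside the tail record
+	 * lies within that many records of it.)
+	 */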
+	count = xlog_seek_logrec_hdr(log, head_blk, tail_blk,
+				     XLOG_MAX_ICLOGS + 1, bp, &tmp_head, &thead,
+				     &wrapped);
+	if (count < 0) {
+		error = count;
+		goto out;
+	}
+
+	/*
+	 * If the call above didn't find XLOG_MAX_ICLOGS + 1 records, we ran
+	 * into the actual log head. tmp_head points to the start of the record
+	 * so update it to the actual head block.
+	 */
+	if (count < XLOG_MAX_ICLOGS + 1)
+		tmp_head = head_blk;
+
+	/*
+	 * We now have a tail and temporary head block that covers at least
+	 * XLOG_MAX_ICLOGS records from the tail. We need to verify that these
+	 * records were completely written. Run a CRC verification pass from
+	 * tail to head and return the result.
+	 */
+	error = xlog_do_recovery_pass(log, tmp_head, tail_blk,
+				      XLOG_RECOVER_CRCPASS, &first_bad);
+
+out:
+	xlog_put_bp(bp);
+	return error;
+}
+
+/*
+ * Detect and trim torn writes from the head of the log.
+ *
+ * Storage without sector atomicity guarantees can result in torn writes in the
+ * log in the event of a crash. Our only means to detect this scenario is via
+ * CRC verification. While we can't always be certain that CRC verification
+ * failure is due to a torn write vs. an unrelated corruption, we do know that
+ * only a certain number (XLOG_MAX_ICLOGS) of log records can be written out at
+ * one time. Therefore, CRC verify up to XLOG_MAX_ICLOGS records at the head of
+ * the log and treat failures in this range as torn writes as a matter of
+ * policy. In the event of CRC failure, the head is walked back to the last
+ * good record in the log and the tail is updated from that record and
+ * verified.
+ */
+STATIC int
+xlog_verify_head(
+	struct xlog		*log,
+	xfs_daddr_t		*head_blk,	/* in/out: unverified head */
+	xfs_daddr_t		*tail_blk,	/* out: tail block */
+	struct xfs_buf		*bp,
+	xfs_daddr_t		*rhead_blk,	/* start blk of last record */
+	struct xlog_rec_header	**rhead,	/* ptr to last record */
+	bool			*wrapped)	/* last rec. wraps phys. log */
+{
+	struct xlog_rec_header	*tmp_rhead;
+	struct xfs_buf		*tmp_bp;
+	xfs_daddr_t		first_bad;
+	xfs_daddr_t		tmp_rhead_blk;
+	int			found;
+	int			error;
+	bool			tmp_wrapped;
+
+	/*
+	 * Search backwards through the log looking for the log record header
+	 * block. This wraps all the way back around to the head so something
+	 * is seriously wrong if we can't find it.
+	 */
+	found = xlog_rseek_logrec_hdr(log, *head_blk, *head_blk, 1, bp,
+				      rhead_blk, rhead, wrapped);
+	if (found < 0)
+		return found;
+	if (!found) {
+		xfs_warn(log->l_mp, "%s: couldn't find sync record", __func__);
+		return -EIO;
+	}
+
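+	/*
+	 * h_tail_lsn records where the log tail sat when this record was
+	 * written; BLOCK_LSN() extracts the block number from that LSN.
+	 */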
+	*tail_blk = BLOCK_LSN(be64_to_cpu((*rhead)->h_tail_lsn));
+
+	/*
+	 * Now that we have a tail block, check the head of the log for torn
+	 * writes. Search again until we hit the tail or the maximum number of
+	 * log record I/Os that could have been in flight at one time. Use a
+	 * temporary buffer so we don't trash the rhead/bp pointers from the
+	 * call above.
+	 */
+	tmp_bp = xlog_get_bp(log, 1);
+	if (!tmp_bp)
+		return -ENOMEM;
+	error = xlog_rseek_logrec_hdr(log, *head_blk, *tail_blk,
+				      XLOG_MAX_ICLOGS, tmp_bp, &tmp_rhead_blk,
+				      &tmp_rhead, &tmp_wrapped);
+	xlog_put_bp(tmp_bp);
+	if (error < 0)
+		return error;
+
+	/*
+	 * Now run a CRC verification pass over the records starting at the
+	 * block found above to the current head. If a CRC failure occurs, the
+	 * log block of the first bad record is saved in first_bad.
+	 */
+	error = xlog_do_recovery_pass(log, *head_blk, tmp_rhead_blk,
+				      XLOG_RECOVER_CRCPASS, &first_bad);
+	if (error == -EFSBADCRC) {
+		/*
+		 * We've hit a potential torn write. Reset the error and warn
+		 * about it.
+		 */
+		error = 0;
+		xfs_warn(log->l_mp,
+"Torn write (CRC failure) detected at log block 0x%llx. Truncating head block from 0x%llx.",
+			 first_bad, *head_blk);
+
+		/*
+		 * Get the header block and buffer pointer for the last good
+		 * record before the bad record.
+		 *
+		 * Note that xlog_find_tail() clears the blocks at the new head
+		 * (i.e., the records with invalid CRC) if the cycle number
+		 * matches the current cycle.
+		 */
+		found = xlog_rseek_logrec_hdr(log, first_bad, *tail_blk, 1,
+					      bp, rhead_blk, rhead, wrapped);
+		if (found < 0)
+			return found;
+		if (found == 0)		/* XXX: right thing to do here? */
+			return -EIO;
+
+		/*
+		 * Reset the head block to the starting block of the first bad
+		 * log record and set the tail block based on the last good
+		 * record.
+		 *
+		 * Bail out if the updated head/tail match as this indicates
+		 * possible corruption outside of the acceptable
+		 * (XLOG_MAX_ICLOGS) range. This is a job for xfs_repair...
+		 */
+		*head_blk = first_bad;
+		*tail_blk = BLOCK_LSN(be64_to_cpu((*rhead)->h_tail_lsn));
+		if (*head_blk == *tail_blk) {
+			ASSERT(0);
+			return 0;
+		}
+
+		/*
+		 * Now verify the tail based on the updated head. This is
+		 * required because the torn writes trimmed from the head could
+		 * have been written over the tail of a previous record. Return
+		 * any errors since recovery cannot proceed if the tail is
+		 * corrupt.
+		 *
+		 * XXX: This leaves a gap in truly robust protection from torn
+		 * writes in the log. If the head is behind the tail, the tail
+		 * pushes forward to create some space; if a crash then tears
+		 * the writes into the previous record's tail region, log
+		 * recovery isn't able to recover.
+		 *
+		 * How likely is this to occur? If possible, can we do something
+		 * more intelligent here? Is it safe to push the tail forward if
+		 * we can determine that the tail is within the range of the
+		 * torn write (e.g., the kernel can only overwrite the tail if
+		 * it has actually been pushed forward)? Alternatively, could we
+		 * somehow prevent this condition at runtime?
+		 */
+		error = xlog_verify_tail(log, *head_blk, *tail_blk);
+	}
+
+	return error;
+}
+
 /*
  * Find the sync block number or the tail of the log.
  *
@@ -966,9 +1241,10 @@ xlog_find_tail(
 	xlog_op_header_t	*op_head;
 	char			*offset = NULL;
 	xfs_buf_t		*bp;
-	int			error, i, found;
+	int			error;
 	xfs_daddr_t		umount_data_blk;
 	xfs_daddr_t		after_umount_blk;
+	xfs_daddr_t		rhead_blk;
 	xfs_lsn_t		tail_lsn;
 	int			hblks;
 	bool			wrapped = false;
@@ -995,24 +1271,16 @@ xlog_find_tail(
 	}
 
 	/*
-	 * Search backwards through the log looking for the log record header
-	 * block. This wraps all the way back around to the head so something is
-	 * seriously wrong if we can't find it.
+	 * Trim the head block back to skip over torn records. We can have
+	 * multiple log I/Os in flight at any time, so we assume CRC failures
+	 * back through the previous several records are torn writes and skip
+	 * them.
 	 */
 	ASSERT(*head_blk < INT_MAX);
-	found = xlog_rseek_logrec_hdr(log, *head_blk, *head_blk, 1, bp, &i,
-				      &rhead, &wrapped);
-	if (found < 0) {
-		error = found;
+	error = xlog_verify_head(log, head_blk, tail_blk, bp, &rhead_blk,
+				 &rhead, &wrapped);
+	if (error)
 		goto done;
-	}
-	if (!found) {
-		xfs_warn(log->l_mp, "%s: couldn't find sync record", __func__);
-		xlog_put_bp(bp);
-		ASSERT(0);
-		return -EIO;
-	}
-	*tail_blk = BLOCK_LSN(be64_to_cpu(rhead->h_tail_lsn));
 
 	/*
 	 * Reset log values according to the state of the log when we
@@ -1024,7 +1292,7 @@ xlog_find_tail(
 	 * written was complete and ended exactly on the end boundary
 	 * of the physical log.
 	 */
-	log->l_prev_block = i;
+	log->l_prev_block = rhead_blk;
 	log->l_curr_block = (int)*head_blk;
 	log->l_curr_cycle = be32_to_cpu(rhead->h_cycle);
 	if (wrapped)
@@ -1062,12 +1330,13 @@ xlog_find_tail(
 	} else {
 		hblks = 1;
 	}
-	after_umount_blk = (i + hblks + (int)
-		BTOBB(be32_to_cpu(rhead->h_len))) % log->l_logBBsize;
+	after_umount_blk = rhead_blk + hblks + BTOBB(be32_to_cpu(rhead->h_len));
+	after_umount_blk = do_mod(after_umount_blk, log->l_logBBsize);
 	tail_lsn = atomic64_read(&log->l_tail_lsn);
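+	/*
+	 * A clean unmount writes a single-op unmount record as the very last
+	 * record in the log. If the head sits just past such a record, read
+	 * its payload to confirm the unmount.
+	 */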
 	if (*head_blk == after_umount_blk &&
 	    be32_to_cpu(rhead->h_num_logops) == 1) {
-		umount_data_blk = (i + hblks) % log->l_logBBsize;
+		umount_data_blk = rhead_blk + hblks;
+		umount_data_blk = do_mod(umount_data_blk, log->l_logBBsize);
 		error = xlog_bread(log, umount_data_blk, 1, bp, &offset);
 		if (error)
 			goto done;