Skip to content

Commit 7e7b413

Browse files
committed
Fix kPointInTimeRecovery handling of truncated WAL (facebook#7701)
Summary: WAL may be truncated to an incomplete record due to crash while writing the last record or corruption. In the former case, no hole will be produced since no ACK'd data was lost. In the latter case, a hole could be produced without this PR since we proceeded to recover the next WAL as if nothing happened. This PR changes the record reading code to always report a corruption for incomplete records in `kPointInTimeRecovery` mode, and the upper layer will only ignore them if the next WAL has consecutive seqnum (i.e., we are guaranteed no hole). While this solves the hole problem for the case of incomplete records, the possibility is still there if the WAL is corrupted by truncation to an exact record boundary. Pull Request resolved: facebook#7701 Test Plan: Interestingly there already was a test for this case (`DBWALTestWithParams.kPointInTimeRecovery`); it just had a typo bug in the verification that prevented it from noticing holes in recovery. Reviewed By: anand1976 Differential Revision: D25111765 Pulled By: ajkr fbshipit-source-id: 5e330b13b1ee2b5be096cea9d0ff6075843e57b6
1 parent 9526da8 commit 7e7b413

File tree

4 files changed

+59
-26
lines changed

4 files changed

+59
-26
lines changed

HISTORY.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,8 @@
11
# Rocksdb Change Log
2+
## Unreleased
3+
### Bug Fixes
4+
* Truncated WALs ending in incomplete records can no longer produce gaps in the recovered data when `WALRecoveryMode::kPointInTimeRecovery` is used. Gaps are still possible when WALs are truncated exactly on record boundaries; for complete protection, users should enable `track_and_verify_wals_in_manifest`.
5+
26
## 6.14.5 (11/15/2020)
37
### Bug Fixes
48
* Fix a bug of encoding and parsing BlockBasedTableOptions::read_amp_bytes_per_bit as a 64-bit integer.

db/db_wal_test.cc

Lines changed: 12 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1366,13 +1366,19 @@ TEST_P(DBWALTestWithParams, kPointInTimeRecovery) {
13661366
size_t recovered_row_count = RecoveryTestHelper::GetData(this);
13671367
ASSERT_LT(recovered_row_count, row_count);
13681368

1369-
bool expect_data = true;
1370-
for (size_t k = 0; k < maxkeys; ++k) {
1371-
bool found = Get("key" + ToString(corrupt_offset)) != "NOT_FOUND";
1372-
if (expect_data && !found) {
1373-
expect_data = false;
1369+
// Verify a prefix of keys were recovered. But not in the case of full WAL
1370+
// truncation, because we have no way to know there was a corruption when
1371+
// truncation happened on record boundaries (preventing recovery holes in
1372+
// that case requires using `track_and_verify_wals_in_manifest`).
1373+
if (!trunc || corrupt_offset != 0) {
1374+
bool expect_data = true;
1375+
for (size_t k = 0; k < maxkeys; ++k) {
1376+
bool found = Get("key" + ToString(k)) != "NOT_FOUND";
1377+
if (expect_data && !found) {
1378+
expect_data = false;
1379+
}
1380+
ASSERT_EQ(found, expect_data);
13741381
}
1375-
ASSERT_EQ(found, expect_data);
13761382
}
13771383

13781384
const size_t min = RecoveryTestHelper::kKeysPerWALFile *

db/log_reader.cc

Lines changed: 41 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -119,16 +119,26 @@ bool Reader::ReadRecord(Slice* record, std::string* scratch,
119119
break;
120120

121121
case kBadHeader:
122-
if (wal_recovery_mode == WALRecoveryMode::kAbsoluteConsistency) {
123-
// in clean shutdown we don't expect any error in the log files
122+
if (wal_recovery_mode == WALRecoveryMode::kAbsoluteConsistency ||
123+
wal_recovery_mode == WALRecoveryMode::kPointInTimeRecovery) {
124+
// In clean shutdown we don't expect any error in the log files.
125+
// In point-in-time recovery an incomplete record at the end could
126+
// produce a hole in the recovered data. Report an error here, which
127+
// higher layers can choose to ignore when it's provable there is no
128+
// hole.
124129
ReportCorruption(drop_size, "truncated header");
125130
}
126131
FALLTHROUGH_INTENDED;
127132

128133
case kEof:
129134
if (in_fragmented_record) {
130-
if (wal_recovery_mode == WALRecoveryMode::kAbsoluteConsistency) {
131-
// in clean shutdown we don't expect any error in the log files
135+
if (wal_recovery_mode == WALRecoveryMode::kAbsoluteConsistency ||
136+
wal_recovery_mode == WALRecoveryMode::kPointInTimeRecovery) {
137+
// In clean shutdown we don't expect any error in the log files.
138+
// In point-in-time recovery an incomplete record at the end could
139+
// produce a hole in the recovered data. Report an error here, which
140+
// higher layers can choose to ignore when it's provable there is no
141+
// hole.
132142
ReportCorruption(scratch->size(), "error reading trailing data");
133143
}
134144
// This can be caused by the writer dying immediately after
@@ -142,8 +152,13 @@ bool Reader::ReadRecord(Slice* record, std::string* scratch,
142152
if (wal_recovery_mode != WALRecoveryMode::kSkipAnyCorruptedRecords) {
143153
// Treat a record from a previous instance of the log as EOF.
144154
if (in_fragmented_record) {
145-
if (wal_recovery_mode == WALRecoveryMode::kAbsoluteConsistency) {
146-
// in clean shutdown we don't expect any error in the log files
155+
if (wal_recovery_mode == WALRecoveryMode::kAbsoluteConsistency ||
156+
wal_recovery_mode == WALRecoveryMode::kPointInTimeRecovery) {
157+
// In clean shutdown we don't expect any error in the log files.
158+
// In point-in-time recovery an incomplete record at the end could
159+
// produce a hole in the recovered data. Report an error here,
160+
// which higher layers can choose to ignore when it's provable
161+
// there is no hole.
147162
ReportCorruption(scratch->size(), "error reading trailing data");
148163
}
149164
// This can be caused by the writer dying immediately after
@@ -164,6 +179,20 @@ bool Reader::ReadRecord(Slice* record, std::string* scratch,
164179
break;
165180

166181
case kBadRecordLen:
182+
if (eof_) {
183+
if (wal_recovery_mode == WALRecoveryMode::kAbsoluteConsistency ||
184+
wal_recovery_mode == WALRecoveryMode::kPointInTimeRecovery) {
185+
// In clean shutdown we don't expect any error in the log files.
186+
// In point-in-time recovery an incomplete record at the end could
187+
// produce a hole in the recovered data. Report an error here, which
188+
// higher layers can choose to ignore when it's provable there is no
189+
// hole.
190+
ReportCorruption(drop_size, "truncated record body");
191+
}
192+
return false;
193+
}
194+
FALLTHROUGH_INTENDED;
195+
167196
case kBadRecordChecksum:
168197
if (recycled_ &&
169198
wal_recovery_mode ==
@@ -355,18 +384,14 @@ unsigned int Reader::ReadPhysicalRecord(Slice* result, size_t* drop_size) {
355384
}
356385
}
357386
if (header_size + length > buffer_.size()) {
387+
assert(buffer_.size() >= static_cast<size_t>(header_size));
358388
*drop_size = buffer_.size();
359389
buffer_.clear();
360-
if (!eof_) {
361-
return kBadRecordLen;
362-
}
363-
// If the end of the file has been reached without reading |length|
364-
// bytes of payload, assume the writer died in the middle of writing the
365-
// record. Don't report a corruption unless requested.
366-
if (*drop_size) {
367-
return kBadHeader;
368-
}
369-
return kEof;
390+
// If the end of the read has been reached without seeing
391+
// `header_size + length` bytes of payload, report a corruption. The
392+
// higher layers can decide how to handle it based on the recovery mode,
393+
// whether this occurred at EOF, whether this is the final WAL, etc.
394+
return kBadRecordLen;
370395
}
371396

372397
if (type == kZeroType && length == 0) {

db/log_test.cc

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -465,7 +465,7 @@ TEST_P(LogTest, BadLengthAtEndIsNotIgnored) {
465465
ShrinkSize(1);
466466
ASSERT_EQ("EOF", Read(WALRecoveryMode::kAbsoluteConsistency));
467467
ASSERT_GT(DroppedBytes(), 0U);
468-
ASSERT_EQ("OK", MatchError("Corruption: truncated header"));
468+
ASSERT_EQ("OK", MatchError("Corruption: truncated record body"));
469469
}
470470

471471
TEST_P(LogTest, ChecksumMismatch) {
@@ -573,9 +573,7 @@ TEST_P(LogTest, PartialLastIsNotIgnored) {
573573
ShrinkSize(1);
574574
ASSERT_EQ("EOF", Read(WALRecoveryMode::kAbsoluteConsistency));
575575
ASSERT_GT(DroppedBytes(), 0U);
576-
ASSERT_EQ("OK", MatchError(
577-
"Corruption: truncated headerCorruption: "
578-
"error reading trailing data"));
576+
ASSERT_EQ("OK", MatchError("Corruption: truncated record body"));
579577
}
580578

581579
TEST_P(LogTest, ErrorJoinsRecords) {

0 commit comments

Comments
 (0)