Skip to content

Commit e8de5d7

Browse files
Return error instead of panicking if rewriting fails (#343)
* Return error instead of panicking if rewriting fails Signed-off-by: v01dstar <yang.zhang@pingcap.com> * Update rust version Signed-off-by: v01dstar <yang.zhang@pingcap.com> * Update rust version in github workflow Signed-off-by: v01dstar <yang.zhang@pingcap.com> * Update src/file_pipe_log/pipe.rs Co-authored-by: lucasliang <nkcs_lykx@hotmail.com> Signed-off-by: v01dstar <yang.zhang@pingcap.com> * Update src/file_pipe_log/pipe.rs Co-authored-by: lucasliang <nkcs_lykx@hotmail.com> Signed-off-by: v01dstar <yang.zhang@pingcap.com> * Address comments, fix test cases Signed-off-by: v01dstar <yang.zhang@pingcap.com> * Fix format error Signed-off-by: v01dstar <yang.zhang@pingcap.com> * Move panic inside Signed-off-by: Yang Zhang <yang.zhang@pingcap.com> * Fix clippy Signed-off-by: Yang Zhang <yang.zhang@pingcap.com> * Propagate error if writing header fails Signed-off-by: Yang Zhang <yang.zhang@pingcap.com> * Adjust write header fail expectation, from panic to error Signed-off-by: v01dstar <yang.zhang@pingcap.com> * Panic if write header fails since we do not truncate Signed-off-by: v01dstar <yang.zhang@pingcap.com> * Failure other than sync should be returned Signed-off-by: v01dstar <yang.zhang@pingcap.com> * Address comments Signed-off-by: Yang Zhang <yang.zhang@pingcap.com> * Fix test failures Signed-off-by: Yang Zhang <yang.zhang@pingcap.com> * Change test expectations Signed-off-by: Yang Zhang <yang.zhang@pingcap.com> * Address comments Signed-off-by: Yang Zhang <yang.zhang@pingcap.com> * Fix format Signed-off-by: Yang Zhang <yang.zhang@pingcap.com> * Revert sync() signature Signed-off-by: Yang Zhang <yang.zhang@pingcap.com> * Add more details to rotate test Signed-off-by: Yang Zhang <yang.zhang@pingcap.com> * Fix style Signed-off-by: Yang Zhang <yang.zhang@pingcap.com> * Address comments Signed-off-by: Yang Zhang <yang.zhang@pingcap.com> * Address comments Signed-off-by: Yang Zhang <yang.zhang@pingcap.com> * Fix clippy Signed-off-by: Yang Zhang 
<yang.zhang@pingcap.com> * Trigger Github actions Signed-off-by: Yang Zhang <yang.zhang@pingcap.com> --------- Signed-off-by: v01dstar <yang.zhang@pingcap.com> Signed-off-by: Yang Zhang <yang.zhang@pingcap.com> Co-authored-by: lucasliang <nkcs_lykx@hotmail.com>
1 parent 385182b commit e8de5d7

File tree

7 files changed

+84
-65
lines changed

7 files changed

+84
-65
lines changed

.github/workflows/rust.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,7 @@ jobs:
6060
uses: actions-rs/toolchain@v1
6161
with:
6262
profile: minimal
63-
toolchain: 1.66.0
63+
toolchain: 1.67.1
6464
override: true
6565
components: rustfmt, clippy, rust-src
6666
- uses: Swatinem/rust-cache@v1

Cargo.toml

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ name = "raft-engine"
33
version = "0.4.1"
44
authors = ["The TiKV Project Developers"]
55
edition = "2018"
6-
rust-version = "1.66.0"
6+
rust-version = "1.67.1"
77
description = "A persistent storage engine for Multi-Raft logs"
88
readme = "README.md"
99
repository = "https://github.com/tikv/raft-engine"
@@ -95,8 +95,6 @@ nightly_group = ["nightly", "swap"]
9595
raft-proto = { git = "https://github.com/tikv/raft-rs", branch = "master" }
9696
protobuf = { git = "https://github.com/pingcap/rust-protobuf", branch = "v2.8" }
9797
protobuf-codegen = { git = "https://github.com/pingcap/rust-protobuf", branch = "v2.8" }
98-
# TODO: Use official grpc-rs once https://github.com/tikv/grpc-rs/pull/622 is merged.
99-
grpcio = { git = "https://github.com/tabokie/grpc-rs", branch = "v0.10.x-win" }
10098

10199
[workspace]
102100
members = ["stress", "ctl"]

src/engine.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -172,7 +172,7 @@ where
172172
}
173173
perf_context!(log_write_duration).observe_since(now);
174174
if sync {
175-
// As per trait protocol, this error should be retriable. But we panic anyway to
175+
// As per trait protocol, sync error should be retriable. But we panic anyway to
176176
// save the trouble of propagating it to other group members.
177177
self.pipe_log.sync(LogQueue::Append).expect("pipe::sync()");
178178
}

src/file_pipe_log/log_file.rs

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,8 @@ pub struct LogFileWriter<F: FileSystem> {
4343
capacity: usize,
4444
}
4545

46+
// All APIs provided by `LogFileWriter` are fail-safe, i.e. caller can continue
47+
// using the same "writer" even if the previous operation failed.
4648
impl<F: FileSystem> LogFileWriter<F> {
4749
fn open(
4850
handle: Arc<F::Handle>,
@@ -67,7 +69,7 @@ impl<F: FileSystem> LogFileWriter<F> {
6769
}
6870

6971
fn write_header(&mut self, format: LogFileFormat) -> IoResult<()> {
70-
self.writer.seek(SeekFrom::Start(0))?;
72+
self.writer.rewind()?;
7173
self.written = 0;
7274
let mut buf = Vec::with_capacity(LogFileFormat::encoded_len(format.version));
7375
format.encode(&mut buf).unwrap();
@@ -119,7 +121,8 @@ impl<F: FileSystem> LogFileWriter<F> {
119121

120122
pub fn sync(&mut self) -> IoResult<()> {
121123
let _t = StopWatch::new(&*LOG_SYNC_DURATION_HISTOGRAM);
122-
self.handle.sync()?;
124+
// Panic if sync fails, to guard against data loss.
125+
self.handle.sync().unwrap();
123126
Ok(())
124127
}
125128

src/file_pipe_log/pipe.rs

Lines changed: 11 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -177,8 +177,13 @@ impl<F: FileSystem> SinglePipe<F> {
177177

178178
// Skip syncing directory in Windows. Refer to badger's discussion for more
179179
// detail: https://github.com/dgraph-io/badger/issues/699
180+
//
181+
// Panic if sync calls fail, to stay consistent with the behavior of
182+
// `LogFileWriter::sync()`.
180183
#[cfg(not(windows))]
181-
std::fs::File::open(PathBuf::from(&self.paths[path_id])).and_then(|d| d.sync_all())?;
184+
std::fs::File::open(PathBuf::from(&self.paths[path_id]))
185+
.and_then(|d| d.sync_all())
186+
.unwrap();
182187
Ok(())
183188
}
184189

@@ -321,12 +326,7 @@ impl<F: FileSystem> SinglePipe<F> {
321326
fail_point!("file_pipe_log::append");
322327
let mut writable_file = self.writable_file.lock();
323328
if writable_file.writer.offset() >= self.target_file_size {
324-
if let Err(e) = self.rotate_imp(&mut writable_file) {
325-
panic!(
326-
"error when rotate [{:?}:{}]: {e}",
327-
self.queue, writable_file.seq,
328-
);
329-
}
329+
self.rotate_imp(&mut writable_file)?;
330330
}
331331

332332
let seq = writable_file.seq;
@@ -359,9 +359,7 @@ impl<F: FileSystem> SinglePipe<F> {
359359
}
360360
let start_offset = writer.offset();
361361
if let Err(e) = writer.write(bytes.as_bytes(&ctx), self.target_file_size) {
362-
if let Err(te) = writer.truncate() {
363-
panic!("error when truncate {seq} after error: {e}, get: {}", te);
364-
}
362+
writer.truncate()?;
365363
if is_no_space_err(&e) {
366364
// TODO: There exist several corner cases that should be tackled if
367365
// `bytes.len()` > `target_file_size`. For example,
@@ -372,12 +370,7 @@ impl<F: FileSystem> SinglePipe<F> {
372370
// - [3] Both main-dir and spill-dir have several recycled logs.
373371
// But as `bytes.len()` is always smaller than `target_file_size` in common
374372
// cases, this issue will be ignored temporarily.
375-
if let Err(e) = self.rotate_imp(&mut writable_file) {
376-
panic!(
377-
"error when rotate [{:?}:{}]: {e}",
378-
self.queue, writable_file.seq
379-
);
380-
}
373+
self.rotate_imp(&mut writable_file)?;
381374
// If there still exists free space for this record, rotate the file
382375
// and return a special TryAgain Err (for retry) to the caller.
383376
return Err(Error::TryAgain(format!(
@@ -403,15 +396,9 @@ impl<F: FileSystem> SinglePipe<F> {
403396

404397
fn sync(&self) -> Result<()> {
405398
let mut writable_file = self.writable_file.lock();
406-
let seq = writable_file.seq;
407399
let writer = &mut writable_file.writer;
408-
{
409-
let _t = StopWatch::new(perf_context!(log_sync_duration));
410-
if let Err(e) = writer.sync() {
411-
panic!("error when sync [{:?}:{seq}]: {e}", self.queue);
412-
}
413-
}
414-
400+
let _t = StopWatch::new(perf_context!(log_sync_duration));
401+
writer.sync().map_err(Error::Io)?;
415402
Ok(())
416403
}
417404

src/purge.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -439,7 +439,7 @@ where
439439
)?;
440440
let file_handle = self.pipe_log.append(LogQueue::Rewrite, log_batch)?;
441441
if sync {
442-
self.pipe_log.sync(LogQueue::Rewrite)?
442+
self.pipe_log.sync(LogQueue::Rewrite)?;
443443
}
444444
log_batch.finish_write(file_handle);
445445
self.memtables.apply_rewrite_writes(

tests/failpoints/test_io_error.rs

Lines changed: 64 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -124,8 +124,7 @@ fn test_file_write_error() {
124124
assert_eq!(engine.last_index(2).unwrap(), 1);
125125
}
126126

127-
#[test]
128-
fn test_file_rotate_error() {
127+
fn test_file_rotate_error(restart_after_failure: bool) {
129128
let dir = tempfile::Builder::new()
130129
.prefix("test_file_rotate_error")
131130
.tempdir()
@@ -138,7 +137,7 @@ fn test_file_rotate_error() {
138137
let fs = Arc::new(ObfuscatedFileSystem::default());
139138
let entry = vec![b'x'; 1024];
140139

141-
let engine = Engine::open_with_file_system(cfg.clone(), fs.clone()).unwrap();
140+
let mut engine = Engine::open_with_file_system(cfg.clone(), fs.clone()).unwrap();
142141
engine
143142
.write(&mut generate_batch(1, 1, 2, Some(&entry)), false)
144143
.unwrap();
@@ -160,27 +159,46 @@ fn test_file_rotate_error() {
160159
let _ = engine.write(&mut generate_batch(1, 4, 5, Some(&entry)), false);
161160
})
162161
.is_err());
163-
assert_eq!(engine.file_span(LogQueue::Append).1, 1);
164162
}
163+
if restart_after_failure {
164+
drop(engine);
165+
engine = Engine::open_with_file_system(cfg.clone(), fs.clone()).unwrap();
166+
}
167+
assert_eq!(engine.file_span(LogQueue::Append).1, 1);
165168
{
166169
// Fail to create new log file.
167170
let _f = FailGuard::new("default_fs::create::err", "return");
168-
assert!(catch_unwind_silent(|| {
169-
let _ = engine.write(&mut generate_batch(1, 4, 5, Some(&entry)), false);
170-
})
171-
.is_err());
172-
assert_eq!(engine.file_span(LogQueue::Append).1, 1);
171+
assert!(engine
172+
.write(&mut generate_batch(1, 4, 5, Some(&entry)), false)
173+
.is_err());
173174
}
175+
if restart_after_failure {
176+
drop(engine);
177+
engine = Engine::open_with_file_system(cfg.clone(), fs.clone()).unwrap();
178+
}
179+
let num_files_before = std::fs::read_dir(&dir).unwrap().count();
174180
{
175181
// Fail to write header of new log file.
176182
let _f = FailGuard::new("log_file::write::err", "1*off->return");
177-
assert!(catch_unwind_silent(|| {
178-
let _ = engine.write(&mut generate_batch(1, 4, 5, Some(&entry)), false);
179-
})
180-
.is_err());
183+
assert!(engine
184+
.write(&mut generate_batch(1, 4, 5, Some(&entry)), false)
185+
.is_err());
186+
}
187+
if restart_after_failure {
188+
drop(engine);
189+
engine = Engine::open_with_file_system(cfg.clone(), fs.clone()).unwrap();
190+
// The new log file is added during recovery phase of restart.
191+
assert_eq!(engine.file_span(LogQueue::Append).1, 2);
192+
} else {
181193
assert_eq!(engine.file_span(LogQueue::Append).1, 1);
182194
}
183-
{
195+
// Although the header is not written, the file is still created.
196+
assert_eq!(
197+
std::fs::read_dir(&dir).unwrap().count() - num_files_before,
198+
1
199+
);
200+
if !restart_after_failure {
201+
// If the engine restarted, a write that does not require sync will succeed.
184202
// Fail to sync new log file. The old log file is already sync-ed at this point.
185203
let _f = FailGuard::new("log_fd::sync::err", "return");
186204
assert!(catch_unwind_silent(|| {
@@ -190,18 +208,39 @@ fn test_file_rotate_error() {
190208
assert_eq!(engine.file_span(LogQueue::Append).1, 1);
191209
}
192210

211+
// Only one log file should be created after all the incidents.
212+
assert_eq!(
213+
std::fs::read_dir(&dir).unwrap().count() - num_files_before,
214+
1
215+
);
193216
// We can continue writing after the incidents.
194217
engine
195218
.write(&mut generate_batch(2, 1, 2, Some(&entry)), true)
196219
.unwrap();
197-
drop(engine);
198-
let engine = Engine::open_with_file_system(cfg, fs).unwrap();
220+
if restart_after_failure {
221+
drop(engine);
222+
engine = Engine::open_with_file_system(cfg, fs).unwrap();
223+
}
224+
assert_eq!(
225+
std::fs::read_dir(&dir).unwrap().count() - num_files_before,
226+
1
227+
);
199228
assert_eq!(engine.first_index(1).unwrap(), 1);
200229
assert_eq!(engine.last_index(1).unwrap(), 4);
201230
assert_eq!(engine.first_index(2).unwrap(), 1);
202231
assert_eq!(engine.last_index(2).unwrap(), 1);
203232
}
204233

234+
#[test]
235+
fn test_file_rotate_error_without_restart() {
236+
test_file_rotate_error(false);
237+
}
238+
239+
#[test]
240+
fn test_file_rotate_error_with_restart() {
241+
test_file_rotate_error(true);
242+
}
243+
205244
#[test]
206245
fn test_concurrent_write_error() {
207246
let dir = tempfile::Builder::new()
@@ -262,10 +301,8 @@ fn test_concurrent_write_error() {
262301
let _f2 = FailGuard::new("log_file::truncate::err", "return");
263302
let entry_clone = entry.clone();
264303
ctx.write_ext(move |e| {
265-
catch_unwind_silent(|| {
266-
e.write(&mut generate_batch(1, 11, 21, Some(&entry_clone)), false)
267-
})
268-
.unwrap_err();
304+
e.write(&mut generate_batch(1, 11, 21, Some(&entry_clone)), false)
305+
.unwrap_err();
269306
});
270307
// We don't test followers, their panics are hard to catch.
271308
ctx.join();
@@ -527,20 +564,17 @@ fn test_no_space_write_error() {
527564
cfg.dir = dir.path().to_str().unwrap().to_owned();
528565
cfg.spill_dir = Some(spill_dir.path().to_str().unwrap().to_owned());
529566
{
530-
// Case 1: `Write` is abnormal for no space left, Engine should panic at
567+
// Case 1: `Write` is abnormal for no space left, Engine should fail at
531568
// `rotate`.
532569
let cfg_err = Config {
533570
target_file_size: ReadableSize(1),
534571
..cfg.clone()
535572
};
536573
let engine = Engine::open(cfg_err).unwrap();
537574
let _f = FailGuard::new("log_fd::write::no_space_err", "return");
538-
assert!(catch_unwind_silent(|| {
539-
engine
540-
.write(&mut generate_batch(2, 11, 21, Some(&entry)), true)
541-
.unwrap_err();
542-
})
543-
.is_err());
575+
assert!(engine
576+
.write(&mut generate_batch(2, 11, 21, Some(&entry)), true)
577+
.is_err());
544578
assert_eq!(
545579
0,
546580
engine
@@ -554,12 +588,9 @@ fn test_no_space_write_error() {
554588
let _f1 = FailGuard::new("log_fd::write::no_space_err", "2*return->off");
555589
let _f2 = FailGuard::new("file_pipe_log::force_choose_dir", "return");
556590
// The first write should fail, because all dirs run out of space for writing.
557-
assert!(catch_unwind_silent(|| {
558-
engine
559-
.write(&mut generate_batch(2, 11, 21, Some(&entry)), true)
560-
.unwrap_err();
561-
})
562-
.is_err());
591+
assert!(engine
592+
.write(&mut generate_batch(2, 11, 21, Some(&entry)), true)
593+
.is_err());
563594
assert_eq!(
564595
0,
565596
engine

0 commit comments

Comments
 (0)