@@ -127,23 +127,29 @@ pub struct PhysicalStorage {
127
127
/// - doesn't point to the end of the segment
128
128
file : Option < File > ,
129
129
130
- /// When false, we have just initialized storage using the LSN from find_end_of_wal().
131
- /// In this case, [`write_lsn`] can be less than actually written WAL on disk. In particular,
132
- /// there can be a case with unexpected .partial file.
130
+ /// When true, WAL truncation potentially has been interrupted and we need
131
+ /// to finish it before allowing WAL writes; see truncate_wal for details.
132
+ /// In this case [`write_lsn`] can be less than actually written WAL on
133
+ /// disk. In particular, there can be a case with unexpected .partial file.
133
134
///
134
135
/// Imagine the following:
135
136
/// - 000000010000000000000001
136
- /// - it was fully written, but the last record is split between 2 segments
137
- /// - after restart, `find_end_of_wal()` returned 0/1FFFFF0, which is in the end of this segment
138
- /// - `write_lsn`, `write_record_lsn` and `flush_record_lsn` were initialized to 0/1FFFFF0
137
+ /// - it was fully written, but the last record is split between 2
138
+ /// segments
139
+ /// - after restart, `find_end_of_wal()` returned 0/1FFFFF0, which is in
140
+ /// the end of this segment
141
+ /// - `write_lsn`, `write_record_lsn` and `flush_record_lsn` were
142
+ /// initialized to 0/1FFFFF0
139
143
/// - 000000010000000000000002.partial
140
- /// - it has only 1 byte written, which is not enough to make a full WAL record
144
+ /// - it has only 1 byte written, which is not enough to make a full WAL
145
+ /// record
141
146
///
142
- /// Partial segment 002 has no WAL records, and it will be removed by the next truncate_wal().
143
- /// This flag will be set to true after the first truncate_wal() call.
147
+ /// Partial segment 002 has no WAL records, and it will be removed by the
148
+ /// next truncate_wal(). This flag will be set to false after the first successful
149
+ /// truncate_wal() call.
144
150
///
145
151
/// [`write_lsn`]: Self::write_lsn
146
- is_truncated_after_restart : bool ,
152
+ pending_wal_truncation : bool ,
147
153
}
148
154
149
155
impl PhysicalStorage {
@@ -208,7 +214,7 @@ impl PhysicalStorage {
208
214
flush_record_lsn : flush_lsn,
209
215
decoder : WalStreamDecoder :: new ( write_lsn, state. server . pg_version / 10000 ) ,
210
216
file : None ,
211
- is_truncated_after_restart : false ,
217
+ pending_wal_truncation : true ,
212
218
} )
213
219
}
214
220
@@ -405,6 +411,13 @@ impl Storage for PhysicalStorage {
405
411
startpos
406
412
) ;
407
413
}
414
+ if self . pending_wal_truncation {
415
+ bail ! (
416
+ "write_wal called with pending WAL truncation, write_lsn={}, startpos={}" ,
417
+ self . write_lsn,
418
+ startpos
419
+ ) ;
420
+ }
408
421
409
422
let write_seconds = time_io_closure ( self . write_exact ( startpos, buf) ) . await ?;
410
423
// WAL is written, updating write metrics
@@ -479,15 +492,34 @@ impl Storage for PhysicalStorage {
479
492
) ;
480
493
}
481
494
482
- // Quick exit if nothing to do to avoid writing up to 16 MiB of zeros on
483
- // disk (this happens on each connect).
484
- if self . is_truncated_after_restart
495
+ // Quick exit if nothing to do and we know that the state is clean to
496
+ // avoid writing up to 16 MiB of zeros on disk (this happens on each
497
+ // connect).
498
+ if !self . pending_wal_truncation
485
499
&& end_pos == self . write_lsn
486
500
&& end_pos == self . flush_record_lsn
487
501
{
488
502
return Ok ( ( ) ) ;
489
503
}
490
504
505
+ // Atomicity: we start with LSNs reset because once on-disk deletion is
506
+ // started it can't be reversed. However, we might crash/error in the
507
+ // middle, leaving garbage above the truncation point. In theory,
508
+ // concatenated with previous records it might form bogus WAL (though
509
+ // very unlikely in practice because CRC would guard from that). To
510
+ // protect, set pending_wal_truncation flag before beginning: it means
511
+ // truncation must be retried and WAL writes are prohibited until it
512
+ // succeeds. Flag is also set on boot because we don't know if the last
513
+ // state was clean.
514
+ //
515
+ // Protocol (HandleElected before first AppendRequest) ensures we'll
516
+ // always attempt a clean truncation before any writes.
517
+ self . pending_wal_truncation = true ;
518
+
519
+ self . write_lsn = end_pos;
520
+ self . write_record_lsn = end_pos;
521
+ self . flush_record_lsn = end_pos;
522
+
491
523
// Close previously opened file, if any
492
524
if let Some ( unflushed_file) = self . file . take ( ) {
493
525
self . fdatasync_file ( & unflushed_file) . await ?;
@@ -513,11 +545,7 @@ impl Storage for PhysicalStorage {
513
545
fs:: rename ( wal_file_path, wal_file_partial_path) . await ?;
514
546
}
515
547
516
- // Update LSNs
517
- self . write_lsn = end_pos;
518
- self . write_record_lsn = end_pos;
519
- self . flush_record_lsn = end_pos;
520
- self . is_truncated_after_restart = true ;
548
+ self . pending_wal_truncation = false ;
521
549
Ok ( ( ) )
522
550
}
523
551
0 commit comments