@@ -7,13 +7,16 @@ use crate::encoding::{self, Key as _, Value as _, bincode};
77use crate :: error:: Result ;
88use crate :: storage;
99
10- /// A log index. Starts at 1, indicates no index if 0 .
10+ /// A log index (entry position). Starts at 1; 0 indicates no index.
1111pub type Index = u64 ;
1212
13- /// A log entry.
13+ /// A log entry containing a state machine command.
1414#[ derive( Clone , Debug , PartialEq , Serialize , Deserialize ) ]
1515pub struct Entry {
1616 /// The entry index.
17+ ///
18+ /// We could omit the index in the encoded value, since it's also stored in
19+ /// the key, but we keep it simple.
1720 pub index : Index ,
1821 /// The term in which the entry was added.
1922 pub term : Term ,
@@ -106,15 +109,16 @@ pub struct Log {
106109 /// If true, fsync entries to disk when appended. This is mandated by Raft,
107110 /// but comes with a hefty performance penalty (especially since we don't
108111 /// optimize for it by batching entries before fsyncing). Disabling it will
109- /// yield much better write performance, but may lose data on host crashes,
110- /// which in some scenarios can cause log entries to become "uncommitted"
111- /// and state machines diverging.
112+ /// yield much better write performance, but may lose data on crashes, which
113+ /// in some scenarios can cause log entries to become "uncommitted" and
114+ /// state machines to diverge.
112115 fsync : bool ,
113116}
114117
115118impl Log {
116119 /// Initializes a log using the given storage engine.
117120 pub fn new ( mut engine : Box < dyn storage:: Engine > ) -> Result < Self > {
121+ // Load some initial in-memory state from disk.
118122 let ( term, vote) = engine
119123 . get ( & Key :: TermVote . encode ( ) ) ?
120124 . map ( |v| bincode:: deserialize ( & v) )
@@ -136,7 +140,8 @@ impl Log {
136140 . map ( |v| bincode:: deserialize ( & v) )
137141 . transpose ( ) ?
138142 . unwrap_or ( ( 0 , 0 ) ) ;
139- let fsync = true ; // fsync by default (NB: BitCask::flush() is a noop in tests)
143+
144+ let fsync = true ; // fsync by default
140145 Ok ( Self { engine, term, vote, last_index, last_term, commit_index, commit_term, fsync } )
141146 }
142147
@@ -168,11 +173,12 @@ impl Log {
168173 assert ! ( term > 0 , "can't set term 0" ) ;
169174 assert ! ( term >= self . term, "term regression {} → {}" , self . term, term) ;
170175 assert ! ( term > self . term || self . vote. is_none( ) || vote == self . vote, "can't change vote" ) ;
176+
171177 if term == self . term && vote == self . vote {
172178 return Ok ( ( ) ) ;
173179 }
174180 self . engine . set ( & Key :: TermVote . encode ( ) , bincode:: serialize ( & ( term, vote) ) ) ?;
175- // Always fsync, even with Log. fsync = false. Term changes are rare, so
181+ // Always fsync, even with Log::fsync = false. Term changes are rare, so
176182 // this doesn't materially affect performance, and double voting could
177183 // lead to multiple leaders and split brain which is really bad.
178184 self . engine . flush ( ) ?;
@@ -186,8 +192,6 @@ impl Log {
186192 /// Raft leader changes.
187193 pub fn append ( & mut self , command : Option < Vec < u8 > > ) -> Result < Index > {
188194 assert ! ( self . term > 0 , "can't append entry in term 0" ) ;
189- // We could omit the index in the encoded value, since it's also stored
190- // in the key, but we keep it simple.
191195 let entry = Entry { index : self . last_index + 1 , term : self . term , command } ;
192196 self . engine . set ( & Key :: Entry ( entry. index ) . encode ( ) , entry. encode ( ) ) ?;
193197 if self . fsync {
@@ -202,16 +206,16 @@ impl Log {
202206 /// exist and be at or after the current commit index.
203207 pub fn commit ( & mut self , index : Index ) -> Result < Index > {
204208 let term = match self . get ( index) ? {
205- Some ( e ) if e . index < self . commit_index => {
206- panic ! ( "commit index regression {} → {}" , self . commit_index, e . index) ;
209+ Some ( entry ) if entry . index < self . commit_index => {
210+ panic ! ( "commit index regression {} → {}" , self . commit_index, entry . index) ;
207211 }
208- Some ( e ) if e . index == self . commit_index => return Ok ( index) ,
209- Some ( e ) => e . term ,
212+ Some ( entry ) if entry . index == self . commit_index => return Ok ( index) ,
213+ Some ( entry ) => entry . term ,
210214 None => panic ! ( "commit index {index} does not exist" ) ,
211215 } ;
212216 self . engine . set ( & Key :: CommitIndex . encode ( ) , bincode:: serialize ( & ( index, term) ) ) ?;
213217 // NB: the commit index doesn't need to be fsynced, since the entries
214- // are fsynced and the commit index can be recovered from a log quorum.
218+ // are fsynced and the commit index can be recovered from the quorum.
215219 self . commit_index = index;
216220 self . commit_term = term;
217221 Ok ( index)
@@ -255,21 +259,22 @@ impl Log {
255259 pub fn scan_apply ( & mut self , applied_index : Index ) -> Iterator {
256260 // NB: we don't assert that commit_index >= applied_index, because the
257261 // local commit index is not flushed to durable storage -- if lost on
258- // restart, it can be recovered from a quorum of logs .
262+ // restart, it can be recovered from the logs of a quorum .
259263 if applied_index >= self . commit_index {
260264 return Iterator :: new ( Box :: new ( std:: iter:: empty ( ) ) ) ;
261265 }
262266 self . scan ( applied_index + 1 ..=self . commit_index )
263267 }
264268
265- /// Splices a set of entries into the log and flushes it to disk. The
266- /// entries must have contiguous indexes and equal/increasing terms, and the
267- /// first entry must be in the range [1,last_index+1] with a term at or
269+ /// Splices a set of entries into the log and flushes it to disk. New
270+ /// indexes will be appended. Overlapping indexes with the same term must be
271+ /// equal and will be ignored. Overlapping indexes with different terms will
272+ /// truncate the existing log at the first conflict and then splice the new
273+ /// entries.
274+ ///
275+ /// The entries must have contiguous indexes and equal/increasing terms, and
276+ /// the first entry must be in the range [1,last_index+1] with a term at or
268277 /// above the previous (base) entry's term and at or below the current term.
269- /// New indexes will be appended. Overlapping indexes with the same term
270- /// must be equal and will be ignored. Overlapping indexes with different
271- /// terms will truncate the existing log at the first conflict and then
272- /// splice the new entries.
273278 pub fn splice ( & mut self , entries : Vec < Entry > ) -> Result < Index > {
274279 let ( Some ( first) , Some ( last) ) = ( entries. first ( ) , entries. last ( ) ) else {
275280 return Ok ( self . last_index ) ; // empty input is noop
0 commit comments