100100 queries_waiting_heartbeats := queue :queue ({non_neg_integer (),
101101 consistent_query_ref ()}),
102102 pending_consistent_queries := [consistent_query_ref ()],
103- commit_latency => option (non_neg_integer ())
103+ commit_latency => option (non_neg_integer ()),
104+ snapshot_phase => chunk_flag ()
104105 }.
105106
106107-type state () :: ra_server_state ().
@@ -1445,25 +1446,27 @@ handle_follower(#install_snapshot_rpc{term = Term,
14451446 SnapIdx > LastApplied andalso
14461447 % % only install snapshot if the machine version is understood
14471448 MacVer >= SnapMacVer andalso
1448- Num =< 1 ->
1449+ Num =< 1 andalso
1450+ ChunkFlag /= pre ->
14491451 % % only begin snapshot procedure if Idx is higher than the last_applied
14501452 % % index.
1451- ? DEBUG (" ~ts : begin_accept snapshot at index ~b in term ~b " ,
1452- [LogId , SnapIdx , Term ]),
1453+ ? DEBUG (" ~ts : begin_accept snapshot at index ~b in term ~b , phase ~s " ,
1454+ [LogId , SnapIdx , Term , ChunkFlag ]),
14531455 SnapState0 = ra_log :snapshot_state (Log0 ),
14541456 {ok , SS } = ra_snapshot :begin_accept (Meta , SnapState0 ),
14551457 Log1 = ra_log :set_snapshot_state (SS , Log0 ),
14561458
14571459 % % if the snapshot includes pre entries (live entries) then we need
14581460 % % to reset the log to the last applied index to avoid issues
14591461 Log = case ChunkFlag of
1460- pre ->
1462+ init ->
14611463 {ok , L } = ra_log :set_last_index (LastApplied , Log1 ),
14621464 L ;
14631465 _ ->
14641466 Log1
14651467 end ,
14661468 {receive_snapshot , update_term (Term , State0 #{log => Log ,
1469+ snapshot_phase => ChunkFlag ,
14671470 leader_id => LeaderId }),
14681471 [{next_event , Rpc }, {record_leader_msg , LeaderId }]};
14691472handle_follower (# install_snapshot_rpc {term = Term ,
@@ -1538,7 +1541,7 @@ handle_receive_snapshot(#install_snapshot_rpc{term = Term,
15381541 cluster := ClusterIds ,
15391542 term := SnapTerm } = SnapMeta ,
15401543 chunk_state = {Num , ChunkFlag },
1541- data = ChunkOrEntries },
1544+ data = ChunkOrEntries } = Rpc ,
15421545 #{cfg := # cfg {id = Id ,
15431546 log_id = LogId ,
15441547 effective_machine_version = CurEffMacVer ,
@@ -1548,15 +1551,33 @@ handle_receive_snapshot(#install_snapshot_rpc{term = Term,
15481551 cluster := Cluster ,
15491552 current_term := CurTerm ,
15501553 last_applied := LastApplied ,
1551- machine_state := OldMacState } = State0 )
1554+ machine_state := OldMacState ,
1555+ snapshot_phase := SnapPhase } = State0 )
15521556 when Term >= CurTerm ->
1553- ? DEBUG (" ~ts : receiving snapshot chunk: ~b / ~w , index ~b , term ~b " ,
1554- [LogId , Num , ChunkFlag , SnapIndex , SnapTerm ]),
15551557 Reply = # install_snapshot_result {term = CurTerm ,
15561558 last_term = SnapTerm ,
15571559 last_index = SnapIndex },
15581560 case ChunkFlag of
1561+ init when SnapPhase == init ->
1562+ % % this is ok, just reply
1563+ {receive_snapshot , State0 , [{reply , Reply }]};
1564+ init ->
1565+ ? DEBUG (" ~ts : receiving snapshot saw unexpected init phase at snapshot"
1566+ " index term {~b , ~b }, current phase ~s restarting
1567+ snapshot receive process" ,
1568+ [LogId , SnapIndex , SnapTerm , SnapPhase ]),
1569+ % % the snapshot sending must have been interrupted and restarted
1570+ % % during the init or pre-phase
1571+ % % abort the snapshot, and revert to follower
1572+ SnapState0 = ra_log :snapshot_state (Log00 ),
1573+ SnapState = ra_snapshot :abort_accept (SnapState0 ),
1574+ Log = ra_log :set_snapshot_state (SnapState , Log00 ),
1575+ {follower , maps :remove (snapshot_phase , State0 #{log => Log }),
1576+ [{next_event , Rpc }]};
15591577 pre when is_list (ChunkOrEntries ) ->
1578+ [{_FstIdx , _ , _ } | _ ] = ChunkOrEntries ,
1579+ % ?DEBUG("~ts: receiving snapshot chunk pre first index ~b snap index ~b, term ~b",
1580+ % [LogId, FstIdx, SnapIndex, SnapTerm]),
15601581 % % reset last index to last applied
15611582 % % as we don't know for sure indexes after last applied
15621583 % % are of the right term
@@ -1566,15 +1587,21 @@ handle_receive_snapshot(#install_snapshot_rpc{term = Term,
15661587 {ok , L } = ra_log :write_sparse (E , LstIdx , L0 ),
15671588 {L , I }
15681589 end , {Log00 , LastIdx }, ChunkOrEntries ),
1569- State = update_term (Term , State0 #{log => Log }),
1590+ State = update_term (Term , State0 #{log => Log ,
1591+ snapshot_phase => pre }),
15701592 {receive_snapshot , State , [{reply , Reply }]};
15711593 next ->
1594+ ? DEBUG (" ~ts : receiving snapshot chunk: ~b / ~w , index ~b , term ~b " ,
1595+ [LogId , Num , ChunkFlag , SnapIndex , SnapTerm ]),
15721596 SnapState0 = ra_log :snapshot_state (Log00 ),
15731597 SnapState = ra_snapshot :accept_chunk (ChunkOrEntries , Num , SnapState0 ),
15741598 Log0 = ra_log :set_snapshot_state (SnapState , Log00 ),
1575- State = update_term (Term , State0 #{log => Log0 }),
1599+ State = update_term (Term , State0 #{log => Log0 ,
1600+ snapshot_phase => next }),
15761601 {receive_snapshot , State , [{reply , Reply }]};
15771602 last ->
1603+ ? DEBUG (" ~ts : receiving snapshot chunk: ~b / ~w , index ~b , term ~b " ,
1604+ [LogId , Num , ChunkFlag , SnapIndex , SnapTerm ]),
15781605 SnapState0 = ra_log :snapshot_state (Log00 ),
15791606 {SnapState , MacState , LiveIndexes , Effs0 } =
15801607 ra_snapshot :complete_accept (ChunkOrEntries , Num , Machine ,
@@ -1612,7 +1639,7 @@ handle_receive_snapshot(#install_snapshot_rpc{term = Term,
16121639 MacState ,
16131640 OldMeta ,
16141641 OldMacState ),
1615- State = update_term (Term ,
1642+ State1 = update_term (Term ,
16161643 State0 #{cfg => Cfg ,
16171644 log => Log ,
16181645 commit_index => SnapIndex ,
@@ -1624,6 +1651,7 @@ handle_receive_snapshot(#install_snapshot_rpc{term = Term,
16241651 membership =>
16251652 get_membership (ClusterIds , State0 ),
16261653 machine_state => MacState }),
1654+ State = maps :remove (snapshot_phase , State1 ),
16271655 put_counter (Cfg , ? C_RA_SVR_METRIC_LAST_APPLIED , SnapIndex ),
16281656 % % it was the last snapshot chunk so we can revert back to
16291657 % % follower status
@@ -1643,13 +1671,15 @@ handle_receive_snapshot(#append_entries_rpc{term = Term} = Msg,
16431671 SnapState0 = ra_log :snapshot_state (Log0 ),
16441672 SnapState = ra_snapshot :abort_accept (SnapState0 ),
16451673 Log = ra_log :set_snapshot_state (SnapState , Log0 ),
1646- {follower , update_term (Term , clear_leader_id (State #{log => Log })),
1674+ {follower , maps :remove (snapshot_phase ,
1675+ update_term (Term ,
1676+ clear_leader_id (State #{log => Log }))),
16471677 [{next_event , Msg }]};
16481678handle_receive_snapshot ({ra_log_event , Evt },
1649- #{cfg := # cfg {log_id = LogId },
1679+ #{cfg := # cfg {log_id = _LogId },
16501680 log := Log0 } = State ) ->
1651- ? DEBUG (" ~ts : ~s ra_log_event received: ~w " ,
1652- [LogId , ? FUNCTION_NAME , Evt ]),
1681+ % ?DEBUG("~ts: ~s ra_log_event received: ~w",
1682+ % [LogId, ?FUNCTION_NAME, Evt]),
16531683 % simply forward all other events to ra_log
16541684 % whilst the snapshot is being received
16551685 {Log , Effects } = ra_log :handle_event (Evt , Log0 ),
@@ -1662,7 +1692,7 @@ handle_receive_snapshot(receive_snapshot_timeout,
16621692 SnapState0 = ra_log :snapshot_state (Log0 ),
16631693 SnapState = ra_snapshot :abort_accept (SnapState0 ),
16641694 Log = ra_log :set_snapshot_state (SnapState , Log0 ),
1665- {follower , State #{log => Log }, []};
1695+ {follower , maps : remove ( snapshot_phase , State #{log => Log }) , []};
16661696handle_receive_snapshot (# info_rpc {term = Term } = Msg ,
16671697 #{current_term := CurTerm ,
16681698 cfg := # cfg {log_id = LogId },
@@ -1675,7 +1705,8 @@ handle_receive_snapshot(#info_rpc{term = Term} = Msg,
16751705 SnapState0 = ra_log :snapshot_state (Log0 ),
16761706 SnapState = ra_snapshot :abort_accept (SnapState0 ),
16771707 Log = ra_log :set_snapshot_state (SnapState , Log0 ),
1678- {follower , update_term (Term , clear_leader_id (State #{log => Log })),
1708+ {follower , maps :remove (snapshot_phase ,
1709+ update_term (Term , clear_leader_id (State #{log => Log }))),
16791710 [{next_event , Msg }]};
16801711handle_receive_snapshot (# info_rpc {} = InfoRpc , State ) ->
16811712 InfoReplyEffect = empty_info_reply_effect (State , InfoRpc ),
@@ -1692,7 +1723,8 @@ handle_receive_snapshot(#info_reply{term = Term} = Msg,
16921723 SnapState0 = ra_log :snapshot_state (Log0 ),
16931724 SnapState = ra_snapshot :abort_accept (SnapState0 ),
16941725 Log = ra_log :set_snapshot_state (SnapState , Log0 ),
1695- {follower , update_term (Term , clear_leader_id (State #{log => Log })),
1726+ {follower , maps :remove (snapshot_phase ,
1727+ update_term (Term , clear_leader_id (State #{log => Log }))),
16961728 [{next_event , Msg }]};
16971729handle_receive_snapshot (# info_reply {}, State ) ->
16981730 {receive_snapshot , State , []};
0 commit comments