@@ -215,7 +215,7 @@ func (sc *SplitController) currentPhase() *model.SplitPhase {
215215}
216216
217217// updatePhase atomically updates the split phase on both parent and children.
218- func (sc * SplitController ) updatePhase (newPhase model.SplitPhase ) {
218+ func (sc * SplitController ) updatePhase (newPhase model.SplitPhase ) error {
219219 status := sc .statusResource .Load ()
220220 cloned := status .Clone ()
221221
@@ -235,7 +235,7 @@ func (sc *SplitController) updatePhase(newPhase model.SplitPhase) {
235235 }
236236 }
237237
238- sc .statusResource .Update (cloned )
238+ return sc .statusResource .Update (cloned )
239239}
240240
241241// runBootstrap validates preconditions, fences child ensemble members, elects
@@ -285,13 +285,14 @@ func (sc *SplitController) runBootstrap() error {
285285 childLeaders [childId ] = childMeta .Leader .Internal
286286 }
287287 }
288- sc .updateParentMeta (func (meta * model.ShardMetadata ) {
288+ if err := sc .updateParentMeta (func (meta * model.ShardMetadata ) {
289289 meta .Split .ParentTermAtBootstrap = parentTerm
290290 meta .Split .ChildLeadersAtBootstrap = childLeaders
291- })
291+ }); err != nil {
292+ return err
293+ }
292294
293- sc .updatePhase (model .SplitPhaseCatchUp )
294- return nil
295+ return sc .updatePhase (model .SplitPhaseCatchUp )
295296}
296297
297298// fenceAndElectChild fences a child shard's ensemble and elects a leader.
@@ -318,11 +319,13 @@ func (sc *SplitController) fenceAndElectChild(childId int64) error {
318319
319320 childLeader := sc .pickLeader (headEntries )
320321
321- sc .updateChildMeta (childId , func (meta * model.ShardMetadata ) {
322+ if err := sc .updateChildMeta (childId , func (meta * model.ShardMetadata ) {
322323 meta .Term = childTerm
323324 meta .Leader = & childLeader
324325 meta .Status = model .ShardStatusSteadyState
325- })
326+ }); err != nil {
327+ return err
328+ }
326329
327330 // Elect the child leader so it replicates to its followers immediately.
328331 // Without this, only the single child leader node has the data.
@@ -421,8 +424,7 @@ func (sc *SplitController) runCatchUp() error {
421424 }
422425 if caughtUp {
423426 sc .log .Info ("All children caught up" )
424- sc .updatePhase (model .SplitPhaseCutover )
425- return nil
427+ return sc .updatePhase (model .SplitPhaseCutover )
426428 }
427429 }
428430}
@@ -443,14 +445,18 @@ func (sc *SplitController) checkObserverCursorsStale() (bool, error) {
443445 slog .Int64 ("bootstrap-term" , parentMeta .Split .ParentTermAtBootstrap ),
444446 slog .Int64 ("current-term" , parentMeta .Term ),
445447 )
446- sc .updatePhase (model .SplitPhaseBootstrap )
448+ if err := sc .updatePhase (model .SplitPhaseBootstrap ); err != nil {
449+ return false , err
450+ }
447451 return true , nil
448452 }
449453
450454 // Child leader election: the observer cursor targets the old (dead) leader.
451455 // Remove the stale cursor and fall back to Bootstrap to re-add.
452456 if sc .removeStaleChildObservers (parentMeta ) {
453- sc .updatePhase (model .SplitPhaseBootstrap )
457+ if err := sc .updatePhase (model .SplitPhaseBootstrap ); err != nil {
458+ return false , err
459+ }
454460 return true , nil
455461 }
456462
@@ -562,11 +568,13 @@ func (sc *SplitController) runCutover() error {
562568 )
563569
564570 // Update parent term in metadata
565- sc .updateParentMeta (func (meta * model.ShardMetadata ) {
571+ if err := sc .updateParentMeta (func (meta * model.ShardMetadata ) {
566572 meta .Term = newParentTerm
567573 meta .Leader = nil
568574 meta .Status = model .ShardStatusElection
569- })
575+ }); err != nil {
576+ return err
577+ }
570578
571579 // Step 2: Wait for children to commit parentFinalOffset.
572580 // Children were already elected leader in Bootstrap, so commitOffset
@@ -587,14 +595,18 @@ func (sc *SplitController) runCutover() error {
587595 // Step 4: Clear split metadata from children and mark parent for deletion.
588596 // Children are now independent shards.
589597 for _ , childId := range []int64 {sc .leftChildId , sc .rightChildId } {
590- sc .updateChildMeta (childId , func (meta * model.ShardMetadata ) {
598+ if err := sc .updateChildMeta (childId , func (meta * model.ShardMetadata ) {
591599 meta .Split = nil
592- })
600+ }); err != nil {
601+ return err
602+ }
593603 }
594604
595- sc .updateParentMeta (func (meta * model.ShardMetadata ) {
605+ if err := sc .updateParentMeta (func (meta * model.ShardMetadata ) {
596606 meta .Status = model .ShardStatusDeleting
597- })
607+ }); err != nil {
608+ return err
609+ }
598610
599611 // Step 5: Notify the coordinator. This triggers the parent shard
600612 // controller's DeleteShard (which retries indefinitely with backoff)
@@ -603,9 +615,11 @@ func (sc *SplitController) runCutover() error {
603615
604616 // Clear split metadata from parent — the split controller's job is done.
605617 // The parent shard controller handles the actual deletion.
606- sc .updateParentMeta (func (meta * model.ShardMetadata ) {
618+ if err := sc .updateParentMeta (func (meta * model.ShardMetadata ) {
607619 meta .Split = nil
608- })
620+ }); err != nil {
621+ return err
622+ }
609623
610624 return nil
611625}
@@ -650,13 +664,19 @@ func (sc *SplitController) abort() {
650664
651665 // Delete child shards from status.
652666 for _ , childId := range []int64 {sc .leftChildId , sc .rightChildId } {
653- sc .statusResource .DeleteShardMetadata (sc .namespace , childId )
667+ if err := sc .statusResource .DeleteShardMetadata (sc .namespace , childId ); err != nil {
668+ sc .log .Warn ("Failed to delete child shard metadata during abort" ,
669+ slog .Int64 ("child-shard" , childId ),
670+ slog .Any ("error" , err ))
671+ }
654672 }
655673
656674 // Clear parent split metadata.
657- sc .updateParentMeta (func (meta * model.ShardMetadata ) {
675+ if err := sc .updateParentMeta (func (meta * model.ShardMetadata ) {
658676 meta .Split = nil
659- })
677+ }); err != nil {
678+ sc .log .Warn ("Failed to clear parent split metadata during abort" , slog .Any ("error" , err ))
679+ }
660680
661681 sc .log .Info ("Split aborted, parent restored" )
662682
@@ -684,23 +704,24 @@ func (sc *SplitController) loadShardMeta(shardId int64) *model.ShardMetadata {
684704 return & cloned
685705}
686706
// updateParentMeta applies fn to the parent shard's metadata by delegating to
// updateShardMeta with sc.parentShardId.
//
// In this hunk the removed (-) version has no return value; the added (+)
// version returns whatever error updateShardMeta reports.
687- func (sc * SplitController ) updateParentMeta (fn func (meta * model.ShardMetadata )) {
688- sc .updateShardMeta (sc .parentShardId , fn )
707+ func (sc * SplitController ) updateParentMeta (fn func (meta * model.ShardMetadata )) error {
708+ return sc .updateShardMeta (sc .parentShardId , fn )
689709}
690710
// updateChildMeta applies fn to the metadata of the child shard identified by
// childId by delegating to updateShardMeta.
//
// In this hunk the removed (-) version has no return value; the added (+)
// version returns whatever error updateShardMeta reports.
691- func (sc * SplitController ) updateChildMeta (childId int64 , fn func (meta * model.ShardMetadata )) {
692- sc .updateShardMeta (childId , fn )
711+ func (sc * SplitController ) updateChildMeta (childId int64 , fn func (meta * model.ShardMetadata )) error {
712+ return sc .updateShardMeta (childId , fn )
693713}
694714
// updateShardMeta loads the current status, clones it, applies fn to the
// metadata of shardId within sc.namespace, and submits the modified clone via
// sc.statusResource.Update.
//
// In this hunk the removed (-) version has no return value and ignores the
// result of Update; the added (+) version returns the result of Update, or
// nil when shardId is not present in the namespace.
695- func (sc * SplitController ) updateShardMeta (shardId int64 , fn func (meta * model.ShardMetadata )) {
715+ func (sc * SplitController ) updateShardMeta (shardId int64 , fn func (meta * model.ShardMetadata )) error {
696716 status := sc .statusResource .Load ()
// Mutations are made on a clone; the loaded status itself is not modified here.
697717 cloned := status .Clone ()
698718 ns := cloned .Namespaces [sc .namespace ]
699719 if meta , exists := ns .Shards [shardId ]; exists {
// meta is a copy of the map entry, so it is stored back after fn mutates it.
700720 fn (& meta )
701721 ns .Shards [shardId ] = meta
702- sc .statusResource .Update (cloned )
722+ return sc .statusResource .Update (cloned )
703723 }
// NOTE(review): a shard missing from the cloned status yields nil and fn is
// never invoked — confirm callers do not need to distinguish this from a
// successful update.
724+ return nil
704725}
705726
706727// fenceEnsemble sends NewTerm to all ensemble members and returns the
@@ -860,11 +881,13 @@ func (sc *SplitController) reelectChild(childId int64) error {
860881 }
861882
862883 // Update child metadata
863- sc .updateChildMeta (childId , func (meta * model.ShardMetadata ) {
884+ if err := sc .updateChildMeta (childId , func (meta * model.ShardMetadata ) {
864885 meta .Term = newTerm
865886 meta .Leader = & newLeader
866887 meta .Status = model .ShardStatusSteadyState
867- })
888+ }); err != nil {
889+ return err
890+ }
868891
869892 sc .log .Info ("Child re-elected in clean term" ,
870893 slog .Int64 ("child-shard" , childId ),
0 commit comments