Skip to content

Commit c00ab0d

Browse files
committed
flush dc_lsn before persisting shard metablk in on_commit of sealing shard
1 parent a64afa0 commit c00ab0d

File tree

4 files changed

+52
-24
lines changed

4 files changed

+52
-24
lines changed

conanfile.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010

1111
class HomeObjectConan(ConanFile):
1212
name = "homeobject"
13-
version = "3.0.19"
13+
version = "3.0.20"
1414

1515
homepage = "https://github.com/eBay/HomeObject"
1616
description = "Blob Store built on HomeStore"

src/lib/homestore_backend/gc_manager.cpp

Lines changed: 10 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -25,14 +25,14 @@ SISL_LOGGING_DECL(gcmgr)
2525
GCManager::GCManager(HSHomeObject* homeobject) :
2626
m_chunk_selector{homeobject->chunk_selector()}, m_hs_home_object{homeobject} {
2727
homestore::meta_service().register_handler(
28-
GCManager::_gc_actor_meta_name,
28+
_gc_actor_meta_name,
2929
[this](homestore::meta_blk* mblk, sisl::byte_view buf, size_t size) {
3030
on_gc_actor_meta_blk_found(std::move(buf), voidptr_cast(mblk));
3131
},
3232
nullptr, true);
3333

3434
homestore::meta_service().register_handler(
35-
GCManager::_gc_reserved_chunk_meta_name,
35+
_gc_reserved_chunk_meta_name,
3636
[this](homestore::meta_blk* mblk, sisl::byte_view buf, size_t size) {
3737
on_reserved_chunk_meta_blk_found(std::move(buf), voidptr_cast(mblk));
3838
},
@@ -44,7 +44,7 @@ GCManager::GCManager(HSHomeObject* homeobject) :
4444
true);
4545

4646
homestore::meta_service().register_handler(
47-
GCManager::_gc_task_meta_name,
47+
_gc_task_meta_name,
4848
[this](homestore::meta_blk* mblk, sisl::byte_view buf, size_t size) {
4949
on_gc_task_meta_blk_found(std::move(buf), voidptr_cast(mblk));
5050
},
@@ -63,8 +63,8 @@ void GCManager::on_gc_task_meta_blk_found(sisl::byte_view const& buf, void* meta
6363

6464
// here, we are under the protection of the lock of metaservice. however, we will also try to update pg and shard
6565
// metablk and then destroy the gc_task_sb, which will also try to acquire the lock of metaservice, as a result, a
66-
// dead lock will happen. so here we will handle all the gc tasks after read all the their metablks
67-
m_recovered_gc_tasks.emplace_back(GCManager::_gc_task_meta_name);
66+
// deadlock will happen. so here we will handle all the gc tasks after reading all the metablks
67+
m_recovered_gc_tasks.emplace_back(_gc_task_meta_name);
6868
m_recovered_gc_tasks.back().load(buf, meta_cookie);
6969
}
7070

@@ -89,7 +89,7 @@ void GCManager::handle_all_recovered_gc_tasks() {
8989
}
9090

9191
void GCManager::on_gc_actor_meta_blk_found(sisl::byte_view const& buf, void* meta_cookie) {
92-
m_gc_actor_sbs.emplace_back(GCManager::_gc_actor_meta_name);
92+
m_gc_actor_sbs.emplace_back(_gc_actor_meta_name);
9393
auto& gc_actor_sb = m_gc_actor_sbs.back();
9494
gc_actor_sb.load(buf, meta_cookie);
9595
auto pdev_id = gc_actor_sb->pdev_id;
@@ -100,8 +100,7 @@ void GCManager::on_gc_actor_meta_blk_found(sisl::byte_view const& buf, void* met
100100
}
101101

102102
void GCManager::on_reserved_chunk_meta_blk_found(sisl::byte_view const& buf, void* meta_cookie) {
103-
homestore::superblk< GCManager::gc_reserved_chunk_superblk > reserved_chunk_sb(
104-
GCManager::_gc_reserved_chunk_meta_name);
103+
homestore::superblk< gc_reserved_chunk_superblk > reserved_chunk_sb(_gc_reserved_chunk_meta_name);
105104
auto chunk_id = reserved_chunk_sb.load(buf, meta_cookie)->chunk_id;
106105
auto EXVchunk = m_chunk_selector->get_extend_vchunk(chunk_id);
107106
if (EXVchunk == nullptr) {
@@ -184,8 +183,7 @@ folly::SemiFuture< bool > GCManager::submit_gc_task(task_priority priority, chun
184183
}
185184

186185
std::shared_ptr< GCManager::pdev_gc_actor >
187-
GCManager::try_create_pdev_gc_actor(uint32_t pdev_id,
188-
const homestore::superblk< GCManager::gc_actor_superblk >& gc_actor_sb) {
186+
GCManager::try_create_pdev_gc_actor(uint32_t pdev_id, const homestore::superblk< gc_actor_superblk >& gc_actor_sb) {
189187
auto const [it, happened] = m_pdev_gc_actors.try_emplace(
190188
pdev_id, std::make_shared< pdev_gc_actor >(gc_actor_sb, m_chunk_selector, m_hs_home_object));
191189
RELEASE_ASSERT((it != m_pdev_gc_actors.end()), "Unexpected error in m_pdev_gc_actors!!!");
@@ -776,7 +774,7 @@ bool GCManager::pdev_gc_actor::copy_valid_data(
776774
#endif
777775
// TODO: involve ratelimiter in the following code, where read/write are scheduled. or do we need a central
778776
// ratelimiter shared by all components except client io?
779-
auto succeed_copying_shard =
777+
const auto succeed_copying_shard =
780778
// 1 write the shard header to move_to_chunk
781779
data_service.async_alloc_write(header_sgs, hints, out_blkids)
782780
.thenValue([this, &hints, &move_to_chunk, &move_from_chunk, &is_last_shard, &shard_id, &blk_size,
@@ -968,11 +966,9 @@ bool GCManager::pdev_gc_actor::copy_valid_data(
968966
move_from_chunk, move_to_chunk);
969967
return false;
970968
}
971-
972969
GCLOGD(task_id, pg_id, shard_id, "successfully copy blobs from move_from_chunk={} to move_to_chunk={}",
973970
move_from_chunk, move_to_chunk);
974971
}
975-
976972
GCLOGD(task_id, pg_id, NO_SHARD_ID, "all valid blobs are copied from move_from_chunk={} to move_to_chunk={}",
977973
move_from_chunk, move_to_chunk);
978974

@@ -1132,17 +1128,14 @@ bool GCManager::pdev_gc_actor::compare_blob_indexes(
11321128
GCLOGW(task_id, pg_id, shard_id, "copied blob: move_to_chunk={}, blob_id={}, pba={}", k.chunk, k.blob,
11331129
v.pbas().to_string());
11341130
}
1135-
11361131
GCLOGW(task_id, pg_id, NO_SHARD_ID, "start printing valid blobs from gc index table:");
11371132
for (const auto& [k, v] : valid_blob_indexes) {
11381133
const auto shard_id = k.key().shard;
11391134
GCLOGW(task_id, pg_id, shard_id, "valid blob: move_to_chunk={}, blob_id={}, pba={}", k.key().chunk,
11401135
k.key().blob, v.pbas().to_string());
11411136
}
1142-
11431137
RELEASE_ASSERT(false, "copied blobs are not the same as the valid blobs got from gc index table");
11441138
}
1145-
11461139
return ret;
11471140
}
11481141

@@ -1325,6 +1318,7 @@ bool GCManager::pdev_gc_actor::process_after_gc_metablk_persisted(
13251318
}
13261319
}
13271320

1321+
// now, all the blob indexes have been replaced successfully, we can destroy the gc task superblk
13281322
gc_task_sb.destroy();
13291323

13301324
const auto reclaimed_blk_count = m_chunk_selector->get_extend_vchunk(move_from_chunk)->get_used_blks() -

src/lib/homestore_backend/hs_homeobject.cpp

Lines changed: 18 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -251,6 +251,14 @@ void HSHomeObject::init_homestore() {
251251
zpad_bufs_[i] = std::move(sisl::io_blob_safe(uint32_cast(size), io_align));
252252
std::memset(zpad_bufs_[i].bytes(), 0, size);
253253
}
254+
255+
// when reaching here, all the logs have been replayed, we can start gc now.
256+
if (HS_BACKEND_DYNAMIC_CONFIG(enable_gc)) {
257+
LOGI("Starting GC manager");
258+
gc_mgr_->start();
259+
} else {
260+
LOGI("GC is disabled");
261+
}
254262
}
255263

256264
void HSHomeObject::on_replica_restart() {
@@ -349,14 +357,17 @@ void HSHomeObject::on_replica_restart() {
349357
homestore::meta_service().read_sub_sb(GCManager::_gc_reserved_chunk_meta_name);
350358
homestore::meta_service().read_sub_sb(GCManager::_gc_task_meta_name);
351359

352-
gc_mgr_->handle_all_recovered_gc_tasks();
360+
// when reaching here, log replay has not started yet. we need to handle all the recovered gc tasks before
361+
// replaying log. when log replay done, in ReplicationStateMachine::on_log_replay_done, we need
362+
// select_specific_chunk for all the chunks with open shard to mark the states of these chunks to inuse. if
363+
// crash happens after the shard metablk has been updated(the pchunk of this shard is changed to
364+
// move_to_chunk) but before reserved_chunk_superblk has been persisted (move_to_chunk is now still a reserved
365+
// chunk), when log replay is done and try to select_specific_chunk for the chunk with open shard, since the
366+
// state of move_to_chunk is reserved, and thus its state is GC and can not be selected, and will be stuck in
367+
// on_log_replay_done. after handling all the recovered gc tasks, move_to_chunk will be marked to inuse, and
368+
// thus can be selected in on_log_replay_done, and the log replay can be completed successfully.
353369

354-
if (HS_BACKEND_DYNAMIC_CONFIG(enable_gc)) {
355-
LOGI("Starting GC manager");
356-
gc_mgr_->start();
357-
} else {
358-
LOGI("GC is disabled");
359-
}
370+
gc_mgr_->handle_all_recovered_gc_tasks();
360371
});
361372
}
362373

src/lib/homestore_backend/hs_shard_manager.cpp

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -535,6 +535,29 @@ void HSHomeObject::on_shard_message_commit(int64_t lsn, sisl::blob const& h, hom
535535
RELEASE_ASSERT(v_chunkID.has_value(), "v_chunk id not found");
536536
bool res = chunk_selector()->release_chunk(pg_id, v_chunkID.value());
537537
RELEASE_ASSERT(res, "Failed to release v_chunk_id={}, pg={}", v_chunkID.value(), pg_id);
538+
539+
// there is a corner case:
540+
// let's say cp_lsn and dc_lsn are 10, lsn 11 is put_blob (blob -> pba-chunk-1), and lsn 12 is
541+
// seal_shard(shard-1 , chunk-1).
542+
543+
// 1 before crash, lsn 11 and lsn 12 are both committed. as a result , we have a blob -> pba-chunk-1 in the
544+
// wbcache of indextable and a persisted superblk of shard-1 with a state sealed.
545+
546+
// 2 crash happens. after restart, blob -> pba-chunk-1 is lost since it only exists in wbcache and was not
547+
// flushed to disk. but shard-1 has a state of sealed since shard superblk is persisted before crash. now,
548+
// since no open shard in chunk-1, chunk-1 is selected for gc and all the blobs of shard-1 are moved to
549+
// chunk-2 , and chunk-1 becomes a reserved chunk.
550+
551+
// 3 since dc_lsn is 10, after log replay, we start committing lsn 11. since blob -> pba-chunk-1 does not
552+
// exist in pg-index-table, on_blob_put_commit will insert a new item blob -> pba-chunk-1 to pg-index-table.
553+
// this is where issue happens. blob belong to shard-1, which has already been moved to chunk-2. but
554+
// on_blob_put_commit adds blob to indextable with a stale pba belongs to chunk-1 , which is now a reserved
555+
// chunk and will be purged later.
556+
557+
// the solution is before persisting the shard state change to SEALED, we persist dc_lsn of this
558+
// repl_dev, so that we can make sure all the logs before this sealing shard will be committed in log
559+
// replay before gc starts.
560+
repl_dev->flush_durable_commit_lsn();
538561
update_shard_in_map(shard_info);
539562
} else
540563
SLOGW(tid, shard_info.id, "try to commit SEAL_SHARD_MSG but shard state is not sealed.");

0 commit comments

Comments
 (0)