Commit f1d478b

support triggering gc for a specific pg

1 parent 596f40e commit f1d478b

3 files changed: +177 −120 lines changed


src/lib/homestore_backend/gc_manager.cpp

Lines changed: 14 additions & 1 deletion
@@ -176,7 +176,20 @@ void GCManager::stop() {
 }
 
 folly::SemiFuture< bool > GCManager::submit_gc_task(task_priority priority, chunk_id_t chunk_id) {
-    auto pdev_id = m_chunk_selector->get_extend_vchunk(chunk_id)->get_pdev_id();
+    auto ex_vchunk = m_chunk_selector->get_extend_vchunk(chunk_id);
+    if (ex_vchunk == nullptr) {
+        LOGERRORMOD(gcmgr, "chunk {} not found when submitting gc task!", chunk_id);
+        return folly::makeFuture< bool >(false);
+    }
+
+    // if the chunk has no garbage to be reclaimed, we don't need to gc it; return true directly
+    const auto defrag_blk_num = ex_vchunk->get_defrag_nblks();
+    if (!defrag_blk_num) {
+        LOGERRORMOD(gcmgr, "chunk {} has no garbage to be reclaimed, skip gc for this chunk!", chunk_id);
+        return folly::makeFuture< bool >(true);
+    }
+
+    auto pdev_id = ex_vchunk->get_pdev_id();
     auto it = m_pdev_gc_actors.find(pdev_id);
     if (it == m_pdev_gc_actors.end()) {
         LOGINFOMOD(gcmgr, "pdev gc actor not found for pdev_id={}, chunk={}", pdev_id, chunk_id);
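
The guards above give submit_gc_task() three outcomes: false when the chunk (or, below, its pdev gc actor) is unknown, true immediately when the chunk has no garbage, and otherwise the result of the actual GC run. A minimal caller sketch (hypothetical helper; GCManager, chunk_id_t, and the logging macro come from the surrounding codebase, and the executor choice mirrors the handler in hs_http_manager.cpp below):

    #include <folly/executors/InlineExecutor.h>
    #include <folly/futures/Future.h>

    void submit_and_log(GCManager* gc_mgr, chunk_id_t chunk_id) {
        gc_mgr->submit_gc_task(task_priority::normal, chunk_id)
            .via(&folly::InlineExecutor::instance())
            .thenValue([chunk_id](bool ok) {
                // ok == true also covers the "nothing to reclaim" early return.
                LOGINFOMOD(gcmgr, "gc task for chunk {} finished, ok={}", chunk_id, ok);
            });
    }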

src/lib/homestore_backend/hs_http_manager.cpp

Lines changed: 154 additions & 112 deletions
@@ -49,6 +49,12 @@ HttpManager::HttpManager(HSHomeObject& ho) : ho_(ho) {
          Pistache::Rest::Routes::bind(&HttpManager::dump_chunk, this)},
      {Pistache::Http::Method::Get, "/api/v1/shard/dump",
          Pistache::Rest::Routes::bind(&HttpManager::dump_shard, this)},
+
+     // we support triggering gc for:
+     // 1. all the chunks in all the pgs: no input param
+     // 2. all the chunks in a specific pg: input param is pg_id
+     // 3. a specific chunk: input param is pchunk_id
+
      {Pistache::Http::Method::Post, "/api/v1/trigger_gc",
          Pistache::Rest::Routes::bind(&HttpManager::trigger_gc, this)},
      {Pistache::Http::Method::Get, "/api/v1/gc_job_status",
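
For reference, the three modes map onto requests like the following (the pg and chunk ids are illustrative; the job_id query parameter is the one consumed by get_gc_job_status below):

    POST /api/v1/trigger_gc                  -> gc every chunk in every pg
    POST /api/v1/trigger_gc?pg_id=1          -> gc all chunks of pg 1
    POST /api/v1/trigger_gc?chunk_id=42      -> gc a single chunk
    GET  /api/v1/gc_job_status?job_id=...    -> poll the returned job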
@@ -244,8 +250,6 @@ void HttpManager::dump_shard(const Pistache::Rest::Request& request, Pistache::H
 }
 
 void HttpManager::trigger_gc(const Pistache::Rest::Request& request, Pistache::Http::ResponseWriter response) {
-    const auto chunk_id_param = request.query().get("chunk_id");
-
     auto gc_mgr = ho_.gc_manager();
     if (!gc_mgr) {
         response.send(Pistache::Http::Code::Internal_Server_Error, "GC manager not available");
@@ -258,101 +262,13 @@ void HttpManager::trigger_gc(const Pistache::Rest::Request& request, Pistache::H
         return;
     }
 
-    std::string job_id = generate_job_id();
-    nlohmann::json result;
-
-    // trigger gc for all chunks
-    if (!chunk_id_param || chunk_id_param.value().empty()) {
-        LOGINFO("Received trigger_gc request for all chunks, job_id={}", job_id);
-
-        auto job_info = std::make_shared< GCJobInfo >(job_id);
-        {
-            std::lock_guard< std::mutex > lock(gc_job_mutex_);
-            gc_jobs_map_.set(job_id, job_info);
-        }
-
-        result["job_id"] = job_id;
-        result["message"] = "GC triggered for all eligible chunks, pls query job status using gc_job_status API";
-        response.send(Pistache::Http::Code::Accepted, result.dump());
-
-        LOGINFO("GC job {} stopping GC scan timer", job_id);
-        gc_mgr->stop_gc_scan_timer();
-
-        std::vector< pg_id_t > pg_ids;
-        ho_.get_pg_ids(pg_ids);
-        LOGINFO("GC job {} will process {} PGs", job_id, pg_ids.size());
-
-        std::vector< folly::SemiFuture< bool > > gc_task_futures;
-
-        for (const auto& pg_id : pg_ids) {
-            auto hs_pg = const_cast< HSHomeObject::HS_PG* >(ho_.get_hs_pg(pg_id));
-            RELEASE_ASSERT(hs_pg, "HS PG {} not found during GC job {}", pg_id, job_id);
-
-            LOGINFO("GC job {} draining pending GC tasks for PG {}", job_id, pg_id);
-            gc_mgr->drain_pg_pending_gc_task(pg_id);
-
-            auto pg_sb = hs_pg->pg_sb_.get();
-            std::vector< homestore::chunk_num_t > pg_chunks(pg_sb->get_chunk_ids(),
-                                                            pg_sb->get_chunk_ids() + pg_sb->num_chunks);
-
-            LOGINFO("GC job {} processing PG {} with {} chunks", job_id, pg_id, pg_chunks.size());
-
-            // Resume accepting new requests for this pg
-            hs_pg->repl_dev_->quiesce_reqs();
-
-            for (const auto& chunk_id : pg_chunks) {
-                job_info->total_chunks++;
-                // Determine priority based on chunk state (INUSE means has open shard)
-                auto chunk = chunk_selector->get_extend_vchunk(chunk_id);
-                RELEASE_ASSERT(chunk, "Chunk {} not found during GC job {}", chunk_id, job_id);
-                auto priority = chunk->m_state == ChunkState::INUSE ? task_priority::emergent : task_priority::normal;
-
-                // Clear in-memory requests only for emergent priority chunks (chunks with open shards)
-                if (priority == task_priority::emergent) { hs_pg->repl_dev_->clear_chunk_req(chunk_id); }
-
-                // Submit GC task for this chunk
-                auto future = gc_mgr->submit_gc_task(priority, chunk_id);
-                gc_task_futures.push_back(std::move(future));
-                LOGDEBUG("GC job {} for chunk {} in PG {} with priority={}", job_id, chunk_id, pg_id,
-                         (priority == task_priority::emergent) ? "emergent" : "normal");
-            }
-        }
-
-        folly::collectAllUnsafe(gc_task_futures)
-            .thenValue([job_info](auto&& results) {
-                for (auto const& ok : results) {
-                    RELEASE_ASSERT(ok.hasValue(), "we never throw any exception when copying data");
-                    if (ok.value()) {
-                        job_info->success_count++;
-                    } else {
-                        job_info->failed_count++;
-                    }
-                }
-            })
-            .thenValue([this, pg_ids, job_info, gc_mgr](auto&& rets) {
-                LOGINFO("All GC tasks have been processed");
-                const auto& job_id = job_info->job_id;
-                for (const auto& pg_id : pg_ids) {
-                    auto hs_pg = const_cast< HSHomeObject::HS_PG* >(ho_.get_hs_pg(pg_id));
-                    RELEASE_ASSERT(hs_pg, "HS PG {} not found during GC job {}", pg_id, job_id);
-                    // Resume accepting new requests for this pg
-                    hs_pg->repl_dev_->resume_accepting_reqs();
-                    LOGINFO("GC job {} resumed accepting requests for PG {}", job_id, pg_id);
-                }
-
-                job_info->result = (job_info->failed_count == 0);
-                job_info->status = job_info->result.value() ? GCJobStatus::COMPLETED : GCJobStatus::FAILED;
-                LOGINFO("GC job {} completed: total={}, success={}, failed={}", job_id, job_info->total_chunks,
-                        job_info->success_count, job_info->failed_count);
+    const auto chunk_id_param = request.query().get("chunk_id");
+    const auto pg_id_param = request.query().get("pg_id");
 
-                // Restart the GC scan timer
-                LOGINFO("GC job {} restarting GC scan timer", job_id);
-                gc_mgr->start_gc_scan_timer();
-            });
-    } else {
-        // trigger gc for specific chunk
+    if (chunk_id_param && !chunk_id_param.value().empty()) {
+        // trigger gc for a specific chunk
         uint32_t chunk_id = std::stoul(chunk_id_param.value());
-        LOGINFO("Received trigger_gc request for chunk_id={}, job_id={}", chunk_id, job_id);
+        LOGINFO("Received trigger_gc request for chunk_id {}", chunk_id);
 
         auto chunk = chunk_selector->get_extend_vchunk(chunk_id);
         if (!chunk) {
@@ -372,28 +288,30 @@ void HttpManager::trigger_gc(const Pistache::Rest::Request& request, Pistache::H
         }
 
         const auto pg_id = chunk->m_pg_id.value();
-        auto pdev_id = chunk->get_pdev_id();
+        nlohmann::json result;
+        const auto job_id = generate_job_id();
 
         result["chunk_id"] = chunk_id;
-        result["pdev_id"] = pdev_id;
         result["pg_id"] = pg_id;
         result["job_id"] = job_id;
 
         if (chunk->m_state == ChunkState::GC) {
-            result["message"] = "chunk is already under GC now";
-            response.send(Pistache::Http::Code::Accepted, result.dump());
+            result["message"] = "chunk is already under GC now, this task will not be executed!";
+            response.send(Pistache::Http::Code::Ok, result.dump());
             return;
         }
+        result["message"] = "GC triggered for chunk, please query job status using gc_job_status API";
 
-        // Check for active job and create new job atomically under the same lock
-        auto job_info = std::make_shared< GCJobInfo >(job_id, chunk_id, pdev_id);
+        // return response before starting the GC so that we don't block the client.
+        response.send(Pistache::Http::Code::Accepted, result.dump());
+
+        auto job_info = std::make_shared< GCJobInfo >(job_id, pg_id, chunk_id);
         {
-            std::lock_guard< std::mutex > lock(gc_job_mutex_);
+            std::lock_guard lock(gc_job_mutex_);
             gc_jobs_map_.set(job_id, job_info);
         }
 
-        result["message"] = "GC triggered for chunk, pls query job status using gc_job_status API";
-        response.send(Pistache::Http::Code::Accepted, result.dump());
+        // submit gc task for this chunk
 
         // Clear in-memory requests only for emergent priority chunks (chunks with open shards)
         auto hs_pg = const_cast< HSHomeObject::HS_PG* >(ho_.get_hs_pg(pg_id));
@@ -406,11 +324,73 @@ void HttpManager::trigger_gc(const Pistache::Rest::Request& request, Pistache::H
         gc_mgr->submit_gc_task(priority, chunk_id)
             .via(&folly::InlineExecutor::instance())
             .thenValue([this, job_info, repl_dev](bool res) {
-                job_info->result = res;
                 job_info->status = res ? GCJobStatus::COMPLETED : GCJobStatus::FAILED;
                 // Resume accepting new requests for this pg
                 repl_dev->resume_accepting_reqs();
             });
+    } else if (pg_id_param && !pg_id_param.value().empty()) {
+        // trigger gc for all chunks in a specific pg
+        const auto pg_id = std::stoul(pg_id_param.value());
+        LOGINFO("Received trigger_gc request for pg_id {}", pg_id);
+        auto hs_pg = const_cast< HSHomeObject::HS_PG* >(ho_.get_hs_pg(pg_id));
+        if (!hs_pg) {
+            nlohmann::json error;
+            error["pg_id"] = pg_id;
+            error["error"] = "PG not found";
+            response.send(Pistache::Http::Code::Not_Found, error.dump());
+            return;
+        }
+
+        nlohmann::json result;
+        const auto job_id = generate_job_id();
+        result["pg_id"] = pg_id;
+        result["job_id"] = job_id;
+        result["message"] = "GC triggered for a single pg, please query job status using gc_job_status API";
+        // return response before starting the GC so that we don't block the client.
+        response.send(Pistache::Http::Code::Accepted, result.dump());
+
+        auto job_info = std::make_shared< GCJobInfo >(job_id, pg_id);
+        {
+            std::lock_guard lock(gc_job_mutex_);
+            gc_jobs_map_.set(job_id, job_info);
+        }
+
+        LOGINFO("GC job {} stopping GC scan timer", job_id);
+        gc_mgr->stop_gc_scan_timer();
+
+        // we block here until all gc tasks for the pg are done
+        trigger_gc_for_pg(pg_id, job_id);
+
+        LOGINFO("GC job {} restarting GC scan timer", job_id);
+        gc_mgr->start_gc_scan_timer();
+    } else {
+        LOGINFO("Received trigger_gc request for all chunks");
+        nlohmann::json result;
+        const auto job_id = generate_job_id();
+        result["job_id"] = job_id;
+        result["message"] = "GC triggered for all chunks, please query job status using gc_job_status API";
+        // return response before starting the GC so that we don't block the client.
+        response.send(Pistache::Http::Code::Accepted, result.dump());
+
+        auto job_info = std::make_shared< GCJobInfo >(job_id);
+        {
+            std::lock_guard lock(gc_job_mutex_);
+            gc_jobs_map_.set(job_id, job_info);
+        }
+
+        std::vector< pg_id_t > pg_ids;
+        ho_.get_pg_ids(pg_ids);
+        LOGINFO("GC job {} will process {} PGs", job_id, pg_ids.size());
+        LOGINFO("GC job {} stopping GC scan timer", job_id);
+        gc_mgr->stop_gc_scan_timer();
+
+        // we block here until all gc tasks for all pgs are done
+        for (const auto& pg_id : pg_ids) {
+            trigger_gc_for_pg(pg_id, job_id);
+        }
+
+        LOGINFO("GC job {} restarting GC scan timer", job_id);
+        gc_mgr->start_gc_scan_timer();
     }
 }
 
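One caveat in the branches above: std::stoul() throws std::invalid_argument or std::out_of_range when the query value is not a valid number, so a request like trigger_gc?pg_id=abc would raise an exception inside the handler rather than produce a 4xx response. A non-throwing parse is sketched below (hypothetical helper, not part of this commit):

    #include <charconv>
    #include <optional>
    #include <string>

    // Parse a numeric query parameter without throwing, so malformed input
    // can be answered with Bad_Request instead of an uncaught exception.
    static std::optional< uint32_t > parse_u32(const std::string& s) {
        uint32_t v{};
        auto [ptr, ec] = std::from_chars(s.data(), s.data() + s.size(), v);
        if (ec != std::errc{} || ptr != s.data() + s.size()) return std::nullopt;
        return v;
    }
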
@@ -429,7 +409,7 @@ void HttpManager::get_gc_job_status(const Pistache::Rest::Request& request, Pist
     std::string job_id = job_id_param.value();
     std::shared_ptr< GCJobInfo > job_info;
     {
-        std::lock_guard< std::mutex > lock(gc_job_mutex_);
+        std::shared_lock lock(gc_job_mutex_);
         job_info = gc_jobs_map_.get(job_id);
     }
 
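The switch from std::lock_guard< std::mutex > to std::shared_lock implies that gc_job_mutex_ becomes a reader/writer mutex in the header (the third changed file, not shown in this diff). The resulting pattern, as a standalone sketch assuming std::shared_mutex:

    #include <mutex>
    #include <shared_mutex>

    std::shared_mutex gc_job_mutex_;

    void writer_path() {
        std::lock_guard lock(gc_job_mutex_);   // exclusive: registering a new job
        // gc_jobs_map_.set(job_id, job_info);
    }

    void reader_path() {
        std::shared_lock lock(gc_job_mutex_);  // shared: concurrent status lookups
        // job_info = gc_jobs_map_.get(job_id);
    }
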
@@ -454,10 +434,7 @@ void HttpManager::get_gc_job_status(const Pistache::Rest::Request& request, Pist
             break;
     }
 
-    if (job_info->chunk_id.has_value()) {
-        result["chunk_id"] = job_info->chunk_id.value();
-        if (job_info->pdev_id.has_value()) { result["pdev_id"] = job_info->pdev_id.value(); }
-    }
+    if (job_info->chunk_id.has_value()) { result["chunk_id"] = job_info->chunk_id.value(); }
 
     if (job_info->total_chunks > 0) {
         nlohmann::json stats;
@@ -467,11 +444,76 @@ void HttpManager::get_gc_job_status(const Pistache::Rest::Request& request, Pist
         result["statistics"] = stats;
     }
 
-    if (job_info->result.has_value()) { result["result"] = job_info->result.value(); }
-
     response.send(Pistache::Http::Code::Ok, result.dump());
 }
 
+void HttpManager::trigger_gc_for_pg(uint16_t pg_id, const std::string& job_id) {
+    auto hs_pg = const_cast< HSHomeObject::HS_PG* >(ho_.get_hs_pg(pg_id));
+    RELEASE_ASSERT(hs_pg, "HS PG {} not found during GC job {}", pg_id, job_id);
+
+    LOGINFO("GC job {} draining pending GC tasks for PG {}", job_id, pg_id);
+    auto gc_mgr = ho_.gc_manager();
+    gc_mgr->drain_pg_pending_gc_task(pg_id);
+    auto pg_sb = hs_pg->pg_sb_.get();
+    std::vector< homestore::chunk_num_t > pg_chunks(pg_sb->get_chunk_ids(), pg_sb->get_chunk_ids() + pg_sb->num_chunks);
+
+    LOGINFO("GC job {} processing PG {} with {} chunks", job_id, pg_id, pg_chunks.size());
+    hs_pg->repl_dev_->quiesce_reqs();
+    std::vector< folly::SemiFuture< bool > > gc_task_futures;
+
+    std::shared_ptr< GCJobInfo > job_info;
+    {
+        std::shared_lock lock(gc_job_mutex_);
+        job_info = gc_jobs_map_.get(job_id);
+    }
+
+    auto chunk_selector = ho_.chunk_selector();
+
+    for (const auto& chunk_id : pg_chunks) {
+        job_info->total_chunks++;
+        // Determine priority based on chunk state (INUSE means has open shard)
+        auto chunk = chunk_selector->get_extend_vchunk(chunk_id);
+        RELEASE_ASSERT(chunk, "Chunk {} not found during GC job {}", chunk_id, job_id);
+        auto priority = chunk->m_state == ChunkState::INUSE ? task_priority::emergent : task_priority::normal;
+
+        // Clear in-memory requests only for emergent priority chunks (chunks with open shards)
+        if (priority == task_priority::emergent) { hs_pg->repl_dev_->clear_chunk_req(chunk_id); }
+
+        // Submit GC task for this chunk
+        auto future = gc_mgr->submit_gc_task(priority, chunk_id);
+        gc_task_futures.push_back(std::move(future));
+        LOGDEBUG("GC job {} for chunk {} in PG {} with priority={}", job_id, chunk_id, pg_id,
+                 (priority == task_priority::emergent) ? "emergent" : "normal");
+    }
+
+    folly::collectAllUnsafe(gc_task_futures)
+        .thenValue([job_info](auto&& results) {
+            for (auto const& ok : results) {
+                RELEASE_ASSERT(ok.hasValue(), "we never throw any exception when copying data");
+                if (ok.value()) {
+                    job_info->success_count++;
+                } else {
+                    job_info->failed_count++;
+                }
+            }
+        })
+        .thenValue([this, pg_id, job_info, gc_mgr](auto&& rets) {
+            LOGINFO("All GC tasks have been processed");
+            const auto& job_id = job_info->job_id;
+
+            auto hs_pg = const_cast< HSHomeObject::HS_PG* >(ho_.get_hs_pg(pg_id));
+            RELEASE_ASSERT(hs_pg, "HS PG {} not found during GC job {}", pg_id, job_id);
+            // Resume accepting new requests for this pg
+            hs_pg->repl_dev_->resume_accepting_reqs();
+            LOGINFO("GC job {} resumed accepting requests for PG {}", job_id, pg_id);
+
+            job_info->status = job_info->failed_count ? GCJobStatus::FAILED : GCJobStatus::COMPLETED;
+            LOGINFO("GC job {} completed: total={}, success={}, failed={}", job_id, job_info->total_chunks,
+                    job_info->success_count, job_info->failed_count);
+        })
+        .get();
+}
+
 #ifdef _PRERELEASE
 void HttpManager::crash_system(const Pistache::Rest::Request& request, Pistache::Http::ResponseWriter response) {
     std::string crash_type;

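Note that trigger_gc_for_pg() ends with .get(), so the HTTP worker thread that accepted the request blocks until every chunk in the PG has been collected; the 202 Accepted response is sent before this wait begins. The blocking wait boils down to the following folly pattern (standalone sketch; the helper name is illustrative):

    #include <folly/futures/Future.h>
    #include <vector>

    bool wait_for_all(std::vector< folly::SemiFuture< bool > > futs) {
        // collectAll fulfills once every future is done; get() blocks the
        // calling thread, which is why the handler only returns to the
        // Pistache worker pool after the whole job has finished.
        auto results = folly::collectAll(std::move(futs)).get();
        bool all_ok = true;
        for (auto& r : results) {
            all_ok = all_ok && r.hasValue() && r.value();
        }
        return all_ok;
    }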