@@ -132,10 +132,11 @@ void GCManager::stop() {
132132
133133folly::SemiFuture< bool > GCManager::submit_gc_task (task_priority priority, chunk_id_t chunk_id) {
134134 if (!is_started ()) return folly::makeFuture< bool >(false );
135+
135136 auto pdev_id = m_chunk_selector->get_extend_vchunk (chunk_id)->get_pdev_id ();
136137 auto it = m_pdev_gc_actors.find (pdev_id);
137138 if (it == m_pdev_gc_actors.end ()) {
138- LOGINFO (" pdev gc actor not found for pdev_id: {}" , pdev_id);
139+ LOGINFO (" pdev gc actor not found for pdev_id={}, chunk= {}" , pdev_id, chunk_id );
139140 return folly::makeFuture< bool >(false );
140141 }
141142 auto & actor = it->second ;
@@ -178,22 +179,14 @@ bool GCManager::is_eligible_for_gc(chunk_id_t chunk_id) {
178179 return false ;
179180 }
180181
181- // it does not belong to any pg, so we don't need to gc it.
182- if (!chunk->m_pg_id .has_value ()) {
183- LOGDEBUG (" chunk_id={} belongs to no pg, not eligible for gc" , chunk_id)
184- return false ;
185- }
186-
187182 const auto total_blk_num = chunk->get_total_blks ();
188183
189184 const auto gc_garbage_rate_threshold = HS_BACKEND_DYNAMIC_CONFIG (gc_garbage_rate_threshold);
190185
191186 bool should_gc = 100 * defrag_blk_num >= total_blk_num * gc_garbage_rate_threshold;
192187
193- LOGDEBUG (" gc scan chunk_id={}, belongs to pg {}, use_blks={}, available_blks={}, total_blks={}, defrag_blks={}, "
194- " should_gc={}" ,
195- chunk_id, chunk->m_pg_id .value (), chunk->get_used_blks (), chunk->available_blks (), total_blk_num,
196- defrag_blk_num, should_gc);
188+ LOGDEBUG (" gc scan chunk_id={}, use_blks={}, available_blks={}, total_blks={}, defrag_blks={}, should_gc={}" ,
189+ chunk_id, chunk->get_used_blks (), chunk->available_blks (), total_blk_num, defrag_blk_num, should_gc);
197190
198191 return should_gc;
199192}
@@ -292,10 +285,27 @@ void GCManager::pdev_gc_actor::add_reserved_chunk(
292285}
293286
294287folly::SemiFuture< bool > GCManager::pdev_gc_actor::add_gc_task (uint8_t priority, chunk_id_t move_from_chunk) {
288+ auto EXvchunk = m_chunk_selector->get_extend_vchunk (move_from_chunk);
289+ // it does not belong to any pg, so we don't need to gc it.
290+ if (!EXvchunk->m_pg_id .has_value ()) {
291+ LOGDEBUG (" chunk_id={} belongs to no pg, not eligible for gc" , move_from_chunk)
292+ return folly::makeSemiFuture< bool >(false );
293+ }
294+
295+ const auto pg_id = EXvchunk->m_pg_id .value ();
296+
297+ m_hs_home_object->gc_manager ()->incr_pg_pending_gc_task (pg_id);
298+
299+ if (!m_hs_home_object->can_chunks_in_pg_be_gc (pg_id)) {
300+ LOGDEBUG (" chunk_id={} belongs to pg {}, which is not eligible for gc at this moment!" , move_from_chunk, pg_id)
301+ m_hs_home_object->gc_manager ()->decr_pg_pending_gc_task (pg_id);
302+ return folly::makeSemiFuture< bool >(false );
303+ }
304+
295305 if (m_chunk_selector->try_mark_chunk_to_gc_state (move_from_chunk,
296306 priority == static_cast < uint8_t >(task_priority::emergent))) {
297307 auto [promise, future] = folly::makePromiseContract< bool >();
298- auto gc_task_id = GCManager::_gc_task_id.fetch_add (1 );
308+ const auto gc_task_id = GCManager::_gc_task_id.fetch_add (1 );
299309
300310 if (sisl_unlikely (priority == static_cast < uint8_t >(task_priority::emergent))) {
301311 m_egc_executor->add ([this , gc_task_id, priority, move_from_chunk, promise = std::move (promise)]() mutable {
@@ -313,9 +323,49 @@ folly::SemiFuture< bool > GCManager::pdev_gc_actor::add_gc_task(uint8_t priority
313323 }
314324
315325 LOGWARN (" fail to submit gc task for chunk_id={}, priority={}" , move_from_chunk, priority);
326+ m_hs_home_object->gc_manager ()->decr_pg_pending_gc_task (pg_id);
316327 return folly::makeSemiFuture< bool >(false );
317328}
318329
330+ void GCManager::drain_pg_pending_gc_task (const pg_id_t pg_id) {
331+ while (true ) {
332+ uint64_t pending_gc_task_num{0 };
333+ {
334+ std::unique_lock lock (m_pending_gc_task_mtx);
335+ pending_gc_task_num = m_pending_gc_task_num_per_pg.try_emplace (pg_id, 0 ).first ->second .load ();
336+ }
337+
338+ if (pending_gc_task_num) {
339+ LOGDEBUG (" {} pending gc tasks to be completed for pg={}, wait for 2 seconds!" , pending_gc_task_num, pg_id);
340+ } else {
341+ break ;
342+ }
343+ // wait until all the pending gc tasks for this pg are completed
344+ std::this_thread::sleep_for (std::chrono::seconds (2 ));
345+ }
346+
347+ LOGDEBUG (" all pending gc tasks for pg_id={} are completed" , pg_id);
348+ }
349+
350+ void GCManager::decr_pg_pending_gc_task (const pg_id_t pg_id) {
351+ std::unique_lock lock (m_pending_gc_task_mtx);
352+ auto & pending_gc_task_num = m_pending_gc_task_num_per_pg.try_emplace (pg_id, 0 ).first ->second ;
353+ if (pending_gc_task_num.load ()) {
354+ // TODO::avoid overflow here.
355+ --pending_gc_task_num;
356+ LOGDEBUG (" decrease pending gc task num for pg_id={}, now it is {}" , pg_id, pending_gc_task_num.load ());
357+ return ;
358+ }
359+ LOGDEBUG (" pending gc task num for pg_id={} is already 0, no need to decrease it" , pg_id);
360+ }
361+
362+ void GCManager::incr_pg_pending_gc_task (const pg_id_t pg_id) {
363+ std::unique_lock lock (m_pending_gc_task_mtx);
364+ auto & pending_gc_task_num = m_pending_gc_task_num_per_pg.try_emplace (pg_id, 0 ).first ->second ;
365+ ++pending_gc_task_num;
366+ LOGDEBUG (" increase pending gc task num for pg_id={}, now it is {}" , pg_id, pending_gc_task_num.load ());
367+ }
368+
319369// this method is expected to be called sequentially when replaying metablk, so we don't need to worry about the
320370// concurrency issue.
321371void GCManager::pdev_gc_actor::handle_recovered_gc_task (
@@ -448,10 +498,11 @@ bool GCManager::pdev_gc_actor::replace_blob_index(
448498 }
449499
450500 if (existing_pbas.chunk_num () != move_from_chunk) {
451- LOGERROR (" gc task_id={}, existing pbas chunk={} should be equal to move_from_chunk={}, pg_id={}, "
452- " shard_id={}, blob_id={}, move_to_chunk={} , existing_pbas={}, new_pbas={}" ,
453- task_id, existing_pbas.chunk_num (), move_from_chunk, pg_id, shard, blob, move_to_chunk,
454- existing_pbas.to_string (), new_pbas.to_string ());
501+ LOGWARN (" gc task_id={}, existing pbas chunk={} should be equal to move_from_chunk={}, pg_id={}, "
502+ " shard_id={}, blob_id={}, move_to_chunk={} , existing_pbas={}, new_pbas={}, this case "
503+ " might happen when crash recovery" ,
504+ task_id, existing_pbas.chunk_num (), move_from_chunk, pg_id, shard, blob, move_to_chunk,
505+ existing_pbas.to_string (), new_pbas.to_string ());
455506 return homestore::put_filter_decision::keep;
456507 }
457508
@@ -676,8 +727,9 @@ bool GCManager::pdev_gc_actor::copy_valid_data(chunk_id_t move_from_chunk, chunk
676727 RELEASE_ASSERT (header_sgs.iovs .size () == 1 , " header_sgs.iovs.size() should be 1, but not!" );
677728 iomanager.iobuf_free (reinterpret_cast < uint8_t * >(header_sgs.iovs [0 ].iov_base ));
678729 if (err) {
679- LOGERROR (" gc task_id={}, Failed to write shard header for move_to_chunk={} shard_id={}, err={}" ,
680- task_id, move_to_chunk, shard_id, err.value ());
730+ LOGERROR (" gc task_id={}, Failed to write shard header for move_to_chunk={} shard_id={}, "
731+ " err={}, err_category={}, err_message={}" ,
732+ task_id, move_to_chunk, shard_id, err.value (), err.category ().name (), err.message ());
681733 return folly::makeFuture< bool >(false );
682734 }
683735
@@ -713,8 +765,9 @@ bool GCManager::pdev_gc_actor::copy_valid_data(chunk_id_t move_from_chunk, chunk
713765 if (err) {
714766 LOGERROR (
715767 " gc task_id={}, Failed to read blob from move_from_chunk={}, shard_id={}, "
716- " blob_id={}: err={}" ,
717- task_id, move_from_chunk, k.key ().shard , k.key ().blob , err.value ());
768+ " blob_id={}, err={}, err_category={}, err_message={}" ,
769+ task_id, move_from_chunk, k.key ().shard , k.key ().blob , err.value (),
770+ err.category ().name (), err.message ());
718771 iomanager.iobuf_free (reinterpret_cast < uint8_t * >(data_sgs.iovs [0 ].iov_base ));
719772 return folly::makeFuture< bool >(false );
720773 }
@@ -747,10 +800,11 @@ bool GCManager::pdev_gc_actor::copy_valid_data(chunk_id_t move_from_chunk, chunk
747800 iomanager.iobuf_free (
748801 reinterpret_cast < uint8_t * >(data_sgs.iovs [0 ].iov_base ));
749802 if (err) {
750- LOGERROR (" gc task_id={}, Failed to write blob to move_to_chunk={}, "
751- " shard_id={}, blob_id={}, err={}" ,
752- task_id, move_to_chunk, k.key ().shard , k.key ().blob ,
753- err.value ());
803+ LOGERROR (
804+ " gc task_id={}, Failed to write blob to move_to_chunk={}, "
805+ " shard_id={}, blob_id={}, err={}, err_category={}, err_message={}" ,
806+ task_id, move_to_chunk, k.key ().shard , k.key ().blob , err.value (),
807+ err.category ().name (), err.message ());
754808 return false ;
755809 }
756810
@@ -811,10 +865,10 @@ bool GCManager::pdev_gc_actor::copy_valid_data(chunk_id_t move_from_chunk, chunk
811865 iomanager.iobuf_free (reinterpret_cast < uint8_t * >(footer_sgs.iovs [0 ].iov_base ));
812866
813867 if (err) {
814- LOGERROR (
815- " gc task_id ={}, Failed to write shard footer for move_to_chunk ={} shard_id ={}, "
816- " err={} " ,
817- task_id, move_to_chunk, shard_id, err.value ());
868+ LOGERROR (" gc task_id={}, Failed to write shard footer for move_to_chunk={} "
869+ " shard_id ={}, err ={}, error_category ={}, error_message={} " ,
870+ task_id, move_to_chunk, shard_id, err. value (), err. category (). name () ,
871+ err.message ());
818872 return false ;
819873 }
820874 return true ;
@@ -1043,6 +1097,7 @@ void GCManager::pdev_gc_actor::process_gc_task(chunk_id_t move_from_chunk, uint8
10431097
10441098 task.setValue (true );
10451099 m_reserved_chunk_queue.blockingWrite (move_from_chunk);
1100+ m_hs_home_object->gc_manager ()->decr_pg_pending_gc_task (pg_id);
10461101 LOGINFO (" gc task_id={} for move_from_chunk={} to move_to_chunk={} with priority={} is completed" , task_id,
10471102 move_from_chunk, move_to_chunk, priority);
10481103}
0 commit comments