Skip to content

Commit cf39763

Browse files
committed
Refresh PG statistics after log replay on restart
Add refresh_pg_statistics() to recalculate and update PG statistics (active_blob_count, tombstone_blob_count, total_occupied_blk_count) after log replay completes. This ensures statistics accuracy after system crashes or restarts. The function is called in on_log_replay_done() before the raft group joins, scanning the index table and chunks to recompute all three statistics from actual data. Statistics are persisted at the next periodic checkpoint.
1 parent a5baea7 commit cf39763

File tree

4 files changed

+80
-21
lines changed

4 files changed

+80
-21
lines changed

conanfile.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010

1111
class HomeObjectConan(ConanFile):
1212
name = "homeobject"
13-
version = "4.1.1"
13+
version = "4.1.2"
1414

1515
homepage = "https://github.com/eBay/HomeObject"
1616
description = "Blob Store built on HomeStore"

src/lib/homestore_backend/hs_homeobject.hpp

Lines changed: 7 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -576,12 +576,10 @@ class HSHomeObject : public HomeObjectImpl {
576576
REGISTER_COUNTER(snp_dnr_error_count, "Error times when reading blobs in baseline resync");
577577
REGISTER_HISTOGRAM(snp_dnr_blob_process_latency,
578578
"Time cost(us) of successfully process a blob in baseline resync",
579-
HistogramBucketsType(LowResolutionLatecyBuckets),
580-
_publish_as::publish_as_sum_count);
579+
HistogramBucketsType(LowResolutionLatecyBuckets), _publish_as::publish_as_sum_count);
581580
REGISTER_HISTOGRAM(snp_dnr_batch_process_latency,
582581
"Time cost(ms) of successfully process a batch in baseline resync",
583-
HistogramBucketsType(LowResolutionLatecyBuckets),
584-
_publish_as::publish_as_sum_count);
582+
HistogramBucketsType(LowResolutionLatecyBuckets), _publish_as::publish_as_sum_count);
585583
REGISTER_HISTOGRAM(snp_dnr_batch_e2e_latency,
586584
"Time cost(ms) of a batch end-to-end round trip in baseline resync",
587585
HistogramBucketsType(LowResolutionLatecyBuckets));
@@ -684,12 +682,10 @@ class HSHomeObject : public HomeObjectImpl {
684682
REGISTER_GAUGE(snp_rcvr_error_count, "Error count in baseline resync");
685683
REGISTER_HISTOGRAM(snp_rcvr_blob_process_time,
686684
"Time cost(us) of successfully process a blob in baseline resync",
687-
HistogramBucketsType(LowResolutionLatecyBuckets),
688-
_publish_as::publish_as_sum_count);
685+
HistogramBucketsType(LowResolutionLatecyBuckets), _publish_as::publish_as_sum_count);
689686
REGISTER_HISTOGRAM(snp_rcvr_batch_process_time,
690687
"Time cost(ms) of successfully process a batch in baseline resync",
691-
HistogramBucketsType(LowResolutionLatecyBuckets),
692-
_publish_as::publish_as_sum_count);
688+
HistogramBucketsType(LowResolutionLatecyBuckets), _publish_as::publish_as_sum_count);
693689

694690
attach_gather_cb(std::bind(&ReceiverSnapshotMetrics::on_gather, this));
695691
register_me_to_farm();
@@ -1059,6 +1055,9 @@ class HSHomeObject : public HomeObjectImpl {
10591055

10601056
BlobManager::Result< std::vector< BlobInfo > > get_shard_blobs(shard_id_t shard_id);
10611057

1058+
// Refresh PG statistics (called after log replay)
1059+
void refresh_pg_statistics(pg_id_t pg_id);
1060+
10621061
private:
10631062
BlobManager::Result< std::string > do_verify_blob(const void* blob, shard_id_t expected_shard_id,
10641063
blob_id_t expected_blob_id = 0) const;

src/lib/homestore_backend/hs_pg_manager.cpp

Lines changed: 68 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -398,10 +398,10 @@ void HSHomeObject::on_pg_complete_replace_member(group_id_t group_id, const std:
398398
boost::uuids::to_string(member_out.id), boost::uuids::to_string(member_in.id), tid);
399399
}
400400

401-
//This function actually perform rollback for replace member task: Remove in_member, and ensure out_member exists
401+
// This function actually perform rollback for replace member task: Remove in_member, and ensure out_member exists
402402
void HSHomeObject::on_pg_clean_replace_member_task(group_id_t group_id, const std::string& task_id,
403-
const replica_member_info& member_out,
404-
const replica_member_info& member_in, trace_id_t tid) {
403+
const replica_member_info& member_out,
404+
const replica_member_info& member_in, trace_id_t tid) {
405405
std::unique_lock lck(_pg_lock);
406406
for (const auto& iter : _pg_map) {
407407
auto& pg = iter.second;
@@ -421,13 +421,13 @@ void HSHomeObject::on_pg_clean_replace_member_task(group_id_t group_id, const st
421421

422422
LOGI("PG clean replace member task done (rollback), task_id={}, removed in_member={} (removed={}), "
423423
"ensured out_member={} (inserted={}), member_nums={}, trace_id={}",
424-
task_id, boost::uuids::to_string(member_in.id), removed_count,
425-
boost::uuids::to_string(member_out.id), inserted, pg->pg_info_.members.size(), tid);
424+
task_id, boost::uuids::to_string(member_in.id), removed_count, boost::uuids::to_string(member_out.id),
425+
inserted, pg->pg_info_.members.size(), tid);
426426
return;
427427
}
428428
}
429-
LOGE("PG clean replace member task failed, pg not found, task_id={}, member_out={}, member_in={}, trace_id={}", task_id,
430-
boost::uuids::to_string(member_out.id), boost::uuids::to_string(member_in.id), tid);
429+
LOGE("PG clean replace member task failed, pg not found, task_id={}, member_out={}, member_in={}, trace_id={}",
430+
task_id, boost::uuids::to_string(member_out.id), boost::uuids::to_string(member_in.id), tid);
431431
}
432432

433433
bool HSHomeObject::reconcile_membership(pg_id_t pg_id) {
@@ -457,7 +457,8 @@ bool HSHomeObject::reconcile_membership(pg_id_t pg_id) {
457457
// New member not in our records, add with default name
458458
PGMember new_member(member_id);
459459
new_members.insert(std::move(new_member));
460-
LOGE("Adding new member {} to pg={} membership, should not happen!", boost::uuids::to_string(member_id), hs_pg->pg_info_.id);
460+
LOGE("Adding new member {} to pg={} membership, should not happen!", boost::uuids::to_string(member_id),
461+
hs_pg->pg_info_.id);
461462
}
462463
}
463464
// Check if membership changed
@@ -471,9 +472,7 @@ bool HSHomeObject::reconcile_membership(pg_id_t pg_id) {
471472
std::vector< peer_id_t > added_members;
472473

473474
for (auto& old_member : hs_pg->pg_info_.members) {
474-
if (new_members.find(old_member) == new_members.end()) {
475-
removed_members.push_back(old_member.id);
476-
}
475+
if (new_members.find(old_member) == new_members.end()) { removed_members.push_back(old_member.id); }
477476
}
478477

479478
for (auto& new_member : new_members) {
@@ -482,7 +481,8 @@ bool HSHomeObject::reconcile_membership(pg_id_t pg_id) {
482481
}
483482
}
484483

485-
LOGI("Reconciling PG {} membership: removing {} members, adding {} members, old membership count {}, new membership count: {}",
484+
LOGI("Reconciling PG {} membership: removing {} members, adding {} members, old membership count {}, new "
485+
"membership count: {}",
486486
pg_id, removed_members.size(), added_members.size(), hs_pg->pg_info_.members.size(), new_members.size());
487487

488488
for (auto& member_id : removed_members) {
@@ -1245,6 +1245,62 @@ uint32_t HSHomeObject::get_pg_tombstone_blob_count(pg_id_t pg_id) const {
12451245
return tombstone_blob_count;
12461246
}
12471247

1248+
// Recompute the PG statistics (active_blob_count, tombstone_blob_count,
// total_occupied_blk_count) from persisted data and publish them into the
// in-memory durable entities. Called after log replay completes so the
// counters reflect on-disk truth after a crash/restart; the refreshed values
// are persisted at the next periodic checkpoint.
//
// @param pg_id  id of the PG whose statistics are refreshed.
void HSHomeObject::refresh_pg_statistics(pg_id_t pg_id) {
    // NOTE(review): _get_hs_pg_unlocked is used without holding _pg_lock; this
    // is presumably safe only because replay runs before the raft group joins
    // and no concurrent PG-map mutation is possible yet — confirm.
    auto hs_pg = const_cast< HS_PG* >(_get_hs_pg_unlocked(pg_id));
    RELEASE_ASSERT(hs_pg != nullptr, "Failed to get pg={} for statistics refresh", pg_id);

    // Step 1: Scan index table to count active and tombstone blobs in one pass.
    uint64_t active_count = 0;
    uint64_t tombstone_count = 0;

    // Key range [pg_id << shard_width, (pg_id + 1) << shard_width) covers every
    // shard — and thus every blob route — belonging to this PG.
    auto start_key =
        BlobRouteKey{BlobRoute{uint64_t(pg_id) << homeobject::shard_width, std::numeric_limits< uint64_t >::min()}};
    auto end_key =
        BlobRouteKey{BlobRoute{uint64_t(pg_id + 1) << homeobject::shard_width, std::numeric_limits< uint64_t >::min()}};

    homestore::BtreeQueryRequest< BlobRouteKey > query_req{
        homestore::BtreeKeyRange< BlobRouteKey >{std::move(start_key), true /* inclusive */, std::move(end_key),
                                                 false /* inclusive */},
        homestore::BtreeQueryType::SWEEP_NON_INTRUSIVE_PAGINATION_QUERY,
        std::numeric_limits< uint32_t >::max() /* blob count in a pg will not exceed uint32_t_max */,
        // Counting happens inside the filter callback; `mutable` is unnecessary
        // since the counters are captured by reference.
        [&active_count, &tombstone_count](homestore::BtreeKey const& key,
                                          homestore::BtreeValue const& value) -> bool {
            BlobRouteValue blob_value{value};
            // A tombstone pba marks a deleted blob whose index entry remains.
            if (blob_value.pbas() == HSHomeObject::tombstone_pbas) {
                tombstone_count++;
            } else {
                active_count++;
            }
            return false; // Continue scanning (per original intent; verify homestore filter-cb contract)
        }};

    std::vector< std::pair< BlobRouteKey, BlobRouteValue > > dummy_out;
    homestore::btree_status_t ret;
    // Fix: a pagination query may legitimately return has_more; previously that
    // status was accepted as terminal, which would leave the counts computed
    // from a *partial* scan. Keep re-issuing the query (query_req carries the
    // pagination cursor) until the scan completes.
    do {
        dummy_out.clear(); // keep memory bounded across pagination rounds
        ret = hs_pg->index_table_->query(query_req, dummy_out);
    } while (ret == homestore::btree_status_t::has_more);
    RELEASE_ASSERT(ret == homestore::btree_status_t::success, "Failed to scan index table for pg={}, status={}", pg_id,
                   ret);

    // Step 2: Scan this PG's chunks to calculate total occupied blocks.
    auto chunk_ids = chunk_selector()->get_pg_chunks(pg_id);
    RELEASE_ASSERT(chunk_ids != nullptr, "Failed to get chunks for pg={}", pg_id);

    uint64_t total_occupied = 0;
    for (const auto& chunk_id : *chunk_ids) {
        auto vchunk = chunk_selector()->get_extend_vchunk(chunk_id);
        RELEASE_ASSERT(vchunk != nullptr, "Failed to get vchunk={} for pg={}", chunk_id, pg_id);
        total_occupied += vchunk->get_used_blks();
    }

    // Step 3: Publish the recomputed values into durable_entities (atomic
    // variables in memory); they reach disk at the next periodic checkpoint.
    hs_pg->durable_entities_update([active_count, tombstone_count, total_occupied](auto& de) {
        de.active_blob_count.store(active_count, std::memory_order_relaxed);
        de.tombstone_blob_count.store(tombstone_count, std::memory_order_relaxed);
        de.total_occupied_blk_count.store(total_occupied, std::memory_order_relaxed);
    });

    LOGI("Refreshed statistics for pg={}: active_blobs={}, tombstone_blobs={}, occupied_blocks={}", pg_id, active_count,
         tombstone_count, total_occupied);
}
1303+
12481304
void HSHomeObject::update_pg_meta_after_gc(const pg_id_t pg_id, const homestore::chunk_num_t move_from_chunk,
12491305
const homestore::chunk_num_t move_to_chunk, const uint64_t task_id) {
12501306
// 1 update pg metrics

src/lib/homestore_backend/replication_state_machine.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1046,6 +1046,10 @@ void ReplicationStateMachine::on_log_replay_done(const homestore::group_id_t& gr
10461046
LOGD("vchunk={} is selected for shard={} in pg={} when recovery", vchunk_id, shard_sb->info.id, pg_id);
10471047
}
10481048
}
1049+
1050+
// Refresh PG statistics after log replay
1051+
LOGI("Starting statistics refresh for pg={}", pg_id);
1052+
home_object_->refresh_pg_statistics(pg_id);
10491053
}
10501054

10511055
} // namespace homeobject

0 commit comments

Comments
 (0)