Skip to content

Commit d1e66aa

Browse files
authored
[fix](cloud) fix tablet stats for versioned keys (#59193)
1 parent 1bb5b4e commit d1e66aa

File tree

5 files changed

+218
-8
lines changed

5 files changed

+218
-8
lines changed

cloud/src/meta-service/meta_service.cpp

Lines changed: 35 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5528,16 +5528,48 @@ std::pair<std::string, std::string> init_key_pair(std::string instance_id, int64
55285528
}
55295529

55305530
MetaServiceResponseStatus MetaServiceImpl::fix_tablet_stats(std::string cloud_unique_id_str,
5531-
std::string table_id_str) {
5531+
std::string table_id_str,
5532+
std::string tablet_id_str) {
55325533
// parse params
55335534
int64_t table_id;
5535+
int64_t tablet_id = -1;
55345536
std::string instance_id;
5535-
MetaServiceResponseStatus st = parse_fix_tablet_stats_param(
5536-
resource_mgr_, table_id_str, cloud_unique_id_str, table_id, instance_id);
5537+
MetaServiceResponseStatus st =
5538+
parse_fix_tablet_stats_param(resource_mgr_, table_id_str, cloud_unique_id_str,
5539+
tablet_id_str, table_id, instance_id, tablet_id);
55375540
if (st.code() != MetaServiceCode::OK) {
55385541
return st;
55395542
}
55405543

5544+
bool is_versioned_read = is_version_read_enabled(instance_id);
5545+
bool is_versioned_write = is_version_write_enabled(instance_id);
5546+
if (is_versioned_write) {
5547+
if (tablet_id < 0) {
5548+
st.set_code(MetaServiceCode::INVALID_ARGUMENT);
5549+
st.set_msg(
5550+
"cannot fix tablet stats for all tablets of a table when versioned write is "
5551+
"enabled, consider specifying tablet_id");
5552+
return st;
5553+
}
5554+
5555+
TabletIndexPB tablet_idx;
5556+
CloneChainReader reader(instance_id, txn_kv_.get(), resource_mgr_.get());
5557+
TxnErrorCode err = reader.get_tablet_index(tablet_id, &tablet_idx);
5558+
if (err != TxnErrorCode::TXN_OK) {
5559+
st.set_code(cast_as<ErrCategory::READ>(err));
5560+
st.set_msg(fmt::format("failed to get tablet index for tablet_id={}, err={}", tablet_id,
5561+
err));
5562+
return st;
5563+
}
5564+
5565+
auto&& [code, msg] = fix_versioned_tablet_stats_internal(
5566+
txn_kv_.get(), instance_id, tablet_idx, is_versioned_read, is_versioned_write,
5567+
resource_mgr_.get());
5568+
st.set_code(code);
5569+
st.set_msg(std::move(msg));
5570+
return st;
5571+
}
5572+
55415573
std::pair<std::string, std::string> key_pair = init_key_pair(instance_id, table_id);
55425574
std::string old_begin_key;
55435575
while (old_begin_key < key_pair.first) {

cloud/src/meta-service/meta_service.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -368,7 +368,7 @@ class MetaServiceImpl : public cloud::MetaService {
368368
InstanceInfoPB* instance);
369369

370370
MetaServiceResponseStatus fix_tablet_stats(std::string cloud_unique_id_str,
371-
std::string table_id_str);
371+
std::string table_id_str, std::string tablet_id_str);
372372

373373
std::pair<MetaServiceCode, std::string> fix_tablet_db_id(const std::string& instance_id,
374374
int64_t tablet_id, int64_t db_id);

cloud/src/meta-service/meta_service_http.cpp

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -609,9 +609,10 @@ static HttpResponse process_fix_tablet_stats(MetaServiceImpl* service, brpc::Con
609609
auto& uri = ctrl->http_request().uri();
610610
std::string_view cloud_unique_id = http_query(uri, "cloud_unique_id");
611611
std::string_view table_id = http_query(uri, "table_id");
612+
std::string_view tablet_id = http_query(uri, "tablet_id");
612613

613-
MetaServiceResponseStatus st =
614-
service->fix_tablet_stats(std::string(cloud_unique_id), std::string(table_id));
614+
MetaServiceResponseStatus st = service->fix_tablet_stats(
615+
std::string(cloud_unique_id), std::string(table_id), std::string(tablet_id));
615616
return http_text_reply(st, st.DebugString());
616617
}
617618

cloud/src/meta-service/meta_service_tablet_stats.cpp

Lines changed: 173 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,8 +32,10 @@
3232
#include "meta-service/meta_service_helper.h"
3333
#include "meta-store/clone_chain_reader.h"
3434
#include "meta-store/keys.h"
35+
#include "meta-store/meta_reader.h"
3536
#include "meta-store/txn_kv.h"
3637
#include "meta-store/txn_kv_error.h"
38+
#include "meta-store/versioned_value.h"
3739

3840
namespace doris::cloud {
3941

@@ -302,7 +304,8 @@ void internal_get_load_tablet_stats_batch(MetaServiceCode& code, std::string& ms
302304

303305
MetaServiceResponseStatus parse_fix_tablet_stats_param(
304306
std::shared_ptr<ResourceManager> resource_mgr, const std::string& table_id_str,
305-
const std::string& cloud_unique_id_str, int64_t& table_id, std::string& instance_id) {
307+
const std::string& cloud_unique_id_str, const std::string& tablet_id_str, int64_t& table_id,
308+
std::string& instance_id, int64_t& tablet_id) {
306309
MetaServiceCode code = MetaServiceCode::OK;
307310
std::string msg;
308311
MetaServiceResponseStatus st;
@@ -317,6 +320,16 @@ MetaServiceResponseStatus parse_fix_tablet_stats_param(
317320
return st;
318321
}
319322

323+
if (!tablet_id_str.empty()) {
324+
try {
325+
tablet_id = std::stoll(tablet_id_str);
326+
} catch (...) {
327+
st.set_code(MetaServiceCode::INVALID_ARGUMENT);
328+
st.set_msg("Invalid tablet_id, tablet_id: " + tablet_id_str);
329+
return st;
330+
}
331+
}
332+
320333
instance_id = get_instance_id(resource_mgr, cloud_unique_id_str);
321334
if (instance_id.empty()) {
322335
code = MetaServiceCode::INVALID_ARGUMENT;
@@ -636,4 +649,163 @@ MetaServiceResponseStatus check_new_tablet_stats(
636649
return st;
637650
}
638651

652+
// Recomputes the on-disk size statistics of ONE tablet from its rowset metas and
// rewrites the tablet stats inside a single transaction.
//
// Steps:
//   1. Load the current stats and all rowset metas of the tablet, either through
//      the versioned reader (is_versioned_read) or through the single-version
//      helpers internal_get_tablet_stats / internal_get_rowset.
//   2. Sum total/index/data disk size over the rowsets and store them as
//      data_size/index_size/segment_size of a copy of the current stats.
//   3. Put the single-version stats value and remove the six split counter keys
//      (num_segs, num_rows, num_rowsets, data_size, index_size, segment_size).
//   4. If is_versioned_write, also put the versioned compact stats (the fixed
//      value) and the versioned load stats (index only, all counters unset).
//   5. Commit.
//
// @param txn_kv             KV store used to create the transaction.
// @param instance_id        Instance owning the tablet.
// @param tablet_idx         Index (table/index/partition/tablet id) of the tablet;
//                           it is the source of the ids used to build stats keys.
// @param is_versioned_read  Read stats/rowsets through CloneChainReader.
// @param is_versioned_write Additionally write the versioned stats keys.
// @param resource_mgr       Passed to CloneChainReader.
// @return {MetaServiceCode::OK, ""} on success, otherwise the failing code and a
//         message describing the step that failed.
std::pair<MetaServiceCode, std::string> fix_versioned_tablet_stats_internal(
        TxnKv* txn_kv, const std::string& instance_id, const TabletIndexPB& tablet_idx,
        bool is_versioned_read, bool is_versioned_write, ResourceManager* resource_mgr) {
    const int64_t tablet_id = tablet_idx.tablet_id();
    MetaServiceCode code = MetaServiceCode::OK;
    std::string msg;

    std::unique_ptr<Transaction> txn;
    TxnErrorCode err = txn_kv->create_txn(&txn);
    if (err != TxnErrorCode::TXN_OK) {
        code = cast_as<ErrCategory::CREATE>(err);
        msg = "failed to create txn";
        return {code, msg};
    }

    TabletStatsPB original_tablet_stat;
    Versionstamp compact_versionstamp;
    Versionstamp load_versionstamp;
    GetRowsetResponse resp;

    if (is_versioned_read) {
        CloneChainReader meta_reader(instance_id, resource_mgr);

        // A missing key is tolerated for both stats parts: the corresponding
        // message simply stays default-initialized.
        TabletStatsPB existing_compact_stats;
        err = meta_reader.get_tablet_compact_stats(txn.get(), tablet_id, &existing_compact_stats,
                                                   &compact_versionstamp, true);
        if (err != TxnErrorCode::TXN_OK && err != TxnErrorCode::TXN_KEY_NOT_FOUND) {
            code = cast_as<ErrCategory::READ>(err);
            msg = fmt::format("failed to get versioned compact stats, tablet_id={}, err={}",
                              tablet_id, err);
            return {code, msg};
        }

        TabletStatsPB existing_load_stats;
        err = meta_reader.get_tablet_load_stats(txn.get(), tablet_id, &existing_load_stats,
                                                &load_versionstamp, true);
        if (err != TxnErrorCode::TXN_OK && err != TxnErrorCode::TXN_KEY_NOT_FOUND) {
            code = cast_as<ErrCategory::READ>(err);
            msg = fmt::format("failed to get versioned load stats, tablet_id={}, err={}", tablet_id,
                              err);
            return {code, msg};
        }
        MetaReader::merge_tablet_stats(existing_compact_stats, existing_load_stats,
                                       &original_tablet_stat);

        std::vector<RowsetMetaCloudPB> rowset_metas;
        int64_t start = 0;
        int64_t end = std::numeric_limits<int64_t>::max() - 1;
        err = meta_reader.get_rowset_metas(txn.get(), tablet_id, start, end, &rowset_metas);
        if (err != TxnErrorCode::TXN_OK) {
            code = cast_as<ErrCategory::READ>(err);
            msg = fmt::format("failed to get versioned rowset, err={}, tablet_id={}", err,
                              tablet_id);
            return {code, msg};
        }

        std::move(rowset_metas.begin(), rowset_metas.end(),
                  google::protobuf::RepeatedPtrFieldBackInserter(resp.mutable_rowset_meta()));
    } else {
        internal_get_tablet_stats(code, msg, txn.get(), instance_id, tablet_idx,
                                  original_tablet_stat, true);
        if (code != MetaServiceCode::OK) {
            return {code, msg};
        }
        // get rowsets in tablet and accumulate disk size
        internal_get_rowset(txn.get(), 0, std::numeric_limits<int64_t>::max() - 1, instance_id,
                            tablet_id, code, msg, &resp);
        if (code != MetaServiceCode::OK) {
            return {code, msg};
        }
    }

    // Build the stats keys from the caller-supplied tablet index. The index stored
    // inside the stats value may be empty (e.g. both versioned stats keys were
    // missing above), which would otherwise yield keys with zero ids.
    const int64_t table_id = tablet_idx.table_id();
    const int64_t index_id = tablet_idx.index_id();
    const int64_t partition_id = tablet_idx.partition_id();

    int64_t total_disk_size = 0;
    int64_t index_disk_size = 0;
    int64_t data_disk_size = 0;
    for (const auto& rs_meta : resp.rowset_meta()) {
        total_disk_size += rs_meta.total_disk_size();
        index_disk_size += rs_meta.index_disk_size();
        data_disk_size += rs_meta.data_disk_size();
    }

    // set new disk size to tabletPB and write it back
    TabletStatsPB tablet_stat;
    tablet_stat.CopyFrom(original_tablet_stat);
    if (tablet_stat.idx().tablet_id() != tablet_id) {
        // The loaded stats carried no usable index; keep the stored value
        // consistent with the key it is written under.
        tablet_stat.mutable_idx()->CopyFrom(tablet_idx);
    }
    tablet_stat.set_data_size(total_disk_size);
    tablet_stat.set_index_size(index_disk_size);
    tablet_stat.set_segment_size(data_disk_size);

    // Write single version stats
    std::string tablet_stat_key =
            stats_tablet_key({instance_id, table_id, index_id, partition_id, tablet_id});
    std::string tablet_stat_value;
    if (!tablet_stat.SerializeToString(&tablet_stat_value)) {
        code = MetaServiceCode::PROTOBUF_SERIALIZE_ERR;
        msg = "failed to serialize tablet stat";
        return {code, msg};
    }
    txn->put(tablet_stat_key, tablet_stat_value);

    // The aggregated value written above now holds the counters, so the split
    // per-counter keys are dropped.
    txn->remove(
            stats_tablet_num_segs_key({instance_id, table_id, index_id, partition_id, tablet_id}));
    txn->remove(
            stats_tablet_num_rows_key({instance_id, table_id, index_id, partition_id, tablet_id}));
    txn->remove(stats_tablet_num_rowsets_key(
            {instance_id, table_id, index_id, partition_id, tablet_id}));
    txn->remove(
            stats_tablet_data_size_key({instance_id, table_id, index_id, partition_id, tablet_id}));
    txn->remove(stats_tablet_index_size_key(
            {instance_id, table_id, index_id, partition_id, tablet_id}));
    txn->remove(stats_tablet_segment_size_key(
            {instance_id, table_id, index_id, partition_id, tablet_id}));

    if (is_versioned_write) {
        // NOTE(review): when is_versioned_read is false the two versionstamps were
        // never read and are still default-constructed, although the log lines
        // below say "existing versionstamp" -- confirm this combination is intended.

        // Write compact stats (aggregate stats with accurate disk sizes)
        std::string compact_stats_key =
                versioned::tablet_compact_stats_key({instance_id, tablet_id});
        versioned_put(txn.get(), compact_stats_key, compact_versionstamp, tablet_stat_value);
        LOG(INFO) << "put versioned tablet compact stats key=" << hex(compact_stats_key)
                  << " tablet_id=" << tablet_id << " with existing versionstamp";

        // Write load stats (detached stats, set to 0 since we recalculated from rowsets)
        std::string load_stats_key = versioned::tablet_load_stats_key({instance_id, tablet_id});
        TabletStatsPB load_stats;
        load_stats.mutable_idx()->CopyFrom(tablet_stat.idx());

        std::string load_stats_value;
        if (!load_stats.SerializeToString(&load_stats_value)) {
            code = MetaServiceCode::PROTOBUF_SERIALIZE_ERR;
            msg = "failed to serialize load stats";
            return {code, msg};
        }

        // Overwrite with existing versionstamp
        versioned_put(txn.get(), load_stats_key, load_versionstamp, load_stats_value);
        LOG(INFO) << "put versioned tablet load stats key=" << hex(load_stats_key)
                  << " tablet_id=" << tablet_id << " with existing versionstamp";
    }

    err = txn->commit();
    if (err != TxnErrorCode::TXN_OK) {
        code = cast_as<ErrCategory::COMMIT>(err);
        msg = "failed to commit txn";
        return {code, msg};
    }

    return {MetaServiceCode::OK, ""};
}
810+
639811
} // namespace doris::cloud

cloud/src/meta-service/meta_service_tablet_stats.h

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -104,13 +104,18 @@ void internal_get_load_tablet_stats_batch(MetaServiceCode& code, std::string& ms
104104

105105
MetaServiceResponseStatus parse_fix_tablet_stats_param(
106106
std::shared_ptr<ResourceManager> resource_mgr, const std::string& table_id_str,
107-
const std::string& cloud_unique_id_str, int64_t& table_id, std::string& instance_id);
107+
const std::string& cloud_unique_id_str, const std::string& tablet_id_str, int64_t& table_id,
108+
std::string& instance_id, int64_t& tablet_id);
108109

109110
MetaServiceResponseStatus fix_tablet_stats_internal(
110111
std::shared_ptr<TxnKv> txn_kv, std::pair<std::string, std::string>& key_pair,
111112
std::vector<std::shared_ptr<TabletStatsPB>>& tablet_stat_shared_ptr_vec_batch,
112113
const std::string& instance_id, size_t batch_size = 20);
113114

115+
// Fixes the stats of the single tablet described by `tablet_idx`: sums the disk
// sizes of the tablet's rowset metas, stores them as data/index/segment size of
// the tablet stats and writes the result back in one transaction created from
// `txn_kv`. `is_versioned_read` selects the versioned reader for loading stats
// and rowsets; `is_versioned_write` additionally writes the versioned compact
// and load stats keys. Returns {MetaServiceCode::OK, ""} on success, otherwise
// the error code and a message.
std::pair<MetaServiceCode, std::string> fix_versioned_tablet_stats_internal(
116+
TxnKv* txn_kv, const std::string& instance_id, const TabletIndexPB& tablet_idx,
117+
bool is_versioned_read, bool is_versioned_write, ResourceManager* resource_mgr);
118+
114119
MetaServiceResponseStatus check_new_tablet_stats(
115120
std::shared_ptr<TxnKv> txn_kv, const std::string& instance_id,
116121
const std::vector<std::shared_ptr<TabletStatsPB>>& tablet_stat_shared_ptr_vec_batch);

0 commit comments

Comments
 (0)