Skip to content

Commit 25c864d

Browse files
authored
[fix](cloud) Fix balanced_tablets_shards leak memory and fix some file cache case (#59093)
### What problem does this PR solve? 1. When fixing the issue of CloudWarmUpManager leaking a small amount of memory and failing to remove (tablet_id, peer_addr) from memory during warm-up balance when file cache meta is not retrieved from source, the issue was resolved. 2. Fixed several file cache Docker cases affected by small file optimization PR(#57770) Issue Number: close #xxx Related PR: #xxx Problem Summary: ### Release note None ### Check List (For Author) - Test <!-- At least one of them must be included. --> - [ ] Regression test - [ ] Unit Test - [ ] Manual test (add detailed scripts or steps below) - [ ] No need to test or manual test. Explain why: - [ ] This is a refactor/code format and no logic has been changed. - [x] Previous test can cover this change. - [ ] No code files have been changed. - [ ] Other reason <!-- Add your reason? --> - Behavior changed: - [x] No. - [ ] Yes. <!-- Explain the behavior change --> - Does this need documentation? - [x] No. - [ ] Yes. <!-- Add document PR link here. eg: apache/doris-website#1214 --> ### Check List (For Reviewer who merge this PR) - [ ] Confirm the release note - [ ] Confirm test cases - [ ] Confirm document - [ ] Add branch pick label <!-- Add branch pick label that this PR should merge into -->
1 parent 2fa6f72 commit 25c864d

14 files changed

+55
-38
lines changed

be/src/cloud/cloud_warm_up_manager.cpp

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,8 @@
1919

2020
#include <bthread/condition_variable.h>
2121
#include <bthread/mutex.h>
22+
#include <bthread/unstable.h>
23+
#include <butil/time.h>
2224
#include <bvar/bvar.h>
2325
#include <bvar/reducer.h>
2426

@@ -795,10 +797,39 @@ void CloudWarmUpManager::record_balanced_tablet(int64_t tablet_id, const std::st
795797
meta.brpc_port = brpc_port;
796798
shard.tablets.emplace(tablet_id, std::move(meta));
797799
g_balance_tablet_be_mapping_size << 1;
800+
schedule_remove_balanced_tablet(tablet_id);
798801
VLOG_DEBUG << "Recorded balanced warm up cache tablet: tablet_id=" << tablet_id
799802
<< ", host=" << host << ":" << brpc_port;
800803
}
801804

805+
void CloudWarmUpManager::schedule_remove_balanced_tablet(int64_t tablet_id) {
806+
// Use std::make_unique to avoid raw pointer allocation
807+
auto tablet_id_ptr = std::make_unique<int64_t>(tablet_id);
808+
unsigned long expired_ms = g_tablet_report_inactive_duration_ms;
809+
if (doris::config::cache_read_from_peer_expired_seconds > 0 &&
810+
doris::config::cache_read_from_peer_expired_seconds <=
811+
g_tablet_report_inactive_duration_ms / 1000) {
812+
expired_ms = doris::config::cache_read_from_peer_expired_seconds * 1000;
813+
}
814+
bthread_timer_t timer_id;
815+
// ATTN: The timer callback will reclaim ownership of the tablet_id_ptr, so we need to release it after the timer is added.
816+
if (const int rc = bthread_timer_add(&timer_id, butil::milliseconds_from_now(expired_ms),
817+
clean_up_expired_mappings, tablet_id_ptr.get());
818+
rc == 0) {
819+
tablet_id_ptr.release();
820+
} else {
821+
LOG(WARNING) << "Fail to add timer for clean up expired mappings for tablet_id="
822+
<< tablet_id << " rc=" << rc;
823+
}
824+
}
825+
826+
void CloudWarmUpManager::clean_up_expired_mappings(void* arg) {
827+
std::unique_ptr<int64_t> tid(static_cast<int64_t*>(arg));
828+
auto& manager = ExecEnv::GetInstance()->storage_engine().to_cloud().cloud_warm_up_manager();
829+
manager.remove_balanced_tablet(*tid);
830+
VLOG_DEBUG << "Removed expired balanced warm up cache tablet: tablet_id=" << *tid;
831+
}
832+
802833
std::optional<std::pair<std::string, int32_t>> CloudWarmUpManager::get_balanced_tablet_info(
803834
int64_t tablet_id) {
804835
auto& shard = get_shard(tablet_id);

be/src/cloud/cloud_warm_up_manager.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -98,6 +98,8 @@ class CloudWarmUpManager {
9898
std::unordered_map<int64_t, std::pair<std::string, int32_t>> get_all_balanced_tablets() const;
9999

100100
private:
101+
void schedule_remove_balanced_tablet(int64_t tablet_id);
102+
static void clean_up_expired_mappings(void* arg);
101103
void handle_jobs();
102104

103105
Status _do_warm_up_rowset(RowsetMeta& rs_meta, std::vector<TReplicaInfo>& replicas,

be/src/io/cache/block_file_cache_downloader.cpp

Lines changed: 1 addition & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -173,14 +173,6 @@ std::unordered_map<std::string, RowsetMetaSharedPtr> snapshot_rs_metas(BaseTable
173173
return id_to_rowset_meta_map;
174174
}
175175

176-
static void clean_up_expired_mappings(void* arg) {
177-
// Reclaim ownership with unique_ptr for automatic memory management
178-
std::unique_ptr<int64_t> tablet_id(static_cast<int64_t*>(arg));
179-
auto& manager = ExecEnv::GetInstance()->storage_engine().to_cloud().cloud_warm_up_manager();
180-
manager.remove_balanced_tablet(*tablet_id);
181-
VLOG_DEBUG << "Removed expired balanced warm up cache tablet: tablet_id=" << *tablet_id;
182-
}
183-
184176
void FileCacheBlockDownloader::download_file_cache_block(
185177
const DownloadTask::FileCacheBlockMetaVec& metas) {
186178
std::unordered_set<int64_t> synced_tablets;
@@ -236,27 +228,8 @@ void FileCacheBlockDownloader::download_file_cache_block(
236228
<< "]";
237229
}
238230
}
239-
// Use std::make_unique to avoid raw pointer allocation
240-
auto tablet_id_ptr = std::make_unique<int64_t>(tablet_id);
241-
unsigned long expired_ms = g_tablet_report_inactive_duration_ms;
242-
if (doris::config::cache_read_from_peer_expired_seconds > 0 &&
243-
doris::config::cache_read_from_peer_expired_seconds <=
244-
g_tablet_report_inactive_duration_ms / 1000) {
245-
expired_ms = doris::config::cache_read_from_peer_expired_seconds * 1000;
246-
}
247-
bthread_timer_t timer_id;
248-
// ATTN: The timer callback will reclaim ownership of the tablet_id_ptr, so we need to release it after the timer is added.
249-
if (const int rc =
250-
bthread_timer_add(&timer_id, butil::milliseconds_from_now(expired_ms),
251-
clean_up_expired_mappings, tablet_id_ptr.get());
252-
rc == 0) {
253-
tablet_id_ptr.release();
254-
} else {
255-
LOG(WARNING) << "Fail to add timer for clean up expired mappings for tablet_id="
256-
<< tablet_id << " rc=" << rc;
257-
}
258231
LOG(INFO) << "download_file_cache_block: download_done, tablet_Id=" << tablet_id
259-
<< " status=" << st.to_string() << " expired_ms=" << expired_ms;
232+
<< " status=" << st.to_string();
260233
};
261234

262235
std::string path;

regression-test/suites/cloud_p0/balance/test_balance_use_compute_group_properties.groovy

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,8 @@ suite('test_balance_use_compute_group_properties', 'docker') {
3636
'report_tablet_interval_seconds=1',
3737
'schedule_sync_tablets_interval_s=18000',
3838
'disable_auto_compaction=true',
39-
'sys_log_verbose_modules=*'
39+
'sys_log_verbose_modules=*',
40+
'enable_packed_file=false',
4041
]
4142
options.setFeNum(1)
4243
options.setBeNum(1)

regression-test/suites/cloud_p0/balance/test_balance_warm_up.groovy

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,8 @@ suite('test_balance_warm_up', 'docker') {
3737
'disable_auto_compaction=true',
3838
'sys_log_verbose_modules=*',
3939
'cache_read_from_peer_expired_seconds=100',
40-
'enable_cache_read_from_peer=true'
40+
'enable_cache_read_from_peer=true',
41+
'enable_packed_file=false',
4142
]
4243
options.setFeNum(1)
4344
options.setBeNum(1)

regression-test/suites/cloud_p0/balance/test_balance_warm_up_sync_global_config.groovy

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,8 @@ suite('test_balance_warm_up_sync_cache', 'docker') {
3636
'report_tablet_interval_seconds=1',
3737
'schedule_sync_tablets_interval_s=18000',
3838
'disable_auto_compaction=true',
39-
'sys_log_verbose_modules=*'
39+
'sys_log_verbose_modules=*',
40+
'enable_packed_file=false',
4041
]
4142
options.setFeNum(1)
4243
options.setBeNum(1)

regression-test/suites/cloud_p0/balance/test_balance_warm_up_task_abnormal.groovy

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,8 @@ suite('test_balance_warm_up_task_abnormal', 'docker') {
3636
'report_tablet_interval_seconds=1',
3737
'schedule_sync_tablets_interval_s=18000',
3838
'disable_auto_compaction=true',
39-
'sys_log_verbose_modules=*'
39+
'sys_log_verbose_modules=*',
40+
'enable_packed_file=false',
4041
]
4142
options.setFeNum(1)
4243
options.setBeNum(1)

regression-test/suites/cloud_p0/balance/test_balance_warm_up_use_peer_cache.groovy

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,8 @@ suite('test_balance_warm_up_use_peer_cache', 'docker') {
4040
'disable_auto_compaction=true',
4141
'sys_log_verbose_modules=*',
4242
'cache_read_from_peer_expired_seconds=100',
43-
'enable_cache_read_from_peer=true'
43+
'enable_cache_read_from_peer=true',
44+
'enable_packed_file=false',
4445
]
4546
options.setFeNum(1)
4647
options.setBeNum(1)

regression-test/suites/cloud_p0/balance/test_balance_warm_up_with_compaction_use_peer_cache.groovy

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,8 @@ suite('test_balance_warm_up_with_compaction_use_peer_cache', 'docker') {
4141
'sys_log_verbose_modules=*',
4242
'cumulative_compaction_min_deltas=5',
4343
'cache_read_from_peer_expired_seconds=100',
44-
'enable_cache_read_from_peer=true'
44+
'enable_cache_read_from_peer=true',
45+
'enable_packed_file=false',
4546
]
4647
options.setFeNum(1)
4748
options.setBeNum(1)

regression-test/suites/cloud_p0/balance/test_peer_read_async_warmup.groovy

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@ suite('test_peer_read_async_warmup', 'docker') {
3939
'disable_auto_compaction=true',
4040
'sys_log_verbose_modules=*',
4141
'enable_cache_read_from_peer=true',
42+
'enable_packed_file=false',
4243
]
4344
options.setFeNum(1)
4445
options.setBeNum(1)

0 commit comments

Comments
 (0)