@@ -38,6 +38,10 @@ namespace doris {
3838
3939bvar::Adder<uint64_t > g_file_cache_warm_up_cache_async_submitted_segment_num (
4040 " file_cache_warm_up_cache_async_submitted_segment_num" );
41+ bvar::Adder<uint64_t > g_file_cache_warm_up_cache_async_submitted_task_num (
42+ " file_cache_warm_up_cache_async_submitted_task_num" );
43+ bvar::Adder<uint64_t > g_file_cache_warm_up_cache_async_submitted_tablet_num (
44+ " file_cache_warm_up_cache_async_submitted_tablet_num" );
4145
4246CloudBackendService::CloudBackendService (CloudStorageEngine& engine, ExecEnv* exec_env)
4347 : BaseBackendService(exec_env), _engine(engine) {}
@@ -169,79 +173,89 @@ void CloudBackendService::warm_up_tablets(TWarmUpTabletsResponse& response,
169173
170174void CloudBackendService::warm_up_cache_async (TWarmUpCacheAsyncResponse& response,
171175 const TWarmUpCacheAsyncRequest& request) {
172- std::ostringstream oss;
173- oss << " [" ;
174- for (size_t i = 0 ; i < request.tablet_ids .size () && i < 10 ; ++i) {
175- if (i > 0 ) oss << " ," ;
176- oss << request.tablet_ids [i];
177- }
178- oss << " ]" ;
179- LOG (INFO) << " warm_up_cache_async: enter, request=" << request.host << " :" << request.brpc_port
180- << " , tablets num=" << request.tablet_ids .size () << " , tablet_ids=" << oss.str ();
176+ // just submit the task to the thread pool, no need to wait for the result
177+ auto do_warm_up = [this , request]() {
178+ std::ostringstream oss;
179+ oss << " [" ;
180+ for (size_t i = 0 ; i < request.tablet_ids .size () && i < 10 ; ++i) {
181+ if (i > 0 ) oss << " ," ;
182+ oss << request.tablet_ids [i];
183+ }
184+ oss << " ]" ;
185+ g_file_cache_warm_up_cache_async_submitted_tablet_num << request.tablet_ids .size ();
186+ LOG (INFO) << " warm_up_cache_async: enter, request=" << request.host << " :"
187+ << request.brpc_port << " , tablets num=" << request.tablet_ids .size ()
188+ << " , tablet_ids=" << oss.str ();
181189
182- auto & manager = ExecEnv::GetInstance ()->storage_engine ().to_cloud ().cloud_warm_up_manager ();
183- // Record each tablet in manager
184- for (int64_t tablet_id : request.tablet_ids ) {
185- manager.record_balanced_tablet (tablet_id, request.host , request.brpc_port );
186- }
190+ auto & manager = ExecEnv::GetInstance ()->storage_engine ().to_cloud ().cloud_warm_up_manager ();
191+ // Record each tablet in manager
192+ for (int64_t tablet_id : request.tablet_ids ) {
193+ manager.record_balanced_tablet (tablet_id, request.host , request.brpc_port );
194+ }
187195
188- std::string host = request.host ;
189- auto dns_cache = ExecEnv::GetInstance ()->dns_cache ();
190- if (dns_cache == nullptr ) {
191- LOG (WARNING) << " DNS cache is not initialized, skipping hostname resolve" ;
192- } else if (!is_valid_ip (request.host )) {
193- Status status = dns_cache->get (request.host , &host);
194- if (!status.ok ()) {
195- LOG (WARNING) << " failed to get ip from host " << request.host << " : "
196- << status.to_string ();
197- // Remove failed tablets from tracking
198- manager.remove_balanced_tablets (request.tablet_ids );
196+ std::string host = request.host ;
197+ auto dns_cache = ExecEnv::GetInstance ()->dns_cache ();
198+ if (dns_cache == nullptr ) {
199+ LOG (WARNING) << " DNS cache is not initialized, skipping hostname resolve" ;
200+ } else if (!is_valid_ip (request.host )) {
201+ Status status = dns_cache->get (request.host , &host);
202+ if (!status.ok ()) {
203+ LOG (WARNING) << " failed to get ip from host " << request.host << " : "
204+ << status.to_string ();
205+ return ;
206+ }
207+ }
208+ std::string brpc_addr = get_host_port (host, request.brpc_port );
209+ std::shared_ptr<PBackendService_Stub> brpc_stub =
210+ _exec_env->brpc_internal_client_cache ()->get_new_client_no_cache (brpc_addr);
211+ if (!brpc_stub) {
212+ LOG (WARNING) << " warm_up_cache_async: failed to get brpc_stub for addr " << brpc_addr;
199213 return ;
200214 }
201- }
202- std::string brpc_addr = get_host_port (host, request.brpc_port );
203- Status st = Status::OK ();
204- TStatus t_status;
205- std::shared_ptr<PBackendService_Stub> brpc_stub =
206- _exec_env->brpc_internal_client_cache ()->get_new_client_no_cache (brpc_addr);
207- if (!brpc_stub) {
208- st = Status::RpcError (" Address {} is wrong" , brpc_addr);
209- LOG (WARNING) << " warm_up_cache_async: failed to get brpc_stub for addr " << brpc_addr;
210- // Remove failed tablets from tracking
211- manager.remove_balanced_tablets (request.tablet_ids );
212- return ;
213- }
214- brpc::Controller cntl;
215- PGetFileCacheMetaRequest brpc_request;
216- std::for_each (request.tablet_ids .cbegin (), request.tablet_ids .cend (),
217- [&](int64_t tablet_id) { brpc_request.add_tablet_ids (tablet_id); });
218- PGetFileCacheMetaResponse brpc_response;
215+ PGetFileCacheMetaRequest brpc_request;
216+ std::for_each (request.tablet_ids .cbegin (), request.tablet_ids .cend (),
217+ [&](int64_t tablet_id) { brpc_request.add_tablet_ids (tablet_id); });
219218
220- brpc_stub->get_file_cache_meta_by_tablet_id (&cntl, &brpc_request, &brpc_response, nullptr );
221- VLOG_DEBUG << " warm_up_cache_async: request=" << brpc_request.DebugString ()
222- << " , response=" << brpc_response.DebugString ();
223- if (!cntl.Failed ()) {
224- g_file_cache_warm_up_cache_async_submitted_segment_num
225- << brpc_response.file_cache_block_metas ().size ();
226- auto & file_cache_block_metas = *brpc_response.mutable_file_cache_block_metas ();
227- if (!file_cache_block_metas.empty ()) {
219+ auto run_rpc = [this , brpc_stub,
220+ brpc_addr](PGetFileCacheMetaRequest request_copy) -> Status {
221+ brpc::Controller cntl;
222+ cntl.set_timeout_ms (20 * 1000 ); // 20s
223+ PGetFileCacheMetaResponse brpc_response;
224+ brpc_stub->get_file_cache_meta_by_tablet_id (&cntl, &request_copy, &brpc_response,
225+ nullptr );
226+ if (cntl.Failed ()) {
227+ LOG (WARNING) << " warm_up_cache_async: brpc call failed, addr=" << brpc_addr
228+ << " , error=" << cntl.ErrorText ()
229+ << " , error code=" << cntl.ErrorCode ();
230+ return Status::RpcError (" {} isn't connected, error code={}" , brpc_addr,
231+ cntl.ErrorCode ());
232+ }
233+ VLOG_DEBUG << " warm_up_cache_async: request=" << request_copy.DebugString ()
234+ << " , response=" << brpc_response.DebugString ();
235+ g_file_cache_warm_up_cache_async_submitted_segment_num
236+ << brpc_response.file_cache_block_metas ().size ();
228237 _engine.file_cache_block_downloader ().submit_download_task (
229- std::move (file_cache_block_metas));
230- LOG (INFO) << " warm_up_cache_async: successfully submitted download task for tablets="
231- << oss.str ();
232- } else {
233- LOG (INFO) << " warm_up_cache_async: no file cache block meta found, addr=" << brpc_addr;
234- manager.remove_balanced_tablets (request.tablet_ids );
238+ std::move (*brpc_response.mutable_file_cache_block_metas ()));
239+ return Status::OK ();
240+ };
241+
242+ Status rpc_status = run_rpc (std::move (brpc_request));
243+ if (!rpc_status.ok ()) {
244+ LOG (WARNING) << " warm_up_cache_async: rpc failed for addr=" << brpc_addr
245+ << " , status=" << rpc_status;
235246 }
236- } else {
237- st = Status::RpcError (" {} isn't connected" , brpc_addr);
238- // Remove failed tablets from tracking
239- manager.remove_balanced_tablets (request.tablet_ids );
240- LOG (WARNING) << " warm_up_cache_async: brpc call failed, addr=" << brpc_addr
241- << " , error=" << cntl.ErrorText ();
247+ };
248+ g_file_cache_warm_up_cache_async_submitted_task_num << 1 ;
249+ Status submit_st = _engine.warmup_cache_async_thread_pool ().submit_func (std::move (do_warm_up));
250+ if (!submit_st.ok ()) {
251+ LOG (WARNING) << " warm_up_cache_async: fail to submit heavy task to "
252+ " warmup_cache_async_thread_pool, status="
253+ << submit_st.to_string () << " , execute synchronously" ;
254+ do_warm_up ();
242255 }
243- st.to_thrift (&t_status);
244- response.status = t_status;
256+ TStatus t_status;
257+ submit_st.to_thrift (&t_status);
258+ response.status = std::move (t_status);
245259}
246260
247261void CloudBackendService::check_warm_up_cache_async (TCheckWarmUpCacheAsyncResponse& response,
0 commit comments