Skip to content

Commit 73cf386

Browse files
authored
feat(query): Support flight_connection_max_retry_times and flight_connection_retry_interval settings (#16856)
* feat(query): Support `query_max_failures` setting
* fix
* cluster do_action retry
* fix
1 parent 2ede35d commit 73cf386

File tree

15 files changed

+128
-33
lines changed

15 files changed

+128
-33
lines changed

Cargo.lock

Lines changed: 1 addition & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

src/common/exception/Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ bincode = { workspace = true }
2121
geozero = { workspace = true }
2222
gimli = { workspace = true }
2323
http = { workspace = true }
24+
hyper = { workspace = true }
2425
libc = { workspace = true }
2526
object = { workspace = true }
2627
once_cell = { workspace = true }

src/common/exception/src/exception_into.rs

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -353,6 +353,13 @@ impl From<tonic::Status> for ErrorCode {
353353
tonic::Code::Unknown => {
354354
let details = status.details();
355355
if details.is_empty() {
356+
if status.source().map_or(false, |e| e.is::<hyper::Error>()) {
357+
return ErrorCode::CannotConnectNode(format!(
358+
"{}, source: {:?}",
359+
status.message(),
360+
status.source()
361+
));
362+
}
356363
return ErrorCode::UnknownException(format!(
357364
"{}, source: {:?}",
358365
status.message(),

src/query/service/src/clusters/cluster.rs

Lines changed: 50 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -50,12 +50,14 @@ use futures::future::Either;
5050
use futures::Future;
5151
use futures::StreamExt;
5252
use log::error;
53+
use log::info;
5354
use log::warn;
5455
use parking_lot::RwLock;
5556
use rand::thread_rng;
5657
use rand::Rng;
5758
use serde::Deserialize;
5859
use serde::Serialize;
60+
use tokio::time::sleep;
5961

6062
use crate::servers::flight::FlightClient;
6163

@@ -81,11 +83,11 @@ pub trait ClusterHelper {
8183

8284
fn get_nodes(&self) -> Vec<Arc<NodeInfo>>;
8385

84-
async fn do_action<T: Serialize + Send, Res: for<'de> Deserialize<'de> + Send>(
86+
async fn do_action<T: Serialize + Send + Clone, Res: for<'de> Deserialize<'de> + Send>(
8587
&self,
8688
path: &str,
8789
message: HashMap<String, T>,
88-
timeout: u64,
90+
flight_params: FlightParams,
8991
) -> Result<HashMap<String, Res>>;
9092
}
9193

@@ -118,11 +120,11 @@ impl ClusterHelper for Cluster {
118120
self.nodes.to_vec()
119121
}
120122

121-
async fn do_action<T: Serialize + Send, Res: for<'de> Deserialize<'de> + Send>(
123+
async fn do_action<T: Serialize + Send + Clone, Res: for<'de> Deserialize<'de> + Send>(
122124
&self,
123125
path: &str,
124126
message: HashMap<String, T>,
125-
timeout: u64,
127+
flight_params: FlightParams,
126128
) -> Result<HashMap<String, Res>> {
127129
fn get_node<'a>(nodes: &'a [Arc<NodeInfo>], id: &str) -> Result<&'a Arc<NodeInfo>> {
128130
for node in nodes {
@@ -137,23 +139,47 @@ impl ClusterHelper for Cluster {
137139
)))
138140
}
139141

140-
let mut response = HashMap::with_capacity(message.len());
142+
let mut futures = Vec::with_capacity(message.len());
141143
for (id, message) in message {
142144
let node = get_node(&self.nodes, &id)?;
143145

144-
let config = GlobalConfig::instance();
145-
let flight_address = node.flight_address.clone();
146-
let node_secret = node.secret.clone();
147-
148-
let mut conn = create_client(&config, &flight_address).await?;
149-
response.insert(
150-
id,
151-
conn.do_action::<_, Res>(path, node_secret, message, timeout)
152-
.await?,
153-
);
146+
futures.push({
147+
let config = GlobalConfig::instance();
148+
let flight_address = node.flight_address.clone();
149+
let node_secret = node.secret.clone();
150+
151+
async move {
152+
let mut attempt = 0;
153+
154+
loop {
155+
let mut conn = create_client(&config, &flight_address).await?;
156+
match conn
157+
.do_action::<_, Res>(
158+
path,
159+
node_secret.clone(),
160+
message.clone(),
161+
flight_params.timeout,
162+
)
163+
.await
164+
{
165+
Ok(result) => return Ok((id, result)),
166+
Err(e)
167+
if e.code() == ErrorCode::CANNOT_CONNECT_NODE
168+
&& attempt < flight_params.retry_times =>
169+
{
170+
// only retry when error is network problem
171+
info!("retry do_action, attempt: {}", attempt);
172+
attempt += 1;
173+
sleep(Duration::from_secs(flight_params.retry_interval)).await;
174+
}
175+
Err(e) => return Err(e),
176+
}
177+
}
178+
}
179+
});
154180
}
155-
156-
Ok(response)
181+
let responses: Vec<(String, Res)> = futures::future::try_join_all(futures).await?;
182+
Ok(responses.into_iter().collect::<HashMap<String, Res>>())
157183
}
158184
}
159185

@@ -537,3 +563,10 @@ pub async fn create_client(config: &InnerConfig, address: &str) -> Result<Flight
537563
ConnectionFactory::create_rpc_channel(address.to_owned(), timeout, rpc_tls_config).await?,
538564
)))
539565
}
566+
567+
#[derive(Clone, Copy, Debug)]
568+
pub struct FlightParams {
569+
pub(crate) timeout: u64,
570+
pub(crate) retry_times: u64,
571+
pub(crate) retry_interval: u64,
572+
}

src/query/service/src/clusters/mod.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,3 +17,4 @@ mod cluster;
1717
pub use cluster::Cluster;
1818
pub use cluster::ClusterDiscovery;
1919
pub use cluster::ClusterHelper;
20+
pub use cluster::FlightParams;

src/query/service/src/interpreters/interpreter_kill.rs

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ use databend_common_exception::Result;
2121
use databend_common_sql::plans::KillPlan;
2222

2323
use crate::clusters::ClusterHelper;
24+
use crate::clusters::FlightParams;
2425
use crate::interpreters::Interpreter;
2526
use crate::pipelines::PipelineBuildResult;
2627
use crate::servers::flight::v1::actions::KILL_QUERY;
@@ -54,7 +55,11 @@ impl KillInterpreter {
5455
async fn kill_cluster_query(&self) -> Result<PipelineBuildResult> {
5556
let cluster = self.ctx.get_cluster();
5657
let settings = self.ctx.get_settings();
57-
let timeout = settings.get_flight_client_timeout()?;
58+
let flight_params = FlightParams {
59+
timeout: settings.get_flight_client_timeout()?,
60+
retry_times: settings.get_flight_max_retry_times()?,
61+
retry_interval: settings.get_flight_retry_interval()?,
62+
};
5863

5964
let mut message = HashMap::with_capacity(cluster.nodes.len());
6065

@@ -65,7 +70,7 @@ impl KillInterpreter {
6570
}
6671

6772
let res = cluster
68-
.do_action::<_, bool>(KILL_QUERY, message, timeout)
73+
.do_action::<_, bool>(KILL_QUERY, message, flight_params)
6974
.await?;
7075

7176
match res.values().any(|x| *x) {

src/query/service/src/interpreters/interpreter_set_priority.rs

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ use databend_common_exception::Result;
2121
use databend_common_sql::plans::SetPriorityPlan;
2222

2323
use crate::clusters::ClusterHelper;
24+
use crate::clusters::FlightParams;
2425
use crate::interpreters::Interpreter;
2526
use crate::pipelines::PipelineBuildResult;
2627
use crate::servers::flight::v1::actions::SET_PRIORITY;
@@ -61,9 +62,13 @@ impl SetPriorityInterpreter {
6162
}
6263

6364
let settings = self.ctx.get_settings();
64-
let timeout = settings.get_flight_client_timeout()?;
65+
let flight_params = FlightParams {
66+
timeout: settings.get_flight_client_timeout()?,
67+
retry_times: settings.get_flight_max_retry_times()?,
68+
retry_interval: settings.get_flight_retry_interval()?,
69+
};
6570
let res = cluster
66-
.do_action::<_, bool>(SET_PRIORITY, message, timeout)
71+
.do_action::<_, bool>(SET_PRIORITY, message, flight_params)
6772
.await?;
6873

6974
match res.values().any(|x| *x) {

src/query/service/src/interpreters/interpreter_system_action.rs

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ use databend_common_sql::plans::SystemAction;
2222
use databend_common_sql::plans::SystemPlan;
2323

2424
use crate::clusters::ClusterHelper;
25+
use crate::clusters::FlightParams;
2526
use crate::interpreters::Interpreter;
2627
use crate::pipelines::PipelineBuildResult;
2728
use crate::servers::flight::v1::actions::SYSTEM_ACTION;
@@ -74,9 +75,13 @@ impl Interpreter for SystemActionInterpreter {
7475
}
7576

7677
let settings = self.ctx.get_settings();
77-
let timeout = settings.get_flight_client_timeout()?;
78+
let flight_params = FlightParams {
79+
timeout: settings.get_flight_client_timeout()?,
80+
retry_times: settings.get_flight_max_retry_times()?,
81+
retry_interval: settings.get_flight_retry_interval()?,
82+
};
7883
cluster
79-
.do_action::<_, ()>(SYSTEM_ACTION, message, timeout)
84+
.do_action::<_, ()>(SYSTEM_ACTION, message, flight_params)
8085
.await?;
8186
}
8287

src/query/service/src/interpreters/interpreter_table_truncate.rs

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ use databend_common_exception::Result;
2121
use databend_common_sql::plans::TruncateTablePlan;
2222

2323
use crate::clusters::ClusterHelper;
24+
use crate::clusters::FlightParams;
2425
use crate::interpreters::Interpreter;
2526
use crate::pipelines::PipelineBuildResult;
2627
use crate::servers::flight::v1::actions::TRUNCATE_TABLE;
@@ -95,9 +96,13 @@ impl Interpreter for TruncateTableInterpreter {
9596
}
9697

9798
let settings = self.ctx.get_settings();
98-
let timeout = settings.get_flight_client_timeout()?;
99+
let flight_params = FlightParams {
100+
timeout: settings.get_flight_client_timeout()?,
101+
retry_times: settings.get_flight_max_retry_times()?,
102+
retry_interval: settings.get_flight_retry_interval()?,
103+
};
99104
cluster
100-
.do_action::<_, ()>(TRUNCATE_TABLE, message, timeout)
105+
.do_action::<_, ()>(TRUNCATE_TABLE, message, flight_params)
101106
.await?;
102107
}
103108

src/query/service/src/servers/admin/v1/query_profiling.rs

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ use poem::IntoResponse;
3030

3131
use crate::clusters::ClusterDiscovery;
3232
use crate::clusters::ClusterHelper;
33+
use crate::clusters::FlightParams;
3334
use crate::servers::flight::v1::actions::GET_PROFILE;
3435
use crate::sessions::SessionManager;
3536

@@ -104,8 +105,13 @@ async fn get_cluster_profile(query_id: &str) -> Result<Vec<PlanProfile>, ErrorCo
104105
}
105106
}
106107

108+
let flight_params = FlightParams {
109+
timeout: 60,
110+
retry_times: 3,
111+
retry_interval: 3,
112+
};
107113
let res = cluster
108-
.do_action::<_, Option<Vec<PlanProfile>>>(GET_PROFILE, message, 60)
114+
.do_action::<_, Option<Vec<PlanProfile>>>(GET_PROFILE, message, flight_params)
109115
.await?;
110116

111117
match res.into_values().find(Option::is_some) {

0 commit comments

Comments
 (0)