-
Notifications
You must be signed in to change notification settings - Fork 261
feat: make gRPC timeout configurations user-configurable #1337
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 2 commits
384c4ec
23cd355
9e2cf6f
c6b8d87
3d9f9f0
10ab817
c83bc84
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -39,6 +39,18 @@ pub const BALLISTA_SHUFFLE_READER_FORCE_REMOTE_READ: &str = | |
| pub const BALLISTA_SHUFFLE_READER_REMOTE_PREFER_FLIGHT: &str = | ||
| "ballista.shuffle.remote_read_prefer_flight"; | ||
|
|
||
| // gRPC client timeout configurations | ||
| pub const BALLISTA_GRPC_CLIENT_CONNECT_TIMEOUT_SECONDS: &str = | ||
| "ballista.grpc.client.connect_timeout_seconds"; | ||
| pub const BALLISTA_GRPC_CLIENT_TIMEOUT_SECONDS: &str = | ||
| "ballista.grpc.client.timeout_seconds"; | ||
| pub const BALLISTA_GRPC_CLIENT_TCP_KEEPALIVE_SECONDS: &str = | ||
| "ballista.grpc.client.tcp_keepalive_seconds"; | ||
| pub const BALLISTA_GRPC_CLIENT_HTTP2_KEEPALIVE_INTERVAL_SECONDS: &str = | ||
| "ballista.grpc.client.http2_keepalive_interval_seconds"; | ||
| pub const BALLISTA_STANDALONE_GRPC_CLIENT_KEEPALIVE_TIMEOUT_SECONDS: &str = | ||
|
||
| "ballista.standalone.grpc.client.keepalive_timeout_seconds"; | ||
|
|
||
| pub type ParseResult<T> = result::Result<T, String>; | ||
| use std::sync::LazyLock; | ||
|
|
||
|
|
@@ -48,8 +60,8 @@ static CONFIG_ENTRIES: LazyLock<HashMap<String, ConfigEntry>> = LazyLock::new(|| | |
| "Sets the job name that will appear in the web user interface for any submitted jobs".to_string(), | ||
| DataType::Utf8, None), | ||
| ConfigEntry::new(BALLISTA_STANDALONE_PARALLELISM.to_string(), | ||
| "Standalone processing parallelism ".to_string(), | ||
| DataType::UInt16, Some(std::thread::available_parallelism().map(|v| v.get()).unwrap_or(1).to_string())), | ||
| "Standalone processing parallelism ".to_string(), | ||
| DataType::UInt16, Some(std::thread::available_parallelism().map(|v| v.get()).unwrap_or(1).to_string())), | ||
| ConfigEntry::new(BALLISTA_GRPC_CLIENT_MAX_MESSAGE_SIZE.to_string(), | ||
| "Configuration for max message size in gRPC clients".to_string(), | ||
| DataType::UInt64, | ||
|
|
@@ -66,7 +78,26 @@ static CONFIG_ENTRIES: LazyLock<HashMap<String, ConfigEntry>> = LazyLock::new(|| | |
| "Forces the shuffle reader to use flight reader instead of block reader for remote read. Block reader usually has better performance and resource utilization".to_string(), | ||
| DataType::Boolean, | ||
| Some((false).to_string())), | ||
|
|
||
| ConfigEntry::new(BALLISTA_GRPC_CLIENT_CONNECT_TIMEOUT_SECONDS.to_string(), | ||
| "Connection timeout for gRPC client in seconds".to_string(), | ||
| DataType::UInt64, | ||
| Some((20).to_string())), | ||
| ConfigEntry::new(BALLISTA_GRPC_CLIENT_TIMEOUT_SECONDS.to_string(), | ||
| "Request timeout for gRPC client in seconds".to_string(), | ||
| DataType::UInt64, | ||
| Some((20).to_string())), | ||
| ConfigEntry::new(BALLISTA_GRPC_CLIENT_TCP_KEEPALIVE_SECONDS.to_string(), | ||
| "TCP keep-alive interval for gRPC client in seconds".to_string(), | ||
| DataType::UInt64, | ||
| Some((3600).to_string())), | ||
| ConfigEntry::new(BALLISTA_GRPC_CLIENT_HTTP2_KEEPALIVE_INTERVAL_SECONDS.to_string(), | ||
| "HTTP/2 keep-alive interval for gRPC client in seconds".to_string(), | ||
| DataType::UInt64, | ||
| Some((300).to_string())), | ||
| ConfigEntry::new(BALLISTA_STANDALONE_GRPC_CLIENT_KEEPALIVE_TIMEOUT_SECONDS.to_string(), | ||
| "Keep-alive timeout for gRPC client in seconds (standalone mode only)".to_string(), | ||
| DataType::UInt64, | ||
| Some((20).to_string())) | ||
| ]; | ||
| entries | ||
| .into_iter() | ||
|
|
@@ -188,6 +219,26 @@ impl BallistaConfig { | |
| self.get_usize_setting(BALLISTA_SHUFFLE_READER_MAX_REQUESTS) | ||
| } | ||
|
|
||
| pub fn default_grpc_client_connect_timeout_seconds(&self) -> usize { | ||
| self.get_usize_setting(BALLISTA_GRPC_CLIENT_CONNECT_TIMEOUT_SECONDS) | ||
| } | ||
|
|
||
| pub fn default_grpc_client_timeout_seconds(&self) -> usize { | ||
| self.get_usize_setting(BALLISTA_GRPC_CLIENT_TIMEOUT_SECONDS) | ||
| } | ||
|
|
||
| pub fn default_grpc_client_tcp_keepalive_seconds(&self) -> usize { | ||
| self.get_usize_setting(BALLISTA_GRPC_CLIENT_TCP_KEEPALIVE_SECONDS) | ||
| } | ||
|
|
||
| pub fn default_grpc_client_http2_keepalive_interval_seconds(&self) -> usize { | ||
| self.get_usize_setting(BALLISTA_GRPC_CLIENT_HTTP2_KEEPALIVE_INTERVAL_SECONDS) | ||
| } | ||
|
|
||
| pub fn default_standalone_grpc_client_keepalive_timeout_seconds(&self) -> usize { | ||
| self.get_usize_setting(BALLISTA_STANDALONE_GRPC_CLIENT_KEEPALIVE_TIMEOUT_SECONDS) | ||
| } | ||
|
|
||
| /// Forces the shuffle reader to always read partitions via the Arrow Flight client, | ||
| /// even when partitions are local to the node. | ||
| /// | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -15,6 +15,7 @@ | |
| // specific language governing permissions and limitations | ||
| // under the License. | ||
|
|
||
| use crate::config::BallistaConfig; | ||
| use crate::error::{BallistaError, Result}; | ||
| use crate::extension::SessionConfigExt; | ||
| use crate::serde::scheduler::PartitionStats; | ||
|
|
@@ -36,6 +37,64 @@ use std::{fs::File, pin::Pin}; | |
| use tonic::codegen::StdError; | ||
| use tonic::transport::{Channel, Error, Server}; | ||
|
|
||
| #[derive(Debug, Clone)] | ||
| pub struct GrpcClientConfig { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Can you please add some docs? We will try to catch up with documentation at some point, so it would help.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Absolutely! |
||
| pub connect_timeout_seconds: u64, | ||
| pub timeout_seconds: u64, | ||
| pub tcp_keepalive_seconds: u64, | ||
| pub http2_keepalive_interval_seconds: u64, | ||
| pub keepalive_timeout_seconds: u64, | ||
| } | ||
|
|
||
| impl GrpcClientConfig { | ||
| pub fn from_ballista_config(config: &BallistaConfig) -> Self { | ||
|
||
| Self { | ||
| connect_timeout_seconds: config.default_grpc_client_connect_timeout_seconds() | ||
| as u64, | ||
| timeout_seconds: config.default_grpc_client_timeout_seconds() as u64, | ||
| tcp_keepalive_seconds: config.default_grpc_client_tcp_keepalive_seconds() | ||
| as u64, | ||
| http2_keepalive_interval_seconds: config | ||
| .default_grpc_client_http2_keepalive_interval_seconds() | ||
| as u64, | ||
| keepalive_timeout_seconds: config | ||
| .default_standalone_grpc_client_keepalive_timeout_seconds() | ||
| as u64, | ||
| } | ||
| } | ||
| } | ||
|
|
||
| impl Default for GrpcClientConfig { | ||
| fn default() -> Self { | ||
| Self { | ||
| connect_timeout_seconds: 20, | ||
| timeout_seconds: 20, | ||
| tcp_keepalive_seconds: 3600, | ||
| http2_keepalive_interval_seconds: 300, | ||
| keepalive_timeout_seconds: 20, | ||
| } | ||
| } | ||
| } | ||
|
|
||
| #[derive(Debug, Clone)] | ||
| pub struct GrpcServerConfig { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Can you please add some docs? We will try to catch up with documentation at some point, so it would help.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Absolutely! |
||
| pub timeout_seconds: u64, | ||
| pub tcp_keepalive_seconds: u64, | ||
| pub http2_keepalive_interval_seconds: u64, | ||
| pub http2_keepalive_timeout_seconds: u64, | ||
| } | ||
|
|
||
| impl Default for GrpcServerConfig { | ||
| fn default() -> Self { | ||
| Self { | ||
| timeout_seconds: 20, | ||
| tcp_keepalive_seconds: 3600, | ||
| http2_keepalive_interval_seconds: 300, | ||
| http2_keepalive_timeout_seconds: 20, | ||
| } | ||
| } | ||
| } | ||
|
|
||
| /// Default session builder using the provided configuration | ||
| pub fn default_session_builder( | ||
| config: SessionConfig, | ||
|
|
@@ -106,31 +165,38 @@ pub async fn collect_stream( | |
|
|
||
| pub async fn create_grpc_client_connection<D>( | ||
| dst: D, | ||
| config: &GrpcClientConfig, | ||
| ) -> std::result::Result<Channel, Error> | ||
| where | ||
| D: std::convert::TryInto<tonic::transport::Endpoint>, | ||
| D::Error: Into<StdError>, | ||
| { | ||
| let endpoint = tonic::transport::Endpoint::new(dst)? | ||
| .connect_timeout(Duration::from_secs(20)) | ||
| .timeout(Duration::from_secs(20)) | ||
| .connect_timeout(Duration::from_secs(config.connect_timeout_seconds)) | ||
| .timeout(Duration::from_secs(config.timeout_seconds)) | ||
| // Disable Nagle's Algorithm since we don't want packets to wait | ||
| .tcp_nodelay(true) | ||
| .tcp_keepalive(Option::Some(Duration::from_secs(3600))) | ||
| .http2_keep_alive_interval(Duration::from_secs(300)) | ||
| .keep_alive_timeout(Duration::from_secs(20)) | ||
| .tcp_keepalive(Some(Duration::from_secs(config.tcp_keepalive_seconds))) | ||
| .http2_keep_alive_interval(Duration::from_secs( | ||
| config.http2_keepalive_interval_seconds, | ||
| )) | ||
| .keep_alive_timeout(Duration::from_secs(config.keepalive_timeout_seconds)) | ||
| .keep_alive_while_idle(true); | ||
| endpoint.connect().await | ||
| } | ||
|
|
||
| pub fn create_grpc_server() -> Server { | ||
| pub fn create_grpc_server(config: &GrpcServerConfig) -> Server { | ||
| Server::builder() | ||
| .timeout(Duration::from_secs(20)) | ||
| .timeout(Duration::from_secs(config.timeout_seconds)) | ||
| // Disable Nagle's Algorithm since we don't want packets to wait | ||
| .tcp_nodelay(true) | ||
| .tcp_keepalive(Option::Some(Duration::from_secs(3600))) | ||
| .http2_keepalive_interval(Option::Some(Duration::from_secs(300))) | ||
| .http2_keepalive_timeout(Option::Some(Duration::from_secs(20))) | ||
| .tcp_keepalive(Some(Duration::from_secs(config.tcp_keepalive_seconds))) | ||
| .http2_keepalive_interval(Some(Duration::from_secs( | ||
| config.http2_keepalive_interval_seconds, | ||
| ))) | ||
| .http2_keepalive_timeout(Some(Duration::from_secs( | ||
| config.http2_keepalive_timeout_seconds, | ||
| ))) | ||
| } | ||
|
|
||
| pub fn collect_plan_metrics(plan: &dyn ExecutionPlan) -> Vec<MetricsSet> { | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can we please update the query to return `ballista.job.name` only, so we don't have to change the test every time we add a new option?