Skip to content

Commit 22dab09

Browse files
committed
[Introspection] Support for partial results for internal queries
When distributed fan-out queries (e.g. `select * from loglet_workers`) encounter unreachable nodes, return results from the available nodes instead of failing the entire query. Per-node errors are surfaced as structured warnings alongside the partial result set.

Implementation:
- ErrorCatchingStream wraps each per-node stream in NodeFanOutExecutionPlan, catching errors into a shared NodeWarnings (Arc<parking_lot::Mutex<Vec>>) and terminating the individual stream gracefully.
- QueryContext::execute() returns a QueryResult that bundles the record batch stream with collected warning handles, walking the physical plan tree to extract them from any NodeFanOutExecutionPlan nodes.
- QueryWarningStream in the gRPC handler buffers the last response and attaches accumulated warnings to it before yielding, piggybacking on existing data rather than sending a trailing empty message.
- Proto: added QueryWarning message and repeated warnings field to QueryResponse.
- CLI: restatectl collects warnings and displays them on stderr after results.
1 parent dee5824 commit 22dab09

File tree

13 files changed

+356
-93
lines changed

13 files changed

+356
-93
lines changed

crates/admin/src/cluster_controller/grpc_svc_handler.rs

Lines changed: 100 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -8,12 +8,15 @@
88
// the Business Source License, use of this software will be governed
99
// by the Apache License, Version 2.0.
1010

11+
use std::pin::Pin;
12+
use std::task::{Context, Poll};
13+
1114
use bytes::{Bytes, BytesMut};
1215
use bytestring::ByteString;
1316
use datafusion::arrow::ipc::writer::StreamWriter;
1417
use datafusion::error::DataFusionError;
15-
use futures::StreamExt;
1618
use futures::stream::BoxStream;
19+
use futures::{Stream, StreamExt};
1720
use tonic::codec::CompressionEncoding;
1821
use tonic::{Request, Response, Status, async_trait};
1922
use tracing::info;
@@ -25,14 +28,15 @@ use restate_core::protobuf::cluster_ctrl_svc::{
2528
CreatePartitionSnapshotResponse, DescribeLogRequest, DescribeLogResponse, FindTailRequest,
2629
FindTailResponse, GetClusterConfigurationRequest, GetClusterConfigurationResponse,
2730
ListLogsRequest, ListLogsResponse, MigrateMetadataRequest, MigrateMetadataResponse,
28-
QueryRequest, QueryResponse, SealAndExtendChainRequest, SealAndExtendChainResponse,
29-
SealChainRequest, SealChainResponse, SealedSegment, SetClusterConfigurationRequest,
30-
SetClusterConfigurationResponse, TailState, TrimLogRequest,
31+
QueryRequest, QueryResponse, QueryWarning, SealAndExtendChainRequest,
32+
SealAndExtendChainResponse, SealChainRequest, SealChainResponse, SealedSegment,
33+
SetClusterConfigurationRequest, SetClusterConfigurationResponse, TailState, TrimLogRequest,
3134
cluster_ctrl_svc_server::{ClusterCtrlSvc, ClusterCtrlSvcServer},
3235
};
3336
use restate_core::{Metadata, MetadataWriter};
3437
use restate_metadata_store::WriteError;
3538
use restate_storage_query_datafusion::context::QueryContext;
39+
use restate_storage_query_datafusion::node_fan_out::NodeWarnings;
3640
use restate_types::config::{MetadataClientKind, MetadataClientOptions, NetworkingOptions};
3741
use restate_types::identifiers::PartitionId;
3842
use restate_types::logs::metadata::{Logs, SegmentIndex};
@@ -419,21 +423,37 @@ impl ClusterCtrlSvc for ClusterCtrlSvcHandler {
419423
request: Request<QueryRequest>,
420424
) -> std::result::Result<Response<Self::QueryStream>, tonic::Status> {
421425
let request = request.into_inner();
422-
let stream = self
426+
let query_result = self
423427
.query_context
424428
.execute(&request.query)
425429
.await
426430
.map_err(datafusion_error_to_status)?;
427431

428-
Ok(Response::new(
429-
WriteRecordBatchStream::<StreamWriter<Vec<u8>>>::new(stream, request.query)
430-
.map_err(datafusion_error_to_status)?
431-
.map(|item| {
432-
item.map(|encoded| QueryResponse { encoded })
433-
.map_err(datafusion_error_to_status)
434-
})
435-
.boxed(),
436-
))
432+
let node_warnings = query_result.node_warnings;
433+
434+
let data_stream = WriteRecordBatchStream::<StreamWriter<Vec<u8>>>::new(
435+
query_result.stream,
436+
request.query,
437+
)
438+
.map_err(datafusion_error_to_status)?
439+
.map(|item| {
440+
item.map(|encoded| QueryResponse {
441+
encoded,
442+
..Default::default()
443+
})
444+
.map_err(datafusion_error_to_status)
445+
});
446+
447+
// Wrap the data stream to attach per-node warnings to the final
448+
// response message, avoiding an extra trailing empty-data message.
449+
let stream = QueryWarningStream {
450+
inner: data_stream.boxed(),
451+
node_warnings,
452+
last_response: None,
453+
done: false,
454+
};
455+
456+
Ok(Response::new(stream.boxed()))
437457
}
438458

439459
/// Migrate metadata from the current metadata store to a target store
@@ -539,6 +559,72 @@ fn serialize_value<T: StorageEncode>(value: &T) -> Bytes {
539559
buf.freeze()
540560
}
541561

562+
/// Stream wrapper that buffers the last response from the inner stream, and
563+
/// when the inner stream ends, attaches any accumulated per-node warnings
564+
/// to that final response before yielding it.
565+
struct QueryWarningStream {
566+
inner: BoxStream<'static, Result<QueryResponse, Status>>,
567+
node_warnings: Vec<NodeWarnings>,
568+
last_response: Option<Result<QueryResponse, Status>>,
569+
done: bool,
570+
}
571+
572+
impl Stream for QueryWarningStream {
573+
type Item = Result<QueryResponse, Status>;
574+
575+
fn poll_next(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Option<Self::Item>> {
576+
if self.done {
577+
return Poll::Ready(None);
578+
}
579+
580+
loop {
581+
match self.inner.poll_next_unpin(cx) {
582+
Poll::Pending => return Poll::Pending,
583+
Poll::Ready(None) => {
584+
self.done = true;
585+
// Inner stream ended. Yield the buffered last response
586+
// with warnings attached, or a warnings-only response.
587+
let warnings = drain_node_warnings(&self.node_warnings);
588+
return match self.last_response.take() {
589+
Some(Ok(mut resp)) => {
590+
resp.warnings = warnings;
591+
Poll::Ready(Some(Ok(resp)))
592+
}
593+
Some(Err(err)) => Poll::Ready(Some(Err(err))),
594+
None if !warnings.is_empty() => {
595+
// No data at all, but we have warnings
596+
Poll::Ready(Some(Ok(QueryResponse {
597+
encoded: Default::default(),
598+
warnings,
599+
})))
600+
}
601+
None => Poll::Ready(None),
602+
};
603+
}
604+
Poll::Ready(Some(item)) => {
605+
// Yield the previously buffered response, buffer this one
606+
if let Some(prev) = self.last_response.replace(item) {
607+
return Poll::Ready(Some(prev));
608+
}
609+
// First item — buffer it and poll for the next
610+
continue;
611+
}
612+
}
613+
}
614+
}
615+
}
616+
617+
fn drain_node_warnings(node_warnings: &[NodeWarnings]) -> Vec<QueryWarning> {
618+
let mut out = Vec::new();
619+
for nw in node_warnings {
620+
out.extend(nw.lock().drain(..).map(|w| QueryWarning {
621+
node_id: w.node_id,
622+
message: w.message,
623+
}));
624+
}
625+
out
626+
}
627+
542628
fn datafusion_error_to_status(err: DataFusionError) -> Status {
543629
match err {
544630
DataFusionError::SQL(..)

crates/admin/src/rest_api/query.rs

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -98,18 +98,18 @@ where
9898
return Err(QueryError::Unavailable);
9999
};
100100

101-
let record_batch_stream = query_context.execute(&payload.query).await?;
101+
let query_result = query_context.execute(&payload.query).await?;
102102

103103
let (result_stream, content_type) = match headers.get(http::header::ACCEPT) {
104104
Some(v) if v == HeaderValue::from_static("application/json") => (
105-
WriteRecordBatchStream::<JsonWriter>::new(record_batch_stream, payload.query)?
105+
WriteRecordBatchStream::<JsonWriter>::new(query_result.stream, payload.query)?
106106
.map_ok(Frame::data)
107107
.left_stream(),
108108
"application/json",
109109
),
110110
_ => (
111111
WriteRecordBatchStream::<StreamWriter<Vec<u8>>>::new(
112-
record_batch_stream,
112+
query_result.stream,
113113
payload.query,
114114
)?
115115
.map_ok(Frame::data)

crates/admin/src/storage_accounting.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -87,6 +87,7 @@ impl StorageAccountingTask {
8787
.query_context
8888
.execute(STORAGE_QUERY)
8989
.await?
90+
.stream
9091
.collect::<Vec<_>>()
9192
.await
9293
.into_iter()

crates/core/protobuf/cluster_ctrl_svc.proto

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -184,6 +184,17 @@ message QueryRequest {
184184
message QueryResponse {
185185
// arrow encoded record batch
186186
bytes encoded = 1;
187+
// Per-node errors encountered during distributed query execution.
188+
// Populated on the final response message when some nodes failed but
189+
// partial results are still available.
190+
repeated QueryWarning warnings = 2;
191+
}
192+
193+
message QueryWarning {
194+
// Node identifier (e.g. "N1", "N2") where the error originated
195+
string node_id = 1;
196+
// Human-readable error message
197+
string message = 2;
187198
}
188199

189200
message MigrateMetadataRequest {

crates/storage-query-datafusion/src/context.rs

Lines changed: 41 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ use datafusion::execution::SessionStateBuilder;
2222
use datafusion::execution::TaskContext;
2323
use datafusion::execution::context::SQLOptions;
2424
use datafusion::execution::runtime_env::RuntimeEnvBuilder;
25-
use datafusion::physical_plan::SendableRecordBatchStream;
25+
use datafusion::physical_plan::{ExecutionPlan, SendableRecordBatchStream, execute_stream};
2626
use datafusion::prelude::{SessionConfig, SessionContext};
2727
use datafusion::sql::TableReference;
2828

@@ -40,6 +40,7 @@ use restate_types::partitions::state::PartitionReplicaSetStates;
4040
use restate_types::schema::deployment::DeploymentResolver;
4141
use restate_types::schema::service::ServiceMetadataResolver;
4242

43+
use crate::node_fan_out::NodeWarnings;
4344
use crate::remote_query_scanner_manager::RemoteScannerManager;
4445

4546
const SYS_INVOCATION_VIEW: &str = "CREATE VIEW sys_invocation as SELECT
@@ -424,16 +425,25 @@ impl QueryContext {
424425
})
425426
}
426427

427-
pub async fn execute(
428-
&self,
429-
sql: &str,
430-
) -> datafusion::common::Result<SendableRecordBatchStream> {
428+
pub async fn execute(&self, sql: &str) -> datafusion::common::Result<QueryResult> {
431429
let state = self.datafusion_context.state();
432430
let statement = state.sql_to_statement(sql, &datafusion::config::Dialect::PostgreSQL)?;
433431
let plan = state.statement_to_plan(statement).await?;
434432
self.sql_options.verify_plan(&plan)?;
435433
let df = self.datafusion_context.execute_logical_plan(plan).await?;
436-
df.execute_stream().await
434+
435+
let task_ctx = Arc::new(df.task_ctx());
436+
let physical_plan = df.create_physical_plan().await?;
437+
438+
// Collect NodeWarnings handles from any NodeFanOutExecutionPlan nodes
439+
// in the plan tree before execution begins.
440+
let node_warnings = collect_node_warnings(&physical_plan);
441+
442+
let stream = execute_stream(physical_plan, task_ctx)?;
443+
Ok(QueryResult {
444+
stream,
445+
node_warnings,
446+
})
437447
}
438448

439449
pub fn task_ctx(&self) -> Arc<TaskContext> {
@@ -447,6 +457,31 @@ impl AsRef<SessionContext> for QueryContext {
447457
}
448458
}
449459

460+
/// Result of a SQL query execution, containing the record batch stream
461+
/// and any per-node warning collectors from fan-out execution plans.
462+
pub struct QueryResult {
463+
pub stream: SendableRecordBatchStream,
464+
pub node_warnings: Vec<NodeWarnings>,
465+
}
466+
467+
/// Walks the physical plan tree and collects [`NodeWarnings`] handles from
468+
/// any [`NodeFanOutExecutionPlan`] nodes found.
469+
fn collect_node_warnings(plan: &Arc<dyn ExecutionPlan>) -> Vec<NodeWarnings> {
470+
use crate::node_fan_out::NodeFanOutExecutionPlan;
471+
472+
let mut warnings = Vec::new();
473+
let mut stack = vec![Arc::clone(plan)];
474+
while let Some(node) = stack.pop() {
475+
if let Some(fan_out) = node.as_any().downcast_ref::<NodeFanOutExecutionPlan>() {
476+
warnings.push(fan_out.node_warnings().clone());
477+
}
478+
for child in node.children() {
479+
stack.push(Arc::clone(child));
480+
}
481+
}
482+
warnings
483+
}
484+
450485
/// Newtype to add debug implementation which is required for [`SelectPartitions`].
451486
#[derive(Clone, derive_more::Debug)]
452487
pub struct SelectPartitionsFromMetadata;

crates/storage-query-datafusion/src/idempotency/tests.rs

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,8 @@ async fn get_idempotency_key() {
6161
)
6262
.await
6363
.unwrap()
64-
.collect::<Vec<Result<RecordBatch, _>>>()
64+
.stream
65+
.collect::<Vec<datafusion::common::Result<RecordBatch>>>()
6566
.await
6667
.remove(0)
6768
.unwrap();

crates/storage-query-datafusion/src/inbox/tests.rs

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,8 @@ async fn get_inbox() {
4545
.execute("SELECT * FROM sys_inbox ORDER BY sequence_number")
4646
.await
4747
.unwrap()
48-
.collect::<Vec<Result<RecordBatch, _>>>()
48+
.stream
49+
.collect::<Vec<datafusion::common::Result<RecordBatch>>>()
4950
.await
5051
.remove(0)
5152
.unwrap();

crates/storage-query-datafusion/src/journal/tests.rs

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -86,7 +86,8 @@ async fn get_entries() {
8686
)
8787
.await
8888
.unwrap()
89-
.collect::<Vec<Result<RecordBatch, _>>>()
89+
.stream
90+
.collect::<Vec<datafusion::common::Result<RecordBatch>>>()
9091
.await
9192
.remove(0)
9293
.unwrap();
@@ -179,7 +180,8 @@ async fn select_count_star() {
179180
.execute("SELECT COUNT(*) AS count FROM sys_journal")
180181
.await
181182
.unwrap()
182-
.collect::<Vec<Result<RecordBatch, _>>>()
183+
.stream
184+
.collect::<Vec<datafusion::common::Result<RecordBatch>>>()
183185
.await
184186
.remove(0)
185187
.unwrap();

crates/storage-query-datafusion/src/lib.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ mod keyed_service_status;
2323
mod log;
2424
pub mod loglet_worker;
2525
mod node;
26-
mod node_fan_out;
26+
pub mod node_fan_out;
2727
mod partition;
2828
mod partition_replica_set;
2929
mod partition_state;

crates/storage-query-datafusion/src/mocks.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ use async_trait::async_trait;
1616
use datafusion::arrow::array::ArrayRef;
1717
use datafusion::arrow::record_batch::RecordBatch;
1818
use datafusion::common::DataFusionError;
19-
use datafusion::execution::SendableRecordBatchStream;
19+
2020
use googletest::matcher::{Matcher, MatcherResult};
2121
use serde_json::Value;
2222

@@ -202,7 +202,7 @@ impl MockQueryEngine {
202202
pub async fn execute(
203203
&self,
204204
sql: impl AsRef<str> + Send,
205-
) -> datafusion::common::Result<SendableRecordBatchStream> {
205+
) -> datafusion::common::Result<crate::context::QueryResult> {
206206
self.2.execute(sql.as_ref()).await
207207
}
208208
}

0 commit comments

Comments (0)