Commit 0ba6de3

Introduce FetchChunks API (#11054)

# What

Supersedes:
* #10852

Introducing the new ``FetchChunks`` API, as per the approach we aligned on last week:

* the output of the ``Query`` call is a dataframe that we can now pass to the new ``FetchChunks`` call (this can be seen on the Cloud side and in the new integration tests)
* since we're dealing with dataframes, the definition of ``ChunkKey`` has moved to Cloud
1 parent c0abb36 commit 0ba6de3
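
For context, here is a rough client-side sketch of the resulting two-step flow. This is not part of the commit: the module paths and the tonic-generated client type (`RerunCloudServiceClient`) are assumptions inferred from the proto changes below; only `FetchChunksRequest`, `FetchChunksResponse`, and the `chunk_infos` field come from this commit.

```rust
// Hypothetical client-side sketch of the two-step query flow (not part of this
// commit). Module paths and the tonic-generated client type are assumptions.
use re_protos::cloud::v1alpha1::{
    rerun_cloud_service_client::RerunCloudServiceClient, FetchChunksRequest, FetchChunksResponse,
};
use re_protos::common::v1alpha1::DataframePart;

/// Step 2 of the query process: send the chunk-info dataframes obtained from a
/// prior `QueryDataset` call back to Rerun Cloud and collect the streamed chunks.
async fn fetch_chunks(
    client: &mut RerunCloudServiceClient<tonic::transport::Channel>,
    // Must carry the `chunk_id`, `partition_id`, `partition_layer` and `chunk_key` columns.
    chunk_infos: Vec<DataframePart>,
) -> Result<Vec<FetchChunksResponse>, tonic::Status> {
    let mut stream = client
        .fetch_chunks(FetchChunksRequest { chunk_infos })
        .await?
        .into_inner();

    // Each streamed response carries a *batch* of `ArrowMsg` chunks, amortizing
    // per-response HTTP/2 overhead.
    let mut batches = Vec::new();
    while let Some(response) = stream.message().await? {
        batches.push(response);
    }
    Ok(batches)
}
```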

File tree: 4 files changed, +170 −4 lines changed

crates/store/re_protos/proto/rerun/v1alpha1/cloud.proto

Lines changed: 27 additions & 1 deletion
```diff
@@ -74,7 +74,11 @@ service RerunCloudService {
   // Search a previously created index.
   rpc SearchDataset(SearchDatasetRequest) returns (stream SearchDatasetResponse) {}
 
-  // Perform Rerun-native queries on a dataset, returning the matching chunk IDs.
+  // Perform Rerun-native queries on a dataset, returning the matching chunk IDs, as well
+  // as information that can be sent back to Rerun Cloud to fetch the actual chunks as part
+  // of `FetchChunks` request. In this 2-step query process, 1st step is getting information
+  // from the server about the chunks that contain relevant information. 2nd step is fetching
+  // those chunks (the actual data).
   //
   // These Rerun-native queries include:
   // * Filtering by specific partition and chunk IDs.
@@ -96,6 +100,13 @@ service RerunCloudService {
   // To fetch only the actual chunk IDs rather than the chunks themselves, see `QueryDataset`.
   rpc GetChunks(GetChunksRequest) returns (stream GetChunksResponse) {}
 
+  // Fetch specific chunks from Rerun Cloud. In a 2-step query process, result of 1st phase,
+  // that is, the result of `QueryDataset` should include all the necessary information to send
+  // the actual chunk requests, which is the 2nd step of the query process.
+  //
+  // See `FetchChunksRequest` for details on the fields that describe each individual chunk.
+  rpc FetchChunks(FetchChunksRequest) returns (stream FetchChunksResponse) {}
+
   // --- Tables ---
   // TODO(jleibs): This will be replaced / extended by Arrow Flight
 
@@ -533,6 +544,21 @@ message GetChunksResponse {
   repeated rerun.log_msg.v1alpha1.ArrowMsg chunks = 1;
 }
 
+message FetchChunksRequest {
+  // Information about the chunks to fetch. These dataframes have to include the following columns:
+  // * `chunk_id` - Chunk unique identifier
+  // * `partition_id` - partition this chunk belongs to. Currently needed as we pass this metadata back and forth
+  // * `partition_layer` - specific partition layer. Currently needed as we pass this metadata back and forth
+  // * `chunk_key` - chunk location details
+  repeated rerun.common.v1alpha1.DataframePart chunk_infos = 1;
+}
+
+message FetchChunksResponse {
+  // Every gRPC response, even within the confines of a stream, involves HTTP2 overhead, which isn't
+  // cheap by any means, which is why we're returning a batch of `ArrowMsg` rather than a single one.
+  repeated rerun.log_msg.v1alpha1.ArrowMsg chunks = 1;
+}
+
 // --- Table Apis ---
 
 message GetTableSchemaRequest {
```
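
To make the `chunk_infos` contract concrete, here is an illustrative Arrow schema for the rows such a dataframe is expected to carry. Only the column names come from the proto comment above; the datatypes (and the direct use of the `arrow` crate) are assumptions for illustration.

```rust
// Illustrative only: a plausible Arrow schema for `FetchChunksRequest.chunk_infos` rows.
// Column names come from the proto comment; the datatypes are assumed.
use std::sync::Arc;

use arrow::datatypes::{DataType, Field, Schema};

fn chunk_info_schema() -> Arc<Schema> {
    Arc::new(Schema::new(vec![
        Field::new("chunk_id", DataType::Utf8, false), // chunk unique identifier
        Field::new("partition_id", DataType::Utf8, false), // partition this chunk belongs to
        Field::new("partition_layer", DataType::Utf8, false), // specific partition layer
        Field::new("chunk_key", DataType::Utf8, false), // chunk location details
    ]))
}
```
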
crates/store/re_protos/src/v1alpha1/rerun.cloud.v1alpha1.ext.rs

Lines changed: 1 addition & 1 deletion
```diff
@@ -1255,7 +1255,7 @@ impl ScanPartitionTableResponse {
 // --- DataSource --
 
 // NOTE: Match the values of the Protobuf definition to keep life simple.
-#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, serde::Serialize, serde::Deserialize)]
 pub enum DataSourceKind {
     Rrd = 1,
 }
```
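
The only change here is the extra `Hash` derive on `DataSourceKind`. The commit doesn't spell out the motivation; as a minimal sketch of what the derive enables, the enum can now be used as a key in hashed collections (standalone copy of the enum, purely for illustration):

```rust
use std::collections::HashMap;

// Standalone copy of the enum for illustration; the real one lives in re_protos.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
enum DataSourceKind {
    Rrd = 1,
}

fn main() {
    // `Hash` (together with `Eq`) is what allows the enum to be used as a map key.
    let mut per_kind_counts: HashMap<DataSourceKind, usize> = HashMap::new();
    *per_kind_counts.entry(DataSourceKind::Rrd).or_default() += 1;
    assert_eq!(per_kind_counts[&DataSourceKind::Rrd], 1);
}
```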

crates/store/re_protos/src/v1alpha1/rerun.cloud.v1alpha1.rs

Lines changed: 131 additions & 2 deletions
Some generated files are not rendered by default.

crates/store/re_server/src/rerun_cloud.rs

Lines changed: 11 additions & 0 deletions
```diff
@@ -162,6 +162,7 @@ macro_rules! decl_stream {
     };
 }
 
+decl_stream!(FetchChunksResponseStream<manifest:FetchChunksResponse>);
 decl_stream!(GetChunksResponseStream<manifest:GetChunksResponse>);
 decl_stream!(QueryDatasetResponseStream<manifest:QueryDatasetResponse>);
 decl_stream!(ScanPartitionTableResponseStream<manifest:ScanPartitionTableResponse>);
@@ -894,6 +895,16 @@ impl RerunCloudService for RerunCloudHandler {
         ))
     }
 
+    type FetchChunksStream = FetchChunksResponseStream;
+
+    async fn fetch_chunks(
+        &self,
+        _request: tonic::Request<re_protos::cloud::v1alpha1::FetchChunksRequest>,
+    ) -> std::result::Result<tonic::Response<Self::FetchChunksStream>, tonic::Status> {
+        // TODO(zehiko) implement fetch_chunks
+        Err(tonic::Status::unimplemented("fetch_chunks not implemented"))
+    }
+
     // --- Table APIs ---
 
     async fn register_table(
```