Skip to content

Commit 7376956

Browse files
committed
Add arrow flight reads
1 parent 24984f9 commit 7376956

File tree

7 files changed

+275
-49
lines changed

7 files changed

+275
-49
lines changed

Cargo.lock

Lines changed: 23 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

benchmarks/Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ serde = "1.0.219"
1616
serde_json = "1.0.141"
1717
env_logger = "0.11.8"
1818
async-trait = "0.1.88"
19+
async-stream = "0.3.6"
1920
chrono = "0.4.41"
2021
futures = "0.3.31"
2122
dashmap = "6.1.0"

benchmarks/cdk/bin/@bench-common.ts

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ export const ROOT = path.join(__dirname, '../../..')
77
// Simple data structures
88
export type QueryResult = {
99
query: string;
10-
iterations: { elapsed: number; row_count: number }[];
10+
iterations: { elapsed: number; row_count?: number }[];
1111
}
1212

1313
export type BenchmarkResults = {
@@ -19,7 +19,7 @@ export const BenchmarkResults = z.object({
1919
query: z.string(),
2020
iterations: z.array(z.object({
2121
elapsed: z.number(),
22-
row_count: z.number()
22+
row_count: z.number().optional()
2323
}))
2424
}))
2525
})
@@ -69,7 +69,7 @@ export async function compareWithPrevious(results: BenchmarkResults, outputPath:
6969
export interface BenchmarkRunner {
7070
createTables(sf: number): Promise<void>;
7171

72-
executeQuery(query: string): Promise<{ rowCount: number }>;
72+
executeQuery(query: string): Promise<{ rowCount?: number }>;
7373
}
7474

7575
export async function runBenchmark(
@@ -103,6 +103,9 @@ export async function runBenchmark(
103103
iterations: []
104104
};
105105

106+
console.log(`Warming up query ${id}...`)
107+
await runner.executeQuery(queryToExecute);
108+
106109
for (let i = 0; i < iterations; i++) {
107110
const start = new Date()
108111
const response = await runner.executeQuery(queryToExecute);

benchmarks/cdk/bin/datafusion-bench.ts

Lines changed: 8 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
11
import path from "path";
22
import {Command} from "commander";
3-
import {z} from 'zod';
43
import {BenchmarkRunner, ROOT, runBenchmark} from "./@bench-common";
54

65
// Remember to port-forward a worker with
@@ -40,12 +39,6 @@ async function main() {
4039
});
4140
}
4241

43-
const QueryResponse = z.object({
44-
count: z.number(),
45-
plan: z.string()
46-
})
47-
type QueryResponse = z.infer<typeof QueryResponse>
48-
4942
class DataFusionRunner implements BenchmarkRunner {
5043
private url = 'http://localhost:9000';
5144

@@ -55,7 +48,7 @@ class DataFusionRunner implements BenchmarkRunner {
5548
}) {
5649
}
5750

58-
async executeQuery(sql: string): Promise<{ rowCount: number }> {
51+
async executeQuery(sql: string): Promise<{ rowCount?: number }> {
5952
let response
6053
if (sql.includes("create view")) {
6154
// This is query 15
@@ -67,10 +60,10 @@ class DataFusionRunner implements BenchmarkRunner {
6760
response = await this.query(sql)
6861
}
6962

70-
return {rowCount: response.count};
63+
return response
7164
}
7265

73-
private async query(sql: string): Promise<QueryResponse> {
66+
private async query(sql: string): Promise<{ rowCount?: number }> {
7467
const url = new URL(this.url);
7568
url.searchParams.set('sql', sql);
7669

@@ -80,11 +73,13 @@ class DataFusionRunner implements BenchmarkRunner {
8073
const msg = await response.text();
8174
throw new Error(`Query failed: ${response.status} ${msg}`);
8275
}
83-
84-
const unparsed = await response.json();
85-
return QueryResponse.parse(unparsed);
76+
await response.arrayBuffer()
77+
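// TODO: the Arrow IPC decoding library (see the commented-out tableFromIPC call) still cannot decode Utf8View columns, and some queries fail because of that; until then the body is only drained and no row count is returned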
78+
// const table = await tableFromIPC(ipcData)
79+
return {}
8680
}
8781

82+
8883
async createTables(sf: number): Promise<void> {
8984
let stmt = '';
9085
for (const tbl of [

benchmarks/cdk/bin/worker.rs

Lines changed: 43 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,11 @@ use arrow_flight::flight_service_client::FlightServiceClient;
22
use async_trait::async_trait;
33
use aws_config::BehaviorVersion;
44
use aws_sdk_ec2::Client as Ec2Client;
5+
use axum::body::Body;
6+
use axum::http::Response;
57
use axum::{Json, Router, extract::Query, http::StatusCode, routing::get};
68
use dashmap::{DashMap, Entry};
9+
use datafusion::arrow::ipc::writer::{IpcWriteOptions, StreamWriter};
710
use datafusion::common::DataFusionError;
811
use datafusion::common::instant::Instant;
912
use datafusion::common::runtime::SpawnedTask;
@@ -15,7 +18,7 @@ use datafusion_distributed::{
1518
DistributedPhysicalOptimizerRule, DistributedSessionBuilder, DistributedSessionBuilderContext,
1619
create_flight_client, display_plan_ascii,
1720
};
18-
use futures::{StreamExt, TryFutureExt};
21+
use futures::{StreamExt, TryFutureExt, TryStreamExt};
1922
use log::{error, info, warn};
2023
use object_store::ObjectStore;
2124
use object_store::aws::AmazonS3Builder;
@@ -27,15 +30,10 @@ use std::sync::atomic::AtomicBool;
2730
use std::sync::{Arc, RwLock};
2831
use std::time::Duration;
2932
use structopt::StructOpt;
33+
use tonic::Status;
3034
use tonic::transport::{Channel, Server};
3135
use url::Url;
3236

33-
#[derive(Serialize)]
34-
struct QueryResult {
35-
plan: String,
36-
count: usize,
37-
}
38-
3937
#[derive(Debug, StructOpt, Clone)]
4038
#[structopt(about = "worker spawn command")]
4139
struct Cmd {
@@ -115,7 +113,10 @@ async fn main() -> Result<(), Box<dyn Error>> {
115113
let ctx = ctx.clone();
116114

117115
async move {
118-
let sql = params.get("sql").ok_or(err("Missing 'sql' parameter"))?;
116+
let sql = params
117+
.get("sql")
118+
.ok_or(err("Missing 'sql' parameter"))?
119+
.clone();
119120

120121
let mut df_opt = None;
121122
for sql in sql.split(";") {
@@ -129,12 +130,10 @@ async fn main() -> Result<(), Box<dyn Error>> {
129130
return Err(err("Empty 'sql' parameter"));
130131
};
131132

132-
let start = Instant::now();
133-
134133
info!("Executing query...");
135134
let abort_notifier = AbortNotifier::new("Query aborted");
136135
let abort_notifier_clone = abort_notifier.clone();
137-
let task = SpawnedTask::spawn(async move {
136+
let still_running_log_task = SpawnedTask::spawn(async move {
138137
let _ = abort_notifier_clone;
139138
loop {
140139
tokio::time::sleep(Duration::from_secs(5)).await;
@@ -144,25 +143,42 @@ async fn main() -> Result<(), Box<dyn Error>> {
144143
let physical = df.create_physical_plan().await.map_err(err)?;
145144
let mut stream =
146145
execute_stream(physical.clone(), ctx.task_ctx()).map_err(err)?;
147-
let mut count = 0;
148-
while let Some(batch) = stream.next().await {
149-
count += batch.map_err(err)?.num_rows();
150-
info!("Gathered {count} rows, query still in progress..")
151-
}
152-
let plan = display_plan_ascii(physical.as_ref(), true);
153-
drop(task);
154146

155-
let elapsed = start.elapsed();
156-
let ms = elapsed.as_secs_f64() * 1000.0;
157-
info!("Finished executing query:\n{sql}\n\n{plan}");
158-
info!("Returned {count} rows in {ms} ms");
159-
abort_notifier.finished();
147+
let start = Instant::now();
148+
let mut count: usize = 0;
149+
150+
let stream = async_stream::stream! {
151+
// Stream the data
152+
while let Some(batch) = stream.next().await {
153+
let batch = batch?;
154+
count += batch.num_rows();
155+
info!("Gathered {count} rows, query still in progress..");
156+
157+
let mut writer = StreamWriter::try_new(vec![], batch.schema().as_ref())?;
158+
writer.write(&batch)?;
159+
yield writer.into_inner()
160+
}
161+
162+
// After stream completes gracefully - all cleanup code runs here
163+
let elapsed = start.elapsed();
164+
let ms = elapsed.as_secs_f64() * 1000.0;
165+
info!("Finished executing query:\n{sql}");
166+
info!("Returned {count} rows in {ms} ms");
167+
abort_notifier.finished();
168+
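// Referencing the task here moves it into the stream, which keeps the "still running" logger alive until the stream completes; only then is it dropped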
169+
drop(still_running_log_task);
170+
};
160171

161-
Ok::<_, (StatusCode, String)>(Json(QueryResult { count, plan }))
172+
Ok(Response::builder()
173+
.header("content-type", "application/octet-stream")
174+
.body(Body::from_stream(
175+
stream.map_err(|err| Status::internal(err.to_string())),
176+
))
177+
.expect("building a Response from a body should never fail"))
162178
}
163-
.inspect_err(|(_, msg)| {
164-
error!("Error executing query: {msg}");
165-
})
179+
.inspect_err(|(_, msg)| {
180+
error!("Error executing query: {msg}");
181+
})
166182
}),
167183
),
168184
);

0 commit comments

Comments
 (0)