Skip to content

Commit 15a1267

Browse files
committed
Fix statpopgen
Signed-off-by: Adam Gutglick <[email protected]>
1 parent a7d501f commit 15a1267

File tree

6 files changed

+24
-14
lines changed

6 files changed

+24
-14
lines changed

Cargo.lock

Lines changed: 1 addition & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -196,6 +196,7 @@ termtree = { version = "0.5" }
196196
thiserror = "2.0.3"
197197
tokio = { version = "1.47" }
198198
tokio-stream = "0.1.17"
199+
tokio-util = "0.7.17"
199200
tpchgen = { version = "2" }
200201
tpchgen-arrow = { version = "2" }
201202
tracing = { version = "0.1.41", default-features = false }

benchmarks/ddb-bench/src/lib.rs

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -199,7 +199,7 @@ impl DuckClient {
199199
.unwrap_or_else(|| format!("*.{}", extension));
200200

201201
let table_path = format!("{base_dir}{pattern}");
202-
dbg!(&table_path);
202+
203203
commands.push_str(&format!(
204204
"CREATE {} IF NOT EXISTS {} AS SELECT * FROM read_{}('{}');\n",
205205
object.to_str(),
@@ -211,8 +211,6 @@ impl DuckClient {
211211

212212
trace!("Executing table registration commands: {commands}");
213213

214-
dbg!(&commands);
215-
216214
self.execute_query(&commands)?;
217215

218216
Ok(())

vortex-bench/Cargo.toml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,14 +40,15 @@ parking_lot = { workspace = true }
4040
parquet = { workspace = true, features = ["async"] }
4141
rand = { workspace = true }
4242
regex = { workspace = true }
43-
reqwest = { workspace = true }
43+
reqwest = { workspace = true, features = ["stream"] }
4444
serde = { workspace = true }
4545
serde_json = { workspace = true }
4646
sysinfo = { workspace = true }
4747
tabled = { workspace = true, features = ["std"] }
4848
target-lexicon = { workspace = true }
4949
tokio = { workspace = true, features = ["full"] }
5050
tokio-stream = { workspace = true }
51+
tokio-util = { workspace = true }
5152
tpchgen = { workspace = true }
5253
tpchgen-arrow = { workspace = true }
5354
tracing = { workspace = true, features = ["max_level_debug"] }

vortex-bench/src/statpopgen/download_vcf.rs

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -11,8 +11,9 @@ use parquet::arrow::AsyncArrowWriter;
1111
use reqwest::Client;
1212
use tokio::fs::File;
1313
use tokio::io::BufReader;
14+
use tokio_stream::StreamExt;
15+
use tokio_util::io::StreamReader;
1416
use tracing::info;
15-
use vortex::io::IoBuf;
1617

1718
use super::StatPopGenBenchmark;
1819
use crate::idempotent_async;
@@ -30,6 +31,7 @@ impl StatPopGenBenchmark {
3031
StatPopGenBenchmark::FILE_NAME
3132
);
3233
let parquet_output_path = self.parquet_path()?;
34+
3335
idempotent_async(
3436
&parquet_output_path,
3537
async |parquet_output_path| -> Result<()> {
@@ -48,10 +50,14 @@ impl StatPopGenBenchmark {
4850
.error_for_status()
4951
.context("reqwest bad status")?;
5052

51-
let body = response.bytes().await?;
53+
// The file is BIG so we only want to download a part of it
54+
let byte_stream = response
55+
.bytes_stream()
56+
.map(|x| x.map_err(std::io::Error::other));
57+
let stream_reader = StreamReader::new(byte_stream);
5258

5359
// Wrap in BGZF reader
54-
let buf_reader = BufReader::new(body.as_slice());
60+
let buf_reader = BufReader::new(stream_reader);
5561
let mut bgzf_reader = noodles_bgzf::r#async::io::Reader::new(buf_reader);
5662

5763
// Read and parse VCF header

vortex-bench/src/statpopgen/statpopgen_benchmark.rs

Lines changed: 10 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,6 @@
22
// SPDX-FileCopyrightText: Copyright the Vortex contributors
33

44
use std::fs;
5-
use std::path::Path;
65
use std::path::PathBuf;
76

87
use anyhow::Result;
@@ -14,6 +13,7 @@ use crate::BenchmarkDataset;
1413
use crate::Format;
1514
use crate::IdempotentPath;
1615
use crate::TableSpec;
16+
use crate::workspace_root;
1717

1818
/// Statistical population genetics benchmark implementation.
1919
///
@@ -75,8 +75,10 @@ impl StatPopGenBenchmark {
7575
}
7676
};
7777

78-
let data_url = Url::from_directory_path("statpopgen".to_data_path())
79-
.map_err(|_| anyhow::anyhow!("bad data path?"))?;
78+
let data_path = "statspopgen".to_data_path().join(format!("{n_rows}/"));
79+
80+
let data_url =
81+
Url::from_directory_path(data_path).map_err(|_| anyhow::anyhow!("bad data path?"))?;
8082

8183
Ok(Self {
8284
data_url,
@@ -92,7 +94,7 @@ impl StatPopGenBenchmark {
9294
/// The path follows the pattern: `{data_url}/{n_rows}/parquet/gnomad.genomes.v3.1.2.hgdp_tgp.chr21.parquet`
9395
pub fn parquet_path(&self) -> Result<PathBuf> {
9496
self.data_url
95-
.join(&(self.n_rows.to_string() + "/parquet/"))?
97+
.join("parquet/")?
9698
.join(&format!("{}.parquet", StatPopGenBenchmark::FILE_NAME))?
9799
.to_file_path()
98100
.map_err(|_| anyhow::anyhow!("Failed to convert data URL to filesystem path - ensure data_url uses 'file://' scheme"))
@@ -104,7 +106,7 @@ impl StatPopGenBenchmark {
104106
/// The path follows the pattern: `{data_url}/{n_rows}/vortex-file-compressed/gnomad.genomes.v3.1.2.hgdp_tgp.chr21.vortex`
105107
pub fn vortex_path(&self) -> Result<PathBuf> {
106108
self.data_url
107-
.join(&(self.n_rows.to_string() + "/vortex-file-compressed/"))?
109+
.join("vortex-file-compressed/")?
108110
.join(&format!("{}.vortex", StatPopGenBenchmark::FILE_NAME))?
109111
.to_file_path()
110112
.map_err(|_| anyhow::anyhow!("Failed to convert data URL to filesystem path - ensure data_url uses 'file://' scheme"))
@@ -116,7 +118,7 @@ impl StatPopGenBenchmark {
116118
/// The path follows the pattern: `{data_url}/{n_rows}/vortex-compact/{StatPopGenBenchmark::FILE_NAME}.vortex`
117119
pub fn vortex_compact_path(&self) -> Result<PathBuf> {
118120
self.data_url
119-
.join(&(self.n_rows.to_string() + "/vortex-compact/"))?
121+
.join("vortex-compact/")?
120122
.join(&format!("{}.vortex", StatPopGenBenchmark::FILE_NAME))?
121123
.to_file_path()
122124
.map_err(|_| anyhow::anyhow!("Failed to convert data URL to filesystem path - ensure data_url uses 'file://' scheme"))
@@ -126,7 +128,8 @@ impl StatPopGenBenchmark {
126128
#[async_trait::async_trait]
127129
impl Benchmark for StatPopGenBenchmark {
128130
fn queries(&self) -> Result<Vec<(usize, String)>> {
129-
let queries_file = Path::new(env!("CARGO_MANIFEST_DIR"))
131+
let queries_file = workspace_root()
132+
.join("vortex-bench")
130133
.join("statpopgen")
131134
.with_extension("sql");
132135
let contents = fs::read_to_string(queries_file)?;

0 commit comments

Comments
 (0)