wip

joseph-isaacs · joseph-isaacs · commit 446a37712777 · 2025-10-31T12:48:01.000Z
Signed-off-by: Joe Isaacs <joe.isaacs@live.co.uk>
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/vortex-file/Cargo.toml b/vortex-file/Cargo.toml
@@ -65,6 +65,8 @@ vortex-array = { path = "../vortex-array", features = ["test-harness"] }
 vortex-btrblocks = { path = "../vortex-btrblocks" }
 vortex-io = { path = "../vortex-io", features = ["tokio"] }
 vortex-scan = { path = "../vortex-scan", features = ["tokio"] }
+criterion = { version = "0.7.0", features = ["html_reports", "async", "async_tokio"] }
+vortex-gpu = { workspace = true, features = ["cuda"] }
 
 [lints]
 workspace = true
@@ -86,3 +88,10 @@ gpu = [
     "vortex-layout/gpu",
     "vortex-scan/gpu",
 ]
+
+[[bench]]
+name = "bench_read"
+harness = false
+test = false
+# The benchmark imports `FileGpuSegmentSource`, which is only exported with the `gpu` feature.
+required-features = ["gpu"]
diff --git a/vortex-file/benches/bench_read.rs b/vortex-file/benches/bench_read.rs
@@ -0,0 +1,122 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright the Vortex contributors
+
+#![allow(clippy::unwrap_used)]
+
+use std::sync::Arc;
+
+use criterion::{Criterion, Throughput, criterion_group, criterion_main};
+use cudarc::driver::CudaContext;
+use futures::TryStreamExt;
+use tokio::runtime::Runtime;
+use vortex_array::arrays::{ChunkedArray, StructArray};
+use vortex_array::{ArrayRef, IntoArray};
+use vortex_buffer::Buffer;
+use vortex_error::VortexUnwrap;
+use vortex_file::{FileGpuSegmentSource, VortexOpenOptions, VortexWriteOptions};
+
+// Benchmark inputs as (row count, label) pairs; only the ~1GB case is currently enabled.
+// The label is the unpacked size of a single 4-byte column with that many rows.
+const DATA_SIZES: &[(usize, &str)] = &[
+    (268_435_456, "1GB"), // ~1GB when unpacked (268M * 4 bytes)
+];
+
+/// Builds a struct array with a chunked `u32` column ("numbers") and a chunked `f32`
+/// column ("floats"); every column is split into two chunks of `len / 2` rows each.
+#[allow(clippy::cast_possible_truncation)]
+fn make_test_array(len: usize) -> ArrayRef {
+    let half = len / 2;
+
+    // Both `u32` chunks hold the same repeating 0..64 sequence.
+    let small_ints = || {
+        (0..half)
+            .map(|row| (row as u32) % 64)
+            .collect::<Buffer<u32>>()
+            .into_array()
+    };
+    // Each `f32` chunk alternates between `offset` and `offset + 1.0`.
+    let alternating = |offset: f32| {
+        (0..half)
+            .map(|row| (row % 2) as f32 + offset)
+            .collect::<Buffer<f32>>()
+            .into_array()
+    };
+
+    let numbers = ChunkedArray::from_iter([small_ints(), small_ints()]).into_array();
+    let floats = ChunkedArray::from_iter([alternating(0.1), alternating(4.1)]).into_array();
+
+    // Assemble the two columns into a single struct array.
+    StructArray::from_fields(&[("numbers", numbers), ("floats", floats)])
+        .vortex_unwrap()
+        .into_array()
+}
+
+/// Benchmarks a full GPU scan of a Vortex file stored on local disk.
+///
+/// For each entry in `DATA_SIZES` the test array is written once, outside the timed
+/// region; every timed iteration re-opens the file and collects the `gpu_scan` stream.
+fn benchmark_gpu_scan(c: &mut Criterion) {
+    let runtime = Runtime::new().unwrap();
+    let mut group = c.benchmark_group("gpu_scan");
+
+    group.sample_size(10);
+    let bench_file_name = "/tmp/test-vx/bench_out.vortex";
+    // `File::create` does not create missing parent directories, so make sure the
+    // output directory exists before the first write.
+    if let Some(dir) = std::path::Path::new(bench_file_name).parent() {
+        std::fs::create_dir_all(dir).unwrap();
+    }
+
+    for (len, label) in DATA_SIZES {
+        let len = len.next_multiple_of(1024);
+        let array = make_test_array(len);
+
+        runtime.block_on(async {
+            VortexWriteOptions::default()
+                .write(
+                    tokio::fs::File::create(bench_file_name).await.unwrap(),
+                    array.to_array_stream(),
+                )
+                .await
+                .unwrap();
+        });
+
+        let cuda_ctx = CudaContext::new(0).unwrap();
+        cuda_ctx.set_blocking_synchronize().unwrap();
+        // Throughput counts unpacked bytes: two 4-byte columns of `len` rows each.
+        group.throughput(Throughput::Bytes((len * size_of::<u32>() * 2) as u64));
+        group.bench_function(*label, |b| {
+            b.to_async(&runtime).iter_with_large_drop(async || {
+                let file = std::fs::File::open(bench_file_name).unwrap();
+                let vx_file = VortexOpenOptions::new()
+                    .open(bench_file_name)
+                    .await
+                    .vortex_unwrap();
+                let chunks = vx_file
+                    .gpu_scan(
+                        cuda_ctx.clone(),
+                        Arc::new(FileGpuSegmentSource::new(
+                            vx_file.footer.segment_map().clone(),
+                            cuda_ctx.default_stream(),
+                            file,
+                        )),
+                    )
+                    .vortex_unwrap()
+                    .into_array_stream()
+                    .vortex_unwrap()
+                    .try_collect::<Vec<_>>()
+                    .await
+                    .vortex_unwrap();
+                // The collected chunks are returned so that `iter_with_large_drop`
+                // drops them outside of the timed region.
+                chunks
+            });
+        });
+    }
+
+    group.finish();
+}
+
+criterion_group!(benches, benchmark_gpu_scan);
+
+criterion_main!(benches);
diff --git a/vortex-file/src/file.rs b/vortex-file/src/file.rs
@@ -33,7 +33,7 @@ use crate::pruning::extract_relevant_file_stats_as_struct_row;
 #[derive(Clone)]
 pub struct VortexFile {
     /// The footer of the Vortex file, containing metadata and layout information.
-    pub(crate) footer: Footer,
+    pub footer: Footer,
     /// The segment source used to read segments from this file.
     pub(crate) segment_source: Arc<dyn SegmentSource>,
     /// Metrics tied to the file.
diff --git a/vortex-file/src/lib.rs b/vortex-file/src/lib.rs
@@ -107,6 +107,8 @@ pub use file::*;
 pub use footer::*;
 pub use forever_constant::*;
 pub use open::*;
+#[cfg(feature = "gpu")]
+pub use segments::FileGpuSegmentSource;
 pub use strategy::*;
 use vortex_alp::{ALPEncoding, ALPRDEncoding};
 use vortex_array::{ArrayRegistry, EncodingRef};
diff --git a/vortex-file/src/segments/gpu_source.rs b/vortex-file/src/segments/gpu_source.rs
@@ -48,14 +48,12 @@ impl GpuSegmentSource for FileGpuSegmentSource {
             .vortex_expect("missing segment id")
             .clone();
 
-        let mut cu_slice = self
-            .stream
-            .alloc_zeros::<u8>(spec.length as usize)
+        let mut cu_slice = unsafe { self.stream.alloc::<u8>(spec.length as usize) }
             .map_err(|e| vortex_err!("cu slice {e}"))
             .vortex_expect("Failed to allocate cu slice");
 
         // this is optional? and has strange perf characteristics.
-        // cu_file
+        // self.cu_file
         //     .buf_register(&cu_slice)
         //     .map_err(|e| vortex_err!("cu file {e}"))
         //     .vortex_unwrap();
@@ -64,14 +62,18 @@ impl GpuSegmentSource for FileGpuSegmentSource {
         let file_handle = self.file_handle.clone();
         let stream = self.stream.clone();
         async move {
+            // println!("try read");
+            file_handle.sync_read(offset, &mut cu_slice);
             let read = stream
                 .memcpy_ftod(&file_handle, offset, &mut cu_slice)
                 .ok()
                 .vortex_expect("memcpy_ftod");
+            // println!("did read");
 
-            read.synchronize()
-                .map_err(|e| vortex_err!("sync write {e}"))
-                .vortex_unwrap();
+            // read.synchronize()
+            //     .map_err(|e| vortex_err!("sync write {e}"))
+            //     .vortex_unwrap();
+            // println!("did sync");
             Ok(cu_slice)
         }
         .boxed()
diff --git a/vortex-file/src/tests.rs b/vortex-file/src/tests.rs
@@ -1475,6 +1475,7 @@ async fn test_writer_with_statistics() -> VortexResult<()> {
 #[cfg_attr(miri, ignore)]
 #[tokio::test]
 async fn test_gpu_read_simple() -> VortexResult<()> {
+    use vortex_array::compute::take;
     use vortex_btrblocks::BtrBlocksCompressor;
 
     use crate::segments::FileGpuSegmentSource;
@@ -1550,7 +1551,59 @@ async fn test_gpu_read_simple() -> VortexResult<()> {
         gpu_chunks.push(array.into_array());
     }
 
-    assert_arrays_eq!(ChunkedArray::from_iter(gpu_chunks), cpu_read);
+    // assert_arrays_eq!(
+    //     ChunkedArray::from_iter(gpu_chunks).into_array().as_ref(),
+    //     cpu_read
+    // );
+
+    let left = ChunkedArray::from_iter(gpu_chunks).into_array();
+    let right = cpu_read;
+    if left.dtype() != right.dtype() {
+        panic!(
+            "assertion left == right failed: arrays differ in type: {} != {}.\n  left: {}\n right: {}",
+            left.dtype(),
+            right.dtype(),
+            "x",
+            "x" // left.display_values(),
+                // right.display_values()
+        )
+    }
+
+    if left.len() != right.len() {
+        panic!(
+            "assertion left == right failed: arrays differ in length: {} != {}.\n  left: {}\n right: {}",
+            left.len(),
+            right.len(),
+            "x",
+            "x" // left.display_values(),
+                // right.display_values()
+        )
+    }
+    let n = left.len();
+    let mismatched_indices = (0..n)
+        .filter(|i| left.scalar_at(*i) != right.scalar_at(*i))
+        .collect::<Vec<_>>();
+    if mismatched_indices.len() != 0 {
+        let idx = PrimitiveArray::from_iter(
+            mismatched_indices
+                .clone()
+                .into_iter()
+                .map(|x| x as u64)
+                .take(20),
+        );
+        panic!(
+            "assertion left == right failed: arrays do not match at indices: {}.\n  left: {}\n right: {}",
+            Itertools::format(mismatched_indices.into_iter(), ", "),
+            take(&left, idx.as_ref())
+                .unwrap()
+                .slice(0..20)
+                .display_values(),
+            take(&right, idx.as_ref())
+                .unwrap()
+                .slice(0..20)
+                .display_values(),
+        )
+    }
 
     assert_eq!(row_count, 32768);
     Ok(())
diff --git a/vortex-gpu/src/array_parts.rs b/vortex-gpu/src/array_parts.rs
@@ -16,9 +16,7 @@ use vortex_flatbuffers::array::Array;
 use vortex_scalar::Scalar;
 
 use crate::CudaByteBuffer;
-use crate::jit::{
-    AlpEncodingTree, BitPackedEncodingTree, EncodingTree, EncodingTreeRef, FoREncodingTree,
-};
+use crate::jit::{AlpEncodingTree, BitPackedEncodingTree, EncodingTreeRef, FoREncodingTree};
 
 pub struct GpuArrayParts<'a> {
     buffers: Vec<Option<CudaByteBuffer>>,
@@ -100,7 +98,7 @@ impl<'a> GpuArrayParts<'a> {
                 len,
             );
             let reference = Scalar::new(dtype.clone(), deser);
-            return Arc::new(FoREncodingTree { reference, child }) as Arc<dyn EncodingTree>;
+            return Arc::new(FoREncodingTree { reference, child }) as EncodingTreeRef;
         } else if enc.id() == BitPackedEncoding.id() {
             assert!(array_node.children().unwrap_or_default().is_empty());
             let deser =
diff --git a/vortex-gpu/src/jit/encoding_tree.rs b/vortex-gpu/src/jit/encoding_tree.rs
@@ -4,7 +4,7 @@
 use std::any::Any;
 use std::sync::Arc;
 
-pub type EncodingTreeRef = Arc<dyn EncodingTree + 'static>;
+pub type EncodingTreeRef = Arc<dyn EncodingTree + Send + Sync + 'static>;
 
 pub trait EncodingTree {
     fn as_any(&self) -> &dyn Any;
diff --git a/vortex-gpu/src/jit/kernel_fmt.rs b/vortex-gpu/src/jit/kernel_fmt.rs
@@ -2,14 +2,18 @@
 // SPDX-FileCopyrightText: Copyright the Vortex contributors
 
 use std::fmt::Write;
-use std::sync::Arc;
+use std::hash::BuildHasher;
+use std::sync::{Arc, LazyLock};
 
 use cudarc::driver::{CudaContext, CudaFunction};
 use vortex_error::{VortexExpect, VortexResult, vortex_err};
+use vortex_utils::aliases::dash_map::{DashMap, Entry};
 
 use crate::indent::{IndentedWrite, IndentedWriter};
 use crate::jit::{GPUKernelParameter, GPUPipelineJIT, GPUVisitor};
 
+static JIT_CACHE: LazyLock<DashMap<u64, CudaFunction>> = LazyLock::new(DashMap::default);
+
 struct DeclPrinter<'a, 'b: 'a> {
     w: &'a mut IndentedWrite<'b>,
 }
@@ -115,15 +119,22 @@ pub fn create_kernel(
     create_kernel_str(w, array, kernel_out_array)
         .map_err(|e| vortex_err!("jit str cannot fail {e}"))?;
 
-    let module =
-        cudarc::nvrtc::compile_ptx(s.clone()).map_err(|e| vortex_err!("compile ptx {e}"))?;
-
-    // Dynamically load it into the device
-    let module = ctx
-        .load_module(module)
-        .map_err(|e| vortex_err!("load module {e}"))?;
-
-    module
-        .load_function("kernel")
-        .map_err(|e| vortex_err!("load_function {e}"))
+    match JIT_CACHE.entry(JIT_CACHE.hasher().hash_one(&s)) {
+        Entry::Occupied(oc) => Ok(oc.get().clone()),
+        Entry::Vacant(vac) => {
+            let module =
+                cudarc::nvrtc::compile_ptx(s).map_err(|e| vortex_err!("compile ptx {e}"))?;
+
+            // Dynamically load it into the device
+            let module = ctx
+                .load_module(module)
+                .map_err(|e| vortex_err!("load module {e}"))?;
+
+            let cuf = module
+                .load_function("kernel")
+                .map_err(|e| vortex_err!("load_function {e}"))?;
+            vac.insert(cuf.clone());
+            Ok(cuf)
+        }
+    }
 }
diff --git a/vortex-layout/src/gpu/layouts/flat/reader.rs b/vortex-layout/src/gpu/layouts/flat/reader.rs
@@ -64,6 +64,7 @@ impl GpuFlatReader {
             Ok(parts.create_array(&dtype, row_count))
         }
         .boxed()
+        .shared()
     }
 }
 
diff --git a/vortex-layout/src/gpu/mod.rs b/vortex-layout/src/gpu/mod.rs
@@ -8,7 +8,7 @@ use std::collections::BTreeSet;
 use std::ops::Range;
 use std::sync::Arc;
 
-use futures::future::BoxFuture;
+use futures::future::{BoxFuture, Shared};
 use vortex_array::stats::Precision;
 use vortex_dtype::{DType, FieldMask};
 use vortex_error::{SharedVortexResult, VortexResult};
@@ -19,7 +19,7 @@ pub type GpuLayoutReaderRef = Arc<dyn GpuLayoutReader>;
 
 pub type GpuArrayFuture = BoxFuture<'static, VortexResult<Vec<GpuVector>>>;
 
-pub type ShareGpuArrayFuture = BoxFuture<'static, SharedVortexResult<EncodingTreeRef>>;
+pub type ShareGpuArrayFuture = Shared<BoxFuture<'static, SharedVortexResult<EncodingTreeRef>>>;
 
 /// A [`crate::gpu::GpuLayoutReader`] is used to read a [`crate::Layout`] in a way that can cache state across multiple
 /// evaluation operations.