vortex-data · robert3005 · Nov 18, 2025 · Nov 17, 2025
diff --git a/vortex-gpu/benches/gpu_bitunpack.rs b/vortex-gpu/benches/gpu_bitunpack.rs
@@ -2,7 +2,7 @@
 // SPDX-FileCopyrightText: Copyright the Vortex contributors
 
 #![allow(clippy::unwrap_used)]
-#![cfg(gpu_unstable)]
+#![allow(dead_code)] // NOTE(review): presumably silences unused-helper warnings when `gpu_unstable` is off and the benchmarks are compiled out; confirm.
 
 use std::sync::Arc;
 use std::time::Duration;
@@ -17,9 +17,6 @@ use vortex_buffer::BufferMut;
 use vortex_dtype::NativePType;
 use vortex_error::VortexUnwrap;
 use vortex_fastlanes::{BitPackedArray, FoRArray};
-use vortex_gpu::{
-    create_run_jit_kernel, cuda_bit_unpack_timed, cuda_for_bp_unpack_timed, cuda_for_unpack_timed,
-};
 
 // Data sizes: 1GB, 2.5GB, 5GB, 10GB
 // These are approximate sizes in bytes, accounting for bit-packing compression
@@ -91,6 +88,7 @@ fn make_alp_array(len: usize) -> ArrayRef {
     .into_array()
 }
 
+#[cfg(gpu_unstable)]
 fn benchmark_gpu_decompress_kernel_only(c: &mut Criterion) {
     let mut group = c.benchmark_group("gpu_decompress_kernel_only");
 
@@ -112,7 +110,8 @@ fn benchmark_gpu_decompress_kernel_only(c: &mut Criterion) {
                 let mut total_time = Duration::ZERO;
                 for _ in 0..iters {
                     // This only measures kernel execution time, not memory transfers
-                    let kernel_time_ns = cuda_bit_unpack_timed(array, Arc::clone(&ctx)).unwrap();
+                    let kernel_time_ns =
+                        vortex_gpu::cuda_bit_unpack_timed(array, Arc::clone(&ctx)).unwrap();
                     total_time += kernel_time_ns;
                 }
                 total_time
@@ -123,6 +122,7 @@ fn benchmark_gpu_decompress_kernel_only(c: &mut Criterion) {
     group.finish();
 }
 
+#[cfg(gpu_unstable)]
 fn benchmark_gpu_for_decompress_kernel_only(c: &mut Criterion) {
     let mut group = c.benchmark_group("gpu_for_decompress_kernel_only");
 
@@ -143,7 +143,7 @@ fn benchmark_gpu_for_decompress_kernel_only(c: &mut Criterion) {
                 for _ in 0..iters {
                     // This only measures kernel execution time, not memory transfers
                     let (_result, kernel_time) =
-                        cuda_for_unpack_timed(array, Arc::clone(&ctx)).unwrap();
+                        vortex_gpu::cuda_for_unpack_timed(array, Arc::clone(&ctx)).unwrap();
                     total_time += kernel_time;
                 }
                 total_time
@@ -154,6 +154,7 @@ fn benchmark_gpu_for_decompress_kernel_only(c: &mut Criterion) {
     group.finish();
 }
 
+#[cfg(gpu_unstable)]
 fn benchmark_gpu_for_bp_fused_decompress_kernel_only(c: &mut Criterion) {
     let mut group = c.benchmark_group("gpu_for_bp_fused_decompress_kernel_only");
 
@@ -174,7 +175,7 @@ fn benchmark_gpu_for_bp_fused_decompress_kernel_only(c: &mut Criterion) {
                 for _ in 0..iters {
                     // This only measures kernel execution time, not memory transfers
                     let (_result, kernel_time) =
-                        cuda_for_bp_unpack_timed(array, Arc::clone(&ctx)).unwrap();
+                        vortex_gpu::cuda_for_bp_unpack_timed(array, Arc::clone(&ctx)).unwrap();
                     total_time += kernel_time;
                 }
                 total_time
@@ -185,6 +186,7 @@ fn benchmark_gpu_for_bp_fused_decompress_kernel_only(c: &mut Criterion) {
     group.finish();
 }
 
+#[cfg(gpu_unstable)]
 fn benchmark_gpu_for_bp_jit_decompress_kernel_only(c: &mut Criterion) {
     let mut group = c.benchmark_group("benchmark_gpu_for_bp_jit_decompress_kernel_only");
 
@@ -205,7 +207,8 @@ fn benchmark_gpu_for_bp_jit_decompress_kernel_only(c: &mut Criterion) {
                 let mut total_time = Duration::ZERO;
                 for _ in 0..iters {
                     // This only measures kernel execution time, not memory transfers
-                    let (_result, kernel_time) = create_run_jit_kernel(&ctx, array).unwrap();
+                    let (_result, kernel_time) =
+                        vortex_gpu::create_run_jit_kernel(&ctx, array).unwrap();
                     total_time += kernel_time.elapsed().unwrap();
                 }
                 total_time
@@ -216,23 +219,7 @@ fn benchmark_gpu_for_bp_jit_decompress_kernel_only(c: &mut Criterion) {
     group.finish();
 }
 
-#[allow(dead_code)]
-fn benchmark_cpu_canonicalize(c: &mut Criterion) {
-    let mut group = c.benchmark_group("cpu_canonicalize");
-
-    for (len, label) in DATA_SIZES {
-        let len = len.next_multiple_of(1024);
-        let array = make_bitpackable_array::<u32>(len);
-
-        group.throughput(Throughput::Bytes((len * size_of::<u32>()) as u64));
-        group.bench_with_input(BenchmarkId::new("u32", label), &array, |b, array| {
-            b.iter(|| array.clone().into_array().to_canonical());
-        });
-    }
-
-    group.finish();
-}
-
+#[cfg(gpu_unstable)]
 criterion_group!(
     benches,
     benchmark_gpu_decompress_kernel_only,
@@ -242,4 +229,8 @@ criterion_group!(
 );
 
 // criterion_group!(benches, benchmark_gpu_for_bp_jit_decompress_kernel_only);
+#[cfg(gpu_unstable)]
 criterion_main!(benches);
+
+#[cfg(not(gpu_unstable))]
+fn main() {} // No-op entry point: `criterion_main!` is only emitted under `gpu_unstable`, so this keeps the bench target buildable without it.