feature: Add generator for fastlanes bit unpacking cuda kernels

robert3005 · robert3005 · commit ac7da6d17743 · 2026-01-26T17:00:21.000Z
Signed-off-by: Robert Kruszewski <github@robertk.io>
diff --git a/encodings/fastlanes/.gitignore b/encodings/fastlanes/.gitignore
@@ -0,0 +1 @@
+kernels
diff --git a/encodings/fastlanes/Cargo.toml b/encodings/fastlanes/Cargo.toml
@@ -42,6 +42,9 @@ vortex-alp = { path = "../alp" }
 vortex-array = { workspace = true, features = ["_test-harness"] }
 vortex-fastlanes = { path = ".", features = ["_test-harness"] }
 
+[build-dependencies]
+fastlanes = { workspace = true }
+
 [features]
 _test-harness = ["dep:rand"]
 
diff --git a/encodings/fastlanes/build.rs b/encodings/fastlanes/build.rs
@@ -0,0 +1,151 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright the Vortex contributors
+
+#![allow(clippy::unwrap_used)]
+#![allow(clippy::expect_used)]
+#![allow(clippy::use_debug)]
+
+use std::fs::File;
+use std::io;
+use std::path::Path;
+use std::process::Command;
+
+use fastlanes::FastLanes;
+
+use crate::cuda_kernel_generator::IndentedWriter;
+use crate::cuda_kernel_generator::generate_cuda_unpack_for_width;
+
+mod cuda_kernel_generator;
+
+/// Build-script entry point: generates the CUDA bit-unpacking kernel sources and, when the
+/// toolchain is present, compiles them to PTX.
+///
+/// The `.cu` sources for `u8`, `u16`, `u32` and `u64` are always (re)generated into
+/// `<CARGO_MANIFEST_DIR>/kernels`. Compilation is attempted only on Linux hosts where
+/// `nvcc --version` succeeds; only then is the `cuda_available` cfg emitted.
+///
+/// # Panics
+///
+/// Panics if `CARGO_MANIFEST_DIR` is unset, if the kernels directory cannot be created, if a
+/// kernel source cannot be generated, or if `nvcc` fails on any `.cu` file.
+fn main() {
+    // Declare the cfg so rustc doesn't warn about unexpected cfg.
+    println!("cargo::rustc-check-cfg=cfg(cuda_available)");
+
+    let manifest_dir = std::env::var("CARGO_MANIFEST_DIR").expect("Failed to get manifest dir");
+    let kernels_dir = Path::new(&manifest_dir).join("kernels");
+
+    // The generated sources are written into this directory, so make sure it exists before
+    // any file is created inside it.
+    std::fs::create_dir_all(&kernels_dir).expect("Failed to create kernels dir");
+
+    // `display()` instead of `to_str().unwrap()`: a non-UTF-8 checkout path must not abort the
+    // build, and this matches how the per-file directives below are printed.
+    println!("cargo:rerun-if-changed={}", kernels_dir.display());
+
+    generate_unpack::<u8>(&kernels_dir, 32).expect("Failed to generate unpack for u8");
+    generate_unpack::<u16>(&kernels_dir, 32).expect("Failed to generate unpack for u16");
+    generate_unpack::<u32>(&kernels_dir, 32).expect("Failed to generate unpack for u32");
+    generate_unpack::<u64>(&kernels_dir, 16).expect("Failed to generate unpack for u64");
+
+    if cfg!(not(target_os = "linux")) || !has_nvcc() {
+        // CUDA is only supported on Linux right now, and only when `nvcc` is installed.
+        return;
+    }
+
+    if let Ok(entries) = std::fs::read_dir(&kernels_dir) {
+        for path in entries.flatten().map(|entry| entry.path()) {
+            match path.extension().and_then(|e| e.to_str()) {
+                // Track header files - changes should trigger recompilation of all .cu files
+                Some("cuh") => {
+                    println!("cargo:rerun-if-changed={}", path.display());
+                }
+                // Compile .cu files to PTX
+                Some("cu") => {
+                    println!("cargo:rerun-if-changed={}", path.display());
+                    nvcc_compile_ptx(&kernels_dir, &path)
+                        .map_err(|e| {
+                            format!("Failed to compile CUDA kernel {}: {}", path.display(), e)
+                        })
+                        .unwrap();
+                }
+                _ => {}
+            }
+        }
+    }
+
+    // Signal that CUDA kernels are available for conditional compilation.
+    println!("cargo:rustc-cfg=cuda_available");
+}
+
+/// Writes the generated unpack kernels for element type `T` to
+/// `<output_dir>/bit_unpack_<T::T>.cu`, replacing any existing file.
+///
+/// `thread_count` is forwarded unchanged to [`generate_cuda_unpack_for_width`].
+///
+/// # Errors
+///
+/// Returns an error if the file cannot be created, if generation fails, or if the buffered
+/// output cannot be flushed to disk.
+fn generate_unpack<T: FastLanes>(output_dir: &Path, thread_count: usize) -> io::Result<()> {
+    let cu_path = output_dir.join(format!("bit_unpack_{}.cu", T::T));
+
+    // The generator issues one small write per emitted line; buffer them so that each line
+    // does not turn into its own write syscall on the underlying file.
+    let mut cu_file = io::BufWriter::new(File::create(&cu_path)?);
+    generate_cuda_unpack_for_width::<T, _>(&mut IndentedWriter::new(&mut cu_file), thread_count)?;
+
+    // Flush explicitly: dropping a `BufWriter` silently discards an error from the final write.
+    io::Write::flush(&mut cu_file)
+}
+
+/// Compiles `cu_path` to PTX with `nvcc`, writing the result next to the source with a `.ptx`
+/// extension. `kernel_dir` is passed to `nvcc` as an include path.
+///
+/// On failure the command line and the captured stdout/stderr are surfaced as
+/// `cargo:warning=` lines before the error is returned.
+///
+/// # Errors
+///
+/// Returns an error if `nvcc` cannot be spawned or exits with a non-zero status.
+///
+/// # Panics
+///
+/// Panics if the `PROFILE` environment variable is not set.
+fn nvcc_compile_ptx(kernel_dir: &Path, cu_path: &Path) -> io::Result<()> {
+    // https://doc.rust-lang.org/cargo/reference/environment-variables.html#environment-variables-cargo-sets-for-build-scripts
+    let profile = std::env::var("PROFILE").unwrap();
+    let ptx_path = cu_path.with_extension("ptx");
+
+    let mut cmd = Command::new("nvcc");
+    match profile.as_str() {
+        // NVCC debugging options:
+        // https://docs.nvidia.com/cuda/cuda-programming-guide/02-basics/nvcc.html#debugging-options
+        //
+        // -g         debug symbols for host code
+        // -G         debug symbols for device code
+        // -lineinfo  line-number information for device code; does not affect execution
+        //            performance and is useful together with the compute-sanitizer tool
+        //
+        // CUDA Sanitizers
+        // - memory: https://docs.nvidia.com/compute-sanitizer/ComputeSanitizer/index.html#using-memcheck
+        // - thread: https://docs.nvidia.com/compute-sanitizer/ComputeSanitizer/index.html#using-racecheck
+        // - init: https://docs.nvidia.com/compute-sanitizer/ComputeSanitizer/index.html#using-initcheck
+        // - synchronize : https://docs.nvidia.com/compute-sanitizer/ComputeSanitizer/index.html#using-synccheck
+        "debug" => cmd.args(["-O0", "-g", "-G", "-lineinfo"]),
+        _ => cmd.arg("-O3"),
+    };
+
+    cmd.args([
+        "-std=c++17",
+        "-arch=native",
+        // Flags forwarded to the host compiler.
+        "--compiler-options=-Wall -Wextra -Wpedantic -Werror",
+        "--restrict",
+        "--ptx",
+        "--include-path",
+    ])
+    .arg(kernel_dir)
+    .arg("-c")
+    .arg(cu_path)
+    .arg("-o")
+    .arg(&ptx_path);
+
+    let res = cmd.output()?;
+    if res.status.success() {
+        return Ok(());
+    }
+
+    // Cargo only shows build-script output that is routed through `cargo:warning=`.
+    println!(
+        "cargo:warning=Failed to compile CUDA kernel: {}",
+        cu_path.display()
+    );
+    println!("cargo:warning=Command: {:?}", cmd);
+
+    // `lines()` yields nothing for an empty capture, so no emptiness check is needed.
+    for line in String::from_utf8_lossy(&res.stdout).lines() {
+        println!("cargo:warning=stdout: {}", line);
+    }
+    for line in String::from_utf8_lossy(&res.stderr).lines() {
+        println!("cargo:warning=stderr: {}", line);
+    }
+
+    Err(io::Error::other(format!(
+        "nvcc compilation failed for {}",
+        cu_path.display()
+    )))
+}
+
+/// Reports whether `nvcc --version` can be spawned and exits successfully.
+fn has_nvcc() -> bool {
+    match Command::new("nvcc").arg("--version").output() {
+        Ok(probe) => probe.status.success(),
+        // Failing to spawn the process (e.g. binary not found) counts as "not available".
+        Err(_) => false,
+    }
+}
diff --git a/encodings/fastlanes/cuda_kernel_generator/indent.rs b/encodings/fastlanes/cuda_kernel_generator/indent.rs
@@ -0,0 +1,41 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright the Vortex contributors
+
+use std::fmt;
+use std::io;
+use std::io::Write;
+
+/// Wrapper around a [`Write`] sink that prefixes each formatted write with the current
+/// indentation.
+///
+/// It exposes an inherent `write_fmt`, so `write!` / `writeln!` can target it directly. The
+/// indentation is emitted once per `write_fmt` call, i.e. callers are expected to write one
+/// line per call.
+pub struct IndentedWriter<W: Write> {
+    /// Underlying sink receiving the indented output.
+    write: W,
+    /// Whitespace prepended to every non-blank write.
+    indent: String,
+}
+
+impl<W: Write> IndentedWriter<W> {
+    /// Creates a writer with no indentation.
+    pub fn new(write: W) -> Self {
+        Self {
+            write,
+            indent: String::new(),
+        }
+    }
+
+    /// Runs `indented` with the indentation increased by four spaces, restoring the previous
+    /// level afterwards regardless of whether the closure succeeded.
+    ///
+    /// # Errors
+    ///
+    /// Will return Err if writing to the underlying writer fails.
+    pub fn indent<F>(&mut self, indented: F) -> io::Result<()>
+    where
+        F: FnOnce(&mut IndentedWriter<W>) -> io::Result<()>,
+    {
+        // Remember only the length: truncating back avoids cloning the whole prefix.
+        let outer_len = self.indent.len();
+        self.indent.push_str("    ");
+        let res = indented(self);
+        self.indent.truncate(outer_len);
+        res
+    }
+
+    /// Writes the current indentation followed by the formatted arguments.
+    ///
+    /// A bare newline (what `writeln!(writer)` produces) is written without the indentation so
+    /// that blank lines in the output carry no trailing whitespace.
+    ///
+    /// # Errors
+    ///
+    /// Will return Err if writing to the underlying writer fails.
+    pub fn write_fmt(&mut self, fmt: fmt::Arguments<'_>) -> io::Result<()> {
+        if fmt.as_str() == Some("\n") {
+            return self.write.write_all(b"\n");
+        }
+        write!(self.write, "{}{}", self.indent, fmt)
+    }
+}
diff --git a/encodings/fastlanes/cuda_kernel_generator/mod.rs b/encodings/fastlanes/cuda_kernel_generator/mod.rs
@@ -0,0 +1,169 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright the Vortex contributors
+
+mod indent;
+
+use std::io;
+use std::io::Write;
+
+use fastlanes::FastLanes;
+pub use indent::IndentedWriter;
+
+/// Emits the `__device__` function `_fls_unpack_<bit_width>bw_<bits>ow_lane`, which unpacks
+/// one lane: it writes `bits` output values (one per row) via `out[INDEX(row, lane)]`.
+///
+/// Three shapes are generated:
+/// * `bit_width == 0`: every output is assigned zero, `in` is not read;
+/// * `bit_width == bits`: every output is copied from `in[LANE_COUNT * row + lane]`;
+/// * otherwise: values are extracted with shifts and `MASK(..)`, loading the next input word
+///   whenever a value ends at or crosses a word boundary.
+///
+/// `INDEX` and `MASK` are not defined by the generated text itself (the generated file
+/// includes `fastlanes_common.cuh`).
+///
+/// # Errors
+///
+/// Will return Err if writing to the underlying writer fails.
+fn generate_lane_decoder<T: FastLanes, W: Write>(
+    output: &mut IndentedWriter<W>,
+    bit_width: usize,
+) -> io::Result<()> {
+    let bits = T::T;
+    let lane_count = T::LANES;
+
+    writeln!(
+        output,
+        "__device__ void _fls_unpack_{bit_width}bw_{bits}ow_lane(const uint{bits}_t *__restrict in, uint{bits}_t *__restrict out, unsigned int lane) {{"
+    )?;
+
+    output.indent(|body| {
+        writeln!(body, "unsigned int LANE_COUNT = {lane_count};")?;
+
+        // Nothing is packed: all outputs are zero.
+        if bit_width == 0 {
+            writeln!(body, "uint{bits}_t zero = 0ULL;")?;
+            writeln!(body)?;
+            for row in 0..bits {
+                writeln!(body, "out[INDEX({row}, lane)] = zero;")?;
+            }
+            return Ok(());
+        }
+
+        // Values occupy a full word each: plain copy.
+        if bit_width == bits {
+            writeln!(body)?;
+            for row in 0..bits {
+                writeln!(
+                    body,
+                    "out[INDEX({row}, lane)] = in[LANE_COUNT * {row} + lane];",
+                )?;
+            }
+            return Ok(());
+        }
+
+        writeln!(body, "uint{bits}_t src;")?;
+        writeln!(body, "uint{bits}_t tmp;")?;
+        writeln!(body)?;
+        writeln!(body, "src = in[lane];")?;
+
+        for row in 0..bits {
+            // Bit range [start_bit, end_bit) of this row's value within the lane's packed stream.
+            let start_bit = row * bit_width;
+            let end_bit = start_bit + bit_width;
+            let shift = start_bit % bits;
+            let next_word = end_bit / bits;
+
+            if next_word > start_bit / bits {
+                // The value ends at or spills past the end of the current word.
+                let remaining_bits = end_bit % bits;
+                let current_bits = bit_width - remaining_bits;
+                writeln!(
+                    body,
+                    "tmp = (src >> {shift}) & MASK(uint{bits}_t, {current_bits});"
+                )?;
+
+                // Only `bit_width` words are addressed per lane, so the last row must not
+                // load another one.
+                if next_word < bit_width {
+                    writeln!(body, "src = in[lane + LANE_COUNT * {next_word}];")?;
+                    writeln!(
+                        body,
+                        "tmp |= (src & MASK(uint{bits}_t, {remaining_bits})) << {current_bits};"
+                    )?;
+                }
+            } else {
+                // The value sits entirely inside the current word.
+                writeln!(
+                    body,
+                    "tmp = (src >> {shift}) & MASK(uint{bits}_t, {bit_width});"
+                )?;
+            }
+
+            writeln!(body, "out[INDEX({row}, lane)] = tmp;")?;
+        }
+        Ok(())
+    })?;
+
+    writeln!(output, "}}")
+}
+
+/// Emits the `__device__` function `_fls_unpack_<bit_width>bw_<bits>ow_<thread_count>t`, which
+/// makes one thread decode its share of lanes by calling the per-lane decoder
+/// `LANES / thread_count` times with lane indices
+/// `thread_idx * (LANES / thread_count) + k`.
+///
+/// # Errors
+///
+/// Returns [`io::ErrorKind::InvalidInput`] if `thread_count` is zero or does not evenly divide
+/// `T::LANES`: the integer division below would otherwise panic or silently leave lanes
+/// undecoded. Will also return Err if writing to the underlying writer fails.
+fn generate_device_kernel_for_width<T: FastLanes, W: Write>(
+    output: &mut IndentedWriter<W>,
+    bit_width: usize,
+    thread_count: usize,
+) -> io::Result<()> {
+    let bits = <T>::T;
+    let lanes = T::LANES;
+
+    if thread_count == 0 || lanes % thread_count != 0 {
+        return Err(io::Error::new(
+            io::ErrorKind::InvalidInput,
+            format!(
+                "thread_count {thread_count} must be non-zero and evenly divide the {lanes} lanes of uint{bits}_t"
+            ),
+        ));
+    }
+    let per_thread_loop_count = lanes / thread_count;
+
+    let func_name = format!("fls_unpack_{bit_width}bw_{bits}ow_{thread_count}t");
+
+    let local_func_params = format!(
+        "(const uint{bits}_t *__restrict in, uint{bits}_t *__restrict out, int thread_idx)"
+    );
+
+    writeln!(output, "__device__ void _{func_name}{local_func_params} {{")?;
+
+    output.indent(|output| {
+        // The calls are emitted unrolled, one per lane handled by the thread.
+        for thread_lane in 0..per_thread_loop_count {
+            writeln!(output, "_fls_unpack_{bit_width}bw_{bits}ow_lane(in, out, thread_idx * {per_thread_loop_count} + {thread_lane});")?;
+        }
+        Ok(())
+    })?;
+
+    writeln!(output, "}}")
+}
+
+/// Emits the `extern "C" __global__` entry point
+/// `fls_unpack_<bit_width>bw_<bits>ow_<thread_count>t`.
+///
+/// The generated kernel offsets the input pointer by
+/// `blockIdx.x * (128 * bit_width / sizeof(uint<bits>_t))` elements and the output pointer by
+/// `blockIdx.x * 1024` elements, then delegates to the `_`-prefixed device function of the
+/// same name with `threadIdx.x` as the thread index.
+///
+/// # Errors
+///
+/// Will return Err if writing to the underlying writer fails.
+fn generate_global_kernel_for_width<T: FastLanes, W: Write>(
+    output: &mut IndentedWriter<W>,
+    bit_width: usize,
+    thread_count: usize,
+) -> io::Result<()> {
+    let bits = T::T;
+    let kernel = format!("fls_unpack_{bit_width}bw_{bits}ow_{thread_count}t");
+
+    writeln!(
+        output,
+        "extern \"C\" __global__ void {kernel}(const uint{bits}_t *__restrict full_in, uint{bits}_t *__restrict full_out) {{"
+    )?;
+
+    // Statements of the kernel body, in emission order.
+    let statements = [
+        String::from("int thread_idx = threadIdx.x;"),
+        format!("auto in = full_in + (blockIdx.x * (128 * {bit_width} / sizeof(uint{bits}_t)));"),
+        String::from("auto out = full_out + (blockIdx.x * 1024);"),
+        format!("_{kernel}(in, out, thread_idx);"),
+    ];
+
+    output.indent(|scope| {
+        for statement in &statements {
+            writeln!(scope, "{statement}")?;
+        }
+        Ok(())
+    })?;
+
+    writeln!(output, "}}")
+}
+
+/// Writes a complete CUDA source for element type `T`: a fixed preamble (banner comment and
+/// includes) followed, for every bit width in `0..=T::T`, by the per-lane decoder, the
+/// per-thread device function and the global kernel, each followed by a blank line.
+///
+/// # Errors
+///
+/// Will return Err if writing to the underlying writer fails.
+pub fn generate_cuda_unpack_for_width<T: FastLanes, W: Write>(
+    output: &mut IndentedWriter<W>,
+    thread_count: usize,
+) -> io::Result<()> {
+    const PREAMBLE: [&str; 5] = [
+        "// Auto-generated by vortex-gpu-kernels. Do not edit by hand!",
+        "#include <cuda.h>",
+        "#include <cuda_runtime.h>",
+        "#include <stdint.h>",
+        "#include \"fastlanes_common.cuh\"",
+    ];
+
+    for line in PREAMBLE {
+        writeln!(output, "{line}")?;
+    }
+    writeln!(output)?;
+
+    let max_width = T::T;
+    for bit_width in 0..=max_width {
+        generate_lane_decoder::<T, _>(output, bit_width)?;
+        writeln!(output)?;
+
+        generate_device_kernel_for_width::<T, _>(output, bit_width, thread_count)?;
+        writeln!(output)?;
+
+        generate_global_kernel_for_width::<T, _>(output, bit_width, thread_count)?;
+        writeln!(output)?;
+    }
+
+    Ok(())
+}
diff --git a/encodings/fastlanes/kernels/.gitkeep b/encodings/fastlanes/kernels/.gitkeep