Skip to content

Commit ac7da6d

Browse files
committed
feature: Add generator for fastlanes bit unpacking cuda kernels
Signed-off-by: Robert Kruszewski <github@robertk.io>
1 parent 1a6ece1 commit ac7da6d

File tree

6 files changed

+365
-0
lines changed

6 files changed

+365
-0
lines changed

encodings/fastlanes/.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
kernels

encodings/fastlanes/Cargo.toml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,9 @@ vortex-alp = { path = "../alp" }
4242
vortex-array = { workspace = true, features = ["_test-harness"] }
4343
vortex-fastlanes = { path = ".", features = ["_test-harness"] }
4444

45+
[build-dependencies]
46+
fastlanes = { workspace = true }
47+
4548
[features]
4649
_test-harness = ["dep:rand"]
4750

encodings/fastlanes/build.rs

Lines changed: 151 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,151 @@
1+
// SPDX-License-Identifier: Apache-2.0
2+
// SPDX-FileCopyrightText: Copyright the Vortex contributors
3+
4+
#![allow(clippy::unwrap_used)]
5+
#![allow(clippy::expect_used)]
6+
#![allow(clippy::use_debug)]
7+
8+
use std::fs::File;
9+
use std::io;
10+
use std::path::Path;
11+
use std::process::Command;
12+
13+
use fastlanes::FastLanes;
14+
15+
use crate::cuda_kernel_generator::IndentedWriter;
16+
use crate::cuda_kernel_generator::generate_cuda_unpack_for_width;
17+
18+
mod cuda_kernel_generator;
19+
20+
fn main() {
21+
// Declare the cfg so rustc doesn't warn about unexpected cfg.
22+
println!("cargo::rustc-check-cfg=cfg(cuda_available)");
23+
24+
let manifest_dir = std::env::var("CARGO_MANIFEST_DIR").expect("Failed to get manifest dir");
25+
let kernels_dir = Path::new(&manifest_dir).join("kernels");
26+
27+
println!("cargo:rerun-if-changed={}", kernels_dir.to_str().unwrap());
28+
29+
generate_unpack::<u8>(&kernels_dir, 32).expect("Failed to generate unpack for u8");
30+
generate_unpack::<u16>(&kernels_dir, 32).expect("Failed to generate unpack for u16");
31+
generate_unpack::<u32>(&kernels_dir, 32).expect("Failed to generate unpack for u32");
32+
generate_unpack::<u64>(&kernels_dir, 16).expect("Failed to generate unpack for u64");
33+
34+
if cfg!(not(target_os = "linux")) || !has_nvcc() {
35+
// cuda is only support on linux right now
36+
return;
37+
}
38+
39+
if let Ok(entries) = std::fs::read_dir(&kernels_dir) {
40+
for path in entries.flatten().map(|entry| entry.path()) {
41+
match path.extension().and_then(|e| e.to_str()) {
42+
// Track header files - changes should trigger recompilation of all .cu files
43+
Some("cuh") => {
44+
println!("cargo:rerun-if-changed={}", path.display());
45+
}
46+
// Compile .cu files to PTX
47+
Some("cu") => {
48+
println!("cargo:rerun-if-changed={}", path.display());
49+
nvcc_compile_ptx(&kernels_dir, &path)
50+
.map_err(|e| {
51+
format!("Failed to compile CUDA kernel {}: {}", path.display(), e)
52+
})
53+
.unwrap();
54+
}
55+
_ => {}
56+
}
57+
}
58+
}
59+
60+
// Signal that CUDA kernels are available for conditional compilation.
61+
println!("cargo:rustc-cfg=cuda_available");
62+
}
63+
64+
fn generate_unpack<T: FastLanes>(output_dir: &Path, thread_count: usize) -> io::Result<()> {
65+
let cu_path = output_dir.join(&format!("bit_unpack_{}.cu", T::T));
66+
let mut cu_file = File::create(&cu_path)?;
67+
let mut cu_writer = IndentedWriter::new(&mut cu_file);
68+
generate_cuda_unpack_for_width::<T, _>(&mut cu_writer, thread_count)
69+
}
70+
71+
fn nvcc_compile_ptx(kernel_dir: &Path, cu_path: &Path) -> io::Result<()> {
72+
// https://doc.rust-lang.org/cargo/reference/environment-variables.html#environment-variables-cargo-sets-for-build-scripts
73+
let profile = std::env::var("PROFILE").unwrap();
74+
75+
let mut cmd = Command::new("nvcc");
76+
if profile.as_str() == "debug" {
77+
cmd.arg("-O0");
78+
79+
// NVCC debugging options:
80+
// https://docs.nvidia.com/cuda/cuda-programming-guide/02-basics/nvcc.html#debugging-options
81+
82+
// Include debug symbols for host code.
83+
cmd.arg("-g");
84+
85+
// Include debug symbols for device code.
86+
cmd.arg("-G");
87+
88+
// Generate line-number information for device code. This option does
89+
// not affect execution performance and is useful in conjunction with
90+
// the compute-sanitizer tool to trace the kernel execution.
91+
cmd.arg("-lineinfo");
92+
93+
// CUDA Sanitizers
94+
// - memory: https://docs.nvidia.com/compute-sanitizer/ComputeSanitizer/index.html#using-memcheck
95+
// - thread: https://docs.nvidia.com/compute-sanitizer/ComputeSanitizer/index.html#using-racecheck
96+
// - init: https://docs.nvidia.com/compute-sanitizer/ComputeSanitizer/index.html#using-initcheck
97+
// - synchronize : https://docs.nvidia.com/compute-sanitizer/ComputeSanitizer/index.html#using-synccheck
98+
} else {
99+
cmd.arg("-O3");
100+
}
101+
102+
cmd.arg("-std=c++17")
103+
.arg("-arch=native")
104+
// Flags forwarded to Clang.
105+
.arg("--compiler-options=-Wall -Wextra -Wpedantic -Werror")
106+
.arg("--restrict")
107+
.arg("--ptx")
108+
.arg("--include-path")
109+
.arg(kernel_dir)
110+
.arg("-c")
111+
.arg(cu_path)
112+
.arg("-o")
113+
.arg(cu_path.with_extension("ptx"));
114+
115+
let res = cmd.output()?;
116+
117+
if !res.status.success() {
118+
let stderr = String::from_utf8_lossy(&res.stderr);
119+
let stdout = String::from_utf8_lossy(&res.stdout);
120+
121+
println!(
122+
"cargo:warning=Failed to compile CUDA kernel: {}",
123+
cu_path.display()
124+
);
125+
println!("cargo:warning=Command: {:?}", cmd);
126+
127+
if !stdout.is_empty() {
128+
for line in stdout.lines() {
129+
println!("cargo:warning=stdout: {}", line);
130+
}
131+
}
132+
if !stderr.is_empty() {
133+
for line in stderr.lines() {
134+
println!("cargo:warning=stderr: {}", line);
135+
}
136+
}
137+
138+
return Err(io::Error::other(format!(
139+
"nvcc compilation failed for {}",
140+
cu_path.display()
141+
)));
142+
}
143+
Ok(())
144+
}
145+
146+
fn has_nvcc() -> bool {
147+
Command::new("nvcc")
148+
.arg("--version")
149+
.output()
150+
.is_ok_and(|o| o.status.success())
151+
}
Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
// SPDX-License-Identifier: Apache-2.0
2+
// SPDX-FileCopyrightText: Copyright the Vortex contributors
3+
4+
use std::fmt;
5+
use std::io;
6+
use std::io::Write;
7+
8+
pub struct IndentedWriter<W: Write> {
9+
write: W,
10+
indent: String,
11+
}
12+
13+
impl<W: Write> IndentedWriter<W> {
14+
pub fn new(write: W) -> Self {
15+
Self {
16+
write,
17+
indent: String::new(),
18+
}
19+
}
20+
21+
/// # Errors
22+
///
23+
/// Will return Err if writing to the underlying writer fails.
24+
pub fn indent<F>(&mut self, indented: F) -> io::Result<()>
25+
where
26+
F: FnOnce(&mut IndentedWriter<W>) -> io::Result<()>,
27+
{
28+
let original_ident = self.indent.clone();
29+
self.indent += " ";
30+
let res = indented(self);
31+
self.indent = original_ident;
32+
res
33+
}
34+
35+
/// # Errors
36+
///
37+
/// Will return Err if writing to the underlying writer fails.
38+
pub fn write_fmt(&mut self, fmt: fmt::Arguments<'_>) -> io::Result<()> {
39+
write!(self.write, "{}{}", self.indent, fmt)
40+
}
41+
}
Lines changed: 169 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,169 @@
1+
// SPDX-License-Identifier: Apache-2.0
2+
// SPDX-FileCopyrightText: Copyright the Vortex contributors
3+
4+
mod indent;
5+
6+
use std::io;
7+
use std::io::Write;
8+
9+
use fastlanes::FastLanes;
10+
pub use indent::IndentedWriter;
11+
12+
fn generate_lane_decoder<T: FastLanes, W: Write>(
13+
output: &mut IndentedWriter<W>,
14+
bit_width: usize,
15+
) -> io::Result<()> {
16+
let bits = <T>::T;
17+
let lanes = T::LANES;
18+
19+
let func_name = format!("fls_unpack_{bit_width}bw_{bits}ow_lane");
20+
21+
writeln!(
22+
output,
23+
"__device__ void _{func_name}(const uint{bits}_t *__restrict in, uint{bits}_t *__restrict out, unsigned int lane) {{"
24+
)?;
25+
26+
output.indent(|output| {
27+
writeln!(output, "unsigned int LANE_COUNT = {lanes};")?;
28+
if bit_width == 0 {
29+
writeln!(output, "uint{bits}_t zero = 0ULL;")?;
30+
writeln!(output)?;
31+
for row in 0..bits {
32+
writeln!(output, "out[INDEX({row}, lane)] = zero;")?;
33+
}
34+
} else if bit_width == bits {
35+
writeln!(output)?;
36+
for row in 0..bits {
37+
writeln!(
38+
output,
39+
"out[INDEX({row}, lane)] = in[LANE_COUNT * {row} + lane];",
40+
)?;
41+
}
42+
} else {
43+
writeln!(output, "uint{bits}_t src;")?;
44+
writeln!(output, "uint{bits}_t tmp;")?;
45+
46+
writeln!(output)?;
47+
writeln!(output, "src = in[lane];")?;
48+
for row in 0..bits {
49+
let curr_word = (row * bit_width) / bits;
50+
let next_word = ((row + 1) * bit_width) / bits;
51+
let shift = (row * bit_width) % bits;
52+
53+
if next_word > curr_word {
54+
let remaining_bits = ((row + 1) * bit_width) % bits;
55+
let current_bits = bit_width - remaining_bits;
56+
writeln!(
57+
output,
58+
"tmp = (src >> {shift}) & MASK(uint{bits}_t, {current_bits});"
59+
)?;
60+
61+
if next_word < bit_width {
62+
writeln!(output, "src = in[lane + LANE_COUNT * {next_word}];")?;
63+
writeln!(
64+
output,
65+
"tmp |= (src & MASK(uint{bits}_t, {remaining_bits})) << {current_bits};"
66+
)?;
67+
}
68+
} else {
69+
writeln!(
70+
output,
71+
"tmp = (src >> {shift}) & MASK(uint{bits}_t, {bit_width});"
72+
)?;
73+
}
74+
75+
writeln!(output, "out[INDEX({row}, lane)] = tmp;")?;
76+
}
77+
}
78+
Ok(())
79+
})?;
80+
81+
writeln!(output, "}}")
82+
}
83+
84+
fn generate_device_kernel_for_width<T: FastLanes, W: Write>(
85+
output: &mut IndentedWriter<W>,
86+
bit_width: usize,
87+
thread_count: usize,
88+
) -> io::Result<()> {
89+
let bits = <T>::T;
90+
let lanes = T::LANES;
91+
let per_thread_loop_count = lanes / thread_count;
92+
93+
let func_name = format!("fls_unpack_{bit_width}bw_{bits}ow_{thread_count}t");
94+
95+
let local_func_params = format!(
96+
"(const uint{bits}_t *__restrict in, uint{bits}_t *__restrict out, int thread_idx)"
97+
);
98+
99+
writeln!(output, "__device__ void _{func_name}{local_func_params} {{")?;
100+
101+
output.indent(|output| {
102+
for thread_lane in 0..per_thread_loop_count {
103+
writeln!(output, "_fls_unpack_{bit_width}bw_{bits}ow_lane(in, out, thread_idx * {per_thread_loop_count} + {thread_lane});")?;
104+
}
105+
Ok(())
106+
})?;
107+
108+
writeln!(output, "}}")
109+
}
110+
111+
fn generate_global_kernel_for_width<T: FastLanes, W: Write>(
112+
output: &mut IndentedWriter<W>,
113+
bit_width: usize,
114+
thread_count: usize,
115+
) -> io::Result<()> {
116+
let bits = <T>::T;
117+
118+
let func_name = format!("fls_unpack_{bit_width}bw_{bits}ow_{thread_count}t");
119+
let func_params =
120+
format!("(const uint{bits}_t *__restrict full_in, uint{bits}_t *__restrict full_out)");
121+
122+
writeln!(
123+
output,
124+
"extern \"C\" __global__ void {func_name}{func_params} {{"
125+
)?;
126+
127+
output.indent(|output| {
128+
writeln!(output, "int thread_idx = threadIdx.x;")?;
129+
writeln!(
130+
output,
131+
"auto in = full_in + (blockIdx.x * (128 * {bit_width} / sizeof(uint{bits}_t)));"
132+
)?;
133+
writeln!(output, "auto out = full_out + (blockIdx.x * 1024);")?;
134+
135+
writeln!(output, "_{func_name}(in, out, thread_idx);")
136+
})?;
137+
138+
writeln!(output, "}}")
139+
}
140+
141+
/// # Errors
142+
///
143+
/// Will return Err if writing to the underlying writer fails.
144+
pub fn generate_cuda_unpack_for_width<T: FastLanes, W: Write>(
145+
output: &mut IndentedWriter<W>,
146+
thread_count: usize,
147+
) -> io::Result<()> {
148+
writeln!(
149+
output,
150+
"// Auto-generated by vortex-gpu-kernels. Do not edit by hand!"
151+
)?;
152+
writeln!(output, "#include <cuda.h>")?;
153+
writeln!(output, "#include <cuda_runtime.h>")?;
154+
writeln!(output, "#include <stdint.h>")?;
155+
writeln!(output, "#include \"fastlanes_common.cuh\"")?;
156+
writeln!(output)?;
157+
158+
for bit_width in 0..=<T>::T {
159+
generate_lane_decoder::<T, _>(output, bit_width)?;
160+
writeln!(output)?;
161+
generate_device_kernel_for_width::<T, _>(output, bit_width, thread_count)?;
162+
writeln!(output)?;
163+
164+
generate_global_kernel_for_width::<T, _>(output, bit_width, thread_count)?;
165+
writeln!(output)?;
166+
}
167+
168+
Ok(())
169+
}

encodings/fastlanes/kernels/.gitkeep

Whitespace-only changes.

0 commit comments

Comments
 (0)