10 changes: 5 additions & 5 deletions vortex-gpu/kernels/dict_take.cu
@@ -12,16 +12,16 @@ __device__ void dict_take(
     ValueT *__restrict values_out
 ) {
     auto i = threadIdx.x;
-    auto block_offset = (blockIdx.x * 1024);
+    auto block_offset = blockIdx.x * 1024;

     auto codes = codes_array + block_offset;
     auto out = values_out + block_offset;

     const int thread_ops = 32;

     for (auto j = 0; j < thread_ops; j++) {
         auto idx = i * thread_ops + j;
         out[idx] = values[codes[idx]];
     }
 }

@@ -33,8 +33,8 @@ __device__ void dict_take_masked(
     ValueT *__restrict values_out
 ) {
     auto i = threadIdx.x;
-    auto block_offset = (blockIdx.x * 1024);
-    auto mask_block_offset = (blockIdx.x * (1024 / 32));
+    auto block_offset = blockIdx.x * 1024;
+    auto mask_block_offset = blockIdx.x * (1024 / 32);

     auto codes = codes_array + block_offset;
     auto mask = mask_array + mask_block_offset;
143 changes: 143 additions & 0 deletions vortex-gpu/kernels/rle_decompress.cu
Contributor:
any reason to avoid floats in the ValueT?

Contributor (Author):
there's not, I was working on tests to improve the coverage

@@ -0,0 +1,143 @@
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright the Vortex contributors

#include <cuda.h>
#include <cuda_runtime.h>
#include <stdint.h>

template<typename IndicesT, typename ValueT, typename OffsetsT>
__device__ void rle_decompress(
    const IndicesT *__restrict indices_array,
    const ValueT *__restrict values_array,
    const OffsetsT *__restrict offsets,
    ValueT *__restrict values_out
) {
    auto i = threadIdx.x;
    auto block_offset = blockIdx.x * 1024;

    auto indices = indices_array + block_offset;
    auto out = values_out + block_offset;
    auto values = values_array + offsets[blockIdx.x];

    const int thread_ops = 32;

    for (auto j = 0; j < thread_ops; j++) {
        auto idx = i * thread_ops + j;
        out[idx] = values[indices[idx]];
    }
}
Comment on lines +9 to +28:

Contributor:
We will need a fl-transposed iterator order version?

Contributor:
Would you mind adding a single fused bp-rle kernel, where we fuse bit-unpacking the indices and RLE decoding?

It can be in a follow-up, but while you understand how the decode works we should merge something showing that.
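A minimal sketch of what such a fused kernel could look like (an illustration, not part of this PR): it assumes the run indices are bit-packed in plain little-endian order with a bit width that divides 32, so no index straddles a 32-bit word, whereas the real layout is FastLanes-transposed and a production version would reuse the fls unpack ordering instead. All names below are illustrative.

#include <stdint.h>

// Hypothetical fused bit-unpack + RLE-decode kernel (sketch only).
// Assumes plain little-endian packing with BIT_WIDTH dividing 32, not the
// transposed FastLanes layout used by the generated fls kernels.
template<typename ValueT, typename OffsetsT, int BIT_WIDTH>
__device__ void bp_rle_decompress(
    const uint32_t *__restrict packed_indices,
    const ValueT *__restrict values_array,
    const OffsetsT *__restrict offsets,
    ValueT *__restrict values_out
) {
    auto i = threadIdx.x;
    auto block_offset = blockIdx.x * 1024;

    // Each block owns 1024 packed indices, i.e. 1024 * BIT_WIDTH / 32 words.
    auto packed = packed_indices + blockIdx.x * (1024 * BIT_WIDTH / 32);
    auto values = values_array + offsets[blockIdx.x];
    auto out = values_out + block_offset;

    const int thread_ops = 32;
    const uint32_t mask = static_cast<uint32_t>((uint64_t{1} << BIT_WIDTH) - 1);

    for (auto j = 0; j < thread_ops; j++) {
        auto idx = i * thread_ops + j;
        // Unpack the idx-th index in registers instead of materialising it,
        // then gather the run value, as rle_decompress does above.
        auto bit_pos = idx * BIT_WIDTH;
        auto index = (packed[bit_pos / 32] >> (bit_pos % 32)) & mask;
        out[idx] = values[index];
    }
}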


// Macro to generate the extern "C" wrapper for each type combination
#define GENERATE_KERNEL(indices_suffix, value_suffix, offsets_suffix, IndicesType, ValueType, OffsetsType) \
extern "C" __global__ void rle_decompress_i##indices_suffix##_v##value_suffix##_o##offsets_suffix( \
    const IndicesType *__restrict indices_array, \
    const ValueType *__restrict values_array, \
    const OffsetsType *__restrict offsets, \
    ValueType *__restrict values_out \
) { \
    rle_decompress(indices_array, values_array, offsets, values_out); \
}
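For reference, one instantiation such as GENERATE_KERNEL(u16, f32, u32, uint16_t, float, uint32_t) below expands to an extern "C" wrapper like the following; the launch shape in the trailing comment is an assumption derived from the 1024-values-per-block, 32-ops-per-thread layout of rle_decompress above, not something this file specifies.

// Expansion of one instantiation, shown for reference only:
extern "C" __global__ void rle_decompress_iu16_vf32_ou32(
    const uint16_t *__restrict indices_array,
    const float *__restrict values_array,
    const uint32_t *__restrict offsets,
    float *__restrict values_out
) {
    rle_decompress(indices_array, values_array, offsets, values_out);
}

// Hypothetical host-side launch for n output values (assuming n is a multiple
// of 1024): one block per 1024 values, 32 threads per block.
//   auto num_blocks = n / 1024;
//   rle_decompress_iu16_vf32_ou32<<<num_blocks, 32>>>(
//       d_indices, d_values, d_offsets, d_out);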

// Generate all combinations
// Unsigned types
GENERATE_KERNEL(u8, u8, u8, uint8_t, uint8_t, uint8_t)
GENERATE_KERNEL(u8, u8, u16, uint8_t, uint8_t, uint16_t)
GENERATE_KERNEL(u8, u8, u32, uint8_t, uint8_t, uint32_t)
GENERATE_KERNEL(u8, u8, u64, uint8_t, uint8_t, uint64_t)

GENERATE_KERNEL(u8, u16, u8, uint8_t, uint16_t, uint8_t)
GENERATE_KERNEL(u8, u16, u16, uint8_t, uint16_t, uint16_t)
GENERATE_KERNEL(u8, u16, u32, uint8_t, uint16_t, uint32_t)
GENERATE_KERNEL(u8, u16, u64, uint8_t, uint16_t, uint64_t)

GENERATE_KERNEL(u8, u32, u8, uint8_t, uint32_t, uint8_t)
GENERATE_KERNEL(u8, u32, u16, uint8_t, uint32_t, uint16_t)
GENERATE_KERNEL(u8, u32, u32, uint8_t, uint32_t, uint32_t)
GENERATE_KERNEL(u8, u32, u64, uint8_t, uint32_t, uint64_t)

GENERATE_KERNEL(u8, u64, u8, uint8_t, uint64_t, uint8_t)
GENERATE_KERNEL(u8, u64, u16, uint8_t, uint64_t, uint16_t)
GENERATE_KERNEL(u8, u64, u32, uint8_t, uint64_t, uint32_t)
GENERATE_KERNEL(u8, u64, u64, uint8_t, uint64_t, uint64_t)

GENERATE_KERNEL(u16, u8, u8, uint16_t, uint8_t, uint8_t)
GENERATE_KERNEL(u16, u8, u16, uint16_t, uint8_t, uint16_t)
GENERATE_KERNEL(u16, u8, u32, uint16_t, uint8_t, uint32_t)
GENERATE_KERNEL(u16, u8, u64, uint16_t, uint8_t, uint64_t)

GENERATE_KERNEL(u16, u16, u8, uint16_t, uint16_t, uint8_t)
GENERATE_KERNEL(u16, u16, u16, uint16_t, uint16_t, uint16_t)
GENERATE_KERNEL(u16, u16, u32, uint16_t, uint16_t, uint32_t)
GENERATE_KERNEL(u16, u16, u64, uint16_t, uint16_t, uint64_t)

GENERATE_KERNEL(u16, u32, u8, uint16_t, uint32_t, uint8_t)
GENERATE_KERNEL(u16, u32, u16, uint16_t, uint32_t, uint16_t)
GENERATE_KERNEL(u16, u32, u32, uint16_t, uint32_t, uint32_t)
GENERATE_KERNEL(u16, u32, u64, uint16_t, uint32_t, uint64_t)

GENERATE_KERNEL(u16, u64, u8, uint16_t, uint64_t, uint8_t)
GENERATE_KERNEL(u16, u64, u16, uint16_t, uint64_t, uint16_t)
GENERATE_KERNEL(u16, u64, u32, uint16_t, uint64_t, uint32_t)
GENERATE_KERNEL(u16, u64, u64, uint16_t, uint64_t, uint64_t)

// Signed types
GENERATE_KERNEL(u8, i8, u8, uint8_t, int8_t, uint8_t)
GENERATE_KERNEL(u8, i8, u16, uint8_t, int8_t, uint16_t)
GENERATE_KERNEL(u8, i8, u32, uint8_t, int8_t, uint32_t)
GENERATE_KERNEL(u8, i8, u64, uint8_t, int8_t, uint64_t)

GENERATE_KERNEL(u8, i16, u8, uint8_t, int16_t, uint8_t)
GENERATE_KERNEL(u8, i16, u16, uint8_t, int16_t, uint16_t)
GENERATE_KERNEL(u8, i16, u32, uint8_t, int16_t, uint32_t)
GENERATE_KERNEL(u8, i16, u64, uint8_t, int16_t, uint64_t)

GENERATE_KERNEL(u8, i32, u8, uint8_t, int32_t, uint8_t)
GENERATE_KERNEL(u8, i32, u16, uint8_t, int32_t, uint16_t)
GENERATE_KERNEL(u8, i32, u32, uint8_t, int32_t, uint32_t)
GENERATE_KERNEL(u8, i32, u64, uint8_t, int32_t, uint64_t)

GENERATE_KERNEL(u8, i64, u8, uint8_t, int64_t, uint8_t)
GENERATE_KERNEL(u8, i64, u16, uint8_t, int64_t, uint16_t)
GENERATE_KERNEL(u8, i64, u32, uint8_t, int64_t, uint32_t)
GENERATE_KERNEL(u8, i64, u64, uint8_t, int64_t, uint64_t)

GENERATE_KERNEL(u16, i8, u8, uint16_t, int8_t, uint8_t)
GENERATE_KERNEL(u16, i8, u16, uint16_t, int8_t, uint16_t)
GENERATE_KERNEL(u16, i8, u32, uint16_t, int8_t, uint32_t)
GENERATE_KERNEL(u16, i8, u64, uint16_t, int8_t, uint64_t)

GENERATE_KERNEL(u16, i16, u8, uint16_t, int16_t, uint8_t)
GENERATE_KERNEL(u16, i16, u16, uint16_t, int16_t, uint16_t)
GENERATE_KERNEL(u16, i16, u32, uint16_t, int16_t, uint32_t)
GENERATE_KERNEL(u16, i16, u64, uint16_t, int16_t, uint64_t)

GENERATE_KERNEL(u16, i32, u8, uint16_t, int32_t, uint8_t)
GENERATE_KERNEL(u16, i32, u16, uint16_t, int32_t, uint16_t)
GENERATE_KERNEL(u16, i32, u32, uint16_t, int32_t, uint32_t)
GENERATE_KERNEL(u16, i32, u64, uint16_t, int32_t, uint64_t)

GENERATE_KERNEL(u16, i64, u8, uint16_t, int64_t, uint8_t)
GENERATE_KERNEL(u16, i64, u16, uint16_t, int64_t, uint16_t)
GENERATE_KERNEL(u16, i64, u32, uint16_t, int64_t, uint32_t)
GENERATE_KERNEL(u16, i64, u64, uint16_t, int64_t, uint64_t)

// Float types
GENERATE_KERNEL(u8, f32, u8, uint8_t, float, uint8_t)
GENERATE_KERNEL(u8, f32, u16, uint8_t, float, uint16_t)
GENERATE_KERNEL(u8, f32, u32, uint8_t, float, uint32_t)
GENERATE_KERNEL(u8, f32, u64, uint8_t, float, uint64_t)

GENERATE_KERNEL(u8, f64, u8, uint8_t, double, uint8_t)
GENERATE_KERNEL(u8, f64, u16, uint8_t, double, uint16_t)
GENERATE_KERNEL(u8, f64, u32, uint8_t, double, uint32_t)
GENERATE_KERNEL(u8, f64, u64, uint8_t, double, uint64_t)

GENERATE_KERNEL(u16, f32, u8, uint16_t, float, uint8_t)
GENERATE_KERNEL(u16, f32, u16, uint16_t, float, uint16_t)
GENERATE_KERNEL(u16, f32, u32, uint16_t, float, uint32_t)
GENERATE_KERNEL(u16, f32, u64, uint16_t, float, uint64_t)

GENERATE_KERNEL(u16, f64, u8, uint16_t, double, uint8_t)
GENERATE_KERNEL(u16, f64, u16, uint16_t, double, uint16_t)
GENERATE_KERNEL(u16, f64, u32, uint16_t, double, uint32_t)
GENERATE_KERNEL(u16, f64, u64, uint16_t, double, uint64_t)
16 changes: 3 additions & 13 deletions vortex-gpu/src/bit_unpack.rs
@@ -4,7 +4,7 @@
 // This code is only exercised on CI with cuda and linux
 #![allow(dead_code)]

-use std::sync::{Arc, LazyLock};
+use std::sync::Arc;
 use std::time::Duration;

 use cudarc::driver::sys::CUevent_flags::CU_EVENT_DEFAULT;
@@ -13,15 +13,13 @@ use cudarc::driver::{
     PushKernelArg,
 };
 use cudarc::nvrtc::Ptx;
-use parking_lot::RwLock;
 use vortex_array::Canonical;
 use vortex_array::arrays::PrimitiveArray;
 use vortex_array::validity::Validity;
 use vortex_buffer::{Buffer, BufferMut};
 use vortex_dtype::{NativePType, PType, match_each_unsigned_integer_ptype};
 use vortex_error::{VortexExpect, VortexResult, vortex_err};
 use vortex_fastlanes::BitPackedArray;
-use vortex_utils::aliases::hash_map::HashMap;

 use crate::task::GPUTask;

@@ -40,24 +38,18 @@ impl UnpackKernelId {
     }
 }

-static CUDA_KERNELS: LazyLock<RwLock<HashMap<UnpackKernelId, CudaFunction>>> =
-    LazyLock::new(|| RwLock::new(HashMap::new()));
-
 fn cuda_bit_unpack_kernel(
     kernel_id: UnpackKernelId,
     ctx: Arc<CudaContext>,
 ) -> VortexResult<CudaFunction> {
-    if let Some(kernel) = CUDA_KERNELS.read().get(&kernel_id) {
-        return Ok(kernel.clone());
-    }
     let module = ctx
         .load_module(Ptx::from_file(format!(
             "kernels/gen/fls_{}_bit_unpack.ptx",
             kernel_id.output_bit_width
         )))
         .map_err(|e| vortex_err!("Failed to load kernel module: {e}"))?;

-    let kernel_func = module
+    module
         .load_function(
             format!(
                 "fls_unpack_{}bw_{}ow_{}t",
@@ -71,9 +63,7 @@ fn cuda_bit_unpack_kernel(
             )
             .as_ref(),
         )
-        .map_err(|e| vortex_err!("Failed to load function: {e}"))?;
-    CUDA_KERNELS.write().insert(kernel_id, kernel_func.clone());
-    Ok(kernel_func)
+        .map_err(|e| vortex_err!("Failed to load function: {e}"))
 }

 pub fn cuda_bit_unpack(
7 changes: 0 additions & 7 deletions vortex-gpu/src/for_.rs
@@ -181,12 +181,5 @@ mod tests {
             primitive_array.as_slice::<u32>(),
             unpacked.as_slice::<u32>()
         );
-        for i in 0..primitive_array.len() {
-            assert_eq!(
-                primitive_array.as_slice::<u32>()[i],
-                unpacked.as_slice::<u32>()[i],
-                "i {i}"
-            );
-        }
     }
 }
1 change: 1 addition & 0 deletions vortex-gpu/src/lib.rs
@@ -4,6 +4,7 @@
 pub mod bit_unpack;
 pub mod for_;
 mod for_bp;
+mod rle_decompress;
 mod take;
 mod task;