Skip to content

Commit 6e11e98

Browse files
committed
scans
Signed-off-by: Robert Kruszewski <[email protected]>
1 parent 97dc3e0 commit 6e11e98

File tree

13 files changed

+96
-77
lines changed

13 files changed

+96
-77
lines changed

Cargo.lock

Lines changed: 1 addition & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

vortex-gpu/src/bit_unpack.rs

Lines changed: 5 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -9,8 +9,7 @@ use std::time::Duration;
99

1010
use cudarc::driver::sys::CUevent_flags::CU_EVENT_DEFAULT;
1111
use cudarc::driver::{
12-
CudaContext, CudaFunction, CudaSlice, CudaStream, CudaViewMut, DeviceRepr, LaunchConfig,
13-
PushKernelArg,
12+
CudaContext, CudaFunction, CudaSlice, CudaStream, DeviceRepr, LaunchConfig, PushKernelArg,
1413
};
1514
use cudarc::nvrtc::Ptx;
1615
use vortex_array::Canonical;
@@ -22,6 +21,7 @@ use vortex_error::{VortexExpect, VortexResult, vortex_err};
2221
use vortex_fastlanes::BitPackedArray;
2322

2423
use crate::task::GPUTask;
24+
use crate::{ErasedCudaSlice, GpuArray};
2525

2626
#[derive(Hash, PartialEq, Eq, Debug)]
2727
struct UnpackKernelId {
@@ -189,7 +189,7 @@ impl<P: NativePType + DeviceRepr> GPUTask for BitPackingTask<P> {
189189
.map(|_| ())
190190
}
191191

192-
fn export_result(&mut self) -> VortexResult<Canonical> {
192+
fn export_result(&mut self) -> VortexResult<GpuArray> {
193193
let mut buffer = BufferMut::<P>::with_capacity(self.len());
194194

195195
unsafe { buffer.set_len(self.len()) }
@@ -204,12 +204,8 @@ impl<P: NativePType + DeviceRepr> GPUTask for BitPackingTask<P> {
204204
))
205205
}
206206

207-
fn output(&mut self) -> CudaViewMut<'_, u8> {
208-
unsafe {
209-
self.unpacked
210-
.transmute_mut(self.len() * size_of::<P>())
211-
.vortex_expect("Failed to transmute")
212-
}
207+
fn output(&mut self) -> ErasedCudaSlice {
208+
ErasedCudaSlice::new(self.unpacked)
213209
}
214210

215211
fn len(&self) -> usize {

vortex-gpu/src/buffer/gpu_buf.rs

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ use vortex_error::vortex_panic;
1010

1111
pub struct ErasedCudaSlice {
1212
ptr: CUdeviceptr,
13+
stream: Arc<CudaStream>,
1314
len: usize,
1415
ptype: PType,
1516
}
@@ -18,8 +19,10 @@ impl ErasedCudaSlice {
1819
pub fn new<T: NativePType>(slice: impl Into<CudaSlice<T>>) -> Self {
1920
let slice = slice.into();
2021
let len = slice.len();
22+
let stream = slice.stream().clone();
2123
Self {
2224
ptr: slice.leak(),
25+
stream,
2326
len,
2427
ptype: T::PTYPE,
2528
}
@@ -33,7 +36,7 @@ impl ErasedCudaSlice {
3336
self.len
3437
}
3538

36-
pub fn as_slice<T: NativePType>(&self, stream: &Arc<CudaStream>) -> CudaSlice<T> {
39+
pub fn as_slice<T: NativePType>(&self) -> CudaSlice<T> {
3740
if T::PTYPE != self.ptype() {
3841
vortex_panic!(
3942
"Attempted to get slice of type {} from array of type {}",
@@ -42,6 +45,6 @@ impl ErasedCudaSlice {
4245
)
4346
}
4447

45-
unsafe { stream.upgrade_device_ptr::<T>(self.ptr, self.len) }
48+
unsafe { self.stream.upgrade_device_ptr::<T>(self.ptr, self.len) }
4649
}
4750
}

vortex-gpu/src/for_.rs

Lines changed: 8 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -6,19 +6,16 @@ use std::time::Duration;
66

77
use cudarc::driver::sys::CUevent_flags::CU_EVENT_DEFAULT;
88
use cudarc::driver::{
9-
CudaContext, CudaFunction, CudaStream, CudaViewMut, DeviceRepr, LaunchConfig, PushKernelArg,
9+
CudaContext, CudaFunction, CudaStream, DeviceRepr, LaunchConfig, PushKernelArg,
1010
};
1111
use cudarc::nvrtc::Ptx;
12-
use vortex_array::Canonical;
1312
use vortex_array::arrays::PrimitiveArray;
14-
use vortex_array::validity::Validity;
15-
use vortex_buffer::BufferMut;
1613
use vortex_dtype::{NativePType, PType, match_each_native_ptype};
1714
use vortex_error::{VortexExpect, VortexResult, vortex_err};
1815
use vortex_fastlanes::{BitPackedVTable, FoRArray};
1916

20-
use crate::bit_unpack;
2117
use crate::task::GPUTask;
18+
use crate::{ErasedCudaSlice, GpuArray, GpuPrimitiveArray, bit_unpack};
2219

2320
struct ForTask<P> {
2421
stream: Arc<CudaStream>,
@@ -71,43 +68,23 @@ fn cuda_for_kernel(ptype: PType, ctx: &Arc<CudaContext>) -> VortexResult<CudaFun
7168

7269
impl<P: NativePType + DeviceRepr> GPUTask for ForTask<P> {
7370
fn launch_task(&mut self) -> VortexResult<()> {
74-
let len = self.len();
7571
self.bp_task.launch_task()?;
7672
let mut launch = self.stream.launch_builder(&self.func);
77-
let mut view = unsafe {
78-
self.bp_task
79-
.output()
80-
.transmute_mut::<P>(len)
81-
.vortex_expect("")
82-
};
73+
let mut view = self.bp_task.output().as_slice::<P>();
8374
launch.arg(&mut view);
8475
launch.arg(&self.reference);
8576
unsafe { launch.launch(self.launch_config) }
8677
.map_err(|e| vortex_err!("Failed to launch: {e}"))
8778
.map(|_| ())
8879
}
8980

90-
fn export_result(&mut self) -> VortexResult<Canonical> {
91-
let len = self.len();
92-
let mut buffer = BufferMut::<P>::with_capacity(len);
93-
94-
unsafe { buffer.set_len(len) }
95-
self.stream
96-
.memcpy_dtoh(
97-
&unsafe { self.bp_task.output().transmute::<P>(len).vortex_expect("") },
98-
&mut buffer,
99-
)
100-
.map_err(|e| vortex_err!("Failed to copy to device: {e}"))?;
101-
self.stream
102-
.synchronize()
103-
.map_err(|e| vortex_err!("Failed to synchronize: {e}"))?;
104-
Ok(Canonical::Primitive(PrimitiveArray::new(
105-
buffer,
106-
Validity::NonNullable,
107-
)))
81+
fn export_result(&mut self) -> VortexResult<GpuArray> {
82+
Ok(GpuArray::Primitive(GpuPrimitiveArray {
83+
values: self.bp_task.output(),
84+
}))
10885
}
10986

110-
fn output(&mut self) -> CudaViewMut<'_, u8> {
87+
fn output(&mut self) -> ErasedCudaSlice {
11188
self.bp_task.output()
11289
}
11390

vortex-gpu/src/gpu_array.rs

Lines changed: 46 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3,12 +3,54 @@
33

44
use std::sync::Arc;
55

6+
use cudarc::driver::CudaSlice;
7+
use vortex_dtype::NativePType;
8+
69
use crate::buffer::ErasedCudaSlice;
710

8-
pub type GpuArrayRef = Arc<dyn GpuArray>;
11+
pub enum GpuArray {
12+
Primitive(GpuPrimitiveArray),
13+
Bool(GpuBoolArray),
14+
Struct(GpuStructArray),
15+
Chunked(GpuChunkedArray),
16+
}
17+
18+
pub struct GpuPrimitiveArray {
19+
values: ErasedCudaSlice,
20+
}
21+
22+
impl GpuPrimitiveArray {
23+
fn as_slice<T: NativePType>(&self) -> CudaSlice<T> {
24+
self.values.as_slice()
25+
}
26+
}
27+
28+
pub struct GpuBoolArray {
29+
values: CudaSlice<bool>,
30+
}
931

10-
pub trait GpuArray {
11-
fn child(&self, idx: usize) -> GpuArrayRef;
32+
impl GpuBoolArray {
33+
fn values(&self) -> CudaSlice<bool> {
34+
self.values.clone()
35+
}
36+
}
37+
38+
pub struct GpuChunkedArray {
39+
gpu_arrays: Arc<[GpuArray]>,
40+
}
41+
42+
impl GpuChunkedArray {
43+
fn child(&self, idx: usize) -> &GpuArray {
44+
&self.gpu_arrays[idx]
45+
}
46+
}
47+
48+
pub struct GpuStructArray {
49+
gpu_arrays: Arc<[GpuArray]>,
50+
}
1251

13-
fn buffer(&self, idx: usize) -> ErasedCudaSlice;
52+
impl GpuStructArray {
53+
fn child(&self, idx: usize) -> &GpuArray {
54+
&self.gpu_arrays[idx]
55+
}
1456
}

vortex-gpu/src/lib.rs

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,18 +2,20 @@
22
// SPDX-FileCopyrightText: Copyright the Vortex contributors
33

44
pub mod bit_unpack;
5-
pub mod buffer;
5+
mod buffer;
66
pub mod for_;
77
mod for_bp;
8-
pub mod gpu_array;
8+
mod gpu_array;
99
mod indent;
1010
mod jit;
1111
mod rle_decompress;
1212
mod take;
1313
mod task;
1414

1515
pub use bit_unpack::{cuda_bit_unpack, cuda_bit_unpack_timed};
16+
pub use buffer::*;
1617
pub use for_::{cuda_for_unpack, cuda_for_unpack_timed};
1718
pub use for_bp::{cuda_for_bp_unpack, cuda_for_bp_unpack_timed};
19+
pub use gpu_array::*;
1820
pub use jit::create_run_jit_kernel;
1921
pub use take::cuda_take;

vortex-gpu/src/task.rs

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,19 +1,19 @@
11
// SPDX-License-Identifier: Apache-2.0
22
// SPDX-FileCopyrightText: Copyright the Vortex contributors
33

4-
use cudarc::driver::CudaViewMut;
5-
use vortex_array::Canonical;
64
use vortex_error::VortexResult;
75

6+
use crate::{ErasedCudaSlice, GpuArray};
7+
88
pub trait GPUTask {
99
// Must call `launch_task` once
1010
fn launch_task(&mut self) -> VortexResult<()>;
1111

1212
// Must call this after launch_task
13-
fn export_result(&mut self) -> VortexResult<Canonical>;
13+
fn export_result(&mut self) -> VortexResult<GpuArray>;
1414

1515
// Re can transmute as runtime
16-
fn output(&mut self) -> CudaViewMut<'_, u8>;
16+
fn output(&mut self) -> ErasedCudaSlice;
1717

1818
fn len(&self) -> usize;
1919
}

vortex-layout/Cargo.toml

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,7 @@ vortex-dtype = { workspace = true }
4848
vortex-error = { workspace = true }
4949
vortex-expr = { workspace = true }
5050
vortex-flatbuffers = { workspace = true, features = ["layout"] }
51+
vortex-gpu = { workspace = true, optional = true }
5152
vortex-io = { workspace = true }
5253
vortex-mask = { workspace = true }
5354
vortex-metrics = { workspace = true }
@@ -68,8 +69,8 @@ vortex-io = { path = "../vortex-io", features = ["tokio"] }
6869
test-harness = []
6970
tokio = ["dep:tokio", "vortex-error/tokio"]
7071
zstd = ["dep:vortex-zstd"]
71-
gpu = ["cuda"]
72-
cuda = ["dep:cudarc"]
72+
gpu = ["cuda", "dep:vortex-gpu"]
73+
cuda = ["dep:cudarc", "vortex-gpu/cuda"]
7374

7475
[lints]
7576
workspace = true

vortex-layout/src/gpu/layouts/chunked/reader.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ use vortex_expr::ExprRef;
1818
use crate::gpu::children::LazyGpuReaderChildren;
1919
use crate::layouts::chunked::ChunkedLayout;
2020
use crate::segments::SegmentSource;
21-
use crate::{GpuLayoutReader, GpuLayoutReaderRef};
21+
use crate::{GpuArrayFuture, GpuLayoutReader, GpuLayoutReaderRef};
2222

2323
pub struct GpuChunkedLayoutReader {
2424
layout: ChunkedLayout,
@@ -133,7 +133,7 @@ impl GpuLayoutReader for GpuChunkedLayoutReader {
133133
&self,
134134
row_range: &Range<u64>,
135135
expr: &ExprRef,
136-
) -> VortexResult<BoxFuture<'static, VortexResult<ArrayRef>>> {
136+
) -> VortexResult<GpuArrayFuture> {
137137
let dtype = expr.return_dtype(self.dtype())?;
138138
let mut chunk_evals = FuturesOrdered::new();
139139

vortex-layout/src/gpu/layouts/flat/reader.rs

Lines changed: 5 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -14,10 +14,10 @@ use vortex_dtype::{DType, FieldMask};
1414
use vortex_error::{VortexResult, VortexUnwrap as _};
1515
use vortex_expr::{ExprRef, Scope, is_root};
1616

17-
use crate::GpuLayoutReader;
1817
use crate::layouts::SharedArrayFuture;
1918
use crate::layouts::flat::FlatLayout;
2019
use crate::segments::SegmentSource;
20+
use crate::{GpuArrayFuture, GpuLayoutReader, ShareGpuArrayFuture};
2121

2222
pub struct GpuFlatReader {
2323
layout: FlatLayout,
@@ -39,7 +39,7 @@ impl GpuFlatReader {
3939
}
4040

4141
/// Register the segment request and return a future that would resolve into the deserialised array.
42-
fn array_future(&self) -> SharedArrayFuture {
42+
fn array_future(&self) -> ShareGpuArrayFuture {
4343
let row_count = usize::try_from(self.layout.row_count()).vortex_unwrap();
4444

4545
// We create the segment_fut here to ensure we give the segment reader visibility into
@@ -51,9 +51,7 @@ impl GpuFlatReader {
5151
let dtype = self.layout.dtype().clone();
5252
async move {
5353
let segment = segment_fut.await?;
54-
ArrayParts::try_from(segment)?
55-
.decode(&ctx, &dtype, row_count)
56-
.map_err(Arc::new)
54+
ArrayParts::try_from(segment)?.decode(&ctx, &dtype, row_count)?
5755
}
5856
.boxed()
5957
.shared()
@@ -87,7 +85,7 @@ impl GpuLayoutReader for GpuFlatReader {
8785
&self,
8886
row_range: &Range<u64>,
8987
expr: &ExprRef,
90-
) -> VortexResult<BoxFuture<'static, VortexResult<ArrayRef>>> {
88+
) -> VortexResult<GpuArrayFuture> {
9189
assert_eq!(
9290
row_range.clone(),
9391
0..self.layout.row_count(),
@@ -100,14 +98,7 @@ impl GpuLayoutReader for GpuFlatReader {
10098
Ok(async move {
10199
log::debug!("Flat array evaluation {} - {}", name, expr);
102100

103-
let mut array = array.clone().await?;
104-
105-
// Evaluate the projection expression.
106-
if !is_root(&expr) {
107-
array = expr.evaluate(&Scope::new(array))?;
108-
}
109-
110-
Ok(array)
101+
array.clone().await
111102
}
112103
.boxed())
113104
}

0 commit comments

Comments
 (0)