Skip to content

Commit 7373931

Browse files
authored
perf: CUDA FoR loop unrolling (#6017)
This PR changes the FoR implementation to use loop unrolling, which in some scenarios yields a speedup of up to ~85%. As part of that, new benchmarks and tests are introduced. Benchmarks were run on an A10: ``` FoR_cuda_u8/u8_FoR/1K time: [4.6619 µs 4.6818 µs 4.7073 µs] thrpt: [202.59 MiB/s 203.70 MiB/s 204.57 MiB/s] change: time: [−6.9968% −6.1892% −5.2750%] (p = 0.00 < 0.05) thrpt: [+5.5687% +6.5976% +7.5232%] Performance has improved. Found 1 outliers among 10 measurements (10.00%) 1 (10.00%) high mild FoR_cuda_u8/u8_FoR/10K time: [4.4583 µs 4.4809 µs 4.5139 µs] thrpt: [2.0632 GiB/s 2.0784 GiB/s 2.0890 GiB/s] change: time: [−0.0233% +0.7648% +1.5942%] (p = 0.10 > 0.05) thrpt: [−1.5692% −0.7590% +0.0233%] No change in performance detected. FoR_cuda_u8/u8_FoR/100K time: [4.5262 µs 4.5439 µs 4.5736 µs] thrpt: [20.363 GiB/s 20.496 GiB/s 20.576 GiB/s] change: time: [−8.9445% −7.7897% −6.6445%] (p = 0.00 < 0.05) thrpt: [+7.1175% +8.4477% +9.8231%] Performance has improved. FoR_cuda_u8/u8_FoR/1M time: [4.4380 µs 4.4598 µs 4.4891 µs] thrpt: [207.46 GiB/s 208.83 GiB/s 209.85 GiB/s] change: time: [+0.0679% +0.9488% +1.7990%] (p = 0.05 > 0.05) thrpt: [−1.7672% −0.9399% −0.0679%] No change in performance detected. FoR_cuda_u8/u8_FoR/10M time: [4.4880 µs 4.5013 µs 4.5293 µs] thrpt: [2056.2 GiB/s 2069.0 GiB/s 2075.2 GiB/s] change: time: [−3.4123% −2.6199% −1.6909%] (p = 0.00 < 0.05) thrpt: [+1.7200% +2.6903% +3.5328%] Performance has improved. Found 2 outliers among 10 measurements (20.00%) 2 (20.00%) high mild FoR_cuda_u16/u16_FoR/1K time: [4.7696 µs 4.7820 µs 4.8017 µs] thrpt: [397.22 MiB/s 398.86 MiB/s 399.90 MiB/s] change: time: [−31.818% −31.216% −30.430%] (p = 0.00 < 0.05) thrpt: [+43.739% +45.384% +46.666%] Performance has improved. 
Found 1 outliers among 10 measurements (10.00%) 1 (10.00%) high severe FoR_cuda_u16/u16_FoR/10K time: [4.9942 µs 5.0030 µs 5.0114 µs] thrpt: [3.7168 GiB/s 3.7231 GiB/s 3.7296 GiB/s] change: time: [−46.715% −46.619% −46.535%] (p = 0.00 < 0.05) thrpt: [+87.039% +87.332% +87.671%] Performance has improved. FoR_cuda_u16/u16_FoR/100K time: [11.371 µs 11.387 µs 11.396 µs] thrpt: [16.345 GiB/s 16.358 GiB/s 16.381 GiB/s] change: time: [−26.577% −26.455% −26.344%] (p = 0.00 < 0.05) thrpt: [+35.767% +35.972% +36.197%] Performance has improved. FoR_cuda_u16/u16_FoR/1M time: [4.9764 µs 4.9958 µs 5.0073 µs] thrpt: [371.98 GiB/s 372.84 GiB/s 374.29 GiB/s] change: time: [−46.584% −46.382% −46.210%] (p = 0.00 < 0.05) thrpt: [+85.906% +86.503% +87.210%] Performance has improved. FoR_cuda_u16/u16_FoR/10M time: [11.157 µs 11.211 µs 11.248 µs] thrpt: [1656.0 GiB/s 1661.4 GiB/s 1669.5 GiB/s] change: time: [−26.916% −26.493% −26.164%] (p = 0.00 < 0.05) thrpt: [+35.435% +36.042% +36.828%] Performance has improved. Found 1 outliers among 10 measurements (10.00%) 1 (10.00%) low mild FoR_cuda_u32/u32_FoR/1K time: [5.2116 µs 5.2613 µs 5.3102 µs] thrpt: [718.37 MiB/s 725.05 MiB/s 731.96 MiB/s] change: time: [−26.511% −25.998% −25.368%] (p = 0.00 < 0.05) thrpt: [+33.990% +35.132% +36.075%] Performance has improved. FoR_cuda_u32/u32_FoR/10K time: [5.5475 µs 5.5554 µs 5.5633 µs] thrpt: [6.6962 GiB/s 6.7057 GiB/s 6.7152 GiB/s] change: time: [−39.450% −39.349% −39.240%] (p = 0.00 < 0.05) thrpt: [+64.582% +64.877% +65.153%] Performance has improved. Found 1 outliers among 10 measurements (10.00%) 1 (10.00%) low mild FoR_cuda_u32/u32_FoR/100K time: [9.3362 µs 9.3806 µs 9.4250 µs] thrpt: [39.525 GiB/s 39.713 GiB/s 39.902 GiB/s] change: time: [−25.760% −25.359% −24.988%] (p = 0.00 < 0.05) thrpt: [+33.312% +33.974% +34.698%] Performance has improved. 
FoR_cuda_u32/u32_FoR/1M time: [13.072 µs 13.168 µs 13.267 µs] thrpt: [280.78 GiB/s 282.91 GiB/s 284.98 GiB/s] change: time: [−18.493% −14.861% −9.3593%] (p = 0.00 < 0.05) thrpt: [+10.326% +17.455% +22.689%] Performance has improved. Found 1 outliers among 10 measurements (10.00%) 1 (10.00%) low mild FoR_cuda_u32/u32_FoR/10M time: [174.68 µs 174.95 µs 175.20 µs] thrpt: [212.63 GiB/s 212.94 GiB/s 213.26 GiB/s] change: time: [−1.4814% −1.2404% −1.0022%] (p = 0.00 < 0.05) thrpt: [+1.0124% +1.2560% +1.5036%] Performance has improved. FoR_cuda_u64/u64_FoR/1K time: [5.8007 µs 5.8204 µs 5.8478 µs] thrpt: [1.2741 GiB/s 1.2801 GiB/s 1.2844 GiB/s] change: time: [−18.401% −18.040% −17.687%] (p = 0.00 < 0.05) thrpt: [+21.488% +22.010% +22.551%] Performance has improved. FoR_cuda_u64/u64_FoR/10K time: [13.322 µs 13.378 µs 13.445 µs] thrpt: [5.5417 GiB/s 5.5695 GiB/s 5.5925 GiB/s] change: time: [−17.451% −17.049% −16.645%] (p = 0.00 < 0.05) thrpt: [+19.969% +20.553% +21.140%] Performance has improved. FoR_cuda_u64/u64_FoR/100K time: [12.205 µs 12.319 µs 12.462 µs] thrpt: [59.788 GiB/s 60.478 GiB/s 61.044 GiB/s] change: time: [−19.829% −19.168% −18.499%] (p = 0.00 < 0.05) thrpt: [+22.697% +23.713% +24.734%] Performance has improved. FoR_cuda_u64/u64_FoR/1M time: [33.368 µs 33.405 µs 33.464 µs] thrpt: [222.65 GiB/s 223.04 GiB/s 223.28 GiB/s] change: time: [−7.8447% −7.4612% −7.0847%] (p = 0.00 < 0.05) thrpt: [+7.6249% +8.0627% +8.5124%] Performance has improved. Found 1 outliers among 10 measurements (10.00%) 1 (10.00%) high mild FoR_cuda_u64/u64_FoR/10M time: [341.44 µs 341.72 µs 342.10 µs] thrpt: [217.79 GiB/s 218.03 GiB/s 218.21 GiB/s] change: time: [−1.8864% −1.6840% −1.4901%] (p = 0.00 < 0.05) thrpt: [+1.5126% +1.7128% +1.9226%] Performance has improved. ``` Signed-off-by: Alexander Droste <alexander.droste@protonmail.com>
1 parent d0160ff commit 7373931

File tree

3 files changed

+423
-40
lines changed

3 files changed

+423
-40
lines changed

vortex-cuda/benches/for_cuda.rs

Lines changed: 282 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -31,37 +31,121 @@ const BENCH_ARGS: &[(usize, &str)] = &[
3131
(100_000, "100K"),
3232
(1_000_000, "1M"),
3333
(10_000_000, "10M"),
34-
(100_000_000, "100M"),
3534
];
3635

37-
/// Creates a FoR array for the given size.
38-
fn make_for_array(len: usize) -> FoRArray {
36+
/// Creates a FoR array of u8 for the given size.
37+
fn make_for_array_u8(len: usize) -> FoRArray {
38+
let data: Vec<u8> = (0..len as u8).map(|i| i.wrapping_add(10)).collect();
39+
let primitive_array = PrimitiveArray::new(
40+
Buffer::from(data),
41+
vortex_array::validity::Validity::NonNullable,
42+
)
43+
.into_array();
44+
45+
FoRArray::try_new(primitive_array, 10u8.into()).vortex_expect("failed to create FoR array")
46+
}
47+
48+
/// Creates a FoR array of u16 for the given size.
49+
fn make_for_array_u16(len: usize) -> FoRArray {
50+
let data: Vec<u16> = (0..len as u16).map(|i| i.wrapping_add(10)).collect();
51+
let primitive_array = PrimitiveArray::new(
52+
Buffer::from(data),
53+
vortex_array::validity::Validity::NonNullable,
54+
)
55+
.into_array();
56+
57+
FoRArray::try_new(primitive_array, 10u16.into()).vortex_expect("failed to create FoR array")
58+
}
59+
60+
/// Creates a FoR array of u32 for the given size.
61+
fn make_for_array_u32(len: usize) -> FoRArray {
3962
let primitive_array = PrimitiveArray::new(
4063
Buffer::from((0u32..len as u32).collect::<Vec<u32>>()),
4164
vortex_array::validity::Validity::NonNullable,
4265
)
4366
.into_array();
4467

45-
let for_offset = 10u32;
68+
FoRArray::try_new(primitive_array, 10u32.into()).vortex_expect("failed to create FoR array")
69+
}
4670

47-
FoRArray::try_new(primitive_array, for_offset.into())
48-
.vortex_expect("failed to create FoR array")
71+
/// Creates a FoR array of u64 for the given size.
72+
fn make_for_array_u64(len: usize) -> FoRArray {
73+
let data: Vec<u64> = (0..len as u64).map(|i| i.wrapping_add(10)).collect();
74+
let primitive_array = PrimitiveArray::new(
75+
Buffer::from(data),
76+
vortex_array::validity::Validity::NonNullable,
77+
)
78+
.into_array();
79+
80+
FoRArray::try_new(primitive_array, 10u64.into()).vortex_expect("failed to create FoR array")
4981
}
5082

5183
/// Launches FoR decompression kernel and returns elapsed GPU time in seconds.
52-
fn launch_for_kernel_timed(
84+
fn launch_for_kernel_timed_u8(
85+
for_array: &FoRArray,
86+
device_data: cudarc::driver::CudaSlice<u8>,
87+
reference: u8,
88+
cuda_ctx: &mut CudaExecutionCtx,
89+
) -> vortex_error::VortexResult<Duration> {
90+
let array_len_u64 = for_array.len() as u64;
91+
92+
let events = vortex_cuda::launch_cuda_kernel!(
93+
execution_ctx: cuda_ctx,
94+
module: "for",
95+
ptypes: &[for_array.ptype()],
96+
launch_args: [device_data, reference, array_len_u64],
97+
event_recording: CU_EVENT_BLOCKING_SYNC,
98+
array_len: for_array.len()
99+
);
100+
101+
let elapsed_ms = events
102+
.before_launch
103+
.elapsed_ms(&events.after_launch) // synchronizes
104+
.map_err(|e| vortex_error::vortex_err!("failed to get elapsed time: {}", e))?;
105+
106+
Ok(Duration::from_secs_f32(elapsed_ms / 1000.0))
107+
}
108+
109+
/// Launches FoR decompression kernel and returns elapsed GPU time in seconds.
110+
fn launch_for_kernel_timed_u16(
111+
for_array: &FoRArray,
112+
device_data: cudarc::driver::CudaSlice<u16>,
113+
reference: u16,
114+
cuda_ctx: &mut CudaExecutionCtx,
115+
) -> vortex_error::VortexResult<Duration> {
116+
let array_len_u64 = for_array.len() as u64;
117+
118+
let events = vortex_cuda::launch_cuda_kernel!(
119+
execution_ctx: cuda_ctx,
120+
module: "for",
121+
ptypes: &[for_array.ptype()],
122+
launch_args: [device_data, reference, array_len_u64],
123+
event_recording: CU_EVENT_BLOCKING_SYNC,
124+
array_len: for_array.len()
125+
);
126+
127+
let elapsed_ms = events
128+
.before_launch
129+
.elapsed_ms(&events.after_launch) // synchronizes
130+
.map_err(|e| vortex_error::vortex_err!("failed to get elapsed time: {}", e))?;
131+
132+
Ok(Duration::from_secs_f32(elapsed_ms / 1000.0))
133+
}
134+
135+
/// Launches FoR decompression kernel and returns elapsed GPU time in seconds.
136+
fn launch_for_kernel_timed_u32(
53137
for_array: &FoRArray,
54-
reference: u32,
55138
device_data: cudarc::driver::CudaSlice<u32>,
139+
reference: u32,
56140
cuda_ctx: &mut CudaExecutionCtx,
57141
) -> vortex_error::VortexResult<Duration> {
58-
let array_len = for_array.len() as u64;
142+
let array_len_u64 = for_array.len() as u64;
59143

60144
let events = vortex_cuda::launch_cuda_kernel!(
61145
execution_ctx: cuda_ctx,
62146
module: "for",
63147
ptypes: &[for_array.ptype()],
64-
launch_args: [device_data, reference, array_len],
148+
launch_args: [device_data, reference, array_len_u64],
65149
event_recording: CU_EVENT_BLOCKING_SYNC,
66150
array_len: for_array.len()
67151
);
@@ -74,17 +158,137 @@ fn launch_for_kernel_timed(
74158
Ok(Duration::from_secs_f32(elapsed_ms / 1000.0))
75159
}
76160

77-
fn benchmark_for_cuda(c: &mut Criterion) {
78-
if !has_nvcc() {
79-
eprintln!("nvcc not found, skipping CUDA benchmarks");
80-
return;
161+
/// Launches FoR decompression kernel and returns elapsed GPU time in seconds.
162+
fn launch_for_kernel_timed_u64(
163+
for_array: &FoRArray,
164+
device_data: cudarc::driver::CudaSlice<u64>,
165+
reference: u64,
166+
cuda_ctx: &mut CudaExecutionCtx,
167+
) -> vortex_error::VortexResult<Duration> {
168+
let array_len_u64 = for_array.len() as u64;
169+
170+
let events = vortex_cuda::launch_cuda_kernel!(
171+
execution_ctx: cuda_ctx,
172+
module: "for",
173+
ptypes: &[for_array.ptype()],
174+
launch_args: [device_data, reference, array_len_u64],
175+
event_recording: CU_EVENT_BLOCKING_SYNC,
176+
array_len: for_array.len()
177+
);
178+
179+
let elapsed_ms = events
180+
.before_launch
181+
.elapsed_ms(&events.after_launch) // synchronizes
182+
.map_err(|e| vortex_error::vortex_err!("failed to get elapsed time: {}", e))?;
183+
184+
Ok(Duration::from_secs_f32(elapsed_ms / 1000.0))
185+
}
186+
187+
/// Benchmark u8 FoR decompression
188+
fn benchmark_for_u8(c: &mut Criterion) {
189+
let mut group = c.benchmark_group("FoR_cuda_u8");
190+
group.sample_size(10);
191+
192+
for (len, label) in BENCH_ARGS {
193+
let for_array = make_for_array_u8(*len);
194+
195+
group.throughput(Throughput::Bytes((len * size_of::<u8>()) as u64));
196+
group.bench_with_input(
197+
BenchmarkId::new("u8_FoR", label),
198+
&for_array,
199+
|b, for_array| {
200+
b.iter_custom(|iters| {
201+
let mut cuda_ctx = CudaSession::new_ctx(VortexSession::empty())
202+
.vortex_expect("failed to create execution context");
203+
204+
let encoded = for_array.encoded();
205+
let unpacked_array = encoded.to_primitive();
206+
let unpacked_slice = unpacked_array.as_slice::<u8>();
207+
208+
let reference = 10u8;
209+
let mut total_time = Duration::ZERO;
210+
211+
for _ in 0..iters {
212+
let device_data = cuda_ctx
213+
.to_device(unpacked_slice)
214+
.vortex_expect("failed to copy to device");
215+
216+
let kernel_time = launch_for_kernel_timed_u8(
217+
for_array,
218+
device_data,
219+
reference,
220+
&mut cuda_ctx,
221+
)
222+
.vortex_expect("kernel launch failed");
223+
224+
total_time += kernel_time;
225+
}
226+
227+
total_time
228+
});
229+
},
230+
);
81231
}
82232

83-
let mut group = c.benchmark_group("FoR_cuda");
233+
group.finish();
234+
}
235+
236+
/// Benchmark u16 FoR decompression
237+
fn benchmark_for_u16(c: &mut Criterion) {
238+
let mut group = c.benchmark_group("FoR_cuda_u16");
84239
group.sample_size(10);
85240

86241
for (len, label) in BENCH_ARGS {
87-
let for_array = make_for_array(*len);
242+
let for_array = make_for_array_u16(*len);
243+
244+
group.throughput(Throughput::Bytes((len * size_of::<u16>()) as u64));
245+
group.bench_with_input(
246+
BenchmarkId::new("u16_FoR", label),
247+
&for_array,
248+
|b, for_array| {
249+
b.iter_custom(|iters| {
250+
let mut cuda_ctx = CudaSession::new_ctx(VortexSession::empty())
251+
.vortex_expect("failed to create execution context");
252+
253+
let encoded = for_array.encoded();
254+
let unpacked_array = encoded.to_primitive();
255+
let unpacked_slice = unpacked_array.as_slice::<u16>();
256+
257+
let reference = 10u16;
258+
let mut total_time = Duration::ZERO;
259+
260+
for _ in 0..iters {
261+
let device_data = cuda_ctx
262+
.to_device(unpacked_slice)
263+
.vortex_expect("failed to copy to device");
264+
265+
let kernel_time = launch_for_kernel_timed_u16(
266+
for_array,
267+
device_data,
268+
reference,
269+
&mut cuda_ctx,
270+
)
271+
.vortex_expect("kernel launch failed");
272+
273+
total_time += kernel_time;
274+
}
275+
276+
total_time
277+
});
278+
},
279+
);
280+
}
281+
282+
group.finish();
283+
}
284+
285+
/// Benchmark u32 FoR decompression
286+
fn benchmark_for_u32(c: &mut Criterion) {
287+
let mut group = c.benchmark_group("FoR_cuda_u32");
288+
group.sample_size(10);
289+
290+
for (len, label) in BENCH_ARGS {
291+
let for_array = make_for_array_u32(*len);
88292

89293
group.throughput(Throughput::Bytes((len * size_of::<u32>()) as u64));
90294
group.bench_with_input(
@@ -107,10 +311,59 @@ fn benchmark_for_cuda(c: &mut Criterion) {
107311
.to_device(unpacked_slice)
108312
.vortex_expect("failed to copy to device");
109313

110-
let kernel_time = launch_for_kernel_timed(
314+
let kernel_time = launch_for_kernel_timed_u32(
111315
for_array,
316+
device_data,
112317
reference,
318+
&mut cuda_ctx,
319+
)
320+
.vortex_expect("kernel launch failed");
321+
322+
total_time += kernel_time;
323+
}
324+
325+
total_time
326+
});
327+
},
328+
);
329+
}
330+
331+
group.finish();
332+
}
333+
334+
/// Benchmark u64 FoR decompression
335+
fn benchmark_for_u64(c: &mut Criterion) {
336+
let mut group = c.benchmark_group("FoR_cuda_u64");
337+
group.sample_size(10);
338+
339+
for (len, label) in BENCH_ARGS {
340+
let for_array = make_for_array_u64(*len);
341+
342+
group.throughput(Throughput::Bytes((len * size_of::<u64>()) as u64));
343+
group.bench_with_input(
344+
BenchmarkId::new("u64_FoR", label),
345+
&for_array,
346+
|b, for_array| {
347+
b.iter_custom(|iters| {
348+
let mut cuda_ctx = CudaSession::new_ctx(VortexSession::empty())
349+
.vortex_expect("failed to create execution context");
350+
351+
let encoded = for_array.encoded();
352+
let unpacked_array = encoded.to_primitive();
353+
let unpacked_slice = unpacked_array.as_slice::<u64>();
354+
355+
let reference = 10u64;
356+
let mut total_time = Duration::ZERO;
357+
358+
for _ in 0..iters {
359+
let device_data = cuda_ctx
360+
.to_device(unpacked_slice)
361+
.vortex_expect("failed to copy to device");
362+
363+
let kernel_time = launch_for_kernel_timed_u64(
364+
for_array,
113365
device_data,
366+
reference,
114367
&mut cuda_ctx,
115368
)
116369
.vortex_expect("kernel launch failed");
@@ -127,5 +380,17 @@ fn benchmark_for_cuda(c: &mut Criterion) {
127380
group.finish();
128381
}
129382

383+
fn benchmark_for_cuda(c: &mut Criterion) {
384+
if !has_nvcc() {
385+
eprintln!("nvcc not found, skipping CUDA benchmarks");
386+
return;
387+
}
388+
389+
benchmark_for_u8(c);
390+
benchmark_for_u16(c);
391+
benchmark_for_u32(c);
392+
benchmark_for_u64(c);
393+
}
394+
130395
criterion_group!(benches, benchmark_for_cuda);
131396
criterion_main!(benches);

0 commit comments

Comments
 (0)