Skip to content

Commit 0b93fe0

Browse files
authored
feat: patches uses a map in some cases (#1626)
See [this sheet for the data from take_patches.rs](https://docs.google.com/spreadsheets/d/1D9vBZ1QJ6mwcIvV5wIL0hjGgVchcEnAyhvitqWu2ugU). I'm on an M3 Max with 96 GiB of RAM with macOS 14.4. This threshold likely depends on the ISA. Intuitively, repeated searching is `O(N_INDICES * lg N_PATCHES)` and repeated map lookups is `O(N_INDICES + N_PATCHES)`. It seems to me that the compiler & CPU would have trouble parallelizing search (via SIMD or ILP) because of the branching, whereas map lookups are more obviously parallelized (e.g. SIMD hash computation). I'm not entirely sure why the crossover point seems to be around N_PATCHES / N_INDICES = 5. I believe the M3 Max has 128-bit registers, so if the indices are 32-bits then index arithmetic could be 4-way parallel.
1 parent e8cd434 commit 0b93fe0

File tree

5 files changed

+236
-11
lines changed

5 files changed

+236
-11
lines changed

vortex-array/Cargo.toml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -83,3 +83,7 @@ harness = false
8383
[[bench]]
name = "take_strings"
harness = false

# Benchmark comparing the search-based and map-based `Patches::take` strategies.
[[bench]]
name = "take_patches"
harness = false
Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,71 @@
1+
#![allow(clippy::unwrap_used)]
2+
3+
use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion};
4+
use rand::rngs::StdRng;
5+
use rand::{Rng, SeedableRng as _};
6+
use vortex_array::patches::Patches;
7+
use vortex_array::ArrayData;
8+
9+
fn fixture(len: usize, sparsity: f64, rng: &mut StdRng) -> Patches {
10+
// NB: indices are always ordered
11+
let indices = (0..len)
12+
.filter(|_| rng.gen_bool(sparsity))
13+
.map(|x| x as u64)
14+
.collect::<Vec<u64>>();
15+
let sparse_len = indices.len();
16+
let values = ArrayData::from((0..sparse_len).map(|x| x as u64).collect::<Vec<_>>());
17+
Patches::new(len, ArrayData::from(indices), values)
18+
}
19+
20+
fn indices(array_len: usize, n_indices: usize, rng: &mut StdRng) -> ArrayData {
21+
ArrayData::from(
22+
(0..n_indices)
23+
.map(|_| rng.gen_range(0..(array_len as u64)))
24+
.collect::<Vec<u64>>(),
25+
)
26+
}
27+
28+
#[allow(clippy::cast_possible_truncation)]
29+
fn bench_take(c: &mut Criterion) {
30+
let mut group = c.benchmark_group("bench_take");
31+
let mut rng = StdRng::seed_from_u64(0);
32+
33+
for patches_sparsity in [0.1, 0.05, 0.01, 0.005, 0.001, 0.0005, 0.0001] {
34+
let patches = fixture(65_535, patches_sparsity, &mut rng);
35+
for index_multiple in [1.0, 0.5, 0.1, 0.05, 0.01, 0.005, 0.001, 0.0005, 0.0001] {
36+
let indices = indices(
37+
patches.array_len(),
38+
(patches.array_len() as f64 * index_multiple) as usize,
39+
&mut rng,
40+
);
41+
group.bench_with_input(
42+
BenchmarkId::from_parameter(format!(
43+
"take_search: array_len={}, n_patches={} (~{}%), n_indices={} ({}%)",
44+
patches.array_len(),
45+
patches.num_patches(),
46+
patches_sparsity,
47+
indices.len(),
48+
index_multiple * 100.0
49+
)),
50+
&(&patches, &indices),
51+
|b, (patches, indices)| b.iter(|| patches.take_search(indices)),
52+
);
53+
group.bench_with_input(
54+
BenchmarkId::from_parameter(format!(
55+
"take_map: array_len={}, n_patches={} (~{}%), n_indices={} ({}%)",
56+
patches.array_len(),
57+
patches.num_patches(),
58+
patches_sparsity,
59+
indices.len(),
60+
index_multiple * 100.0
61+
)),
62+
&(&patches, &indices),
63+
|b, (patches, indices)| b.iter(|| patches.take_map(indices)),
64+
);
65+
}
66+
}
67+
group.finish()
68+
}
69+
70+
criterion_group!(benches, bench_take);
71+
criterion_main!(benches);

vortex-array/benches/take_strings.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -45,5 +45,5 @@ fn bench_varbinview(c: &mut Criterion) {
4545
});
4646
}
4747

48-
criterion_group!(bench_take, bench_varbin, bench_varbinview);
49-
criterion_main!(bench_take);
48+
criterion_group!(bench_take_strings, bench_varbin, bench_varbinview);
49+
criterion_main!(bench_take_strings);

vortex-array/src/patches.rs

Lines changed: 81 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,15 @@
11
use std::fmt::Debug;
2+
use std::hash::Hash;
23

34
use serde::{Deserialize, Serialize};
45
use vortex_dtype::Nullability::NonNullable;
5-
use vortex_dtype::{match_each_integer_ptype, DType, PType};
6+
use vortex_dtype::{
7+
match_each_integer_ptype, match_each_unsigned_integer_ptype, DType, NativeIndexPType, PType,
8+
};
69
use vortex_error::{vortex_bail, VortexExpect, VortexResult};
710
use vortex_scalar::Scalar;
811

12+
use crate::aliases::hash_map::HashMap;
913
use crate::array::PrimitiveArray;
1014
use crate::compute::{
1115
scalar_at, search_sorted, search_sorted_usize, search_sorted_usize_many, slice,
@@ -220,15 +224,33 @@ impl Patches {
220224
Ok(Some(Self::new(stop - start, indices, values)))
221225
}
222226

227+
// https://docs.google.com/spreadsheets/d/1D9vBZ1QJ6mwcIvV5wIL0hjGgVchcEnAyhvitqWu2ugU
228+
const PREFER_MAP_WHEN_PATCHES_OVER_INDICES_LESS_THAN: f64 = 5.0;
229+
230+
fn is_map_faster_than_search(&self, take_indices: &ArrayData) -> bool {
231+
(self.num_patches() as f64 / take_indices.len() as f64)
232+
< Self::PREFER_MAP_WHEN_PATCHES_OVER_INDICES_LESS_THAN
233+
}
234+
223235
/// Take the indices from the patches.
224-
pub fn take(&self, indices: &ArrayData) -> VortexResult<Option<Self>> {
225-
if indices.is_empty() {
236+
pub fn take(&self, take_indices: &ArrayData) -> VortexResult<Option<Self>> {
237+
if self.is_map_faster_than_search(take_indices) {
238+
self.take_map(take_indices)
239+
} else {
240+
self.take_search(take_indices)
241+
}
242+
}
243+
244+
pub fn take_search(&self, take_indices: &ArrayData) -> VortexResult<Option<Self>> {
245+
let new_length = take_indices.len();
246+
247+
if take_indices.is_empty() {
226248
return Ok(None);
227249
}
228250

229251
// TODO(ngates): plenty of optimisations to be made here
230252
let take_indices =
231-
try_cast(indices, &DType::Primitive(PType::U64, NonNullable))?.into_primitive()?;
253+
try_cast(take_indices, &DType::Primitive(PType::U64, NonNullable))?.into_primitive()?;
232254

233255
let (values_indices, new_indices): (Vec<u64>, Vec<u64>) = search_sorted_usize_many(
234256
self.indices(),
@@ -258,7 +280,61 @@ impl Patches {
258280
PrimitiveArray::from_vec(values_indices, Validity::NonNullable).into_array();
259281
let new_values = take(self.values(), values_indices)?;
260282

261-
Ok(Some(Self::new(indices.len(), new_indices, new_values)))
283+
Ok(Some(Self::new(new_length, new_indices, new_values)))
284+
}
285+
286+
/// Take the indices from the patches using a hash-map of patch index → value index.
///
/// Canonicalizes both index arrays to primitive form exactly once, then
/// dispatches on their unsigned integer widths. (The original converted
/// `take_indices` with `into_primitive()` a second time inside the inner
/// macro arm, on an array that was already primitive — redundant work.)
pub fn take_map(&self, take_indices: &ArrayData) -> VortexResult<Option<Self>> {
    let take_indices = take_indices.clone().into_primitive()?;
    let patch_indices = self.indices.clone().into_primitive()?;
    match_each_unsigned_integer_ptype!(patch_indices.ptype(), |$INDICES| {
        let patch_indices = patch_indices.maybe_null_slice::<$INDICES>();
        match_each_unsigned_integer_ptype!(take_indices.ptype(), |$TAKE_INDICES| {
            self.take_map_helper(patch_indices, take_indices.maybe_null_slice::<$TAKE_INDICES>())
        })
    })
}
303+
304+
fn take_map_helper<I: NativeIndexPType + Hash + Eq, TI: NativeIndexPType>(
305+
&self,
306+
indices: &[I],
307+
take_indices: &[TI],
308+
) -> VortexResult<Option<Self>> {
309+
let new_length = take_indices.len();
310+
let sparse_index_to_value_index: HashMap<I, usize> = indices
311+
.iter()
312+
.enumerate()
313+
.map(|(value_index, sparse_index)| (*sparse_index, value_index))
314+
.collect();
315+
let min_index = self.min_index()?;
316+
let max_index = self.max_index()?;
317+
let (new_sparse_indices, value_indices): (Vec<u64>, Vec<u64>) = take_indices
318+
.iter()
319+
.enumerate()
320+
.filter(|(_, ti)| ti.as_usize() < min_index || ti.as_usize() > max_index)
321+
.filter_map(|(new_sparse_index, take_sparse_index)| {
322+
sparse_index_to_value_index
323+
.get(&I::usize_as(take_sparse_index.as_usize()))
324+
// FIXME(DK): should we choose a small index width or should the compressor do that?
325+
.map(|value_index| (new_sparse_index as u64, *value_index as u64))
326+
})
327+
.unzip();
328+
329+
if new_sparse_indices.is_empty() {
330+
return Ok(None);
331+
}
332+
333+
Ok(Some(Patches::new(
334+
new_length,
335+
ArrayData::from(new_sparse_indices),
336+
take(self.values(), ArrayData::from(value_indices))?,
337+
)))
262338
}
263339
}
264340

vortex-dtype/src/ptype.rs

Lines changed: 78 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,40 @@ pub trait NativePType:
8383
fn is_eq(self, other: Self) -> bool;
8484
}
8585

86+
/// A trait for native Rust types that correspond 1:1 to a PType used for indexing.
87+
///
88+
/// Vortex assumes that every value used as an index is representable as both a usize and a u64.
89+
pub trait NativeIndexPType: NativePType {
90+
/// Convert this value to a usize.
91+
///
92+
/// # Safety
93+
///
94+
/// 1. Assumes that the value is a valid index into an array on this machine.
95+
fn as_usize(self) -> usize;
96+
97+
/// Convert this value to a u64.
98+
///
99+
/// # Safety
100+
///
101+
/// 1. Assumes that the value is a valid index into an array on this machine.
102+
/// 2. Assumes this machine's pointers are not larger than u64.
103+
fn as_u64(self) -> u64;
104+
105+
/// Convert a usize to this type.
106+
///
107+
/// # Safety
108+
///
109+
/// 1. Assumes that the value is small enough to fit in this type.
110+
fn usize_as(value: usize) -> Self;
111+
112+
/// Convert a u64 to this type.
113+
///
114+
/// # Safety
115+
///
116+
/// 1. Assumes that the value is small enough to fit in this type.
117+
fn u64_as(value: u64) -> Self;
118+
}
119+
86120
macro_rules! native_ptype {
87121
($T:ty, $ptype:tt, $arrow:tt) => {
88122
impl NativePType for $T {
@@ -104,6 +138,46 @@ macro_rules! native_ptype {
104138
};
105139
}
106140

141+
/// Implements both `NativePType` and `NativeIndexPType` for an unsigned
/// integer type.
///
/// Delegates the `NativePType` impl to the sibling `native_ptype!` macro
/// (defined above and previously used for these same unsigned types) instead
/// of hand-duplicating its body, so the two impls cannot drift apart.
macro_rules! native_index_ptype {
    ($T:ty, $ptype:tt, $arrow:tt) => {
        native_ptype!($T, $ptype, $arrow);

        // Narrowing `as` casts deliberately truncate; callers uphold the
        // fits-in-type preconditions documented on the trait.
        #[allow(clippy::cast_possible_truncation)]
        impl NativeIndexPType for $T {
            fn as_usize(self) -> usize {
                self as usize
            }

            fn as_u64(self) -> u64 {
                self as u64
            }

            fn usize_as(value: usize) -> Self {
                value as Self
            }

            fn u64_as(value: u64) -> Self {
                value as Self
            }
        }
    };
}
180+
107181
macro_rules! native_float_ptype {
108182
($T:ty, $ptype:tt, $arrow:tt) => {
109183
impl NativePType for $T {
@@ -125,10 +199,10 @@ macro_rules! native_float_ptype {
125199
};
126200
}
127201

128-
native_ptype!(u8, U8, UInt8Type);
129-
native_ptype!(u16, U16, UInt16Type);
130-
native_ptype!(u32, U32, UInt32Type);
131-
native_ptype!(u64, U64, UInt64Type);
202+
native_index_ptype!(u8, U8, UInt8Type);
203+
native_index_ptype!(u16, U16, UInt16Type);
204+
native_index_ptype!(u32, U32, UInt32Type);
205+
native_index_ptype!(u64, U64, UInt64Type);
132206
native_ptype!(i8, I8, Int8Type);
133207
native_ptype!(i16, I16, Int16Type);
134208
native_ptype!(i32, I32, Int32Type);

0 commit comments

Comments
 (0)