Skip to content

Commit 8339aaa

Browse files
authored
Move ArrayData into a module (#1370)
Moves some things around in preparation for the VTable changes and removes ToOwnedArrayData trait
1 parent 63bd53b commit 8339aaa

File tree

10 files changed

+360
-448
lines changed

10 files changed

+360
-448
lines changed

.gitignore

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -193,7 +193,7 @@ benchmarks/.out
193193
*.vortex
194194

195195
# TPC-H benchmarking data
196-
data/
196+
/data/
197197

198198
# vscode
199199
.vscode/

pyvortex/src/array.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -520,10 +520,10 @@ impl PyArray {
520520
/// >>> arr = vortex.array([1, 2, None, 3])
521521
/// >>> print(arr.tree_display())
522522
/// root: vortex.primitive(0x03)(i64?, len=4) nbytes=33 B (100.00%)
523-
/// metadata: PrimitiveMetadata { validity: Array }
523+
/// metadata: ???
524524
/// buffer: 32 B
525525
/// validity: vortex.bool(0x02)(bool, len=4) nbytes=1 B (3.03%)
526-
/// metadata: BoolMetadata { validity: NonNullable, first_byte_bit_offset: 0 }
526+
/// metadata: ???
527527
/// buffer: 1 B
528528
/// <BLANKLINE>
529529
///

vortex-array/src/data/mod.rs

Lines changed: 304 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,304 @@
1+
use std::borrow::Cow;
2+
use std::fmt::{Display, Formatter};
3+
use std::future::ready;
4+
use std::sync::{Arc, RwLock};
5+
6+
use itertools::Itertools;
7+
use owned::OwnedArrayData;
8+
use viewed::ViewedArrayData;
9+
use vortex_buffer::Buffer;
10+
use vortex_dtype::DType;
11+
use vortex_error::{vortex_err, vortex_panic, VortexError, VortexExpect, VortexResult};
12+
13+
use crate::encoding::{EncodingId, EncodingRef};
14+
use crate::iter::{ArrayIterator, ArrayIteratorAdapter};
15+
use crate::stats::StatsSet;
16+
use crate::stream::{ArrayStream, ArrayStreamAdapter};
17+
use crate::{ArrayChildrenIterator, ArrayDType, ArrayMetadata, ArrayTrait, Context};
18+
19+
mod owned;
20+
mod viewed;
21+
22+
/// A central type for all Vortex arrays, which are known-length sequences of typed and possibly compressed data.
23+
///
24+
/// This is the main entrypoint for working with in-memory Vortex data, and dispatches work over the underlying encoding or memory representations.
///
/// It is a thin wrapper around [`InnerArrayData`], which is either an owned or a viewed representation.
25+
#[derive(Debug, Clone)]
26+
pub struct ArrayData(pub(crate) InnerArrayData);
27+
28+
// TODO(ngates): make this non-pub once TypedArray disappears
29+
/// The two in-memory representations that an [`ArrayData`] can wrap.
#[derive(Debug, Clone)]
30+
pub(crate) enum InnerArrayData {
31+
/// Owned [`ArrayData`] backed by heap-allocated memory. Its metadata is held as an in-memory
/// object and is only serialized when the metadata bytes are requested.
32+
Owned(OwnedArrayData),
33+
/// Zero-copy view over flatbuffer-encoded [`ArrayData`] data. The flatbuffer bytes are kept
/// as-is, together with the location of the array inside them and the associated buffers.
34+
Viewed(ViewedArrayData),
35+
}
36+
37+
impl ArrayData {
38+
pub fn try_new_owned(
39+
encoding: EncodingRef,
40+
dtype: DType,
41+
len: usize,
42+
metadata: Arc<dyn ArrayMetadata>,
43+
buffer: Option<Buffer>,
44+
children: Arc<[ArrayData]>,
45+
statistics: StatsSet,
46+
) -> VortexResult<Self> {
47+
let data = OwnedArrayData {
48+
encoding,
49+
dtype,
50+
len,
51+
metadata,
52+
buffer,
53+
children,
54+
stats_map: Arc::new(RwLock::new(statistics)),
55+
};
56+
57+
let array = ArrayData(InnerArrayData::Owned(data));
58+
// Validate here that the metadata correctly parses, so that an encoding can infallibly
59+
// FIXME(robert): Encoding::with_dyn no longer eagerly validates metadata, come up with a way to validate metadata
60+
encoding.with_dyn(&array, &mut |_| Ok(()))?;
61+
62+
Ok(array)
63+
}
64+
65+
pub fn try_new_viewed<F>(
66+
ctx: Arc<Context>,
67+
dtype: DType,
68+
len: usize,
69+
flatbuffer: Buffer,
70+
flatbuffer_init: F,
71+
buffers: Vec<Buffer>,
72+
) -> VortexResult<Self>
73+
where
74+
F: FnOnce(&[u8]) -> VortexResult<crate::flatbuffers::Array>,
75+
{
76+
let array = flatbuffer_init(flatbuffer.as_ref())?;
77+
let flatbuffer_loc = array._tab.loc();
78+
79+
let encoding = ctx.lookup_encoding(array.encoding()).ok_or_else(
80+
|| {
81+
let pretty_known_encodings = ctx.encodings()
82+
.format_with("\n", |e, f| f(&format_args!("- {}", e.id())));
83+
vortex_err!(InvalidSerde: "Unknown encoding with ID {:#02x}. Known encodings:\n{pretty_known_encodings}", array.encoding())
84+
},
85+
)?;
86+
87+
let view = ViewedArrayData {
88+
encoding,
89+
dtype,
90+
len,
91+
flatbuffer,
92+
flatbuffer_loc,
93+
buffers: buffers.into(),
94+
ctx,
95+
};
96+
97+
// Validate here that the metadata correctly parses, so that an encoding can infallibly
98+
// implement Encoding::with_view().
99+
// FIXME(ngates): validate the metadata
100+
ArrayData::from(view.clone()).with_dyn(|_| Ok::<(), VortexError>(()))?;
101+
102+
Ok(view.into())
103+
}
104+
105+
pub fn encoding(&self) -> EncodingRef {
106+
match &self.0 {
107+
InnerArrayData::Owned(d) => d.encoding(),
108+
InnerArrayData::Viewed(v) => v.encoding(),
109+
}
110+
}
111+
112+
/// Returns the number of logical elements in the array.
113+
#[allow(clippy::same_name_method)]
114+
pub fn len(&self) -> usize {
115+
match &self.0 {
116+
InnerArrayData::Owned(d) => d.len(),
117+
InnerArrayData::Viewed(v) => v.len(),
118+
}
119+
}
120+
121+
pub fn is_empty(&self) -> bool {
122+
match &self.0 {
123+
InnerArrayData::Owned(d) => d.is_empty(),
124+
InnerArrayData::Viewed(v) => v.is_empty(),
125+
}
126+
}
127+
128+
/// Total size of the array in bytes, including all children and buffers.
129+
pub fn nbytes(&self) -> usize {
130+
self.with_dyn(|a| a.nbytes())
131+
}
132+
133+
pub fn child<'a>(&'a self, idx: usize, dtype: &'a DType, len: usize) -> VortexResult<Self> {
134+
match &self.0 {
135+
InnerArrayData::Owned(d) => d.child(idx, dtype, len).cloned(),
136+
InnerArrayData::Viewed(v) => v
137+
.child(idx, dtype, len)
138+
.map(|view| ArrayData(InnerArrayData::Viewed(view))),
139+
}
140+
}
141+
142+
/// Returns a Vec of Arrays with all the array's child arrays.
143+
pub fn children(&self) -> Vec<ArrayData> {
144+
match &self.0 {
145+
InnerArrayData::Owned(d) => d.children().iter().cloned().collect_vec(),
146+
InnerArrayData::Viewed(v) => v.children(),
147+
}
148+
}
149+
150+
/// Returns the number of child arrays
151+
pub fn nchildren(&self) -> usize {
152+
match &self.0 {
153+
InnerArrayData::Owned(d) => d.nchildren(),
154+
InnerArrayData::Viewed(v) => v.nchildren(),
155+
}
156+
}
157+
158+
pub fn depth_first_traversal(&self) -> ArrayChildrenIterator {
159+
ArrayChildrenIterator::new(self.clone())
160+
}
161+
162+
/// Count the number of cumulative buffers encoded by self.
163+
pub fn cumulative_nbuffers(&self) -> usize {
164+
self.children()
165+
.iter()
166+
.map(|child| child.cumulative_nbuffers())
167+
.sum::<usize>()
168+
+ if self.buffer().is_some() { 1 } else { 0 }
169+
}
170+
171+
/// Return the buffer offsets and the total length of all buffers, assuming the given alignment.
172+
/// This includes all child buffers.
173+
pub fn all_buffer_offsets(&self, alignment: usize) -> Vec<u64> {
174+
let mut offsets = vec![];
175+
let mut offset = 0;
176+
177+
for col_data in self.depth_first_traversal() {
178+
if let Some(buffer) = col_data.buffer() {
179+
offsets.push(offset as u64);
180+
181+
let buffer_size = buffer.len();
182+
let aligned_size = (buffer_size + (alignment - 1)) & !(alignment - 1);
183+
offset += aligned_size;
184+
}
185+
}
186+
offsets.push(offset as u64);
187+
188+
offsets
189+
}
190+
191+
/// Get back the (possibly owned) metadata for the array.
192+
///
193+
/// View arrays will return a reference to their bytes, while heap-backed arrays
194+
/// must first serialize their metadata, returning an owned byte array to the caller.
195+
pub fn metadata(&self) -> VortexResult<Cow<[u8]>> {
196+
match &self.0 {
197+
InnerArrayData::Owned(array_data) => {
198+
// Heap-backed arrays must first try and serialize the metadata.
199+
let owned_meta: Vec<u8> = array_data
200+
.metadata()
201+
.try_serialize_metadata()?
202+
.as_ref()
203+
.to_owned();
204+
205+
Ok(Cow::Owned(owned_meta))
206+
}
207+
InnerArrayData::Viewed(array_view) => {
208+
// View arrays have direct access to metadata bytes.
209+
array_view
210+
.metadata()
211+
.ok_or_else(|| vortex_err!("things"))
212+
.map(Cow::Borrowed)
213+
}
214+
}
215+
}
216+
217+
pub fn buffer(&self) -> Option<&Buffer> {
218+
match &self.0 {
219+
InnerArrayData::Owned(d) => d.buffer(),
220+
InnerArrayData::Viewed(v) => v.buffer(),
221+
}
222+
}
223+
224+
pub fn into_buffer(self) -> Option<Buffer> {
225+
match self.0 {
226+
InnerArrayData::Owned(d) => d.into_buffer(),
227+
InnerArrayData::Viewed(v) => v.buffer().cloned(),
228+
}
229+
}
230+
231+
pub fn into_array_iterator(self) -> impl ArrayIterator {
232+
ArrayIteratorAdapter::new(self.dtype().clone(), std::iter::once(Ok(self)))
233+
}
234+
235+
pub fn into_array_stream(self) -> impl ArrayStream {
236+
ArrayStreamAdapter::new(
237+
self.dtype().clone(),
238+
futures_util::stream::once(ready(Ok(self))),
239+
)
240+
}
241+
242+
/// Checks whether array is of a given encoding.
243+
pub fn is_encoding(&self, id: EncodingId) -> bool {
244+
self.encoding().id() == id
245+
}
246+
247+
#[inline]
248+
pub fn with_dyn<R, F>(&self, mut f: F) -> R
249+
where
250+
F: FnMut(&dyn ArrayTrait) -> R,
251+
{
252+
let mut result = None;
253+
254+
self.encoding()
255+
.with_dyn(self, &mut |array| {
256+
// Sanity check that the encoding implements the correct array trait
257+
debug_assert!(
258+
match array.dtype() {
259+
DType::Null => array.as_null_array().is_some(),
260+
DType::Bool(_) => array.as_bool_array().is_some(),
261+
DType::Primitive(..) => array.as_primitive_array().is_some(),
262+
DType::Utf8(_) => array.as_utf8_array().is_some(),
263+
DType::Binary(_) => array.as_binary_array().is_some(),
264+
DType::Struct(..) => array.as_struct_array().is_some(),
265+
DType::List(..) => array.as_list_array().is_some(),
266+
DType::Extension(..) => array.as_extension_array().is_some(),
267+
},
268+
"Encoding {} does not implement the variant trait for {}",
269+
self.encoding().id(),
270+
array.dtype()
271+
);
272+
273+
result = Some(f(array));
274+
Ok(())
275+
})
276+
.unwrap_or_else(|err| {
277+
vortex_panic!(
278+
err,
279+
"Failed to convert Array to {}",
280+
std::any::type_name::<dyn ArrayTrait>()
281+
)
282+
});
283+
284+
// Now we unwrap the optional, which we know to be populated by the closure.
285+
result.vortex_expect("Failed to get result from Array::with_dyn")
286+
}
287+
}
288+
289+
impl Display for ArrayData {
290+
fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
291+
let prefix = match &self.0 {
292+
InnerArrayData::Owned(_) => "",
293+
InnerArrayData::Viewed(_) => "$",
294+
};
295+
write!(
296+
f,
297+
"{}{}({}, len={})",
298+
prefix,
299+
self.encoding().id(),
300+
self.dtype(),
301+
self.len()
302+
)
303+
}
304+
}

0 commit comments

Comments
 (0)