|
| 1 | +use std::borrow::Cow; |
| 2 | +use std::fmt::{Display, Formatter}; |
| 3 | +use std::future::ready; |
| 4 | +use std::sync::{Arc, RwLock}; |
| 5 | + |
| 6 | +use itertools::Itertools; |
| 7 | +use owned::OwnedArrayData; |
| 8 | +use viewed::ViewedArrayData; |
| 9 | +use vortex_buffer::Buffer; |
| 10 | +use vortex_dtype::DType; |
| 11 | +use vortex_error::{vortex_err, vortex_panic, VortexError, VortexExpect, VortexResult}; |
| 12 | + |
| 13 | +use crate::encoding::{EncodingId, EncodingRef}; |
| 14 | +use crate::iter::{ArrayIterator, ArrayIteratorAdapter}; |
| 15 | +use crate::stats::StatsSet; |
| 16 | +use crate::stream::{ArrayStream, ArrayStreamAdapter}; |
| 17 | +use crate::{ArrayChildrenIterator, ArrayDType, ArrayMetadata, ArrayTrait, Context}; |
| 18 | + |
| 19 | +mod owned; |
| 20 | +mod viewed; |
| 21 | + |
| 22 | +/// A central type for all Vortex arrays, which are known length sequences of typed and possibly compressed data. |
| 23 | +/// |
| 24 | +/// This is the main entrypoint for working with in-memory Vortex data, and dispatches work over the underlying encoding or memory representations. |
| 25 | +#[derive(Debug, Clone)] |
| 26 | +pub struct ArrayData(pub(crate) InnerArrayData); |
| 27 | + |
| 28 | +// TODO(ngates): make this non-pub once TypedArray disappears |
| 29 | +#[derive(Debug, Clone)] |
| 30 | +pub(crate) enum InnerArrayData { |
| 31 | + /// Owned [`ArrayData`] with serialized metadata, backed by heap-allocated memory. |
| 32 | + Owned(OwnedArrayData), |
| 33 | + /// Zero-copy view over flatbuffer-encoded [`ArrayData`] data, created without eager serialization. |
| 34 | + Viewed(ViewedArrayData), |
| 35 | +} |
| 36 | + |
| 37 | +impl ArrayData { |
| 38 | + pub fn try_new_owned( |
| 39 | + encoding: EncodingRef, |
| 40 | + dtype: DType, |
| 41 | + len: usize, |
| 42 | + metadata: Arc<dyn ArrayMetadata>, |
| 43 | + buffer: Option<Buffer>, |
| 44 | + children: Arc<[ArrayData]>, |
| 45 | + statistics: StatsSet, |
| 46 | + ) -> VortexResult<Self> { |
| 47 | + let data = OwnedArrayData { |
| 48 | + encoding, |
| 49 | + dtype, |
| 50 | + len, |
| 51 | + metadata, |
| 52 | + buffer, |
| 53 | + children, |
| 54 | + stats_map: Arc::new(RwLock::new(statistics)), |
| 55 | + }; |
| 56 | + |
| 57 | + let array = ArrayData(InnerArrayData::Owned(data)); |
| 58 | + // Validate here that the metadata correctly parses, so that an encoding can infallibly |
| 59 | + // FIXME(robert): Encoding::with_dyn no longer eagerly validates metadata, come up with a way to validate metadata |
| 60 | + encoding.with_dyn(&array, &mut |_| Ok(()))?; |
| 61 | + |
| 62 | + Ok(array) |
| 63 | + } |
| 64 | + |
| 65 | + pub fn try_new_viewed<F>( |
| 66 | + ctx: Arc<Context>, |
| 67 | + dtype: DType, |
| 68 | + len: usize, |
| 69 | + flatbuffer: Buffer, |
| 70 | + flatbuffer_init: F, |
| 71 | + buffers: Vec<Buffer>, |
| 72 | + ) -> VortexResult<Self> |
| 73 | + where |
| 74 | + F: FnOnce(&[u8]) -> VortexResult<crate::flatbuffers::Array>, |
| 75 | + { |
| 76 | + let array = flatbuffer_init(flatbuffer.as_ref())?; |
| 77 | + let flatbuffer_loc = array._tab.loc(); |
| 78 | + |
| 79 | + let encoding = ctx.lookup_encoding(array.encoding()).ok_or_else( |
| 80 | + || { |
| 81 | + let pretty_known_encodings = ctx.encodings() |
| 82 | + .format_with("\n", |e, f| f(&format_args!("- {}", e.id()))); |
| 83 | + vortex_err!(InvalidSerde: "Unknown encoding with ID {:#02x}. Known encodings:\n{pretty_known_encodings}", array.encoding()) |
| 84 | + }, |
| 85 | + )?; |
| 86 | + |
| 87 | + let view = ViewedArrayData { |
| 88 | + encoding, |
| 89 | + dtype, |
| 90 | + len, |
| 91 | + flatbuffer, |
| 92 | + flatbuffer_loc, |
| 93 | + buffers: buffers.into(), |
| 94 | + ctx, |
| 95 | + }; |
| 96 | + |
| 97 | + // Validate here that the metadata correctly parses, so that an encoding can infallibly |
| 98 | + // implement Encoding::with_view(). |
| 99 | + // FIXME(ngates): validate the metadata |
| 100 | + ArrayData::from(view.clone()).with_dyn(|_| Ok::<(), VortexError>(()))?; |
| 101 | + |
| 102 | + Ok(view.into()) |
| 103 | + } |
| 104 | + |
| 105 | + pub fn encoding(&self) -> EncodingRef { |
| 106 | + match &self.0 { |
| 107 | + InnerArrayData::Owned(d) => d.encoding(), |
| 108 | + InnerArrayData::Viewed(v) => v.encoding(), |
| 109 | + } |
| 110 | + } |
| 111 | + |
| 112 | + /// Returns the number of logical elements in the array. |
| 113 | + #[allow(clippy::same_name_method)] |
| 114 | + pub fn len(&self) -> usize { |
| 115 | + match &self.0 { |
| 116 | + InnerArrayData::Owned(d) => d.len(), |
| 117 | + InnerArrayData::Viewed(v) => v.len(), |
| 118 | + } |
| 119 | + } |
| 120 | + |
| 121 | + pub fn is_empty(&self) -> bool { |
| 122 | + match &self.0 { |
| 123 | + InnerArrayData::Owned(d) => d.is_empty(), |
| 124 | + InnerArrayData::Viewed(v) => v.is_empty(), |
| 125 | + } |
| 126 | + } |
| 127 | + |
| 128 | + /// Total size of the array in bytes, including all children and buffers. |
| 129 | + pub fn nbytes(&self) -> usize { |
| 130 | + self.with_dyn(|a| a.nbytes()) |
| 131 | + } |
| 132 | + |
| 133 | + pub fn child<'a>(&'a self, idx: usize, dtype: &'a DType, len: usize) -> VortexResult<Self> { |
| 134 | + match &self.0 { |
| 135 | + InnerArrayData::Owned(d) => d.child(idx, dtype, len).cloned(), |
| 136 | + InnerArrayData::Viewed(v) => v |
| 137 | + .child(idx, dtype, len) |
| 138 | + .map(|view| ArrayData(InnerArrayData::Viewed(view))), |
| 139 | + } |
| 140 | + } |
| 141 | + |
| 142 | + /// Returns a Vec of Arrays with all the array's child arrays. |
| 143 | + pub fn children(&self) -> Vec<ArrayData> { |
| 144 | + match &self.0 { |
| 145 | + InnerArrayData::Owned(d) => d.children().iter().cloned().collect_vec(), |
| 146 | + InnerArrayData::Viewed(v) => v.children(), |
| 147 | + } |
| 148 | + } |
| 149 | + |
| 150 | + /// Returns the number of child arrays |
| 151 | + pub fn nchildren(&self) -> usize { |
| 152 | + match &self.0 { |
| 153 | + InnerArrayData::Owned(d) => d.nchildren(), |
| 154 | + InnerArrayData::Viewed(v) => v.nchildren(), |
| 155 | + } |
| 156 | + } |
| 157 | + |
| 158 | + pub fn depth_first_traversal(&self) -> ArrayChildrenIterator { |
| 159 | + ArrayChildrenIterator::new(self.clone()) |
| 160 | + } |
| 161 | + |
| 162 | + /// Count the number of cumulative buffers encoded by self. |
| 163 | + pub fn cumulative_nbuffers(&self) -> usize { |
| 164 | + self.children() |
| 165 | + .iter() |
| 166 | + .map(|child| child.cumulative_nbuffers()) |
| 167 | + .sum::<usize>() |
| 168 | + + if self.buffer().is_some() { 1 } else { 0 } |
| 169 | + } |
| 170 | + |
| 171 | + /// Return the buffer offsets and the total length of all buffers, assuming the given alignment. |
| 172 | + /// This includes all child buffers. |
| 173 | + pub fn all_buffer_offsets(&self, alignment: usize) -> Vec<u64> { |
| 174 | + let mut offsets = vec![]; |
| 175 | + let mut offset = 0; |
| 176 | + |
| 177 | + for col_data in self.depth_first_traversal() { |
| 178 | + if let Some(buffer) = col_data.buffer() { |
| 179 | + offsets.push(offset as u64); |
| 180 | + |
| 181 | + let buffer_size = buffer.len(); |
| 182 | + let aligned_size = (buffer_size + (alignment - 1)) & !(alignment - 1); |
| 183 | + offset += aligned_size; |
| 184 | + } |
| 185 | + } |
| 186 | + offsets.push(offset as u64); |
| 187 | + |
| 188 | + offsets |
| 189 | + } |
| 190 | + |
| 191 | + /// Get back the (possibly owned) metadata for the array. |
| 192 | + /// |
| 193 | + /// View arrays will return a reference to their bytes, while heap-backed arrays |
| 194 | + /// must first serialize their metadata, returning an owned byte array to the caller. |
| 195 | + pub fn metadata(&self) -> VortexResult<Cow<[u8]>> { |
| 196 | + match &self.0 { |
| 197 | + InnerArrayData::Owned(array_data) => { |
| 198 | + // Heap-backed arrays must first try and serialize the metadata. |
| 199 | + let owned_meta: Vec<u8> = array_data |
| 200 | + .metadata() |
| 201 | + .try_serialize_metadata()? |
| 202 | + .as_ref() |
| 203 | + .to_owned(); |
| 204 | + |
| 205 | + Ok(Cow::Owned(owned_meta)) |
| 206 | + } |
| 207 | + InnerArrayData::Viewed(array_view) => { |
| 208 | + // View arrays have direct access to metadata bytes. |
| 209 | + array_view |
| 210 | + .metadata() |
| 211 | + .ok_or_else(|| vortex_err!("things")) |
| 212 | + .map(Cow::Borrowed) |
| 213 | + } |
| 214 | + } |
| 215 | + } |
| 216 | + |
| 217 | + pub fn buffer(&self) -> Option<&Buffer> { |
| 218 | + match &self.0 { |
| 219 | + InnerArrayData::Owned(d) => d.buffer(), |
| 220 | + InnerArrayData::Viewed(v) => v.buffer(), |
| 221 | + } |
| 222 | + } |
| 223 | + |
| 224 | + pub fn into_buffer(self) -> Option<Buffer> { |
| 225 | + match self.0 { |
| 226 | + InnerArrayData::Owned(d) => d.into_buffer(), |
| 227 | + InnerArrayData::Viewed(v) => v.buffer().cloned(), |
| 228 | + } |
| 229 | + } |
| 230 | + |
| 231 | + pub fn into_array_iterator(self) -> impl ArrayIterator { |
| 232 | + ArrayIteratorAdapter::new(self.dtype().clone(), std::iter::once(Ok(self))) |
| 233 | + } |
| 234 | + |
| 235 | + pub fn into_array_stream(self) -> impl ArrayStream { |
| 236 | + ArrayStreamAdapter::new( |
| 237 | + self.dtype().clone(), |
| 238 | + futures_util::stream::once(ready(Ok(self))), |
| 239 | + ) |
| 240 | + } |
| 241 | + |
| 242 | + /// Checks whether array is of a given encoding. |
| 243 | + pub fn is_encoding(&self, id: EncodingId) -> bool { |
| 244 | + self.encoding().id() == id |
| 245 | + } |
| 246 | + |
| 247 | + #[inline] |
| 248 | + pub fn with_dyn<R, F>(&self, mut f: F) -> R |
| 249 | + where |
| 250 | + F: FnMut(&dyn ArrayTrait) -> R, |
| 251 | + { |
| 252 | + let mut result = None; |
| 253 | + |
| 254 | + self.encoding() |
| 255 | + .with_dyn(self, &mut |array| { |
| 256 | + // Sanity check that the encoding implements the correct array trait |
| 257 | + debug_assert!( |
| 258 | + match array.dtype() { |
| 259 | + DType::Null => array.as_null_array().is_some(), |
| 260 | + DType::Bool(_) => array.as_bool_array().is_some(), |
| 261 | + DType::Primitive(..) => array.as_primitive_array().is_some(), |
| 262 | + DType::Utf8(_) => array.as_utf8_array().is_some(), |
| 263 | + DType::Binary(_) => array.as_binary_array().is_some(), |
| 264 | + DType::Struct(..) => array.as_struct_array().is_some(), |
| 265 | + DType::List(..) => array.as_list_array().is_some(), |
| 266 | + DType::Extension(..) => array.as_extension_array().is_some(), |
| 267 | + }, |
| 268 | + "Encoding {} does not implement the variant trait for {}", |
| 269 | + self.encoding().id(), |
| 270 | + array.dtype() |
| 271 | + ); |
| 272 | + |
| 273 | + result = Some(f(array)); |
| 274 | + Ok(()) |
| 275 | + }) |
| 276 | + .unwrap_or_else(|err| { |
| 277 | + vortex_panic!( |
| 278 | + err, |
| 279 | + "Failed to convert Array to {}", |
| 280 | + std::any::type_name::<dyn ArrayTrait>() |
| 281 | + ) |
| 282 | + }); |
| 283 | + |
| 284 | + // Now we unwrap the optional, which we know to be populated by the closure. |
| 285 | + result.vortex_expect("Failed to get result from Array::with_dyn") |
| 286 | + } |
| 287 | +} |
| 288 | + |
| 289 | +impl Display for ArrayData { |
| 290 | + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { |
| 291 | + let prefix = match &self.0 { |
| 292 | + InnerArrayData::Owned(_) => "", |
| 293 | + InnerArrayData::Viewed(_) => "$", |
| 294 | + }; |
| 295 | + write!( |
| 296 | + f, |
| 297 | + "{}{}({}, len={})", |
| 298 | + prefix, |
| 299 | + self.encoding().id(), |
| 300 | + self.dtype(), |
| 301 | + self.len() |
| 302 | + ) |
| 303 | + } |
| 304 | +} |
0 commit comments