diff --git a/crates/build/re_types_builder/src/codegen/cpp/array_builder.rs b/crates/build/re_types_builder/src/codegen/cpp/array_builder.rs
index 1a000e59672f..f5af9a134e29 100644
--- a/crates/build/re_types_builder/src/codegen/cpp/array_builder.rs
+++ b/crates/build/re_types_builder/src/codegen/cpp/array_builder.rs
@@ -66,11 +66,6 @@ fn arrow_array_builder_type_and_declaration(
             );
             ident
         }
-        Type::Binary => {
-            let ident = format_ident!("LargeBinaryBuilder");
-            declarations.insert("arrow", ForwardDecl::Class(ident.clone()));
-            ident
-        }
         Type::String => {
             let ident = format_ident!("StringBuilder");
             declarations.insert("arrow", ForwardDecl::Class(ident.clone()));
diff --git a/crates/build/re_types_builder/src/codegen/cpp/mod.rs b/crates/build/re_types_builder/src/codegen/cpp/mod.rs
index a7bd212ec272..9d39c252b961 100644
--- a/crates/build/re_types_builder/src/codegen/cpp/mod.rs
+++ b/crates/build/re_types_builder/src/codegen/cpp/mod.rs
@@ -2003,7 +2003,6 @@ fn quote_fill_arrow_array_builder(
                 ElementType::Float16 => Some("HalfFloatBuilder"),
                 ElementType::Float32 => Some("FloatBuilder"),
                 ElementType::Float64 => Some("DoubleBuilder"),
-                ElementType::Binary => Some("BinaryBuilder"),
                 ElementType::String => Some("StringBuilder"),
                 ElementType::Object{..} => None,
             };
@@ -2234,7 +2233,7 @@ fn quote_append_single_value_to_builder(
     value_access: &TokenStream,
     includes: &mut Includes,
 ) -> TokenStream {
-    match typ {
+    match &typ {
         Type::Unit => {
             quote!(ARROW_RETURN_NOT_OK(#value_builder->AppendNull());)
         }
@@ -2253,11 +2252,6 @@ fn quote_append_single_value_to_builder(
         | Type::String => {
             quote!(ARROW_RETURN_NOT_OK(#value_builder->Append(#value_access));)
         }
-        Type::Binary => {
-            quote!(
-                ARROW_RETURN_NOT_OK(#value_builder->Append(#value_access.data(), static_cast<int64_t>(#value_access.size())));
-            )
-        }
         Type::Float16 => {
             // Cast `rerun::half` to a `uint16_t`
             quote! {
@@ -2296,14 +2290,6 @@ fn quote_append_single_value_to_builder(
                     );
                 }
             }
-            ElementType::Binary => {
-                quote! {
-                    for (size_t item_idx = 0; item_idx < #num_items_per_element; item_idx += 1) {
-                        auto&& data = #value_access[elem_idx].data;
-                        ARROW_RETURN_NOT_OK(#value_builder->Append(data.data(), static_cast<int64_t>(data.size())));
-                    }
-                }
-            }
             ElementType::String => {
                 quote! {
                     for (size_t item_idx = 0; item_idx < #num_items_per_element; item_idx += 1) {
@@ -2461,10 +2447,6 @@ fn quote_field_type(includes: &mut Includes, obj_field: &ObjectField) -> TokenSt
         }
         Type::Float32 => quote! { float },
         Type::Float64 => quote! { double },
-        Type::Binary => {
-            includes.insert_rerun("collection.hpp");
-            quote! { rerun::Collection<uint8_t> }
-        }
         Type::String => {
             includes.insert_system("string");
             quote! { std::string }
@@ -2525,10 +2507,6 @@ fn quote_element_type(includes: &mut Includes, typ: &ElementType) -> TokenStream
         }
         ElementType::Float32 => quote! { float },
         ElementType::Float64 => quote! { double },
-        ElementType::Binary => {
-            includes.insert_rerun("collection.hpp");
-            quote! { rerun::Collection<uint8_t> }
-        }
         ElementType::String => {
            includes.insert_system("string");
            quote! { std::string }
@@ -2670,7 +2648,6 @@ fn quote_arrow_datatype(
         Type::Float16 => quote!(arrow::float16()),
         Type::Float32 => quote!(arrow::float32()),
         Type::Float64 => quote!(arrow::float64()),
-        Type::Binary => quote!(arrow::large_binary()),
         Type::String => quote!(arrow::utf8()),
         Type::Bool => quote!(arrow::boolean()),
diff --git a/crates/build/re_types_builder/src/codegen/docs/website.rs b/crates/build/re_types_builder/src/codegen/docs/website.rs
index e5532cd000b3..9ef210c22afa 100644
--- a/crates/build/re_types_builder/src/codegen/docs/website.rs
+++ b/crates/build/re_types_builder/src/codegen/docs/website.rs
@@ -414,7 +414,6 @@ fn write_fields(reporter: &Reporter, objects: &Objects, o: &mut String, object:
         Type::Float16 => atomic("float16"),
         Type::Float32 => atomic("float32"),
         Type::Float64 => atomic("float64"),
-        Type::Binary => atomic("binary"),
         Type::String => atomic("utf8"),
 
         Type::Array { elem_type, length } => {
diff --git a/crates/build/re_types_builder/src/codegen/python/mod.rs b/crates/build/re_types_builder/src/codegen/python/mod.rs
index 7cf1cf63a41b..57d745fc3a71 100644
--- a/crates/build/re_types_builder/src/codegen/python/mod.rs
+++ b/crates/build/re_types_builder/src/codegen/python/mod.rs
@@ -1673,7 +1673,6 @@ fn quote_field_type_from_field(
         | Type::Int64 => "int".to_owned(),
         Type::Bool => "bool".to_owned(),
         Type::Float16 | Type::Float32 | Type::Float64 => "float".to_owned(),
-        Type::Binary => "bytes".to_owned(),
         Type::String => "str".to_owned(),
         Type::Array {
             elem_type,
@@ -1692,7 +1691,6 @@ fn quote_field_type_from_field(
                 ElementType::Float16 => "npt.NDArray[np.float16]".to_owned(),
                 ElementType::Float32 => "npt.NDArray[np.float32]".to_owned(),
                 ElementType::Float64 => "npt.NDArray[np.float64]".to_owned(),
-                ElementType::Binary => "list[bytes]".to_owned(),
                 ElementType::String => "list[str]".to_owned(),
                 ElementType::Object { .. } => {
                     let typ = quote_type_from_element_type(elem_type);
@@ -1754,13 +1752,6 @@ fn quote_field_converter_from_field(
                 "float".to_owned()
             }
         }
-        Type::Binary => {
-            if field.is_nullable {
-                "bytes_or_none".to_owned()
-            } else {
-                "bytes".to_owned()
-            }
-        }
         Type::String => {
             if field.is_nullable {
                 "str_or_none".to_owned()
@@ -1877,7 +1868,6 @@ fn quote_type_from_type(typ: &Type) -> String {
         | Type::Int64 => "int".to_owned(),
         Type::Bool => "bool".to_owned(),
         Type::Float16 | Type::Float32 | Type::Float64 => "float".to_owned(),
-        Type::Binary => "bytes".to_owned(),
         Type::String => "str".to_owned(),
         Type::Object { fqname } => fqname_to_type(fqname),
         Type::Array { elem_type, .. } | Type::Vector { elem_type } => {
@@ -2036,7 +2026,6 @@ fn np_dtype_from_type(t: &Type) -> Option<&'static str> {
         Type::Float32 => Some("np.float32"),
         Type::Float64 => Some("np.float64"),
         Type::Unit
-        | Type::Binary
         | Type::String
         | Type::Array { .. }
         | Type::Vector { .. }
@@ -2133,11 +2122,7 @@ fn quote_arrow_serialization(
                 code.push_indented(2, &field_fwd, 1);
             }
 
-            Type::Unit
-            | Type::Binary
-            | Type::String
-            | Type::Array { .. }
-            | Type::Vector { .. } => {
+            Type::Unit | Type::String | Type::Array { .. } | Type::Vector { .. } => {
                 return Err(
                     "We lack codegen for arrow-serialization of general structs".to_owned()
                 );
@@ -2264,7 +2249,6 @@ return pa.array(pa_data, type=data_type)
                 | Type::Float16
                 | Type::Float32
                 | Type::Float64
-                | Type::Binary
                 | Type::String => {
                     let datatype = quote_arrow_datatype(&type_registry.get(&field.fqname));
                     format!("pa.array({variant_kind_list}, type={datatype})")
@@ -2822,7 +2806,7 @@ fn quote_arrow_datatype(datatype: &DataType) -> String {
         DataType::Atomic(AtomicDataType::Float32) => "pa.float32()".to_owned(),
         DataType::Atomic(AtomicDataType::Float64) => "pa.float64()".to_owned(),
 
-        DataType::Binary => "pa.large_binary()".to_owned(),
+        DataType::Binary => "pa.binary()".to_owned(),
 
         DataType::Utf8 => "pa.utf8()".to_owned(),
diff --git a/crates/build/re_types_builder/src/codegen/rust/api.rs b/crates/build/re_types_builder/src/codegen/rust/api.rs
index 1b229d237747..0b57dc79b7ea 100644
--- a/crates/build/re_types_builder/src/codegen/rust/api.rs
+++ b/crates/build/re_types_builder/src/codegen/rust/api.rs
@@ -783,7 +783,6 @@ impl quote::ToTokens for TypeTokenizer<'_> {
             Type::Float16 => quote!(half::f16),
             Type::Float32 => quote!(f32),
             Type::Float64 => quote!(f64),
-            Type::Binary => quote!(::arrow::buffer::Buffer),
             Type::String => quote!(::re_types_core::ArrowString),
             Type::Array { elem_type, length } => {
                 if *unwrap {
@@ -822,7 +821,6 @@ impl quote::ToTokens for &ElementType {
             ElementType::Float16 => quote!(half::f16),
             ElementType::Float32 => quote!(f32),
             ElementType::Float64 => quote!(f64),
-            ElementType::Binary => quote!(::arrow::buffer::Buffer),
             ElementType::String => quote!(::re_types_core::ArrowString),
             ElementType::Object { fqname } => quote_fqname_as_type_path(fqname),
         }
diff --git a/crates/build/re_types_builder/src/codegen/rust/arrow.rs b/crates/build/re_types_builder/src/codegen/rust/arrow.rs
index 92fe805feb29..f3b5025fa22e 100644
--- a/crates/build/re_types_builder/src/codegen/rust/arrow.rs
+++ b/crates/build/re_types_builder/src/codegen/rust/arrow.rs
@@ -35,7 +35,7 @@ impl quote::ToTokens for ArrowDataTypeTokenizer<'_> {
             DataType::Atomic(AtomicDataType::Float32) => quote!(DataType::Float32),
             DataType::Atomic(AtomicDataType::Float64) => quote!(DataType::Float64),
 
-            DataType::Binary => quote!(DataType::LargeBinary),
+            DataType::Binary => quote!(DataType::Binary),
 
             DataType::Utf8 => quote!(DataType::Utf8),
diff --git a/crates/build/re_types_builder/src/codegen/rust/deserializer.rs b/crates/build/re_types_builder/src/codegen/rust/deserializer.rs
index 01b7132272f7..bd26513db2fb 100644
--- a/crates/build/re_types_builder/src/codegen/rust/deserializer.rs
+++ b/crates/build/re_types_builder/src/codegen/rust/deserializer.rs
@@ -537,75 +537,11 @@ fn quote_arrow_field_deserializer(
             }
         }
 
-        DataType::Binary => {
-            // Special code to handle deserializing both 32-bit and 64-bit offsets (BinaryArray vs LargeBinaryArray)
-            quote! {{
-                fn extract_from_binary<O>(
-                    arrow_data: &arrow::array::GenericByteArray<arrow::array::GenericBinaryType<O>>,
-                ) -> DeserializationResult<Vec<Option<arrow::buffer::Buffer>>>
-                where
-                    O: ::arrow::array::OffsetSizeTrait,
-                {
-                    use ::arrow::array::Array as _;
-                    use ::re_types_core::arrow_zip_validity::ZipValidity;
-
-                    let arrow_data_buf = arrow_data.values();
-                    let offsets = arrow_data.offsets();
-
-                    ZipValidity::new_with_validity(offsets.windows(2), arrow_data.nulls())
-                        .map(|elem| {
-                            elem.map(|window| {
-                                // NOTE: Do _not_ use `Buffer::sliced`, it panics on malformed inputs.
-
-                                let start = window[0].as_usize();
-                                let end = window[1].as_usize();
-                                let len = end - start;
-
-                                // NOTE: It is absolutely crucial we explicitly handle the
-                                // bounds-checks manually first, otherwise rustc completely chokes
-                                // when slicing the data (as in: a 100x perf drop)!
-                                if arrow_data_buf.len() < end {
-                                    // error context is appended below during final collection
-                                    return Err(DeserializationError::offset_slice_oob(
-                                        (start, end),
-                                        arrow_data_buf.len(),
-                                    ));
-                                }
-
-                                #[allow(unsafe_code, clippy::undocumented_unsafe_blocks)]
-                                let data = arrow_data_buf.slice_with_length(start, len);
-                                Ok(data)
-                            })
-                            .transpose()
-                        })
-                        .collect::<DeserializationResult<Vec<Option<_>>>>()
-                }
-
-                if let Some(arrow_data) = #data_src.as_any().downcast_ref::<arrow::array::BinaryArray>() {
-                    extract_from_binary(arrow_data)
-                        .with_context(#obj_field_fqname)?
-                        .into_iter()
-                } else if let Some(arrow_data) = #data_src.as_any().downcast_ref::<arrow::array::LargeBinaryArray>()
-                {
-                    extract_from_binary(arrow_data)
-                        .with_context(#obj_field_fqname)?
-                        .into_iter()
-                } else {
-                    let expected = Self::arrow_datatype();
-                    let actual = arrow_data.data_type().clone();
-                    return Err(DeserializationError::datatype_mismatch(expected, actual))
-                        .with_context(#obj_field_fqname);
-                }
-            }}
-        }
-
         DataType::Utf8 => {
-            let quoted_downcast = quote_array_downcast(
-                obj_field_fqname,
-                data_src,
-                quote!(StringArray),
-                quoted_datatype,
-            );
+            let quoted_downcast = {
+                let cast_as = quote!(StringArray);
+                quote_array_downcast(obj_field_fqname, data_src, cast_as, quoted_datatype)
+            };
 
             let quoted_iter_transparency = quote_iterator_transparency(
                 objects,
@@ -641,8 +577,7 @@ fn quote_arrow_field_deserializer(
                                     (start, end), #data_src_buf.len(),
                                 ));
                             }
-
-                            #[allow(unsafe_code, clippy::undocumented_unsafe_blocks)]
+                            #[allow(unsafe_code, clippy::undocumented_unsafe_blocks)] // TODO(apache/arrow-rs#6900): slice_with_length_unchecked unsafe when https://github.com/apache/arrow-rs/pull/6901 is merged and released
                             let data = #data_src_buf.slice_with_length(start, len);
                             Ok(data)
@@ -889,7 +824,7 @@ fn quote_arrow_field_deserializer(
             quote!(#fqname_use::from_arrow_opt(#data_src).with_context(#obj_field_fqname)?.into_iter())
         }
 
-        DataType::Object { .. } => unimplemented!("{datatype:#?}"),
+        _ => unimplemented!("{datatype:#?}"),
     }
 }
diff --git a/crates/build/re_types_builder/src/codegen/rust/serializer.rs b/crates/build/re_types_builder/src/codegen/rust/serializer.rs
index 3a27808bf03f..48108bfcd2bb 100644
--- a/crates/build/re_types_builder/src/codegen/rust/serializer.rs
+++ b/crates/build/re_types_builder/src/codegen/rust/serializer.rs
@@ -583,14 +583,7 @@ fn quote_arrow_field_serializer(
             }
         }
 
-        DataType::Binary | DataType::Utf8 => {
-            let is_binary = datatype.to_logical_type() == &DataType::Binary;
-            let as_bytes = if is_binary {
-                quote!()
-            } else {
-                quote!(.as_bytes())
-            };
-
+        DataType::Utf8 => {
             // NOTE: We need values for all slots, regardless of what the validity says,
             // hence `unwrap_or_default`.
             let (quoted_member_accessor, quoted_transparent_length) = if inner_is_arrow_transparent
@@ -630,7 +623,7 @@ fn quote_arrow_field_serializer(
             let inner_data_and_offsets = if elements_are_nullable {
                 quote! {
-                    let offsets = arrow::buffer::OffsetBuffer::from_lengths(
+                    let offsets = arrow::buffer::OffsetBuffer::<i32>::from_lengths(
                         #data_src.iter().map(|opt| opt.as_ref() #quoted_transparent_length .unwrap_or_default())
                     );
@@ -643,13 +636,13 @@ fn quote_arrow_field_serializer(
                     // NOTE: Flattening to remove the guaranteed layer of nullability: we don't care
                     // about it while building the backing buffer since it's all offsets driven.
                     for data in #data_src.iter().flatten() {
-                        buffer_builder.append_slice(data #quoted_member_accessor #as_bytes);
+                        buffer_builder.append_slice(data #quoted_member_accessor.as_bytes());
                     }
                     let inner_data: arrow::buffer::Buffer = buffer_builder.finish();
                 }
             } else {
                 quote! {
-                    let offsets = arrow::buffer::OffsetBuffer::from_lengths(
+                    let offsets = arrow::buffer::OffsetBuffer::<i32>::from_lengths(
                         #data_src.iter() #quoted_transparent_length
                     );
@@ -660,29 +653,22 @@ fn quote_arrow_field_serializer(
                     let mut buffer_builder = arrow::array::builder::BufferBuilder::<u8>::new(capacity);
                     for data in &#data_src {
-                        buffer_builder.append_slice(data #quoted_member_accessor #as_bytes);
+                        buffer_builder.append_slice(data #quoted_member_accessor.as_bytes());
                     }
                     let inner_data: arrow::buffer::Buffer = buffer_builder.finish();
                 }
             };
 
-            if is_binary {
-                quote! {{
-                    #inner_data_and_offsets
-                    as_array_ref(LargeBinaryArray::new(offsets, inner_data, #validity_src))
-                }}
-            } else {
-                quote! {{
-                    #inner_data_and_offsets
-
-                    // Safety: we're building this from actual native strings, so no need to do the
-                    // whole utf8 validation _again_.
-                    // It would be nice to use quote_comment here and put this safety notice in the generated code,
-                    // but that seems to push us over some complexity limit causing rustfmt to fail.
-                    #[allow(unsafe_code, clippy::undocumented_unsafe_blocks)]
-                    as_array_ref(unsafe { StringArray::new_unchecked(offsets, inner_data, #validity_src) })
-                }}
-            }
+            quote! {{
+                #inner_data_and_offsets
+
+                // Safety: we're building this from actual native strings, so no need to do the
+                // whole utf8 validation _again_.
+                // It would be nice to use quote_comment here and put this safety notice in the generated code,
+                // but that seems to push us over some complexity limit causing rustfmt to fail.
+                #[allow(unsafe_code, clippy::undocumented_unsafe_blocks)]
+                as_array_ref(unsafe { StringArray::new_unchecked(offsets, inner_data, #validity_src) })
+            }}
         }
 
         DataType::List(inner_field) | DataType::FixedSizeList(inner_field, _) => {
@@ -933,6 +919,6 @@ fn quote_arrow_field_serializer(
             }}
         }
 
-        DataType::Object { .. } => unimplemented!("{datatype:#?}"),
+        _ => unimplemented!("{datatype:#?}"),
     }
 }
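The Utf8 arm above emits generated code that builds one concatenated byte buffer plus an `OffsetBuffer::<i32>`, then wraps both in a `StringArray`. A minimal hand-written sketch of that same layout, using only arrow-rs (the safe constructor stands in for the generated `new_unchecked`; the values are illustrative):

```rust
use arrow::{
    array::StringArray,
    buffer::{Buffer, OffsetBuffer},
};

fn main() {
    // "hey" + "you", concatenated into a single backing buffer.
    let inner_data = Buffer::from(b"heyyou".as_slice());
    let offsets = OffsetBuffer::<i32>::from_lengths([3, 3]); // -> offsets [0, 3, 6]

    // The generated code uses `StringArray::new_unchecked` to skip re-validating
    // UTF-8; the safe constructor below does the same work plus that check.
    let array = StringArray::new(offsets, inner_data, None);
    assert_eq!(array.value(1), "you");
}
```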
diff --git a/crates/build/re_types_builder/src/data_type.rs b/crates/build/re_types_builder/src/data_type.rs
index 28e0533abd19..68b7eb3f7f6c 100644
--- a/crates/build/re_types_builder/src/data_type.rs
+++ b/crates/build/re_types_builder/src/data_type.rs
@@ -112,7 +112,6 @@ impl std::fmt::Display for AtomicDataType {
 pub enum DataType {
     Atomic(AtomicDataType),
 
-    // 32-bit or 64-bit
     Binary,
 
     Utf8,
@@ -154,12 +153,8 @@ impl DataType {
 pub enum LazyDatatype {
     Atomic(AtomicDataType),
 
-    /// A list of bytes of arbitrary length.
-    ///
-    /// 32-bit or 64-bit
     Binary,
 
-    /// Utf8
     Utf8,
 
     /// Elements are non-nullable
diff --git a/crates/build/re_types_builder/src/objects.rs b/crates/build/re_types_builder/src/objects.rs
index a173e3c03c05..1d2b3507eb59 100644
--- a/crates/build/re_types_builder/src/objects.rs
+++ b/crates/build/re_types_builder/src/objects.rs
@@ -1190,15 +1190,7 @@ pub enum Type {
     Float16,
     Float32,
     Float64,
-
-    /// A list of bytes of arbitrary length.
-    ///
-    /// 32-bit or 64-bit
-    Binary,
-
-    /// Utf8
     String,
-
     Array {
         elem_type: ElementType,
         length: usize,
@@ -1226,7 +1218,6 @@ impl From<ElementType> for Type {
         ElementType::Float16 => Self::Float16,
         ElementType::Float32 => Self::Float32,
         ElementType::Float64 => Self::Float64,
-        ElementType::Binary => Self::Binary,
         ElementType::String => Self::String,
         ElementType::Object { fqname } => Self::Object { fqname },
     }
@@ -1245,28 +1236,14 @@ impl Type {
         let typ = field_type.base_type();
         if let Some(type_override) = attrs.try_get::<String>(fqname, ATTR_RERUN_OVERRIDE_TYPE) {
-            match type_override.as_str() {
-                "binary" => {
-                    if typ == FbsBaseType::Vector && field_type.element() == FbsBaseType::UByte {
-                        return Self::Binary;
-                    } else {
-                        panic!("{fqname}: 'binary' can only be used on '[ubyte]', got {typ:?}")
-                    }
-                }
-                "float16" => {
-                    if matches!(typ, FbsBaseType::Array | FbsBaseType::Vector) {
-                        // Array of float16 handled later
-                    } else if typ == FbsBaseType::UShort {
-                        return Self::Float16;
-                    } else {
-                        panic!(
-                            "{fqname}: 'float16' can only be used on 'ushort' or `[ushort]`, got {typ:?}"
-                        )
-                    }
-                }
-                _ => {
-                    panic!("{fqname}: Unknown {ATTR_RERUN_OVERRIDE_TYPE:?}: {type_override:?}");
+            match (typ, type_override.as_str()) {
+                (FbsBaseType::UShort, "float16") => {
+                    return Self::Float16;
                 }
+                (FbsBaseType::Array | FbsBaseType::Vector, "float16") => {}
+                _ => unreachable!(
+                    "UShort -> float16 is the only permitted type override. Not {typ:#?}->{type_override}"
+                ),
             }
         }
@@ -1381,9 +1358,6 @@ impl Type {
             Self::Float64 => Some(Self::Vector {
                 elem_type: ElementType::Float64,
             }),
-            Self::Binary => Some(Self::Vector {
-                elem_type: ElementType::Binary,
-            }),
             Self::String => Some(Self::Vector {
                 elem_type: ElementType::String,
             }),
@@ -1424,7 +1398,6 @@ impl Type {
             | Self::Float16
             | Self::Float32
             | Self::Float64
-            | Self::Binary
             | Self::String
             | Self::Object { .. } => None,
         }
@@ -1465,7 +1438,7 @@ impl Type {
             | Self::Float32
             | Self::Float64 => true,
 
-            Self::Binary | Self::String | Self::Vector { .. } => false,
+            Self::String | Self::Vector { .. } => false,
 
             Self::Array { elem_type, .. } => elem_type.has_default_destructor(objects),
@@ -1550,18 +1523,8 @@ pub enum ElementType {
     Float16,
     Float32,
     Float64,
-
-    /// A list of bytes of arbitrary length.
-    ///
-    /// 32-bit or 64-bit
-    Binary,
-
-    /// Utf8
     String,
-
-    Object {
-        fqname: String,
-    },
+    Object { fqname: String },
 }
 
 impl ElementType {
@@ -1652,7 +1615,7 @@ impl ElementType {
             | Self::Float32
             | Self::Float64 => true,
 
-            Self::Binary | Self::String => false,
+            Self::String => false,
 
             Self::Object { fqname } => objects[fqname].has_default_destructor(objects),
         }
@@ -1674,7 +1637,7 @@ impl ElementType {
             | Self::Float16
             | Self::Float32
             | Self::Float64 => true,
-            Self::Bool | Self::Binary | Self::String | Self::Object { .. } => false,
+            Self::Bool | Self::Object { .. } | Self::String => false,
         }
     }
diff --git a/crates/build/re_types_builder/src/type_registry.rs b/crates/build/re_types_builder/src/type_registry.rs
index d79bdd36613c..fdb7e2c52fca 100644
--- a/crates/build/re_types_builder/src/type_registry.rs
+++ b/crates/build/re_types_builder/src/type_registry.rs
@@ -163,7 +163,6 @@ impl TypeRegistry {
             Type::Float16 => LazyDatatype::Atomic(AtomicDataType::Float16),
             Type::Float32 => LazyDatatype::Atomic(AtomicDataType::Float32),
             Type::Float64 => LazyDatatype::Atomic(AtomicDataType::Float64),
-            Type::Binary => LazyDatatype::Binary,
             Type::String => LazyDatatype::Utf8,
             Type::Array { elem_type, length } => LazyDatatype::FixedSizeList(
                 LazyField {
@@ -216,7 +215,6 @@ impl TypeRegistry {
             ElementType::Float16 => LazyDatatype::Atomic(AtomicDataType::Float16),
             ElementType::Float32 => LazyDatatype::Atomic(AtomicDataType::Float32),
             ElementType::Float64 => LazyDatatype::Atomic(AtomicDataType::Float64),
-            ElementType::Binary => LazyDatatype::Binary,
             ElementType::String => LazyDatatype::Utf8,
             ElementType::Object { fqname } => LazyDatatype::Unresolved { fqname },
         }
diff --git a/crates/store/re_chunk/src/chunk.rs b/crates/store/re_chunk/src/chunk.rs
index 8d664059021c..5e13dda091af 100644
--- a/crates/store/re_chunk/src/chunk.rs
+++ b/crates/store/re_chunk/src/chunk.rs
@@ -12,7 +12,7 @@ use arrow::{
 use itertools::{Either, Itertools as _, izip};
 use nohash_hasher::IntMap;
 
-use re_arrow_util::{ArrowArrayDowncastRef as _, widen_binary_arrays};
+use re_arrow_util::ArrowArrayDowncastRef as _;
 use re_byte_size::SizeBytes as _;
 use re_log_types::{
     AbsoluteTimeRange, EntityPath, NonMinI64, TimeInt, TimeType, Timeline, TimelineName,
@@ -101,8 +101,6 @@ impl ChunkComponents {
                 let Some(right_array) = right.get(descr) else {
                     anyhow::bail!("rhs is missing {descr:?}");
                 };
-                let left_array = widen_binary_arrays(left_array);
-                let right_array = widen_binary_arrays(right_array);
 
                 re_arrow_util::ensure_similar(&left_array.to_data(), &right_array.to_data())
                     .with_context(|| format!("Component {descr:?}"))?;
             }
diff --git a/crates/store/re_chunk/src/iter.rs b/crates/store/re_chunk/src/iter.rs
index f8e6465e44f2..ba1192fb4da7 100644
--- a/crates/store/re_chunk/src/iter.rs
+++ b/crates/store/re_chunk/src/iter.rs
@@ -4,7 +4,7 @@ use arrow::{
     array::{
         Array as ArrowArray, ArrayRef as ArrowArrayRef, ArrowPrimitiveType, BinaryArray,
         BooleanArray as ArrowBooleanArray, FixedSizeListArray as ArrowFixedSizeListArray,
-        LargeBinaryArray, ListArray as ArrowListArray, PrimitiveArray as ArrowPrimitiveArray,
+        ListArray as ArrowListArray, PrimitiveArray as ArrowPrimitiveArray,
         StringArray as ArrowStringArray, StructArray as ArrowStructArray,
     },
     buffer::{BooleanBuffer as ArrowBooleanBuffer, Buffer, ScalarBuffer as ArrowScalarBuffer},
@@ -12,7 +12,7 @@ use arrow::{
 };
 use itertools::{Either, Itertools as _, izip};
 
-use re_arrow_util::ArrowArrayDowncastRef as _;
+use re_arrow_util::{ArrowArrayDowncastRef as _, offsets_lengths};
 use re_log_types::{TimeInt, TimePoint, TimelineName};
 use re_span::Span;
 use re_types_core::{ArrowString, Component, ComponentDescriptor};
@@ -205,7 +205,7 @@ impl Chunk {
         };
 
         let offsets = list_array.offsets().iter().map(|idx| *idx as usize);
-        let lengths = list_array.offsets().lengths();
+        let lengths = offsets_lengths(list_array.offsets());
 
         if let Some(validity) = list_array.nulls() {
             Either::Right(Either::Left(
@@ -520,7 +520,7 @@ where
     let values = values.values();
     let offsets = inner_list_array.offsets();
-    let lengths = offsets.lengths().collect_vec();
+    let lengths = offsets_lengths(inner_list_array.offsets()).collect_vec();
 
     // NOTE: No need for validity checks here, `component_spans` already takes care of that.
     Either::Right(component_spans.map(move |span| {
@@ -533,7 +533,7 @@ where
     }))
 }
 
-// We special case `&[u8]` so that it works both for `List[u8]` and `Binary/LargeBinary` arrays.
+// We special case `&[u8]` so that it works both for `List[u8]` and `Binary` arrays.
 fn slice_as_u8<'a>(
     component_descriptor: ComponentDescriptor,
     array: &'a dyn ArrowArray,
@@ -542,31 +542,17 @@ fn slice_as_u8<'a>(
     if let Some(binary_array) = array.downcast_array_ref::<BinaryArray>() {
         let values = binary_array.values();
         let offsets = binary_array.offsets();
-        let lengths = offsets.lengths().collect_vec();
-
-        // NOTE: No need for validity checks here, `component_spans` already takes care of that.
-        Either::Left(Either::Left(component_spans.map(move |span| {
-            let offsets = &offsets[span.range()];
-            let lengths = &lengths[span.range()];
-            izip!(offsets, lengths)
-                // NOTE: Not an actual clone, just a refbump of the underlying buffer.
-                .map(|(&idx, &len)| values.clone().slice_with_length(idx as _, len))
-                .collect_vec()
-        })))
-    } else if let Some(binary_array) = array.downcast_array_ref::<LargeBinaryArray>() {
-        let values = binary_array.values();
-        let offsets = binary_array.offsets();
-        let lengths = offsets.lengths().collect_vec();
+        let lengths = offsets_lengths(binary_array.offsets()).collect_vec();
 
         // NOTE: No need for validity checks here, `component_spans` already takes care of that.
-        Either::Left(Either::Right(component_spans.map(move |span| {
+        Either::Left(component_spans.map(move |span| {
             let offsets = &offsets[span.range()];
             let lengths = &lengths[span.range()];
             izip!(offsets, lengths)
                 // NOTE: Not an actual clone, just a refbump of the underlying buffer.
                 .map(|(&idx, &len)| values.clone().slice_with_length(idx as _, len))
                 .collect_vec()
-        })))
+        }))
     } else {
         Either::Right(
             slice_as_buffer_native::<UInt8Type>(
@@ -653,7 +639,7 @@ where
     };
 
     let inner_offsets = inner_list_array.offsets();
-    let inner_lengths = inner_offsets.lengths().collect_vec();
+    let inner_lengths = offsets_lengths(inner_list_array.offsets()).collect_vec();
 
     let Some(fixed_size_list_array) = inner_list_array
         .values()
@@ -752,7 +738,7 @@ impl ChunkComponentSlicer for String {
         let values = utf8_array.values().clone();
         let offsets = utf8_array.offsets().clone();
-        let lengths = offsets.lengths().collect_vec();
+        let lengths = offsets_lengths(utf8_array.offsets()).collect_vec();
 
         // NOTE: No need for validity checks here, `component_spans` already takes care of that.
         Either::Right(component_spans.map(move |range| {
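`slice_as_u8` above hands out zero-copy views into a `BinaryArray`'s backing buffer: each element is an (offset, length) window, and slicing the buffer only bumps a refcount. A small standalone sketch of that pattern, assuming nothing beyond plain arrow-rs:

```rust
use arrow::{array::BinaryArray, buffer::Buffer};

fn main() {
    // Each element of a BinaryArray is an (offset, length) window into one
    // contiguous values buffer; slicing that buffer ref-bumps, it never copies.
    let array = BinaryArray::from_vec(vec![b"abc".as_slice(), b"de".as_slice()]);
    let offsets = array.offsets();
    let start = offsets[1] as usize;
    let len = (offsets[2] - offsets[1]) as usize;

    let second: Buffer = array.values().clone().slice_with_length(start, len);
    assert_eq!(second.as_slice(), b"de");
}
```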
diff --git a/crates/store/re_data_loader/src/loader_archetype.rs b/crates/store/re_data_loader/src/loader_archetype.rs
index e9caba767b66..d04551ece964 100644
--- a/crates/store/re_data_loader/src/loader_archetype.rs
+++ b/crates/store/re_data_loader/src/loader_archetype.rs
@@ -193,10 +193,7 @@ fn load_video(
         re_log_types::TimeCell::ZERO_DURATION,
     );
 
-    let video_asset = {
-        re_tracing::profile_scope!("serialize-as-arrow");
-        AssetVideo::new(contents)
-    };
+    let video_asset = AssetVideo::new(contents);
 
     let video_frame_reference_chunk = match video_asset.read_frame_timestamps_nanos() {
         Ok(frame_timestamps_nanos) => {
diff --git a/crates/store/re_sorbet/src/migrations/mod.rs b/crates/store/re_sorbet/src/migrations/mod.rs
index b17209c6945f..3084cb591429 100644
--- a/crates/store/re_sorbet/src/migrations/mod.rs
+++ b/crates/store/re_sorbet/src/migrations/mod.rs
@@ -1,10 +1,6 @@
 #![expect(non_snake_case)]
 
 //! These are the migrations that are introduced for each Sorbet version.
-//!
-//! When you introduce a breaking change, these are the steps:
-//! * Bump [`SorbetSchema::METADATA_VERSION`]
-//! * Add a new `mod vX_Y_Z__to__vX_Y_W`
 
 use std::cmp::Ordering;
 
@@ -22,7 +18,6 @@ mod make_list_arrays;
 mod v0_0_1__to__v0_0_2;
 mod v0_0_2__to__v0_1_0;
 mod v0_1_0__to__v0_1_1;
-mod v0_1_1__to__v0_1_2;
 
 /// This trait needs to be implemented by any new migrations. It ensures that
 /// all migrations adhere to the same contract.
@@ -114,7 +109,7 @@ pub fn migrate_record_batch(mut batch: RecordBatch) -> RecordBatch {
         Ok(batch_version) => match batch_version.cmp(&SorbetSchema::METADATA_VERSION) {
             Ordering::Equal => {
                 // Provide this code path as an early out to avoid unnecessary comparisons.
-                re_log::trace!("Batch version matches Sorbet version ({batch_version})");
+                re_log::trace!("Batch version matches Sorbet version.");
                 batch
             }
             Ordering::Less => {
@@ -125,11 +120,10 @@ pub fn migrate_record_batch(mut batch: RecordBatch) -> RecordBatch {
                     );
                     batch
                 } else {
-                    re_log::debug_once!("Performing migrations from {batch_version}…");
+                    re_log::trace!("Performing migrations…");
                     batch = maybe_apply::<v0_0_1__to__v0_0_2::Migration>(&batch_version, batch);
                     batch = maybe_apply::<v0_0_2__to__v0_1_0::Migration>(&batch_version, batch);
                     batch = maybe_apply::<v0_1_0__to__v0_1_1::Migration>(&batch_version, batch);
-                    batch = maybe_apply::<v0_1_1__to__v0_1_2::Migration>(&batch_version, batch);
                     batch
                 }
             }
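For reference, a minimal sketch of what a migration implementing the trait described above looks like. The constant/method shape mirrors the deleted `v0_1_1__to__v0_1_2` module in the next diff; the version numbers and no-op body here are purely illustrative:

```rust
use arrow::array::RecordBatch;

/// A hypothetical no-op migration following the same contract as the modules above.
pub struct Migration;

impl super::Migration for Migration {
    const SOURCE_VERSION: semver::Version = semver::Version::new(0, 1, 0);
    const TARGET_VERSION: semver::Version = semver::Version::new(0, 1, 1);

    fn migrate(batch: RecordBatch) -> RecordBatch {
        // A real migration would rewrite the offending columns/schema here.
        batch
    }
}
```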
diff --git a/crates/store/re_sorbet/src/migrations/v0_1_1__to__v0_1_2.rs b/crates/store/re_sorbet/src/migrations/v0_1_1__to__v0_1_2.rs
deleted file mode 100644
index afbf29df5bdd..000000000000
--- a/crates/store/re_sorbet/src/migrations/v0_1_1__to__v0_1_2.rs
+++ /dev/null
@@ -1,151 +0,0 @@
-//! Breaking changes:
-//! * `Blob` is encoded as `Binary` instead of `List[u8]`
-
-use std::sync::Arc;
-
-use arrow::{
-    array::{
-        Array, ArrayRef, AsArray as _, ListArray, RecordBatch, RecordBatchOptions, UInt8Array,
-    },
-    datatypes::{DataType, Field, FieldRef, Schema},
-};
-
-use re_log::ResultExt as _;
-
-pub struct Migration;
-
-impl super::Migration for Migration {
-    const SOURCE_VERSION: semver::Version = semver::Version::new(0, 1, 1);
-    const TARGET_VERSION: semver::Version = semver::Version::new(0, 1, 2);
-
-    fn migrate(batch: RecordBatch) -> RecordBatch {
-        migrate_blobs(batch)
-    }
-}
-
-/// Change datatype from `List[u8]` to `Binary` for blobs
-fn migrate_blobs(batch: RecordBatch) -> RecordBatch {
-    re_tracing::profile_function!();
-
-    /// Is this a `List<List<u8>>`?
-    fn is_list_list_u8(datatype: &DataType) -> bool {
-        if let DataType::List(list_field) = datatype
-            && let DataType::List(innermost_field) = list_field.data_type()
-        {
-            innermost_field.data_type() == &DataType::UInt8
-        } else {
-            false
-        }
-    }
-
-    fn is_blob_field(field: &Field) -> bool {
-        let components_with_blobs = [
-            "rerun.components.Blob",
-            "rerun.components.ImageBuffer",
-            "rerun.components.VideoSample",
-        ];
-
-        if let Some(component_type) = field.metadata().get("rerun:component_type")
-            && components_with_blobs.contains(&component_type.as_str())
-        {
-            is_list_list_u8(field.data_type())
-        } else {
-            false
-        }
-    }
-
-    let needs_migration = batch
-        .schema()
-        .fields()
-        .iter()
-        .any(|field| is_blob_field(field));
-
-    if !needs_migration {
-        return batch;
-    }
-
-    let num_columns = batch.num_columns();
-    let mut fields: Vec<FieldRef> = Vec::with_capacity(num_columns);
-    let mut columns: Vec<ArrayRef> = Vec::with_capacity(num_columns);
-
-    for (field, array) in itertools::izip!(batch.schema().fields(), batch.columns()) {
-        if is_blob_field(field) {
-            if let Some(new_array) = convert_list_list_u8_to_list_binary(array.as_ref()) {
-                let new_field = Field::new(
-                    field.name(),
-                    new_array.data_type().clone(),
-                    field.is_nullable(),
-                )
-                .with_metadata(field.metadata().clone());
-
-                fields.push(new_field.into());
-                columns.push(Arc::new(new_array));
-
-                re_log::debug_once!(
-                    "Changed datatype of '{}' from List[u8] to Binary",
-                    field.name()
-                );
-                continue;
-            } else {
-                re_log::warn_once!("Failed to convert {} to Binary", field.name());
-            }
-        }
-
-        fields.push(field.clone());
-        columns.push(array.clone());
-    }
-
-    let schema = Arc::new(Schema::new_with_metadata(
-        fields,
-        batch.schema().metadata.clone(),
-    ));
-
-    RecordBatch::try_new_with_options(
-        schema.clone(),
-        columns,
-        &RecordBatchOptions::default().with_row_count(Some(batch.num_rows())),
-    )
-    .ok_or_log_error()
-    .unwrap_or_else(|| RecordBatch::new_empty(schema))
-}
-
-/// `List[List[u8]]` -> `List[Binary]`
-fn convert_list_list_u8_to_list_binary(list_array: &dyn Array) -> Option<ListArray> {
-    re_tracing::profile_function!();
-
-    // The outer `List[List[u8]]`
-    let list_array = list_array.as_list_opt()?;
-
-    // The inner List[u8] array
-    let inner_list_array: &ListArray = list_array.values().as_list_opt()?;
-
-    // The underlying u8 values
-    let u8_array: &UInt8Array = inner_list_array.values().as_primitive_opt()?;
-
-    // We consistently use 64-bit offsets for binary data in order to keep our backwards-compatibility checks simpler.
-    // Create the binary array reusing existing buffers
-    let binary_array = arrow::array::LargeBinaryArray::try_new(
-        arrow::buffer::OffsetBuffer::new(
-            inner_list_array
-                .offsets()
-                .iter()
-                .map(|&o| o as i64)
-                .collect(),
-        ),
-        u8_array.values().clone().into_inner(),
-        inner_list_array.nulls().cloned(),
-    )
-    .ok()?;
-
-    // Create the outer list array with binary inner type
-    let outer_list = ListArray::try_new(
-        Arc::new(Field::new("item", DataType::LargeBinary, true)),
-        list_array.offsets().clone(),
-        Arc::new(binary_array),
-        list_array.nulls().cloned(),
-    )
-    .ok()?;
-
-    debug_assert_eq!(list_array.len(), outer_list.len());
-
-    Some(outer_list)
-}
diff --git a/crates/store/re_sorbet/src/sorbet_schema.rs b/crates/store/re_sorbet/src/sorbet_schema.rs
index 07f9316c4918..eafd6d419de7 100644
--- a/crates/store/re_sorbet/src/sorbet_schema.rs
+++ b/crates/store/re_sorbet/src/sorbet_schema.rs
@@ -42,7 +42,7 @@ impl SorbetSchema {
     /// This is bumped every time we require a migration, but notably it is
     /// decoupled from the Rerun version to avoid confusion as there will not
     /// be a new Sorbet version for each Rerun version.
-    pub(crate) const METADATA_VERSION: semver::Version = semver::Version::new(0, 1, 2);
+    pub(crate) const METADATA_VERSION: semver::Version = semver::Version::new(0, 1, 1);
 }
 
 impl SorbetSchema {
diff --git a/crates/store/re_types/definitions/rerun/attributes.fbs b/crates/store/re_types/definitions/rerun/attributes.fbs
index c77fd9085b51..81a3dfb78346 100644
--- a/crates/store/re_types/definitions/rerun/attributes.fbs
+++ b/crates/store/re_types/definitions/rerun/attributes.fbs
@@ -28,9 +28,7 @@ attribute "attr.rerun.log_missing_as_empty";
 
 /// Override the type of a field.
 ///
-/// The only permitted values are:
-/// - `binary`, to override `[ubyte]`
-/// - `float16`, to override `ushort`
+/// The only permitted value is "float16", which can only be used to override the type of a ushort.
 ///
 /// For lists this will apply to the inner element.
 attribute "attr.rerun.override_type";
diff --git a/crates/store/re_types/definitions/rerun/datatypes/blob.fbs b/crates/store/re_types/definitions/rerun/datatypes/blob.fbs
index d5334afdeb5c..e3fa4f9bdb1c 100644
--- a/crates/store/re_types/definitions/rerun/datatypes/blob.fbs
+++ b/crates/store/re_types/definitions/rerun/datatypes/blob.fbs
@@ -13,5 +13,5 @@ table Blob (
   "attr.rust.repr": "transparent",
   "attr.rust.tuple_struct"
 ) {
-  data: [ubyte] (order: 100, "attr.rerun.override_type": "binary");
+  data: [ubyte] (order: 100);
 }
diff --git a/crates/store/re_types/src/archetypes/asset_video_ext.rs b/crates/store/re_types/src/archetypes/asset_video_ext.rs
index 99561c0758e4..df4d31d220b0 100644
--- a/crates/store/re_types/src/archetypes/asset_video_ext.rs
+++ b/crates/store/re_types/src/archetypes/asset_video_ext.rs
@@ -52,7 +52,7 @@ impl AssetVideo {
         re_tracing::profile_function!();
 
         let Some(blob_bytes) = self.blob.as_ref().and_then(Blob::serialized_blob_as_slice) else {
-            return Err(re_video::VideoLoadError::NoVideoTrack); // Error type is close enough
+            return Ok(Vec::new());
         };
 
         let Some(media_type) = self
diff --git a/crates/store/re_types/src/blueprint/datatypes/component_column_selector.rs b/crates/store/re_types/src/blueprint/datatypes/component_column_selector.rs
index 3424efbd61d8..138bdff7b0f4 100644
--- a/crates/store/re_types/src/blueprint/datatypes/component_column_selector.rs
+++ b/crates/store/re_types/src/blueprint/datatypes/component_column_selector.rs
@@ -105,7 +105,7 @@ impl ::re_types_core::Loggable for ComponentColumnSelector {
             any_nones.then(|| somes.into())
         };
         {
-            let offsets = arrow::buffer::OffsetBuffer::from_lengths(
+            let offsets = arrow::buffer::OffsetBuffer::<i32>::from_lengths(
                 entity_path.iter().map(|opt| {
                     opt.as_ref().map(|datum| datum.0.len()).unwrap_or_default()
                 }),
@@ -142,7 +142,7 @@ impl ::re_types_core::Loggable for ComponentColumnSelector {
             any_nones.then(|| somes.into())
         };
         {
-            let offsets = arrow::buffer::OffsetBuffer::from_lengths(
+            let offsets = arrow::buffer::OffsetBuffer::<i32>::from_lengths(
                 component.iter().map(|opt| {
                     opt.as_ref().map(|datum| datum.0.len()).unwrap_or_default()
                 }),
diff --git a/crates/store/re_types/src/blueprint/datatypes/selected_columns.rs b/crates/store/re_types/src/blueprint/datatypes/selected_columns.rs
index cd169788c511..b42d4e595a46 100644
--- a/crates/store/re_types/src/blueprint/datatypes/selected_columns.rs
+++ b/crates/store/re_types/src/blueprint/datatypes/selected_columns.rs
@@ -135,10 +135,9 @@ impl ::re_types_core::Loggable for SelectedColumns {
                         )),
                         offsets,
                         {
-                            let offsets = arrow::buffer::OffsetBuffer::from_lengths(
+                            let offsets = arrow::buffer::OffsetBuffer::<i32>::from_lengths(
                                 time_columns_inner_data.iter().map(|datum| datum.0.len()),
                             );
-
                             #[allow(clippy::unwrap_used)]
                             let capacity = offsets.last().copied().unwrap() as usize;
                             let mut buffer_builder =
diff --git a/crates/store/re_types/src/datatypes/annotation_info.rs b/crates/store/re_types/src/datatypes/annotation_info.rs
index ad6e8550a06f..955ffc8a2f78 100644
--- a/crates/store/re_types/src/datatypes/annotation_info.rs
+++ b/crates/store/re_types/src/datatypes/annotation_info.rs
@@ -114,7 +114,7 @@ impl ::re_types_core::Loggable for AnnotationInfo {
             any_nones.then(|| somes.into())
         };
         {
-            let offsets = arrow::buffer::OffsetBuffer::from_lengths(
+            let offsets = arrow::buffer::OffsetBuffer::<i32>::from_lengths(
                 label.iter().map(|opt| {
                     opt.as_ref().map(|datum| datum.0.len()).unwrap_or_default()
                 }),
diff --git a/crates/store/re_types/src/datatypes/blob.rs b/crates/store/re_types/src/datatypes/blob.rs
index 5a5e0413d6df..de7eba8815ac 100644
--- a/crates/store/re_types/src/datatypes/blob.rs
+++ b/crates/store/re_types/src/datatypes/blob.rs
@@ -24,7 +24,7 @@ use ::re_types_core::{DeserializationError, DeserializationResult};
 /// Ref-counted internally and therefore cheap to clone.
 #[derive(Clone, Debug, PartialEq)]
 #[repr(transparent)]
-pub struct Blob(pub ::arrow::buffer::Buffer);
+pub struct Blob(pub ::arrow::buffer::ScalarBuffer<u8>);
 
 ::re_types_core::macros::impl_into_cow!(Blob);
 
@@ -33,7 +33,11 @@ impl ::re_types_core::Loggable for Blob {
     fn arrow_datatype() -> arrow::datatypes::DataType {
         #![allow(clippy::wildcard_imports)]
         use arrow::datatypes::*;
-        DataType::LargeBinary
+        DataType::List(std::sync::Arc::new(Field::new(
+            "item",
+            DataType::UInt8,
+            false,
+        )))
     }
 
     fn to_arrow_opt<'a>(
@@ -60,20 +64,28 @@ impl ::re_types_core::Loggable for Blob {
             any_nones.then(|| somes.into())
         };
         {
-            let offsets = arrow::buffer::OffsetBuffer::from_lengths(
+            let offsets = arrow::buffer::OffsetBuffer::<i32>::from_lengths(
                 data0
                     .iter()
-                    .map(|opt| opt.as_ref().map(|datum| datum.len()).unwrap_or_default()),
+                    .map(|opt| opt.as_ref().map_or(0, |datum| datum.len())),
             );
-
-            #[allow(clippy::unwrap_used)]
-            let capacity = offsets.last().copied().unwrap() as usize;
-            let mut buffer_builder = arrow::array::builder::BufferBuilder::<u8>::new(capacity);
-            for data in data0.iter().flatten() {
-                buffer_builder.append_slice(data);
-            }
-            let inner_data: arrow::buffer::Buffer = buffer_builder.finish();
-            as_array_ref(LargeBinaryArray::new(offsets, inner_data, data0_validity))
+            let data0_inner_data: ScalarBuffer<_> = data0
+                .iter()
+                .flatten()
+                .map(|b| b as &[_])
+                .collect::<Vec<_>>()
+                .concat()
+                .into();
+            let data0_inner_validity: Option<arrow::buffer::NullBuffer> = None;
+            as_array_ref(ListArray::try_new(
+                std::sync::Arc::new(Field::new("item", DataType::UInt8, false)),
+                offsets,
+                as_array_ref(PrimitiveArray::<UInt8Type>::new(
+                    data0_inner_data,
+                    data0_inner_validity,
+                )),
+                data0_validity,
+            )?)
         }
     })
 }
@@ -88,52 +100,53 @@ impl ::re_types_core::Loggable for Blob {
         use ::re_types_core::{arrow_zip_validity::ZipValidity, Loggable as _, ResultExt as _};
         use arrow::{array::*, buffer::*, datatypes::*};
         Ok({
-            fn extract_from_binary<O>(
-                arrow_data: &arrow::array::GenericByteArray<GenericBinaryType<O>>,
-            ) -> DeserializationResult<Vec<Option<Buffer>>>
-            where
-                O: ::arrow::array::OffsetSizeTrait,
-            {
-                use ::arrow::array::Array as _;
-                use ::re_types_core::arrow_zip_validity::ZipValidity;
-                let arrow_data_buf = arrow_data.values();
+            let arrow_data = arrow_data
+                .as_any()
+                .downcast_ref::<arrow::array::ListArray>()
+                .ok_or_else(|| {
+                    let expected = Self::arrow_datatype();
+                    let actual = arrow_data.data_type().clone();
+                    DeserializationError::datatype_mismatch(expected, actual)
+                })
+                .with_context("rerun.datatypes.Blob#data")?;
+            if arrow_data.is_empty() {
+                Vec::new()
+            } else {
+                let arrow_data_inner = {
+                    let arrow_data_inner = &**arrow_data.values();
+                    arrow_data_inner
+                        .as_any()
+                        .downcast_ref::<UInt8Array>()
+                        .ok_or_else(|| {
+                            let expected = DataType::UInt8;
+                            let actual = arrow_data_inner.data_type().clone();
+                            DeserializationError::datatype_mismatch(expected, actual)
+                        })
+                        .with_context("rerun.datatypes.Blob#data")?
+                        .values()
+                };
                 let offsets = arrow_data.offsets();
                 ZipValidity::new_with_validity(offsets.windows(2), arrow_data.nulls())
                     .map(|elem| {
                         elem.map(|window| {
-                            let start = window[0].as_usize();
-                            let end = window[1].as_usize();
-                            let len = end - start;
-                            if arrow_data_buf.len() < end {
+                            let start = window[0] as usize;
+                            let end = window[1] as usize;
+                            if arrow_data_inner.len() < end {
                                 return Err(DeserializationError::offset_slice_oob(
                                     (start, end),
-                                    arrow_data_buf.len(),
+                                    arrow_data_inner.len(),
                                 ));
                             }
                             #[allow(unsafe_code, clippy::undocumented_unsafe_blocks)]
-                            let data = arrow_data_buf.slice_with_length(start, len);
+                            let data = arrow_data_inner.clone().slice(start, end - start);
                             Ok(data)
                         })
                         .transpose()
                     })
-                    .collect::<DeserializationResult<Vec<Option<_>>>>()
-            }
-            if let Some(arrow_data) = arrow_data.as_any().downcast_ref::<BinaryArray>() {
-                extract_from_binary(arrow_data)
-                    .with_context("rerun.datatypes.Blob#data")?
-                    .into_iter()
-            } else if let Some(arrow_data) = arrow_data.as_any().downcast_ref::<LargeBinaryArray>()
-            {
-                extract_from_binary(arrow_data)
-                    .with_context("rerun.datatypes.Blob#data")?
-                    .into_iter()
-            } else {
-                let expected = Self::arrow_datatype();
-                let actual = arrow_data.data_type().clone();
-                return Err(DeserializationError::datatype_mismatch(expected, actual))
-                    .with_context("rerun.datatypes.Blob#data");
+                    .collect::<DeserializationResult<Vec<Option<_>>>>()?
             }
+            .into_iter()
         }
         .map(|v| v.ok_or_else(DeserializationError::missing_data))
         .map(|res| res.map(|v| Some(Self(v))))
@@ -143,14 +156,14 @@ impl ::re_types_core::Loggable for Blob {
     }
 }
 
-impl From<::arrow::buffer::Buffer> for Blob {
+impl From<::arrow::buffer::ScalarBuffer<u8>> for Blob {
     #[inline]
-    fn from(data: ::arrow::buffer::Buffer) -> Self {
+    fn from(data: ::arrow::buffer::ScalarBuffer<u8>) -> Self {
         Self(data)
     }
 }
 
-impl From<Blob> for ::arrow::buffer::Buffer {
+impl From<Blob> for ::arrow::buffer::ScalarBuffer<u8> {
     #[inline]
     fn from(value: Blob) -> Self {
         value.0
@@ -165,6 +178,6 @@ impl ::re_byte_size::SizeBytes for Blob {
 
     #[inline]
     fn is_pod() -> bool {
-        <::arrow::buffer::Buffer>::is_pod()
+        <::arrow::buffer::ScalarBuffer<u8>>::is_pod()
    }
 }
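With `LargeBinary` reverted, `Blob` round-trips as `List<u8>` again: one concatenated `UInt8Array` plus `i32` offsets, with a non-nullable child field named `"item"`. A minimal hand-rolled sketch of that exact layout in plain arrow-rs (values are illustrative):

```rust
use std::sync::Arc;

use arrow::{
    array::{Array as _, ListArray, UInt8Array},
    buffer::OffsetBuffer,
    datatypes::{DataType, Field},
};

fn example_blob_column() -> ListArray {
    // Two blobs, [0xDE, 0xAD] and [0xBE, 0xEF], stored back-to-back in one buffer.
    let values = UInt8Array::from(vec![0xDE, 0xAD, 0xBE, 0xEF]);
    let offsets = OffsetBuffer::<i32>::from_lengths([2, 2]); // -> offsets [0, 2, 4]
    let field = Arc::new(Field::new("item", DataType::UInt8, false));
    ListArray::new(field, offsets, Arc::new(values), None)
}

fn main() {
    let blobs = example_blob_column();
    assert_eq!(blobs.len(), 2);
    assert_eq!(blobs.value(0).len(), 2);
}
```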
diff --git a/crates/store/re_types/src/datatypes/blob_ext.rs b/crates/store/re_types/src/datatypes/blob_ext.rs
index 17d816fca4d8..5e0e8e5f81b8 100644
--- a/crates/store/re_types/src/datatypes/blob_ext.rs
+++ b/crates/store/re_types/src/datatypes/blob_ext.rs
@@ -1,5 +1,3 @@
-use arrow::{array::Array as _, buffer::ScalarBuffer};
-
 use super::Blob;
 
 impl Blob {
@@ -12,45 +10,32 @@ impl Blob {
     /// Panics iff `offset + length` is larger than `len`.
     #[inline]
     pub fn sliced(self, range: std::ops::Range<usize>) -> Self {
-        self.0.slice_with_length(range.start, range.len()).into()
+        self.0.slice(range.start, range.len()).into()
     }
 
     /// Returns the bytes of a serialized blob batch without copying it.
     ///
-    /// Returns `None` if the serialized component batch didn't have the expected type or shape.
+    /// Returns `None` if the serialized component batch didn't have the expected shape.
     pub fn serialized_blob_as_slice(
         serialized_blob: &re_types_core::SerializedComponentBatch,
     ) -> Option<&[u8]> {
-        Self::binary_array_as_slice(&serialized_blob.array)
-    }
-
-    /// Returns the bytes of a serialized blob batch without copying it.
-    ///
-    /// Returns `None` if the serialized component batch didn't have the expected type or shape.
-    pub fn binary_array_as_slice(array: &std::sync::Arc<dyn arrow::array::Array>) -> Option<&[u8]> {
-        if let Some(blob_data) = array.as_any().downcast_ref::<arrow::array::LargeBinaryArray>() {
-            if blob_data.len() == 1 {
-                return Some(blob_data.value(0));
-            }
-        }
-
-        if let Some(blob_data) = array
-            .as_any()
-            .downcast_ref::<arrow::array::BinaryArray>()
-        {
-            if blob_data.len() == 1 {
-                return Some(blob_data.value(0));
-            }
-        }
+        let blob_list_array = serialized_blob
+            .array
+            .as_any()
+            .downcast_ref::<arrow::array::ListArray>()?;
+        let blob_data = blob_list_array
+            .values()
+            .as_any()
-            .downcast_ref::<arrow::array::BinaryArray>()
+            .downcast_ref::<arrow::array::PrimitiveArray<arrow::datatypes::UInt8Type>>()?;
 
-        None
+        Some(blob_data.values().inner().as_slice())
     }
 }
 
 impl Eq for Blob {}
 
-impl From<ScalarBuffer<u8>> for Blob {
-    fn from(buffer: ScalarBuffer<u8>) -> Self {
+impl From<arrow::buffer::Buffer> for Blob {
+    fn from(buffer: arrow::buffer::Buffer) -> Self {
         Self(buffer.into())
     }
 }
@@ -68,10 +53,10 @@ impl From<&[u8]> for Blob {
 }
 
 impl std::ops::Deref for Blob {
-    type Target = arrow::buffer::Buffer;
+    type Target = arrow::buffer::ScalarBuffer<u8>;
 
     #[inline]
-    fn deref(&self) -> &arrow::buffer::Buffer {
+    fn deref(&self) -> &arrow::buffer::ScalarBuffer<u8> {
         &self.0
     }
 }
diff --git a/crates/store/re_types/src/datatypes/tensor_data.rs b/crates/store/re_types/src/datatypes/tensor_data.rs
index 3f3ea01614d5..b5d8c0bd8f74 100644
--- a/crates/store/re_types/src/datatypes/tensor_data.rs
+++ b/crates/store/re_types/src/datatypes/tensor_data.rs
@@ -192,7 +192,7 @@ impl ::re_types_core::Loggable for TensorData {
                         std::sync::Arc::new(Field::new("item", DataType::Utf8, false)),
                         offsets,
                         {
-                            let offsets = arrow::buffer::OffsetBuffer::from_lengths(
+                            let offsets = arrow::buffer::OffsetBuffer::<i32>::from_lengths(
                                 names_inner_data.iter().map(|datum| datum.len()),
                             );
                             #[allow(clippy::unwrap_used)]
@@ -203,7 +203,6 @@ impl ::re_types_core::Loggable for TensorData {
                                 buffer_builder.append_slice(data.as_bytes());
                             }
                             let inner_data: arrow::buffer::Buffer = buffer_builder.finish();
-
                             #[allow(unsafe_code, clippy::undocumented_unsafe_blocks)]
                             as_array_ref(unsafe {
                                 StringArray::new_unchecked(
diff --git a/crates/store/re_types/src/datatypes/utf8pair.rs b/crates/store/re_types/src/datatypes/utf8pair.rs
index faf9d7665daf..7f2fdabc6ea8 100644
--- a/crates/store/re_types/src/datatypes/utf8pair.rs
+++ b/crates/store/re_types/src/datatypes/utf8pair.rs
@@ -84,7 +84,7 @@ impl ::re_types_core::Loggable for Utf8Pair {
             any_nones.then(|| somes.into())
         };
         {
-            let offsets = arrow::buffer::OffsetBuffer::from_lengths(
+            let offsets = arrow::buffer::OffsetBuffer::<i32>::from_lengths(
                 first.iter().map(|opt| {
                     opt.as_ref().map(|datum| datum.0.len()).unwrap_or_default()
                 }),
@@ -117,11 +117,12 @@ impl ::re_types_core::Loggable for Utf8Pair {
             any_nones.then(|| somes.into())
         };
         {
-            let offsets = arrow::buffer::OffsetBuffer::from_lengths(
+            let offsets = arrow::buffer::OffsetBuffer::<i32>::from_lengths(
                 second.iter().map(|opt| {
                     opt.as_ref().map(|datum| datum.0.len()).unwrap_or_default()
                 }),
             );
+
             #[allow(clippy::unwrap_used)]
             let capacity = offsets.last().copied().unwrap() as usize;
             let mut buffer_builder =
diff --git a/crates/store/re_types/src/image.rs b/crates/store/re_types/src/image.rs
index e170d34147c0..fc7787841de9 100644
--- a/crates/store/re_types/src/image.rs
+++ b/crates/store/re_types/src/image.rs
@@ -150,20 +150,28 @@ where
 
 /// Converts it to what is useful for the image API.
 pub fn blob_and_datatype_from_tensor(tensor_buffer: TensorBuffer) -> (Blob, ChannelDatatype) {
     match tensor_buffer {
-        TensorBuffer::U8(buffer) => (Blob(buffer.inner().clone()), ChannelDatatype::U8),
-        TensorBuffer::U16(buffer) => (Blob(buffer.inner().clone()), ChannelDatatype::U16),
-        TensorBuffer::U32(buffer) => (Blob(buffer.inner().clone()), ChannelDatatype::U32),
-        TensorBuffer::U64(buffer) => (Blob(buffer.inner().clone()), ChannelDatatype::U64),
-        TensorBuffer::I8(buffer) => (Blob(buffer.inner().clone()), ChannelDatatype::I8),
-        TensorBuffer::I16(buffer) => (Blob(buffer.inner().clone()), ChannelDatatype::I16),
-        TensorBuffer::I32(buffer) => (Blob(buffer.inner().clone()), ChannelDatatype::I32),
-        TensorBuffer::I64(buffer) => (Blob(buffer.inner().clone()), ChannelDatatype::I64),
-        TensorBuffer::F16(buffer) => (Blob(buffer.inner().clone()), ChannelDatatype::F16),
-        TensorBuffer::F32(buffer) => (Blob(buffer.inner().clone()), ChannelDatatype::F32),
-        TensorBuffer::F64(buffer) => (Blob(buffer.inner().clone()), ChannelDatatype::F64),
+        TensorBuffer::U8(buffer) => (Blob(buffer), ChannelDatatype::U8),
+        TensorBuffer::U16(buffer) => (Blob(cast_to_u8(&buffer)), ChannelDatatype::U16),
+        TensorBuffer::U32(buffer) => (Blob(cast_to_u8(&buffer)), ChannelDatatype::U32),
+        TensorBuffer::U64(buffer) => (Blob(cast_to_u8(&buffer)), ChannelDatatype::U64),
+        TensorBuffer::I8(buffer) => (Blob(cast_to_u8(&buffer)), ChannelDatatype::I8),
+        TensorBuffer::I16(buffer) => (Blob(cast_to_u8(&buffer)), ChannelDatatype::I16),
+        TensorBuffer::I32(buffer) => (Blob(cast_to_u8(&buffer)), ChannelDatatype::I32),
+        TensorBuffer::I64(buffer) => (Blob(cast_to_u8(&buffer)), ChannelDatatype::I64),
+        TensorBuffer::F16(buffer) => (Blob(cast_to_u8(&buffer)), ChannelDatatype::F16),
+        TensorBuffer::F32(buffer) => (Blob(cast_to_u8(&buffer)), ChannelDatatype::F32),
+        TensorBuffer::F64(buffer) => (Blob(cast_to_u8(&buffer)), ChannelDatatype::F64),
     }
 }
 
+/// Reinterpret POD (plain-old-data) types to `u8`.
+#[inline]
+pub fn cast_to_u8<T: arrow::datatypes::ArrowNativeType>(
+    buffer: &arrow::buffer::ScalarBuffer<T>,
+) -> ScalarBuffer<u8> {
+    arrow::buffer::ScalarBuffer::new(buffer.inner().clone(), 0, buffer.inner().len())
+}
+
 // ----------------------------------------------------------------------------
 
 /// Types that implement this can be used as image channel types.
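`cast_to_u8` above reinterprets a typed buffer as bytes by re-viewing the same underlying allocation rather than converting it. A standalone sketch of that idea, assuming (as the helper does) that a `ScalarBuffer`'s length is counted in elements, i.e. in bytes for `u8`:

```rust
use arrow::buffer::ScalarBuffer;

fn main() {
    // A ScalarBuffer<T> is a typed view over an untyped `Buffer`; re-viewing the
    // same allocation as u8 is a zero-copy reinterpret, not a data conversion.
    let floats: ScalarBuffer<f32> = vec![1.0_f32, 2.0].into();
    let byte_len = floats.inner().len(); // `Buffer::len` is in bytes
    let bytes: ScalarBuffer<u8> = ScalarBuffer::new(floats.inner().clone(), 0, byte_len);
    assert_eq!(bytes.len(), 8); // 2 elements × 4 bytes
}
```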
diff --git a/crates/store/re_types/src/testing/components/affix_fuzzer10.rs b/crates/store/re_types/src/testing/components/affix_fuzzer10.rs
index d2c4769762e4..e9ca73d98797 100644
--- a/crates/store/re_types/src/testing/components/affix_fuzzer10.rs
+++ b/crates/store/re_types/src/testing/components/affix_fuzzer10.rs
@@ -63,7 +63,7 @@ impl ::re_types_core::Loggable for AffixFuzzer10 {
             any_nones.then(|| somes.into())
         };
         {
-            let offsets = arrow::buffer::OffsetBuffer::from_lengths(
+            let offsets = arrow::buffer::OffsetBuffer::<i32>::from_lengths(
                 data0
                     .iter()
                     .map(|opt| opt.as_ref().map(|datum| datum.len()).unwrap_or_default()),
diff --git a/crates/store/re_types/src/testing/components/affix_fuzzer12.rs b/crates/store/re_types/src/testing/components/affix_fuzzer12.rs
index b7765d1cacfa..be5c3eee7dc0 100644
--- a/crates/store/re_types/src/testing/components/affix_fuzzer12.rs
+++ b/crates/store/re_types/src/testing/components/affix_fuzzer12.rs
@@ -78,7 +78,7 @@ impl ::re_types_core::Loggable for AffixFuzzer12 {
                 std::sync::Arc::new(Field::new("item", DataType::Utf8, false)),
                 offsets,
                 {
-                    let offsets = arrow::buffer::OffsetBuffer::from_lengths(
+                    let offsets = arrow::buffer::OffsetBuffer::<i32>::from_lengths(
                         data0_inner_data.iter().map(|datum| datum.len()),
                     );
diff --git a/crates/store/re_types/src/testing/components/affix_fuzzer13.rs b/crates/store/re_types/src/testing/components/affix_fuzzer13.rs
index 7bfbbb6eea5b..b4c60584843e 100644
--- a/crates/store/re_types/src/testing/components/affix_fuzzer13.rs
+++ b/crates/store/re_types/src/testing/components/affix_fuzzer13.rs
@@ -78,7 +78,7 @@ impl ::re_types_core::Loggable for AffixFuzzer13 {
                 std::sync::Arc::new(Field::new("item", DataType::Utf8, false)),
                 offsets,
                 {
-                    let offsets = arrow::buffer::OffsetBuffer::from_lengths(
+                    let offsets = arrow::buffer::OffsetBuffer::<i32>::from_lengths(
                         data0_inner_data.iter().map(|datum| datum.len()),
                     );
diff --git a/crates/store/re_types/src/testing/components/affix_fuzzer9.rs b/crates/store/re_types/src/testing/components/affix_fuzzer9.rs
index 30792d6d7f59..b50c7b022d93 100644
--- a/crates/store/re_types/src/testing/components/affix_fuzzer9.rs
+++ b/crates/store/re_types/src/testing/components/affix_fuzzer9.rs
@@ -63,7 +63,7 @@ impl ::re_types_core::Loggable for AffixFuzzer9 {
             any_nones.then(|| somes.into())
         };
         {
-            let offsets = arrow::buffer::OffsetBuffer::from_lengths(
+            let offsets = arrow::buffer::OffsetBuffer::<i32>::from_lengths(
                 data0
                     .iter()
                     .map(|opt| opt.as_ref().map(|datum| datum.len()).unwrap_or_default()),
diff --git a/crates/store/re_types/src/testing/datatypes/affix_fuzzer1.rs b/crates/store/re_types/src/testing/datatypes/affix_fuzzer1.rs
index 56f0d62cbaf3..085e6b75d98f 100644
--- a/crates/store/re_types/src/testing/datatypes/affix_fuzzer1.rs
+++ b/crates/store/re_types/src/testing/datatypes/affix_fuzzer1.rs
@@ -184,11 +184,12 @@ impl ::re_types_core::Loggable for AffixFuzzer1 {
             any_nones.then(|| somes.into())
         };
         {
-            let offsets = arrow::buffer::OffsetBuffer::from_lengths(
+            let offsets = arrow::buffer::OffsetBuffer::<i32>::from_lengths(
                 single_string_required.iter().map(|opt| {
                     opt.as_ref().map(|datum| datum.len()).unwrap_or_default()
                 }),
             );
+
             #[allow(clippy::unwrap_used)]
             let capacity = offsets.last().copied().unwrap() as usize;
             let mut buffer_builder =
@@ -224,11 +225,12 @@ impl ::re_types_core::Loggable for AffixFuzzer1 {
             any_nones.then(|| somes.into())
         };
         {
-            let offsets = arrow::buffer::OffsetBuffer::from_lengths(
+            let offsets = arrow::buffer::OffsetBuffer::<i32>::from_lengths(
                 single_string_optional.iter().map(|opt| {
                     opt.as_ref().map(|datum| datum.len()).unwrap_or_default()
                 }),
             );
+
             #[allow(clippy::unwrap_used)]
             let capacity = offsets.last().copied().unwrap() as usize;
             let mut buffer_builder =
@@ -323,12 +325,11 @@ impl ::re_types_core::Loggable for AffixFuzzer1 {
                 std::sync::Arc::new(Field::new("item", DataType::Utf8, false)),
                 offsets,
                 {
-                    let offsets = arrow::buffer::OffsetBuffer::from_lengths(
+                    let offsets = arrow::buffer::OffsetBuffer::<i32>::from_lengths(
                         many_strings_required_inner_data
                             .iter()
                             .map(|datum| datum.len()),
                     );
-
                     #[allow(clippy::unwrap_used)]
                     let capacity = offsets.last().copied().unwrap() as usize;
                     let mut buffer_builder =
@@ -384,12 +385,11 @@ impl ::re_types_core::Loggable for AffixFuzzer1 {
                 std::sync::Arc::new(Field::new("item", DataType::Utf8, false)),
                 offsets,
                 {
-                    let offsets = arrow::buffer::OffsetBuffer::from_lengths(
+                    let offsets = arrow::buffer::OffsetBuffer::<i32>::from_lengths(
                         many_strings_optional_inner_data
                             .iter()
                             .map(|datum| datum.len()),
                     );
-
                    #[allow(clippy::unwrap_used)]
                     let capacity = offsets.last().copied().unwrap() as usize;
                     let mut buffer_builder =
diff --git a/crates/store/re_types/src/testing/datatypes/affix_fuzzer20.rs b/crates/store/re_types/src/testing/datatypes/affix_fuzzer20.rs
index 9b96f2e39cd1..5ea3a9b9d486 100644
--- a/crates/store/re_types/src/testing/datatypes/affix_fuzzer20.rs
+++ b/crates/store/re_types/src/testing/datatypes/affix_fuzzer20.rs
@@ -117,10 +117,11 @@ impl ::re_types_core::Loggable for AffixFuzzer20 {
             any_nones.then(|| somes.into())
         };
         {
-            let offsets =
-                arrow::buffer::OffsetBuffer::from_lengths(s.iter().map(|opt| {
+            let offsets = arrow::buffer::OffsetBuffer::<i32>::from_lengths(
+                s.iter().map(|opt| {
                     opt.as_ref().map(|datum| datum.0.len()).unwrap_or_default()
-                }));
+                }),
+            );
             #[allow(clippy::unwrap_used)]
             let capacity = offsets.last().copied().unwrap() as usize;
             let mut buffer_builder =
diff --git a/crates/store/re_types/src/testing/datatypes/string_component.rs b/crates/store/re_types/src/testing/datatypes/string_component.rs
index da5ba0d703f0..76f240dcc5b0 100644
--- a/crates/store/re_types/src/testing/datatypes/string_component.rs
+++ b/crates/store/re_types/src/testing/datatypes/string_component.rs
@@ -57,7 +57,7 @@ impl ::re_types_core::Loggable for StringComponent {
             any_nones.then(|| somes.into())
         };
         {
-            let offsets = arrow::buffer::OffsetBuffer::from_lengths(
+            let offsets = arrow::buffer::OffsetBuffer::<i32>::from_lengths(
                 data0
                     .iter()
                     .map(|opt| opt.as_ref().map(|datum| datum.len()).unwrap_or_default()),
diff --git a/crates/store/re_types_core/src/datatypes/entity_path.rs b/crates/store/re_types_core/src/datatypes/entity_path.rs
index 7c69a91534b3..98f3ad62c3d4 100644
--- a/crates/store/re_types_core/src/datatypes/entity_path.rs
+++ b/crates/store/re_types_core/src/datatypes/entity_path.rs
@@ -58,7 +58,7 @@ impl crate::Loggable for EntityPath {
             any_nones.then(|| somes.into())
         };
         {
-            let offsets = arrow::buffer::OffsetBuffer::from_lengths(
+            let offsets = arrow::buffer::OffsetBuffer::<i32>::from_lengths(
                 data0
                     .iter()
                     .map(|opt| opt.as_ref().map(|datum| datum.len()).unwrap_or_default()),
diff --git a/crates/store/re_types_core/src/datatypes/utf8.rs b/crates/store/re_types_core/src/datatypes/utf8.rs
index b87df0b09c4e..0e8ec92700f9 100644
--- a/crates/store/re_types_core/src/datatypes/utf8.rs
+++ b/crates/store/re_types_core/src/datatypes/utf8.rs
@@ -58,7 +58,7 @@ impl crate::Loggable for Utf8 {
             any_nones.then(|| somes.into())
         };
         {
-            let offsets = arrow::buffer::OffsetBuffer::from_lengths(
+            let offsets = arrow::buffer::OffsetBuffer::<i32>::from_lengths(
                 data0
                     .iter()
                     .map(|opt| opt.as_ref().map(|datum| datum.len()).unwrap_or_default()),
diff --git a/crates/store/re_types_core/src/datatypes/visible_time_range.rs b/crates/store/re_types_core/src/datatypes/visible_time_range.rs
index 4bce624337d1..76cdfc395a04 100644
--- a/crates/store/re_types_core/src/datatypes/visible_time_range.rs
+++ b/crates/store/re_types_core/src/datatypes/visible_time_range.rs
@@ -100,7 +100,7 @@ impl crate::Loggable for VisibleTimeRange {
             any_nones.then(|| somes.into())
         };
         {
-            let offsets = arrow::buffer::OffsetBuffer::from_lengths(
+            let offsets = arrow::buffer::OffsetBuffer::<i32>::from_lengths(
                 timeline.iter().map(|opt| {
                     opt.as_ref().map(|datum| datum.0.len()).unwrap_or_default()
                 }),
diff --git a/crates/top/rerun_c/src/video.rs b/crates/top/rerun_c/src/video.rs
index c53d1b82ddfa..56c25bc91645 100644
--- a/crates/top/rerun_c/src/video.rs
+++ b/crates/top/rerun_c/src/video.rs
@@ -16,10 +16,6 @@ pub extern "C" fn rr_video_asset_read_frame_timestamps_nanos(
         CError::unexpected_null("video_bytes").write_error(error);
         return std::ptr::null_mut();
     }
-    if video_bytes_len == 0 {
-        CError::new(CErrorCode::VideoLoadError, "Zero video bytes").write_error(error);
-        return std::ptr::null_mut();
-    }
     let Some(alloc_func) = alloc_func else {
         CError::unexpected_null("alloc_func").write_error(error);
         return std::ptr::null_mut();
@@ -48,7 +44,7 @@ pub extern "C" fn rr_video_asset_read_frame_timestamps_nanos(
         Err(err) => {
             CError::new(
                 CErrorCode::VideoLoadError,
-                &format!("Failed to load video: {err}"),
+                &format!("Failed to play video: {err}"),
             )
             .write_error(error);
             return std::ptr::null_mut();
diff --git a/crates/utils/re_arrow_util/src/arrays.rs b/crates/utils/re_arrow_util/src/arrays.rs
index 69121dcce432..cfed9cf10273 100644
--- a/crates/utils/re_arrow_util/src/arrays.rs
+++ b/crates/utils/re_arrow_util/src/arrays.rs
@@ -62,6 +62,20 @@ pub fn into_arrow_ref(array: impl Array + 'static) -> ArrayRef {
     std::sync::Arc::new(array)
 }
 
+/// Returns an iterator with the lengths of the offsets.
+pub fn offsets_lengths(offsets: &OffsetBuffer<i32>) -> impl Iterator<Item = usize> + '_ {
+    // TODO(emilk): remove when we update to Arrow 54 (which has an API for this)
+    offsets.windows(2).map(|w| {
+        let start = w[0];
+        let end = w[1];
+        debug_assert!(
+            start <= end && 0 <= start,
+            "Bad arrow offset buffer: {start}, {end}"
+        );
+        end.saturating_sub(start).max(0) as usize
+    })
+}
+
 /// Repartitions a [`ListArray`] according to the specified `lengths`, ignoring previous partitioning.
 ///
 /// The specified `lengths` must sum to the total length of the underlying values (i.e. the child array).
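A quick usage sketch for the `offsets_lengths` helper added above, assuming it is in scope via `re_arrow_util`:

```rust
use arrow::buffer::OffsetBuffer;
use re_arrow_util::offsets_lengths;

fn main() {
    // Offsets [0, 3, 3, 5] describe three elements of lengths 3, 0 and 2.
    let offsets = OffsetBuffer::<i32>::from_lengths([3, 0, 2]);
    let lengths: Vec<usize> = offsets_lengths(&offsets).collect();
    assert_eq!(lengths, [3, 0, 2]);
}
```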
diff --git a/crates/utils/re_arrow_util/src/lib.rs b/crates/utils/re_arrow_util/src/lib.rs index a355c493498a..0f7a50a0256a 100644 --- a/crates/utils/re_arrow_util/src/lib.rs +++ b/crates/utils/re_arrow_util/src/lib.rs @@ -9,75 +9,3 @@ pub use self::arrays::*; pub use self::batches::*; pub use self::compare::*; pub use self::format_data_type::*; - -// ---------------------------------------------------------------- - -use std::sync::Arc; - -use arrow::{ - array::{Array as _, AsArray as _, ListArray}, - datatypes::{DataType, Field}, -}; - -/// Convert any `BinaryArray` to `LargeBinaryArray`, because we treat them logically the same -pub fn widen_binary_arrays(list_array: &ListArray) -> ListArray { - let list_data_type = list_array.data_type(); - if let DataType::List(field) = list_data_type - && field.data_type() == &DataType::Binary - { - re_tracing::profile_function!(); - let large_binary_field = Field::new("item", DataType::LargeBinary, true); - let target_type = DataType::List(Arc::new(large_binary_field)); - - #[expect(clippy::unwrap_used)] - arrow::compute::kernels::cast::cast(list_array, &target_type) - .unwrap() - .as_list() - .clone() - } else { - list_array.clone() - } -} - -#[cfg(test)] -mod tests { - use super::*; - use arrow::array::{BinaryBuilder, ListBuilder}; - - #[test] - fn test_widen_list_binary() { - // Create test data - let mut list_builder = ListBuilder::new(BinaryBuilder::new()); - - // First list: [b"hello", b"world"] - list_builder.values().append_value(b"hello"); - list_builder.values().append_value(b"world"); - list_builder.append(true); - - // Second list: [b"rust", b"arrow"] - list_builder.values().append_value(b"rust"); - list_builder.values().append_value(b"arrow"); - list_builder.append(true); - - // Third list: null - list_builder.append_null(); - - let original_list = list_builder.finish(); - - // Widen to LargeBinaryArray - let widened_list = widen_binary_arrays(&original_list); - - // Verify the result - assert_eq!(widened_list.len(), 3); - assert!(!widened_list.is_null(0)); - assert!(!widened_list.is_null(1)); - assert!(widened_list.is_null(2)); - - // Check data type - if let DataType::List(field) = widened_list.data_type() { - assert_eq!(field.data_type(), &DataType::LargeBinary); - } else { - panic!("Expected List data type"); - } - } -} diff --git a/crates/utils/re_mcap/src/layers/raw.rs b/crates/utils/re_mcap/src/layers/raw.rs index 421a436e4349..4e0af00f2d23 100644 --- a/crates/utils/re_mcap/src/layers/raw.rs +++ b/crates/utils/re_mcap/src/layers/raw.rs @@ -1,4 +1,4 @@ -use arrow::array::LargeBinaryBuilder; +use arrow::array::{ListBuilder, UInt8Builder}; use re_chunk::{ChunkId, external::arrow::array::FixedSizeListBuilder}; use re_types::{ Component as _, ComponentDescriptor, components, reflection::ComponentDescriptorExt as _, @@ -6,11 +6,11 @@ use re_types::{ use crate::{ Error, LayerIdentifier, MessageLayer, - parsers::{MessageParser, ParserContext, util::fixed_size_list_builder}, + parsers::{MessageParser, ParserContext, util::blob_list_builder}, }; struct RawMcapMessageParser { - data: FixedSizeListBuilder<LargeBinaryBuilder>, + data: FixedSizeListBuilder<ListBuilder<UInt8Builder>>, } impl RawMcapMessageParser { fn new(num_rows: usize) -> Self { Self { - data: fixed_size_list_builder(1, num_rows), + data: blob_list_builder(num_rows), } } } @@ -30,7 +30,8 @@ impl MessageParser for RawMcapMessageParser { msg: &::mcap::Message<'_>, ) -> anyhow::Result<()> { re_tracing::profile_function!(); - self.data.values().append_value(&msg.data); + 
self.data.values().values().append_slice(&msg.data); + self.data.values().append(true); self.data.append(true); Ok(()) } diff --git a/crates/utils/re_mcap/src/parsers/mod.rs b/crates/utils/re_mcap/src/parsers/mod.rs index 174edd2c3d5d..55ebb144e617 100644 --- a/crates/utils/re_mcap/src/parsers/mod.rs +++ b/crates/utils/re_mcap/src/parsers/mod.rs @@ -7,12 +7,32 @@ pub use decode::{ChannelId, MessageParser, ParserContext}; /// Defines utility functions shared across parsers. pub(crate) mod util { - use arrow::array::{ArrayBuilder, FixedSizeListBuilder}; + use arrow::{ + array::{FixedSizeListBuilder, ListBuilder, UInt8Builder}, + datatypes::{DataType, Field}, + }; + use re_types::{Loggable as _, components}; + use std::sync::Arc; - pub(crate) fn fixed_size_list_builder<T: ArrayBuilder + Default>( + pub(crate) fn fixed_size_list_builder<T: arrow::array::ArrayBuilder + Default>( value_length: i32, capacity: usize, - ) -> FixedSizeListBuilder<T> { - FixedSizeListBuilder::with_capacity(Default::default(), value_length, capacity) + ) -> arrow::array::FixedSizeListBuilder<T> { + arrow::array::FixedSizeListBuilder::with_capacity( + Default::default(), + value_length, + capacity, + ) + } + + pub(crate) fn blob_list_builder( + capacity: usize, + ) -> FixedSizeListBuilder<ListBuilder<UInt8Builder>> { + let list_builder = ListBuilder::<UInt8Builder>::default() + .with_field(Arc::new(Field::new_list_field(DataType::UInt8, false))); + + FixedSizeListBuilder::with_capacity(list_builder, 1, capacity).with_field(Arc::new( + Field::new_list_field(components::Blob::arrow_datatype(), false), + )) + } } diff --git a/crates/utils/re_mcap/src/parsers/ros2msg/sensor_msgs/point_cloud_2.rs b/crates/utils/re_mcap/src/parsers/ros2msg/sensor_msgs/point_cloud_2.rs index 77dcd76256f3..4943c36bac05 100644 --- a/crates/utils/re_mcap/src/parsers/ros2msg/sensor_msgs/point_cloud_2.rs +++ b/crates/utils/re_mcap/src/parsers/ros2msg/sensor_msgs/point_cloud_2.rs @@ -3,8 +3,8 @@ use std::io::Cursor; use super::super::definitions::sensor_msgs::{self, PointField, PointFieldDatatype}; use arrow::{ array::{ - BooleanBuilder, FixedSizeListBuilder, LargeBinaryBuilder, ListBuilder, StringBuilder, - StructBuilder, UInt8Builder, UInt32Builder, + BooleanBuilder, FixedSizeListBuilder, ListBuilder, StringBuilder, StructBuilder, + UInt8Builder, UInt32Builder, }, datatypes::{DataType, Field, Fields}, }; @@ -22,7 +22,7 @@ use crate::{ parsers::{ cdr, decode::{MessageParser, ParserContext}, - util::fixed_size_list_builder, + util::{blob_list_builder, fixed_size_list_builder}, }, }; @@ -35,7 +35,7 @@ pub struct PointCloud2MessageParser { is_bigendian: FixedSizeListBuilder<BooleanBuilder>, point_step: FixedSizeListBuilder<UInt32Builder>, row_step: FixedSizeListBuilder<UInt32Builder>, - data: FixedSizeListBuilder<LargeBinaryBuilder>, + data: FixedSizeListBuilder<ListBuilder<UInt8Builder>>, is_dense: FixedSizeListBuilder<BooleanBuilder>, // We lazily create this, only if we can interpret the point cloud semantically. 
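To make the nested builder shape above concrete: a hedged sketch (not part of the diff) of how one raw message blob goes through the `FixedSizeListBuilder<ListBuilder<UInt8Builder>>` that `blob_list_builder` returns; the byte values are made up, and the plain `FixedSizeListBuilder::new` constructor stands in for the field-annotated version built by the helper:

```rust
use arrow::array::{Array as _, FixedSizeListBuilder, ListBuilder, UInt8Builder};

// One fixed-size slot (len = 1) per row, each holding one variable-length List<u8> blob.
let mut data: FixedSizeListBuilder<ListBuilder<UInt8Builder>> =
    FixedSizeListBuilder::new(ListBuilder::new(UInt8Builder::new()), 1);

let msg_data: &[u8] = &[0xDE, 0xAD, 0xBE, 0xEF];
data.values().values().append_slice(msg_data); // raw bytes into the inner u8 builder
data.values().append(true); // close the List<u8> blob
data.append(true); // close the fixed-size row

let array = data.finish();
assert_eq!(array.len(), 1);
```

This mirrors the three-step append pattern (`values().values().append_slice`, `values().append(true)`, `append(true)`) used by both the raw MCAP layer and the point-cloud parser in the hunks above.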
@@ -75,7 +75,7 @@ impl PointCloud2MessageParser { is_bigendian: fixed_size_list_builder(1, num_rows), point_step: fixed_size_list_builder(1, num_rows), row_step: fixed_size_list_builder(1, num_rows), - data: fixed_size_list_builder(1, num_rows), + data: blob_list_builder(num_rows), is_dense: fixed_size_list_builder(1, num_rows), points_3ds: None, @@ -257,7 +257,7 @@ impl MessageParser for PointCloud2MessageParser { point_step.values().append_slice(&[point_cloud.point_step]); row_step.values().append_slice(&[point_cloud.row_step]); - data.values().append_value(&point_cloud.data); + data.values().values().append_slice(&point_cloud.data); is_dense.values().append_slice(&[point_cloud.is_dense]); height.append(true); @@ -267,6 +267,7 @@ ... row_step.append(true); is_dense.append(true); + data.values().append(true); data.append(true); Ok(()) diff --git a/crates/utils/re_video/src/demux/mod.rs b/crates/utils/re_video/src/demux/mod.rs index dbe08efc5e72..e57ec1047cdc 100644 --- a/crates/utils/re_video/src/demux/mod.rs +++ b/crates/utils/re_video/src/demux/mod.rs @@ -465,10 +465,6 @@ impl VideoDataDescription { media_type: &str, debug_name: &str, ) -> Result<Self, VideoLoadError> { - if data.is_empty() { - return Err(VideoLoadError::ZeroBytes); - } - re_tracing::profile_function!(); match media_type { "video/mp4" => Self::load_mp4(data, debug_name), @@ -833,10 +829,7 @@ impl SampleMetadata { /// Errors that can occur when loading a video. #[derive(thiserror::Error, Debug)] pub enum VideoLoadError { - #[error("The video file is empty (zero bytes)")] - ZeroBytes, - - #[error("MP4 error: {0}")] + #[error("Failed to determine media type from data: {0}")] ParseMp4(#[from] re_mp4::Error), #[error("Video file has no video tracks")] diff --git a/crates/utils/re_video/src/demux/mp4.rs b/crates/utils/re_video/src/demux/mp4.rs index 06dc97733f0e..961d50922b85 100644 --- a/crates/utils/re_video/src/demux/mp4.rs +++ b/crates/utils/re_video/src/demux/mp4.rs @@ -19,7 +19,6 @@ use std::io::Cursor; impl VideoDataDescription { pub fn load_mp4(bytes: &[u8], debug_name: &str) -> Result<Self, VideoLoadError> { re_tracing::profile_function!(); - let mp4 = { re_tracing::profile_scope!("Mp4::read_bytes"); re_mp4::Mp4::read_bytes(bytes)? 
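One behavioral consequence of the two deletions above, sketched as a hypothetical check (the `load_from_bytes` entry point name is assumed from the surrounding hunk context; only error variants shown in this diff are used): with the early `is_empty` guards gone, an empty buffer is expected to fail inside `re_mp4::Mp4::read_bytes` rather than through the removed `ZeroBytes` variant.

```rust
use re_video::{VideoDataDescription, VideoLoadError};

// An empty MP4 buffer should now surface as a parse error from re_mp4.
let result = VideoDataDescription::load_from_bytes(&[], "video/mp4", "empty.mp4");
assert!(matches!(result, Err(VideoLoadError::ParseMp4(_))));
```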
diff --git a/crates/viewer/re_component_ui/tests/snapshots/all_components_list_item_narrow/VideoSample_placeholder.png b/crates/viewer/re_component_ui/tests/snapshots/all_components_list_item_narrow/VideoSample_placeholder.png index e6461d640f11..5d1858b7254e 100644 --- a/crates/viewer/re_component_ui/tests/snapshots/all_components_list_item_narrow/VideoSample_placeholder.png +++ b/crates/viewer/re_component_ui/tests/snapshots/all_components_list_item_narrow/VideoSample_placeholder.png @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c78265faf1086880f4307cd13a33b32e0f5dfdca11927480ea7dfbd720b1469b -size 3056 +oid sha256:b052a99437a77e4c3ddb6613b3d944fe90e872b50b2be52d0c1a7041ee926b35 +size 2843 diff --git a/crates/viewer/re_component_ui/tests/snapshots/all_components_list_item_narrow/custom_large_blob_any_value_large_blob.png b/crates/viewer/re_component_ui/tests/snapshots/all_components_list_item_narrow/custom_large_blob_any_value_large_blob.png new file mode 100644 index 000000000000..03bb1a182256 --- /dev/null +++ b/crates/viewer/re_component_ui/tests/snapshots/all_components_list_item_narrow/custom_large_blob_any_value_large_blob.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:14c8dec3337ae04ad74e88a1a1dc98c5503bb43fabc620f51f8c92a5190afdfc +size 3670 diff --git a/crates/viewer/re_component_ui/tests/snapshots/all_components_list_item_narrow/custom_small_array_any_value_small_array.png b/crates/viewer/re_component_ui/tests/snapshots/all_components_list_item_narrow/custom_small_array_any_value_small_array.png index 11a813d082c4..3850596436a3 100644 --- a/crates/viewer/re_component_ui/tests/snapshots/all_components_list_item_narrow/custom_small_array_any_value_small_array.png +++ b/crates/viewer/re_component_ui/tests/snapshots/all_components_list_item_narrow/custom_small_array_any_value_small_array.png @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a3b6c910752fc5348ca3a6f122f00f3a08d2c6152daee1ac666dcb21f09843e1 -size 4059 +oid sha256:38bf2e72b7b08aaf3c27a15b096cfee9e443cbfa72d3336b3cce381f7a5eba99 +size 3151 diff --git a/crates/viewer/re_component_ui/tests/snapshots/all_components_list_item_narrow/one_large_blob_any_value_one_large_blob.png b/crates/viewer/re_component_ui/tests/snapshots/all_components_list_item_narrow/one_large_blob_any_value_one_large_blob.png deleted file mode 100644 index 0f356869535c..000000000000 --- a/crates/viewer/re_component_ui/tests/snapshots/all_components_list_item_narrow/one_large_blob_any_value_one_large_blob.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:17ca10070dbf6c6b2cd17e75dae941909ab73068fe185c775967bd76eda2f4fc -size 3649 diff --git a/crates/viewer/re_component_ui/tests/snapshots/all_components_list_item_narrow/one_small_blob_any_value_one_small_blob.png b/crates/viewer/re_component_ui/tests/snapshots/all_components_list_item_narrow/one_small_blob_any_value_one_small_blob.png deleted file mode 100644 index c5fa581351bd..000000000000 --- a/crates/viewer/re_component_ui/tests/snapshots/all_components_list_item_narrow/one_small_blob_any_value_one_small_blob.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:bab838f7d573b54f6324c448bd170ec144e4c8e6c325ce01499c2bf357d95ec7 -size 3047 diff --git a/crates/viewer/re_component_ui/tests/snapshots/all_components_list_item_narrow/two_large_blobs_any_value_two_large_blobs.png 
b/crates/viewer/re_component_ui/tests/snapshots/all_components_list_item_narrow/two_large_blobs_any_value_two_large_blobs.png deleted file mode 100644 index 4abc594e4066..000000000000 --- a/crates/viewer/re_component_ui/tests/snapshots/all_components_list_item_narrow/two_large_blobs_any_value_two_large_blobs.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:52d93f776a5de0e6976a0dcdf5acc30e77d7ff55aa610f51ee7a4111aee4de4a -size 4105 diff --git a/crates/viewer/re_component_ui/tests/snapshots/all_components_list_item_wide/VideoSample_placeholder.png b/crates/viewer/re_component_ui/tests/snapshots/all_components_list_item_wide/VideoSample_placeholder.png index c429f0e19217..7bca27a85996 100644 --- a/crates/viewer/re_component_ui/tests/snapshots/all_components_list_item_wide/VideoSample_placeholder.png +++ b/crates/viewer/re_component_ui/tests/snapshots/all_components_list_item_wide/VideoSample_placeholder.png @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7b5fe5fde8351c6b31046d97f6f79e6fdacd8000c695a75fdf3878e1c7d3b1d0 -size 3373 +oid sha256:e2dfb7e6848faae48ec178407bcc37524bfc98a03ce4c5b6a44a45616db27f83 +size 3158 diff --git a/crates/viewer/re_component_ui/tests/snapshots/all_components_list_item_wide/custom_large_blob_any_value_large_blob.png b/crates/viewer/re_component_ui/tests/snapshots/all_components_list_item_wide/custom_large_blob_any_value_large_blob.png new file mode 100644 index 000000000000..9ac0c0c64efa --- /dev/null +++ b/crates/viewer/re_component_ui/tests/snapshots/all_components_list_item_wide/custom_large_blob_any_value_large_blob.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:da0df8f8be85731d04abf18c7196ae70c7944ac802d0a8ae2011cf05ed2f7d27 +size 3981 diff --git a/crates/viewer/re_component_ui/tests/snapshots/all_components_list_item_wide/custom_small_array_any_value_small_array.png b/crates/viewer/re_component_ui/tests/snapshots/all_components_list_item_wide/custom_small_array_any_value_small_array.png index db966c4dfaf5..94b28772b908 100644 --- a/crates/viewer/re_component_ui/tests/snapshots/all_components_list_item_wide/custom_small_array_any_value_small_array.png +++ b/crates/viewer/re_component_ui/tests/snapshots/all_components_list_item_wide/custom_small_array_any_value_small_array.png @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:93951d4e2fadf80af35cb07dcd0d776febdbdc8c1b8bdb9492f50e00c1b37548 -size 4970 +oid sha256:b12caf4b4d09cb8de8d296a6290eefa8793279e3c09ca4c0be67500e8332e200 +size 3459 diff --git a/crates/viewer/re_component_ui/tests/snapshots/all_components_list_item_wide/one_large_blob_any_value_one_large_blob.png b/crates/viewer/re_component_ui/tests/snapshots/all_components_list_item_wide/one_large_blob_any_value_one_large_blob.png deleted file mode 100644 index 771b136fba6d..000000000000 --- a/crates/viewer/re_component_ui/tests/snapshots/all_components_list_item_wide/one_large_blob_any_value_one_large_blob.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:ba4368804019097ac9fa6d609ef2be3df3ba1d457a50b91ed61449f4da45e200 -size 3961 diff --git a/crates/viewer/re_component_ui/tests/snapshots/all_components_list_item_wide/one_small_blob_any_value_one_small_blob.png b/crates/viewer/re_component_ui/tests/snapshots/all_components_list_item_wide/one_small_blob_any_value_one_small_blob.png deleted file mode 100644 index 5ae075deb66a..000000000000 --- 
a/crates/viewer/re_component_ui/tests/snapshots/all_components_list_item_wide/one_small_blob_any_value_one_small_blob.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:ea569e1d409fdfba25ed45e74c98326f6f077b076a895cc0f1d552db1e0b1edb -size 3364 diff --git a/crates/viewer/re_component_ui/tests/snapshots/all_components_list_item_wide/two_large_blobs_any_value_two_large_blobs.png b/crates/viewer/re_component_ui/tests/snapshots/all_components_list_item_wide/two_large_blobs_any_value_two_large_blobs.png deleted file mode 100644 index d76c9ef66a28..000000000000 --- a/crates/viewer/re_component_ui/tests/snapshots/all_components_list_item_wide/two_large_blobs_any_value_two_large_blobs.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:4a65d6a53d4cefc87611b9dcb0418981d67bd672f2addf1eebef0f74224005f2 -size 5081 diff --git a/crates/viewer/re_component_ui/tests/test_all_components_ui.rs b/crates/viewer/re_component_ui/tests/test_all_components_ui.rs index acd8bd190710..fad880740dbc 100644 --- a/crates/viewer/re_component_ui/tests/test_all_components_ui.rs +++ b/crates/viewer/re_component_ui/tests/test_all_components_ui.rs @@ -103,19 +103,9 @@ fn test_cases(reflection: &Reflection) -> Vec<TestCase> { "any_value_small_array", ), TestCase::from_arrow( - ComponentType::from("one_small_blob"), - arrow::array::BinaryArray::from_vec(vec![&[1, 2, 3]]), - "any_value_one_small_blob", - ), - TestCase::from_arrow( - ComponentType::from("one_large_blob"), - arrow::array::LargeBinaryArray::from_vec(vec![&vec![42_u8; 3001]]), - "any_value_one_large_blob", - ), - TestCase::from_arrow( - ComponentType::from("two_large_blobs"), - arrow::array::LargeBinaryArray::from_vec(vec![&vec![42_u8; 3001], &vec![69_u8; 6001]]), - "any_value_two_large_blobs", + ComponentType::from("custom_large_blob"), + arrow::array::UInt8Array::from(vec![42; 3001]), + "any_value_large_blob", ), TestCase::from_arrow( ComponentType::from("custom_struct_array"), diff --git a/crates/viewer/re_ui/src/arrow_ui.rs b/crates/viewer/re_ui/src/arrow_ui.rs index 4087bf5b37aa..18317a49a437 100644 --- a/crates/viewer/re_ui/src/arrow_ui.rs +++ b/crates/viewer/re_ui/src/arrow_ui.rs @@ -41,9 +41,6 @@ pub fn arrow_ui(ui: &mut egui::Ui, ui_layout: UiLayout, array: &dyn arrow::array return; } - // Special-case binary data (e.g. blobs). 
- // We don't want to show their contents (too slow, since they are usually huge), - // so we only show their size: if let Some(binaries) = array.downcast_array_ref::<arrow::array::BinaryArray>() && binaries.len() == 1 { @@ -96,7 +93,9 @@ pub fn arrow_ui(ui: &mut egui::Ui, ui_layout: UiLayout, array: &dyn arrow::array } else { let instance_count_str = re_format::format_uint(instance_count); - let string = if let Some(dtype) = simple_datatype_string(array.data_type()) { + let string = if array.data_type() == &DataType::UInt8 { + re_format::format_bytes(instance_count as _) + } else if let Some(dtype) = simple_datatype_string(array.data_type()) { format!("{instance_count_str} items of {dtype}") } else if let DataType::Struct(fields) = array.data_type() { format!( diff --git a/crates/viewer/re_ui/tests/snapshots/arrow_ui.png b/crates/viewer/re_ui/tests/snapshots/arrow_ui.png index c80399ae9fcb..1ea7aa482181 100644 --- a/crates/viewer/re_ui/tests/snapshots/arrow_ui.png +++ b/crates/viewer/re_ui/tests/snapshots/arrow_ui.png @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b7099b0b334f607ef84e645af30cb138f524affa8b5cf90c21a33bc15eabe1fd -size 49286 +oid sha256:61e49c24e3291cc1cd0a917b5b26b32951c1cd095d9131030ed13f9c3c76b4cb +size 49877 diff --git a/crates/viewer/re_viewer_context/src/cache/video_stream_cache.rs b/crates/viewer/re_viewer_context/src/cache/video_stream_cache.rs index b004521dc8dc..093df81e374d 100644 --- a/crates/viewer/re_viewer_context/src/cache/video_stream_cache.rs +++ b/crates/viewer/re_viewer_context/src/cache/video_stream_cache.rs @@ -292,46 +292,6 @@ fn read_samples_from_chunk( ) -> Result<(), VideoStreamProcessingError> { re_tracing::profile_function!(); - let sample_descr = VideoStream::descriptor_sample(); - let Some(raw_array) = chunk.raw_component_array(&sample_descr) else { - // This chunk doesn't have any video chunks. - return Ok(()); - }; - - if let Some(binary_array) = raw_array.downcast_array_ref::<arrow::array::BinaryArray>() { - read_sample_from_binary_array(timeline, chunk, video_descr, chunk_buffers, binary_array); - Ok(()) - } else if let Some(binary_array) = - raw_array.downcast_array_ref::<arrow::array::LargeBinaryArray>() - { - read_sample_from_binary_array(timeline, chunk, video_descr, chunk_buffers, binary_array); - Ok(()) - } else { - Err(VideoStreamProcessingError::InvalidVideoSampleType( - raw_array.data_type().clone(), - )) - } -} - -fn read_sample_from_binary_array<O: arrow::array::OffsetSizeTrait>( - timeline: TimelineName, - chunk: &re_chunk::Chunk, - video_descr: &mut re_video::VideoDataDescription, - chunk_buffers: &mut StableIndexDeque<SampleBuffer>, - binary_array: &arrow::array::GenericByteArray<arrow::datatypes::GenericBinaryType<O>>, -) { - // The underlying data within a chunk is logically a Vec<Vec<u8>>, - // where the inner Vec always has a len=1, because we're dealing with a "mono-component" - // (each VideoStream has exactly one VideoSample instance per time). - // - // Because of how arrow works, the bytes of all the blobs are actually sequential in memory (yay!) in a single buffer, - // what you call values below (could use a better name btw). - // - // We want to figure out the byte offsets of each blob within the arrow buffer that holds all the blobs, - // i.e. get out a Vec<Span>. - - let sample_descr = VideoStream::descriptor_sample(); - let re_video::VideoDataDescription { codec, samples, @@ -340,6 +300,12 @@ fn read_sample_from_binary_array( .. } = video_descr; + let sample_descr = VideoStream::descriptor_sample(); + let Some(raw_array) = chunk.raw_component_array(&sample_descr) else { + // This chunk doesn't have any video chunks. 
+ return Ok(()); + }; + let mut previous_max_presentation_timestamp = samples .back() .map_or(re_video::Time::MIN, |s| s.presentation_timestamp); @@ -356,21 +322,41 @@ fn read_sample_from_binary_array( re_log::warn_once!( "Out of order logging on video streams is not supported. Ignoring any out of order samples." ); - return; + return Ok(()); } } None => { // This chunk doesn't have any data on this timeline. - return; + return Ok(()); } } // Make sure our index is sorted by the timeline we're interested in. let chunk = chunk.sorted_by_timeline_if_unsorted(&timeline); - let buffer = binary_array.values(); + // The underlying data within a chunk is logically a Vec<Vec<u8>>, + // where the inner Vec always has a len=1, because we're dealing with a "mono-component" + // (each VideoStream has exactly one VideoSample instance per time). + // + // Because of how arrow works, the bytes of all the blobs are actually sequential in memory (yay!) in a single buffer, + // what you call values below (could use a better name btw). + // + // We want to figure out the byte offsets of each blob within the arrow buffer that holds all the blobs, + // i.e. get out a Vec<Span>. + let inner_list_array = raw_array + .downcast_array_ref::<arrow::array::ListArray>() + .ok_or(VideoStreamProcessingError::InvalidVideoSampleType( + raw_array.data_type().clone(), + ))?; + let values = inner_list_array + .values() + .downcast_array_ref::<arrow::array::PrimitiveArray<arrow::datatypes::UInt8Type>>() + .ok_or(VideoStreamProcessingError::InvalidVideoSampleType( + raw_array.data_type().clone(), + ))?; + let values = values.values().inner(); - let offsets = binary_array.offsets(); + let offsets = inner_list_array.offsets(); let lengths = offsets.lengths().collect::<Vec<_>>(); let buffer_index = chunk_buffers.next_index(); @@ -394,8 +380,8 @@ fn read_sample_from_binary_array( } let sample_idx = sample_base_idx + start; - let byte_span = Span { start: offsets[start].as_usize(), len: lengths[start] }; - let sample_bytes = &buffer[byte_span.range()]; + let byte_span = Span { start: offsets[start] as usize, len: lengths[start] }; + let sample_bytes = &values[byte_span.range()]; // Note that the conversion of this time value is already handled by `VideoDataDescription::timescale`: // For sequence time we use a scale of 1, for nanoseconds time we use a scale of 1_000_000_000. @@ -473,7 +459,7 @@ fn read_sample_from_binary_array( // Any new samples actually added? Early out if not. if sample_base_idx == samples.next_index() { - return; + return Ok(()); } // Fill out durations for all new samples plus the first existing sample for which we didn't know the duration yet. @@ -500,7 +486,7 @@ fn read_sample_from_binary_array( } chunk_buffers.push_back(SampleBuffer { - buffer: buffer.clone(), + buffer: values.clone(), source_chunk_id: chunk.id(), sample_index_range: sample_base_idx..samples.next_index(), }); @@ -513,6 +499,8 @@ fn read_sample_from_binary_array( chunk.entity_path() ); } + + Ok(()) } impl Cache for VideoStreamCache { diff --git a/docs/content/reference/migration/migration-0-25.md b/docs/content/reference/migration/migration-0-25.md index d9a826b1e085..c43258d825ff 100644 --- a/docs/content/reference/migration/migration-0-25.md +++ b/docs/content/reference/migration/migration-0-25.md @@ -29,14 +29,3 @@ Previously this could only be configured for gRPC sinks, and it was configured o In the C++ and Python APIs, negative timeouts used to have special meaning. Now they are no longer permitted. The Python flush calls now raise an error if the flushing did not complete successfully. 
- - ## Changed arrow encoding of blobs -We used to encode blobs as `List<u8>`, which was rather unidiomatic. -Now they are instead encoded as `Binary`. -Old data will be migrated on ingestion (zero-copy). - -Affects the following components: -- [`Blob`](https://rerun.io/docs/reference/types/components/blob) -- [`ImageBuffer`](https://rerun.io/docs/reference/types/components/image_buffer) -- [`VideoSample`](https://rerun.io/docs/reference/types/components/video_sample) diff --git a/docs/content/reference/types/components/blob.md b/docs/content/reference/types/components/blob.md index 75b7067222cc..b9fe1a1087f4 100644 --- a/docs/content/reference/types/components/blob.md +++ b/docs/content/reference/types/components/blob.md @@ -11,7 +11,7 @@ A binary blob of data. ## Arrow datatype ``` -binary +List<uint8> ``` ## API reference links diff --git a/docs/content/reference/types/components/image_buffer.md b/docs/content/reference/types/components/image_buffer.md index acef29d4c395..601f857b54eb 100644 --- a/docs/content/reference/types/components/image_buffer.md +++ b/docs/content/reference/types/components/image_buffer.md @@ -13,7 +13,7 @@ To interpret the contents of this buffer, see, [`components.ImageFormat`](https: ## Arrow datatype ``` -binary +List<uint8> ``` ## API reference links diff --git a/docs/content/reference/types/components/video_sample.md b/docs/content/reference/types/components/video_sample.md index 18fba252a0f5..244719f15c61 100644 --- a/docs/content/reference/types/components/video_sample.md +++ b/docs/content/reference/types/components/video_sample.md @@ -16,7 +16,7 @@ Keyframes may require additional data, for details see [`components.VideoCodec`] ## Arrow datatype ``` -binary +List<uint8> ``` ## API reference links diff --git a/docs/content/reference/types/datatypes/blob.md b/docs/content/reference/types/datatypes/blob.md index 864daa7810c2..389ccec848e1 100644 --- a/docs/content/reference/types/datatypes/blob.md +++ b/docs/content/reference/types/datatypes/blob.md @@ -8,7 +8,7 @@ A binary blob of data. 
## Arrow datatype ``` -binary +List<uint8> ``` ## API reference links diff --git a/rerun_cpp/src/rerun/archetypes/asset_video_ext.cpp b/rerun_cpp/src/rerun/archetypes/asset_video_ext.cpp index 5f5edb766dcd..ed206eb844a1 100644 --- a/rerun_cpp/src/rerun/archetypes/asset_video_ext.cpp +++ b/rerun_cpp/src/rerun/archetypes/asset_video_ext.cpp @@ -93,35 +93,16 @@ namespace rerun::archetypes { if (!blob.has_value()) { return std::vector<std::chrono::nanoseconds>(); } - - auto& array = blob.value().array; - - int64_t num_bytes; - const uint8_t* bytes; - - if (auto binary_array = std::dynamic_pointer_cast<arrow::BinaryArray>(array)) { - if (binary_array->length() != 1) { - return Error( - ErrorCode::InvalidComponent, - "Video blob array should be a single video file" - ); - } - - int32_t num_bytes_i32; - bytes = binary_array->GetValue(0, &num_bytes_i32); - num_bytes = static_cast<int64_t>(num_bytes_i32); - } else if (auto large_binary_array = std::dynamic_pointer_cast<arrow::LargeBinaryArray>(array)) { - if (large_binary_array->length() != 1) { - return Error( - ErrorCode::InvalidComponent, - "Video blob array should be a single video file" - ); - } - - bytes = large_binary_array->GetValue(0, &num_bytes); - } else { - return Error(ErrorCode::InvalidComponent, "Video blob array is not a binary array"); + auto blob_list_array = std::dynamic_pointer_cast<arrow::ListArray>(blob.value().array); + if (!blob_list_array) { + return Error(ErrorCode::InvalidComponent, "Blob array is not a list array"); + } + auto blob_array = + std::dynamic_pointer_cast<arrow::UInt8Array>(blob_list_array->values()); + if (!blob_array) { + return Error(ErrorCode::InvalidComponent, "Blob array is not a primitive array"); } + auto blob_array_data = blob_array->values(); rr_string media_type_c = detail::to_rr_string(std::nullopt); if (media_type.has_value()) { @@ -136,8 +117,8 @@ namespace rerun::archetypes { rr_error status = {}; rr_video_asset_read_frame_timestamps_nanos( - bytes, - static_cast<uint64_t>(num_bytes), + blob_array_data->data(), + static_cast<uint64_t>(blob_array_data->size()), media_type_c, &frame_timestamps, &alloc_timestamps, diff --git a/rerun_cpp/src/rerun/datatypes/blob.cpp b/rerun_cpp/src/rerun/datatypes/blob.cpp index 29e27d67f8f5..10b05e46fcdf 100644 --- a/rerun_cpp/src/rerun/datatypes/blob.cpp +++ b/rerun_cpp/src/rerun/datatypes/blob.cpp @@ -10,7 +10,7 @@ namespace rerun::datatypes {} namespace rerun { const std::shared_ptr<arrow::DataType>& Loggable<datatypes::Blob>::arrow_datatype() { - static const auto datatype = arrow::large_binary(); + static const auto datatype = arrow::list(arrow::field("item", arrow::uint8(), false)); return datatype; } @@ -24,7 +24,7 @@ namespace rerun { ARROW_ASSIGN_OR_RAISE(auto builder, arrow::MakeBuilder(datatype, pool)) if (instances && num_instances > 0) { RR_RETURN_NOT_OK(Loggable<datatypes::Blob>::fill_arrow_array_builder( - static_cast<arrow::LargeBinaryBuilder*>(builder.get()), + static_cast<arrow::ListBuilder*>(builder.get()), instances, num_instances )); @@ -35,7 +35,7 @@ namespace rerun { } rerun::Error Loggable<datatypes::Blob>::fill_arrow_array_builder( - arrow::LargeBinaryBuilder* builder, const datatypes::Blob* elements, size_t num_elements + arrow::ListBuilder* builder, const datatypes::Blob* elements, size_t num_elements ) { if (builder == nullptr) { return rerun::Error(ErrorCode::UnexpectedNullArgument, "Passed array builder is null."); } @@ -47,11 +47,17 @@ namespace rerun { ); } + auto value_builder = static_cast<arrow::UInt8Builder*>(builder->value_builder()); ARROW_RETURN_NOT_OK(builder->Reserve(static_cast<int64_t>(num_elements))); + ARROW_RETURN_NOT_OK(value_builder->Reserve(static_cast<int64_t>(num_elements * 2))); + for (size_t elem_idx = 0; elem_idx < num_elements; elem_idx += 1) { - ARROW_RETURN_NOT_OK(builder->Append( - 
elements[elem_idx].data.data(), - static_cast<int64_t>(elements[elem_idx].data.size() + const auto& element = elements[elem_idx]; + ARROW_RETURN_NOT_OK(builder->Append()); + ARROW_RETURN_NOT_OK(value_builder->AppendValues( + element.data.data(), + static_cast<int64_t>(element.data.size()), + nullptr )); } diff --git a/rerun_cpp/src/rerun/datatypes/blob.hpp b/rerun_cpp/src/rerun/datatypes/blob.hpp index 4f99b67da379..7008b940d6dd 100644 --- a/rerun_cpp/src/rerun/datatypes/blob.hpp +++ b/rerun_cpp/src/rerun/datatypes/blob.hpp @@ -13,7 +13,7 @@ namespace arrow { class Array; class DataType; - class LargeBinaryBuilder; + class ListBuilder; } // namespace arrow namespace rerun::datatypes { @@ -60,7 +60,7 @@ namespace rerun { /// Fills an arrow array builder with an array of this type. static rerun::Error fill_arrow_array_builder( - arrow::LargeBinaryBuilder* builder, const datatypes::Blob* elements, size_t num_elements + arrow::ListBuilder* builder, const datatypes::Blob* elements, size_t num_elements ); }; } // namespace rerun diff --git a/rerun_py/rerun_sdk/rerun/_converters.py b/rerun_py/rerun_sdk/rerun/_converters.py index 1184c7b87543..c02b4ab59122 100644 --- a/rerun_py/rerun_sdk/rerun/_converters.py +++ b/rerun_py/rerun_sdk/rerun/_converters.py @@ -71,20 +71,6 @@ def bool_or_none(data: bool | None) -> bool | None: return bool(data) -@overload -def bytes_or_none(data: None) -> None: ... - - -@overload -def bytes_or_none(data: bytes) -> bytes: ... - - -def bytes_or_none(data: bytes | None) -> bytes | None: - if data is None: - return None - return bytes(data) - - @overload def str_or_none(data: None) -> None: ... diff --git a/rerun_py/rerun_sdk/rerun/archetypes/image_ext.py b/rerun_py/rerun_sdk/rerun/archetypes/image_ext.py index c3721489ca04..863040edb2d7 100644 --- a/rerun_py/rerun_sdk/rerun/archetypes/image_ext.py +++ b/rerun_py/rerun_sdk/rerun/archetypes/image_ext.py @@ -272,19 +272,13 @@ def as_pil_image(self: Any) -> PILImage.Image: f"Converting image with pixel_format {image_format.pixel_format} into PIL is not yet supported" ) - buffer = self.buffer.as_arrow_array() - - if len(buffer) != 1: - raise ValueError(f"Expected exactly 1 buffer, got {len(buffer)}") - - blob_bytes = buffer[0].as_py() - array = np.frombuffer(blob_bytes, dtype=image_format.channel_datatype.to_np_dtype()) + buf = self.buffer.as_arrow_array().values.to_numpy().view(image_format.channel_datatype.to_np_dtype()) # Note: np array shape is always (height, width, channels) if image_format.color_model == ColorModel.L: - image = array.reshape(image_format.height, image_format.width) # type: ignore[assignment] + image = buf.reshape(image_format.height, image_format.width) else: - image = array.reshape(image_format.height, image_format.width, image_format.color_model.num_channels()) # type: ignore[assignment] + image = buf.reshape(image_format.height, image_format.width, image_format.color_model.num_channels()) # PIL assumes L or RGB[A]: if image_format.color_model == ColorModel.BGR: diff --git a/rerun_py/rerun_sdk/rerun/datatypes/blob.py b/rerun_py/rerun_sdk/rerun/datatypes/blob.py index f0af43821af4..68a950808bc0 100644 --- a/rerun_py/rerun_sdk/rerun/datatypes/blob.py +++ b/rerun_py/rerun_sdk/rerun/datatypes/blob.py @@ -16,6 +16,9 @@ from .._baseclasses import ( BaseBatch, ) +from .._converters import ( + to_np_uint8, +) from .blob_ext import BlobExt __all__ = ["Blob", "BlobArrayLike", "BlobBatch", "BlobLike"] @@ -31,7 +34,15 @@ def __init__(self: Any, data: BlobLike) -> None: # You can define your own __init__ function as a member of 
BlobExt in blob_ext.py self.__attrs_init__(data=data) - data: bytes = field(converter=bytes) + data: npt.NDArray[np.uint8] = field(converter=to_np_uint8) + + def __array__(self, dtype: npt.DTypeLike = None, copy: bool | None = None) -> npt.NDArray[Any]: + # You can define your own __array__ function as a member of BlobExt in blob_ext.py + return np.asarray(self.data, dtype=dtype, copy=copy) + + def __len__(self) -> int: + # You can define your own __len__ function as a member of BlobExt in blob_ext.py + return len(self.data) if TYPE_CHECKING: @@ -43,7 +54,7 @@ def __init__(self: Any, data: BlobLike) -> None: class BlobBatch(BaseBatch[BlobArrayLike]): - _ARROW_DATATYPE = pa.large_binary() + _ARROW_DATATYPE = pa.list_(pa.field("item", pa.uint8(), nullable=False, metadata={})) @staticmethod def _native_to_pa_array(data: BlobArrayLike, data_type: pa.DataType) -> pa.Array: diff --git a/rerun_py/rerun_sdk/rerun/datatypes/blob_ext.py b/rerun_py/rerun_sdk/rerun/datatypes/blob_ext.py index ff1fc5455f4f..8390cb0768d0 100644 --- a/rerun_py/rerun_sdk/rerun/datatypes/blob_ext.py +++ b/rerun_py/rerun_sdk/rerun/datatypes/blob_ext.py @@ -25,42 +25,51 @@ def native_to_pa_array_override(data: BlobArrayLike, data_type: pa.DataType) -> if isinstance(data, BlobBatch): return data.as_arrow_array() - # numpy fast path: + # pure-numpy fast path elif isinstance(data, np.ndarray): if len(data) == 0: - return pa.array([], type=pa.large_binary()) + inners = [] elif data.ndim == 1: - return pa.array([np.array(data, dtype=np.uint8).tobytes()], type=pa.large_binary()) + inners = [pa.array(np.array(data, dtype=np.uint8).flatten())] else: - return pa.array([np.array(arr, dtype=np.uint8).tobytes() for arr in data], type=pa.large_binary()) + o = 0 + offsets = [o] + [o := next_offset(o, arr) for arr in data] + inner = pa.array(np.array(data, dtype=np.uint8).flatten()) + return pa.ListArray.from_arrays(offsets, inner, type=data_type) + # pure-object elif isinstance(data, Blob): - return pa.array([data.data], type=pa.large_binary()) + inners = [pa.array(np.array(data.data, dtype=np.uint8).flatten())] + # pure-bytes elif isinstance(data, bytes): - return pa.array([data], type=pa.large_binary()) + inners = [pa.array(np.frombuffer(data, dtype=np.uint8))] elif hasattr(data, "read"): - return pa.array([data.read()], type=pa.large_binary()) + inners = [pa.array(np.frombuffer(data.read(), dtype=np.uint8))] + # sequences elif isinstance(data, Sequence): if len(data) == 0: - return pa.array([], type=pa.large_binary()) + inners = [] elif isinstance(data[0], Blob): - return pa.array( - [ - np.array( - datum.data, # type: ignore[union-attr] - dtype=np.uint8, - ).tobytes() - for datum in data - ], - type=pa.large_binary(), - ) + inners = [pa.array(np.array(datum.data, dtype=np.uint8).flatten()) for datum in data] # type: ignore[union-attr] elif isinstance(data[0], bytes): - return pa.array(list(data), type=pa.large_binary()) # type: ignore[arg-type] + inners = [pa.array(np.frombuffer(datum, dtype=np.uint8)) for datum in data] # type: ignore[arg-type] else: - return pa.array([np.array(datum, dtype=np.uint8).tobytes() for datum in data], type=pa.large_binary()) + inners = [pa.array(np.array(datum, dtype=np.uint8).flatten()) for datum in data] else: - return pa.array([np.array(data.data, dtype=np.uint8).tobytes()], type=pa.large_binary()) + inners = [pa.array(np.array(data.data, dtype=np.uint8).flatten())] + + if len(inners) == 0: + offsets = pa.array([0], type=pa.int32()) + inner = np.array([], dtype=np.uint8).flatten() + return 
pa.ListArray.from_arrays(offsets, inner, type=data_type) + + o = 0 + offsets = [o] + [o := next_offset(o, inner) for inner in inners] + + inner = pa.concat_arrays(inners) + + return pa.ListArray.from_arrays(offsets, inner, type=data_type) diff --git a/rerun_py/src/video.rs b/rerun_py/src/video.rs index 9448b76e622d..511b36781d59 100644 --- a/rerun_py/src/video.rs +++ b/rerun_py/src/video.rs @@ -2,7 +2,7 @@ use pyo3::{Bound, PyAny, PyResult, exceptions::PyRuntimeError, pyfunction}; -use re_chunk::ArrowArray as _; +use re_arrow_util::ArrowArrayDowncastRef as _; use re_video::VideoLoadError; use crate::arrow::array_to_rust; @@ -20,13 +20,19 @@ pub fn asset_video_read_frame_timestamps_nanos( media_type: Option<&str>, ) -> PyResult<Vec<i64>> { let video_bytes_arrow_array = array_to_rust(video_bytes_arrow_array)?; - let video_bytes = binary_array_as_slice(&video_bytes_arrow_array).ok_or_else(|| { - PyRuntimeError::new_err(format!( - "Expected video bytes to be a single BinaryArray, instead it has the datatype {:?} x {}", - video_bytes_arrow_array.data_type(), - video_bytes_arrow_array.len(), - )) - })?; + + let video_bytes_arrow_uint8_array = video_bytes_arrow_array + .downcast_array_ref::<arrow::array::ListArray>() + .and_then(|arr| arr.values().downcast_array_ref::<arrow::array::UInt8Array>()) + .ok_or_else(|| { + PyRuntimeError::new_err(format!( + "Expected arrow array to be a list with a single uint8 array, instead it has the datatype {:?}", + video_bytes_arrow_array.data_type() + )) + })?; + + let video_bytes = video_bytes_arrow_uint8_array.values().as_ref(); + let Some(media_type) = media_type.or_else(|| infer::Infer::new().get(video_bytes).map(|v| v.mime_type())) else { @@ -43,22 +49,3 @@ pub fn asset_video_read_frame_timestamps_nanos( .collect(), ) } - -fn binary_array_as_slice(array: &std::sync::Arc<dyn arrow::array::Array>) -> Option<&[u8]> { - if let Some(blob_data) = array.as_any().downcast_ref::<arrow::array::BinaryArray>() { - if blob_data.len() == 1 { - return Some(blob_data.value(0)); - } - } - - if let Some(blob_data) = array - .as_any() - .downcast_ref::<arrow::array::LargeBinaryArray>() - { - if blob_data.len() == 1 { - return Some(blob_data.value(0)); - } - } - - None -}
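As a closing illustration of the downcast chain the Python bindings now rely on: a sketch (not part of the diff) of why taking `values()` of the inner `UInt8Array` recovers the full byte payload — a one-element `List<u8>` stores all of its bytes contiguously in the child array:

```rust
use arrow::array::{Array as _, ListArray, UInt8Array};
use arrow::datatypes::UInt8Type;

// One row containing the bytes [1, 2, 3].
let list = ListArray::from_iter_primitive::<UInt8Type, _, _>(
    vec![Some(vec![Some(1u8), Some(2), Some(3)])],
);

// The child array of a List<u8> holds every row's bytes back to back,
// so for a single-row array its values are exactly the blob's bytes.
let bytes = list
    .values()
    .as_any()
    .downcast_ref::<UInt8Array>()
    .expect("child of List<u8> is a UInt8Array");
assert_eq!(bytes.values().as_ref(), &[1, 2, 3]);
```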