diff --git a/crates/build/re_types_builder/src/codegen/cpp/array_builder.rs b/crates/build/re_types_builder/src/codegen/cpp/array_builder.rs index f5af9a134e29..1a000e59672f 100644 --- a/crates/build/re_types_builder/src/codegen/cpp/array_builder.rs +++ b/crates/build/re_types_builder/src/codegen/cpp/array_builder.rs @@ -66,6 +66,11 @@ fn arrow_array_builder_type_and_declaration( ); ident } + Type::Binary => { + let ident = format_ident!("LargeBinaryBuilder"); + declarations.insert("arrow", ForwardDecl::Class(ident.clone())); + ident + } Type::String => { let ident = format_ident!("StringBuilder"); declarations.insert("arrow", ForwardDecl::Class(ident.clone())); diff --git a/crates/build/re_types_builder/src/codegen/cpp/mod.rs b/crates/build/re_types_builder/src/codegen/cpp/mod.rs index 9d39c252b961..a7bd212ec272 100644 --- a/crates/build/re_types_builder/src/codegen/cpp/mod.rs +++ b/crates/build/re_types_builder/src/codegen/cpp/mod.rs @@ -2003,6 +2003,7 @@ fn quote_fill_arrow_array_builder( ElementType::Float16 => Some("HalfFloatBuilder"), ElementType::Float32 => Some("FloatBuilder"), ElementType::Float64 => Some("DoubleBuilder"), + ElementType::Binary => Some("BinaryBuilder"), ElementType::String => Some("StringBuilder"), ElementType::Object{..} => None, }; @@ -2233,7 +2234,7 @@ fn quote_append_single_value_to_builder( value_access: &TokenStream, includes: &mut Includes, ) -> TokenStream { - match &typ { + match typ { Type::Unit => { quote!(ARROW_RETURN_NOT_OK(#value_builder->AppendNull());) } @@ -2252,6 +2253,11 @@ fn quote_append_single_value_to_builder( | Type::String => { quote!(ARROW_RETURN_NOT_OK(#value_builder->Append(#value_access));) } + Type::Binary => { + quote!( + ARROW_RETURN_NOT_OK(#value_builder->Append(#value_access.data(), static_cast(#value_access.size()))); + ) + } Type::Float16 => { // Cast `rerun::half` to a `uint16_t`` quote! { @@ -2290,6 +2296,14 @@ fn quote_append_single_value_to_builder( ); } } + ElementType::Binary => { + quote! 
{ + for (size_t item_idx = 0; item_idx < #num_items_per_element; item_idx += 1) { + auto&& data = &#value_access[elem_idx].data; + ARROW_RETURN_NOT_OK(#value_builder->Append(data.data(), static_cast(data.size()))); + } + } + } ElementType::String => { quote! { for (size_t item_idx = 0; item_idx < #num_items_per_element; item_idx += 1) { @@ -2447,6 +2461,10 @@ fn quote_field_type(includes: &mut Includes, obj_field: &ObjectField) -> TokenSt } Type::Float32 => quote! { float }, Type::Float64 => quote! { double }, + Type::Binary => { + includes.insert_rerun("collection.hpp"); + quote! { rerun::Collection } + } Type::String => { includes.insert_system("string"); quote! { std::string } @@ -2507,6 +2525,10 @@ fn quote_element_type(includes: &mut Includes, typ: &ElementType) -> TokenStream } ElementType::Float32 => quote! { float }, ElementType::Float64 => quote! { double }, + ElementType::Binary => { + includes.insert_rerun("collection.hpp"); + quote! { rerun::Collection } + } ElementType::String => { includes.insert_system("string"); quote! 
{ std::string } @@ -2648,6 +2670,7 @@ fn quote_arrow_datatype( Type::Float16 => quote!(arrow::float16()), Type::Float32 => quote!(arrow::float32()), Type::Float64 => quote!(arrow::float64()), + Type::Binary => quote!(arrow::large_binary()), Type::String => quote!(arrow::utf8()), Type::Bool => quote!(arrow::boolean()), diff --git a/crates/build/re_types_builder/src/codegen/docs/website.rs b/crates/build/re_types_builder/src/codegen/docs/website.rs index 9ef210c22afa..e5532cd000b3 100644 --- a/crates/build/re_types_builder/src/codegen/docs/website.rs +++ b/crates/build/re_types_builder/src/codegen/docs/website.rs @@ -414,6 +414,7 @@ fn write_fields(reporter: &Reporter, objects: &Objects, o: &mut String, object: Type::Float16 => atomic("float16"), Type::Float32 => atomic("float32"), Type::Float64 => atomic("float64"), + Type::Binary => atomic("binary"), Type::String => atomic("utf8"), Type::Array { elem_type, length } => { diff --git a/crates/build/re_types_builder/src/codegen/python/mod.rs b/crates/build/re_types_builder/src/codegen/python/mod.rs index 57d745fc3a71..7cf1cf63a41b 100644 --- a/crates/build/re_types_builder/src/codegen/python/mod.rs +++ b/crates/build/re_types_builder/src/codegen/python/mod.rs @@ -1673,6 +1673,7 @@ fn quote_field_type_from_field( | Type::Int64 => "int".to_owned(), Type::Bool => "bool".to_owned(), Type::Float16 | Type::Float32 | Type::Float64 => "float".to_owned(), + Type::Binary => "bytes".to_owned(), Type::String => "str".to_owned(), Type::Array { elem_type, @@ -1691,6 +1692,7 @@ fn quote_field_type_from_field( ElementType::Float16 => "npt.NDArray[np.float16]".to_owned(), ElementType::Float32 => "npt.NDArray[np.float32]".to_owned(), ElementType::Float64 => "npt.NDArray[np.float64]".to_owned(), + ElementType::Binary => "list[bytes]".to_owned(), ElementType::String => "list[str]".to_owned(), ElementType::Object { .. 
} => { let typ = quote_type_from_element_type(elem_type); @@ -1752,6 +1754,13 @@ fn quote_field_converter_from_field( "float".to_owned() } } + Type::Binary => { + if field.is_nullable { + "bytes_or_none".to_owned() + } else { + "bytes".to_owned() + } + } Type::String => { if field.is_nullable { "str_or_none".to_owned() @@ -1868,6 +1877,7 @@ fn quote_type_from_type(typ: &Type) -> String { | Type::Int64 => "int".to_owned(), Type::Bool => "bool".to_owned(), Type::Float16 | Type::Float32 | Type::Float64 => "float".to_owned(), + Type::Binary => "bytes".to_owned(), Type::String => "str".to_owned(), Type::Object { fqname } => fqname_to_type(fqname), Type::Array { elem_type, .. } | Type::Vector { elem_type } => { @@ -2026,6 +2036,7 @@ fn np_dtype_from_type(t: &Type) -> Option<&'static str> { Type::Float32 => Some("np.float32"), Type::Float64 => Some("np.float64"), Type::Unit + | Type::Binary | Type::String | Type::Array { .. } | Type::Vector { .. } @@ -2122,7 +2133,11 @@ fn quote_arrow_serialization( code.push_indented(2, &field_fwd, 1); } - Type::Unit | Type::String | Type::Array { .. } | Type::Vector { .. } => { + Type::Unit + | Type::Binary + | Type::String + | Type::Array { .. } + | Type::Vector { .. 
} => { return Err( "We lack codegen for arrow-serialization of general structs".to_owned() ); @@ -2249,6 +2264,7 @@ return pa.array(pa_data, type=data_type) | Type::Float16 | Type::Float32 | Type::Float64 + | Type::Binary | Type::String => { let datatype = quote_arrow_datatype(&type_registry.get(&field.fqname)); format!("pa.array({variant_kind_list}, type={datatype})") @@ -2806,7 +2822,7 @@ fn quote_arrow_datatype(datatype: &DataType) -> String { DataType::Atomic(AtomicDataType::Float32) => "pa.float32()".to_owned(), DataType::Atomic(AtomicDataType::Float64) => "pa.float64()".to_owned(), - DataType::Binary => "pa.binary()".to_owned(), + DataType::Binary => "pa.large_binary()".to_owned(), DataType::Utf8 => "pa.utf8()".to_owned(), diff --git a/crates/build/re_types_builder/src/codegen/rust/api.rs b/crates/build/re_types_builder/src/codegen/rust/api.rs index 0b57dc79b7ea..1b229d237747 100644 --- a/crates/build/re_types_builder/src/codegen/rust/api.rs +++ b/crates/build/re_types_builder/src/codegen/rust/api.rs @@ -783,6 +783,7 @@ impl quote::ToTokens for TypeTokenizer<'_> { Type::Float16 => quote!(half::f16), Type::Float32 => quote!(f32), Type::Float64 => quote!(f64), + Type::Binary => quote!(::arrow::buffer::Buffer), Type::String => quote!(::re_types_core::ArrowString), Type::Array { elem_type, length } => { if *unwrap { @@ -821,6 +822,7 @@ impl quote::ToTokens for &ElementType { ElementType::Float16 => quote!(half::f16), ElementType::Float32 => quote!(f32), ElementType::Float64 => quote!(f64), + ElementType::Binary => quote!(::arrow::buffer::Buffer), ElementType::String => quote!(::re_types_core::ArrowString), ElementType::Object { fqname } => quote_fqname_as_type_path(fqname), } diff --git a/crates/build/re_types_builder/src/codegen/rust/arrow.rs b/crates/build/re_types_builder/src/codegen/rust/arrow.rs index f3b5025fa22e..92fe805feb29 100644 --- a/crates/build/re_types_builder/src/codegen/rust/arrow.rs +++ b/crates/build/re_types_builder/src/codegen/rust/arrow.rs 
@@ -35,7 +35,7 @@ impl quote::ToTokens for ArrowDataTypeTokenizer<'_> { DataType::Atomic(AtomicDataType::Float32) => quote!(DataType::Float32), DataType::Atomic(AtomicDataType::Float64) => quote!(DataType::Float64), - DataType::Binary => quote!(DataType::Binary), + DataType::Binary => quote!(DataType::LargeBinary), DataType::Utf8 => quote!(DataType::Utf8), diff --git a/crates/build/re_types_builder/src/codegen/rust/deserializer.rs b/crates/build/re_types_builder/src/codegen/rust/deserializer.rs index bd26513db2fb..172ea1d2320d 100644 --- a/crates/build/re_types_builder/src/codegen/rust/deserializer.rs +++ b/crates/build/re_types_builder/src/codegen/rust/deserializer.rs @@ -537,6 +537,68 @@ fn quote_arrow_field_deserializer( } } + DataType::Binary => { + // Special code to handle deserializing both 32-bit and 64-bit offsets (BinaryArray vs LargeBinaryArray) + quote! {{ + fn extract_from_binary( + arrow_data: &arrow::array::GenericByteArray>, + ) -> DeserializationResult>> + where + O: ::arrow::array::OffsetSizeTrait, + { + use ::arrow::array::Array as _; + use ::re_types_core::arrow_zip_validity::ZipValidity; + + let arrow_data_buf = arrow_data.values(); + let offsets = arrow_data.offsets(); + + ZipValidity::new_with_validity(offsets.windows(2), arrow_data.nulls()) + .map(|elem| { + elem.map(|window| { + // NOTE: Do _not_ use `Buffer::sliced`, it panics on malformed inputs. + + let start = window[0].as_usize(); + let end = window[1].as_usize(); + let len = end - start; + + // NOTE: It is absolutely crucial we explicitly handle the + // boundchecks manually first, otherwise rustc completely chokes + // when slicing the data (as in: a 100x perf drop)! 
+ if arrow_data_buf.len() < end { + // error context is appended below during final collection + return Err(DeserializationError::offset_slice_oob( + (start, end), + arrow_data_buf.len(), + )); + } + + #[allow(unsafe_code, clippy::undocumented_unsafe_blocks)] + let data = arrow_data_buf.slice_with_length(start, len); + Ok(data) + }) + .transpose() + }) + .collect::>>>() + } + + if let Some(arrow_data) = #data_src.as_any().downcast_ref::() { + extract_from_binary(arrow_data) + .with_context(#obj_field_fqname)? + .into_iter() + } else if let Some(arrow_data) = #data_src.as_any().downcast_ref::() + { + extract_from_binary(arrow_data) + .with_context(#obj_field_fqname)? + .into_iter() + } else { + let expected = Self::arrow_datatype(); + let actual = arrow_data.data_type().clone(); + return Err(DeserializationError::datatype_mismatch(expected, actual)) + .with_context(#obj_field_fqname); + } + }} + } + DataType::Utf8 => { let quoted_downcast = { let cast_as = quote!(StringArray); @@ -824,7 +886,7 @@ fn quote_arrow_field_deserializer( quote!(#fqname_use::from_arrow_opt(#data_src).with_context(#obj_field_fqname)?.into_iter()) } - _ => unimplemented!("{datatype:#?}"), + DataType::Object { .. } => unimplemented!("{datatype:#?}"), } } diff --git a/crates/build/re_types_builder/src/codegen/rust/serializer.rs b/crates/build/re_types_builder/src/codegen/rust/serializer.rs index 48108bfcd2bb..3a27808bf03f 100644 --- a/crates/build/re_types_builder/src/codegen/rust/serializer.rs +++ b/crates/build/re_types_builder/src/codegen/rust/serializer.rs @@ -583,7 +583,14 @@ fn quote_arrow_field_serializer( } } - DataType::Utf8 => { + DataType::Binary | DataType::Utf8 => { + let is_binary = datatype.to_logical_type() == &DataType::Binary; + let as_bytes = if is_binary { + quote!() + } else { + quote!(.as_bytes()) + }; + // NOTE: We need values for all slots, regardless of what the validity says, // hence `unwrap_or_default`. 
let (quoted_member_accessor, quoted_transparent_length) = if inner_is_arrow_transparent @@ -623,7 +630,7 @@ fn quote_arrow_field_serializer( let inner_data_and_offsets = if elements_are_nullable { quote! { - let offsets = arrow::buffer::OffsetBuffer::::from_lengths( + let offsets = arrow::buffer::OffsetBuffer::from_lengths( #data_src.iter().map(|opt| opt.as_ref() #quoted_transparent_length .unwrap_or_default()) ); @@ -636,13 +643,13 @@ fn quote_arrow_field_serializer( // NOTE: Flattening to remove the guaranteed layer of nullability: we don't care // about it while building the backing buffer since it's all offsets driven. for data in #data_src.iter().flatten() { - buffer_builder.append_slice(data #quoted_member_accessor.as_bytes()); + buffer_builder.append_slice(data #quoted_member_accessor #as_bytes); } let inner_data: arrow::buffer::Buffer = buffer_builder.finish(); } } else { quote! { - let offsets = arrow::buffer::OffsetBuffer::::from_lengths( + let offsets = arrow::buffer::OffsetBuffer::from_lengths( #data_src.iter() #quoted_transparent_length ); @@ -653,22 +660,29 @@ fn quote_arrow_field_serializer( let mut buffer_builder = arrow::array::builder::BufferBuilder::::new(capacity); for data in &#data_src { - buffer_builder.append_slice(data #quoted_member_accessor.as_bytes()); + buffer_builder.append_slice(data #quoted_member_accessor #as_bytes); } let inner_data: arrow::buffer::Buffer = buffer_builder.finish(); } }; - quote! {{ - #inner_data_and_offsets - - // Safety: we're building this from actual native strings, so no need to do the - // whole utf8 validation _again_. - // It would be nice to use quote_comment here and put this safety notice in the generated code, - // but that seems to push us over some complexity limit causing rustfmt to fail. - #[allow(unsafe_code, clippy::undocumented_unsafe_blocks)] - as_array_ref(unsafe { StringArray::new_unchecked(offsets, inner_data, #validity_src) }) - }} + if is_binary { + quote! 
{{ + #inner_data_and_offsets + as_array_ref(LargeBinaryArray::new(offsets, inner_data, #validity_src)) + }} + } else { + quote! {{ + #inner_data_and_offsets + + // Safety: we're building this from actual native strings, so no need to do the + // whole utf8 validation _again_. + // It would be nice to use quote_comment here and put this safety notice in the generated code, + // but that seems to push us over some complexity limit causing rustfmt to fail. + #[allow(unsafe_code, clippy::undocumented_unsafe_blocks)] + as_array_ref(unsafe { StringArray::new_unchecked(offsets, inner_data, #validity_src) }) + }} + } } DataType::List(inner_field) | DataType::FixedSizeList(inner_field, _) => { @@ -919,6 +933,6 @@ fn quote_arrow_field_serializer( }} } - _ => unimplemented!("{datatype:#?}"), + DataType::Object { .. } => unimplemented!("{datatype:#?}"), } } diff --git a/crates/build/re_types_builder/src/data_type.rs b/crates/build/re_types_builder/src/data_type.rs index 68b7eb3f7f6c..28e0533abd19 100644 --- a/crates/build/re_types_builder/src/data_type.rs +++ b/crates/build/re_types_builder/src/data_type.rs @@ -112,6 +112,7 @@ impl std::fmt::Display for AtomicDataType { pub enum DataType { Atomic(AtomicDataType), + // 32-bit or 64-bit Binary, Utf8, @@ -153,8 +154,12 @@ impl DataType { pub enum LazyDatatype { Atomic(AtomicDataType), + /// A list of bytes of arbitrary length. + /// + /// 32-bit or 64-bit Binary, + /// Utf8 Utf8, /// Elements are non-nullable diff --git a/crates/build/re_types_builder/src/objects.rs b/crates/build/re_types_builder/src/objects.rs index 1d2b3507eb59..a173e3c03c05 100644 --- a/crates/build/re_types_builder/src/objects.rs +++ b/crates/build/re_types_builder/src/objects.rs @@ -1190,7 +1190,15 @@ pub enum Type { Float16, Float32, Float64, + + /// A list of bytes of arbitrary length. 
+ /// + /// 32-bit or 64-bit + Binary, + + /// Utf8 String, + Array { elem_type: ElementType, length: usize, @@ -1218,6 +1226,7 @@ impl From for Type { ElementType::Float16 => Self::Float16, ElementType::Float32 => Self::Float32, ElementType::Float64 => Self::Float64, + ElementType::Binary => Self::Binary, ElementType::String => Self::String, ElementType::Object { fqname } => Self::Object { fqname }, } @@ -1236,14 +1245,28 @@ impl Type { let typ = field_type.base_type(); if let Some(type_override) = attrs.try_get::(fqname, ATTR_RERUN_OVERRIDE_TYPE) { - match (typ, type_override.as_str()) { - (FbsBaseType::UShort, "float16") => { - return Self::Float16; + match type_override.as_str() { + "binary" => { + if typ == FbsBaseType::Vector && field_type.element() == FbsBaseType::UByte { + return Self::Binary; + } else { + panic!("{fqname}: 'binary' can only be used on '[ubyte]', got {typ:?}") + } + } + "float16" => { + if matches!(typ, FbsBaseType::Array | FbsBaseType::Vector) { + // Array of float16 handled later + } else if typ == FbsBaseType::UShort { + return Self::Float16; + } else { + panic!( + "{fqname}: 'float16' can only be used on 'ushort' or `[ushort]`, got {typ:?}" + ) + } + } + _ => { + panic!("{fqname}: Unknown {ATTR_RERUN_OVERRIDE_TYPE:?}: {type_override:?}"); } - (FbsBaseType::Array | FbsBaseType::Vector, "float16") => {} - _ => unreachable!( - "UShort -> float16 is the only permitted type override. Not {typ:#?}->{type_override}" - ), } } @@ -1358,6 +1381,9 @@ impl Type { Self::Float64 => Some(Self::Vector { elem_type: ElementType::Float64, }), + Self::Binary => Some(Self::Vector { + elem_type: ElementType::Binary, + }), Self::String => Some(Self::Vector { elem_type: ElementType::String, }), @@ -1398,6 +1424,7 @@ impl Type { | Self::Float16 | Self::Float32 | Self::Float64 + | Self::Binary | Self::String | Self::Object { .. } => None, } @@ -1438,7 +1465,7 @@ impl Type { | Self::Float32 | Self::Float64 => true, - Self::String | Self::Vector { .. 
} => false, + Self::Binary | Self::String | Self::Vector { .. } => false, Self::Array { elem_type, .. } => elem_type.has_default_destructor(objects), @@ -1523,8 +1550,18 @@ pub enum ElementType { Float16, Float32, Float64, + + /// A list of bytes of arbitrary length. + /// + /// 32-bit or 64-bit + Binary, + + /// Utf8 String, - Object { fqname: String }, + + Object { + fqname: String, + }, } impl ElementType { @@ -1615,7 +1652,7 @@ impl ElementType { | Self::Float32 | Self::Float64 => true, - Self::String => false, + Self::Binary | Self::String => false, Self::Object { fqname } => objects[fqname].has_default_destructor(objects), } @@ -1637,7 +1674,7 @@ impl ElementType { | Self::Float16 | Self::Float32 | Self::Float64 => true, - Self::Bool | Self::Object { .. } | Self::String => false, + Self::Bool | Self::Binary | Self::String | Self::Object { .. } => false, } } diff --git a/crates/build/re_types_builder/src/type_registry.rs b/crates/build/re_types_builder/src/type_registry.rs index fdb7e2c52fca..d79bdd36613c 100644 --- a/crates/build/re_types_builder/src/type_registry.rs +++ b/crates/build/re_types_builder/src/type_registry.rs @@ -163,6 +163,7 @@ impl TypeRegistry { Type::Float16 => LazyDatatype::Atomic(AtomicDataType::Float16), Type::Float32 => LazyDatatype::Atomic(AtomicDataType::Float32), Type::Float64 => LazyDatatype::Atomic(AtomicDataType::Float64), + Type::Binary => LazyDatatype::Binary, Type::String => LazyDatatype::Utf8, Type::Array { elem_type, length } => LazyDatatype::FixedSizeList( LazyField { @@ -215,6 +216,7 @@ impl TypeRegistry { ElementType::Float16 => LazyDatatype::Atomic(AtomicDataType::Float16), ElementType::Float32 => LazyDatatype::Atomic(AtomicDataType::Float32), ElementType::Float64 => LazyDatatype::Atomic(AtomicDataType::Float64), + ElementType::Binary => LazyDatatype::Binary, ElementType::String => LazyDatatype::Utf8, ElementType::Object { fqname } => LazyDatatype::Unresolved { fqname }, } diff --git a/crates/store/re_chunk/src/chunk.rs 
b/crates/store/re_chunk/src/chunk.rs index 5e13dda091af..8d664059021c 100644 --- a/crates/store/re_chunk/src/chunk.rs +++ b/crates/store/re_chunk/src/chunk.rs @@ -12,7 +12,7 @@ use arrow::{ use itertools::{Either, Itertools as _, izip}; use nohash_hasher::IntMap; -use re_arrow_util::ArrowArrayDowncastRef as _; +use re_arrow_util::{ArrowArrayDowncastRef as _, widen_binary_arrays}; use re_byte_size::SizeBytes as _; use re_log_types::{ AbsoluteTimeRange, EntityPath, NonMinI64, TimeInt, TimeType, Timeline, TimelineName, @@ -101,6 +101,8 @@ impl ChunkComponents { let Some(right_array) = right.get(descr) else { anyhow::bail!("rhs is missing {descr:?}"); }; + let left_array = widen_binary_arrays(left_array); + let right_array = widen_binary_arrays(right_array); re_arrow_util::ensure_similar(&left_array.to_data(), &right_array.to_data()) .with_context(|| format!("Component {descr:?}"))?; } diff --git a/crates/store/re_chunk/src/iter.rs b/crates/store/re_chunk/src/iter.rs index ba1192fb4da7..f8e6465e44f2 100644 --- a/crates/store/re_chunk/src/iter.rs +++ b/crates/store/re_chunk/src/iter.rs @@ -4,7 +4,7 @@ use arrow::{ array::{ Array as ArrowArray, ArrayRef as ArrowArrayRef, ArrowPrimitiveType, BinaryArray, BooleanArray as ArrowBooleanArray, FixedSizeListArray as ArrowFixedSizeListArray, - ListArray as ArrowListArray, PrimitiveArray as ArrowPrimitiveArray, + LargeBinaryArray, ListArray as ArrowListArray, PrimitiveArray as ArrowPrimitiveArray, StringArray as ArrowStringArray, StructArray as ArrowStructArray, }, buffer::{BooleanBuffer as ArrowBooleanBuffer, Buffer, ScalarBuffer as ArrowScalarBuffer}, @@ -12,7 +12,7 @@ use arrow::{ }; use itertools::{Either, Itertools as _, izip}; -use re_arrow_util::{ArrowArrayDowncastRef as _, offsets_lengths}; +use re_arrow_util::ArrowArrayDowncastRef as _; use re_log_types::{TimeInt, TimePoint, TimelineName}; use re_span::Span; use re_types_core::{ArrowString, Component, ComponentDescriptor}; @@ -205,7 +205,7 @@ impl Chunk { }; let offsets 
= list_array.offsets().iter().map(|idx| *idx as usize); - let lengths = offsets_lengths(list_array.offsets()); + let lengths = list_array.offsets().lengths(); if let Some(validity) = list_array.nulls() { Either::Right(Either::Left( @@ -520,7 +520,7 @@ where let values = values.values(); let offsets = inner_list_array.offsets(); - let lengths = offsets_lengths(inner_list_array.offsets()).collect_vec(); + let lengths = offsets.lengths().collect_vec(); // NOTE: No need for validity checks here, `component_spans` already takes care of that. Either::Right(component_spans.map(move |span| { @@ -533,7 +533,7 @@ where })) } -// We special case `&[u8]` so that it works both for `List[u8]` and `Binary` arrays. +// We special case `&[u8]` so that it works both for `List[u8]` and `Binary/LargeBinary` arrays. fn slice_as_u8<'a>( component_descriptor: ComponentDescriptor, array: &'a dyn ArrowArray, @@ -542,17 +542,31 @@ fn slice_as_u8<'a>( if let Some(binary_array) = array.downcast_array_ref::() { let values = binary_array.values(); let offsets = binary_array.offsets(); - let lengths = offsets_lengths(binary_array.offsets()).collect_vec(); + let lengths = offsets.lengths().collect_vec(); // NOTE: No need for validity checks here, `component_spans` already takes care of that. - Either::Left(component_spans.map(move |span| { + Either::Left(Either::Left(component_spans.map(move |span| { let offsets = &offsets[span.range()]; let lengths = &lengths[span.range()]; izip!(offsets, lengths) // NOTE: Not an actual clone, just a refbump of the underlying buffer. .map(|(&idx, &len)| values.clone().slice_with_length(idx as _, len)) .collect_vec() - })) + }))) + } else if let Some(binary_array) = array.downcast_array_ref::() { + let values = binary_array.values(); + let offsets = binary_array.offsets(); + let lengths = offsets.lengths().collect_vec(); + + // NOTE: No need for validity checks here, `component_spans` already takes care of that. 
+ Either::Left(Either::Right(component_spans.map(move |span| { + let offsets = &offsets[span.range()]; + let lengths = &lengths[span.range()]; + izip!(offsets, lengths) + // NOTE: Not an actual clone, just a refbump of the underlying buffer. + .map(|(&idx, &len)| values.clone().slice_with_length(idx as _, len)) + .collect_vec() + }))) } else { Either::Right( slice_as_buffer_native::( @@ -639,7 +653,7 @@ where }; let inner_offsets = inner_list_array.offsets(); - let inner_lengths = offsets_lengths(inner_list_array.offsets()).collect_vec(); + let inner_lengths = inner_offsets.lengths().collect_vec(); let Some(fixed_size_list_array) = inner_list_array .values() @@ -738,7 +752,7 @@ impl ChunkComponentSlicer for String { let values = utf8_array.values().clone(); let offsets = utf8_array.offsets().clone(); - let lengths = offsets_lengths(utf8_array.offsets()).collect_vec(); + let lengths = offsets.lengths().collect_vec(); // NOTE: No need for validity checks here, `component_spans` already takes care of that. Either::Right(component_spans.map(move |range| { diff --git a/crates/store/re_data_loader/src/loader_archetype.rs b/crates/store/re_data_loader/src/loader_archetype.rs index d04551ece964..e9caba767b66 100644 --- a/crates/store/re_data_loader/src/loader_archetype.rs +++ b/crates/store/re_data_loader/src/loader_archetype.rs @@ -193,7 +193,10 @@ fn load_video( re_log_types::TimeCell::ZERO_DURATION, ); - let video_asset = AssetVideo::new(contents); + let video_asset = { + re_tracing::profile_scope!("serialize-as-arrow"); + AssetVideo::new(contents) + }; let video_frame_reference_chunk = match video_asset.read_frame_timestamps_nanos() { Ok(frame_timestamps_nanos) => { diff --git a/crates/store/re_sorbet/src/migrations/mod.rs b/crates/store/re_sorbet/src/migrations/mod.rs index 3084cb591429..8590ab7769ac 100644 --- a/crates/store/re_sorbet/src/migrations/mod.rs +++ b/crates/store/re_sorbet/src/migrations/mod.rs @@ -1,6 +1,10 @@ #![expect(non_snake_case)] //! 
These are the migrations that are introduced for each Sorbet version. +//! +//! When you introduce a breaking change, these are the steps: +//! * Bump [`SorbetSchema::METADATA_VERSION`] +//! * Add a new `mod vX_Y_Z__to__vX_Y_W` use std::cmp::Ordering; @@ -109,7 +113,7 @@ pub fn migrate_record_batch(mut batch: RecordBatch) -> RecordBatch { Ok(batch_version) => match batch_version.cmp(&SorbetSchema::METADATA_VERSION) { Ordering::Equal => { // Provide this code path as an early out to avoid unnecessary comparisons. - re_log::trace!("Batch version matches Sorbet version."); + re_log::trace!("Batch version matches Sorbet version ({batch_version})"); batch } Ordering::Less => { @@ -120,7 +124,7 @@ pub fn migrate_record_batch(mut batch: RecordBatch) -> RecordBatch { ); batch } else { - re_log::trace!("Performing migrations…"); + re_log::debug_once!("Performing migrations from {batch_version}…"); batch = maybe_apply::(&batch_version, batch); batch = maybe_apply::(&batch_version, batch); batch = maybe_apply::(&batch_version, batch); diff --git a/crates/store/re_types/definitions/rerun/attributes.fbs b/crates/store/re_types/definitions/rerun/attributes.fbs index 81a3dfb78346..c77fd9085b51 100644 --- a/crates/store/re_types/definitions/rerun/attributes.fbs +++ b/crates/store/re_types/definitions/rerun/attributes.fbs @@ -28,7 +28,9 @@ attribute "attr.rerun.log_missing_as_empty"; /// Override the type of a field. /// -/// The only permitted value is "float16", which can only be used to override the type of a ushort. +/// The only permitted values are: +/// - `binary`, to override `[ubyte]` +/// - `float16`, to override `ushort` /// /// For lists this will apply to the inner element. 
attribute "attr.rerun.override_type"; diff --git a/crates/store/re_types/src/archetypes/asset_video_ext.rs b/crates/store/re_types/src/archetypes/asset_video_ext.rs index df4d31d220b0..99561c0758e4 100644 --- a/crates/store/re_types/src/archetypes/asset_video_ext.rs +++ b/crates/store/re_types/src/archetypes/asset_video_ext.rs @@ -52,7 +52,7 @@ impl AssetVideo { re_tracing::profile_function!(); let Some(blob_bytes) = self.blob.as_ref().and_then(Blob::serialized_blob_as_slice) else { - return Ok(Vec::new()); + return Err(re_video::VideoLoadError::NoVideoTrack); // Error type is close enough }; let Some(media_type) = self diff --git a/crates/store/re_types/src/blueprint/datatypes/component_column_selector.rs b/crates/store/re_types/src/blueprint/datatypes/component_column_selector.rs index 138bdff7b0f4..3424efbd61d8 100644 --- a/crates/store/re_types/src/blueprint/datatypes/component_column_selector.rs +++ b/crates/store/re_types/src/blueprint/datatypes/component_column_selector.rs @@ -105,7 +105,7 @@ impl ::re_types_core::Loggable for ComponentColumnSelector { any_nones.then(|| somes.into()) }; { - let offsets = arrow::buffer::OffsetBuffer::::from_lengths( + let offsets = arrow::buffer::OffsetBuffer::from_lengths( entity_path.iter().map(|opt| { opt.as_ref().map(|datum| datum.0.len()).unwrap_or_default() }), @@ -142,7 +142,7 @@ impl ::re_types_core::Loggable for ComponentColumnSelector { any_nones.then(|| somes.into()) }; { - let offsets = arrow::buffer::OffsetBuffer::::from_lengths( + let offsets = arrow::buffer::OffsetBuffer::from_lengths( component.iter().map(|opt| { opt.as_ref().map(|datum| datum.0.len()).unwrap_or_default() }), diff --git a/crates/store/re_types/src/blueprint/datatypes/selected_columns.rs b/crates/store/re_types/src/blueprint/datatypes/selected_columns.rs index b42d4e595a46..cd169788c511 100644 --- a/crates/store/re_types/src/blueprint/datatypes/selected_columns.rs +++ b/crates/store/re_types/src/blueprint/datatypes/selected_columns.rs @@ -135,9 
+135,10 @@ impl ::re_types_core::Loggable for SelectedColumns { )), offsets, { - let offsets = arrow::buffer::OffsetBuffer::::from_lengths( + let offsets = arrow::buffer::OffsetBuffer::from_lengths( time_columns_inner_data.iter().map(|datum| datum.0.len()), ); + #[allow(clippy::unwrap_used)] let capacity = offsets.last().copied().unwrap() as usize; let mut buffer_builder = diff --git a/crates/store/re_types/src/datatypes/annotation_info.rs b/crates/store/re_types/src/datatypes/annotation_info.rs index 955ffc8a2f78..ad6e8550a06f 100644 --- a/crates/store/re_types/src/datatypes/annotation_info.rs +++ b/crates/store/re_types/src/datatypes/annotation_info.rs @@ -114,7 +114,7 @@ impl ::re_types_core::Loggable for AnnotationInfo { any_nones.then(|| somes.into()) }; { - let offsets = arrow::buffer::OffsetBuffer::::from_lengths( + let offsets = arrow::buffer::OffsetBuffer::from_lengths( label.iter().map(|opt| { opt.as_ref().map(|datum| datum.0.len()).unwrap_or_default() }), diff --git a/crates/store/re_types/src/datatypes/tensor_data.rs b/crates/store/re_types/src/datatypes/tensor_data.rs index b5d8c0bd8f74..3f3ea01614d5 100644 --- a/crates/store/re_types/src/datatypes/tensor_data.rs +++ b/crates/store/re_types/src/datatypes/tensor_data.rs @@ -192,7 +192,7 @@ impl ::re_types_core::Loggable for TensorData { std::sync::Arc::new(Field::new("item", DataType::Utf8, false)), offsets, { - let offsets = arrow::buffer::OffsetBuffer::::from_lengths( + let offsets = arrow::buffer::OffsetBuffer::from_lengths( names_inner_data.iter().map(|datum| datum.len()), ); #[allow(clippy::unwrap_used)] @@ -203,6 +203,7 @@ impl ::re_types_core::Loggable for TensorData { buffer_builder.append_slice(data.as_bytes()); } let inner_data: arrow::buffer::Buffer = buffer_builder.finish(); + #[allow(unsafe_code, clippy::undocumented_unsafe_blocks)] as_array_ref(unsafe { StringArray::new_unchecked( diff --git a/crates/store/re_types/src/datatypes/utf8pair.rs b/crates/store/re_types/src/datatypes/utf8pair.rs 
index 7f2fdabc6ea8..faf9d7665daf 100644 --- a/crates/store/re_types/src/datatypes/utf8pair.rs +++ b/crates/store/re_types/src/datatypes/utf8pair.rs @@ -84,7 +84,7 @@ impl ::re_types_core::Loggable for Utf8Pair { any_nones.then(|| somes.into()) }; { - let offsets = arrow::buffer::OffsetBuffer::::from_lengths( + let offsets = arrow::buffer::OffsetBuffer::from_lengths( first.iter().map(|opt| { opt.as_ref().map(|datum| datum.0.len()).unwrap_or_default() }), @@ -117,12 +117,11 @@ impl ::re_types_core::Loggable for Utf8Pair { any_nones.then(|| somes.into()) }; { - let offsets = arrow::buffer::OffsetBuffer::::from_lengths( + let offsets = arrow::buffer::OffsetBuffer::from_lengths( second.iter().map(|opt| { opt.as_ref().map(|datum| datum.0.len()).unwrap_or_default() }), ); - #[allow(clippy::unwrap_used)] let capacity = offsets.last().copied().unwrap() as usize; let mut buffer_builder = diff --git a/crates/store/re_types/src/testing/components/affix_fuzzer10.rs b/crates/store/re_types/src/testing/components/affix_fuzzer10.rs index e9ca73d98797..d2c4769762e4 100644 --- a/crates/store/re_types/src/testing/components/affix_fuzzer10.rs +++ b/crates/store/re_types/src/testing/components/affix_fuzzer10.rs @@ -63,7 +63,7 @@ impl ::re_types_core::Loggable for AffixFuzzer10 { any_nones.then(|| somes.into()) }; { - let offsets = arrow::buffer::OffsetBuffer::::from_lengths( + let offsets = arrow::buffer::OffsetBuffer::from_lengths( data0 .iter() .map(|opt| opt.as_ref().map(|datum| datum.len()).unwrap_or_default()), diff --git a/crates/store/re_types/src/testing/components/affix_fuzzer12.rs b/crates/store/re_types/src/testing/components/affix_fuzzer12.rs index be5c3eee7dc0..b7765d1cacfa 100644 --- a/crates/store/re_types/src/testing/components/affix_fuzzer12.rs +++ b/crates/store/re_types/src/testing/components/affix_fuzzer12.rs @@ -78,7 +78,7 @@ impl ::re_types_core::Loggable for AffixFuzzer12 { std::sync::Arc::new(Field::new("item", DataType::Utf8, false)), offsets, { - let offsets = 
arrow::buffer::OffsetBuffer::::from_lengths( + let offsets = arrow::buffer::OffsetBuffer::from_lengths( data0_inner_data.iter().map(|datum| datum.len()), ); diff --git a/crates/store/re_types/src/testing/components/affix_fuzzer13.rs b/crates/store/re_types/src/testing/components/affix_fuzzer13.rs index b4c60584843e..7bfbbb6eea5b 100644 --- a/crates/store/re_types/src/testing/components/affix_fuzzer13.rs +++ b/crates/store/re_types/src/testing/components/affix_fuzzer13.rs @@ -78,7 +78,7 @@ impl ::re_types_core::Loggable for AffixFuzzer13 { std::sync::Arc::new(Field::new("item", DataType::Utf8, false)), offsets, { - let offsets = arrow::buffer::OffsetBuffer::::from_lengths( + let offsets = arrow::buffer::OffsetBuffer::from_lengths( data0_inner_data.iter().map(|datum| datum.len()), ); diff --git a/crates/store/re_types/src/testing/components/affix_fuzzer9.rs b/crates/store/re_types/src/testing/components/affix_fuzzer9.rs index b50c7b022d93..30792d6d7f59 100644 --- a/crates/store/re_types/src/testing/components/affix_fuzzer9.rs +++ b/crates/store/re_types/src/testing/components/affix_fuzzer9.rs @@ -63,7 +63,7 @@ impl ::re_types_core::Loggable for AffixFuzzer9 { any_nones.then(|| somes.into()) }; { - let offsets = arrow::buffer::OffsetBuffer::::from_lengths( + let offsets = arrow::buffer::OffsetBuffer::from_lengths( data0 .iter() .map(|opt| opt.as_ref().map(|datum| datum.len()).unwrap_or_default()), diff --git a/crates/store/re_types/src/testing/datatypes/affix_fuzzer1.rs b/crates/store/re_types/src/testing/datatypes/affix_fuzzer1.rs index 085e6b75d98f..56f0d62cbaf3 100644 --- a/crates/store/re_types/src/testing/datatypes/affix_fuzzer1.rs +++ b/crates/store/re_types/src/testing/datatypes/affix_fuzzer1.rs @@ -184,12 +184,11 @@ impl ::re_types_core::Loggable for AffixFuzzer1 { any_nones.then(|| somes.into()) }; { - let offsets = arrow::buffer::OffsetBuffer::::from_lengths( + let offsets = arrow::buffer::OffsetBuffer::from_lengths( single_string_required.iter().map(|opt| { 
opt.as_ref().map(|datum| datum.len()).unwrap_or_default() }), ); - #[allow(clippy::unwrap_used)] let capacity = offsets.last().copied().unwrap() as usize; let mut buffer_builder = @@ -225,12 +224,11 @@ impl ::re_types_core::Loggable for AffixFuzzer1 { any_nones.then(|| somes.into()) }; { - let offsets = arrow::buffer::OffsetBuffer::::from_lengths( + let offsets = arrow::buffer::OffsetBuffer::from_lengths( single_string_optional.iter().map(|opt| { opt.as_ref().map(|datum| datum.len()).unwrap_or_default() }), ); - #[allow(clippy::unwrap_used)] let capacity = offsets.last().copied().unwrap() as usize; let mut buffer_builder = @@ -325,11 +323,12 @@ impl ::re_types_core::Loggable for AffixFuzzer1 { std::sync::Arc::new(Field::new("item", DataType::Utf8, false)), offsets, { - let offsets = arrow::buffer::OffsetBuffer::::from_lengths( + let offsets = arrow::buffer::OffsetBuffer::from_lengths( many_strings_required_inner_data .iter() .map(|datum| datum.len()), ); + #[allow(clippy::unwrap_used)] let capacity = offsets.last().copied().unwrap() as usize; let mut buffer_builder = @@ -385,11 +384,12 @@ impl ::re_types_core::Loggable for AffixFuzzer1 { std::sync::Arc::new(Field::new("item", DataType::Utf8, false)), offsets, { - let offsets = arrow::buffer::OffsetBuffer::::from_lengths( + let offsets = arrow::buffer::OffsetBuffer::from_lengths( many_strings_optional_inner_data .iter() .map(|datum| datum.len()), ); + #[allow(clippy::unwrap_used)] let capacity = offsets.last().copied().unwrap() as usize; let mut buffer_builder = diff --git a/crates/store/re_types/src/testing/datatypes/affix_fuzzer20.rs b/crates/store/re_types/src/testing/datatypes/affix_fuzzer20.rs index 5ea3a9b9d486..9b96f2e39cd1 100644 --- a/crates/store/re_types/src/testing/datatypes/affix_fuzzer20.rs +++ b/crates/store/re_types/src/testing/datatypes/affix_fuzzer20.rs @@ -117,11 +117,10 @@ impl ::re_types_core::Loggable for AffixFuzzer20 { any_nones.then(|| somes.into()) }; { - let offsets = 
arrow::buffer::OffsetBuffer::::from_lengths( - s.iter().map(|opt| { + let offsets = + arrow::buffer::OffsetBuffer::from_lengths(s.iter().map(|opt| { opt.as_ref().map(|datum| datum.0.len()).unwrap_or_default() - }), - ); + })); #[allow(clippy::unwrap_used)] let capacity = offsets.last().copied().unwrap() as usize; let mut buffer_builder = diff --git a/crates/store/re_types/src/testing/datatypes/string_component.rs b/crates/store/re_types/src/testing/datatypes/string_component.rs index 76f240dcc5b0..da5ba0d703f0 100644 --- a/crates/store/re_types/src/testing/datatypes/string_component.rs +++ b/crates/store/re_types/src/testing/datatypes/string_component.rs @@ -57,7 +57,7 @@ impl ::re_types_core::Loggable for StringComponent { any_nones.then(|| somes.into()) }; { - let offsets = arrow::buffer::OffsetBuffer::::from_lengths( + let offsets = arrow::buffer::OffsetBuffer::from_lengths( data0 .iter() .map(|opt| opt.as_ref().map(|datum| datum.len()).unwrap_or_default()), diff --git a/crates/store/re_types_core/src/datatypes/entity_path.rs b/crates/store/re_types_core/src/datatypes/entity_path.rs index 98f3ad62c3d4..7c69a91534b3 100644 --- a/crates/store/re_types_core/src/datatypes/entity_path.rs +++ b/crates/store/re_types_core/src/datatypes/entity_path.rs @@ -58,7 +58,7 @@ impl crate::Loggable for EntityPath { any_nones.then(|| somes.into()) }; { - let offsets = arrow::buffer::OffsetBuffer::::from_lengths( + let offsets = arrow::buffer::OffsetBuffer::from_lengths( data0 .iter() .map(|opt| opt.as_ref().map(|datum| datum.len()).unwrap_or_default()), diff --git a/crates/store/re_types_core/src/datatypes/utf8.rs b/crates/store/re_types_core/src/datatypes/utf8.rs index 0e8ec92700f9..b87df0b09c4e 100644 --- a/crates/store/re_types_core/src/datatypes/utf8.rs +++ b/crates/store/re_types_core/src/datatypes/utf8.rs @@ -58,7 +58,7 @@ impl crate::Loggable for Utf8 { any_nones.then(|| somes.into()) }; { - let offsets = arrow::buffer::OffsetBuffer::::from_lengths( + let offsets = 
arrow::buffer::OffsetBuffer::from_lengths( data0 .iter() .map(|opt| opt.as_ref().map(|datum| datum.len()).unwrap_or_default()), diff --git a/crates/store/re_types_core/src/datatypes/visible_time_range.rs b/crates/store/re_types_core/src/datatypes/visible_time_range.rs index 76cdfc395a04..4bce624337d1 100644 --- a/crates/store/re_types_core/src/datatypes/visible_time_range.rs +++ b/crates/store/re_types_core/src/datatypes/visible_time_range.rs @@ -100,7 +100,7 @@ impl crate::Loggable for VisibleTimeRange { any_nones.then(|| somes.into()) }; { - let offsets = arrow::buffer::OffsetBuffer::::from_lengths( + let offsets = arrow::buffer::OffsetBuffer::from_lengths( timeline.iter().map(|opt| { opt.as_ref().map(|datum| datum.0.len()).unwrap_or_default() }), diff --git a/crates/top/rerun_c/src/video.rs b/crates/top/rerun_c/src/video.rs index 56c25bc91645..c53d1b82ddfa 100644 --- a/crates/top/rerun_c/src/video.rs +++ b/crates/top/rerun_c/src/video.rs @@ -16,6 +16,10 @@ pub extern "C" fn rr_video_asset_read_frame_timestamps_nanos( CError::unexpected_null("video_bytes").write_error(error); return std::ptr::null_mut(); } + if video_bytes_len == 0 { + CError::new(CErrorCode::VideoLoadError, "Zero video bytes").write_error(error); + return std::ptr::null_mut(); + } let Some(alloc_func) = alloc_func else { CError::unexpected_null("alloc_func").write_error(error); return std::ptr::null_mut(); @@ -44,7 +48,7 @@ pub extern "C" fn rr_video_asset_read_frame_timestamps_nanos( Err(err) => { CError::new( CErrorCode::VideoLoadError, - &format!("Failed to play video: {err}"), + &format!("Failed to load video: {err}"), ) .write_error(error); return std::ptr::null_mut(); diff --git a/crates/utils/re_arrow_util/src/arrays.rs b/crates/utils/re_arrow_util/src/arrays.rs index cfed9cf10273..69121dcce432 100644 --- a/crates/utils/re_arrow_util/src/arrays.rs +++ b/crates/utils/re_arrow_util/src/arrays.rs @@ -62,20 +62,6 @@ pub fn into_arrow_ref(array: impl Array + 'static) -> ArrayRef { 
std::sync::Arc::new(array) } -/// Returns an iterator with the lengths of the offsets. -pub fn offsets_lengths(offsets: &OffsetBuffer) -> impl Iterator + '_ { - // TODO(emilk): remove when we update to Arrow 54 (which has an API for this) - offsets.windows(2).map(|w| { - let start = w[0]; - let end = w[1]; - debug_assert!( - start <= end && 0 <= start, - "Bad arrow offset buffer: {start}, {end}" - ); - end.saturating_sub(start).max(0) as usize - }) -} - /// Repartitions a [`ListArray`] according to the specified `lengths`, ignoring previous partitioning. /// /// The specified `lengths` must sum to the total length underlying values (i.e. the child array). diff --git a/crates/utils/re_arrow_util/src/lib.rs b/crates/utils/re_arrow_util/src/lib.rs index 0f7a50a0256a..a355c493498a 100644 --- a/crates/utils/re_arrow_util/src/lib.rs +++ b/crates/utils/re_arrow_util/src/lib.rs @@ -9,3 +9,75 @@ pub use self::arrays::*; pub use self::batches::*; pub use self::compare::*; pub use self::format_data_type::*; + +// ---------------------------------------------------------------- + +use std::sync::Arc; + +use arrow::{ + array::{Array as _, AsArray as _, ListArray}, + datatypes::{DataType, Field}, +}; + +/// Convert any `BinaryArray` to `LargeBinaryArray`, because we treat them logically the same +pub fn widen_binary_arrays(list_array: &ListArray) -> ListArray { + let list_data_type = list_array.data_type(); + if let DataType::List(field) = list_data_type + && field.data_type() == &DataType::Binary + { + re_tracing::profile_function!(); + let large_binary_field = Field::new("item", DataType::LargeBinary, true); + let target_type = DataType::List(Arc::new(large_binary_field)); + + #[expect(clippy::unwrap_used)] + arrow::compute::kernels::cast::cast(list_array, &target_type) + .unwrap() + .as_list() + .clone() + } else { + list_array.clone() + } +} + +#[cfg(test)] +mod tests { + use super::*; + use arrow::array::{BinaryBuilder, ListBuilder}; + + #[test] + fn 
test_widen_list_binary() { + // Create test data + let mut list_builder = ListBuilder::new(BinaryBuilder::new()); + + // First list: [b"hello", b"world"] + list_builder.values().append_value(b"hello"); + list_builder.values().append_value(b"world"); + list_builder.append(true); + + // Second list: [b"rust", b"arrow"] + list_builder.values().append_value(b"rust"); + list_builder.values().append_value(b"arrow"); + list_builder.append(true); + + // Third list: null + list_builder.append_null(); + + let original_list = list_builder.finish(); + + // Widen to LargeBinaryArray + let widened_list = widen_binary_arrays(&original_list); + + // Verify the result + assert_eq!(widened_list.len(), 3); + assert!(!widened_list.is_null(0)); + assert!(!widened_list.is_null(1)); + assert!(widened_list.is_null(2)); + + // Check data type + if let DataType::List(field) = widened_list.data_type() { + assert_eq!(field.data_type(), &DataType::LargeBinary); + } else { + panic!("Expected List data type"); + } + } +} diff --git a/crates/utils/re_video/src/demux/mod.rs b/crates/utils/re_video/src/demux/mod.rs index e57ec1047cdc..dbe08efc5e72 100644 --- a/crates/utils/re_video/src/demux/mod.rs +++ b/crates/utils/re_video/src/demux/mod.rs @@ -465,6 +465,10 @@ impl VideoDataDescription { media_type: &str, debug_name: &str, ) -> Result { + if data.is_empty() { + return Err(VideoLoadError::ZeroBytes); + } + re_tracing::profile_function!(); match media_type { "video/mp4" => Self::load_mp4(data, debug_name), @@ -829,7 +833,10 @@ impl SampleMetadata { /// Errors that can occur when loading a video. 
#[derive(thiserror::Error, Debug)] pub enum VideoLoadError { - #[error("Failed to determine media type from data: {0}")] + #[error("The video file is empty (zero bytes)")] + ZeroBytes, + + #[error("MP4 error: {0}")] ParseMp4(#[from] re_mp4::Error), #[error("Video file has no video tracks")] diff --git a/crates/viewer/re_ui/src/arrow_ui.rs b/crates/viewer/re_ui/src/arrow_ui.rs index 18317a49a437..95bee335e45b 100644 --- a/crates/viewer/re_ui/src/arrow_ui.rs +++ b/crates/viewer/re_ui/src/arrow_ui.rs @@ -41,6 +41,9 @@ pub fn arrow_ui(ui: &mut egui::Ui, ui_layout: UiLayout, array: &dyn arrow::array return; } + // Special-case binary data (e.g. blobs). + // We don't want to show their contents (too slow, since they are usually huge), + // so we only show their size: if let Some(binaries) = array.downcast_array_ref::() && binaries.len() == 1 { diff --git a/rerun_py/rerun_sdk/rerun/_converters.py b/rerun_py/rerun_sdk/rerun/_converters.py index c02b4ab59122..1184c7b87543 100644 --- a/rerun_py/rerun_sdk/rerun/_converters.py +++ b/rerun_py/rerun_sdk/rerun/_converters.py @@ -71,6 +71,20 @@ def bool_or_none(data: bool | None) -> bool | None: return bool(data) +@overload +def bytes_or_none(data: None) -> None: ... + + +@overload +def bytes_or_none(data: bytes) -> bytes: ... + + +def bytes_or_none(data: bytes | None) -> bytes | None: + if data is None: + return None + return bytes(data) + + @overload def str_or_none(data: None) -> None: ...