Skip to content
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,11 @@ fn arrow_array_builder_type_and_declaration(
);
ident
}
Type::Binary => {
let ident = format_ident!("LargeBinaryBuilder");
declarations.insert("arrow", ForwardDecl::Class(ident.clone()));
ident
}
Type::String => {
let ident = format_ident!("StringBuilder");
declarations.insert("arrow", ForwardDecl::Class(ident.clone()));
Expand Down
25 changes: 24 additions & 1 deletion crates/build/re_types_builder/src/codegen/cpp/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2003,6 +2003,7 @@ fn quote_fill_arrow_array_builder(
ElementType::Float16 => Some("HalfFloatBuilder"),
ElementType::Float32 => Some("FloatBuilder"),
ElementType::Float64 => Some("DoubleBuilder"),
ElementType::Binary => Some("BinaryBuilder"),
ElementType::String => Some("StringBuilder"),
ElementType::Object{..} => None,
};
Expand Down Expand Up @@ -2233,7 +2234,7 @@ fn quote_append_single_value_to_builder(
value_access: &TokenStream,
includes: &mut Includes,
) -> TokenStream {
match &typ {
match typ {
Type::Unit => {
quote!(ARROW_RETURN_NOT_OK(#value_builder->AppendNull());)
}
Expand All @@ -2252,6 +2253,11 @@ fn quote_append_single_value_to_builder(
| Type::String => {
quote!(ARROW_RETURN_NOT_OK(#value_builder->Append(#value_access));)
}
Type::Binary => {
quote!(
ARROW_RETURN_NOT_OK(#value_builder->Append(#value_access.data(), static_cast<int64_t>(#value_access.size())));
)
}
Type::Float16 => {
// Cast `rerun::half` to a `uint16_t`
quote! {
Expand Down Expand Up @@ -2290,6 +2296,14 @@ fn quote_append_single_value_to_builder(
);
}
}
ElementType::Binary => {
quote! {
for (size_t item_idx = 0; item_idx < #num_items_per_element; item_idx += 1) {
auto&& data = &#value_access[elem_idx].data;
ARROW_RETURN_NOT_OK(#value_builder->Append(data.data(), static_cast<int32_t>(data.size())));
}
}
}
ElementType::String => {
quote! {
for (size_t item_idx = 0; item_idx < #num_items_per_element; item_idx += 1) {
Expand Down Expand Up @@ -2447,6 +2461,10 @@ fn quote_field_type(includes: &mut Includes, obj_field: &ObjectField) -> TokenSt
}
Type::Float32 => quote! { float },
Type::Float64 => quote! { double },
Type::Binary => {
includes.insert_rerun("collection.hpp");
quote! { rerun::Collection<uint8_t> }
}
Type::String => {
includes.insert_system("string");
quote! { std::string }
Expand Down Expand Up @@ -2507,6 +2525,10 @@ fn quote_element_type(includes: &mut Includes, typ: &ElementType) -> TokenStream
}
ElementType::Float32 => quote! { float },
ElementType::Float64 => quote! { double },
ElementType::Binary => {
includes.insert_rerun("collection.hpp");
quote! { rerun::Collection<uint8_t> }
}
ElementType::String => {
includes.insert_system("string");
quote! { std::string }
Expand Down Expand Up @@ -2648,6 +2670,7 @@ fn quote_arrow_datatype(
Type::Float16 => quote!(arrow::float16()),
Type::Float32 => quote!(arrow::float32()),
Type::Float64 => quote!(arrow::float64()),
Type::Binary => quote!(arrow::large_binary()),
Type::String => quote!(arrow::utf8()),
Type::Bool => quote!(arrow::boolean()),

Expand Down
1 change: 1 addition & 0 deletions crates/build/re_types_builder/src/codegen/docs/website.rs
Original file line number Diff line number Diff line change
Expand Up @@ -414,6 +414,7 @@ fn write_fields(reporter: &Reporter, objects: &Objects, o: &mut String, object:
Type::Float16 => atomic("float16"),
Type::Float32 => atomic("float32"),
Type::Float64 => atomic("float64"),
Type::Binary => atomic("binary"),
Type::String => atomic("utf8"),

Type::Array { elem_type, length } => {
Expand Down
20 changes: 18 additions & 2 deletions crates/build/re_types_builder/src/codegen/python/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1673,6 +1673,7 @@ fn quote_field_type_from_field(
| Type::Int64 => "int".to_owned(),
Type::Bool => "bool".to_owned(),
Type::Float16 | Type::Float32 | Type::Float64 => "float".to_owned(),
Type::Binary => "bytes".to_owned(),
Type::String => "str".to_owned(),
Type::Array {
elem_type,
Expand All @@ -1691,6 +1692,7 @@ fn quote_field_type_from_field(
ElementType::Float16 => "npt.NDArray[np.float16]".to_owned(),
ElementType::Float32 => "npt.NDArray[np.float32]".to_owned(),
ElementType::Float64 => "npt.NDArray[np.float64]".to_owned(),
ElementType::Binary => "list[bytes]".to_owned(),
ElementType::String => "list[str]".to_owned(),
ElementType::Object { .. } => {
let typ = quote_type_from_element_type(elem_type);
Expand Down Expand Up @@ -1752,6 +1754,13 @@ fn quote_field_converter_from_field(
"float".to_owned()
}
}
Type::Binary => {
if field.is_nullable {
"bytes_or_none".to_owned()
} else {
"bytes".to_owned()
}
}
Type::String => {
if field.is_nullable {
"str_or_none".to_owned()
Expand Down Expand Up @@ -1868,6 +1877,7 @@ fn quote_type_from_type(typ: &Type) -> String {
| Type::Int64 => "int".to_owned(),
Type::Bool => "bool".to_owned(),
Type::Float16 | Type::Float32 | Type::Float64 => "float".to_owned(),
Type::Binary => "bytes".to_owned(),
Type::String => "str".to_owned(),
Type::Object { fqname } => fqname_to_type(fqname),
Type::Array { elem_type, .. } | Type::Vector { elem_type } => {
Expand Down Expand Up @@ -2026,6 +2036,7 @@ fn np_dtype_from_type(t: &Type) -> Option<&'static str> {
Type::Float32 => Some("np.float32"),
Type::Float64 => Some("np.float64"),
Type::Unit
| Type::Binary
| Type::String
| Type::Array { .. }
| Type::Vector { .. }
Expand Down Expand Up @@ -2122,7 +2133,11 @@ fn quote_arrow_serialization(
code.push_indented(2, &field_fwd, 1);
}

Type::Unit | Type::String | Type::Array { .. } | Type::Vector { .. } => {
Type::Unit
| Type::Binary
| Type::String
| Type::Array { .. }
| Type::Vector { .. } => {
return Err(
"We lack codegen for arrow-serialization of general structs".to_owned()
);
Expand Down Expand Up @@ -2249,6 +2264,7 @@ return pa.array(pa_data, type=data_type)
| Type::Float16
| Type::Float32
| Type::Float64
| Type::Binary
| Type::String => {
let datatype = quote_arrow_datatype(&type_registry.get(&field.fqname));
format!("pa.array({variant_kind_list}, type={datatype})")
Expand Down Expand Up @@ -2806,7 +2822,7 @@ fn quote_arrow_datatype(datatype: &DataType) -> String {
DataType::Atomic(AtomicDataType::Float32) => "pa.float32()".to_owned(),
DataType::Atomic(AtomicDataType::Float64) => "pa.float64()".to_owned(),

DataType::Binary => "pa.binary()".to_owned(),
DataType::Binary => "pa.large_binary()".to_owned(),

DataType::Utf8 => "pa.utf8()".to_owned(),

Expand Down
2 changes: 2 additions & 0 deletions crates/build/re_types_builder/src/codegen/rust/api.rs
Original file line number Diff line number Diff line change
Expand Up @@ -783,6 +783,7 @@ impl quote::ToTokens for TypeTokenizer<'_> {
Type::Float16 => quote!(half::f16),
Type::Float32 => quote!(f32),
Type::Float64 => quote!(f64),
Type::Binary => quote!(::arrow::buffer::Buffer),
Type::String => quote!(::re_types_core::ArrowString),
Type::Array { elem_type, length } => {
if *unwrap {
Expand Down Expand Up @@ -821,6 +822,7 @@ impl quote::ToTokens for &ElementType {
ElementType::Float16 => quote!(half::f16),
ElementType::Float32 => quote!(f32),
ElementType::Float64 => quote!(f64),
ElementType::Binary => quote!(::arrow::buffer::Buffer),
ElementType::String => quote!(::re_types_core::ArrowString),
ElementType::Object { fqname } => quote_fqname_as_type_path(fqname),
}
Expand Down
2 changes: 1 addition & 1 deletion crates/build/re_types_builder/src/codegen/rust/arrow.rs
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ impl quote::ToTokens for ArrowDataTypeTokenizer<'_> {
DataType::Atomic(AtomicDataType::Float32) => quote!(DataType::Float32),
DataType::Atomic(AtomicDataType::Float64) => quote!(DataType::Float64),

DataType::Binary => quote!(DataType::Binary),
DataType::Binary => quote!(DataType::LargeBinary),

DataType::Utf8 => quote!(DataType::Utf8),

Expand Down
64 changes: 63 additions & 1 deletion crates/build/re_types_builder/src/codegen/rust/deserializer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -537,6 +537,68 @@ fn quote_arrow_field_deserializer(
}
}

DataType::Binary => {
// Special code to handle deserializing both 32-bit and 64-bit offsets (BinaryArray vs LargeBinaryArray)
quote! {{
// Turns every slot of a binary array into an optional `Buffer` slice of its values buffer.
//
// Generic over the offset type `O` so the same code serves both `BinaryArray` and
// `LargeBinaryArray`. Slots that `ZipValidity` yields as `None` stay `None` in the output.
//
// Returns `DeserializationError::offset_slice_oob` if an offset pair does not describe a
// valid range of the values buffer (end past the buffer, or end before start).
//
// NOTE: plain `//` comments are used on purpose: this lives inside `quote!`, where `///`
// would be turned into `#[doc]` attributes in the generated code.
fn extract_from_binary<O>(
    arrow_data: &arrow::array::GenericByteArray<arrow::datatypes::GenericBinaryType<O>>,
) -> DeserializationResult<std::vec::Vec<Option<arrow::buffer::Buffer>>>
where
    O: ::arrow::array::OffsetSizeTrait,
{
    use ::arrow::array::Array as _;
    use ::re_types_core::arrow_zip_validity::ZipValidity;

    let arrow_data_buf = arrow_data.values();
    let offsets = arrow_data.offsets();

    // Each consecutive pair of offsets delimits one element in the values buffer.
    ZipValidity::new_with_validity(offsets.windows(2), arrow_data.nulls())
        .map(|elem| {
            elem.map(|window| {
                // NOTE: Do _not_ use `Buffer::sliced`, it panics on malformed inputs.

                let start = window[0].as_usize();
                let end = window[1].as_usize();

                // NOTE: It is absolutely crucial we explicitly handle the
                // boundchecks manually first, otherwise rustc completely chokes
                // when slicing the data (as in: a 100x perf drop)!
                //
                // `end < start` is rejected here as well: otherwise `end - start` below
                // would underflow on non-monotonic offsets and we would panic instead of
                // reporting a deserialization error.
                if end < start || arrow_data_buf.len() < end {
                    // error context is appended below during final collection
                    return Err(DeserializationError::offset_slice_oob(
                        (start, end),
                        arrow_data_buf.len(),
                    ));
                }

                // In bounds per the check above, so this cannot panic.
                Ok(arrow_data_buf.slice_with_length(start, end - start))
            })
            .transpose()
        })
        .collect::<DeserializationResult<Vec<Option<_>>>>()
}

if let Some(arrow_data) = #data_src.as_any().downcast_ref::<BinaryArray>() {
extract_from_binary(arrow_data)
.with_context(#obj_field_fqname)?
.into_iter()
} else if let Some(arrow_data) = #data_src.as_any().downcast_ref::<LargeBinaryArray>()
{
extract_from_binary(arrow_data)
.with_context(#obj_field_fqname)?
.into_iter()
} else {
let expected = Self::arrow_datatype();
let actual = arrow_data.data_type().clone();
return Err(DeserializationError::datatype_mismatch(expected, actual))
.with_context(#obj_field_fqname);
}
}}
}

DataType::Utf8 => {
let quoted_downcast = {
let cast_as = quote!(StringArray);
Expand Down Expand Up @@ -824,7 +886,7 @@ fn quote_arrow_field_deserializer(
quote!(#fqname_use::from_arrow_opt(#data_src).with_context(#obj_field_fqname)?.into_iter())
}

_ => unimplemented!("{datatype:#?}"),
DataType::Object { .. } => unimplemented!("{datatype:#?}"),
}
}

Expand Down
46 changes: 30 additions & 16 deletions crates/build/re_types_builder/src/codegen/rust/serializer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -583,7 +583,14 @@ fn quote_arrow_field_serializer(
}
}

DataType::Utf8 => {
DataType::Binary | DataType::Utf8 => {
let is_binary = datatype.to_logical_type() == &DataType::Binary;
let as_bytes = if is_binary {
quote!()
} else {
quote!(.as_bytes())
};

// NOTE: We need values for all slots, regardless of what the validity says,
// hence `unwrap_or_default`.
let (quoted_member_accessor, quoted_transparent_length) = if inner_is_arrow_transparent
Expand Down Expand Up @@ -623,7 +630,7 @@ fn quote_arrow_field_serializer(

let inner_data_and_offsets = if elements_are_nullable {
quote! {
let offsets = arrow::buffer::OffsetBuffer::<i32>::from_lengths(
let offsets = arrow::buffer::OffsetBuffer::from_lengths(
#data_src.iter().map(|opt| opt.as_ref() #quoted_transparent_length .unwrap_or_default())
);

Expand All @@ -636,13 +643,13 @@ fn quote_arrow_field_serializer(
// NOTE: Flattening to remove the guaranteed layer of nullability: we don't care
// about it while building the backing buffer since it's all offsets driven.
for data in #data_src.iter().flatten() {
buffer_builder.append_slice(data #quoted_member_accessor.as_bytes());
buffer_builder.append_slice(data #quoted_member_accessor #as_bytes);
}
let inner_data: arrow::buffer::Buffer = buffer_builder.finish();
}
} else {
quote! {
let offsets = arrow::buffer::OffsetBuffer::<i32>::from_lengths(
let offsets = arrow::buffer::OffsetBuffer::from_lengths(
#data_src.iter() #quoted_transparent_length
);

Expand All @@ -653,22 +660,29 @@ fn quote_arrow_field_serializer(

let mut buffer_builder = arrow::array::builder::BufferBuilder::<u8>::new(capacity);
for data in &#data_src {
buffer_builder.append_slice(data #quoted_member_accessor.as_bytes());
buffer_builder.append_slice(data #quoted_member_accessor #as_bytes);
}
let inner_data: arrow::buffer::Buffer = buffer_builder.finish();
}
};

quote! {{
#inner_data_and_offsets

// Safety: we're building this from actual native strings, so no need to do the
// whole utf8 validation _again_.
// It would be nice to use quote_comment here and put this safety notice in the generated code,
// but that seems to push us over some complexity limit causing rustfmt to fail.
#[allow(unsafe_code, clippy::undocumented_unsafe_blocks)]
as_array_ref(unsafe { StringArray::new_unchecked(offsets, inner_data, #validity_src) })
}}
if is_binary {
quote! {{
#inner_data_and_offsets
as_array_ref(LargeBinaryArray::new(offsets, inner_data, #validity_src))
}}
} else {
quote! {{
#inner_data_and_offsets

// Safety: we're building this from actual native strings, so no need to do the
// whole utf8 validation _again_.
// It would be nice to use quote_comment here and put this safety notice in the generated code,
// but that seems to push us over some complexity limit causing rustfmt to fail.
#[allow(unsafe_code, clippy::undocumented_unsafe_blocks)]
as_array_ref(unsafe { StringArray::new_unchecked(offsets, inner_data, #validity_src) })
}}
}
}

DataType::List(inner_field) | DataType::FixedSizeList(inner_field, _) => {
Expand Down Expand Up @@ -919,6 +933,6 @@ fn quote_arrow_field_serializer(
}}
}

_ => unimplemented!("{datatype:#?}"),
DataType::Object { .. } => unimplemented!("{datatype:#?}"),
}
}
5 changes: 5 additions & 0 deletions crates/build/re_types_builder/src/data_type.rs
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,7 @@ impl std::fmt::Display for AtomicDataType {
pub enum DataType {
Atomic(AtomicDataType),

// Offsets may be 32-bit or 64-bit
Binary,

Utf8,
Expand Down Expand Up @@ -153,8 +154,12 @@ impl DataType {
pub enum LazyDatatype {
Atomic(AtomicDataType),

/// A list of bytes of arbitrary length.
///
/// Offsets may be 32-bit or 64-bit.
Binary,

/// Utf8
Utf8,

/// Elements are non-nullable
Expand Down
Loading
Loading