Skip to content

Commit a78ef1b

Browse files
andrea-realeemilk
andauthored
Revert BinaryArray migration (#11039) (#11065)
(cherry picking from #11039) ### Related * Reverts #10875 * Reverts #11005 * Fixes #11028 * Closes #11032 * Re-opens #10929 * Re-opens #10973 ### What Reverts the the change from `List<u8>` to `Binary` because of Rerun Cloud migration woes. Hopefully temporarily. Co-authored-by: Emil Ernerfeldt <[email protected]>
1 parent 0ba6de3 commit a78ef1b

File tree

75 files changed

+375
-837
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

75 files changed

+375
-837
lines changed

crates/build/re_types_builder/src/codegen/cpp/array_builder.rs

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -66,11 +66,6 @@ fn arrow_array_builder_type_and_declaration(
6666
);
6767
ident
6868
}
69-
Type::Binary => {
70-
let ident = format_ident!("LargeBinaryBuilder");
71-
declarations.insert("arrow", ForwardDecl::Class(ident.clone()));
72-
ident
73-
}
7469
Type::String => {
7570
let ident = format_ident!("StringBuilder");
7671
declarations.insert("arrow", ForwardDecl::Class(ident.clone()));

crates/build/re_types_builder/src/codegen/cpp/mod.rs

Lines changed: 1 addition & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -2003,7 +2003,6 @@ fn quote_fill_arrow_array_builder(
20032003
ElementType::Float16 => Some("HalfFloatBuilder"),
20042004
ElementType::Float32 => Some("FloatBuilder"),
20052005
ElementType::Float64 => Some("DoubleBuilder"),
2006-
ElementType::Binary => Some("BinaryBuilder"),
20072006
ElementType::String => Some("StringBuilder"),
20082007
ElementType::Object{..} => None,
20092008
};
@@ -2234,7 +2233,7 @@ fn quote_append_single_value_to_builder(
22342233
value_access: &TokenStream,
22352234
includes: &mut Includes,
22362235
) -> TokenStream {
2237-
match typ {
2236+
match &typ {
22382237
Type::Unit => {
22392238
quote!(ARROW_RETURN_NOT_OK(#value_builder->AppendNull());)
22402239
}
@@ -2253,11 +2252,6 @@ fn quote_append_single_value_to_builder(
22532252
| Type::String => {
22542253
quote!(ARROW_RETURN_NOT_OK(#value_builder->Append(#value_access));)
22552254
}
2256-
Type::Binary => {
2257-
quote!(
2258-
ARROW_RETURN_NOT_OK(#value_builder->Append(#value_access.data(), static_cast<int64_t>(#value_access.size())));
2259-
)
2260-
}
22612255
Type::Float16 => {
22622256
// Cast `rerun::half` to a `uint16_t``
22632257
quote! {
@@ -2296,14 +2290,6 @@ fn quote_append_single_value_to_builder(
22962290
);
22972291
}
22982292
}
2299-
ElementType::Binary => {
2300-
quote! {
2301-
for (size_t item_idx = 0; item_idx < #num_items_per_element; item_idx += 1) {
2302-
auto&& data = &#value_access[elem_idx].data;
2303-
ARROW_RETURN_NOT_OK(#value_builder->Append(data.data(), static_cast<int32_t>(data.size())));
2304-
}
2305-
}
2306-
}
23072293
ElementType::String => {
23082294
quote! {
23092295
for (size_t item_idx = 0; item_idx < #num_items_per_element; item_idx += 1) {
@@ -2461,10 +2447,6 @@ fn quote_field_type(includes: &mut Includes, obj_field: &ObjectField) -> TokenSt
24612447
}
24622448
Type::Float32 => quote! { float },
24632449
Type::Float64 => quote! { double },
2464-
Type::Binary => {
2465-
includes.insert_rerun("collection.hpp");
2466-
quote! { rerun::Collection<uint8_t> }
2467-
}
24682450
Type::String => {
24692451
includes.insert_system("string");
24702452
quote! { std::string }
@@ -2525,10 +2507,6 @@ fn quote_element_type(includes: &mut Includes, typ: &ElementType) -> TokenStream
25252507
}
25262508
ElementType::Float32 => quote! { float },
25272509
ElementType::Float64 => quote! { double },
2528-
ElementType::Binary => {
2529-
includes.insert_rerun("collection.hpp");
2530-
quote! { rerun::Collection<uint8_t> }
2531-
}
25322510
ElementType::String => {
25332511
includes.insert_system("string");
25342512
quote! { std::string }
@@ -2670,7 +2648,6 @@ fn quote_arrow_datatype(
26702648
Type::Float16 => quote!(arrow::float16()),
26712649
Type::Float32 => quote!(arrow::float32()),
26722650
Type::Float64 => quote!(arrow::float64()),
2673-
Type::Binary => quote!(arrow::large_binary()),
26742651
Type::String => quote!(arrow::utf8()),
26752652
Type::Bool => quote!(arrow::boolean()),
26762653

crates/build/re_types_builder/src/codegen/docs/website.rs

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -414,7 +414,6 @@ fn write_fields(reporter: &Reporter, objects: &Objects, o: &mut String, object:
414414
Type::Float16 => atomic("float16"),
415415
Type::Float32 => atomic("float32"),
416416
Type::Float64 => atomic("float64"),
417-
Type::Binary => atomic("binary"),
418417
Type::String => atomic("utf8"),
419418

420419
Type::Array { elem_type, length } => {

crates/build/re_types_builder/src/codegen/python/mod.rs

Lines changed: 2 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -1673,7 +1673,6 @@ fn quote_field_type_from_field(
16731673
| Type::Int64 => "int".to_owned(),
16741674
Type::Bool => "bool".to_owned(),
16751675
Type::Float16 | Type::Float32 | Type::Float64 => "float".to_owned(),
1676-
Type::Binary => "bytes".to_owned(),
16771676
Type::String => "str".to_owned(),
16781677
Type::Array {
16791678
elem_type,
@@ -1692,7 +1691,6 @@ fn quote_field_type_from_field(
16921691
ElementType::Float16 => "npt.NDArray[np.float16]".to_owned(),
16931692
ElementType::Float32 => "npt.NDArray[np.float32]".to_owned(),
16941693
ElementType::Float64 => "npt.NDArray[np.float64]".to_owned(),
1695-
ElementType::Binary => "list[bytes]".to_owned(),
16961694
ElementType::String => "list[str]".to_owned(),
16971695
ElementType::Object { .. } => {
16981696
let typ = quote_type_from_element_type(elem_type);
@@ -1754,13 +1752,6 @@ fn quote_field_converter_from_field(
17541752
"float".to_owned()
17551753
}
17561754
}
1757-
Type::Binary => {
1758-
if field.is_nullable {
1759-
"bytes_or_none".to_owned()
1760-
} else {
1761-
"bytes".to_owned()
1762-
}
1763-
}
17641755
Type::String => {
17651756
if field.is_nullable {
17661757
"str_or_none".to_owned()
@@ -1877,7 +1868,6 @@ fn quote_type_from_type(typ: &Type) -> String {
18771868
| Type::Int64 => "int".to_owned(),
18781869
Type::Bool => "bool".to_owned(),
18791870
Type::Float16 | Type::Float32 | Type::Float64 => "float".to_owned(),
1880-
Type::Binary => "bytes".to_owned(),
18811871
Type::String => "str".to_owned(),
18821872
Type::Object { fqname } => fqname_to_type(fqname),
18831873
Type::Array { elem_type, .. } | Type::Vector { elem_type } => {
@@ -2036,7 +2026,6 @@ fn np_dtype_from_type(t: &Type) -> Option<&'static str> {
20362026
Type::Float32 => Some("np.float32"),
20372027
Type::Float64 => Some("np.float64"),
20382028
Type::Unit
2039-
| Type::Binary
20402029
| Type::String
20412030
| Type::Array { .. }
20422031
| Type::Vector { .. }
@@ -2133,11 +2122,7 @@ fn quote_arrow_serialization(
21332122
code.push_indented(2, &field_fwd, 1);
21342123
}
21352124

2136-
Type::Unit
2137-
| Type::Binary
2138-
| Type::String
2139-
| Type::Array { .. }
2140-
| Type::Vector { .. } => {
2125+
Type::Unit | Type::String | Type::Array { .. } | Type::Vector { .. } => {
21412126
return Err(
21422127
"We lack codegen for arrow-serialization of general structs".to_owned()
21432128
);
@@ -2264,7 +2249,6 @@ return pa.array(pa_data, type=data_type)
22642249
| Type::Float16
22652250
| Type::Float32
22662251
| Type::Float64
2267-
| Type::Binary
22682252
| Type::String => {
22692253
let datatype = quote_arrow_datatype(&type_registry.get(&field.fqname));
22702254
format!("pa.array({variant_kind_list}, type={datatype})")
@@ -2822,7 +2806,7 @@ fn quote_arrow_datatype(datatype: &DataType) -> String {
28222806
DataType::Atomic(AtomicDataType::Float32) => "pa.float32()".to_owned(),
28232807
DataType::Atomic(AtomicDataType::Float64) => "pa.float64()".to_owned(),
28242808

2825-
DataType::Binary => "pa.large_binary()".to_owned(),
2809+
DataType::Binary => "pa.binary()".to_owned(),
28262810

28272811
DataType::Utf8 => "pa.utf8()".to_owned(),
28282812

crates/build/re_types_builder/src/codegen/rust/api.rs

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -783,7 +783,6 @@ impl quote::ToTokens for TypeTokenizer<'_> {
783783
Type::Float16 => quote!(half::f16),
784784
Type::Float32 => quote!(f32),
785785
Type::Float64 => quote!(f64),
786-
Type::Binary => quote!(::arrow::buffer::Buffer),
787786
Type::String => quote!(::re_types_core::ArrowString),
788787
Type::Array { elem_type, length } => {
789788
if *unwrap {
@@ -822,7 +821,6 @@ impl quote::ToTokens for &ElementType {
822821
ElementType::Float16 => quote!(half::f16),
823822
ElementType::Float32 => quote!(f32),
824823
ElementType::Float64 => quote!(f64),
825-
ElementType::Binary => quote!(::arrow::buffer::Buffer),
826824
ElementType::String => quote!(::re_types_core::ArrowString),
827825
ElementType::Object { fqname } => quote_fqname_as_type_path(fqname),
828826
}

crates/build/re_types_builder/src/codegen/rust/arrow.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@ impl quote::ToTokens for ArrowDataTypeTokenizer<'_> {
3535
DataType::Atomic(AtomicDataType::Float32) => quote!(DataType::Float32),
3636
DataType::Atomic(AtomicDataType::Float64) => quote!(DataType::Float64),
3737

38-
DataType::Binary => quote!(DataType::LargeBinary),
38+
DataType::Binary => quote!(DataType::Binary),
3939

4040
DataType::Utf8 => quote!(DataType::Utf8),
4141

crates/build/re_types_builder/src/codegen/rust/deserializer.rs

Lines changed: 6 additions & 71 deletions
Original file line numberDiff line numberDiff line change
@@ -537,75 +537,11 @@ fn quote_arrow_field_deserializer(
537537
}
538538
}
539539

540-
DataType::Binary => {
541-
// Special code to handle deserializing both 32-bit and 64-bit opffsets (BinaryArray vs LargeBinaryArray)
542-
quote! {{
543-
fn extract_from_binary<O>(
544-
arrow_data: &arrow::array::GenericByteArray<arrow::datatypes::GenericBinaryType<O>>,
545-
) -> DeserializationResult<std::vec::Vec<Option<arrow::buffer::Buffer>>>
546-
where
547-
O: ::arrow::array::OffsetSizeTrait,
548-
{
549-
use ::arrow::array::Array as _;
550-
use ::re_types_core::arrow_zip_validity::ZipValidity;
551-
552-
let arrow_data_buf = arrow_data.values();
553-
let offsets = arrow_data.offsets();
554-
555-
ZipValidity::new_with_validity(offsets.windows(2), arrow_data.nulls())
556-
.map(|elem| {
557-
elem.map(|window| {
558-
// NOTE: Do _not_ use `Buffer::sliced`, it panics on malformed inputs.
559-
560-
let start = window[0].as_usize();
561-
let end = window[1].as_usize();
562-
let len = end - start;
563-
564-
// NOTE: It is absolutely crucial we explicitly handle the
565-
// boundchecks manually first, otherwise rustc completely chokes
566-
// when slicing the data (as in: a 100x perf drop)!
567-
if arrow_data_buf.len() < end {
568-
// error context is appended below during final collection
569-
return Err(DeserializationError::offset_slice_oob(
570-
(start, end),
571-
arrow_data_buf.len(),
572-
));
573-
}
574-
575-
#[allow(unsafe_code, clippy::undocumented_unsafe_blocks)]
576-
let data = arrow_data_buf.slice_with_length(start, len);
577-
Ok(data)
578-
})
579-
.transpose()
580-
})
581-
.collect::<DeserializationResult<Vec<Option<_>>>>()
582-
}
583-
584-
if let Some(arrow_data) = #data_src.as_any().downcast_ref::<BinaryArray>() {
585-
extract_from_binary(arrow_data)
586-
.with_context(#obj_field_fqname)?
587-
.into_iter()
588-
} else if let Some(arrow_data) = #data_src.as_any().downcast_ref::<LargeBinaryArray>()
589-
{
590-
extract_from_binary(arrow_data)
591-
.with_context(#obj_field_fqname)?
592-
.into_iter()
593-
} else {
594-
let expected = Self::arrow_datatype();
595-
let actual = arrow_data.data_type().clone();
596-
return Err(DeserializationError::datatype_mismatch(expected, actual))
597-
.with_context(#obj_field_fqname);
598-
}
599-
}}
600-
}
601-
602540
DataType::Utf8 => {
603-
let quoted_downcast = quote_array_downcast(
604-
obj_field_fqname,
605-
data_src,
606-
quote!(StringArray),
607-
quoted_datatype,
608-
);
541+
let quoted_downcast = {
542+
let cast_as = quote!(StringArray);
543+
quote_array_downcast(obj_field_fqname, data_src, cast_as, quoted_datatype)
544+
};
609545

610546
let quoted_iter_transparency = quote_iterator_transparency(
611547
objects,
@@ -641,8 +577,7 @@ fn quote_arrow_field_deserializer(
641577
(start, end), #data_src_buf.len(),
642578
));
643579
}
644-
645-
#[allow(unsafe_code, clippy::undocumented_unsafe_blocks)]
580+
#[allow(unsafe_code, clippy::undocumented_unsafe_blocks)] // TODO(apache/arrow-rs#6900): slice_with_length_unchecked unsafe when https://github.com/apache/arrow-rs/pull/6901 is merged and released
646581
let data = #data_src_buf.slice_with_length(start, len);
647582

648583
Ok(data)
@@ -889,7 +824,7 @@ fn quote_arrow_field_deserializer(
889824
quote!(#fqname_use::from_arrow_opt(#data_src).with_context(#obj_field_fqname)?.into_iter())
890825
}
891826

892-
DataType::Object { .. } => unimplemented!("{datatype:#?}"),
827+
_ => unimplemented!("{datatype:#?}"),
893828
}
894829
}
895830

crates/build/re_types_builder/src/codegen/rust/serializer.rs

Lines changed: 16 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -583,14 +583,7 @@ fn quote_arrow_field_serializer(
583583
}
584584
}
585585

586-
DataType::Binary | DataType::Utf8 => {
587-
let is_binary = datatype.to_logical_type() == &DataType::Binary;
588-
let as_bytes = if is_binary {
589-
quote!()
590-
} else {
591-
quote!(.as_bytes())
592-
};
593-
586+
DataType::Utf8 => {
594587
// NOTE: We need values for all slots, regardless of what the validity says,
595588
// hence `unwrap_or_default`.
596589
let (quoted_member_accessor, quoted_transparent_length) = if inner_is_arrow_transparent
@@ -630,7 +623,7 @@ fn quote_arrow_field_serializer(
630623

631624
let inner_data_and_offsets = if elements_are_nullable {
632625
quote! {
633-
let offsets = arrow::buffer::OffsetBuffer::from_lengths(
626+
let offsets = arrow::buffer::OffsetBuffer::<i32>::from_lengths(
634627
#data_src.iter().map(|opt| opt.as_ref() #quoted_transparent_length .unwrap_or_default())
635628
);
636629

@@ -643,13 +636,13 @@ fn quote_arrow_field_serializer(
643636
// NOTE: Flattening to remove the guaranteed layer of nullability: we don't care
644637
// about it while building the backing buffer since it's all offsets driven.
645638
for data in #data_src.iter().flatten() {
646-
buffer_builder.append_slice(data #quoted_member_accessor #as_bytes);
639+
buffer_builder.append_slice(data #quoted_member_accessor.as_bytes());
647640
}
648641
let inner_data: arrow::buffer::Buffer = buffer_builder.finish();
649642
}
650643
} else {
651644
quote! {
652-
let offsets = arrow::buffer::OffsetBuffer::from_lengths(
645+
let offsets = arrow::buffer::OffsetBuffer::<i32>::from_lengths(
653646
#data_src.iter() #quoted_transparent_length
654647
);
655648

@@ -660,29 +653,22 @@ fn quote_arrow_field_serializer(
660653

661654
let mut buffer_builder = arrow::array::builder::BufferBuilder::<u8>::new(capacity);
662655
for data in &#data_src {
663-
buffer_builder.append_slice(data #quoted_member_accessor #as_bytes);
656+
buffer_builder.append_slice(data #quoted_member_accessor.as_bytes());
664657
}
665658
let inner_data: arrow::buffer::Buffer = buffer_builder.finish();
666659
}
667660
};
668661

669-
if is_binary {
670-
quote! {{
671-
#inner_data_and_offsets
672-
as_array_ref(LargeBinaryArray::new(offsets, inner_data, #validity_src))
673-
}}
674-
} else {
675-
quote! {{
676-
#inner_data_and_offsets
677-
678-
// Safety: we're building this from actual native strings, so no need to do the
679-
// whole utf8 validation _again_.
680-
// It would be nice to use quote_comment here and put this safety notice in the generated code,
681-
// but that seems to push us over some complexity limit causing rustfmt to fail.
682-
#[allow(unsafe_code, clippy::undocumented_unsafe_blocks)]
683-
as_array_ref(unsafe { StringArray::new_unchecked(offsets, inner_data, #validity_src) })
684-
}}
685-
}
662+
quote! {{
663+
#inner_data_and_offsets
664+
665+
// Safety: we're building this from actual native strings, so no need to do the
666+
// whole utf8 validation _again_.
667+
// It would be nice to use quote_comment here and put this safety notice in the generated code,
668+
// but that seems to push us over some complexity limit causing rustfmt to fail.
669+
#[allow(unsafe_code, clippy::undocumented_unsafe_blocks)]
670+
as_array_ref(unsafe { StringArray::new_unchecked(offsets, inner_data, #validity_src) })
671+
}}
686672
}
687673

688674
DataType::List(inner_field) | DataType::FixedSizeList(inner_field, _) => {
@@ -933,6 +919,6 @@ fn quote_arrow_field_serializer(
933919
}}
934920
}
935921

936-
DataType::Object { .. } => unimplemented!("{datatype:#?}"),
922+
_ => unimplemented!("{datatype:#?}"),
937923
}
938924
}

0 commit comments

Comments
 (0)