Skip to content

Commit 88815d0

Browse files
authored
Re-apply some reverted things (#11067)
### Related * #11065 * #11028 ### What We do not yet support migrating schemas on cloud (#11028) so we had to revert changing the datatype of `Blob` from `List<u8>` to `LargeBinary`. However, in the revert we threw away a lot of good baby with the bath water. This PR selectively cherry-pick back a bunch of things that we do want in, without actually changing any data formats. This lets us: * Get in some good stuff * Reduce merge conflicts on other PRs to main * Reduce future merge conflicts when we want to change the datatype again NOTE: everything in here has already been reviewed before ### TODO * [x] `@rerun-bot full-check`
1 parent 9867d5d commit 88815d0

File tree

38 files changed

+368
-90
lines changed

38 files changed

+368
-90
lines changed

crates/build/re_types_builder/src/codegen/cpp/array_builder.rs

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,11 @@ fn arrow_array_builder_type_and_declaration(
6666
);
6767
ident
6868
}
69+
Type::Binary => {
70+
let ident = format_ident!("LargeBinaryBuilder");
71+
declarations.insert("arrow", ForwardDecl::Class(ident.clone()));
72+
ident
73+
}
6974
Type::String => {
7075
let ident = format_ident!("StringBuilder");
7176
declarations.insert("arrow", ForwardDecl::Class(ident.clone()));

crates/build/re_types_builder/src/codegen/cpp/mod.rs

Lines changed: 24 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2003,6 +2003,7 @@ fn quote_fill_arrow_array_builder(
20032003
ElementType::Float16 => Some("HalfFloatBuilder"),
20042004
ElementType::Float32 => Some("FloatBuilder"),
20052005
ElementType::Float64 => Some("DoubleBuilder"),
2006+
ElementType::Binary => Some("BinaryBuilder"),
20062007
ElementType::String => Some("StringBuilder"),
20072008
ElementType::Object{..} => None,
20082009
};
@@ -2233,7 +2234,7 @@ fn quote_append_single_value_to_builder(
22332234
value_access: &TokenStream,
22342235
includes: &mut Includes,
22352236
) -> TokenStream {
2236-
match &typ {
2237+
match typ {
22372238
Type::Unit => {
22382239
quote!(ARROW_RETURN_NOT_OK(#value_builder->AppendNull());)
22392240
}
@@ -2252,6 +2253,11 @@ fn quote_append_single_value_to_builder(
22522253
| Type::String => {
22532254
quote!(ARROW_RETURN_NOT_OK(#value_builder->Append(#value_access));)
22542255
}
2256+
Type::Binary => {
2257+
quote!(
2258+
ARROW_RETURN_NOT_OK(#value_builder->Append(#value_access.data(), static_cast<int64_t>(#value_access.size())));
2259+
)
2260+
}
22552261
Type::Float16 => {
22562262
// Cast `rerun::half` to a `uint16_t``
22572263
quote! {
@@ -2290,6 +2296,14 @@ fn quote_append_single_value_to_builder(
22902296
);
22912297
}
22922298
}
2299+
ElementType::Binary => {
2300+
quote! {
2301+
for (size_t item_idx = 0; item_idx < #num_items_per_element; item_idx += 1) {
2302+
auto&& data = &#value_access[elem_idx].data;
2303+
ARROW_RETURN_NOT_OK(#value_builder->Append(data.data(), static_cast<int32_t>(data.size())));
2304+
}
2305+
}
2306+
}
22932307
ElementType::String => {
22942308
quote! {
22952309
for (size_t item_idx = 0; item_idx < #num_items_per_element; item_idx += 1) {
@@ -2447,6 +2461,10 @@ fn quote_field_type(includes: &mut Includes, obj_field: &ObjectField) -> TokenSt
24472461
}
24482462
Type::Float32 => quote! { float },
24492463
Type::Float64 => quote! { double },
2464+
Type::Binary => {
2465+
includes.insert_rerun("collection.hpp");
2466+
quote! { rerun::Collection<uint8_t> }
2467+
}
24502468
Type::String => {
24512469
includes.insert_system("string");
24522470
quote! { std::string }
@@ -2507,6 +2525,10 @@ fn quote_element_type(includes: &mut Includes, typ: &ElementType) -> TokenStream
25072525
}
25082526
ElementType::Float32 => quote! { float },
25092527
ElementType::Float64 => quote! { double },
2528+
ElementType::Binary => {
2529+
includes.insert_rerun("collection.hpp");
2530+
quote! { rerun::Collection<uint8_t> }
2531+
}
25102532
ElementType::String => {
25112533
includes.insert_system("string");
25122534
quote! { std::string }
@@ -2648,6 +2670,7 @@ fn quote_arrow_datatype(
26482670
Type::Float16 => quote!(arrow::float16()),
26492671
Type::Float32 => quote!(arrow::float32()),
26502672
Type::Float64 => quote!(arrow::float64()),
2673+
Type::Binary => quote!(arrow::large_binary()),
26512674
Type::String => quote!(arrow::utf8()),
26522675
Type::Bool => quote!(arrow::boolean()),
26532676

crates/build/re_types_builder/src/codegen/docs/website.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -414,6 +414,7 @@ fn write_fields(reporter: &Reporter, objects: &Objects, o: &mut String, object:
414414
Type::Float16 => atomic("float16"),
415415
Type::Float32 => atomic("float32"),
416416
Type::Float64 => atomic("float64"),
417+
Type::Binary => atomic("binary"),
417418
Type::String => atomic("utf8"),
418419

419420
Type::Array { elem_type, length } => {

crates/build/re_types_builder/src/codegen/python/mod.rs

Lines changed: 18 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1673,6 +1673,7 @@ fn quote_field_type_from_field(
16731673
| Type::Int64 => "int".to_owned(),
16741674
Type::Bool => "bool".to_owned(),
16751675
Type::Float16 | Type::Float32 | Type::Float64 => "float".to_owned(),
1676+
Type::Binary => "bytes".to_owned(),
16761677
Type::String => "str".to_owned(),
16771678
Type::Array {
16781679
elem_type,
@@ -1691,6 +1692,7 @@ fn quote_field_type_from_field(
16911692
ElementType::Float16 => "npt.NDArray[np.float16]".to_owned(),
16921693
ElementType::Float32 => "npt.NDArray[np.float32]".to_owned(),
16931694
ElementType::Float64 => "npt.NDArray[np.float64]".to_owned(),
1695+
ElementType::Binary => "list[bytes]".to_owned(),
16941696
ElementType::String => "list[str]".to_owned(),
16951697
ElementType::Object { .. } => {
16961698
let typ = quote_type_from_element_type(elem_type);
@@ -1752,6 +1754,13 @@ fn quote_field_converter_from_field(
17521754
"float".to_owned()
17531755
}
17541756
}
1757+
Type::Binary => {
1758+
if field.is_nullable {
1759+
"bytes_or_none".to_owned()
1760+
} else {
1761+
"bytes".to_owned()
1762+
}
1763+
}
17551764
Type::String => {
17561765
if field.is_nullable {
17571766
"str_or_none".to_owned()
@@ -1868,6 +1877,7 @@ fn quote_type_from_type(typ: &Type) -> String {
18681877
| Type::Int64 => "int".to_owned(),
18691878
Type::Bool => "bool".to_owned(),
18701879
Type::Float16 | Type::Float32 | Type::Float64 => "float".to_owned(),
1880+
Type::Binary => "bytes".to_owned(),
18711881
Type::String => "str".to_owned(),
18721882
Type::Object { fqname } => fqname_to_type(fqname),
18731883
Type::Array { elem_type, .. } | Type::Vector { elem_type } => {
@@ -2026,6 +2036,7 @@ fn np_dtype_from_type(t: &Type) -> Option<&'static str> {
20262036
Type::Float32 => Some("np.float32"),
20272037
Type::Float64 => Some("np.float64"),
20282038
Type::Unit
2039+
| Type::Binary
20292040
| Type::String
20302041
| Type::Array { .. }
20312042
| Type::Vector { .. }
@@ -2122,7 +2133,11 @@ fn quote_arrow_serialization(
21222133
code.push_indented(2, &field_fwd, 1);
21232134
}
21242135

2125-
Type::Unit | Type::String | Type::Array { .. } | Type::Vector { .. } => {
2136+
Type::Unit
2137+
| Type::Binary
2138+
| Type::String
2139+
| Type::Array { .. }
2140+
| Type::Vector { .. } => {
21262141
return Err(
21272142
"We lack codegen for arrow-serialization of general structs".to_owned()
21282143
);
@@ -2249,6 +2264,7 @@ return pa.array(pa_data, type=data_type)
22492264
| Type::Float16
22502265
| Type::Float32
22512266
| Type::Float64
2267+
| Type::Binary
22522268
| Type::String => {
22532269
let datatype = quote_arrow_datatype(&type_registry.get(&field.fqname));
22542270
format!("pa.array({variant_kind_list}, type={datatype})")
@@ -2806,7 +2822,7 @@ fn quote_arrow_datatype(datatype: &DataType) -> String {
28062822
DataType::Atomic(AtomicDataType::Float32) => "pa.float32()".to_owned(),
28072823
DataType::Atomic(AtomicDataType::Float64) => "pa.float64()".to_owned(),
28082824

2809-
DataType::Binary => "pa.binary()".to_owned(),
2825+
DataType::Binary => "pa.large_binary()".to_owned(),
28102826

28112827
DataType::Utf8 => "pa.utf8()".to_owned(),
28122828

crates/build/re_types_builder/src/codegen/rust/api.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -783,6 +783,7 @@ impl quote::ToTokens for TypeTokenizer<'_> {
783783
Type::Float16 => quote!(half::f16),
784784
Type::Float32 => quote!(f32),
785785
Type::Float64 => quote!(f64),
786+
Type::Binary => quote!(::arrow::buffer::Buffer),
786787
Type::String => quote!(::re_types_core::ArrowString),
787788
Type::Array { elem_type, length } => {
788789
if *unwrap {
@@ -821,6 +822,7 @@ impl quote::ToTokens for &ElementType {
821822
ElementType::Float16 => quote!(half::f16),
822823
ElementType::Float32 => quote!(f32),
823824
ElementType::Float64 => quote!(f64),
825+
ElementType::Binary => quote!(::arrow::buffer::Buffer),
824826
ElementType::String => quote!(::re_types_core::ArrowString),
825827
ElementType::Object { fqname } => quote_fqname_as_type_path(fqname),
826828
}

crates/build/re_types_builder/src/codegen/rust/arrow.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@ impl quote::ToTokens for ArrowDataTypeTokenizer<'_> {
3535
DataType::Atomic(AtomicDataType::Float32) => quote!(DataType::Float32),
3636
DataType::Atomic(AtomicDataType::Float64) => quote!(DataType::Float64),
3737

38-
DataType::Binary => quote!(DataType::Binary),
38+
DataType::Binary => quote!(DataType::LargeBinary),
3939

4040
DataType::Utf8 => quote!(DataType::Utf8),
4141

crates/build/re_types_builder/src/codegen/rust/deserializer.rs

Lines changed: 63 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -537,6 +537,68 @@ fn quote_arrow_field_deserializer(
537537
}
538538
}
539539

540+
DataType::Binary => {
541+
// Special code to handle deserializing both 32-bit and 64-bit opffsets (BinaryArray vs LargeBinaryArray)
542+
quote! {{
543+
fn extract_from_binary<O>(
544+
arrow_data: &arrow::array::GenericByteArray<arrow::datatypes::GenericBinaryType<O>>,
545+
) -> DeserializationResult<std::vec::Vec<Option<arrow::buffer::Buffer>>>
546+
where
547+
O: ::arrow::array::OffsetSizeTrait,
548+
{
549+
use ::arrow::array::Array as _;
550+
use ::re_types_core::arrow_zip_validity::ZipValidity;
551+
552+
let arrow_data_buf = arrow_data.values();
553+
let offsets = arrow_data.offsets();
554+
555+
ZipValidity::new_with_validity(offsets.windows(2), arrow_data.nulls())
556+
.map(|elem| {
557+
elem.map(|window| {
558+
// NOTE: Do _not_ use `Buffer::sliced`, it panics on malformed inputs.
559+
560+
let start = window[0].as_usize();
561+
let end = window[1].as_usize();
562+
let len = end - start;
563+
564+
// NOTE: It is absolutely crucial we explicitly handle the
565+
// boundchecks manually first, otherwise rustc completely chokes
566+
// when slicing the data (as in: a 100x perf drop)!
567+
if arrow_data_buf.len() < end {
568+
// error context is appended below during final collection
569+
return Err(DeserializationError::offset_slice_oob(
570+
(start, end),
571+
arrow_data_buf.len(),
572+
));
573+
}
574+
575+
#[allow(unsafe_code, clippy::undocumented_unsafe_blocks)]
576+
let data = arrow_data_buf.slice_with_length(start, len);
577+
Ok(data)
578+
})
579+
.transpose()
580+
})
581+
.collect::<DeserializationResult<Vec<Option<_>>>>()
582+
}
583+
584+
if let Some(arrow_data) = #data_src.as_any().downcast_ref::<BinaryArray>() {
585+
extract_from_binary(arrow_data)
586+
.with_context(#obj_field_fqname)?
587+
.into_iter()
588+
} else if let Some(arrow_data) = #data_src.as_any().downcast_ref::<LargeBinaryArray>()
589+
{
590+
extract_from_binary(arrow_data)
591+
.with_context(#obj_field_fqname)?
592+
.into_iter()
593+
} else {
594+
let expected = Self::arrow_datatype();
595+
let actual = arrow_data.data_type().clone();
596+
return Err(DeserializationError::datatype_mismatch(expected, actual))
597+
.with_context(#obj_field_fqname);
598+
}
599+
}}
600+
}
601+
540602
DataType::Utf8 => {
541603
let quoted_downcast = {
542604
let cast_as = quote!(StringArray);
@@ -824,7 +886,7 @@ fn quote_arrow_field_deserializer(
824886
quote!(#fqname_use::from_arrow_opt(#data_src).with_context(#obj_field_fqname)?.into_iter())
825887
}
826888

827-
_ => unimplemented!("{datatype:#?}"),
889+
DataType::Object { .. } => unimplemented!("{datatype:#?}"),
828890
}
829891
}
830892

crates/build/re_types_builder/src/codegen/rust/serializer.rs

Lines changed: 30 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -583,7 +583,14 @@ fn quote_arrow_field_serializer(
583583
}
584584
}
585585

586-
DataType::Utf8 => {
586+
DataType::Binary | DataType::Utf8 => {
587+
let is_binary = datatype.to_logical_type() == &DataType::Binary;
588+
let as_bytes = if is_binary {
589+
quote!()
590+
} else {
591+
quote!(.as_bytes())
592+
};
593+
587594
// NOTE: We need values for all slots, regardless of what the validity says,
588595
// hence `unwrap_or_default`.
589596
let (quoted_member_accessor, quoted_transparent_length) = if inner_is_arrow_transparent
@@ -623,7 +630,7 @@ fn quote_arrow_field_serializer(
623630

624631
let inner_data_and_offsets = if elements_are_nullable {
625632
quote! {
626-
let offsets = arrow::buffer::OffsetBuffer::<i32>::from_lengths(
633+
let offsets = arrow::buffer::OffsetBuffer::from_lengths(
627634
#data_src.iter().map(|opt| opt.as_ref() #quoted_transparent_length .unwrap_or_default())
628635
);
629636

@@ -636,13 +643,13 @@ fn quote_arrow_field_serializer(
636643
// NOTE: Flattening to remove the guaranteed layer of nullability: we don't care
637644
// about it while building the backing buffer since it's all offsets driven.
638645
for data in #data_src.iter().flatten() {
639-
buffer_builder.append_slice(data #quoted_member_accessor.as_bytes());
646+
buffer_builder.append_slice(data #quoted_member_accessor #as_bytes);
640647
}
641648
let inner_data: arrow::buffer::Buffer = buffer_builder.finish();
642649
}
643650
} else {
644651
quote! {
645-
let offsets = arrow::buffer::OffsetBuffer::<i32>::from_lengths(
652+
let offsets = arrow::buffer::OffsetBuffer::from_lengths(
646653
#data_src.iter() #quoted_transparent_length
647654
);
648655

@@ -653,22 +660,29 @@ fn quote_arrow_field_serializer(
653660

654661
let mut buffer_builder = arrow::array::builder::BufferBuilder::<u8>::new(capacity);
655662
for data in &#data_src {
656-
buffer_builder.append_slice(data #quoted_member_accessor.as_bytes());
663+
buffer_builder.append_slice(data #quoted_member_accessor #as_bytes);
657664
}
658665
let inner_data: arrow::buffer::Buffer = buffer_builder.finish();
659666
}
660667
};
661668

662-
quote! {{
663-
#inner_data_and_offsets
664-
665-
// Safety: we're building this from actual native strings, so no need to do the
666-
// whole utf8 validation _again_.
667-
// It would be nice to use quote_comment here and put this safety notice in the generated code,
668-
// but that seems to push us over some complexity limit causing rustfmt to fail.
669-
#[allow(unsafe_code, clippy::undocumented_unsafe_blocks)]
670-
as_array_ref(unsafe { StringArray::new_unchecked(offsets, inner_data, #validity_src) })
671-
}}
669+
if is_binary {
670+
quote! {{
671+
#inner_data_and_offsets
672+
as_array_ref(LargeBinaryArray::new(offsets, inner_data, #validity_src))
673+
}}
674+
} else {
675+
quote! {{
676+
#inner_data_and_offsets
677+
678+
// Safety: we're building this from actual native strings, so no need to do the
679+
// whole utf8 validation _again_.
680+
// It would be nice to use quote_comment here and put this safety notice in the generated code,
681+
// but that seems to push us over some complexity limit causing rustfmt to fail.
682+
#[allow(unsafe_code, clippy::undocumented_unsafe_blocks)]
683+
as_array_ref(unsafe { StringArray::new_unchecked(offsets, inner_data, #validity_src) })
684+
}}
685+
}
672686
}
673687

674688
DataType::List(inner_field) | DataType::FixedSizeList(inner_field, _) => {
@@ -919,6 +933,6 @@ fn quote_arrow_field_serializer(
919933
}}
920934
}
921935

922-
_ => unimplemented!("{datatype:#?}"),
936+
DataType::Object { .. } => unimplemented!("{datatype:#?}"),
923937
}
924938
}

crates/build/re_types_builder/src/data_type.rs

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -112,6 +112,7 @@ impl std::fmt::Display for AtomicDataType {
112112
pub enum DataType {
113113
Atomic(AtomicDataType),
114114

115+
// 32-bit or 64-bit
115116
Binary,
116117

117118
Utf8,
@@ -153,8 +154,12 @@ impl DataType {
153154
pub enum LazyDatatype {
154155
Atomic(AtomicDataType),
155156

157+
/// A list of bytes of arbitrary length.
158+
///
159+
/// 32-bit or 64-bit
156160
Binary,
157161

162+
/// Utf8
158163
Utf8,
159164

160165
/// Elements are non-nullable

0 commit comments

Comments
 (0)