|
2 | 2 | // SPDX-FileCopyrightText: Copyright the Vortex contributors |
3 | 3 |
|
4 | 4 | use itertools::Itertools; |
| 5 | +use num_traits::NumCast; |
5 | 6 | use vortex_array::arrays::{ |
6 | | - BoolArray, BooleanBuffer, ConstantArray, NullArray, PrimitiveArray, StructArray, |
7 | | - smallest_storage_type, |
| 7 | + BinaryView, BoolArray, BooleanBuffer, ConstantArray, NullArray, PrimitiveArray, StructArray, |
| 8 | + VarBinViewArray, smallest_storage_type, |
8 | 9 | }; |
9 | 10 | use vortex_array::builders::{ArrayBuilder as _, DecimalBuilder}; |
10 | 11 | use vortex_array::patches::Patches; |
11 | 12 | use vortex_array::validity::Validity; |
12 | 13 | use vortex_array::vtable::CanonicalVTable; |
13 | | -use vortex_array::{Array, Canonical}; |
14 | | -use vortex_buffer::buffer; |
| 14 | +use vortex_array::{Array, Canonical, ToCanonical as _}; |
| 15 | +use vortex_buffer::{Buffer, buffer, buffer_mut}; |
15 | 16 | use vortex_dtype::{ |
16 | | - DType, DecimalDType, NativePType, Nullability, StructFields, match_each_native_ptype, |
| 17 | + DType, DecimalDType, NativePType, Nullability, StructFields, match_each_integer_ptype, |
| 18 | + match_each_native_ptype, |
17 | 19 | }; |
18 | 20 | use vortex_error::{VortexError, VortexExpect as _, VortexResult, vortex_err}; |
19 | 21 | use vortex_scalar::{ |
20 | | - DecimalScalar, NativeDecimalType, Scalar, StructScalar, match_each_decimal_value_type, |
| 22 | + DecimalScalar, NativeDecimalType, Scalar, StructScalar, Utf8Scalar, |
| 23 | + match_each_decimal_value_type, |
21 | 24 | }; |
22 | 25 |
|
23 | 26 | use crate::{SparseArray, SparseVTable}; |
@@ -63,7 +66,21 @@ impl CanonicalVTable<SparseVTable> for SparseVTable { |
63 | 66 | ) |
64 | 67 | }) |
65 | 68 | } |
66 | | - DType::Utf8(_nullability) => todo!(), |
| 69 | + DType::Utf8(nullability) => { |
| 70 | + let patches = array.resolved_patches()?; |
| 71 | + let indices = patches.indices().to_primitive()?; |
| 72 | + let values = patches.values().to_varbinview()?; |
| 73 | + let fill_value = array.fill_scalar().as_utf8(); |
| 74 | + let validity = array |
| 75 | + .validity_mask() |
| 76 | + .map(|x| Validity::from_mask(x, *nullability))?; |
| 77 | + let len = array.len(); |
| 78 | + |
| 79 | + match_each_integer_ptype!(indices.ptype(), |I| { |
| 80 | + let indices = indices.buffer::<I>(); |
| 81 | + canonicalize_utf8::<I>(fill_value, indices, values, validity, len) |
| 82 | + }) |
| 83 | + } |
67 | 84 | DType::Binary(_nullability) => todo!(), |
68 | 85 | DType::List(_dtype, _nullability) => todo!(), |
69 | 86 | DType::Extension(_ext_dtype) => todo!(), |
@@ -199,12 +216,51 @@ fn canonicalize_sparse_decimal<D: NativeDecimalType>( |
199 | 216 | Ok(Canonical::Decimal(array)) |
200 | 217 | } |
201 | 218 |
|
| 219 | +fn canonicalize_utf8<I: NativePType>( |
| 220 | + fill_value: Utf8Scalar, |
| 221 | + indices: Buffer<I>, |
| 222 | + values: VarBinViewArray, |
| 223 | + validity: Validity, |
| 224 | + len: usize, |
| 225 | +) -> VortexResult<Canonical> { |
| 226 | + let n_patch_buffers = values.buffers().len(); |
| 227 | + let mut buffers = values.buffers().to_vec(); |
| 228 | + |
| 229 | + let fill = if let Some(buffer) = &fill_value.value() { |
| 230 | + buffers.push(buffer.inner().clone()); |
| 231 | + BinaryView::make_view( |
| 232 | + buffer.as_ref(), |
| 233 | + u32::try_from(n_patch_buffers).vortex_expect("too many buffers"), |
| 234 | + 0, |
| 235 | + ) |
| 236 | + } else { |
| 237 | + // any <=12 character value will do |
| 238 | + BinaryView::make_view(&[], 0, 0) |
| 239 | + }; |
| 240 | + |
| 241 | + let mut views = buffer_mut![fill; len]; |
| 242 | + for (patch_index, &patch) in indices.into_iter().zip_eq(values.views().iter()) { |
| 243 | + let patch_index_usize = <usize as NumCast>::from(patch_index) |
| 244 | + .vortex_expect("var bin view indices must fit in usize"); |
| 245 | + views[patch_index_usize] = patch; |
| 246 | + } |
| 247 | + |
| 248 | + let array = VarBinViewArray::try_new( |
| 249 | + views.freeze(), |
| 250 | + buffers, |
| 251 | + DType::Utf8(validity.nullability()), |
| 252 | + validity, |
| 253 | + )?; |
| 254 | + |
| 255 | + Ok(Canonical::VarBinView(array)) |
| 256 | +} |
| 257 | + |
202 | 258 | #[cfg(test)] |
203 | 259 | mod test { |
204 | | - |
205 | 260 | use rstest::rstest; |
206 | 261 | use vortex_array::arrays::{ |
207 | | - BoolArray, BooleanBufferBuilder, DecimalArray, PrimitiveArray, StructArray, |
| 262 | + BoolArray, BooleanBufferBuilder, DecimalArray, PrimitiveArray, StructArray, VarBinArray, |
| 263 | + VarBinViewArray, |
208 | 264 | }; |
209 | 265 | use vortex_array::arrow::IntoArrowArray as _; |
210 | 266 | use vortex_array::validity::Validity; |
@@ -511,4 +567,174 @@ mod test { |
511 | 567 | assert_eq!(expected.data_type(), actual.data_type()); |
512 | 568 | assert_eq!(&expected, &actual); |
513 | 569 | } |
| 570 | + |
/// A nullable sparse UTF-8 array with a non-null fill ("123") canonicalizes to a view array in
/// which unpatched positions hold the fill and patched positions keep their value or null.
#[test]
fn test_sparse_varbinview_non_null_fill() {
    let patch_values = [
        Some("hello"),
        Some("goodbye"),
        Some("hello"),
        None,
        Some("bonjour"),
        Some("你好"),
        None,
    ]
    .into_iter()
    .collect::<VarBinViewArray>()
    .into_array();
    let patch_indices = buffer![0u16, 3, 4, 5, 7, 9, 10].into_array();
    let fill = Scalar::from(Some(String::from("123")));

    let sparse = SparseArray::try_new(patch_indices, patch_values, 12, fill).unwrap();

    let actual = sparse
        .to_varbinview()
        .unwrap()
        .into_array()
        .into_arrow_preferred()
        .unwrap();
    let expected = [
        Some("hello"),
        Some("123"),
        Some("123"),
        Some("goodbye"),
        Some("hello"),
        None,
        Some("123"),
        Some("bonjour"),
        Some("123"),
        Some("你好"),
        None,
        Some("123"),
    ]
    .into_iter()
    .collect::<VarBinViewArray>()
    .into_array()
    .into_arrow_preferred()
    .unwrap();

    assert_eq!(actual.data_type(), expected.data_type());
    assert_eq!(&actual, &expected);
}
| 615 | + |
/// A sparse UTF-8 array whose fill scalar is null canonicalizes to a view array in which every
/// unpatched position is null.
#[test]
fn test_sparse_varbinview_null_fill() {
    let patch_values = [
        Some("hello"),
        Some("goodbye"),
        Some("hello"),
        None,
        Some("bonjour"),
        Some("你好"),
        None,
    ]
    .into_iter()
    .collect::<VarBinViewArray>()
    .into_array();
    let patch_indices = buffer![0u16, 3, 4, 5, 7, 9, 10].into_array();
    let fill = Scalar::null(DType::Utf8(Nullable));

    let sparse = SparseArray::try_new(patch_indices, patch_values, 12, fill).unwrap();

    let actual = sparse
        .to_varbinview()
        .unwrap()
        .into_array()
        .into_arrow_preferred()
        .unwrap();
    let expected = [
        Some("hello"),
        None,
        None,
        Some("goodbye"),
        Some("hello"),
        None,
        None,
        Some("bonjour"),
        None,
        Some("你好"),
        None,
        None,
    ]
    .into_iter()
    .collect::<VarBinViewArray>()
    .into_array()
    .into_arrow_preferred()
    .unwrap();

    assert_eq!(actual.data_type(), expected.data_type());
    assert_eq!(&actual, &expected);
}
| 660 | + |
/// A sparse UTF-8 array built from non-nullable patches and a non-null fill ("123") canonicalizes
/// to a view array with the fill at every unpatched position.
#[test]
fn test_sparse_varbinview_non_nullable() {
    let patch_values =
        VarBinViewArray::from_iter_str(["hello", "goodbye", "hello", "bonjour", "你好"])
            .into_array();
    let patch_indices = buffer![0u16, 3, 4, 5, 8].into_array();
    let fill = Scalar::from(String::from("123"));

    let sparse = SparseArray::try_new(patch_indices, patch_values, 9, fill).unwrap();

    let actual = sparse
        .to_varbinview()
        .unwrap()
        .into_array()
        .into_arrow_preferred()
        .unwrap();
    let expected = [
        Some("hello"),
        Some("123"),
        Some("123"),
        Some("goodbye"),
        Some("hello"),
        Some("bonjour"),
        Some("123"),
        Some("123"),
        Some("你好"),
    ]
    .into_iter()
    .collect::<VarBinViewArray>()
    .into_array()
    .into_arrow_preferred()
    .unwrap();

    assert_eq!(actual.data_type(), expected.data_type());
    assert_eq!(&actual, &expected);
}
| 695 | + |
/// Same scenario as the null-fill view test, but the patch values are stored as a `VarBinArray`
/// rather than a `VarBinViewArray`; the canonical result is compared against a view array.
#[test]
fn test_sparse_varbin_null_fill() {
    let patch_values = [
        Some("hello"),
        Some("goodbye"),
        Some("hello"),
        None,
        Some("bonjour"),
        Some("你好"),
        None,
    ]
    .into_iter()
    .collect::<VarBinArray>()
    .into_array();
    let patch_indices = buffer![0u16, 3, 4, 5, 7, 9, 10].into_array();
    let fill = Scalar::null(DType::Utf8(Nullable));

    let sparse = SparseArray::try_new(patch_indices, patch_values, 12, fill).unwrap();

    let actual = sparse
        .to_varbinview()
        .unwrap()
        .into_array()
        .into_arrow_preferred()
        .unwrap();
    let expected = [
        Some("hello"),
        None,
        None,
        Some("goodbye"),
        Some("hello"),
        None,
        None,
        Some("bonjour"),
        None,
        Some("你好"),
        None,
        None,
    ]
    .into_iter()
    .collect::<VarBinViewArray>()
    .into_array()
    .into_arrow_preferred()
    .unwrap();

    assert_eq!(actual.data_type(), expected.data_type());
    assert_eq!(&actual, &expected);
}
514 | 740 | } |
0 commit comments