Skip to content

Commit 95a4fab

Browse files
authored
Merge branch 'main' into typed-access-numeric-shredded
2 parents a1c9ab4 + 377f180 commit 95a4fab

File tree

17 files changed

+734
-52
lines changed

17 files changed

+734
-52
lines changed

.github/workflows/docs.yml

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -42,10 +42,6 @@ jobs:
4242
- uses: actions/checkout@v5
4343
with:
4444
submodules: true
45-
- name: Install python dev
46-
run: |
47-
apt update
48-
apt install -y libpython3.11-dev
4945
- name: Setup Rust toolchain
5046
uses: ./.github/actions/setup-builder
5147
- name: Install Nightly Rust

.github/workflows/miri.yaml

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -52,8 +52,12 @@ jobs:
5252
submodules: true
5353
- name: Setup Rust toolchain
5454
run: |
55-
rustup toolchain install nightly --component miri
56-
rustup override set nightly
55+
# Temporarily pin to nightly-2025-08-18 until https://github.com/rust-lang/rust/issues/145652 is resolved
56+
# See https://github.com/apache/arrow-rs/issues/8181 for more details
57+
rustup toolchain install nightly-2025-08-18 --component miri
58+
rustup override set nightly-2025-08-18
59+
# rustup toolchain install nightly --component miri
60+
# rustup override set nightly
5761
cargo miri setup
5862
- name: Run Miri Checks
5963
env:

arrow-row/src/lib.rs

Lines changed: 194 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -139,7 +139,7 @@ use arrow_array::cast::*;
139139
use arrow_array::types::ArrowDictionaryKeyType;
140140
use arrow_array::*;
141141
use arrow_buffer::{ArrowNativeType, Buffer, OffsetBuffer, ScalarBuffer};
142-
use arrow_data::ArrayDataBuilder;
142+
use arrow_data::{ArrayData, ArrayDataBuilder};
143143
use arrow_schema::*;
144144
use variable::{decode_binary_view, decode_string_view};
145145

@@ -1668,8 +1668,24 @@ unsafe fn decode_column(
16681668
rows.iter_mut().for_each(|row| *row = &row[1..]);
16691669
let children = converter.convert_raw(rows, validate_utf8)?;
16701670

1671-
let child_data = children.iter().map(|c| c.to_data()).collect();
1672-
let builder = ArrayDataBuilder::new(field.data_type.clone())
1671+
let child_data: Vec<ArrayData> = children.iter().map(|c| c.to_data()).collect();
1672+
// Since RowConverter flattens certain data types (i.e. Dictionary),
1673+
// we need to use the updated data type instead of the original field's data type
1674+
let corrected_fields: Vec<Field> = match &field.data_type {
1675+
DataType::Struct(struct_fields) => struct_fields
1676+
.iter()
1677+
.zip(child_data.iter())
1678+
.map(|(orig_field, child_array)| {
1679+
orig_field
1680+
.as_ref()
1681+
.clone()
1682+
.with_data_type(child_array.data_type().clone())
1683+
})
1684+
.collect(),
1685+
_ => unreachable!("Only Struct types should be corrected here"),
1686+
};
1687+
let corrected_struct_type = DataType::Struct(corrected_fields.into());
1688+
let builder = ArrayDataBuilder::new(corrected_struct_type)
16731689
.len(rows.len())
16741690
.null_count(null_count)
16751691
.null_bit_buffer(Some(nulls))
@@ -2208,6 +2224,177 @@ mod tests {
22082224
back[0].to_data().validate_full().unwrap();
22092225
}
22102226

2227+
#[test]
2228+
fn test_dictionary_in_struct() {
2229+
let builder = StringDictionaryBuilder::<Int32Type>::new();
2230+
let mut struct_builder = StructBuilder::new(
2231+
vec![Field::new_dictionary(
2232+
"foo",
2233+
DataType::Int32,
2234+
DataType::Utf8,
2235+
true,
2236+
)],
2237+
vec![Box::new(builder)],
2238+
);
2239+
2240+
let dict_builder = struct_builder
2241+
.field_builder::<StringDictionaryBuilder<Int32Type>>(0)
2242+
.unwrap();
2243+
2244+
// Flattened: ["a", null, "a", "b"]
2245+
dict_builder.append_value("a");
2246+
dict_builder.append_null();
2247+
dict_builder.append_value("a");
2248+
dict_builder.append_value("b");
2249+
2250+
for _ in 0..4 {
2251+
struct_builder.append(true);
2252+
}
2253+
2254+
let s = Arc::new(struct_builder.finish()) as ArrayRef;
2255+
let sort_fields = vec![SortField::new(s.data_type().clone())];
2256+
let converter = RowConverter::new(sort_fields).unwrap();
2257+
let r = converter.convert_columns(&[Arc::clone(&s)]).unwrap();
2258+
2259+
let back = converter.convert_rows(&r).unwrap();
2260+
let [s2] = back.try_into().unwrap();
2261+
2262+
// RowConverter flattens Dictionary
2263+
// s.ty = Struct(foo Dictionary(Int32, Utf8)), s2.ty = Struct(foo Utf8)
2264+
assert_ne!(&s.data_type(), &s2.data_type());
2265+
s2.to_data().validate_full().unwrap();
2266+
2267+
// Check if the logical data remains the same
2268+
// Keys: [0, null, 0, 1]
2269+
// Values: ["a", "b"]
2270+
let s1_struct = s.as_struct();
2271+
let s1_0 = s1_struct.column(0);
2272+
let s1_idx_0 = s1_0.as_dictionary::<Int32Type>();
2273+
let keys = s1_idx_0.keys();
2274+
let values = s1_idx_0.values().as_string::<i32>();
2275+
// Flattened: ["a", null, "a", "b"]
2276+
let s2_struct = s2.as_struct();
2277+
let s2_0 = s2_struct.column(0);
2278+
let s2_idx_0 = s2_0.as_string::<i32>();
2279+
2280+
for i in 0..keys.len() {
2281+
if keys.is_null(i) {
2282+
assert!(s2_idx_0.is_null(i));
2283+
} else {
2284+
let dict_index = keys.value(i) as usize;
2285+
assert_eq!(values.value(dict_index), s2_idx_0.value(i));
2286+
}
2287+
}
2288+
}
2289+
2290+
#[test]
2291+
fn test_dictionary_in_struct_empty() {
2292+
let ty = DataType::Struct(
2293+
vec![Field::new_dictionary(
2294+
"foo",
2295+
DataType::Int32,
2296+
DataType::Int32,
2297+
false,
2298+
)]
2299+
.into(),
2300+
);
2301+
let s = arrow_array::new_empty_array(&ty);
2302+
2303+
let sort_fields = vec![SortField::new(s.data_type().clone())];
2304+
let converter = RowConverter::new(sort_fields).unwrap();
2305+
let r = converter.convert_columns(&[Arc::clone(&s)]).unwrap();
2306+
2307+
let back = converter.convert_rows(&r).unwrap();
2308+
let [s2] = back.try_into().unwrap();
2309+
2310+
// RowConverter flattens Dictionary
2311+
// s.ty = Struct(foo Dictionary(Int32, Int32)), s2.ty = Struct(foo Int32)
2312+
assert_ne!(&s.data_type(), &s2.data_type());
2313+
s2.to_data().validate_full().unwrap();
2314+
assert_eq!(s.len(), 0);
2315+
assert_eq!(s2.len(), 0);
2316+
}
2317+
2318+
#[test]
2319+
fn test_list_of_string_dictionary() {
2320+
let mut builder = ListBuilder::<StringDictionaryBuilder<Int32Type>>::default();
2321+
// List[0] = ["a", "b", "zero", null, "c", "b", "d" (dict)]
2322+
builder.values().append("a").unwrap();
2323+
builder.values().append("b").unwrap();
2324+
builder.values().append("zero").unwrap();
2325+
builder.values().append_null();
2326+
builder.values().append("c").unwrap();
2327+
builder.values().append("b").unwrap();
2328+
builder.values().append("d").unwrap();
2329+
builder.append(true);
2330+
// List[1] = null
2331+
builder.append(false);
2332+
// List[2] = ["e", "zero", "a" (dict)]
2333+
builder.values().append("e").unwrap();
2334+
builder.values().append("zero").unwrap();
2335+
builder.values().append("a").unwrap();
2336+
builder.append(true);
2337+
2338+
let a = Arc::new(builder.finish()) as ArrayRef;
2339+
let data_type = a.data_type().clone();
2340+
2341+
let field = SortField::new(data_type.clone());
2342+
let converter = RowConverter::new(vec![field]).unwrap();
2343+
let rows = converter.convert_columns(&[Arc::clone(&a)]).unwrap();
2344+
2345+
let back = converter.convert_rows(&rows).unwrap();
2346+
assert_eq!(back.len(), 1);
2347+
let [a2] = back.try_into().unwrap();
2348+
2349+
// RowConverter flattens Dictionary
2350+
// a.ty: List(Dictionary(Int32, Utf8)), a2.ty: List(Utf8)
2351+
assert_ne!(&a.data_type(), &a2.data_type());
2352+
2353+
a2.to_data().validate_full().unwrap();
2354+
2355+
let a2_list = a2.as_list::<i32>();
2356+
let a1_list = a.as_list::<i32>();
2357+
2358+
// Check if the logical data remains the same
2359+
// List[0] = ["a", "b", "zero", null, "c", "b", "d" (dict)]
2360+
let a1_0 = a1_list.value(0);
2361+
let a1_idx_0 = a1_0.as_dictionary::<Int32Type>();
2362+
let keys = a1_idx_0.keys();
2363+
let values = a1_idx_0.values().as_string::<i32>();
2364+
let a2_0 = a2_list.value(0);
2365+
let a2_idx_0 = a2_0.as_string::<i32>();
2366+
2367+
for i in 0..keys.len() {
2368+
if keys.is_null(i) {
2369+
assert!(a2_idx_0.is_null(i));
2370+
} else {
2371+
let dict_index = keys.value(i) as usize;
2372+
assert_eq!(values.value(dict_index), a2_idx_0.value(i));
2373+
}
2374+
}
2375+
2376+
// List[1] = null
2377+
assert!(a1_list.is_null(1));
2378+
assert!(a2_list.is_null(1));
2379+
2380+
// List[2] = ["e", "zero", "a" (dict)]
2381+
let a1_2 = a1_list.value(2);
2382+
let a1_idx_2 = a1_2.as_dictionary::<Int32Type>();
2383+
let keys = a1_idx_2.keys();
2384+
let values = a1_idx_2.values().as_string::<i32>();
2385+
let a2_2 = a2_list.value(2);
2386+
let a2_idx_2 = a2_2.as_string::<i32>();
2387+
2388+
for i in 0..keys.len() {
2389+
if keys.is_null(i) {
2390+
assert!(a2_idx_2.is_null(i));
2391+
} else {
2392+
let dict_index = keys.value(i) as usize;
2393+
assert_eq!(values.value(dict_index), a2_idx_2.value(i));
2394+
}
2395+
}
2396+
}
2397+
22112398
#[test]
22122399
fn test_primitive_dictionary() {
22132400
let mut builder = PrimitiveDictionaryBuilder::<Int32Type, Int32Type>::new();
@@ -2231,6 +2418,10 @@ mod tests {
22312418
assert!(rows.row(3) < rows.row(2));
22322419
assert!(rows.row(6) < rows.row(2));
22332420
assert!(rows.row(3) < rows.row(6));
2421+
2422+
let back = converter.convert_rows(&rows).unwrap();
2423+
assert_eq!(back.len(), 1);
2424+
back[0].to_data().validate_full().unwrap();
22342425
}
22352426

22362427
#[test]

arrow-row/src/list.rs

Lines changed: 20 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ use arrow_array::{new_null_array, Array, FixedSizeListArray, GenericListArray, O
2020
use arrow_buffer::{ArrowNativeType, Buffer, MutableBuffer};
2121
use arrow_data::ArrayDataBuilder;
2222
use arrow_schema::{ArrowError, DataType, SortOptions};
23-
use std::ops::Range;
23+
use std::{ops::Range, sync::Arc};
2424

2525
pub fn compute_lengths<O: OffsetSizeTrait>(
2626
lengths: &mut [usize],
@@ -179,7 +179,25 @@ pub unsafe fn decode<O: OffsetSizeTrait>(
179179

180180
let child_data = child[0].to_data();
181181

182-
let builder = ArrayDataBuilder::new(field.data_type.clone())
182+
// Since RowConverter flattens certain data types (i.e. Dictionary),
183+
// we need to use the updated data type instead of the original field's data type
184+
let corrected_type = match &field.data_type {
185+
DataType::List(inner_field) => DataType::List(Arc::new(
186+
inner_field
187+
.as_ref()
188+
.clone()
189+
.with_data_type(child_data.data_type().clone()),
190+
)),
191+
DataType::LargeList(inner_field) => DataType::LargeList(Arc::new(
192+
inner_field
193+
.as_ref()
194+
.clone()
195+
.with_data_type(child_data.data_type().clone()),
196+
)),
197+
_ => unreachable!(),
198+
};
199+
200+
let builder = ArrayDataBuilder::new(corrected_type)
183201
.len(rows.len())
184202
.null_count(null_count)
185203
.null_bit_buffer(Some(nulls.into()))

parquet-variant-compute/src/cast_to_variant.rs

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -591,13 +591,19 @@ mod tests {
591591
Arc::new(microsecond_array.with_timezone("+01:00".to_string())),
592592
);
593593

594-
// nanoseconds should get truncated to microseconds
594+
let timestamp = DateTime::from_timestamp_nanos(nanosecond);
595595
let nanosecond_array = TimestampNanosecondArray::from(vec![Some(nanosecond), None]);
596-
run_array_tests(
597-
microsecond,
596+
run_test(
598597
Arc::new(nanosecond_array.clone()),
598+
vec![
599+
Some(Variant::TimestampNtzNanos(timestamp.naive_utc())),
600+
None,
601+
],
602+
);
603+
run_test(
599604
Arc::new(nanosecond_array.with_timezone("+01:00".to_string())),
600-
)
605+
vec![Some(Variant::TimestampNanos(timestamp)), None],
606+
);
601607
}
602608

603609
#[test]

0 commit comments

Comments
 (0)