Skip to content

Commit 89f1b7c

Browse files
authored
Revert "Revert arrow upgrade and related changes" (#52)
* Revert "Revert arrow upgrade and related changes (#50)"

  This reverts commit 5506e69.

* Allow typo
1 parent ff2533c commit 89f1b7c

File tree

37 files changed

+949
-614
lines changed

37 files changed

+949
-614
lines changed

Cargo.lock

Lines changed: 62 additions & 104 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 8 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -90,20 +90,20 @@ ahash = { version = "0.8", default-features = false, features = [
9090
"runtime-rng",
9191
] }
9292
apache-avro = { version = "0.20", default-features = false }
93-
arrow = { version = "55.2.0", features = [
93+
arrow = { version = "56.0.0", features = [
9494
"prettyprint",
9595
"chrono-tz",
9696
] }
97-
arrow-buffer = { version = "55.2.0", default-features = false }
98-
arrow-flight = { version = "55.2.0", features = [
97+
arrow-buffer = { version = "56.0.0", default-features = false }
98+
arrow-flight = { version = "56.0.0", features = [
9999
"flight-sql-experimental",
100100
] }
101-
arrow-ipc = { version = "55.2.0", default-features = false, features = [
101+
arrow-ipc = { version = "56.0.0", default-features = false, features = [
102102
"lz4",
103103
] }
104-
arrow-ord = { version = "55.2.0", default-features = false }
105-
arrow-schema = { version = "55.2.0", default-features = false }
106-
async-trait = "0.1.88"
104+
arrow-ord = { version = "56.0.0", default-features = false }
105+
arrow-schema = { version = "56.0.0", default-features = false }
106+
async-trait = "0.1.89"
107107
bigdecimal = "0.4.8"
108108
bytes = "1.10"
109109
chrono = { version = "0.4.41", default-features = false }
@@ -157,7 +157,7 @@ itertools = "0.14"
157157
log = "^0.4"
158158
object_store = { version = "0.12.3", default-features = false }
159159
parking_lot = "0.12"
160-
parquet = { version = "55.2.0", default-features = false, features = [
160+
parquet = { version = "56.0.0", default-features = false, features = [
161161
"arrow",
162162
"async",
163163
"object_store",

datafusion-examples/Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -81,7 +81,7 @@ serde_json = { workspace = true }
8181
tempfile = { workspace = true }
8282
test-utils = { path = "../test-utils" }
8383
tokio = { workspace = true, features = ["rt-multi-thread", "parking_lot"] }
84-
tonic = "0.12.1"
84+
tonic = "0.13.1"
8585
tracing = { version = "0.1" }
8686
tracing-subscriber = { version = "0.3" }
8787
url = { workspace = true }

datafusion-testing

Submodule datafusion-testing updated 84 files

datafusion/common/Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -71,7 +71,7 @@ log = { workspace = true }
7171
object_store = { workspace = true, optional = true }
7272
parquet = { workspace = true, optional = true, default-features = true }
7373
paste = "1.0.15"
74-
pyo3 = { version = "0.24.2", optional = true }
74+
pyo3 = { version = "0.25", optional = true }
7575
recursive = { workspace = true, optional = true }
7676
sqlparser = { workspace = true }
7777
tokio = { workspace = true }

datafusion/common/src/config.rs

Lines changed: 1 addition & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -602,13 +602,6 @@ config_namespace! {
602602
/// default parquet writer setting
603603
pub statistics_enabled: Option<String>, transform = str::to_lowercase, default = Some("page".into())
604604

605-
/// (writing) Sets max statistics size for any column. If NULL, uses
606-
/// default parquet writer setting
607-
/// max_statistics_size is deprecated, currently it is not being used
608-
// TODO: remove once deprecated
609-
#[deprecated(since = "45.0.0", note = "Setting does not do anything")]
610-
pub max_statistics_size: Option<usize>, default = Some(4096)
611-
612605
/// (writing) Target maximum number of rows in each row group (defaults to 1M
613606
/// rows). Writing larger row groups requires more memory to write, but
614607
/// can get better compression and be faster to read.
@@ -622,7 +615,7 @@ config_namespace! {
622615

623616
/// (writing) Sets statistics truncate length. If NULL, uses
624617
/// default parquet writer setting
625-
pub statistics_truncate_length: Option<usize>, default = None
618+
pub statistics_truncate_length: Option<usize>, default = Some(64)
626619

627620
/// (writing) Sets best effort maximum number of rows in data page
628621
pub data_page_row_count_limit: usize, default = 20_000
@@ -2141,13 +2134,6 @@ config_namespace_with_hashmap! {
21412134
/// Sets bloom filter number of distinct values. If NULL, uses
21422135
/// default parquet options
21432136
pub bloom_filter_ndv: Option<u64>, default = None
2144-
2145-
/// Sets max statistics size for the column path. If NULL, uses
2146-
/// default parquet options
2147-
/// max_statistics_size is deprecated, currently it is not being used
2148-
// TODO: remove once deprecated
2149-
#[deprecated(since = "45.0.0", note = "Setting does not do anything")]
2150-
pub max_statistics_size: Option<usize>, default = None
21512137
}
21522138
}
21532139

datafusion/common/src/file_options/parquet_writer.rs

Lines changed: 1 addition & 24 deletions
Original file line number | Diff line number | Diff line change
@@ -35,7 +35,7 @@ use parquet::{
3535
metadata::KeyValue,
3636
properties::{
3737
EnabledStatistics, WriterProperties, WriterPropertiesBuilder, WriterVersion,
38-
DEFAULT_MAX_STATISTICS_SIZE, DEFAULT_STATISTICS_ENABLED,
38+
DEFAULT_STATISTICS_ENABLED,
3939
},
4040
},
4141
schema::types::ColumnPath,
@@ -160,16 +160,6 @@ impl TryFrom<&TableParquetOptions> for WriterPropertiesBuilder {
160160
builder =
161161
builder.set_column_bloom_filter_ndv(path.clone(), bloom_filter_ndv);
162162
}
163-
164-
// max_statistics_size is deprecated, currently it is not being used
165-
// TODO: remove once deprecated
166-
#[allow(deprecated)]
167-
if let Some(max_statistics_size) = options.max_statistics_size {
168-
builder = {
169-
#[allow(deprecated)]
170-
builder.set_column_max_statistics_size(path, max_statistics_size)
171-
}
172-
}
173163
}
174164

175165
Ok(builder)
@@ -218,7 +208,6 @@ impl ParquetOptions {
218208
dictionary_enabled,
219209
dictionary_page_size_limit,
220210
statistics_enabled,
221-
max_statistics_size,
222211
max_row_group_size,
223212
created_by,
224213
column_index_truncate_length,
@@ -264,13 +253,6 @@ impl ParquetOptions {
264253
.set_data_page_row_count_limit(*data_page_row_count_limit)
265254
.set_bloom_filter_enabled(*bloom_filter_on_write);
266255

267-
builder = {
268-
#[allow(deprecated)]
269-
builder.set_max_statistics_size(
270-
max_statistics_size.unwrap_or(DEFAULT_MAX_STATISTICS_SIZE),
271-
)
272-
};
273-
274256
if let Some(bloom_filter_fpp) = bloom_filter_fpp {
275257
builder = builder.set_bloom_filter_fpp(*bloom_filter_fpp);
276258
};
@@ -463,12 +445,10 @@ mod tests {
463445
fn column_options_with_non_defaults(
464446
src_col_defaults: &ParquetOptions,
465447
) -> ParquetColumnOptions {
466-
#[allow(deprecated)] // max_statistics_size
467448
ParquetColumnOptions {
468449
compression: Some("zstd(22)".into()),
469450
dictionary_enabled: src_col_defaults.dictionary_enabled.map(|v| !v),
470451
statistics_enabled: Some("none".into()),
471-
max_statistics_size: Some(72),
472452
encoding: Some("RLE".into()),
473453
bloom_filter_enabled: Some(true),
474454
bloom_filter_fpp: Some(0.72),
@@ -493,7 +473,6 @@ mod tests {
493473
dictionary_enabled: Some(!defaults.dictionary_enabled.unwrap_or(false)),
494474
dictionary_page_size_limit: 42,
495475
statistics_enabled: Some("chunk".into()),
496-
max_statistics_size: Some(42),
497476
max_row_group_size: 42,
498477
created_by: "wordy".into(),
499478
column_index_truncate_length: Some(42),
@@ -551,7 +530,6 @@ mod tests {
551530
),
552531
bloom_filter_fpp: bloom_filter_default_props.map(|p| p.fpp),
553532
bloom_filter_ndv: bloom_filter_default_props.map(|p| p.ndv),
554-
max_statistics_size: Some(props.max_statistics_size(&col)),
555533
}
556534
}
557535

@@ -608,7 +586,6 @@ mod tests {
608586
compression: default_col_props.compression,
609587
dictionary_enabled: default_col_props.dictionary_enabled,
610588
statistics_enabled: default_col_props.statistics_enabled,
611-
max_statistics_size: default_col_props.max_statistics_size,
612589
bloom_filter_on_write: default_col_props
613590
.bloom_filter_enabled
614591
.unwrap_or_default(),

datafusion/common/src/scalar/mod.rs

Lines changed: 14 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -904,11 +904,10 @@ pub fn dict_from_values<K: ArrowDictionaryKeyType>(
904904
.map(|index| {
905905
if values_array.is_valid(index) {
906906
let native_index = K::Native::from_usize(index).ok_or_else(|| {
907-
DataFusionError::Internal(format!(
908-
"Can not create index of type {} from value {}",
909-
K::DATA_TYPE,
910-
index
911-
))
907+
_internal_datafusion_err!(
908+
"Can not create index of type {} from value {index}",
909+
K::DATA_TYPE
910+
)
912911
})?;
913912
Ok(Some(native_index))
914913
} else {
@@ -2203,6 +2202,16 @@ impl ScalarValue {
22032202
}
22042203

22052204
let array: ArrayRef = match &data_type {
2205+
DataType::Decimal32(_precision, _scale) => {
2206+
return _not_impl_err!(
2207+
"Decimal32 not supported in ScalarValue::iter_to_array"
2208+
);
2209+
}
2210+
DataType::Decimal64(_precision, _scale) => {
2211+
return _not_impl_err!(
2212+
"Decimal64 not supported in ScalarValue::iter_to_array"
2213+
);
2214+
}
22062215
DataType::Decimal128(precision, scale) => {
22072216
let decimal_array =
22082217
ScalarValue::iter_to_decimal_array(scalars, *precision, *scale)?;

datafusion/common/src/types/native.rs

Lines changed: 4 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -407,7 +407,10 @@ impl From<DataType> for NativeType {
407407
DataType::Union(union_fields, _) => {
408408
Union(LogicalUnionFields::from(&union_fields))
409409
}
410-
DataType::Decimal128(p, s) | DataType::Decimal256(p, s) => Decimal(p, s),
410+
DataType::Decimal32(p, s)
411+
| DataType::Decimal64(p, s)
412+
| DataType::Decimal128(p, s)
413+
| DataType::Decimal256(p, s) => Decimal(p, s),
411414
DataType::Map(field, _) => Map(Arc::new(field.as_ref().into())),
412415
DataType::Dictionary(_, data_type) => data_type.as_ref().clone().into(),
413416
DataType::RunEndEncoded(_, field) => field.data_type().clone().into(),

datafusion/core/src/datasource/file_format/parquet.rs

Lines changed: 26 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -523,11 +523,23 @@ mod tests {
523523
let dic_array = DictionaryArray::<Int32Type>::try_new(keys, Arc::new(values))?;
524524
let c_dic: ArrayRef = Arc::new(dic_array);
525525

526-
let batch1 = RecordBatch::try_from_iter(vec![("c_dic", c_dic)])?;
526+
// Data for column string_truncation: ["a".repeat(128), null, "b".repeat(128), null]
527+
let string_truncation: ArrayRef = Arc::new(StringArray::from(vec![
528+
Some("a".repeat(128)),
529+
None,
530+
Some("b".repeat(128)),
531+
None,
532+
]));
533+
534+
let batch1 = RecordBatch::try_from_iter(vec![
535+
("c_dic", c_dic),
536+
("string_truncation", string_truncation),
537+
])?;
527538

528539
// Use store_parquet to write each batch to its own file
529540
// . batch1 written into first file and includes:
530541
// - column c_dic that has 4 rows with no null. Stats min and max of dictionary column is available.
542+
// - column string_truncation that has 4 rows with 2 nulls. Stats min and max of the string column are available but inexact, because the 128-character values are truncated in the written statistics.
531543
let store = Arc::new(RequestCountingObjectStore::new(Arc::new(
532544
LocalFileSystem::new(),
533545
)));
@@ -563,6 +575,19 @@ mod tests {
563575
Precision::Exact(Utf8(Some("a".into())))
564576
);
565577

578+
// column string_truncation
579+
let string_truncation_stats = &stats.column_statistics[1];
580+
581+
assert_eq!(string_truncation_stats.null_count, Precision::Exact(2));
582+
assert_eq!(
583+
string_truncation_stats.max_value,
584+
Precision::Inexact(ScalarValue::Utf8View(Some("b".repeat(63) + "c")))
585+
);
586+
assert_eq!(
587+
string_truncation_stats.min_value,
588+
Precision::Inexact(ScalarValue::Utf8View(Some("a".repeat(64))))
589+
);
590+
566591
Ok(())
567592
}
568593

0 commit comments

Comments
 (0)