Skip to content

Commit 9b2fbbb

Browse files
committed
use Utf8View
1 parent 8baa05d commit 9b2fbbb

File tree

6 files changed

+31
-31
lines changed

6 files changed

+31
-31
lines changed

datafusion/core/tests/parquet/page_pruning.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -160,7 +160,7 @@ async fn page_index_filter_one_col() {
160160

161161
// 5.create filter date_string_col == "01/01/09"`;
162162
// Note this test doesn't apply type coercion so the literal must match the actual view type
163-
let filter = col("date_string_col").eq(lit(ScalarValue::new_utf8("01/01/09")));
163+
let filter = col("date_string_col").eq(lit(ScalarValue::new_utf8view("01/01/09")));
164164
let parquet_exec = get_parquet_exec(&state, filter).await;
165165
let mut results = parquet_exec.execute(0, task_ctx.clone()).unwrap();
166166
let batch = results.next().await.unwrap().unwrap();

datafusion/sqllogictest/test_files/describe.slt

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -81,8 +81,8 @@ int_col Int32 YES
8181
bigint_col Int64 YES
8282
float_col Float32 YES
8383
double_col Float64 YES
84-
date_string_col Utf8 YES
85-
string_col Utf8 YES
84+
date_string_col Utf8View YES
85+
string_col Utf8View YES
8686
timestamp_col Timestamp(Nanosecond, None) YES
8787
year Int32 YES
8888
month Int32 YES

datafusion/sqllogictest/test_files/information_schema.slt

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -249,7 +249,7 @@ datafusion.execution.parquet.metadata_size_hint NULL
249249
datafusion.execution.parquet.pruning true
250250
datafusion.execution.parquet.pushdown_filters false
251251
datafusion.execution.parquet.reorder_filters false
252-
datafusion.execution.parquet.schema_force_view_types false
252+
datafusion.execution.parquet.schema_force_view_types true
253253
datafusion.execution.parquet.skip_arrow_metadata false
254254
datafusion.execution.parquet.skip_metadata true
255255
datafusion.execution.parquet.statistics_enabled page
@@ -359,7 +359,7 @@ datafusion.execution.parquet.metadata_size_hint NULL (reading) If specified, the
359359
datafusion.execution.parquet.pruning true (reading) If true, the parquet reader attempts to skip entire row groups based on the predicate in the query and the metadata (min/max values) stored in the parquet file
360360
datafusion.execution.parquet.pushdown_filters false (reading) If true, filter expressions are be applied during the parquet decoding operation to reduce the number of rows decoded. This optimization is sometimes called "late materialization".
361361
datafusion.execution.parquet.reorder_filters false (reading) If true, filter expressions evaluated during the parquet decoding operation will be reordered heuristically to minimize the cost of evaluation. If false, the filters are applied in the same order as written in the query
362-
datafusion.execution.parquet.schema_force_view_types false (reading) If true, parquet reader will read columns of `Utf8/Utf8Large` with `Utf8View`, and `Binary/BinaryLarge` with `BinaryView`.
362+
datafusion.execution.parquet.schema_force_view_types true (reading) If true, parquet reader will read columns of `Utf8/Utf8Large` with `Utf8View`, and `Binary/BinaryLarge` with `BinaryView`.
363363
datafusion.execution.parquet.skip_arrow_metadata false (writing) Skip encoding the embedded arrow metadata in the KV_meta This is analogous to the `ArrowWriterOptions::with_skip_arrow_metadata`. Refer to <https://docs.rs/parquet/53.3.0/parquet/arrow/arrow_writer/struct.ArrowWriterOptions.html#method.with_skip_arrow_metadata>
364364
datafusion.execution.parquet.skip_metadata true (reading) If true, the parquet reader skip the optional embedded metadata that may be in the file Schema. This setting can help avoid schema conflicts when querying multiple parquet files with schemas containing compatible types but different metadata
365365
datafusion.execution.parquet.statistics_enabled page (writing) Sets if statistics are enabled for any column Valid values are: "none", "chunk", and "page" These values are not case sensitive. If NULL, uses default parquet writer setting

datafusion/sqllogictest/test_files/map.slt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@ describe data;
4545
----
4646
ints Map(Field { name: "entries", data_type: Struct([Field { name: "key", data_type: Utf8, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, Field { name: "value", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }]), nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, false) NO
4747
strings Map(Field { name: "entries", data_type: Struct([Field { name: "key", data_type: Utf8, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, Field { name: "value", data_type: Utf8, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }]), nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, false) NO
48-
timestamp Utf8 NO
48+
timestamp Utf8View NO
4949

5050
query ??T
5151
SELECT * FROM data ORDER by ints['bytes'] DESC LIMIT 10;

datafusion/sqllogictest/test_files/parquet.slt

Lines changed: 23 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -384,15 +384,15 @@ select
384384
arrow_typeof(binaryview_col), binaryview_col
385385
FROM binary_as_string_default;
386386
----
387-
Binary 616161 Binary 616161 Binary 616161
388-
Binary 626262 Binary 626262 Binary 626262
389-
Binary 636363 Binary 636363 Binary 636363
390-
Binary 646464 Binary 646464 Binary 646464
391-
Binary 656565 Binary 656565 Binary 656565
392-
Binary 666666 Binary 666666 Binary 666666
393-
Binary 676767 Binary 676767 Binary 676767
394-
Binary 686868 Binary 686868 Binary 686868
395-
Binary 696969 Binary 696969 Binary 696969
387+
BinaryView 616161 BinaryView 616161 BinaryView 616161
388+
BinaryView 626262 BinaryView 626262 BinaryView 626262
389+
BinaryView 636363 BinaryView 636363 BinaryView 636363
390+
BinaryView 646464 BinaryView 646464 BinaryView 646464
391+
BinaryView 656565 BinaryView 656565 BinaryView 656565
392+
BinaryView 666666 BinaryView 666666 BinaryView 666666
393+
BinaryView 676767 BinaryView 676767 BinaryView 676767
394+
BinaryView 686868 BinaryView 686868 BinaryView 686868
395+
BinaryView 696969 BinaryView 696969 BinaryView 696969
396396

397397
# Run an explain plan to show the cast happens in the plan (a CAST is needed for the predicates)
398398
query TT
@@ -405,11 +405,11 @@ EXPLAIN
405405
binaryview_col LIKE '%a%';
406406
----
407407
logical_plan
408-
01)Filter: CAST(binary_as_string_default.binary_col AS Utf8) LIKE Utf8("%a%") AND CAST(binary_as_string_default.largebinary_col AS Utf8) LIKE Utf8("%a%") AND CAST(binary_as_string_default.binaryview_col AS Utf8) LIKE Utf8("%a%")
409-
02)--TableScan: binary_as_string_default projection=[binary_col, largebinary_col, binaryview_col], partial_filters=[CAST(binary_as_string_default.binary_col AS Utf8) LIKE Utf8("%a%"), CAST(binary_as_string_default.largebinary_col AS Utf8) LIKE Utf8("%a%"), CAST(binary_as_string_default.binaryview_col AS Utf8) LIKE Utf8("%a%")]
408+
01)Filter: CAST(binary_as_string_default.binary_col AS Utf8View) LIKE Utf8View("%a%") AND CAST(binary_as_string_default.largebinary_col AS Utf8View) LIKE Utf8View("%a%") AND CAST(binary_as_string_default.binaryview_col AS Utf8View) LIKE Utf8View("%a%")
409+
02)--TableScan: binary_as_string_default projection=[binary_col, largebinary_col, binaryview_col], partial_filters=[CAST(binary_as_string_default.binary_col AS Utf8View) LIKE Utf8View("%a%"), CAST(binary_as_string_default.largebinary_col AS Utf8View) LIKE Utf8View("%a%"), CAST(binary_as_string_default.binaryview_col AS Utf8View) LIKE Utf8View("%a%")]
410410
physical_plan
411411
01)CoalesceBatchesExec: target_batch_size=8192
412-
02)--FilterExec: CAST(binary_col@0 AS Utf8) LIKE %a% AND CAST(largebinary_col@1 AS Utf8) LIKE %a% AND CAST(binaryview_col@2 AS Utf8) LIKE %a%
412+
02)--FilterExec: CAST(binary_col@0 AS Utf8View) LIKE %a% AND CAST(largebinary_col@1 AS Utf8View) LIKE %a% AND CAST(binaryview_col@2 AS Utf8View) LIKE %a%
413413
03)----RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1
414414
04)------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet/binary_as_string.parquet]]}, projection=[binary_col, largebinary_col, binaryview_col], file_type=parquet, predicate=CAST(binary_col@0 AS Utf8View) LIKE %a% AND CAST(largebinary_col@1 AS Utf8View) LIKE %a% AND CAST(binaryview_col@2 AS Utf8View) LIKE %a%
415415

@@ -432,15 +432,15 @@ select
432432
arrow_typeof(binaryview_col), binaryview_col
433433
FROM binary_as_string_option;
434434
----
435-
Utf8 aaa Utf8 aaa Utf8 aaa
436-
Utf8 bbb Utf8 bbb Utf8 bbb
437-
Utf8 ccc Utf8 ccc Utf8 ccc
438-
Utf8 ddd Utf8 ddd Utf8 ddd
439-
Utf8 eee Utf8 eee Utf8 eee
440-
Utf8 fff Utf8 fff Utf8 fff
441-
Utf8 ggg Utf8 ggg Utf8 ggg
442-
Utf8 hhh Utf8 hhh Utf8 hhh
443-
Utf8 iii Utf8 iii Utf8 iii
435+
Utf8View aaa Utf8View aaa Utf8View aaa
436+
Utf8View bbb Utf8View bbb Utf8View bbb
437+
Utf8View ccc Utf8View ccc Utf8View ccc
438+
Utf8View ddd Utf8View ddd Utf8View ddd
439+
Utf8View eee Utf8View eee Utf8View eee
440+
Utf8View fff Utf8View fff Utf8View fff
441+
Utf8View ggg Utf8View ggg Utf8View ggg
442+
Utf8View hhh Utf8View hhh Utf8View hhh
443+
Utf8View iii Utf8View iii Utf8View iii
444444

445445
# Run an explain plan to show the cast happens in the plan (there should be no casts)
446446
query TT
@@ -453,8 +453,8 @@ EXPLAIN
453453
binaryview_col LIKE '%a%';
454454
----
455455
logical_plan
456-
01)Filter: binary_as_string_option.binary_col LIKE Utf8("%a%") AND binary_as_string_option.largebinary_col LIKE Utf8("%a%") AND binary_as_string_option.binaryview_col LIKE Utf8("%a%")
457-
02)--TableScan: binary_as_string_option projection=[binary_col, largebinary_col, binaryview_col], partial_filters=[binary_as_string_option.binary_col LIKE Utf8("%a%"), binary_as_string_option.largebinary_col LIKE Utf8("%a%"), binary_as_string_option.binaryview_col LIKE Utf8("%a%")]
456+
01)Filter: binary_as_string_option.binary_col LIKE Utf8View("%a%") AND binary_as_string_option.largebinary_col LIKE Utf8View("%a%") AND binary_as_string_option.binaryview_col LIKE Utf8View("%a%")
457+
02)--TableScan: binary_as_string_option projection=[binary_col, largebinary_col, binaryview_col], partial_filters=[binary_as_string_option.binary_col LIKE Utf8View("%a%"), binary_as_string_option.largebinary_col LIKE Utf8View("%a%"), binary_as_string_option.binaryview_col LIKE Utf8View("%a%")]
458458
physical_plan
459459
01)CoalesceBatchesExec: target_batch_size=8192
460460
02)--FilterExec: binary_col@0 LIKE %a% AND largebinary_col@1 LIKE %a% AND binaryview_col@2 LIKE %a%

datafusion/sqllogictest/test_files/simplify_predicates.slt

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -84,7 +84,7 @@ query TT
8484
EXPLAIN SELECT * FROM test_data WHERE str_col > 'apple' AND str_col > 'banana';
8585
----
8686
logical_plan
87-
01)Filter: test_data.str_col > Utf8("banana")
87+
01)Filter: test_data.str_col > Utf8View("banana")
8888
02)--TableScan: test_data projection=[int_col, float_col, str_col, date_col, bool_col]
8989

9090
# date_col > '2023-01-01' AND date_col > '2023-02-01' should simplify to date_col > '2023-02-01'
@@ -120,7 +120,7 @@ WHERE int_col > 5
120120
AND float_col BETWEEN 1 AND 100;
121121
----
122122
logical_plan
123-
01)Filter: test_data.str_col LIKE Utf8("A%") AND test_data.float_col >= Float32(1) AND test_data.float_col <= Float32(100) AND test_data.int_col > Int32(10)
123+
01)Filter: test_data.str_col LIKE Utf8View("A%") AND test_data.float_col >= Float32(1) AND test_data.float_col <= Float32(100) AND test_data.int_col > Int32(10)
124124
02)--TableScan: test_data projection=[int_col, float_col, str_col, date_col, bool_col]
125125

126126
statement ok

0 commit comments

Comments
 (0)