-
Notifications
You must be signed in to change notification settings - Fork 14
Adds test ensuring dictionary corruption does not occur anymore #208
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
gabotechs
merged 4 commits into
datafusion-contrib:main
from
marc-pydantic:null-corruption
Oct 29, 2025
Merged
Changes from all commits
Commits
Show all changes
4 commits
Select commit
Hold shift + click to select a range
File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,15 +1,21 @@ | ||
| #[cfg(all(feature = "integration", test))] | ||
| mod tests { | ||
| use datafusion::arrow::array::{Int32Array, StringArray}; | ||
| use datafusion::arrow::datatypes::{DataType, Field, Schema}; | ||
| use datafusion::arrow::record_batch::RecordBatch; | ||
| use datafusion::arrow::util::pretty::pretty_format_batches; | ||
| use datafusion::physical_plan::{displayable, execute_stream}; | ||
| use datafusion_distributed::test_utils::localhost::start_localhost_context; | ||
| use datafusion_distributed::test_utils::parquet::register_parquet_tables; | ||
| use datafusion_distributed::test_utils::session_context::register_temp_parquet_table; | ||
| use datafusion_distributed::{ | ||
| DefaultSessionBuilder, DistributedConfig, apply_network_boundaries, assert_snapshot, | ||
| display_plan_ascii, distribute_plan, | ||
| }; | ||
| use futures::TryStreamExt; | ||
| use std::error::Error; | ||
| use std::sync::Arc; | ||
| use uuid::Uuid; | ||
|
|
||
| #[tokio::test] | ||
| async fn distributed_aggregation() -> Result<(), Box<dyn Error>> { | ||
|
|
@@ -149,4 +155,98 @@ mod tests { | |
|
|
||
| Ok(()) | ||
| } | ||
|
|
||
| /// Test that multiple first_value() aggregations work correctly in distributed queries. | ||
| // TODO: Once https://github.com/apache/datafusion/pull/18303 is merged, this test will lose | ||
| // meaning, since the PR above will mask the underlying problem. Different queries or | ||
| // a new approach must be used in this case. | ||
| #[tokio::test] | ||
| async fn test_multiple_first_value_aggregations() -> Result<(), Box<dyn Error>> { | ||
| let (ctx, _guard) = start_localhost_context(3, DefaultSessionBuilder).await; | ||
|
|
||
| let schema = Arc::new(Schema::new(vec![ | ||
| Field::new("group_id", DataType::Int32, false), | ||
| Field::new("trace_id", DataType::Utf8, false), | ||
| Field::new("value", DataType::Int32, false), | ||
| ])); | ||
|
|
||
| // Create 2 batches that will be stored as separate parquet files | ||
| let batch1 = RecordBatch::try_new( | ||
| schema.clone(), | ||
| vec![ | ||
| Arc::new(Int32Array::from(vec![1, 2])), | ||
| Arc::new(StringArray::from(vec!["trace1", "trace2"])), | ||
| Arc::new(Int32Array::from(vec![100, 200])), | ||
| ], | ||
| )?; | ||
|
|
||
| let batch2 = RecordBatch::try_new( | ||
| schema.clone(), | ||
| vec![ | ||
| Arc::new(Int32Array::from(vec![3, 4])), | ||
| Arc::new(StringArray::from(vec!["trace3", "trace4"])), | ||
| Arc::new(Int32Array::from(vec![300, 400])), | ||
| ], | ||
| )?; | ||
|
|
||
| let file1 = | ||
| register_temp_parquet_table("records_part1", schema.clone(), vec![batch1], &ctx) | ||
| .await?; | ||
| let file2 = | ||
| register_temp_parquet_table("records_part2", schema.clone(), vec![batch2], &ctx) | ||
| .await?; | ||
|
|
||
| // Create a partitioned table by registering multiple files | ||
| let temp_dir = std::env::temp_dir(); | ||
| let table_dir = temp_dir.join(format!("partitioned_table_{}", Uuid::new_v4())); | ||
| std::fs::create_dir(&table_dir)?; | ||
| std::fs::copy(&file1, table_dir.join("part1.parquet"))?; | ||
| std::fs::copy(&file2, table_dir.join("part2.parquet"))?; | ||
|
|
||
| // Register the directory as a partitioned table | ||
| ctx.register_parquet( | ||
| "records_partitioned", | ||
| table_dir.to_str().unwrap(), | ||
| datafusion::prelude::ParquetReadOptions::default(), | ||
| ) | ||
| .await?; | ||
|
|
||
| let query = r#"SELECT group_id, first_value(trace_id) AS fv1, first_value(value) AS fv2 | ||
| FROM records_partitioned | ||
| GROUP BY group_id | ||
| ORDER BY group_id"#; | ||
|
|
||
| let df = ctx.sql(query).await?; | ||
| let physical = df.create_physical_plan().await?; | ||
|
|
||
| let cfg = DistributedConfig::default().with_network_shuffle_tasks(2); | ||
| let physical_distributed = apply_network_boundaries(physical, &cfg)?; | ||
| let physical_distributed = distribute_plan(physical_distributed)?; | ||
|
|
||
| // Execute distributed query | ||
| let batches_distributed = execute_stream(physical_distributed, ctx.task_ctx())? | ||
| .try_collect::<Vec<_>>() | ||
| .await?; | ||
|
|
||
| let actual_result = pretty_format_batches(&batches_distributed)?; | ||
| let expected_result = "\ | ||
| +----------+--------+-----+ | ||
| | group_id | fv1 | fv2 | | ||
| +----------+--------+-----+ | ||
| | 1 | trace1 | 100 | | ||
| | 2 | trace2 | 200 | | ||
| | 3 | trace3 | 300 | | ||
| | 4 | trace4 | 400 | | ||
| +----------+--------+-----+"; | ||
|
|
||
| // Print them out, the error message from `assert_eq` is otherwise hard to read. | ||
| println!("{}", expected_result); | ||
| println!("{}", actual_result); | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit: It would be preferable to only do this on failure. I think it's fine if we do |
||
|
|
||
| // Compare against result. The regression this is testing for would have NULL values in | ||
| // the second and third column. | ||
| assert_eq!(actual_result.to_string(), expected_result,); | ||
|
|
||
| Ok(()) | ||
| } | ||
| } | ||
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Should we just check for duplicate column names directly then? We could make a
MemoryExec or something similar?There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I can look into that as a follow-up? I want to get the remaining issues with distributed activation first, this can be a side-quest.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
sure yes we'll see what the maintainers here think
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think this is fine. It's still a good test. If you could file an issue and put the number in the todo, that would help us track it :)