diff --git a/.github/workflows/audit.yml b/.github/workflows/audit.yml
index 2f12a8b5d2209..b525afa4acd7b 100644
--- a/.github/workflows/audit.yml
+++ b/.github/workflows/audit.yml
@@ -23,25 +23,27 @@ concurrency:
 on:
   push:
+    branches:
+      - main
     paths:
       - "**/Cargo.toml"
       - "**/Cargo.lock"
-    branches:
-      - main
   pull_request:
     paths:
       - "**/Cargo.toml"
       - "**/Cargo.lock"
+  merge_group:
+
 jobs:
   security_audit:
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
+      - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
       - name: Install cargo-audit
-        run: cargo install cargo-audit
+        uses: taiki-e/install-action@f535147c22906d77695e11cb199e764aa610a4fc # v2.62.46
+        with:
+          tool: cargo-audit
       - name: Run audit check
-        # Ignored until https://github.com/apache/datafusion/issues/15571
-        # ignored py03 warning until arrow 55 upgrade
-        run: cargo audit --ignore RUSTSEC-2024-0370 --ignore RUSTSEC-2025-0020 --ignore RUSTSEC-2025-0047
+        run: cargo audit --ignore RUSTSEC-2025-0111
diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml
index 05a6d70f0278a..6aaff95e74a5f 100644
--- a/.github/workflows/rust.yml
+++ b/.github/workflows/rust.yml
@@ -266,7 +266,21 @@ jobs:
     runs-on: ubuntu-latest
     container:
       image: amd64/rust
+      volumes:
+        - /usr/local:/host/usr/local
     steps:
+      - name: Remove unnecessary preinstalled software
+        run: |
+          echo "Disk space before cleanup:"
+          df -h
+          # remove tool cache: about 8.5GB (github has host /opt/hostedtoolcache mounted as /__t)
+          rm -rf /__t/* || true
+          # remove Haskell runtime: about 6.3GB (host /usr/local/.ghcup)
+          rm -rf /host/usr/local/.ghcup || true
+          # remove Android library: about 7.8GB (host /usr/local/lib/android)
+          rm -rf /host/usr/local/lib/android || true
+          echo "Disk space after cleanup:"
+          df -h
       - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
         with:
           submodules: true
@@ -347,6 +361,19 @@ jobs:
         with:
           save-if: ${{ github.ref_name == 'main' }}
           shared-key: "amd-ci-linux-test-example"
+      - name: Remove unnecessary preinstalled software
+        run: |
+          echo "Disk space before cleanup:"
+          df -h
+          apt-get clean
+          rm -rf /__t/CodeQL
+          rm -rf /__t/PyPy
+          rm -rf /__t/Java_Temurin-Hotspot_jdk
+          rm -rf /__t/Python
+          rm -rf /__t/go
+          rm -rf /__t/Ruby
+          echo "Disk space after cleanup:"
+          df -h
       - name: Run examples
         run: |
           # test datafusion-sql examples
@@ -444,7 +471,7 @@ jobs:
           export RUST_MIN_STACK=20971520
           export TPCH_DATA=`realpath datafusion/sqllogictest/test_files/tpch/data`
           cargo test plan_q --package datafusion-benchmarks --profile ci --features=ci -- --test-threads=1
-          INCLUDE_TPCH=true cargo test --features backtrace --profile ci --package datafusion-sqllogictest --test sqllogictests
+          INCLUDE_TPCH=true cargo test --features backtrace,parquet_encryption --profile ci --package datafusion-sqllogictest --test sqllogictests
       - name: Verify Working Directory Clean
         run: git diff --exit-code
diff --git a/Cargo.toml b/Cargo.toml
index 93dcfeb4da25e..49affe557a328 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -198,6 +198,7 @@ rpath = false
 strip = false # Retain debug info for flamegraphs
 
 [profile.ci]
+debug = false
 inherits = "dev"
 incremental = false
 
diff --git a/datafusion-testing b/datafusion-testing
index f72ac4075ada5..e9f9e22ccf091 160000
--- a/datafusion-testing
+++ b/datafusion-testing
@@ -1 +1 @@
-Subproject commit f72ac4075ada5ea9810551bc0c3e3161c61204a2
+Subproject commit e9f9e22ccf09145a7368f80fd6a871f11e2b4481
diff --git a/datafusion/common/src/scalar/mod.rs b/datafusion/common/src/scalar/mod.rs
index 4d88f5a66732e..91058575723e5 100644
--- a/datafusion/common/src/scalar/mod.rs
+++ b/datafusion/common/src/scalar/mod.rs
@@ -2387,7 +2387,7 @@ impl ScalarValue {
                 Arc::new(array)
             }
             // explicitly enumerate unsupported types so newly added
-            // types must be aknowledged, Time32 and Time64 types are
+            // types must be acknowledged, Time32 and Time64 types are
             // not supported if the TimeUnit is not valid (Time32 can
             // only be used with Second and Millisecond, Time64 only
             // with Microsecond and Nanosecond)
diff --git a/datafusion/core/src/datasource/listing/table.rs b/datafusion/core/src/datasource/listing/table.rs
index 690ce31d0dc76..14b5bfa54eda2 100644
--- a/datafusion/core/src/datasource/listing/table.rs
+++ b/datafusion/core/src/datasource/listing/table.rs
@@ -1131,7 +1131,7 @@ impl ListingTable {
     }
 }
 
-// Expressions can be used for parttion pruning if they can be evaluated using
+// Expressions can be used for partition pruning if they can be evaluated using
 // only the partition columns and there are partition columns.
 fn can_be_evaluated_for_partition_pruning(
     partition_column_names: &[&str],
diff --git a/datafusion/core/tests/physical_optimizer/filter_pushdown/mod.rs b/datafusion/core/tests/physical_optimizer/filter_pushdown/mod.rs
index 32c4f030b0fc6..ad6abec8cadca 100644
--- a/datafusion/core/tests/physical_optimizer/filter_pushdown/mod.rs
+++ b/datafusion/core/tests/physical_optimizer/filter_pushdown/mod.rs
@@ -1234,7 +1234,7 @@ async fn test_hashjoin_dynamic_filter_pushdown_partitioned() {
         Arc::new(CoalesceBatchesExec::new(hash_join, 8192)) as Arc<dyn ExecutionPlan>;
     // Top-level CoalescePartitionsExec
     let cp = Arc::new(CoalescePartitionsExec::new(cb)) as Arc<dyn ExecutionPlan>;
-    // Add a sort for determistic output
+    // Add a sort for deterministic output
     let plan = Arc::new(SortExec::new(
         LexOrdering::new(vec![PhysicalSortExpr::new(
             col("a", &probe_side_schema).unwrap(),
diff --git a/datafusion/datasource-parquet/src/opener.rs b/datafusion/datasource-parquet/src/opener.rs
index 93a3d4af54326..664b86e964eba 100644
--- a/datafusion/datasource-parquet/src/opener.rs
+++ b/datafusion/datasource-parquet/src/opener.rs
@@ -98,6 +98,7 @@ pub(super) struct ParquetOpener {
     /// Coerce INT96 timestamps to specific TimeUnit
     pub coerce_int96: Option<TimeUnit>,
     /// Optional parquet FileDecryptionProperties
+    #[cfg(feature = "parquet_encryption")]
     pub file_decryption_properties: Option<Arc<FileDecryptionProperties>>,
     /// Rewrite expressions in the context of the file schema
     pub(crate) expr_adapter_factory: Option<Arc<dyn PhysicalExprAdapterFactory>>,
@@ -151,9 +152,11 @@ impl FileOpener for ParquetOpener {
         let mut predicate_file_schema = Arc::clone(&self.logical_file_schema);
         let enable_page_index = self.enable_page_index;
+        #[cfg(feature = "parquet_encryption")]
         let encryption_context = self.get_encryption_context();
 
         Ok(Box::pin(async move {
+            #[cfg(feature = "parquet_encryption")]
             let file_decryption_properties = encryption_context
                 .get_file_decryption_properties(&file_location)
                 .await?;
@@ -502,6 +505,7 @@ where
 }
 
 #[derive(Default)]
+#[cfg_attr(not(feature = "parquet_encryption"), allow(dead_code))]
 struct EncryptionContext {
     #[cfg(feature = "parquet_encryption")]
     file_decryption_properties: Option<Arc<FileDecryptionProperties>>,
@@ -544,6 +548,7 @@ impl EncryptionContext {
 }
 
 #[cfg(not(feature = "parquet_encryption"))]
+#[allow(dead_code)]
 impl EncryptionContext {
     async fn get_file_decryption_properties(
         &self,
@@ -563,6 +568,7 @@ impl ParquetOpener {
     }
 
     #[cfg(not(feature = "parquet_encryption"))]
+    #[allow(dead_code)]
     fn get_encryption_context(&self) -> EncryptionContext {
         EncryptionContext::default()
     }
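The opener.rs hunks above follow a common Rust idiom for feature-gated state: the field, its uses, and a no-op fallback impl are all gated on the same feature, while #[cfg_attr(not(feature = ...), allow(dead_code))] and #[allow(dead_code)] keep the feature-off build warning-free. A minimal standalone sketch of that idiom follows (hypothetical "encryption" feature and names, not the DataFusion types):

    #[derive(Default)]
    // With the feature off the struct has no readable fields left, so
    // silence dead-code lints instead of duplicating the type per config.
    #[cfg_attr(not(feature = "encryption"), allow(dead_code))]
    struct Context {
        #[cfg(feature = "encryption")]
        keys: Option<Vec<u8>>,
    }

    #[cfg(feature = "encryption")]
    impl Context {
        fn decryption_keys(&self) -> Option<&[u8]> {
            self.keys.as_deref()
        }
    }

    // A no-op twin keeps call sites compiling when the feature is off;
    // since those call sites are themselves cfg-gated, allow(dead_code).
    #[cfg(not(feature = "encryption"))]
    #[allow(dead_code)]
    impl Context {
        fn decryption_keys(&self) -> Option<&[u8]> {
            None
        }
    }

    fn main() {
        let ctx = Context::default();
        #[cfg(feature = "encryption")]
        assert!(ctx.decryption_keys().is_none());
        #[cfg(not(feature = "encryption"))]
        let _ = ctx; // nothing to do without the feature
    }

Either way the code compiles cleanly, which is why the diff can gate the field itself rather than keeping an always-present but unused Option.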
diff --git a/datafusion/datasource-parquet/src/source.rs b/datafusion/datasource-parquet/src/source.rs
index 007c239ef4928..644cea85ca0a9 100644
--- a/datafusion/datasource-parquet/src/source.rs
+++ b/datafusion/datasource-parquet/src/source.rs
@@ -52,6 +52,7 @@ use datafusion_physical_plan::metrics::Count;
 use datafusion_physical_plan::metrics::ExecutionPlanMetricsSet;
 use datafusion_physical_plan::DisplayFormatType;
 
+#[cfg(feature = "parquet_encryption")]
 use datafusion_common::encryption::map_config_decryption_to_decryption;
 #[cfg(feature = "parquet_encryption")]
 use datafusion_execution::parquet_encryption::EncryptionFactory;
@@ -541,6 +542,7 @@ impl FileSource for ParquetSource {
             Arc::new(DefaultParquetFileReaderFactory::new(object_store)) as _
         });
 
+        #[cfg(feature = "parquet_encryption")]
         let file_decryption_properties = self
             .table_parquet_options()
             .crypto
@@ -576,6 +578,7 @@ impl FileSource for ParquetSource {
             enable_row_group_stats_pruning: self.table_parquet_options.global.pruning,
             schema_adapter_factory,
             coerce_int96,
+            #[cfg(feature = "parquet_encryption")]
             file_decryption_properties,
             expr_adapter_factory,
             #[cfg(feature = "parquet_encryption")]
diff --git a/datafusion/physical-expr/src/expressions/case.rs b/datafusion/physical-expr/src/expressions/case.rs
index 65a2108266647..5409cfe8e7e45 100644
--- a/datafusion/physical-expr/src/expressions/case.rs
+++ b/datafusion/physical-expr/src/expressions/case.rs
@@ -1070,7 +1070,6 @@ mod tests {
             .into_iter()
             .collect();
 
-        //let valid_array = vec![true, false, false, true, false, tru
         let null_buffer = Buffer::from([0b00101001u8]);
         let load4 = load4
             .into_data()
diff --git a/datafusion/sqllogictest/Cargo.toml b/datafusion/sqllogictest/Cargo.toml
index c242a5f0f4330..2b92bb013c47f 100644
--- a/datafusion/sqllogictest/Cargo.toml
+++ b/datafusion/sqllogictest/Cargo.toml
@@ -79,7 +79,7 @@ postgres = [
     "tokio-postgres",
 ]
 parquet_encryption = [
-    "datafusion/parquet_encryption"
+    "datafusion/parquet_encryption",
 ]
 
 [dev-dependencies]
diff --git a/datafusion/sqllogictest/test_files/dynamic_filter_pushdown_config.slt b/datafusion/sqllogictest/test_files/dynamic_filter_pushdown_config.slt
index e5cd6d88b08f4..00696fc4fb4f4 100644
--- a/datafusion/sqllogictest/test_files/dynamic_filter_pushdown_config.slt
+++ b/datafusion/sqllogictest/test_files/dynamic_filter_pushdown_config.slt
@@ -89,7 +89,7 @@ logical_plan
 02)--TableScan: test_parquet projection=[id, value, name]
 physical_plan
 01)SortExec: TopK(fetch=3), expr=[value@1 DESC], preserve_partitioning=[false]
-02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/dynamic_filter_pushdown_config/test_data.parquet]]}, projection=[id, value, name], file_type=parquet, predicate=DynamicFilter [ empty ]
+02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/dynamic_filter_pushdown_config/test_data.parquet]]}, projection=[id, value, name], file_type=parquet, predicate=DynamicFilterPhysicalExpr [ true ]
 
 # Disable TopK dynamic filter pushdown
 statement ok
@@ -127,7 +127,7 @@ physical_plan
 02)--CoalesceBatchesExec: target_batch_size=8192
 03)----HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(id@0, id@0)], projection=[info@1, id@2, data@3]
 04)------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/dynamic_filter_pushdown_config/join_right.parquet]]}, projection=[id, info], file_type=parquet
-05)------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/dynamic_filter_pushdown_config/join_left.parquet]]}, projection=[id, data], file_type=parquet, predicate=DynamicFilter [ empty ]
+05)------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/dynamic_filter_pushdown_config/join_left.parquet]]}, projection=[id, data], file_type=parquet, predicate=DynamicFilterPhysicalExpr [ true ]
 
 # Disable Join dynamic filter pushdown
 statement ok
@@ -184,7 +184,7 @@ physical_plan
 02)--CoalesceBatchesExec: target_batch_size=8192
 03)----HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(id@0, id@0)], projection=[info@1, id@2, data@3]
 04)------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/dynamic_filter_pushdown_config/join_right.parquet]]}, projection=[id, info], file_type=parquet
-05)------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/dynamic_filter_pushdown_config/join_left.parquet]]}, projection=[id, data], file_type=parquet, predicate=DynamicFilter [ empty ]
+05)------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/dynamic_filter_pushdown_config/join_left.parquet]]}, projection=[id, data], file_type=parquet, predicate=DynamicFilterPhysicalExpr [ true ]
 
 # Enable TopK, disable Join
 statement ok
@@ -306,7 +306,7 @@ physical_plan
 02)--CoalesceBatchesExec: target_batch_size=8192
 03)----HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(id@0, id@0)], projection=[info@1, id@2, data@3]
 04)------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/dynamic_filter_pushdown_config/join_right.parquet]]}, projection=[id, info], file_type=parquet
-05)------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/dynamic_filter_pushdown_config/join_left.parquet]]}, projection=[id, data], file_type=parquet, predicate=DynamicFilter [ empty ]
+05)------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/dynamic_filter_pushdown_config/join_left.parquet]]}, projection=[id, data], file_type=parquet, predicate=DynamicFilterPhysicalExpr [ true ]
 
 # Cleanup
 
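The expected-plan updates above are a display change: before execution a dynamic filter has no constraint yet, so it now renders as DynamicFilterPhysicalExpr [ true ] (the always-true predicate) instead of DynamicFilter [ empty ]. Conceptually, a dynamic filter is a shared predicate slot that the TopK heap or join build side tightens at run time while the scan reads the latest value. A toy sketch of that idea follows (illustrative only, not DataFusion's DynamicFilterPhysicalExpr):

    use std::sync::{Arc, RwLock};

    #[derive(Clone)]
    struct DynamicFilter {
        // Display form of the current predicate; starts as the
        // always-true filter, which is what EXPLAIN prints.
        current: Arc<RwLock<String>>,
    }

    impl DynamicFilter {
        fn new() -> Self {
            Self {
                current: Arc::new(RwLock::new("true".to_string())),
            }
        }

        // The scan snapshots the latest predicate when it opens a file.
        fn snapshot(&self) -> String {
            self.current.read().unwrap().clone()
        }

        // The producer (TopK heap, join build side) swaps in a tighter bound.
        fn update(&self, predicate: &str) {
            *self.current.write().unwrap() = predicate.to_string();
        }
    }

    fn main() {
        let filter = DynamicFilter::new();
        // Before execution only the placeholder exists, hence "[ true ]".
        assert_eq!(filter.snapshot(), "true");
        // Once the TopK heap fills, the bound tightens and later files
        // or row groups can be pruned.
        filter.update("value >= 42");
        assert_eq!(filter.snapshot(), "value >= 42");
    }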
diff --git a/datafusion/sqllogictest/test_files/encrypted_parquet.slt b/datafusion/sqllogictest/test_files/encrypted_parquet.slt
index d580b7d1ad2b8..326d7f42d3c83 100644
--- a/datafusion/sqllogictest/test_files/encrypted_parquet.slt
+++ b/datafusion/sqllogictest/test_files/encrypted_parquet.slt
@@ -29,11 +29,11 @@ STORED AS PARQUET LOCATION 'test_files/scratch/encrypted_parquet/' OPTIONS (
     -- Encryption properties
     'format.crypto.file_encryption.encrypt_footer' 'true',
     'format.crypto.file_encryption.footer_key_as_hex' '30313233343536373839303132333435', -- b"0123456789012345"
-    'format.crypto.file_encryption.column_key_as_hex::double_field' '31323334353637383930313233343530', -- b"1234567890123450"
-    'format.crypto.file_encryption.column_key_as_hex::float_field' '31323334353637383930313233343531', -- b"1234567890123451"
+    'format.crypto.file_encryption.column_key_as_hex::double_field' '31323334353637383930313233343530', -- b"1234567890123450"
+    'format.crypto.file_encryption.column_key_as_hex::float_field' '31323334353637383930313233343531', -- b"1234567890123451"
     -- Decryption properties
-    'format.crypto.file_decryption.footer_key_as_hex' '30313233343536373839303132333435', -- b"0123456789012345"
-    'format.crypto.file_decryption.column_key_as_hex::double_field' '31323334353637383930313233343530', -- b"1234567890123450"
+    'format.crypto.file_decryption.footer_key_as_hex' '30313233343536373839303132333435', -- b"0123456789012345"
+    'format.crypto.file_decryption.column_key_as_hex::double_field' '31323334353637383930313233343530', -- b"1234567890123450"
     'format.crypto.file_decryption.column_key_as_hex::float_field' '31323334353637383930313233343531', -- b"1234567890123451"
 )
 
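The *_key_as_hex options above pass 16-byte AES keys hex-encoded, and the trailing SQL comments claim the decoded byte strings. A self-contained sketch to verify one of them (standard library only; from_hex is an illustrative helper, not a DataFusion API):

    // Decode a hex string into bytes, two hex digits per byte.
    fn from_hex(s: &str) -> Vec<u8> {
        s.as_bytes()
            .chunks(2)
            .map(|pair| {
                let hi = (pair[0] as char).to_digit(16).unwrap() as u8;
                let lo = (pair[1] as char).to_digit(16).unwrap() as u8;
                (hi << 4) | lo
            })
            .collect()
    }

    fn main() {
        let footer_key = from_hex("30313233343536373839303132333435");
        // 16 bytes, i.e. an AES-128-sized key, matching the comment.
        assert_eq!(footer_key, b"0123456789012345");
    }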
diff --git a/docs/source/user-guide/introduction.md b/docs/source/user-guide/introduction.md
index 040405f8f63e7..68164c1cbfedd 100644
--- a/docs/source/user-guide/introduction.md
+++ b/docs/source/user-guide/introduction.md
@@ -86,7 +86,7 @@ Here are some example systems built using DataFusion:
 By using DataFusion, projects are freed to focus on their specific
 features, and avoid reimplementing general (but still necessary)
 features such as an expression representation, standard optimizations,
-parellelized streaming execution plans, file format support, etc.
+parallelized streaming execution plans, file format support, etc.
 
 ## Known Users
 
diff --git a/typos.toml b/typos.toml
index 46f21febcf86b..09c5c55c452ab 100644
--- a/typos.toml
+++ b/typos.toml
@@ -34,6 +34,9 @@ alph = "alph"
 wih = "wih"
 Ded = "Ded"
 
+# From SLT README
+nteger = "nteger"
+
 [files]
 extend-exclude = [
     "*.slt",
@@ -42,5 +45,6 @@ extend-exclude = [
     "*.sql",
     "dev/changelog/**",
     "benchmarks/**",
-    "*.csv"
+    "*.csv",
+    "docs/source/contributor-guide/governance.md"
 ]