docs: Stop generating dynamic docs content in build (#3212) #363
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
name: PR Build (Linux)
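# Cancel any in-flight run for the same branch when a new push arrives: two
# quick pushes to the same PR branch resolve to the same group key, so the
# older run is cancelled instead of wasting runner time.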
concurrency:
group: ${{ github.repository }}-${{ github.head_ref || github.sha }}-${{ github.workflow }}
cancel-in-progress: true
on:
push:
paths-ignore:
- "doc/**"
- "docs/**"
- "**.md"
- "native/core/benches/**"
- "native/spark-expr/benches/**"
- "spark/src/test/scala/org/apache/spark/sql/benchmark/**"
pull_request:
paths-ignore:
- "doc/**"
- "docs/**"
- "**.md"
- "native/core/benches/**"
- "native/spark-expr/benches/**"
- "spark/src/test/scala/org/apache/spark/sql/benchmark/**"
# manual trigger
# https://docs.github.com/en/actions/managing-workflow-runs/manually-running-a-workflow
workflow_dispatch:
env:
RUST_VERSION: stable
jobs:
# Fast lint check - gates all other jobs
lint:
name: Lint
runs-on: ubuntu-latest
container:
image: amd64/rust
steps:
- uses: actions/checkout@v6
- name: Check Rust formatting
run: |
rustup component add rustfmt
cd native && cargo fmt --all -- --check
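# The same check can be run locally before pushing:
#   cd native && cargo fmt --all -- --check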
# Build native library once and share with all test jobs
build-native:
needs: lint
name: Build Native Library
runs-on: ubuntu-latest
container:
image: amd64/rust
steps:
- uses: actions/checkout@v6
- name: Setup Rust toolchain
uses: ./.github/actions/setup-builder
with:
rust-version: ${{ env.RUST_VERSION }}
jdk-version: 17 # a JDK is only needed for proto generation in the common module
- name: Restore Cargo cache
uses: actions/cache/restore@v4
with:
path: |
~/.cargo/registry
~/.cargo/git
native/target
key: ${{ runner.os }}-cargo-ci-${{ hashFiles('native/**/Cargo.lock', 'native/**/Cargo.toml') }}
restore-keys: |
${{ runner.os }}-cargo-ci-
- name: Build native library (CI profile)
run: |
cd native
# CI profile: same overflow behavior as release, but faster compilation
# (no LTO, parallel codegen)
cargo build --profile ci
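# Custom profiles like 'ci' are declared in native/Cargo.toml. A minimal
# sketch of what such a profile could look like (assumed for illustration,
# not copied from the actual manifest):
#   [profile.ci]
#   inherits = "release"   # keeps release semantics, incl. overflow behavior
#   lto = false            # skip link-time optimization
#   codegen-units = 16     # allow parallel codegen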
- name: Upload native library
uses: actions/upload-artifact@v4
with:
name: native-lib-linux
path: native/target/ci/libcomet.so
retention-days: 1
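# One day of retention is enough: downstream jobs in this same workflow run
# download the artifact immediately, so it never needs to outlive the run.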
- name: Save Cargo cache
uses: actions/cache/save@v4
if: github.ref == 'refs/heads/main'
with:
path: |
~/.cargo/registry
~/.cargo/git
native/target
key: ${{ runner.os }}-cargo-ci-${{ hashFiles('native/**/Cargo.lock', 'native/**/Cargo.toml') }}
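# The restore/save split (instead of a single actions/cache step) means PR
# runs read the cache but only builds on main write it, which keeps branch
# builds from churning the shared cache.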
# Run Rust tests (runs in parallel with build-native, uses debug builds)
linux-test-rust:
needs: lint
name: ubuntu-latest/rust-test
runs-on: ubuntu-latest
container:
image: amd64/rust
steps:
- uses: actions/checkout@v6
- name: Setup Rust & Java toolchain
uses: ./.github/actions/setup-builder
with:
rust-version: ${{ env.RUST_VERSION }}
jdk-version: 17
- name: Restore Cargo cache
uses: actions/cache/restore@v4
with:
path: |
~/.cargo/registry
~/.cargo/git
native/target
# Note: the Java version is intentionally excluded from the cache key - the Rust target is JDK-independent
key: ${{ runner.os }}-cargo-debug-${{ hashFiles('native/**/Cargo.lock', 'native/**/Cargo.toml') }}
restore-keys: |
${{ runner.os }}-cargo-debug-
- name: Rust test steps
uses: ./.github/actions/rust-test
- name: Save Cargo cache
uses: actions/cache/save@v4
if: github.ref == 'refs/heads/main'
with:
path: |
~/.cargo/registry
~/.cargo/git
native/target
key: ${{ runner.os }}-cargo-debug-${{ hashFiles('native/**/Cargo.lock', 'native/**/Cargo.toml') }}
linux-test:
needs: build-native
strategy:
matrix:
os: [ubuntu-latest]
# The goal with these profiles is to cover all supported Java, Scala, and Spark
# versions without testing every possible combination, which would be overkill
profile:
- name: "Spark 3.4, JDK 11, Scala 2.12"
java_version: "11"
maven_opts: "-Pspark-3.4 -Pscala-2.12"
scan_impl: "native_comet"
- name: "Spark 3.5.5, JDK 17, Scala 2.13"
java_version: "17"
maven_opts: "-Pspark-3.5 -Dspark.version=3.5.5 -Pscala-2.13"
scan_impl: "native_comet"
- name: "Spark 3.5.6, JDK 17, Scala 2.13"
java_version: "17"
maven_opts: "-Pspark-3.5 -Dspark.version=3.5.6 -Pscala-2.13"
scan_impl: "native_comet"
- name: "Spark 3.5, JDK 17, Scala 2.12 native_datafusion"
java_version: "17"
maven_opts: "-Pspark-3.5 -Pscala-2.12"
scan_impl: "native_datafusion"
- name: "Spark 3.5, JDK 17, Scala 2.12 native_iceberg_compat"
java_version: "17"
maven_opts: "-Pspark-3.5 -Pscala-2.12"
scan_impl: "native_iceberg_compat"
- name: "Spark 4.0, JDK 17"
java_version: "17"
maven_opts: "-Pspark-4.0"
scan_impl: "native_comet"
suite:
- name: "fuzz"
value: |
org.apache.comet.CometFuzzTestSuite
org.apache.comet.CometFuzzAggregateSuite
org.apache.comet.CometFuzzIcebergSuite
org.apache.comet.CometFuzzMathSuite
org.apache.comet.DataGeneratorSuite
- name: "shuffle"
value: |
org.apache.comet.exec.CometShuffleSuite
org.apache.comet.exec.CometShuffle4_0Suite
org.apache.comet.exec.CometNativeShuffleSuite
org.apache.comet.exec.CometShuffleEncryptionSuite
org.apache.comet.exec.CometShuffleManagerSuite
org.apache.comet.exec.CometAsyncShuffleSuite
org.apache.comet.exec.DisableAQECometShuffleSuite
org.apache.comet.exec.DisableAQECometAsyncShuffleSuite
org.apache.spark.shuffle.sort.SpillSorterSuite
- name: "parquet"
value: |
org.apache.comet.parquet.CometParquetWriterSuite
org.apache.comet.parquet.ParquetReadV1Suite
org.apache.comet.parquet.ParquetReadV2Suite
org.apache.comet.parquet.ParquetReadFromFakeHadoopFsSuite
org.apache.spark.sql.comet.ParquetDatetimeRebaseV1Suite
org.apache.spark.sql.comet.ParquetDatetimeRebaseV2Suite
org.apache.spark.sql.comet.ParquetEncryptionITCase
org.apache.comet.exec.CometNativeReaderSuite
org.apache.comet.CometIcebergNativeSuite
- name: "csv"
value: |
org.apache.comet.csv.CometCsvNativeReadSuite
- name: "exec"
value: |
org.apache.comet.exec.CometAggregateSuite
org.apache.comet.exec.CometExec3_4PlusSuite
org.apache.comet.exec.CometExecSuite
org.apache.comet.exec.CometGenerateExecSuite
org.apache.comet.exec.CometWindowExecSuite
org.apache.comet.exec.CometJoinSuite
org.apache.comet.CometNativeSuite
org.apache.comet.CometSparkSessionExtensionsSuite
org.apache.spark.CometPluginsSuite
org.apache.spark.CometPluginsDefaultSuite
org.apache.spark.CometPluginsNonOverrideSuite
org.apache.spark.CometPluginsUnifiedModeOverrideSuite
org.apache.comet.rules.CometScanRuleSuite
org.apache.comet.rules.CometExecRuleSuite
org.apache.spark.sql.CometTPCDSQuerySuite
org.apache.spark.sql.CometTPCDSQueryTestSuite
org.apache.spark.sql.CometTPCHQuerySuite
org.apache.spark.sql.comet.CometTPCDSV1_4_PlanStabilitySuite
org.apache.spark.sql.comet.CometTPCDSV2_7_PlanStabilitySuite
org.apache.spark.sql.comet.CometTaskMetricsSuite
org.apache.comet.objectstore.NativeConfigSuite
- name: "expressions"
value: |
org.apache.comet.CometExpressionSuite
org.apache.comet.CometExpressionCoverageSuite
org.apache.comet.CometHashExpressionSuite
org.apache.comet.CometTemporalExpressionSuite
org.apache.comet.CometArrayExpressionSuite
org.apache.comet.CometCastSuite
org.apache.comet.CometMathExpressionSuite
org.apache.comet.CometStringExpressionSuite
org.apache.comet.CometBitwiseExpressionSuite
org.apache.comet.CometMapExpressionSuite
org.apache.comet.CometJsonExpressionSuite
org.apache.comet.expressions.conditional.CometIfSuite
org.apache.comet.expressions.conditional.CometCoalesceSuite
org.apache.comet.expressions.conditional.CometCaseWhenSuite
- name: "sql"
value: |
org.apache.spark.sql.CometToPrettyStringSuite
fail-fast: false
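# The matrix expands to 6 profiles x 7 suites = 42 jobs; fail-fast: false
# lets the remaining combinations finish even when one of them fails.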
name: ${{ matrix.os }}/${{ matrix.profile.name }} [${{ matrix.suite.name }}]
runs-on: ${{ matrix.os }}
container:
image: amd64/rust
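# JDK 17 enforces strong encapsulation of JDK internals, so Spark needs the
# --add-exports/--add-opens flags below; on JDK 11 the expression resolves
# to an empty string and no extra flags are passed.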
env:
JAVA_TOOL_OPTIONS: ${{ matrix.profile.java_version == '17' && '--add-exports=java.base/sun.nio.ch=ALL-UNNAMED --add-exports=java.base/sun.util.calendar=ALL-UNNAMED --add-opens=java.base/java.nio=ALL-UNNAMED --add-opens=java.base/java.lang=ALL-UNNAMED' || '' }}
steps:
- uses: actions/checkout@v6
- name: Setup Rust & Java toolchain
uses: ./.github/actions/setup-builder
with:
rust-version: ${{ env.RUST_VERSION }}
jdk-version: ${{ matrix.profile.java_version }}
- name: Download native library
uses: actions/download-artifact@v4
with:
name: native-lib-linux
# Download to release/ since Maven's -Prelease expects libcomet.so there
path: native/target/release/
# Restore cargo registry cache (for any cargo commands that might run)
- name: Cache Cargo registry
uses: actions/cache@v4
with:
path: |
~/.cargo/registry
~/.cargo/git
key: ${{ runner.os }}-cargo-registry-${{ hashFiles('native/**/Cargo.lock') }}
restore-keys: |
${{ runner.os }}-cargo-registry-
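# The 'suites' expression below passes an empty list for the 'sql' suite on
# the "Spark 3.4, JDK 11, Scala 2.12" profile (presumably that suite does
# not apply there); the negated condition avoids GitHub's cond && a || b
# pitfall, where an empty-string 'a' would fall through to 'b'.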
- name: Java test steps
uses: ./.github/actions/java-test
with:
artifact_name: ${{ matrix.os }}-${{ matrix.profile.name }}-${{ matrix.suite.name }}-${{ github.run_id }}-${{ github.run_number }}-${{ github.run_attempt }}
suites: ${{ !(matrix.suite.name == 'sql' && matrix.profile.name == 'Spark 3.4, JDK 11, Scala 2.12') && matrix.suite.value || '' }}
maven_opts: ${{ matrix.profile.maven_opts }}
scan_impl: ${{ matrix.profile.scan_impl }}
upload-test-reports: true
skip-native-build: true
# TPC-H correctness test - verifies benchmark queries produce correct results
verify-benchmark-results-tpch:
needs: build-native
name: Verify TPC-H Results
runs-on: ubuntu-latest
container:
image: amd64/rust
steps:
- uses: actions/checkout@v6
- name: Setup Rust & Java toolchain
uses: ./.github/actions/setup-builder
with:
rust-version: ${{ env.RUST_VERSION }}
jdk-version: 11
- name: Download native library
uses: actions/download-artifact@v4
with:
name: native-lib-linux
path: native/target/release/
- name: Cache Maven dependencies
uses: actions/cache@v4
with:
path: |
~/.m2/repository
/root/.m2/repository
key: ${{ runner.os }}-java-maven-${{ hashFiles('**/pom.xml') }}
restore-keys: |
${{ runner.os }}-java-maven-
- name: Cache TPC-H data
id: cache-tpch
uses: actions/cache@v4
with:
path: ./tpch
key: tpch-${{ hashFiles('.github/workflows/pr_build_linux.yml') }}
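# Keying the dataset cache on this workflow file forces regeneration
# whenever the workflow changes, e.g. if the scale factor below is bumped.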
- name: Build project
run: |
./mvnw -B -Prelease install -DskipTests
- name: Generate TPC-H data (SF=1)
if: steps.cache-tpch.outputs.cache-hit != 'true'
run: |
cd spark && MAVEN_OPTS='-Xmx20g' ../mvnw -B -Prelease exec:java -Dexec.mainClass="org.apache.spark.sql.GenTPCHData" -Dexec.classpathScope="test" -Dexec.cleanupDaemonThreads="false" -Dexec.args="--location `pwd`/.. --scaleFactor 1 --numPartitions 1 --overwrite"
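# The generator writes under ./tpch (the cached path above); the query step
# below points SPARK_TPCH_DATA at the sf1_parquet subdirectory it produces.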
- name: Run TPC-H queries
run: |
SPARK_HOME=`pwd` SPARK_TPCH_DATA=`pwd`/tpch/sf1_parquet ./mvnw -B -Prelease -Dsuites=org.apache.spark.sql.CometTPCHQuerySuite test
# TPC-DS correctness tests - verify that benchmark queries produce correct results
verify-benchmark-results-tpcds:
needs: build-native
name: Verify TPC-DS Results (${{ matrix.join }})
runs-on: ubuntu-latest
container:
image: amd64/rust
strategy:
matrix:
join: [sort_merge, broadcast, hash]
fail-fast: false
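# Each join value selects one SPARK_TPCDS_JOIN_CONF below: sort_merge
# disables auto-broadcast and prefers sort-merge joins, broadcast restores
# Spark's default 10 MB auto-broadcast threshold, and hash forces shuffled
# hash joins.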
steps:
- uses: actions/checkout@v6
- name: Setup Rust & Java toolchain
uses: ./.github/actions/setup-builder
with:
rust-version: ${{ env.RUST_VERSION }}
jdk-version: 11
- name: Download native library
uses: actions/download-artifact@v4
with:
name: native-lib-linux
path: native/target/release/
- name: Cache Maven dependencies
uses: actions/cache@v4
with:
path: |
~/.m2/repository
/root/.m2/repository
key: ${{ runner.os }}-java-maven-${{ hashFiles('**/pom.xml') }}
restore-keys: |
${{ runner.os }}-java-maven-
- name: Cache TPC-DS data
id: cache-tpcds
uses: actions/cache@v4
with:
path: ./tpcds-sf-1
key: tpcds-${{ hashFiles('.github/workflows/pr_build_linux.yml') }}
- name: Build project
run: |
./mvnw -B -Prelease install -DskipTests
- name: Checkout tpcds-kit
if: steps.cache-tpcds.outputs.cache-hit != 'true'
uses: actions/checkout@v6
with:
repository: databricks/tpcds-kit
path: ./tpcds-kit
- name: Build tpcds-kit
if: steps.cache-tpcds.outputs.cache-hit != 'true'
run: |
apt-get update && apt-get install -y yacc bison flex gcc-12 g++-12
update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 120 --slave /usr/bin/g++ g++ /usr/bin/g++-12
cd tpcds-kit/tools && make OS=LINUX
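# dsdgen is built from source here because tpcds-kit ships no prebuilt
# binaries; gcc-12 is installed explicitly, presumably because the kit's
# legacy C does not build cleanly with the image's default compiler.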
- name: Generate TPC-DS data (SF=1)
if: steps.cache-tpcds.outputs.cache-hit != 'true'
run: |
cd spark && MAVEN_OPTS='-Xmx20g' ../mvnw -B -Prelease exec:java -Dexec.mainClass="org.apache.spark.sql.GenTPCDSData" -Dexec.classpathScope="test" -Dexec.cleanupDaemonThreads="false" -Dexec.args="--dsdgenDir `pwd`/../tpcds-kit/tools --location `pwd`/../tpcds-sf-1 --scaleFactor 1 --numPartitions 1"
- name: Run TPC-DS queries (Sort merge join)
if: matrix.join == 'sort_merge'
run: |
SPARK_HOME=`pwd` SPARK_TPCDS_DATA=`pwd`/tpcds-sf-1 ./mvnw -B -Prelease -Dsuites=org.apache.spark.sql.CometTPCDSQuerySuite test
env:
SPARK_TPCDS_JOIN_CONF: |
spark.sql.autoBroadcastJoinThreshold=-1
spark.sql.join.preferSortMergeJoin=true
- name: Run TPC-DS queries (Broadcast hash join)
if: matrix.join == 'broadcast'
run: |
SPARK_HOME=`pwd` SPARK_TPCDS_DATA=`pwd`/tpcds-sf-1 ./mvnw -B -Prelease -Dsuites=org.apache.spark.sql.CometTPCDSQuerySuite test
env:
SPARK_TPCDS_JOIN_CONF: |
spark.sql.autoBroadcastJoinThreshold=10485760
- name: Run TPC-DS queries (Shuffled hash join)
if: matrix.join == 'hash'
run: |
SPARK_HOME=`pwd` SPARK_TPCDS_DATA=`pwd`/tpcds-sf-1 ./mvnw -B -Prelease -Dsuites=org.apache.spark.sql.CometTPCDSQuerySuite test
env:
SPARK_TPCDS_JOIN_CONF: |
spark.sql.autoBroadcastJoinThreshold=-1
spark.sql.join.forceApplyShuffledHashJoin=true