Fix dynamic shape pad #3840
Workflow file for this run
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| name: Test GB-25 | |
| on: | |
| # Run on pushes to `main`, but only when at least one of the paths below changed. | |
| push: | |
| branches: | |
| - main | |
| # `&paths` anchors this list so that the `pull_request` trigger below can reuse it. | |
| paths: &paths | |
| - '.github/workflows/test-gb-25.yml' | |
| - '**/BUILD' | |
| - '**/WORKSPACE' | |
| - '**/*.bzl' | |
| - 'patches/**' | |
| - 'src/**' | |
| - 'third_party/**' | |
| # Run on pull requests targeting `main`, with the same path filter as `push`. | |
| pull_request: | |
| branches: | |
| - main | |
| paths: *paths | |
| concurrency: | |
| # Skip intermediate builds: always. | |
| # Cancel intermediate builds: only if it is a pull request build. | |
| # Runs are grouped per workflow and per ref; `cancel-in-progress` evaluates to true | |
| # only when the ref starts with `refs/pull/`. | |
| group: ${{ github.workflow }}-${{ github.ref }} | |
| cancel-in-progress: ${{ startsWith(github.ref, 'refs/pull/') }} | |
| jobs: | |
| test-gb-25: | |
| # NOTE(review): there is no space before "- Reactant" in the display name below, unlike | |
| # the other separators. Confirm nothing matches on the exact job name before fixing it. | |
| name: 'Test GB-25 - ${{ matrix.os }} - julia ${{ matrix.julia_version }} - XLA ${{ matrix.xla_commit }} - GB-25 ${{ matrix.gb25_commit }}- Reactant ${{ matrix.reactant_commit }}' | |
| runs-on: ${{ matrix.os }} | |
| timeout-minutes: 150 | |
| container: | |
| # Use the digest-pinned Reactant image when the runner label contains 'linux'; | |
| # otherwise the expression yields an empty string. | |
| image: ${{ (contains(matrix.os, 'linux') && 'ghcr.io/enzymead/reactant-docker-images@sha256:cd45d851f5ea544f88d042eafefa53d948c229fffcab6189019324e3b02b505a' ) || '' }} | |
| strategy: | |
| # Let the remaining matrix entries keep running when one of them fails. | |
| fail-fast: false | |
| matrix: | |
| os: | |
| - linux-x86-a2-48-a100-4gpu | |
| - linux-x86-ct6e-180-4tpu | |
| julia_version: | |
| - '1.11' | |
| # An empty string skips the edit in the "Modify XLA commit" step, which only runs its | |
| # `sed` when this value is non-empty. | |
| xla_commit: | |
| - '' | |
| # - 'b25f3cbed2bc88c8ffef85f6a5319e2cf7b0454c' | |
| # Ref of PRONTOLab/GB-25 to check out. | |
| gb25_commit: | |
| - 'main' | |
| # - '0123456789abcdef0123456789abcdef01234567' | |
| # Ref of EnzymeAD/Reactant.jl to check out and to `Pkg.add` in the GB-25 environment. | |
| reactant_commit: | |
| - 'main' | |
| steps: | |
| # Print the output of `nvidia-smi`; only runs on runners whose label contains 'a100'. | |
| - name: Check GPUs | |
| if: ${{ contains(matrix.os, 'a100') }} | |
| run: | | |
| nvidia-smi | |
| # Build the suffix shared by all artifact names of this matrix entry and export it as | |
| # ARTIFACT_INDEX for the upload steps. | |
| - name: Set Artifacts index | |
| shell: bash | |
| run: | | |
| # Artifact names cannot contain any of the characters " : < > | * ? \ / (this is a | |
| # restriction of actions/upload-artifact), so replace each of them with an underscore. | |
| # The assignment is quoted so that a matrix value containing whitespace cannot split it. | |
| ARTIFACT_INDEX="${{ matrix.os }}-${{ matrix.julia_version }}-${{ matrix.xla_commit }}-${{ matrix.gb25_commit }}-${{ matrix.reactant_commit }}" | |
| ARTIFACT_INDEX="$(printf '%s' "${ARTIFACT_INDEX}" | sed 's#[/\\:"<>|*?]#_#g')" | |
| echo "ARTIFACT_INDEX=${ARTIFACT_INDEX}" >> "${GITHUB_ENV}" | |
| - uses: bazel-contrib/setup-bazel@0.16.0 | |
| name: Set up Bazel | |
| with: | |
| # Avoid downloading Bazel every time. | |
| bazelisk-cache: true | |
| # Store build cache per workflow only for macOS. | |
| # NOTE(review): every `os` entry in this job's matrix is a linux-* runner, so this | |
| # expression presumably always evaluates to `false` here — confirm. | |
| disk-cache: ${{ runner.os == 'macOS' && format('gb-25-{0}-{1}', github.workflow, matrix.os) || false }} | |
| # Do not share repository cache between workflows. | |
| repository-cache: false | |
| bazelisk-version: 1.x | |
| # Check out EnzymeAD/Reactant.jl at the matrix-selected ref into the `Reactant.jl` subdirectory. | |
| - name: Checkout Reactant.jl | |
| uses: actions/checkout@v6 | |
| with: | |
| repository: 'EnzymeAD/Reactant.jl' | |
| ref: ${{ matrix.reactant_commit }} | |
| path: 'Reactant.jl' | |
| # Export the absolute path of the Reactant.jl checkout as REACTANT_DIR for later steps. | |
| - name: Set REACTANT_DIR | |
| # We have to use `${GITHUB_WORKSPACE}` instead of `github.workspace` because GitHub | |
| # is terrible and the two don't match inside containers: | |
| # https://github.com/actions/runner/issues/2058 | |
| run: | | |
| REACTANT_DIR="${GITHUB_WORKSPACE}/Reactant.jl" | |
| # Make sure this directory exists, for good measure | |
| ls -lhrt "${REACTANT_DIR}" | |
| # Quote the redirection target, as the other steps writing to GITHUB_ENV/GITHUB_PATH do, | |
| # so that it keeps working if the path contains whitespace. | |
| echo "REACTANT_DIR=${REACTANT_DIR}" >> "${GITHUB_ENV}" | |
| # Install the Julia version selected by the matrix. | |
| - uses: julia-actions/setup-julia@v2 | |
| with: | |
| version: ${{ matrix.julia_version }} | |
| - name: Load Julia packages from cache | |
| uses: julia-actions/cache@v3 | |
| # Rewrite ENZYMEXLA_COMMIT in Reactant's Bazel WORKSPACE to the commit that triggered this run. | |
| - name: Modify Enzyme-JAX commit | |
| timeout-minutes: 1 | |
| run: | | |
| sed -i.bak 's/ENZYMEXLA_COMMIT = ".*"/ENZYMEXLA_COMMIT = "${{ github.sha }}"/' ReactantExtra/WORKSPACE | |
| cat ReactantExtra/WORKSPACE | |
| # `sed` exits successfully even when the pattern matched nothing. Fail the step in that | |
| # case instead of silently building whatever commit was pinned before. | |
| grep -qF 'ENZYMEXLA_COMMIT = "${{ github.sha }}"' ReactantExtra/WORKSPACE | |
| working-directory: ${{ env.REACTANT_DIR }}/deps | |
| # When the matrix provides an XLA commit, pass it as second argument of `xla_workspace(...)` | |
| # in Reactant's Bazel WORKSPACE; with an empty value the file is left untouched. | |
| - name: Modify XLA commit | |
| timeout-minutes: 1 | |
| shell: bash | |
| run: | | |
| if [[ -n "${{ matrix.xla_commit }}" ]]; then | |
| sed -E -i.bak \ | |
| -e 's/xla_workspace\(NEW_XLA_PATCHES(, .*)?\)/xla_workspace(NEW_XLA_PATCHES, "${{ matrix.xla_commit }}")/' \ | |
| ReactantExtra/WORKSPACE | |
| cat ReactantExtra/WORKSPACE | |
| # `sed` exits successfully even when the pattern matched nothing. Fail the step in that | |
| # case instead of silently building the default XLA commit. | |
| grep -qF 'xla_workspace(NEW_XLA_PATCHES, "${{ matrix.xla_commit }}")' ReactantExtra/WORKSPACE | |
| fi | |
| working-directory: ${{ env.REACTANT_DIR }}/deps | |
| # Instantiate the Julia project in Reactant.jl/deps, then run its `build_local.jl` script. | |
| - name: Build local libReactant | |
| timeout-minutes: 60 | |
| run: | | |
| julia --project --color=yes -e 'using Pkg; Pkg.instantiate()' | |
| julia --project --color=yes -O0 build_local.jl --cc=clang --gcc_host_compiler_path= --push-cache | |
| working-directory: ${{ env.REACTANT_DIR }}/deps | |
| # On the A100 runners, place libdevice.10.bc under bazel-bin/cuda/nvvm/libdevice by copying | |
| # it out of the runfiles tree, unless a file already exists at that location. | |
| - name: Copy libdevice | |
| if: ${{ contains(matrix.os, 'a100') }} | |
| timeout-minutes: 10 | |
| shell: bash | |
| working-directory: ${{ env.REACTANT_DIR }}/deps/ReactantExtra | |
| run: | | |
| LIBDEVICE_SRC=bazel-bin/libReactantExtra.so.runfiles/cuda_nvcc/nvvm/libdevice/libdevice.10.bc | |
| LIBDEVICE_DST=bazel-bin/cuda/nvvm/libdevice/libdevice.10.bc | |
| if [[ ! -e "${LIBDEVICE_DST}" ]]; then | |
| # Create the destination directory first, then copy the bitcode file into it. | |
| mkdir -pv "$(dirname "${LIBDEVICE_DST}")" | |
| cp -v "${LIBDEVICE_SRC}" "${LIBDEVICE_DST}" | |
| fi | |
| # Check out PRONTOLab/GB-25 at the matrix-selected ref into the `GB-25` subdirectory. | |
| - name: Checkout GB-25 | |
| uses: actions/checkout@v6 | |
| with: | |
| repository: 'PRONTOLab/GB-25' | |
| ref: ${{ matrix.gb25_commit }} | |
| path: 'GB-25' | |
| # Export the absolute path of the GB-25 checkout as GB25_DIR for later steps. | |
| - name: Set GB25_DIR | |
| # We have to use `${GITHUB_WORKSPACE}` instead of `github.workspace` because GitHub | |
| # is terrible and the two don't match inside containers: | |
| # https://github.com/actions/runner/issues/2058 | |
| run: | | |
| GB25_DIR="${GITHUB_WORKSPACE}/GB-25" | |
| # Make sure this directory exists, for good measure | |
| ls -lhrt "${GB25_DIR}" | |
| # Quote the redirection target, as the other steps writing to GITHUB_ENV/GITHUB_PATH do, | |
| # so that it keeps working if the path contains whitespace. | |
| echo "GB25_DIR=${GB25_DIR}" >> "${GITHUB_ENV}" | |
| # Export RUN_FLAGS, which the compile/run/correctness steps append to their julia command | |
| # line: 64-bit precision on the A100 runners, 32-bit precision on the TPU runners. | |
| - name: Set default precision (GPU) | |
| if: ${{ contains(matrix.os, 'a100') }} | |
| run: | | |
| # The redirection target is quoted for consistency with the other GITHUB_ENV writers. | |
| echo "RUN_FLAGS=--precision=64" >> "${GITHUB_ENV}" | |
| - name: Set default precision (TPU) | |
| if: ${{ contains(matrix.os, 'tpu') }} | |
| run: | | |
| # The redirection target is quoted for consistency with the other GITHUB_ENV writers. | |
| echo "RUN_FLAGS=--precision=32" >> "${GITHUB_ENV}" | |
| # Prepare the GB-25 Julia environment: copy Reactant's LocalPreferences.toml, optionally add | |
| # the requested Reactant revision, instantiate, and garbage-collect the depot. | |
| - name: Instantiate GB-25 environment | |
| timeout-minutes: 40 | |
| working-directory: ${{ env.GB25_DIR }} | |
| shell: julia --project --color=yes {0} | |
| run: | | |
| using Pkg | |
| # The preference file next to Reactant.jl points to the newly built libReactant; reuse it here | |
| cp("${{ env.REACTANT_DIR }}/LocalPreferences.toml", "${{ env.GB25_DIR }}/LocalPreferences.toml") | |
| # Only add Reactant explicitly when the matrix names a revision | |
| reactant_rev = "${{ matrix.reactant_commit }}" | |
| isempty(reactant_rev) || Pkg.add(PackageSpec(; name="Reactant", rev=reactant_rev)) | |
| # Install everything recorded in the environment | |
| Pkg.instantiate() | |
| # Reclaim depot space where possible | |
| Pkg.gc() | |
| # Upload the resolved Manifest.toml and Project.toml of the GB-25 environment. | |
| - name: Upload Julia project environment | |
| uses: actions/upload-artifact@v7 | |
| timeout-minutes: 10 | |
| # NOTE(review): this step uses `always()` while the other upload steps in this job use | |
| # `!cancelled()` — confirm the difference is intentional. | |
| if: ${{ always() }} | |
| with: | |
| name: 'julia-environment-${{ env.ARTIFACT_INDEX }}' | |
| path: | | |
| ${{ env.GB25_DIR }}/Manifest.toml | |
| ${{ env.GB25_DIR }}/Project.toml | |
| retention-days: 90 | |
| overwrite: false | |
| # Install the `mpiexecjl` wrapper into ${GITHUB_WORKSPACE}/bin and append that directory to | |
| # GITHUB_PATH; the compile, run and correctness steps invoke `mpiexecjl` by name. | |
| - name: Install mpiexecjl | |
| run: | | |
| MPIEXECJL_DIR="${GITHUB_WORKSPACE}/bin" | |
| julia --project --color=yes -e "using MPI; MPI.install_mpiexecjl(; destdir=\"${MPIEXECJL_DIR}\")" | |
| echo "${MPIEXECJL_DIR}" >> "${GITHUB_PATH}" | |
| working-directory: ${{ env.GB25_DIR }} | |
| # Run the sharded simulation's compile script with XLA dumping into `xla_dump_compile`. | |
| - name: Compile GB-25 simulation | |
| timeout-minutes: 30 | |
| run: | | |
| export XLA_FLAGS='--xla_dump_to=${{ env.GB25_DIR }}/xla_dump_compile' | |
| # `timeout` sends TERM after 29 minutes, one minute before the step's own 30-minute | |
| # limit — presumably so the process is signalled before the runner kills the step. | |
| timeout --signal=TERM --verbose 29m mpiexecjl -np 1 julia --color=yes --project -O0 --startup-file=no --threads=16 --compiled-modules=strict sharding/sharded_baroclinic_instability_simulation_compile.jl ${{ env.RUN_FLAGS }} | |
| working-directory: ${{ env.GB25_DIR }} | |
| # Report every collective operation left in the optimised XLA modules and fail the step | |
| # when the count for any operation exceeds the per-platform threshold. | |
| - name: Show remaining collective operations | |
| timeout-minutes: 10 | |
| shell: bash | |
| # Search only the GB-25 checkout, like the other GB-25 steps, rather than the whole workspace. | |
| working-directory: ${{ env.GB25_DIR }} | |
| run: | | |
| OK="true" | |
| # Maximum tolerated number of matching lines per operation, by platform. | |
| if [[ '${{ contains(matrix.os, 'a100') }}' == 'true' ]]; then | |
| ALL_TO_ALL_THRESHOLD=0 | |
| ALL_GATHER_THRESHOLD=0 | |
| ALL_REDUCE_THRESHOLD=0 | |
| COLLECTIVE_PERMUTE_THRESHOLD=345 | |
| elif [[ '${{ contains(matrix.os, 'tpu') }}' == 'true' ]]; then | |
| ALL_TO_ALL_THRESHOLD=0 | |
| ALL_GATHER_THRESHOLD=0 | |
| ALL_REDUCE_THRESHOLD=19 | |
| COLLECTIVE_PERMUTE_THRESHOLD=1076 | |
| else | |
| # Unrecognised platform: be explicit and strict instead of comparing against unset variables. | |
| ALL_TO_ALL_THRESHOLD=0 | |
| ALL_GATHER_THRESHOLD=0 | |
| ALL_REDUCE_THRESHOLD=0 | |
| COLLECTIVE_PERMUTE_THRESHOLD=0 | |
| fi | |
| for OP in "all-to-all" "all-gather" "all-reduce" "%collective-permute-start.* ="; do | |
| # Pick the threshold that applies to this operation. | |
| case "${OP}" in | |
| "all-to-all") THRESHOLD=${ALL_TO_ALL_THRESHOLD} ;; | |
| "all-gather") THRESHOLD=${ALL_GATHER_THRESHOLD} ;; | |
| "all-reduce") THRESHOLD=${ALL_REDUCE_THRESHOLD} ;; | |
| *"collective-permute"*) THRESHOLD=${COLLECTIVE_PERMUTE_THRESHOLD} ;; | |
| esac | |
| COLLECTIVES=$(find . -name 'optimised_sharded_baroclinic_instability*.xla' -exec grep --with-filename --line-number --extended-regexp "${OP}" '{}' \;) | |
| if [[ -n "${COLLECTIVES}" ]]; then | |
| # Count only once we know there is at least one match: `echo ""` would count as one line. | |
| NUM_COLLECTIVES=$(echo "${COLLECTIVES}" | wc -l | xargs) | |
| if [[ ${NUM_COLLECTIVES} -gt ${THRESHOLD} ]]; then | |
| # Over the threshold: annotate as an error and fail the step once all operations are listed. | |
| LEVEL="error" | |
| OK="false" | |
| else | |
| # Within the threshold: still worth showing, but it must not look like a failure. | |
| LEVEL="warning" | |
| fi | |
| MSG="There are ${NUM_COLLECTIVES} remaining ${OP} operations (threshold: ${THRESHOLD})" | |
| echo "::${LEVEL} title=${{ matrix.os }} - Remaining ${OP} Operations:: ${MSG}" | |
| echo | |
| echo "----------" | |
| echo "${COLLECTIVES}" | |
| echo "----------" | |
| fi | |
| done | |
| if [[ "${OK}" == "false" ]]; then | |
| exit 1 | |
| fi | |
| # Run the sharded simulation's run script with XLA dumping into `xla_dump_run`. | |
| - name: Run GB-25 simulation | |
| timeout-minutes: 60 | |
| run: | | |
| export XLA_FLAGS='--xla_dump_to=${{ env.GB25_DIR }}/xla_dump_run' | |
| # `timeout` sends TERM after 59 minutes, one minute before the step's own 60-minute limit. | |
| timeout --signal=TERM --verbose 59m mpiexecjl -np 1 julia --color=yes --project -O0 --startup-file=no --threads=16 --compiled-modules=strict sharding/sharded_baroclinic_instability_simulation_run.jl ${{ env.RUN_FLAGS }} | |
| working-directory: ${{ env.GB25_DIR }} | |
| # Locate the first `.xplane.pb` file below sharding/profiling and display its contents via | |
| # `Reactant.Profiler.load_xplane_file`. | |
| - name: Display profile results | |
| working-directory: ${{ env.GB25_DIR }} | |
| shell: julia --project --color=yes {0} | |
| run: | | |
| using Reactant | |
| # First entry of sharding/profiling, then its loop2/plugins/profile subdirectory | |
| profiling_root = joinpath(pwd(), "sharding", "profiling") | |
| top_entry = first(readdir(profiling_root; join=true)) | |
| profile_root = joinpath(top_entry, "loop2", "plugins", "profile") | |
| # First entry inside that directory holds the trace files | |
| trace_dir = first(readdir(profile_root; join=true)) | |
| xplane_candidates = filter(endswith(".xplane.pb"), readdir(trace_dir; join=true)) | |
| display(Reactant.Profiler.load_xplane_file(first(xplane_candidates))) | |
| # Run GB-25's correctness script for the sharded simulation. | |
| - name: Test correctness in GB-25 code | |
| timeout-minutes: 20 | |
| run: | | |
| # `timeout` sends TERM after 19 minutes, one minute before the step's own 20-minute limit. | |
| timeout --signal=TERM --verbose 19m mpiexecjl -np 1 julia --color=yes --project -O0 --startup-file=no --threads=16 --compiled-modules=strict correctness/correctness_sharded_baroclinic_instability_simulation_run.jl ${{ env.RUN_FLAGS }} | |
| working-directory: ${{ env.GB25_DIR }} | |
| # The three upload steps below run unless the workflow was cancelled (`!cancelled()`), keep | |
| # their artifacts for 90 days, and are named after ARTIFACT_INDEX. | |
| # Upload every `*.mlir` and `optimised_*.xla` file found below the GB-25 checkout. | |
| - name: Upload MLIR and XLA modules | |
| uses: actions/upload-artifact@v7 | |
| timeout-minutes: 10 | |
| if: ${{ !cancelled() }} | |
| with: | |
| name: 'simulation-mlir-${{ env.ARTIFACT_INDEX }}' | |
| path: | | |
| ${{ env.GB25_DIR }}/**/*.mlir | |
| ${{ env.GB25_DIR }}/**/optimised_*.xla | |
| retention-days: 90 | |
| overwrite: false | |
| # Upload everything matching `xla_dump*`; the compile and run steps point `--xla_dump_to` | |
| # at `xla_dump_compile` and `xla_dump_run`. | |
| - name: Upload XLA dump | |
| uses: actions/upload-artifact@v7 | |
| timeout-minutes: 20 | |
| if: ${{ !cancelled() }} | |
| with: | |
| name: 'simulation-xla-dump-${{ env.ARTIFACT_INDEX }}' | |
| path: '${{ env.GB25_DIR }}/**/xla_dump*' | |
| retention-days: 90 | |
| overwrite: false | |
| # Upload every `plugins` directory below the GB-25 checkout; the "Display profile results" | |
| # step reads its `.xplane.pb` file from under such a directory. | |
| - name: Upload XLA profiler traces | |
| uses: actions/upload-artifact@v7 | |
| timeout-minutes: 10 | |
| if: ${{ !cancelled() }} | |
| with: | |
| name: 'simulation-xla-profile-${{ env.ARTIFACT_INDEX }}' | |
| path: '${{ env.GB25_DIR }}/**/plugins' | |
| retention-days: 90 | |
| overwrite: false | |