Fix dynamic shape pad #3840

Workflow file for this run

.github/workflows/test-gb-25.yml at cdb3b9f

	# CI workflow that runs the GB-25 test job on pushes and pull requests to `main`.
	name: Test GB-25

	on:
	  push:
	    branches:
	      - main
	    # Only trigger when the workflow itself, Bazel build files, patches or
	    # sources change. Anchored so `pull_request` below reuses the same list.
	    paths: &paths
	      - '.github/workflows/test-gb-25.yml'
	      - '**/BUILD'
	      - '**/WORKSPACE'
	      - '**/*.bzl'
	      - 'patches/**'
	      - 'src/**'
	      - 'third_party/**'
	  pull_request:
	    branches:
	      - main
	    paths: *paths

	concurrency:
	  # Skip intermediate builds: always.
	  # Cancel intermediate builds: only if it is a pull request build.
	  group: ${{ github.workflow }}-${{ github.ref }}
	  cancel-in-progress: ${{ startsWith(github.ref, 'refs/pull/') }}

	jobs:
	  # Single job, fanned out over the matrix below: builds libReactant against
	  # the commit under test, then compiles, runs and checks the GB-25 simulation.
	  test-gb-25:
	    name: 'Test GB-25 - ${{ matrix.os }} - julia ${{ matrix.julia_version }} - XLA ${{ matrix.xla_commit }} - GB-25 ${{ matrix.gb25_commit }}- Reactant ${{ matrix.reactant_commit }}'
	    runs-on: ${{ matrix.os }}
	    timeout-minutes: 150

	    container:
	      # Pinned image on runners whose label contains 'linux'; empty string
	      # (no image) otherwise.
	      image: ${{ (contains(matrix.os, 'linux') && 'ghcr.io/enzymead/reactant-docker-images@sha256:cd45d851f5ea544f88d042eafefa53d948c229fffcab6189019324e3b02b505a' ) || '' }}

	    strategy:
	      fail-fast: false
	      matrix:
	        os:
	          - linux-x86-a2-48-a100-4gpu
	          - linux-x86-ct6e-180-4tpu
	        julia_version:
	          - '1.11'
	        xla_commit:
	          # Empty string: the "Modify XLA commit" step leaves WORKSPACE untouched.
	          - ''
	          # - 'b25f3cbed2bc88c8ffef85f6a5319e2cf7b0454c'
	        gb25_commit:
	          - 'main'
	          # - '0123456789abcdef0123456789abcdef01234567'
	        reactant_commit:
	          - 'main'

	    steps:
	      - name: Check GPUs
	        if: ${{ contains(matrix.os, 'a100') }}
	        run: |
	          nvidia-smi
	      - name: Set Artifacts index
	        shell: bash
	        run: |
	          # Artifact names cannot include forward slashes and some other characters, here we
	          # do some sanitisation.
	          ARTIFACT_INDEX=${{ matrix.os }}-${{ matrix.julia_version }}-${{ matrix.xla_commit }}-${{ matrix.gb25_commit }}-${{ matrix.reactant_commit }}
	          echo "ARTIFACT_INDEX=${ARTIFACT_INDEX//\//_}" >> "${GITHUB_ENV}"
	      - uses: bazel-contrib/setup-bazel@0.16.0
	        name: Set up Bazel
	        with:
	          # Avoid downloading Bazel every time.
	          bazelisk-cache: true
	          # Store build cache per workflow only for macOS.
	          disk-cache: ${{ runner.os == 'macOS' && format('gb-25-{0}-{1}', github.workflow, matrix.os) || false }}
	          # Do not share repository cache between workflows.
	          repository-cache: false
	          bazelisk-version: 1.x
	      - name: Checkout Reactant.jl
	        uses: actions/checkout@v6
	        with:
	          repository: 'EnzymeAD/Reactant.jl'
	          ref: ${{ matrix.reactant_commit }}
	          path: 'Reactant.jl'
	      - name: Set REACTANT_DIR
	        # We have to use `${GITHUB_WORKSPACE}` instead of `github.workspace` because GitHub
	        # is terrible and the two don't match inside containers:
	        # https://github.com/actions/runner/issues/2058
	        run: |
	          REACTANT_DIR=${GITHUB_WORKSPACE}/Reactant.jl
	          # Make sure this directory exists, for good measure
	          ls -lhrt "${REACTANT_DIR}"
	          echo "REACTANT_DIR=${REACTANT_DIR}" >> "${GITHUB_ENV}"
	      - uses: julia-actions/setup-julia@v2
	        with:
	          version: ${{ matrix.julia_version }}
	      - name: Load Julia packages from cache
	        uses: julia-actions/cache@v3
	      # Rewrite ENZYMEXLA_COMMIT in Reactant's WORKSPACE to the commit that
	      # triggered this run, then print the file for the log.
	      - name: Modify Enzyme-JAX commit
	        timeout-minutes: 1
	        run: |
	          sed -i.bak 's/ENZYMEXLA_COMMIT = ".*"/ENZYMEXLA_COMMIT = "${{ github.sha }}"/' ReactantExtra/WORKSPACE

	          cat ReactantExtra/WORKSPACE
	        working-directory: ${{ env.REACTANT_DIR }}/deps
	      # Only acts when the matrix supplies a non-empty XLA commit. Under
	      # `sed -E`, `\(` and `\)` are literal parentheses; the optional group
	      # swallows any second argument already passed to `xla_workspace`.
	      - name: Modify XLA commit
	        timeout-minutes: 1
	        shell: bash
	        run: |
	          if [[ -n "${{ matrix.xla_commit }}" ]]; then
	              sed -E -i.bak \
	                  -e 's/xla_workspace\(NEW_XLA_PATCHES(, .*)?\)/xla_workspace(NEW_XLA_PATCHES, "${{ matrix.xla_commit }}")/' \
	                  ReactantExtra/WORKSPACE

	              cat ReactantExtra/WORKSPACE
	          fi
	        working-directory: ${{ env.REACTANT_DIR }}/deps
	      - name: Build local libReactant
	        timeout-minutes: 60
	        run: |
	          julia --project --color=yes -e 'using Pkg; Pkg.instantiate()'
	          julia --project --color=yes -O0 build_local.jl --cc=clang --gcc_host_compiler_path= --push-cache
	        working-directory: ${{ env.REACTANT_DIR }}/deps
	      # On the a100 runners, copy libdevice.10.bc out of the runfiles tree when
	      # it is not already present under bazel-bin/cuda/nvvm/libdevice.
	      - name: Copy libdevice
	        if: ${{ contains(matrix.os, 'a100') }}
	        timeout-minutes: 10
	        shell: bash
	        run: |
	          if [[ ! -e bazel-bin/cuda/nvvm/libdevice/libdevice.10.bc ]]; then
	              mkdir -pv bazel-bin/cuda/nvvm/libdevice
	              cp -v bazel-bin/libReactantExtra.so.runfiles/cuda_nvcc/nvvm/libdevice/libdevice.10.bc bazel-bin/cuda/nvvm/libdevice/libdevice.10.bc
	          fi
	        working-directory: ${{ env.REACTANT_DIR }}/deps/ReactantExtra
	      - name: Checkout GB-25
	        uses: actions/checkout@v6
	        with:
	          repository: 'PRONTOLab/GB-25'
	          ref: ${{ matrix.gb25_commit }}
	          path: 'GB-25'
	      - name: Set GB25_DIR
	        # We have to use `${GITHUB_WORKSPACE}` instead of `github.workspace` because GitHub
	        # is terrible and the two don't match inside containers:
	        # https://github.com/actions/runner/issues/2058
	        run: |
	          GB25_DIR=${GITHUB_WORKSPACE}/GB-25
	          # Make sure this directory exists, for good measure
	          ls -lhrt "${GB25_DIR}"
	          echo "GB25_DIR=${GB25_DIR}" >> "${GITHUB_ENV}"

	      # RUN_FLAGS is appended to the compile, run and correctness commands below.
	      - name: Set default precision (GPU)
	        if: ${{ contains(matrix.os, 'a100') }}
	        run: |
	          echo "RUN_FLAGS=--precision=64" >> "${GITHUB_ENV}"
	      - name: Set default precision (TPU)
	        if: ${{ contains(matrix.os, 'tpu') }}
	        run: |
	          echo "RUN_FLAGS=--precision=32" >> "${GITHUB_ENV}"
	      # The script below is Julia, not shell: see the custom `shell`.
	      - name: Instantiate GB-25 environment
	        timeout-minutes: 40
	        shell: julia --project --color=yes {0}
	        run: |
	          # Copy preference file to point to newly built libReactant
	          cp("${{ env.REACTANT_DIR }}/LocalPreferences.toml", "${{ env.GB25_DIR }}/LocalPreferences.toml")

	          using Pkg
	          if !isempty("${{ matrix.reactant_commit }}")
	              # Install specific commit of Reactant, if necessary
	              Pkg.add([
	                  PackageSpec(; name="Reactant", rev="${{ matrix.reactant_commit }}"),
	              ])
	          end
	          # Instantiate environment
	          Pkg.instantiate()
	          # Cleanup depot if possible
	          Pkg.gc()
	        working-directory: ${{ env.GB25_DIR }}
	      # Uploaded even when earlier steps failed (`always()`).
	      - name: Upload Julia project environment
	        uses: actions/upload-artifact@v7
	        timeout-minutes: 10
	        if: ${{ always() }}
	        with:
	          name: 'julia-environment-${{ env.ARTIFACT_INDEX }}'
	          path: |
	            ${{ env.GB25_DIR }}/Manifest.toml
	            ${{ env.GB25_DIR }}/Project.toml
	          retention-days: 90
	          overwrite: false
	      # Install the `mpiexecjl` wrapper into <workspace>/bin and add that
	      # directory to PATH for the following steps.
	      - name: Install mpiexecjl
	        run: |
	          MPIEXECJL_DIR="${GITHUB_WORKSPACE}/bin"
	          julia --project --color=yes -e "using MPI; MPI.install_mpiexecjl(; destdir=\"${MPIEXECJL_DIR}\")"
	          echo "${MPIEXECJL_DIR}" >> "${GITHUB_PATH}"
	        working-directory: ${{ env.GB25_DIR }}
	      # The shell-level `timeout` is set one minute below the step's own
	      # `timeout-minutes`.
	      - name: Compile GB-25 simulation
	        timeout-minutes: 30
	        run: |
	          export XLA_FLAGS='--xla_dump_to=${{ env.GB25_DIR }}/xla_dump_compile'
	          timeout --signal=TERM --verbose 29m mpiexecjl -np 1 julia --color=yes --project -O0 --startup-file=no --threads=16 --compiled-modules=strict sharding/sharded_baroclinic_instability_simulation_compile.jl ${{ env.RUN_FLAGS }}
	        working-directory: ${{ env.GB25_DIR }}
	      # Greps the optimised XLA modules for collective operations. Any match is
	      # reported as an error annotation; the step only fails when a count
	      # exceeds the per-platform threshold set at the top of the script.
	      - name: Show remaining collective operations
	        timeout-minutes: 10
	        shell: bash
	        run: |
	          OK="true"
	          if [[ '${{ contains(matrix.os, 'a100') }}' == 'true' ]]; then
	              ALL_TO_ALL_THRESHOLD=0
	              ALL_GATHER_THRESHOLD=0
	              ALL_REDUCE_THRESHOLD=0
	              COLLECTIVE_PERMUTE_THRESHOLD=345
	          elif [[ '${{ contains(matrix.os, 'tpu') }}' == 'true' ]]; then
	              ALL_TO_ALL_THRESHOLD=0
	              ALL_GATHER_THRESHOLD=0
	              ALL_REDUCE_THRESHOLD=19
	              COLLECTIVE_PERMUTE_THRESHOLD=1076
	          fi

	          for OP in "all-to-all" "all-gather" "all-reduce" "%collective-permute-start.* ="; do
	              COLLECTIVES=$(find . -name 'optimised_sharded_baroclinic_instability*.xla' -exec grep --with-filename --line-number --extended-regexp "${OP}" '{}' \;)
	              NUM_COLLECTIVES=$(echo "${COLLECTIVES}" | wc -l | xargs)
	              if [[ -n "${COLLECTIVES}" ]]; then
	                  MSG="There are ${NUM_COLLECTIVES} remaining ${OP} operations"
	                  echo "::error title=${{ matrix.os }} - Remaining ${OP} Operations:: ${MSG}"
	                  echo
	                  echo "----------"
	                  echo "${COLLECTIVES}"
	                  echo "----------"
	                  if [[ "${OP}" == "all-to-all" && ${NUM_COLLECTIVES} -gt ${ALL_TO_ALL_THRESHOLD} ]] || [[ "${OP}" == "all-gather" && ${NUM_COLLECTIVES} -gt ${ALL_GATHER_THRESHOLD} ]] || [[ "${OP}" == "all-reduce" && ${NUM_COLLECTIVES} -gt ${ALL_REDUCE_THRESHOLD} ]] || [[ "${OP}" =~ "collective-permute" && ${NUM_COLLECTIVES} -gt ${COLLECTIVE_PERMUTE_THRESHOLD} ]]; then
	                      OK="false"
	                  fi
	              fi
	          done
	          if [[ "${OK}" == "false" ]]; then
	              exit 1
	          fi
	      - name: Run GB-25 simulation
	        timeout-minutes: 60
	        run: |
	          export XLA_FLAGS='--xla_dump_to=${{ env.GB25_DIR }}/xla_dump_run'
	          timeout --signal=TERM --verbose 59m mpiexecjl -np 1 julia --color=yes --project -O0 --startup-file=no --threads=16 --compiled-modules=strict sharding/sharded_baroclinic_instability_simulation_run.jl ${{ env.RUN_FLAGS }}
	        working-directory: ${{ env.GB25_DIR }}
	      # Julia script: takes the first entry under sharding/profiling, descends
	      # into loop2/plugins/profile/<first entry>, and displays the first
	      # *.xplane.pb file found there.
	      - name: Display profile results
	        shell: julia --project --color=yes {0}
	        run: |
	          using Reactant

	          prof_dir = joinpath(first(readdir(joinpath(pwd(), "sharding", "profiling"); join=true)), "loop2", "plugins", "profile")
	          prof_dir = joinpath(first(readdir(prof_dir; join=true)))

	          xplane_file = first(filter(endswith(".xplane.pb"), readdir(prof_dir; join=true)))

	          display(Reactant.Profiler.load_xplane_file(xplane_file))
	        working-directory: ${{ env.GB25_DIR }}
	      - name: Test correctness in GB-25 code
	        timeout-minutes: 20
	        run: |
	          timeout --signal=TERM --verbose 19m mpiexecjl -np 1 julia --color=yes --project -O0 --startup-file=no --threads=16 --compiled-modules=strict correctness/correctness_sharded_baroclinic_instability_simulation_run.jl ${{ env.RUN_FLAGS }}
	        working-directory: ${{ env.GB25_DIR }}
	      # The three uploads below run unless the workflow was cancelled, so
	      # diagnostics are kept for failed runs too.
	      - name: Upload MLIR and XLA modules
	        uses: actions/upload-artifact@v7
	        timeout-minutes: 10
	        if: ${{ !cancelled() }}
	        with:
	          name: 'simulation-mlir-${{ env.ARTIFACT_INDEX }}'
	          # Recursive globs: any *.mlir and optimised_*.xla file under GB25_DIR.
	          path: |
	            ${{ env.GB25_DIR }}/**/*.mlir
	            ${{ env.GB25_DIR }}/**/optimised_*.xla
	          retention-days: 90
	          overwrite: false
	      - name: Upload XLA dump
	        uses: actions/upload-artifact@v7
	        timeout-minutes: 20
	        if: ${{ !cancelled() }}
	        with:
	          name: 'simulation-xla-dump-${{ env.ARTIFACT_INDEX }}'
	          # Matches the xla_dump_compile and xla_dump_run directories written
	          # via XLA_FLAGS in the compile and run steps.
	          path: '${{ env.GB25_DIR }}/**/xla_dump*'
	          retention-days: 90
	          overwrite: false
	      - name: Upload XLA profiler traces
	        uses: actions/upload-artifact@v7
	        timeout-minutes: 10
	        if: ${{ !cancelled() }}
	        with:
	          name: 'simulation-xla-profile-${{ env.ARTIFACT_INDEX }}'
	          path: '${{ env.GB25_DIR }}/**/plugins'
	          retention-days: 90
	          overwrite: false

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Fix dynamic shape pad #3840

Workflow file

Fix dynamic shape pad #3840

Uh oh!

Workflow file for this run