6666 URLREF_TRANSFORMER_ENGINE=${{ fromJson(inputs.SOURCE_URLREFS).TRANSFORMER_ENGINE }}
6767 secrets : inherit
6868
69- build-triton :
70- needs : build-jax
71- if : inputs.ARCHITECTURE == 'amd64' # Triton does not seem to support arm64
72- uses : ./.github/workflows/_build.yaml
73- with :
74- ARCHITECTURE : ${{ inputs.ARCHITECTURE }}
75- ARTIFACT_NAME : artifact-triton-build
76- BADGE_FILENAME : badge-triton-build
77- BUILD_DATE : ${{ inputs.BUILD_DATE }}
78- BASE_IMAGE : ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }}
79- CONTAINER_NAME : triton
80- DOCKERFILE : .github/container/Dockerfile.triton
81- RUNNER_SIZE : large
82- EXTRA_BUILD_ARGS : |
83- URLREF_JAX_TRITON=${{ fromJson(inputs.SOURCE_URLREFS).JAX_TRITON }}
84- secrets : inherit
85-
8669 build-equinox :
8770 needs : build-jax
8871 uses : ./.github/workflows/_build.yaml
@@ -176,9 +159,23 @@ jobs:
176159 URLREF_PANOPTICAPI=${{ fromJson(inputs.SOURCE_URLREFS).PANOPTICAPI }}
177160 secrets : inherit
178161
# Build the AXLearn container by calling the shared _build.yaml workflow,
# using the mealkit image produced by build-jax as the base image.
build-axlearn:
  needs: build-jax
  uses: ./.github/workflows/_build.yaml
  secrets: inherit
  with:
    # Which container to build, from which Dockerfile and base image.
    CONTAINER_NAME: axlearn
    DOCKERFILE: .github/container/Dockerfile.axlearn
    BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }}
    # Architecture and build date are forwarded from this workflow's inputs.
    ARCHITECTURE: ${{ inputs.ARCHITECTURE }}
    BUILD_DATE: ${{ inputs.BUILD_DATE }}
    RUNNER_SIZE: large
    # Artifact and badge file names handed to the build workflow.
    ARTIFACT_NAME: artifact-axlearn-build
    BADGE_FILENAME: badge-axlearn-build
175+
179176 collect-docker-tags :
180177 runs-on : ubuntu-22.04
181- if : " !cancelled()"
178+ if : ${{ !cancelled() }}
182179 needs :
183180 - build-base
184181 - build-jax
@@ -189,6 +186,7 @@ jobs:
189186 - build-upstream-t5x
190187 - build-rosetta-t5x
191188 - build-gemma
189+ - build-axlearn
192190 outputs :
193191 TAGS : ${{ steps.collect-tags.outputs.TAGS }}
194192
@@ -198,10 +196,24 @@ jobs:
198196 run : |
199197 TAGS=$(cat <<EOF | jq -c
200198 [\
201- {"flavor": "base", "stage": "final", "priority": 800, "tag": "${{ needs.build-base.outputs.DOCKER_TAG }}"},\
202- {"flavor": "jax", "stage": "final", "priority": 1000, "tag": "${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }}"},\
203- {"flavor": "jax", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }}"},\
204- {}\
199+ {"flavor": "base", "stage": "final", "priority": 800, "tag": "${{ needs.build-base.outputs.DOCKER_TAG }}"},\
200+ {"flavor": "jax", "stage": "final", "priority": 1000, "tag": "${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }}"},\
201+ {"flavor": "equinox", "stage": "final", "priority": 900, "tag": "${{ needs.build-equinox.outputs.DOCKER_TAG_FINAL }}"},\
202+ {"flavor": "maxtext", "stage": "final", "priority": 900, "tag": "${{ needs.build-maxtext.outputs.DOCKER_TAG_FINAL }}"},\
203+ {"flavor": "levanter", "stage": "final", "priority": 900, "tag": "${{ needs.build-levanter.outputs.DOCKER_TAG_FINAL }}"},\
204+ {"flavor": "upstream-t5x", "stage": "final", "priority": 900, "tag": "${{ needs.build-upstream-t5x.outputs.DOCKER_TAG_FINAL }}"},\
205+ {"flavor": "t5x", "stage": "final", "priority": 900, "tag": "${{ needs.build-rosetta-t5x.outputs.DOCKER_TAG_FINAL }}"},\
206+ {"flavor": "gemma", "stage": "final", "priority": 900, "tag": "${{ needs.build-gemma.outputs.DOCKER_TAG_FINAL }}"},\
207+ {"flavor": "axlearn", "stage": "final", "priority": 900, "tag": "${{ needs.build-axlearn.outputs.DOCKER_TAG_FINAL }}"},\
208+ {"flavor": "jax", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }}"},\
209+ {"flavor": "equinox", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-equinox.outputs.DOCKER_TAG_MEALKIT }}"},\
210+ {"flavor": "maxtext", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-maxtext.outputs.DOCKER_TAG_MEALKIT }}"},\
211+ {"flavor": "levanter", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-levanter.outputs.DOCKER_TAG_MEALKIT }}"},\
212+ {"flavor": "upstream-t5x", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-upstream-t5x.outputs.DOCKER_TAG_MEALKIT }}"},\
213+ {"flavor": "t5x", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-rosetta-t5x.outputs.DOCKER_TAG_MEALKIT }}"},\
214+ {"flavor": "gemma", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-gemma.outputs.DOCKER_TAG_MEALKIT }}"},\
215+ {"flavor": "axlearn", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-axlearn.outputs.DOCKER_TAG_MEALKIT }}"},\
216+ {}\
205217 ]
206218 EOF
207219 )
@@ -399,9 +411,8 @@ jobs:
399411 runs-on : eks
400412 env :
401413 JAX_DOCKER_IMAGE : ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }}
402- JOB_NAME : ${{ github.run_id }}-${{ github.run_attempt }}-jax
403- POSTPROCESS_JOB_NAME : ${{ github.run_id }}-${{ github.run_attempt }}-postprocess
404- TOKEN_NAME : ${{ github.run_id }}-${{ github.run_attempt }}-token
414+ JOB_NAME : ${{ github.run_id }}-nsys-jax
415+ POSTPROCESS_JOB_NAME : ${{ github.run_id }}-nsys-jax-postprocess
405416 steps :
406417 - name : Check out the repository
407418 uses : actions/checkout@v4
@@ -411,59 +422,37 @@ jobs:
411422 registry : ghcr.io
412423 username : ${{ github.repository_owner }}
413424 password : ${{ secrets.GITHUB_TOKEN }}
414- - name : Store GitHub Container Registry token as Kubernetes secret
415- run : |
416- kubectl create secret generic \
417- ${{ github.run_id }}-${{ github.run_attempt }}-token \
418- --from-file=.dockerconfigjson=$HOME/.docker/config.json \
419- --type=kubernetes.io/dockerconfigjson
425+ - name : K8s GHCR store and delete token
426+ id : store-token
427+ uses : ./.github/actions/store-delete-k8s-ghcr
420428 - name : Configure Kubernetes job
421429 run : |
422430 yq -i ea 'select(di == 0).spec.selector.job-name = strenv(JOB_NAME)
423431 | select(di == 1).metadata.name = strenv(JOB_NAME)
424- | select(di == 1).spec.template.spec.imagePullSecrets[].name = strenv(TOKEN_NAME)
432+ | select(di == 1).spec.template.spec.imagePullSecrets[].name = "${{ steps.store-token.outputs.token-name }}"
425433 | select(di == 1).spec.template.spec.containers[0].image = strenv(JAX_DOCKER_IMAGE)
426434 | select(di == 1).spec.template.spec.containers[0].env[0].value = strenv(JOB_NAME)' \
427435 .github/eks-workflow-files/job.yml
428436 git diff .github/eks-workflow-files/job.yml
429437 - name : Submit Kubernetes job
430- run : kubectl apply -f .github/eks-workflow-files/job.yml
431- - name : Wait for Kubernetes job to start
432- run : |
433- while [[ -n $(kubectl get pods --selector=batch.kubernetes.io/job-name=${{ github.run_id }}-${{ github.run_attempt }}-jax --output=jsonpath='{.items[?(@.status.phase == "Pending")].metadata.name}') ]]; do
434- sleep 2
435- done
436- - name : Stream Kubernetes job output
437- run : kubectl logs --all-containers=true --all-pods=true --follow job/${{ github.run_id }}-${{ github.run_attempt }}-jax
438- # Clean up in case of errors as well as success
439- - name : Delete Kubernetes job
440- if : always()
441- run : kubectl delete job ${{ github.run_id }}-${{ github.run_attempt }}-jax
438+ uses : ./.github/actions/submit-delete-k8s-job
439+ with :
440+ job-config-file : .github/eks-workflow-files/job.yml
441+ job-name : ${{ env.JOB_NAME }}
442442 - name : Configure post-processing job
443443 run : |
444444 export JOB_OUTPUT_PATTERN="${JOB_NAME}-rank*.zip"
445445 yq -i '.metadata.name = strenv(POSTPROCESS_JOB_NAME)
446446 | .spec.template.spec.containers[].image = strenv(JAX_DOCKER_IMAGE)
447- | .spec.template.spec.imagePullSecrets[].name = strenv(TOKEN_NAME)
447+ | .spec.template.spec.imagePullSecrets[].name = "${{ steps.store-token.outputs.token-name }}"
448448 | .spec.template.spec.initContainers[].command[7] = strenv(JOB_OUTPUT_PATTERN)' \
449449 .github/eks-workflow-files/post-process-job.yml
450450 git diff .github/eks-workflow-files/post-process-job.yml
451- - name : Submit post-processing Kubernetes job
452- run : kubectl apply -f .github/eks-workflow-files/post-process-job.yml
453- - name : Wait for post-processing Kubernetes job to start
454- run : |
455- while [[ -n $(kubectl get pods --selector=batch.kubernetes.io/job-name=${{ github.run_id }}-${{ github.run_attempt }}-postprocess --output=jsonpath='{.items[?(@.status.phase == "Pending")].metadata.name}') ]]; do
456- sleep 2
457- done
458- - name : Stream post-processing Kubernetes job output
459- run : kubectl logs --all-containers=true --all-pods=true --follow job/${{ github.run_id }}-${{ github.run_attempt }}-postprocess
460- # Clean up in case of errors as well as success
461- - name : Delete post-processing Kubernetes job
462- if : always()
463- run : kubectl delete job ${{ github.run_id }}-${{ github.run_attempt }}-postprocess
464- - name : Delete GitHub Container Registry token
465- if : always()
466- run : kubectl delete secret ${{ github.run_id }}-${{ github.run_attempt }}-token
451+ - name : Submit post process Kubernetes job
452+ uses : ./.github/actions/submit-delete-k8s-job
453+ with :
454+ job-config-file : .github/eks-workflow-files/post-process-job.yml
455+ job-name : ${{ env.POSTPROCESS_JOB_NAME }}
467456
468457 # test-equinox:
469458 # needs: build-equinox
@@ -663,3 +652,126 @@ jobs:
663652 with :
664653 MAXTEXT_IMAGE : ${{ needs.build-maxtext.outputs.DOCKER_TAG_FINAL }}
665654 secrets : inherit
655+
# Submit the AXLearn test job to the EKS runner, then read its summary and
# logs back from S3 and turn them into a sitrep, a badge and an artifact.
# Only runs for amd64 builds.
test-axlearn-eks:
  needs: build-axlearn
  if: inputs.ARCHITECTURE == 'amd64'
  runs-on: eks
  env:
    AXLEARN_DOCKER_IMAGE: ${{ needs.build-axlearn.outputs.DOCKER_TAG_FINAL }}
    JOB_NAME: axlearn-${{ github.run_id }}
  steps:
    - name: Check out the repository
      uses: actions/checkout@v4
    - name: Login to GitHub Container Registry
      uses: docker/login-action@v3
      with:
        registry: ghcr.io
        username: ${{ github.repository_owner }}
        password: ${{ secrets.GITHUB_TOKEN }}
    - name: K8s GHCR store and delete token
      id: store-token
      uses: ./.github/actions/store-delete-k8s-ghcr
    - name: Configure axlearn test job
      run: |
        # Replace placeholders in axlearn-job.yml with environment variables
        yq -i ea '
          select(di == 0).metadata.name = strenv(JOB_NAME)
          | select(di == 0).spec.template.spec.containers[0].image = strenv(AXLEARN_DOCKER_IMAGE)
          | select(di == 0).spec.template.spec.containers[1].env[0].value = "${{ github.run_id }}"
          | select(di == 0).spec.template.spec.imagePullSecrets[].name = "${{ steps.store-token.outputs.token-name }}"' \
          .github/eks-workflow-files/axlearn/axlearn-job.yml
        git diff .github/eks-workflow-files/axlearn/axlearn-job.yml
    - name: 'Submit & delete axlearn test'
      uses: ./.github/actions/submit-delete-k8s-job
      with:
        # No leading whitespace: the value is used as a repo-relative path.
        job-config-file: .github/eks-workflow-files/axlearn/axlearn-job.yml
        job-name: ${{ env.JOB_NAME }}
    - name: Download logs from S3
      id: log-s3
      run: |
        mkdir -p axlearn-output
        aws s3 cp s3://jax-toolbox-eks-output/axlearn/${{ github.run_id }}/summary.txt axlearn-output/
        aws s3 cp s3://jax-toolbox-eks-output/axlearn/${{ github.run_id }}/ axlearn-output/ --recursive --exclude "*" --include "*.log"
        # grep -c exits non-zero when it counts zero matches; `|| true` keeps
        # the step alive while the printed count is still captured.
        passed_tests=$(grep -c ": PASSED" axlearn-output/summary.txt || true)
        failed_tests=$(grep -c ": FAILED" axlearn-output/summary.txt || true)
        total_tests=$((failed_tests + passed_tests))
        echo "Passed tests: $passed_tests"
        echo "Failed tests: $failed_tests"
        echo "Total tests: $total_tests"
        {
          echo "PASSED_TESTS=$passed_tests"
          echo "FAILED_TESTS=$failed_tests"
          echo "TOTAL_TESTS=$total_tests"
        } >> "$GITHUB_OUTPUT"
    - name: Generate sitrep
      id: sitrep
      if: ${{ !cancelled() }}
      shell: bash -x -e {0}
      env:
        # Passed through the environment rather than interpolated into the
        # script, so that an empty output cannot produce malformed shell.
        TOTAL_TESTS: ${{ steps.log-s3.outputs.TOTAL_TESTS }}
        FAILED_TESTS: ${{ steps.log-s3.outputs.FAILED_TESTS }}
        PASSED_TESTS: ${{ steps.log-s3.outputs.PASSED_TESTS }}
      run: |
        # bring in utility functions
        source .github/workflows/scripts/to_json.sh
        badge_label='Axlearn EKS Unit'
        # This step also runs after an earlier step has failed; the log-s3
        # outputs are empty in that case, so fall back to 0.
        total_tests="${TOTAL_TESTS:-0}"
        failed_tests="${FAILED_TESTS:-0}"
        passed_tests="${PASSED_TESTS:-0}"
        errors="0"
        summary="All tests: $total_tests. Passed: $passed_tests. Failed: $failed_tests."
        badge_message="Passed $passed_tests out of $total_tests."
        # Report green only when tests actually ran and none of them failed.
        if [ "$failed_tests" -gt 0 ] || [ "$total_tests" -eq 0 ]; then
          badge_color="red"
        else
          badge_color="brightgreen"
        fi
        to_json \
          summary \
          errors total_tests passed_tests failed_tests \
          badge_label badge_color badge_message \
          > sitrep.json
        schemaVersion=1 \
        label="${badge_label}" \
        message="${badge_message}" \
        color="${badge_color}" \
        to_json schemaVersion label message color \
          > badge-axlearn-test.json
    - name: Upload artifacts
      if: ${{ !cancelled() }}
      uses: actions/upload-artifact@v4
      with:
        name: artifact-axlearn-test
        path: |
          sitrep.json
          badge-axlearn-test.json
          axlearn-output/*
# The fuji test only runs for 20 minutes (as of 2025-02-24): it is not yet
# possible to set the `max_steps` value; this will be handled later with
# custom Python code.
#
# Submits the AXLearn fuji model job to the EKS runner, using the final
# AXLearn image produced by build-axlearn. Only runs for amd64 builds.
test-axlearn-fuji-models-eks:
  needs: build-axlearn
  if: inputs.ARCHITECTURE == 'amd64'
  runs-on: eks
  env:
    AXLEARN_DOCKER_IMAGE: ${{ needs.build-axlearn.outputs.DOCKER_TAG_FINAL }}
    JOB_NAME: axlearn-fuji-3b-${{ github.run_id }}
  steps:
    - name: Check out the repository
      uses: actions/checkout@v4
    - name: Login to GitHub Container Registry
      uses: docker/login-action@v3
      with:
        registry: ghcr.io
        username: ${{ github.repository_owner }}
        password: ${{ secrets.GITHUB_TOKEN }}
    - name: K8s GHCR store and delete token
      id: store-token
      uses: ./.github/actions/store-delete-k8s-ghcr
    - name: Configure axlearn test job
      run: |
        # Set the job name, container image and image pull secret in the
        # first document of the job manifest, then show what changed.
        yq -i ea '
          select(di == 0).metadata.name = strenv(JOB_NAME)
          | select(di == 0).spec.template.spec.containers[0].image = strenv(AXLEARN_DOCKER_IMAGE)
          | select(di == 0).spec.template.spec.imagePullSecrets[].name = "${{ steps.store-token.outputs.token-name }}"' \
          .github/eks-workflow-files/axlearn/axlearn-fuji-model.yml
        git diff .github/eks-workflow-files/axlearn/axlearn-fuji-model.yml
    - name: 'Submit & delete axlearn test'
      uses: ./.github/actions/submit-delete-k8s-job
      with:
        # No leading whitespace: the value is used as a repo-relative path.
        job-config-file: .github/eks-workflow-files/axlearn/axlearn-fuji-model.yml
        job-name: ${{ env.JOB_NAME }}
0 commit comments