6666 URLREF_TRANSFORMER_ENGINE=${{ fromJson(inputs.SOURCE_URLREFS).TRANSFORMER_ENGINE }}
6767 secrets : inherit
6868
69- build-triton :
70- needs : build-jax
71- if : inputs.ARCHITECTURE == 'amd64' # Triton does not seem to support arm64
72- uses : ./.github/workflows/_build.yaml
73- with :
74- ARCHITECTURE : ${{ inputs.ARCHITECTURE }}
75- ARTIFACT_NAME : artifact-triton-build
76- BADGE_FILENAME : badge-triton-build
77- BUILD_DATE : ${{ inputs.BUILD_DATE }}
78- BASE_IMAGE : ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }}
79- CONTAINER_NAME : triton
80- DOCKERFILE : .github/container/Dockerfile.triton
81- RUNNER_SIZE : large
82- EXTRA_BUILD_ARGS : |
83- URLREF_JAX_TRITON=${{ fromJson(inputs.SOURCE_URLREFS).JAX_TRITON }}
84- secrets : inherit
85-
8669 build-equinox :
8770 needs : build-jax
8871 uses : ./.github/workflows/_build.yaml
@@ -176,9 +159,23 @@ jobs:
176159 URLREF_PANOPTICAPI=${{ fromJson(inputs.SOURCE_URLREFS).PANOPTICAPI }}
177160 secrets : inherit
178161
162+ build-axlearn :
163+ needs : build-jax
164+ uses : ./.github/workflows/_build.yaml
165+ with :
166+ ARCHITECTURE : ${{ inputs.ARCHITECTURE }}
167+ ARTIFACT_NAME : artifact-axlearn-build
168+ BADGE_FILENAME : badge-axlearn-build
169+ BUILD_DATE : ${{ inputs.BUILD_DATE }}
170+ BASE_IMAGE : ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }}
171+ CONTAINER_NAME : axlearn
172+ DOCKERFILE : .github/container/Dockerfile.axlearn
173+ RUNNER_SIZE : large
174+ secrets : inherit
175+
179176 collect-docker-tags :
180177 runs-on : ubuntu-22.04
181- if : " !cancelled()"
178+ if : ${{ !cancelled() }}
182179 needs :
183180 - build-base
184181 - build-jax
@@ -198,10 +195,23 @@ jobs:
198195 run : |
199196 TAGS=$(cat <<EOF | jq -c
200197 [\
201- {"flavor": "base", "stage": "final", "priority": 800, "tag": "${{ needs.build-base.outputs.DOCKER_TAG }}"},\
202- {"flavor": "jax", "stage": "final", "priority": 1000, "tag": "${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }}"},\
203- {"flavor": "jax", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }}"},\
204- {}\
198+ {"flavor": "base", "stage": "final", "priority": 800, "tag": "${{ needs.build-base.outputs.DOCKER_TAG }}"},\
199+ {"flavor": "jax", "stage": "final", "priority": 1000, "tag": "${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }}"},\
200+ {"flavor": "equinox", "stage": "final", "priority": 900, "tag": "${{ needs.build-equinox.outputs.DOCKER_TAG_FINAL }}"},\
201+ {"flavor": "maxtext", "stage": "final", "priority": 900, "tag": "${{ needs.build-maxtext.outputs.DOCKER_TAG_FINAL }}"},\
202+ {"flavor": "levanter", "stage": "final", "priority": 900, "tag": "${{ needs.build-levanter.outputs.DOCKER_TAG_FINAL }}"},\
203+ {"flavor": "upstream-t5x", "stage": "final", "priority": 900, "tag": "${{ needs.build-upstream-t5x.outputs.DOCKER_TAG_FINAL }}"},\
204+ {"flavor": "t5x", "stage": "final", "priority": 900, "tag": "${{ needs.build-rosetta-t5x.outputs.DOCKER_TAG_FINAL }}"},\
205+ {"flavor": "gemma", "stage": "final", "priority": 900, "tag": "${{ needs.build-gemma.outputs.DOCKER_TAG_FINAL }}"},\
206+ {"flavor": "axlearn", "stage": "final", "priority": 900, "tag": "${{ needs.build-axlearn.outputs.DOCKER_TAG_FINAL }}"},\
207+ {"flavor": "jax", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }}"},\
208+ {"flavor": "equinox", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-equinox.outputs.DOCKER_TAG_MEALKIT }}"},\
209+ {"flavor": "maxtext", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-maxtext.outputs.DOCKER_TAG_MEALKIT }}"},\
210+ {"flavor": "levanter", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-levanter.outputs.DOCKER_TAG_MEALKIT }}"},\
211+ {"flavor": "upstream-t5x", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-upstream-t5x.outputs.DOCKER_TAG_MEALKIT }}"},\
212+ {"flavor": "t5x", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-rosetta-t5x.outputs.DOCKER_TAG_MEALKIT }}"},\
213+ {"flavor": "gemma", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-gemma.outputs.DOCKER_TAG_MEALKIT }}"},\
214+ {"flavor": "axlearn", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-axlearn.outputs.DOCKER_TAG_MEALKIT }}"},\
205215 ]
206216 EOF
207217 )
@@ -396,74 +406,51 @@ jobs:
396406 test-nsys-jax-eks :
397407 needs : build-jax
398408 if : inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a
399- runs-on : eks
400- env :
401- JAX_DOCKER_IMAGE : ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }}
402- JOB_NAME : ${{ github.run_id }}-${{ github.run_attempt }}-jax
403- POSTPROCESS_JOB_NAME : ${{ github.run_id }}-${{ github.run_attempt }}-postprocess
404- TOKEN_NAME : ${{ github.run_id }}-${{ github.run_attempt }}-token
405- steps :
406- - name : Check out the repository
407- uses : actions/checkout@v4
409+ runs-on : eks
410+ env :
411+ JAX_DOCKER_IMAGE : ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }}
412+ JOB_NAME : ${{ github.run_id }}-nsys-jax
413+ POSTPROCESS_JOB_NAME : ${{ github.run_id }}-nsys-jax-postprocess
414+ steps :
415+ - name : Check out the repository
416+ uses : actions/checkout@v4
408417 - name : Login to GitHub Container Registry
409418 uses : docker/login-action@v3
410419 with :
411- registry : ghcr.io
412- username : ${{ github.repository_owner }}
413- password : ${{ secrets.GITHUB_TOKEN }}
414- - name : Store GitHub Container Registry token as Kubernetes secret
415- run : |
416- kubectl create secret generic \
417- ${{ github.run_id }}-${{ github.run_attempt }}-token \
418- --from-file=.dockerconfigjson=$HOME/.docker/config.json \
419- --type=kubernetes.io/dockerconfigjson
420- - name : Configure Kubernetes job
421- run : |
422- yq -i ea 'select(di == 0).spec.selector.job-name = strenv(JOB_NAME)
423- | select(di == 1).metadata.name = strenv(JOB_NAME)
424- | select(di == 1).spec.template.spec.imagePullSecrets[].name = strenv(TOKEN_NAME)
425- | select(di == 1).spec.template.spec.containers[0].image = strenv(JAX_DOCKER_IMAGE)
426- | select(di == 1).spec.template.spec.containers[0].env[0].value = strenv(JOB_NAME)' \
427- .github/eks-workflow-files/job.yml
428- git diff .github/eks-workflow-files/job.yml
429- - name : Submit Kubernetes job
430- run : kubectl apply -f .github/eks-workflow-files/job.yml
431- - name : Wait for Kubernetes job to start
432- run : |
433- while [[ -n $(kubectl get pods --selector=batch.kubernetes.io/job-name=${{ github.run_id }}-${{ github.run_attempt }}-jax --output=jsonpath='{.items[?(@.status.phase == "Pending")].metadata.name}') ]]; do
434- sleep 2
435- done
436- - name : Stream Kubernetes job output
437- run : kubectl logs --all-containers=true --all-pods=true --follow job/${{ github.run_id }}-${{ github.run_attempt }}-jax
438- # Clean up in case of errors as well as success
439- - name : Delete Kubernetes job
440- if : always()
441- run : kubectl delete job ${{ github.run_id }}-${{ github.run_attempt }}-jax
442- - name : Configure post-processing job
443- run : |
444- export JOB_OUTPUT_PATTERN="${JOB_NAME}-rank*.zip"
445- yq -i '.metadata.name = strenv(POSTPROCESS_JOB_NAME)
446- | .spec.template.spec.containers[].image = strenv(JAX_DOCKER_IMAGE)
447- | .spec.template.spec.imagePullSecrets[].name = strenv(TOKEN_NAME)
448- | .spec.template.spec.initContainers[].command[7] = strenv(JOB_OUTPUT_PATTERN)' \
449- .github/eks-workflow-files/post-process-job.yml
450- git diff .github/eks-workflow-files/post-process-job.yml
451- - name : Submit post-processing Kubernetes job
452- run : kubectl apply -f .github/eks-workflow-files/post-process-job.yml
453- - name : Wait for post-processing Kubernetes job to start
454- run : |
455- while [[ -n $(kubectl get pods --selector=batch.kubernetes.io/job-name=${{ github.run_id }}-${{ github.run_attempt }}-postprocess --output=jsonpath='{.items[?(@.status.phase == "Pending")].metadata.name}') ]]; do
456- sleep 2
457- done
458- - name : Stream post-processing Kubernetes job output
459- run : kubectl logs --all-containers=true --all-pods=true --follow job/${{ github.run_id }}-${{ github.run_attempt }}-postprocess
460- # Clean up in case of errors as well as success
461- - name : Delete post-processing Kubernetes job
462- if : always()
463- run : kubectl delete job ${{ github.run_id }}-${{ github.run_attempt }}-postprocess
464- - name : Delete GitHub Container Registry token
465- if : always()
466- run : kubectl delete secret ${{ github.run_id }}-${{ github.run_attempt }}-token
420+ registry : ghcr.io
421+ username : ${{ github.repository_owner }}
422+ password : ${{ secrets.GITHUB_TOKEN }}
423+ - name : K8s GHCR store and delete token
424+ id : store-token
425+ uses : ./.github/actions/store-delete-k8s-ghcr
426+ - name : Configure Kubernetes job
427+ run : |
428+ yq -i ea 'select(di == 0).spec.selector.job-name = strenv(JOB_NAME)
429+ | select(di == 1).metadata.name = strenv(JOB_NAME)
430+ | select(di == 1).spec.template.spec.imagePullSecrets[].name = "${{ steps.store-token.outputs.token-name }}"
431+ | select(di == 1).spec.template.spec.containers[0].image = strenv(JAX_DOCKER_IMAGE)
432+ | select(di == 1).spec.template.spec.containers[0].env[0].value = strenv(JOB_NAME)' \
433+ .github/eks-workflow-files/job.yml
434+ git diff .github/eks-workflow-files/job.yml
435+ - name : Submit Kubernetes job
436+ uses : ./.github/actions/submit-delete-k8s-job
437+ with :
438+ job-config-file : .github/eks-workflow-files/job.yml
439+ job-name : ${{ env.JOB_NAME }}
440+ - name : Configure post-processing job
441+ run : |
442+ export JOB_OUTPUT_PATTERN="${JOB_NAME}-rank*.zip"
443+ yq -i '.metadata.name = strenv(POSTPROCESS_JOB_NAME)
444+ | .spec.template.spec.containers[].image = strenv(JAX_DOCKER_IMAGE)
445+ | .spec.template.spec.imagePullSecrets[].name = "${{ steps.store-token.outputs.token-name }}"
446+ | .spec.template.spec.initContainers[].command[7] = strenv(JOB_OUTPUT_PATTERN)' \
447+ .github/eks-workflow-files/post-process-job.yml
448+ git diff .github/eks-workflow-files/post-process-job.yml
449+ - name : Submit post process Kubernetes job
450+ uses : ./.github/actions/submit-delete-k8s-job
451+ with :
452+ job-config-file : .github/eks-workflow-files/post-process-job.yml
453+ job-name : ${{ env.POSTPROCESS_JOB_NAME }}
467454
468455 # test-equinox:
469456 # needs: build-equinox
@@ -663,3 +650,126 @@ jobs:
663650 with :
664651 MAXTEXT_IMAGE : ${{ needs.build-maxtext.outputs.DOCKER_TAG_FINAL }}
665652 secrets : inherit
653+
654+ test-axlearn-eks :
655+ needs : build-axlearn
656+ if : inputs.ARCHITECTURE == 'amd64'
657+ runs-on : eks
658+ env :
659+ AXLEARN_DOCKER_IMAGE : ${{ needs.build-axlearn.outputs.DOCKER_TAG_FINAL }}
660+ JOB_NAME : axlearn-${{ github.run_id }}
661+ steps :
662+ - name : Check out the repository
663+ uses : actions/checkout@v4
664+ - name : Login to GitHub Container Registry
665+ uses : docker/login-action@v3
666+ with :
667+ registry : ghcr.io
668+ username : ${{ github.repository_owner }}
669+ password : ${{ secrets.GITHUB_TOKEN }}
670+ - name : K8s GHCR store and delete token
671+ id : store-token
672+ uses : ./.github/actions/store-delete-k8s-ghcr
673+ - name : Configure axlearn test job
674+ run : |
675+ # Replace placeholders in axlearn-job.yml with environment variables
676+ yq -i ea '
677+ select(di == 0).metadata.name = strenv(JOB_NAME)
678+ | select(di == 0).spec.template.spec.containers[0].image = strenv(AXLEARN_DOCKER_IMAGE)
679+ | select(di == 0).spec.template.spec.containers[1].env[0].value = "${{ github.run_id }}"
680+ | select(di == 0).spec.template.spec.imagePullSecrets[].name = "${{ steps.store-token.outputs.token-name }}"' \
681+ .github/eks-workflow-files/axlearn/axlearn-job.yml
682+ git diff .github/eks-workflow-files/axlearn/axlearn-job.yml
683+ - name : Submit & delete axlearn test
684+ uses : ./.github/actions/submit-delete-k8s-job
685+ with :
686+ job-config-file : " .github/eks-workflow-files/axlearn/axlearn-job.yml"
687+ job-name : ${{ env.JOB_NAME }}
688+ - name : Download logs from S3
689+ id : log-s3
690+ run : |
691+ mkdir -p axlearn-output
692+ aws s3 cp s3://jax-toolbox-eks-output/axlearn/${{ github.run_id }}/summary.txt axlearn-output/
693+ aws s3 cp s3://jax-toolbox-eks-output/axlearn/${{ github.run_id }}/ axlearn-output/ --recursive --exclude "*" --include "*.log"
694+ passed_tests=$(grep -c ": PASSED" axlearn-output/summary.txt || true)
695+ failed_tests=$(grep -c ": FAILED" axlearn-output/summary.txt || true)
696+ total_tests=$((failed_tests + passed_tests))
697+ echo "Passed tests: $passed_tests"
698+ echo "Failed tests: $failed_tests"
699+ echo "Total tests: $total_tests"
700+ echo "PASSED_TESTS=$passed_tests" >> $GITHUB_OUTPUT
701+ echo "FAILED_TESTS=$failed_tests" >> $GITHUB_OUTPUT
702+ echo "TOTAL_TESTS=$total_tests" >> $GITHUB_OUTPUT
703+ - name : Generate sitrep
704+ id : sitrep
705+ if : ${{ !cancelled() }}
706+ shell : bash -x -e {0}
707+ run : |
708+ # bring in utility functions
709+ source .github/workflows/scripts/to_json.sh
710+ badge_label='Axlearn EKS Unit'
711+ total_tests=${{ steps.log-s3.outputs.TOTAL_TESTS }} \
712+ failed_tests=${{ steps.log-s3.outputs.FAILED_TESTS }} \
713+ passed_tests=${{ steps.log-s3.outputs.PASSED_TESTS }} \
714+ errors="0" \
715+ summary="All tests: $total_tests. Passed: $passed_tests. Failed: $failed_tests." \
716+ badge_message="Passed $passed_tests out of $total_tests." \
717+ badge_color="brightgreen"
718+ if [ "$failed_tests" -gt 0 ]; then
719+ badge_color="red"
720+ fi \
721+ to_json \
722+ summary \
723+ errors total_tests passed_tests failed_tests \
724+ badge_label badge_color badge_message \
725+ > sitrep.json
726+ schemaVersion=1 \
727+ label="${badge_label}" \
728+ message="Passed $passed_tests out of $total_tests." \
729+ color=$badge_color \
730+ to_json schemaVersion label message color \
731+ > badge-axlearn-test.json
732+ - name : Upload artifacts
733+ if : ${{ !cancelled() }}
734+ uses : actions/upload-artifact@v4
735+ with :
736+ name : " artifact-axlearn-test"
737+ path : |
738+ sitrep.json
739+ badge-axlearn-test.json
740+ axlearn-output/*
741+ # the fuji test will run for 20 minutes only, as per 2025-02-24
742+ # is not possible to set the `max_steps` value
743+ # this will be done with a customer python code
744+ test-axlearn-fuji-models-eks :
745+ needs : build-axlearn
746+ if : inputs.ARCHITECTURE == 'amd64'
747+ runs-on : eks
748+ env :
749+ AXLEARN_DOCKER_IMAGE : ${{ needs.build-axlearn.outputs.DOCKER_TAG_FINAL }}
750+ JOB_NAME : axlearn-fuji-3b-${{ github.run_id }}
751+ steps :
752+ - name : Check out the repository
753+ uses : actions/checkout@v4
754+ - name : Login to GitHub Container Registry
755+ uses : docker/login-action@v3
756+ with :
757+ registry : ghcr.io
758+ username : ${{ github.repository_owner }}
759+ password : ${{ secrets.GITHUB_TOKEN }}
760+ - name : K8s GHCR store and delete token
761+ id : store-token
762+ uses : ./.github/actions/store-delete-k8s-ghcr
763+ - name : Configure axlearn test job
764+ run : |
765+ yq -i ea '
766+ select(di == 0).metadata.name = strenv(JOB_NAME)
767+ | select(di == 0).spec.template.spec.containers[0].image = strenv(AXLEARN_DOCKER_IMAGE)
768+ | select(di == 0).spec.template.spec.imagePullSecrets[].name = "${{ steps.store-token.outputs.token-name }}"' \
769+ .github/eks-workflow-files/axlearn/axlearn-fuji-model.yml
770+ git diff .github/eks-workflow-files/axlearn/axlearn-fuji-model.yml
771+ - name : Submit & delete axlearn test
772+ uses : ./.github/actions/submit-delete-k8s-job
773+ with :
774+ job-config-file : " .github/eks-workflow-files/axlearn/axlearn-fuji-model.yml"
775+ job-name : ${{ env.JOB_NAME }}
0 commit comments