Skip to content

Commit d8cffef

Browse files
Merge branch 'AI-Hypercomputer:main' into main
2 parents c35fbd3 + 3889010 commit d8cffef

File tree

94 files changed

+4255
-777
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

94 files changed

+4255
-777
lines changed

.github/CODEOWNERS

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,2 @@
11
# Changes in this file should match with requiredReviewers in file .github/workflows/AddLabel.yml
2-
* @gobbleturk @khatwanimohit @bvandermoon @vipannalla
2+
* @gobbleturk @khatwanimohit @bvandermoon @vipannalla @RissyRan

.github/workflows/AddLabel.yml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ name: Add Label
1616

1717
on:
1818
workflow_run:
19-
workflows: [Unit Test, CodeQL]
19+
workflows: [Tests, CodeQL]
2020
types:
2121
- completed
2222
pull_request_review:
@@ -57,6 +57,7 @@ jobs:
5757
khatwanimohit: "",
5858
bvandermoon: "",
5959
vipannalla: "",
60+
RissyRan: "",
6061
}
6162
const reviews = await github.rest.pulls.listReviews({
6263
owner,

.github/workflows/CPUTests.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,9 @@
11
name: Linter
22

33
on:
4+
pull_request:
45
push:
5-
branches:
6-
- '**'
6+
branches: [ "main" ]
77

88
jobs:
99
cpu:

.github/workflows/RunTests.yml

Lines changed: 114 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,114 @@
1+
# Copyright 2023 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# https://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
# This workflow builds TPU and GPU test images, runs unit and integration tests on each device type, and then deletes the images.
16+
# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python
17+
18+
name: Tests
19+
20+
on:
  # Triggered by pull requests.
  pull_request:
  # Triggered by pushes, restricted to the main branch.
  push:
    branches:
      - "main"
  # Can also be started manually.
  workflow_dispatch:
  schedule:
    # At minute 0 of every fourth hour.
    - cron: '0 */4 * * *'
28+
29+
jobs:
30+
prelim:
  # Preliminary checks on a self-hosted runner. Both image-build jobs
  # (tpu_image, gpu_image) list this job in their `needs`.
  runs-on:
    - "self-hosted"
  steps:
    # Abort with exit code 24 when gsutil cannot be found on PATH.
    - name: Test gsutil installation
      run: 'which gsutil >/dev/null 2>&1 || { echo >&2 "gsutil is required but not installed. Aborting"; exit 24;}'
    # Prune docker data on this runner (--all, no confirmation prompt).
    - name: Cleanup old docker images
      run: 'docker system prune --all --force'
37+
38+
tpu_image:
  # Builds the TPU image by calling the reusable build/upload workflow
  # once the prelim job has finished.
  needs: prelim
  uses: ./.github/workflows/build_upload_internal.yml
  with:
    device_type: "tpu"
    device_name: "v4-8"
    build_mode: "stable_stack"
    # Base image handed to the reusable workflow's optional base_image input.
    base_image: "us-docker.pkg.dev/cloud-tpu-images/jax-stable-stack/tpu:latest"
46+
47+
gpu_image:
  # Builds the GPU image by calling the reusable build/upload workflow
  # once the prelim job has finished. No base_image is passed here.
  needs: prelim
  uses: ./.github/workflows/build_upload_internal.yml
  with:
    device_type: "gpu"
    device_name: "a100-40gb-4"
    build_mode: "pinned"
54+
55+
tpu_unit_tests:
  # TPU unit tests: calls the reusable test workflow after tpu_image completes.
  needs: tpu_image
  uses: ./.github/workflows/run_tests_internal.yml
  with:
    device_type: "tpu"
    device_name: "v4-8"
    # Marker expression excludes GPU-only tests and integration tests.
    # NOTE(review): presumably consumed as a pytest `-m` expression — confirm
    # in run_tests_internal.yml.
    pytest_marker: "not gpu_only and not integration_test"
    test_directory: "tests"
    xla_python_client_mem_fraction: 0.75
    tf_force_gpu_allow_growth: false
    container_resource_option: '--privileged'
66+
67+
tpu_integration_tests:
  # TPU integration tests: calls the reusable test workflow after tpu_image
  # completes.
  needs: tpu_image
  uses: ./.github/workflows/run_tests_internal.yml
  with:
    device_type: "tpu"
    device_name: "v4-8"
    # Marker expression keeps integration tests and excludes GPU-only tests.
    # NOTE(review): presumably consumed as a pytest `-m` expression — confirm
    # in run_tests_internal.yml.
    pytest_marker: "not gpu_only and integration_test"
    test_directory: "tests/integration_tests"
    xla_python_client_mem_fraction: 0.75
    tf_force_gpu_allow_growth: false
    container_resource_option: '--privileged'
78+
79+
gpu_unit_tests:
  # GPU unit tests: calls the reusable test workflow after gpu_image completes.
  needs: gpu_image
  uses: ./.github/workflows/run_tests_internal.yml
  with:
    device_type: "gpu"
    device_name: "a100-40gb-4"
    # Marker expression excludes TPU-only tests and integration tests.
    # NOTE(review): presumably consumed as a pytest `-m` expression — confirm
    # in run_tests_internal.yml.
    pytest_marker: "not tpu_only and not integration_test"
    test_directory: "tests"
    xla_python_client_mem_fraction: 0.65
    tf_force_gpu_allow_growth: true
    container_resource_option: '--shm-size 2g --runtime=nvidia --gpus all --privileged'
90+
91+
gpu_integration_tests:
  # GPU integration tests: calls the reusable test workflow after gpu_image
  # completes.
  needs: gpu_image
  uses: ./.github/workflows/run_tests_internal.yml
  with:
    device_type: "gpu"
    device_name: "a100-40gb-4"
    # Marker expression keeps integration tests and excludes TPU-only tests.
    # NOTE(review): presumably consumed as a pytest `-m` expression — confirm
    # in run_tests_internal.yml.
    pytest_marker: "not tpu_only and integration_test"
    test_directory: "tests/integration_tests"
    xla_python_client_mem_fraction: 0.65
    tf_force_gpu_allow_growth: true
    container_resource_option: '--shm-size 2g --runtime=nvidia --gpus all --privileged'
102+
103+
104+
clean_up:
  # Deletes the per-run GPU and TPU images (tagged with github.run_id) from the
  # registry after all four test jobs have finished.
  # always(): run this job regardless of the outcome of the jobs in `needs`.
  if: ${{ always() }}
  needs: [gpu_unit_tests, gpu_integration_tests, tpu_unit_tests, tpu_integration_tests]
  name: "Clean up"
  runs-on: ["self-hosted"]
  steps:
    - name: Delete GPU image
      run: gcloud container images delete gcr.io/tpu-prod-env-multipod/maxtext_${{ github.run_id }}:gpu --force-delete-tags --quiet
    - name: Delete TPU image
      # Steps are skipped by default once an earlier step has failed. Run this
      # one unconditionally so a failed GPU deletion (e.g. the GPU image was
      # never pushed) does not leave the TPU image behind. The job is still
      # reported as failed if either deletion fails.
      if: ${{ always() }}
      run: gcloud container images delete gcr.io/tpu-prod-env-multipod/maxtext_${{ github.run_id }}:tpu --force-delete-tags --quiet
114+

.github/workflows/UnitTests.yml

Lines changed: 0 additions & 177 deletions
This file was deleted.

.github/workflows/UploadDockerImages.yml

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -41,10 +41,13 @@ jobs:
4141
bash .github/workflows/build_and_upload_images.sh CLOUD_IMAGE_NAME=maxtext_jax_nightly MODE=nightly DEVICE=tpu PROJECT=tpu-prod-env-multipod LOCAL_IMAGE_NAME=maxtext_jax_nightly
4242
- name: build jax stable stack image
4343
run : |
44-
bash .github/workflows/build_and_upload_images.sh CLOUD_IMAGE_NAME=maxtext_jax_stable_stack_0.4.35 MODE=stable_stack DEVICE=TPU PROJECT=tpu-prod-env-multipod LOCAL_IMAGE_NAME=maxtext_jax_stable_stack_0.4.35 BASEIMAGE=us-docker.pkg.dev/cloud-tpu-images/jax-stable-stack/tpu:jax0.4.35-rev1 MAXTEXT_REQUIREMENTS_FILE=requirements_with_jax_stable_stack.txt
44+
bash .github/workflows/build_and_upload_images.sh CLOUD_IMAGE_NAME=maxtext_jax_stable_stack MODE=stable_stack DEVICE=TPU PROJECT=tpu-prod-env-multipod LOCAL_IMAGE_NAME=maxtext_jax_stable_stack BASEIMAGE=us-docker.pkg.dev/cloud-tpu-images/jax-stable-stack/tpu:latest MAXTEXT_REQUIREMENTS_FILE=requirements_with_jax_stable_stack.txt
4545
- name: build image with stable stack nightly jax
4646
run: |
47-
bash .github/workflows/build_and_upload_images.sh CLOUD_IMAGE_NAME=maxtext_stable_stack_nightly_jax MODE=stable_stack DEVICE=tpu PROJECT=tpu-prod-env-multipod LOCAL_IMAGE_NAME=maxtext_gpu_jax_stable_stack_nightly BASEIMAGE=us-docker.pkg.dev/tpu-prod-env-multipod/jax-stable-stack/tpu/jax_nightly:latest MAXTEXT_REQUIREMENTS_FILE=requirements_with_jax_stable_stack.txt
47+
bash .github/workflows/build_and_upload_images.sh CLOUD_IMAGE_NAME=maxtext_stable_stack_nightly_jax MODE=stable_stack DEVICE=tpu PROJECT=tpu-prod-env-multipod LOCAL_IMAGE_NAME=maxtext_tpu_jax_stable_stack_nightly BASEIMAGE=us-docker.pkg.dev/tpu-prod-env-multipod/jax-stable-stack/tpu/jax_nightly:latest MAXTEXT_REQUIREMENTS_FILE=requirements_with_jax_stable_stack.txt
48+
- name: build image with jax stable stack release candidate image
49+
run: |
50+
bash .github/workflows/build_and_upload_images.sh CLOUD_IMAGE_NAME=maxtext_stable_stack_candidate MODE=stable_stack DEVICE=tpu PROJECT=tpu-prod-env-multipod LOCAL_IMAGE_NAME=maxtext_stable_stack_candidate BASEIMAGE=us-docker.pkg.dev/tpu-prod-env-multipod/jax-stable-stack/candidate/tpu:latest MAXTEXT_REQUIREMENTS_FILE=requirements_with_jax_stable_stack.txt
4851
gpu:
4952
strategy:
5053
fail-fast: false
Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
# Copyright 2024 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# https://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
# This file defines a reusable workflow that builds and uploads the image used in RunTests.yml
16+
17+
name: Build and Upload Image
18+
19+
on:
  # Reusable workflow: only runs when called from another workflow.
  workflow_call:
    inputs:
      # Used as a runner label, as DEVICE= for the build script, and as the
      # pushed image's tag.
      device_type:
        type: string
        required: true
      # Used as a runner label and shown in the job name.
      device_name:
        type: string
        required: true
      # Forwarded to the build script as MODE=.
      build_mode:
        type: string
        required: true
      # Optional; forwarded to the build script as BASEIMAGE=.
      base_image:
        type: string
        required: false
34+
35+
jobs:
36+
build_and_upload:
  # Builds the dependency image on a matching self-hosted runner, tags it with
  # the calling run's id, and pushes it to the registry.
  name: 'Build and upload image (${{ inputs.device_name }})'
  # Runner must carry all three labels: self-hosted, device type, device name.
  runs-on:
    - "self-hosted"
    - "${{ inputs.device_type }}"
    - "${{ inputs.device_name }}"
  steps:
    - uses: actions/checkout@v4
    # Workflow inputs are passed to the build script as KEY=VALUE arguments.
    - name: Build an image
      run: |
        bash docker_build_dependency_image.sh MODE=${{ inputs.build_mode }} DEVICE=${{ inputs.device_type }} BASEIMAGE=${{ inputs.base_image }}
    # Retag the local maxtext_base_image as maxtext_<run_id>:<device_type>.
    - name: Tag the image
      run: |
        docker tag maxtext_base_image gcr.io/tpu-prod-env-multipod/maxtext_${{ github.run_id }}:${{ inputs.device_type }}
    - name: Upload the image
      run: |
        docker push gcr.io/tpu-prod-env-multipod/maxtext_${{ github.run_id }}:${{ inputs.device_type }}
50+

0 commit comments

Comments
 (0)