Skip to content

Commit e92f1d0

Browse files
authored
Merge branch 'main' into zxhe/multi-container
2 parents 1fe5b6d + 7f4d193 commit e92f1d0

File tree

112 files changed

+975
-5235
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

112 files changed

+975
-5235
lines changed

.github/actions/install-kjob/action.yml

Lines changed: 0 additions & 35 deletions
This file was deleted.

.github/actions/setup-test-env/action.yml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,6 @@ runs:
4444
run: gcloud auth configure-docker --quiet
4545
shell: bash
4646
- uses: ./.github/actions/install-kueue
47-
- uses: ./.github/actions/install-kjob
4847
- name: Install XPK
4948
run: pip install dist/xpk-*.whl
5049
shell: bash

.github/workflows/build_tests.yaml

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -49,14 +49,13 @@ jobs:
4949
lookup-only: true
5050
- name: install dependencies
5151
if : steps.check-cache.outputs.cache-hit != 'true'
52-
run: make install-dev && cp ./bin/kubectl-kueue /usr/local/bin/kubectl-kueue && cp ./bin/kubectl-kjob /usr/local/bin/kubectl-kjob
52+
run: make install-dev && cp ./bin/kubectl-kueue /usr/local/bin/kubectl-kueue
5353
- name: Cache dependencies
5454
if : steps.check-cache.outputs.cache-hit != 'true'
5555
uses: actions/cache/save@v3
5656
with:
5757
path: |
5858
/usr/local/bin/kubectl-kueue
59-
/usr/local/bin/kubectl-kjob
6059
~/.cache/pip
6160
${{env.pythonLocation}}
6261
key: xpk-deps-${{ matrix.python-version }}-${{github.run_id}}-${{github.run_attempt}}

.github/workflows/integration_basic_cluster_create.yaml

Lines changed: 0 additions & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -152,62 +152,6 @@ jobs:
152152
run: xpk info --cluster $TPU_CLUSTER_NAME --zone=us-central2-b
153153
- name: Delete the workload on the cluster
154154
run: xpk workload delete --workload $WORKLOAD_NAME --cluster $TPU_CLUSTER_NAME --zone=us-central2-b
155-
- name: Create test script to execute in batch
156-
run: echo -e '#!/bin/bash \n#SBATCH --unknown-flag=value\n echo "Hello world from a test script!"' > batch.sh
157-
- name: Run a batch job on the cluster
158-
run: xpk batch --cluster $TPU_CLUSTER_NAME --zone=us-central2-b batch.sh --ignore-unknown-flags --array 1-5 --nodes 2 --ntasks 3
159-
- name: List out the jobs on the cluster
160-
run: xpk job ls --cluster $TPU_CLUSTER_NAME --zone=us-central2-b | grep 'xpk-def-app-profile-slurm-'
161-
- name: Get created job name
162-
run: |
163-
JOB_NAME=$(xpk job ls --cluster $TPU_CLUSTER_NAME --zone=us-central2-b | grep 'xpk-def-app-profile-slurm-' | grep 'multislice-queue' | head -1 | awk '{print $1}')
164-
echo "JOB_NAME=${JOB_NAME}" >> $GITHUB_ENV
165-
- name: Check job spec
166-
run: |
167-
job_spec=$(kubectl get job ${JOB_NAME} -o jsonpath='{.spec}')
168-
echo "$job_spec" | grep '"completions":2'
169-
echo "$job_spec" | grep '"parallelism":2'
170-
echo "$job_spec" | jq '.template.spec.containers | length' | grep 3
171-
- name: Get job info for the last job created on the cluster
172-
run: xpk job info ${JOB_NAME} --cluster $TPU_CLUSTER_NAME --zone=us-central2-b | grep -e "Entrypoint environment variables template:" -e "Job name:" -e "Labels:" -e "Mounts:" -e "Pods:" -e "Profile:" -e "Script name:" | wc -l | grep "7"
173-
- name: Cancel the batch job on the cluster
174-
run: xpk job cancel ${JOB_NAME} --cluster $TPU_CLUSTER_NAME --zone=us-central2-b | grep "job.batch/${JOB_NAME} deleted"
175-
- name: Create shell and exit it immediately
176-
run: |
177-
cat <<EOF > create-shell.exp
178-
#!/usr/bin/expect
179-
set timeout 180
180-
spawn sh -c "xpk shell --cluster $TPU_CLUSTER_NAME --zone=us-central2-b | tee shell.log"
181-
send "\n"
182-
expect {
183-
"/ # " {
184-
send "exit\n"
185-
# Wait for EOF after exit
186-
expect eof
187-
exit 0
188-
}
189-
timeout {
190-
puts "Timed out waiting for pod to be running"
191-
exit 1
192-
}
193-
eof {
194-
puts "Unexpected EOF before getting prompt"
195-
exit 1
196-
}
197-
}
198-
EOF
199-
chmod +x ./create-shell.exp
200-
expect ./create-shell.exp
201-
- name: Check if shell exists and is running
202-
run: |
203-
pod_name=$(grep 'waiting for pod' shell.log | awk -F'"' '{print $2}')
204-
kubectl wait --for='jsonpath={.status.conditions[?(@.type=="Ready")].status}=True' --timeout=1m pod/${pod_name}
205-
- name: Stop the shell
206-
run: xpk shell stop --cluster $TPU_CLUSTER_NAME --zone=us-central2-b
207-
- name: Delete create-shell.exp file
208-
run: rm create-shell.exp
209-
- name: Delete shell.log file
210-
run: rm shell.log
211155
- name: Delete the cluster created
212156
if: always()
213157
run: xpk cluster delete --cluster $TPU_CLUSTER_NAME --zone=us-central2-b --force
Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,78 @@
1+
# Copyright 2025 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# https://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
name: Basic GPU cluster create
16+
17+
on:
18+
workflow_call:
19+
20+
permissions:
21+
contents: read
22+
23+
jobs:
24+
gpu-cluster-create-and-delete:
25+
runs-on: [ubuntu-22.04]
26+
concurrency:
27+
group: nightly-test-cluster-group-gpu
28+
cancel-in-progress: false
29+
env:
30+
GPU_CLUSTER_NAME: nightly-xpk-b200
31+
WORKLOAD_NAME: xpktest-gpu-nightly-${{ github.run_attempt }}
32+
steps:
33+
- uses: actions/download-artifact@v4
34+
with:
35+
name: custom-scripts
36+
- name: Setup environment
37+
uses: ./.github/actions/setup-test-env
38+
with:
39+
credentials_json: "${{ secrets.GCP_SA_KEY }}"
40+
- name: Check xpk installation
41+
run: xpk version
42+
- name: 'Setup Service Account for XPK'
43+
run: |
44+
# 1. Clear any existing WIF configurations to avoid conflicts
45+
rm -rf $HOME/.config/gcloud
46+
mkdir -p $HOME/.config/gcloud
47+
48+
# 2. Write the Key File
49+
echo '${{ secrets.GCP_SA_KEY }}' > $HOME/.config/gcloud/application_default_credentials.json
50+
51+
# 3. Activate the Service Account
52+
# This updates the internal config files to point to the key file.
53+
# When Docker mounts the directory, it will now see "Active Account: Service Account"
54+
gcloud auth activate-service-account --key-file=$HOME/.config/gcloud/application_default_credentials.json --project=cloud-tpu-multipod-dev
55+
56+
# 4. Set Env Var for the host (GitHub Runner)
57+
echo "GOOGLE_APPLICATION_CREDENTIALS=$HOME/.config/gcloud/application_default_credentials.json" >> $GITHUB_ENV
58+
- name: Create an XPK Cluster with 1 x b200 GPU
59+
run: xpk cluster create --cluster $GPU_CLUSTER_NAME --device-type=b200-8 --zone=asia-northeast1-b --default-pool-cpu-machine-type=n1-standard-16 --spot
60+
- name: Authenticate Docker
61+
run: gcloud auth configure-docker --quiet
62+
- name: Run a base-docker-image workload
63+
run: xpk workload create --cluster $GPU_CLUSTER_NAME --workload $WORKLOAD_NAME --docker-image='nvidia/cuda:12.1.0-base-ubuntu22.04' --command "nvidia-smi" --zone=asia-northeast1-b --device-type=b200-8
64+
- name: List out the workloads on the cluster
65+
run: xpk workload list --cluster $GPU_CLUSTER_NAME --zone=asia-northeast1-b
66+
- name: Wait for workload completion and confirm it succeeded
67+
run: xpk workload list --cluster $GPU_CLUSTER_NAME --zone=asia-northeast1-b --wait-for-job-completion $WORKLOAD_NAME --timeout 600
68+
- name: Delete the workload on the cluster
69+
run: xpk workload delete --workload $WORKLOAD_NAME --cluster $GPU_CLUSTER_NAME --zone=asia-northeast1-b
70+
- name: Delete the cluster created
71+
if: always()
72+
run: xpk cluster delete --cluster $GPU_CLUSTER_NAME --zone=asia-northeast1-b --force
73+
- name: Upload cluster nodepool creation log
74+
if: always()
75+
uses: actions/upload-artifact@v4
76+
with:
77+
name: gpu-cluster-nodepool-log-${{github.run_id}}
78+
path: /tmp/NodepoolCreate-${{ env.GPU_CLUSTER_NAME }}-np-*

.github/workflows/integration_legacy_tests.yaml

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -47,14 +47,13 @@ jobs:
4747
lookup-only: true
4848
- name: install dependencies
4949
if: steps.check-cache.outputs.cache-hit != 'true'
50-
run: make install-dev && cp ./bin/kubectl-kueue /usr/local/bin/kubectl-kueue && cp ./bin/kubectl-kjob /usr/local/bin/kubectl-kjob
50+
run: make install-dev && cp ./bin/kubectl-kueue /usr/local/bin/kubectl-kueue
5151
- name: Cache dependencies
5252
if: steps.check-cache.outputs.cache-hit != 'true'
5353
uses: actions/cache/save@v3
5454
with:
5555
path: |
5656
/usr/local/bin/kubectl-kueue
57-
/usr/local/bin/kubectl-kjob
5857
~/.cache/pip
5958
${{env.pythonLocation}}
6059
key: xpk-deps-${{ matrix.python-version }}-${{github.run_id}}-${{github.run_attempt}}

.github/workflows/label-validation.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -36,8 +36,8 @@ jobs:
3636
with:
3737
mode: minimum
3838
count: 1
39-
labels: "release-improvements, release-bugfix, release-features"
40-
message: "This PR is being prevented from merging because it is not labeled. Please add a label to this PR. Accepted labels: release-improvements, release-bugfix, release-features"
39+
labels: "release-improvements, release-bugfix, release-features, release-breaking"
40+
message: "This PR is being prevented from merging because it is not labeled. Please add a label to this PR. Accepted labels: release-improvements, release-bugfix, release-features, release-breaking"
4141
- id: do-not-merge
4242
uses: mheap/github-action-required-labels@v5
4343
with:

.github/workflows/nightly_tests.yaml

Lines changed: 12 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -16,38 +16,41 @@ name: Nightly Tests
1616

1717
on:
1818
workflow_dispatch:
19-
schedule: # Schedule the job run at 12AM PST daily.
20-
- cron: "0 8 * * *"
19+
schedule: # Schedule the job run at 6AM UTC daily.
20+
- cron: "0 6 * * *"
2121

2222
permissions:
2323
contents: read
2424

2525
jobs:
26-
build_kjob:
27-
uses: ./.github/workflows/reusable_build_kjob.yaml
2826
build_wheel:
2927
uses: ./.github/workflows/reusable_build_wheel.yaml
3028
build_actions:
3129
uses: ./.github/workflows/reusable_build_scripts.yaml
3230
basic_cluster_create:
33-
needs: [build_kjob, build_actions, build_wheel]
31+
needs: [build_actions, build_wheel]
3432
uses: ./.github/workflows/integration_basic_cluster_create.yaml
3533
secrets: inherit
3634

35+
gpu_cluster_create:
36+
needs: [build_actions, build_wheel]
37+
uses: ./.github/workflows/integration_gpu_cluster_create.yaml
38+
secrets: inherit
39+
3740
pathways_cluster_create:
38-
needs: [build_kjob, build_actions, build_wheel]
41+
needs: [build_actions, build_wheel]
3942
uses: ./.github/workflows/integration_pathways_cluster_create.yaml
4043
secrets: inherit
4144

4245
ray_cluster_create:
43-
needs: [build_kjob, build_actions, build_wheel]
46+
needs: [build_actions, build_wheel]
4447
uses: ./.github/workflows/integration_ray_cluster_create.yaml
4548
secrets: inherit
4649
legacy_integration:
47-
needs: [build_kjob, build_actions, build_wheel]
50+
needs: [build_actions, build_wheel]
4851
uses: ./.github/workflows/integration_legacy_tests.yaml
4952
secrets: inherit
5053
storage-tests:
51-
needs: [build_kjob, build_actions, build_wheel]
54+
needs: [build_actions, build_wheel]
5255
uses: ./.github/workflows/integration_storage_tests.yaml
5356
secrets: inherit

.github/workflows/reusable_build_kjob.yaml

Lines changed: 0 additions & 23 deletions
This file was deleted.

.github/workflows/reusable_goldens.yaml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,6 @@ jobs:
3333
with:
3434
path: |
3535
/usr/local/bin/kubectl-kueue
36-
/usr/local/bin/kubectl-kjob
3736
~/.cache/pip
3837
${{env.pythonLocation}}
3938
key: xpk-deps-3.10-${{github.run_id}}-${{github.run_attempt}}

0 commit comments

Comments
 (0)