Skip to content

Commit aa40b08

Browse files
pawloch00, sharabiani, and 44past4 authored
Fixes for slurm like commands (#397)
* Fixes for slurm-like commands

Signed-off-by: Piotr Pawłowski <[email protected]>

---------

Signed-off-by: Piotr Pawłowski <[email protected]>
Co-authored-by: Farhad Sharabiani <[email protected]>
Co-authored-by: Pawel Kepka <[email protected]>
1 parent 2091726 commit aa40b08

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

42 files changed, with 848 additions (+848) and 144 deletions (-144).

.github/workflows/batch_test.yaml

Lines changed: 5 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -47,19 +47,17 @@ jobs:
4747
gcloud config set compute/zone us-east4-a
4848
gcloud config get compute/zone
4949
- name: Prepare directories
50-
run: mkdir -p ~/.cache/pip && mkdir bin/
50+
run: mkdir -p ~/.cache/pip
5151
- name: Restore cached dependencies
5252
uses: actions/cache@v4
5353
with:
5454
path: |
55-
bin/
55+
/usr/local/bin/kubectl-kueue
56+
/usr/local/bin/kubectl-kjob
5657
~/.cache/pip
5758
${{env.pythonLocation}}
5859
key: xpk-deps-3.10-${{inputs.run-id}}
5960
restore-keys: xpk-deps-3.10
60-
- name: Install xpk dependencies
61-
run: |
62-
echo $PWD/bin >> "$GITHUB_PATH"
6361
- name: Install expect package
6462
run: sudo apt-get install expect
6563
- name: Check xpk installation
@@ -82,7 +80,7 @@ jobs:
8280
echo "$job_spec" | grep '"parallelism":2'
8381
echo "$job_spec" | jq '.template.spec.containers | length' | grep 3
8482
- name: Get job info for the last job created on the cluster
85-
run: python3 xpk.py job info ${JOB_NAME} | grep -e "Entrypoint environment variables template:" -e "Job name:" -e "Labels:" -e "Mounts:" -e "Pods:" -e "Profile:" -e "Script name:" | wc -l | grep "7"
83+
run: python3 xpk.py job info ${JOB_NAME} --cluster ${{inputs.cluster-name}} | grep -e "Entrypoint environment variables template:" -e "Job name:" -e "Labels:" -e "Mounts:" -e "Pods:" -e "Profile:" -e "Script name:" | wc -l | grep "7"
8684
- name: Cancel the batch job on the cluster
8785
run: |
8886
python3 xpk.py job cancel ${JOB_NAME} --cluster ${{inputs.cluster-name}} --zone=${{inputs.zone}} | grep "job.batch/${JOB_NAME} deleted"
@@ -100,4 +98,4 @@ jobs:
10098
# - name: Check if shell exists and is running
10199
# run: kubectl get pods | grep xpk-def-app-profile-interactive- | grep Running
102100
- name: Stop the shell
103-
run: python3 xpk.py shell stop
101+
run: python3 xpk.py shell stop --cluster ${{inputs.cluster-name}} --zone=${{inputs.zone}}

.github/workflows/build_tests.yaml

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -95,7 +95,7 @@ jobs:
9595
uses: actions/cache@v3
9696
with:
9797
path: |
98-
bin/
98+
usr/local/bin/
9999
~/.cache/pip
100100
${{env.pythonLocation}}
101101
key: xpk-deps-${{ matrix.python-version }}-${{needs.set-variables.outputs.run-id}}
@@ -108,7 +108,8 @@ jobs:
108108
uses: actions/cache/save@v3
109109
with:
110110
path: |
111-
bin/
111+
/usr/local/bin/kubectl-kueue
112+
/usr/local/bin/kubectl-kjob
112113
~/.cache/pip
113114
${{env.pythonLocation}}
114115
key: xpk-deps-${{ matrix.python-version }}-${{needs.set-variables.outputs.run-id}}

.github/workflows/cluster_create.yaml

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -50,12 +50,13 @@ jobs:
5050
with:
5151
python-version: '3.10'
5252
- name: Prepare directories
53-
run: mkdir -p ~/.cache/pip && mkdir bin/
53+
run: mkdir -p ~/.cache/pip
5454
- name: Restore cached dependencies
5555
uses: actions/cache@v4
5656
with:
5757
path: |
58-
bin/
58+
/usr/local/bin/kubectl-kueue
59+
/usr/local/bin/kubectl-kjob
5960
~/.cache/pip
6061
${{env.pythonLocation}}
6162
key: xpk-deps-3.10-${{inputs.run-id}}
@@ -73,10 +74,8 @@ jobs:
7374
run: |
7475
gcloud config set compute/zone us-east4-a
7576
gcloud config get compute/zone
76-
- name: Install xpk dependencies
77-
run: |
78-
echo $PWD/bin >> "$GITHUB_PATH"
7977
- name: Check xpk installation
8078
run: xpk --help
8179
- name: Create a Pathways-enabled XPK Cluster with 2x ${{inputs.tpu-type}} nodepools. Larger num-nodes to avoid master resizing.
82-
run: python xpk.py cluster create-pathways --cluster ${{inputs.cluster-name}} --tpu-type=${{inputs.tpu-type}} --num-slices=1 --zone=${{inputs.zone}} --default-pool-cpu-machine-type=n1-standard-16 --default-pool-cpu-num-nodes=4 --reservation='${{ secrets.GCP_TPU_V4_RESERVATION }}' --enable-gcpfilestore-csi-driver --enable-gcsfuse-csi-driver --custom-cluster-arguments="${CLUSTER_ARGUMENTS}"
80+
run: python xpk.py cluster create-pathways --cluster ${{inputs.cluster-name}} --tpu-type=${{inputs.tpu-type}} --num-slices=1 --zone=${{inputs.zone}} --default-pool-cpu-machine-type=n1-standard-16 --default-pool-cpu-num-nodes=2 --reservation='${{ secrets.GCP_TPU_V4_RESERVATION }}' --enable-gcpfilestore-csi-driver --enable-gcsfuse-csi-driver --custom-cluster-arguments="${CLUSTER_ARGUMENTS}"
81+

.github/workflows/cluster_delete.yaml

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -46,19 +46,17 @@ jobs:
4646
gcloud config set compute/zone us-east4-a
4747
gcloud config get compute/zone
4848
- name: Prepare directories
49-
run: mkdir -p ~/.cache/pip && mkdir bin/
49+
run: mkdir -p ~/.cache/pip
5050
- name: Restore cached dependencies
5151
uses: actions/cache@v4
5252
with:
5353
path: |
54-
bin/
54+
/usr/local/bin/kubectl-kueue
55+
/usr/local/bin/kubectl-kjob
5556
~/.cache/pip
5657
${{env.pythonLocation}}
5758
key: xpk-deps-3.10-${{inputs.run-id}}
5859
restore-keys: xpk-deps-3.10
59-
- name: Install xpk dependencies
60-
run: |
61-
echo $PWD/bin >> "$GITHUB_PATH"
6260
- name: Check xpk installation
6361
run: xpk --help
6462
- name: Delete the cluster created

.github/workflows/cluster_private.yaml

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -58,12 +58,13 @@ jobs:
5858
- name: Verify gcp setup
5959
run: gcloud info
6060
- name: Prepare directories
61-
run: mkdir -p ~/.cache/pip && mkdir bin/
61+
run: mkdir -p ~/.cache/pip
6262
- name: Restore cached dependencies
6363
uses: actions/cache@v4
6464
with:
6565
path: |
66-
bin/
66+
/usr/local/bin/kubectl-kueue
67+
/usr/local/bin/kubectl-kjob
6768
~/.cache/pip
6869
${{env.pythonLocation}}
6970
key: xpk-deps-3.10-${{inputs.run-id}}
@@ -72,9 +73,6 @@ jobs:
7273
run: |
7374
gcloud config set compute/zone us-east4-a
7475
gcloud config get compute/zone
75-
- name: Install xpk dependencies
76-
run: |
77-
echo $PWD/bin >> "$GITHUB_PATH"
7876
- name: Check xpk installation
7977
run: xpk --help
8078
- name: Create a Pathways-enabled private XPK Cluster with 2x ${{inputs.tpu-type}} nodepools. Larger num-nodes to avoid master resizing.

.github/workflows/filestore_tests.yaml

Lines changed: 6 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -76,19 +76,17 @@ jobs:
7676
gcloud config set compute/zone us-east4-a
7777
gcloud config get compute/zone
7878
- name: Prepare directories
79-
run: mkdir -p ~/.cache/pip && mkdir bin/
79+
run: mkdir -p ~/.cache/pip
8080
- name: Restore cached dependencies
8181
uses: actions/cache@v4
8282
with:
8383
path: |
84-
bin/
84+
/usr/local/bin/kubectl-kueue
85+
/usr/local/bin/kubectl-kjob
8586
~/.cache/pip
8687
${{env.pythonLocation}}
8788
key: xpk-deps-3.10-${{inputs.run-id}}
8889
restore-keys: xpk-deps-3.10-
89-
- name: Update PATH
90-
run: |
91-
echo $PWD/bin >> "$GITHUB_PATH"
9290
- name: Verify xpk installation
9391
run: xpk --help
9492
- name: Authenticate Docker
@@ -154,19 +152,17 @@ jobs:
154152
gcloud config set compute/zone us-east4-a
155153
gcloud config get compute/zone
156154
- name: Prepare directories
157-
run: mkdir -p ~/.cache/pip && mkdir bin/
155+
run: mkdir -p ~/.cache/pip
158156
- name: Restore cached dependencies
159157
uses: actions/cache@v4
160158
with:
161159
path: |
162-
bin/
160+
/usr/local/bin/kubectl-kueue
161+
/usr/local/bin/kubectl-kjob
163162
~/.cache/pip
164163
${{env.pythonLocation}}
165164
key: xpk-deps-3.10-${{inputs.run-id}}
166165
restore-keys: xpk-deps-3.10-
167-
- name: Update PATH
168-
run: |
169-
echo $PWD/bin >> "$GITHUB_PATH"
170166
- name: Verify xpk installation
171167
run: xpk --help
172168
- name: Authenticate Docker

.github/workflows/integration_tests.yaml

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -37,12 +37,13 @@ jobs:
3737
- name: Verify gcp setup
3838
run: gcloud info
3939
- name: Prepare directories
40-
run: mkdir -p ~/.cache/pip && mkdir bin/
40+
run: mkdir -p ~/.cache/pip
4141
- name: Restore cached dependencies
4242
uses: actions/cache@v4
4343
with:
4444
path: |
45-
bin/
45+
/usr/local/bin/kubectl-kueue
46+
/usr/local/bin/kubectl-kjob
4647
~/.cache/pip
4748
${{env.pythonLocation}}
4849
key: xpk-deps-3.10-${{inputs.run-id}}

.github/workflows/lint_and_format.yml

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -34,12 +34,13 @@ jobs:
3434
with:
3535
python-version: ${{ matrix.python-version }}
3636
- name: Prepare directories
37-
run: mkdir -p ~/.cache/pip && mkdir bin/
37+
run: mkdir -p ~/.cache/pip
3838
- name: Restore cached dependencies
3939
uses: actions/cache@v4
4040
with:
4141
path: |
42-
bin/
42+
/usr/local/bin/kubectl-kueue
43+
/usr/local/bin/kubectl-kjob
4344
~/.cache/pip
4445
${{env.pythonLocation}}
4546
key: xpk-deps-${{matrix.python-version}}-${{inputs.run-id}}

.github/workflows/nightly_tests.yaml

Lines changed: 24 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -59,51 +59,47 @@ jobs:
5959
run: |
6060
gcloud config set compute/zone us-east4-a
6161
gcloud config get compute/zone
62-
- name: Install xpk dependencies
63-
run: |
64-
make install
65-
echo $PWD/bin >> "$GITHUB_PATH"
6662
- name: Check xpk installation
6763
run: xpk --help
6864
- name: Create an XPK Cluster with zero node pools
69-
run: python xpk.py cluster create --cluster $EMPTY_CLUSTER_NAME --tpu-type=v5p-8 --num-slices=0 --zone=europe-west4-b --default-pool-cpu-machine-type=n1-standard-16 --reservation='${{ secrets.GCP_TPU_V5_RESERVATION }}' --custom-cluster-arguments='${{ secrets.CLUSTER_ARGUMENTS }}'
65+
run: python xpk.py cluster create --cluster $EMPTY_CLUSTER_NAME --tpu-type=v5p-8 --num-slices=0 --zone=us-central2-b --default-pool-cpu-machine-type=n1-standard-16 --reservation='${{ secrets.GCP_TPU_V4_RESERVATION }}' --custom-cluster-arguments='${{ secrets.CLUSTER_ARGUMENTS }}'
7066
- name: Delete the cluster created
71-
run: python xpk.py cluster delete --cluster $EMPTY_CLUSTER_NAME --zone=europe-west4-b --force
67+
run: python xpk.py cluster delete --cluster $EMPTY_CLUSTER_NAME --zone=us-central2-b --force
7268
if: always()
7369
- name: Create a Private XPK Cluster with zero node pools
74-
run: python xpk.py cluster create --cluster $PRIVATE_CLUSTER_NAME --private --tpu-type=v5p-8 --num-slices=0 --zone=europe-west4-b --default-pool-cpu-machine-type=n1-standard-16 --reservation='${{ secrets.GCP_TPU_V5_RESERVATION }}' --custom-cluster-arguments='${CLUSTER_NETWORK_ARGUMENTS}'
70+
run: python xpk.py cluster create --cluster $PRIVATE_CLUSTER_NAME --private --tpu-type=v5p-8 --num-slices=0 --zone=us-central2-b --default-pool-cpu-machine-type=n1-standard-16 --reservation='${{ secrets.GCP_TPU_V4_RESERVATION }}' --custom-cluster-arguments='${CLUSTER_NETWORK_ARGUMENTS}'
7571
- name: Verify the created cluster is private
76-
run: gcloud container clusters describe $PRIVATE_CLUSTER_NAME --region=europe-west4 --format="value(privateClusterConfig.enablePrivateNodes)" | grep 'True' || (echo 'The created cluster is not private.' && exit 1)
72+
run: gcloud container clusters describe $PRIVATE_CLUSTER_NAME --region=us-central2 --format="value(privateClusterConfig.enablePrivateNodes)" | grep 'True' || (echo 'The created cluster is not private.' && exit 1)
7773
- name: Delete the cluster created
78-
run: python xpk.py cluster delete --cluster $PRIVATE_CLUSTER_NAME --zone=europe-west4-b --force
74+
run: python xpk.py cluster delete --cluster $PRIVATE_CLUSTER_NAME --zone=us-central2-b --force
7975
if: always()
8076
- name: Create an XPK Cluster with 2x v5p-8 nodepools
81-
run: python xpk.py cluster create --cluster $TPU_CLUSTER_NAME --tpu-type=v5p-8 --num-slices=2 --zone=europe-west4-b --default-pool-cpu-machine-type=n1-standard-16 --reservation='${{ secrets.GCP_TPU_V5_RESERVATION }}' --custom-cluster-arguments='${{ secrets.CLUSTER_ARGUMENTS }}'
77+
run: python xpk.py cluster create --cluster $TPU_CLUSTER_NAME --tpu-type=v5p-8 --num-slices=2 --zone=us-central2-b --default-pool-cpu-machine-type=n1-standard-16 --reservation='${{ secrets.GCP_TPU_V4_RESERVATION }}' --custom-cluster-arguments='${{ secrets.CLUSTER_ARGUMENTS }}'
8278
- name: Authenticate Docker
8379
run: gcloud auth configure-docker --quiet
8480
- name: Create test script to execute in workloads
8581
run: echo -e '#!/bin/bash \n echo "Hello world from a test script!"' > workload.sh
8682
- name: Run a base-docker-image workload
87-
run: python xpk.py workload create --cluster $TPU_CLUSTER_NAME --workload $WORKLOAD_NAME --command "bash workload.sh" --tpu-type=v5p-8 --num-slices=2 --zone=europe-west4-b
83+
run: python xpk.py workload create --cluster $TPU_CLUSTER_NAME --workload $WORKLOAD_NAME --command "bash workload.sh" --tpu-type=v5p-8 --num-slices=2 --zone=us-central2-b
8884
- name: List out the workloads on the cluster
89-
run: python3 xpk.py workload list --cluster $TPU_CLUSTER_NAME --zone=europe-west4-b
85+
run: python3 xpk.py workload list --cluster $TPU_CLUSTER_NAME --zone=us-central2-b
9086
- name: Run xpk inspector with the workload created above
91-
run: python3 xpk.py inspector --cluster $TPU_CLUSTER_NAME --zone=europe-west4-b --workload $WORKLOAD_NAME
87+
run: python3 xpk.py inspector --cluster $TPU_CLUSTER_NAME --zone=us-central2-b --workload $WORKLOAD_NAME
9288
- name: Wait for workload completion and confirm it succeeded
93-
run: python3 xpk.py workload list --cluster $TPU_CLUSTER_NAME --zone=europe-west4-b --wait-for-job-completion $WORKLOAD_NAME --timeout 300
89+
run: python3 xpk.py workload list --cluster $TPU_CLUSTER_NAME --zone=us-central2-b --wait-for-job-completion $WORKLOAD_NAME --timeout 300
9490
- name: Run xpk info command
95-
run : python3 xpk.py info --cluster $TPU_CLUSTER_NAME --zone=europe-west4-b
91+
run : python3 xpk.py info --cluster $TPU_CLUSTER_NAME --zone=us-central2-b
9692
- name: Delete the workload on the cluster
97-
run: python3 xpk.py workload delete --workload $WORKLOAD_NAME --cluster $TPU_CLUSTER_NAME --zone=europe-west4-b
93+
run: python3 xpk.py workload delete --workload $WORKLOAD_NAME --cluster $TPU_CLUSTER_NAME --zone=us-central2-b
9894
- name: Create test script to execute in batch
9995
run: echo -e '#!/bin/bash \n#SBATCH --unknown-flag=value\n echo "Hello world from a test script!"' > batch.sh
10096
- name: Run a batch job on the cluster
101-
run: python3 xpk.py batch --cluster $TPU_CLUSTER_NAME --zone=europe-west4-b batch.sh --ignore-unknown-flags --array 1-5 --nodes 2 --ntasks 3
97+
run: python3 xpk.py batch --cluster $TPU_CLUSTER_NAME --zone=us-central2-b batch.sh --ignore-unknown-flags --array 1-5 --nodes 2 --ntasks 3
10298
- name: List out the jobs on the cluster
103-
run: python3 xpk.py job ls --cluster $TPU_CLUSTER_NAME --zone=europe-west4-b | grep 'xpk-def-app-profile-slurm-'
99+
run: python3 xpk.py job ls --cluster $TPU_CLUSTER_NAME --zone=us-central2-b | grep 'xpk-def-app-profile-slurm-'
104100
- name: Get created job name
105101
run: |
106-
JOB_NAME=$(python3 xpk.py job ls --cluster $TPU_CLUSTER_NAME --zone=europe-west4-b | grep 'xpk-def-app-profile-slurm-' | head -1 | awk '{print $1}')
102+
JOB_NAME=$(python3 xpk.py job ls --cluster $TPU_CLUSTER_NAME --zone=us-central2-b | grep 'xpk-def-app-profile-slurm-' | head -1 | awk '{print $1}')
107103
echo "JOB_NAME=${JOB_NAME}" >> $GITHUB_ENV
108104
- name: Check job spec
109105
run: |
@@ -115,7 +111,7 @@ jobs:
115111
run: python3 xpk.py job info ${JOB_NAME} | grep -e "Entrypoint environment variables template:" -e "Job name:" -e "Labels:" -e "Mounts:" -e "Pods:" -e "Profile:" -e "Script name:" | wc -l | grep "7"
116112
- name: Cancel the batch job on the cluster
117113
run: |
118-
python3 xpk.py job cancel ${JOB_NAME} --cluster $TPU_CLUSTER_NAME --zone=europe-west4-b | grep "job.batch/${JOB_NAME} deleted"
114+
python3 xpk.py job cancel ${JOB_NAME} --cluster $TPU_CLUSTER_NAME --zone=us-central2-b | grep "job.batch/${JOB_NAME} deleted"
119115
- name: Create shell and exit it immidiatelly
120116
run: |
121117
cat <<'EOF' >> create-shell.exp
@@ -132,7 +128,7 @@ jobs:
132128
run: python3 xpk.py shell stop
133129
- name: Delete the cluster created
134130
if: always()
135-
run: python xpk.py cluster delete --cluster $TPU_CLUSTER_NAME --zone=europe-west4-b --force
131+
run: python xpk.py cluster delete --cluster $TPU_CLUSTER_NAME --zone=us-central2-b --force
136132

137133
pw-cluster-and-workload:
138134
runs-on: [ubuntu-22.04]
@@ -162,18 +158,18 @@ jobs:
162158
- name: Check xpk installation
163159
run: xpk --help
164160
- name: Create an Pathways-enabled XPK Cluster with 2 x v5p-8 nodepools
165-
run: python xpk.py cluster create-pathways --cluster $PATHWAYS_TPU_CLUSTER_NAME --tpu-type=v5p-8 --num-slices=2 --zone=europe-west4-b --default-pool-cpu-machine-type=n1-standard-16 --default-pool-cpu-num-nodes=4 --reservation='${{ secrets.GCP_TPU_V5_RESERVATION }}' --custom-cluster-arguments="${CLUSTER_NETWORK_ARGUMENTS}"
161+
run: python xpk.py cluster create-pathways --cluster $PATHWAYS_TPU_CLUSTER_NAME --tpu-type=v5p-8 --num-slices=2 --zone=us-central2-b --default-pool-cpu-machine-type=n1-standard-16 --default-pool-cpu-num-nodes=4 --reservation='${{ secrets.GCP_TPU_V4_RESERVATION }}' --custom-cluster-arguments="${CLUSTER_NETWORK_ARGUMENTS}"
166162
- name: Create test script to execute in workloads
167163
run: echo -e '#!/bin/bash \n echo "Hello world from a test script!"' > workload.sh
168164
- name: Run a Pathways workload on Ubuntu base image
169-
run: python xpk.py workload create-pathways --cluster $PATHWAYS_TPU_CLUSTER_NAME --workload $PATHWAYS_WORKLOAD_NAME --docker-image='marketplace.gcr.io/google/ubuntu2004' --tpu-type=v5p-8 --num-slices=2 --zone=europe-west4-b --command "echo \"Hello world from a test script! \""
165+
run: python xpk.py workload create-pathways --cluster $PATHWAYS_TPU_CLUSTER_NAME --workload $PATHWAYS_WORKLOAD_NAME --docker-image='marketplace.gcr.io/google/ubuntu2004' --tpu-type=v5p-8 --num-slices=2 --zone=us-central2-b --command "echo \"Hello world from a test script! \""
170166
- name: Wait for Pathways workload completion and confirm it succeeded
171-
run: python3 xpk.py workload list --cluster $PATHWAYS_TPU_CLUSTER_NAME --zone=europe-west4-b --wait-for-job-completion $PATHWAYS_WORKLOAD_NAME --timeout 300
167+
run: python3 xpk.py workload list --cluster $PATHWAYS_TPU_CLUSTER_NAME --zone=us-central2-b --wait-for-job-completion $PATHWAYS_WORKLOAD_NAME --timeout 300
172168
- name: Delete the Pathways workload on the cluster
173-
run: python3 xpk.py workload delete --workload $PATHWAYS_WORKLOAD_NAME --cluster $PATHWAYS_TPU_CLUSTER_NAME --zone=europe-west4-b
169+
run: python3 xpk.py workload delete --workload $PATHWAYS_WORKLOAD_NAME --cluster $PATHWAYS_TPU_CLUSTER_NAME --zone=us-central2-b
174170
- name: Delete the Pathways cluster created
175171
if: always()
176-
run: python xpk.py cluster delete --cluster $PATHWAYS_TPU_CLUSTER_NAME --zone=europe-west4-b --force
172+
run: python xpk.py cluster delete --cluster $PATHWAYS_TPU_CLUSTER_NAME --zone=us-central2-b --force
177173

178174
rc-cluster:
179175
runs-on: [ubuntu-22.04]
@@ -203,10 +199,10 @@ jobs:
203199
- name: Check xpk installation
204200
run: xpk --help
205201
- name: Create a RayCluster-enabled XPK Cluster with 2 x v5p-8 nodepools
206-
run: python xpk.py cluster create-ray --cluster $RAYCLUSTER_TPU_CLUSTER_NAME --tpu-type=v5p-8 --num-slices=2 --zone=europe-west4-b --ray-version=2.39.0 --default-pool-cpu-machine-type=n1-standard-16 --default-pool-cpu-num-nodes=4 --reservation='${{ secrets.GCP_TPU_V5_RESERVATION }}' --custom-cluster-arguments='${{ secrets.CLUSTER_ARGUMENTS}}'
202+
run: python xpk.py cluster create-ray --cluster $RAYCLUSTER_TPU_CLUSTER_NAME --tpu-type=v5p-8 --num-slices=2 --zone=us-central2-b --ray-version=2.39.0 --default-pool-cpu-machine-type=n1-standard-16 --default-pool-cpu-num-nodes=4 --reservation='${{ secrets.GCP_TPU_V4_RESERVATION }}' --custom-cluster-arguments='${{ secrets.CLUSTER_ARGUMENTS}}'
207203
- name: Delete the RayCluster-enabled XPK cluster
208204
if: always()
209-
run: python xpk.py cluster delete --cluster $RAYCLUSTER_TPU_CLUSTER_NAME --zone=europe-west4-b
205+
run: python xpk.py cluster delete --cluster $RAYCLUSTER_TPU_CLUSTER_NAME --zone=us-central2-b
210206

211207

212208

.github/workflows/unit_tests.yaml

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -28,12 +28,13 @@ jobs:
2828
with:
2929
python-version: '3.10'
3030
- name: Prepare directories
31-
run: mkdir -p ~/.cache/pip && mkdir bin/
31+
run: mkdir -p ~/.cache/pip
3232
- name: Restore cached dependencies
3333
uses: actions/cache@v4
3434
with:
3535
path: |
36-
bin/
36+
/usr/local/bin/kubectl-kueue
37+
/usr/local/bin/kubectl-kjob
3738
~/.cache/pip
3839
${{env.pythonLocation}}
3940
key: xpk-deps-3.10-${{inputs.run-id}}

0 commit comments

Comments
 (0)