Skip to content

Commit c7b4958

Browse files
Kjob storage configuration (#372)
* Kjob storage configuration --------- Co-authored-by: pawloch00 <[email protected]>
1 parent 2c6642c commit c7b4958

File tree

19 files changed

+616
-104
lines changed

19 files changed

+616
-104
lines changed

.github/workflows/build_tests.yaml

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -200,9 +200,22 @@ jobs:
200200
location: ${{needs.set-variables.outputs.location}}
201201
run-id: ${{needs.set-variables.outputs.run-id}}
202202
secrets: inherit
203+
fuse-tests:
204+
needs: [filestore-tests, set-variables]
205+
uses: ./.github/workflows/fuse_tests.yaml
206+
concurrency: # Only one build or nightly test run is supported at a time for now.
207+
group: fuse-tests-${{needs.set-variables.outputs.run-id}}
208+
cancel-in-progress: true
209+
with:
210+
cluster-name: ${{needs.set-variables.outputs.cluster-name}}
211+
tpu-type: ${{needs.set-variables.outputs.tpu-type}}
212+
zone: ${{needs.set-variables.outputs.zone}}
213+
location: ${{needs.set-variables.outputs.location}}
214+
run-id: ${{needs.set-variables.outputs.run-id}}
215+
secrets: inherit
203216
cluster-delete:
204217
if: always()
205-
needs: [workloads-tests, batch-tests, set-variables, filestore-tests]
218+
needs: [set-variables, fuse-tests]
206219
uses: ./.github/workflows/cluster_delete.yaml
207220
with:
208221
cluster-name: ${{needs.set-variables.outputs.cluster-name}}

.github/workflows/filestore_tests.yaml

Lines changed: 117 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
# See the License for the specific language governing permissions and
1313
# limitations under the License
1414

15-
name: Storage Tests
15+
name: Filestore Tests
1616

1717
on:
1818
workflow_call:
@@ -34,23 +34,16 @@ on:
3434
run-id:
3535
required: true
3636
type: string
37-
env:
38-
# Names must be unique in parallel running tests.
39-
TPU_FILESTORE_CLUSTER_NAME: ${{inputs.cluster-name}}-fs-attach
40-
TPU_FILESTORE_CLUSTER_NAME_CREATE: ${{inputs.cluster-name}}-create
41-
STORAGE_NAME_CREATE: ${{inputs.cluster-name}}-test
42-
FS_STORAGE_NAME: ${{secrets.INSTANCE_NAME}}-test-${{inputs.run-id}}
43-
FS_DELETE_WORKLOAD: "fs-delete-workload"
44-
FS_READ_WORKLOAD: "fs-read-workload"
45-
FS_WRITE_WORKLOAD: "fs-write-workload"
46-
FS_DELETE_WORKLOAD_CREATE: "fs-delete-workload-create"
47-
FS_READ_WORKLOAD_CREATE: "fs-read-workload-create"
48-
FS_WRITE_WORKLOAD_CREATE: "fs-write-workload-create"
49-
CLUSTER_ARGUMENTS: "--network=${{secrets.NETWORK_NAME}} --subnetwork=${{secrets.SUBNETWORK_NAME}} --maintenance-window=23:50"
5037

5138
jobs:
52-
run-filestore-workload:
39+
run-filestore:
5340
runs-on: [ubuntu-22.04]
41+
env:
42+
# Names must be unique across tests that run in parallel.
43+
FS_STORAGE_NAME: ${{secrets.INSTANCE_NAME}}-test-${{inputs.run-id}}
44+
FS_WRITE_WORKLOAD: "fs-write-workload"
45+
FS_READ_WORKLOAD: "fs-read-workload"
46+
FS_DELETE_WORKLOAD: "fs-delete-workload"
5447
steps:
5548
- uses: actions/checkout@v4
5649
- uses: actions/setup-python@v5
@@ -87,20 +80,24 @@ jobs:
8780
${{env.pythonLocation}}
8881
key: xpk-deps-3.10-${{inputs.run-id}}
8982
restore-keys: xpk-deps-3.10-
83+
- name: Install expect package
84+
run: sudo apt-get install expect
9085
- name: Verify xpk installation
9186
run: xpk --help
9287
- name: Authenticate Docker
9388
run: gcloud auth configure-docker --quiet
9489
- name: Fill Filestore manifest file
9590
run: |
96-
sed -i 's/PROJECT_NAME/${{secrets.PROJECT_NAME}}/g; s/ZONE/${{inputs.zone}}/g; s/INSTANCE_NAME/${{secrets.INSTANCE_NAME}}/g; s/VOL_NAME/${{secrets.VOL_NAME}}/g; s/IP_ADDRESS/${{secrets.IP_ADDRESS}}/g' ./tests/data/fs-manifest.yaml
91+
sed -i 's/PROJECT_NAME/${{secrets.PROJECT_NAME}}/g; s/ZONE/${{inputs.zone}}/g; s/INSTANCE_NAME/${{secrets.INSTANCE_NAME}}/g; s/VOL_NAME/${{secrets.VOL_NAME}}/g; s/IP_ADDRESS/${{secrets.IP_ADDRESS}}/g' ./tests/data/fs-manifest.yaml
9792
- name: Attach auto-mount GCP Filestore Storage instance
9893
run: |
9994
python3 xpk.py storage attach $FS_STORAGE_NAME --cluster=${{inputs.cluster-name}} --zone=${{inputs.zone}} --type=gcpfilestore \
10095
--auto-mount=true \
10196
--mount-point='/fs-test-mount-point' --readonly=false --manifest='./tests/data/fs-manifest.yaml'
10297
- name: List and verify existing Storages
10398
run: python3 xpk.py storage list --cluster ${{inputs.cluster-name}} --zone=${{inputs.zone}} | tee output.txt | grep $FS_STORAGE_NAME || (echo 'No storage found' && exit 143)
99+
- name: Verify VolumeBundle created
100+
run: kubectl get volumebundle $FS_STORAGE_NAME -o jsonpath='{.spec.containerVolumeMounts[0].mountPath}' | grep '/fs-test-mount-point'
104101
- name: Run workload to write file on filestore
105102
run: python3 xpk.py workload create --workload $FS_WRITE_WORKLOAD --docker-image='marketplace.gcr.io/google/ubuntu2004' --command "mkdir /fs-test-mount-point/$RANDOM_SEED/ && echo 'Test text message' > /fs-test-mount-point/$RANDOM_SEED/test.txt || (echo 'Writing to filestore failed' && exit 143)" --cluster ${{inputs.cluster-name}} --tpu-type=${{inputs.tpu-type}} --zone ${{inputs.location}}
106103
- name: Wait for writer workload completion and confirm it succeeded
@@ -109,12 +106,51 @@ jobs:
109106
run: python3 xpk.py workload create --workload $FS_READ_WORKLOAD --command "grep 'Test text message' /fs-test-mount-point/$RANDOM_SEED/test.txt || (echo 'Reading from filestore failed' && exit 143)" --cluster ${{inputs.cluster-name}} --tpu-type=${{inputs.tpu-type}} --zone ${{inputs.location}}
110107
- name: Wait for reader workload completion and confirm it succeeded
111108
run: python3 xpk.py workload list --cluster ${{inputs.cluster-name}} --zone=${{inputs.zone}} --wait-for-job-completion $FS_READ_WORKLOAD --timeout 300
109+
- name: Create batch-read.sh script
110+
run: |
111+
cat <<EOF > batch-read.sh
112+
#!/bin/bash
113+
grep 'Test text message' /fs-test-mount-point/$RANDOM_SEED/test.txt || (echo 'Reading from filestore failed' && exit 143)
114+
EOF
115+
- name: Run a batch-read job on the cluster
116+
run: python3 xpk.py batch --cluster ${{inputs.cluster-name}} --zone=${{inputs.zone}} batch-read.sh | tee batch-read.log
117+
- name: Get job name
118+
run: |
119+
READ_JOB_NAME=$(cat batch-read.log | grep 'xpk-def-app-profile-slurm-' | awk -F': ' '{print $2}')
120+
echo "READ_JOB_NAME=${READ_JOB_NAME}" >> $GITHUB_ENV
121+
- name: Wait for the batch-read job to finish
122+
run: kubectl wait job.batch/$READ_JOB_NAME --for=condition=Complete --timeout=1m
123+
- name: Cancel the batch-read job
124+
run: python3 xpk.py job cancel $READ_JOB_NAME --cluster ${{inputs.cluster-name}} --zone=${{inputs.zone}} | grep "job.batch/$READ_JOB_NAME deleted"
125+
- name: Delete batch-read.log file
126+
run: rm batch-read.log
127+
- name: Delete batch-read.sh file
128+
run: rm batch-read.sh
129+
- name: Create shell and exit it immediately
130+
run: |
131+
cat <<EOF >> create-shell.exp
132+
#!/usr/bin/expect
133+
spawn python3 xpk.py shell --cluster ${{inputs.cluster-name}} --zone=${{inputs.zone}}
134+
expect "/ # "
135+
send "cat /fs-test-mount-point/$RANDOM_SEED/test.txt\n"
136+
expect "Test text message"
137+
send "exit\n"
138+
EOF
139+
chmod +x ./create-shell.exp
140+
expect ./create-shell.exp
141+
- name: Stop the shell
142+
run: python3 xpk.py shell stop --cluster ${{inputs.cluster-name}} --zone=${{inputs.zone}}
143+
- name: Delete create-shell.exp file
144+
run: rm create-shell.exp
112145
- name: Run workload to delete file on filestore
113146
run: python3 xpk.py workload create --workload $FS_DELETE_WORKLOAD --command "rm -rf /fs-test-mount-point/$RANDOM_SEED/test.txt || exit 143" --cluster ${{inputs.cluster-name}} --tpu-type=${{inputs.tpu-type}} --zone ${{inputs.location}}
114147
- name: Wait for delete workload completion and confirm it succeeded
115148
run: python3 xpk.py workload list --cluster ${{inputs.cluster-name}} --zone=${{inputs.zone}} --wait-for-job-completion $FS_DELETE_WORKLOAD --timeout 300
116149
- name: Delete storage
117150
run: python3 xpk.py storage delete $FS_STORAGE_NAME --zone=${{inputs.zone}} --cluster=${{inputs.cluster-name}}
151+
- name: Verify VolumeBundle deleted
152+
run: |
153+
! kubectl get volumebundle | grep $FS_STORAGE_NAME
118154
- name: Delete the writer workload on the cluster
119155
if: always()
120156
run: python3 xpk.py workload delete --workload $FS_WRITE_WORKLOAD --cluster ${{inputs.cluster-name}} --zone=${{inputs.zone}}
@@ -124,9 +160,16 @@ jobs:
124160
- name: Delete the delete workload on the cluster
125161
if: always()
126162
run: python3 xpk.py workload delete --workload $FS_DELETE_WORKLOAD --cluster ${{inputs.cluster-name}} --zone=${{inputs.zone}}
127-
128-
filestore-create:
163+
164+
run-filestore-create:
165+
needs: [run-filestore]
129166
runs-on: [ubuntu-22.04]
167+
env:
168+
# Names must be unique across tests that run in parallel.
169+
FS_STORAGE_NAME: ${{inputs.cluster-name}}-test
170+
FS_WRITE_WORKLOAD: "fs-write-workload-create"
171+
FS_READ_WORKLOAD: "fs-read-workload-create"
172+
FS_DELETE_WORKLOAD: "fs-delete-workload-create"
130173
steps:
131174
- uses: actions/checkout@v4
132175
- uses: actions/setup-python@v5
@@ -163,40 +206,83 @@ jobs:
163206
${{env.pythonLocation}}
164207
key: xpk-deps-3.10-${{inputs.run-id}}
165208
restore-keys: xpk-deps-3.10-
209+
- name: Install expect package
210+
run: sudo apt-get install expect
166211
- name: Verify xpk installation
167212
run: xpk --help
168213
- name: Authenticate Docker
169214
run: gcloud auth configure-docker --quiet
170215
- name: Create auto-mount GCP Filestore Storage instance
171216
run: |
172-
python3 xpk.py storage create $STORAGE_NAME_CREATE --cluster=${{inputs.cluster-name}} --zone=${{inputs.zone}} --type=gcpfilestore \
217+
python3 xpk.py storage create $FS_STORAGE_NAME --cluster=${{inputs.cluster-name}} --zone=${{inputs.zone}} --type=gcpfilestore \
173218
--auto-mount=true --vol=vol1 --size=1024 --tier=BASIC_HDD \
174219
--mount-point='/fs-test-mount-point' --readonly=false
175220
- name: List and verify existing Storages
176-
run: python3 xpk.py storage list --cluster ${{inputs.cluster-name}} --zone=${{inputs.zone}} | tee output.txt | grep $STORAGE_NAME_CREATE || (echo 'No storage found' && exit 143)
221+
run: python3 xpk.py storage list --cluster ${{inputs.cluster-name}} --zone=${{inputs.zone}} | tee output.txt | grep $FS_STORAGE_NAME || (echo 'No storage found' && exit 143)
222+
- name: Verify VolumeBundle created
223+
run: kubectl get volumebundle $FS_STORAGE_NAME -o jsonpath='{.spec.containerVolumeMounts[0].mountPath}' | grep '/fs-test-mount-point'
177224
- name: Run workload to write file on filestore
178-
run : python3 xpk.py workload create --workload $FS_WRITE_WORKLOAD_CREATE --num-slices=1 --docker-image='marketplace.gcr.io/google/ubuntu2004' --command "mkdir /fs-test-mount-point/$RANDOM_SEED/ && echo 'Test text message' > /fs-test-mount-point/$RANDOM_SEED/test.txt || (echo 'Writing to filestore failed' && exit 143)" --cluster ${{inputs.cluster-name}} --tpu-type=${{inputs.tpu-type}} --zone ${{inputs.zone}}
225+
run: python3 xpk.py workload create --workload $FS_WRITE_WORKLOAD --num-slices=1 --docker-image='marketplace.gcr.io/google/ubuntu2004' --command "mkdir /fs-test-mount-point/$RANDOM_SEED/ && echo 'Test text message' > /fs-test-mount-point/$RANDOM_SEED/test.txt || (echo 'Writing to filestore failed' && exit 143)" --cluster ${{inputs.cluster-name}} --tpu-type=${{inputs.tpu-type}} --zone ${{inputs.zone}}
179226
- name: Wait for writer workload completion and confirm it succeeded
180-
run: python3 xpk.py workload list --cluster ${{inputs.cluster-name}} --zone=${{inputs.zone}} --wait-for-job-completion $FS_WRITE_WORKLOAD_CREATE --timeout 300
227+
run: python3 xpk.py workload list --cluster ${{inputs.cluster-name}} --zone=${{inputs.zone}} --wait-for-job-completion $FS_WRITE_WORKLOAD --timeout 300
181228
- name: Run workload to read file on filestore
182-
run : python3 xpk.py workload create --workload $FS_READ_WORKLOAD_CREATE --num-slices=1 --command "grep 'Test text message' /fs-test-mount-point/$RANDOM_SEED/test.txt || (echo 'Reading from filestore failed' && exit 143)" --cluster ${{inputs.cluster-name}} --tpu-type=${{inputs.tpu-type}} --zone ${{inputs.zone}}
229+
run: python3 xpk.py workload create --workload $FS_READ_WORKLOAD --num-slices=1 --command "grep 'Test text message' /fs-test-mount-point/$RANDOM_SEED/test.txt || (echo 'Reading from filestore failed' && exit 143)" --cluster ${{inputs.cluster-name}} --tpu-type=${{inputs.tpu-type}} --zone ${{inputs.zone}}
183230
- name: Wait for reader workload completion and confirm it succeeded
184-
run: python3 xpk.py workload list --cluster ${{inputs.cluster-name}} --zone=${{inputs.zone}} --wait-for-job-completion $FS_READ_WORKLOAD_CREATE --timeout 300
231+
run: python3 xpk.py workload list --cluster ${{inputs.cluster-name}} --zone=${{inputs.zone}} --wait-for-job-completion $FS_READ_WORKLOAD --timeout 300
232+
- name: Create batch-read.sh script
233+
run: |
234+
cat <<EOF > batch-read.sh
235+
#!/bin/bash
236+
grep 'Test text message' /fs-test-mount-point/$RANDOM_SEED/test.txt || (echo 'Reading from filestore failed' && exit 143)
237+
EOF
238+
- name: Run a batch-read job on the cluster
239+
run: python3 xpk.py batch --cluster ${{inputs.cluster-name}} --zone=${{inputs.zone}} batch-read.sh | tee batch-read.log
240+
- name: Get job name
241+
run: |
242+
READ_JOB_NAME=$(cat batch-read.log | grep 'xpk-def-app-profile-slurm-' | awk -F': ' '{print $2}')
243+
echo "READ_JOB_NAME=${READ_JOB_NAME}" >> $GITHUB_ENV
244+
- name: Wait for the batch-read job to finish
245+
run: kubectl wait job.batch/$READ_JOB_NAME --for=condition=Complete --timeout=1m
246+
- name: Cancel the batch-read job
247+
run: python3 xpk.py job cancel $READ_JOB_NAME --cluster ${{inputs.cluster-name}} --zone=${{inputs.zone}} | grep "job.batch/$READ_JOB_NAME deleted"
248+
- name: Delete batch-read.log file
249+
run: rm batch-read.log
250+
- name: Delete batch-read.sh file
251+
run: rm batch-read.sh
252+
- name: Create shell and exit it immediately
253+
run: |
254+
cat <<EOF >> create-shell.exp
255+
#!/usr/bin/expect
256+
spawn python3 xpk.py shell --cluster ${{inputs.cluster-name}} --zone=${{inputs.zone}}
257+
expect "/ # "
258+
send "cat /fs-test-mount-point/$RANDOM_SEED/test.txt\n"
259+
expect "Test text message"
260+
send "exit\n"
261+
EOF
262+
chmod +x ./create-shell.exp
263+
expect ./create-shell.exp
264+
- name: Stop the shell
265+
run: python3 xpk.py shell stop --cluster ${{inputs.cluster-name}} --zone=${{inputs.zone}}
266+
- name: Delete create-shell.exp file
267+
run: rm create-shell.exp
185268
- name: Run workload to delete file on filestore
186-
run : python3 xpk.py workload create --workload $FS_DELETE_WORKLOAD_CREATE --command "rm -rf /fs-test-mount-point/$RANDOM_SEED/test.txt || exit 143" --num-slices=1 --cluster ${{inputs.cluster-name}} --tpu-type=${{inputs.tpu-type}} --zone ${{inputs.zone}}
269+
run: python3 xpk.py workload create --workload $FS_DELETE_WORKLOAD --command "rm -rf /fs-test-mount-point/$RANDOM_SEED/test.txt || exit 143" --num-slices=1 --cluster ${{inputs.cluster-name}} --tpu-type=${{inputs.tpu-type}} --zone ${{inputs.zone}}
187270
- name: Wait for delete workload completion and confirm it succeeded
188-
run: python3 xpk.py workload list --cluster ${{inputs.cluster-name}} --zone=${{inputs.zone}} --wait-for-job-completion $FS_DELETE_WORKLOAD_CREATE --timeout 300
271+
run: python3 xpk.py workload list --cluster ${{inputs.cluster-name}} --zone=${{inputs.zone}} --wait-for-job-completion $FS_DELETE_WORKLOAD --timeout 300
189272
- name: Delete storage
190-
run: python3 xpk.py storage delete $STORAGE_NAME_CREATE --zone=${{inputs.zone}} --cluster=${{inputs.cluster-name}}
273+
run: python3 xpk.py storage delete $FS_STORAGE_NAME --zone=${{inputs.zone}} --cluster=${{inputs.cluster-name}}
274+
- name: Verify VolumeBundle deleted
275+
run: |
276+
! kubectl get volumebundle | grep $FS_STORAGE_NAME
191277
- name: Delete the filestore instance
192278
if: always()
193-
run: gcloud filestore instances delete $STORAGE_NAME_CREATE --zone=${{inputs.zone}} --force
279+
run: gcloud filestore instances delete $FS_STORAGE_NAME --zone=${{inputs.zone}} --force
194280
- name: Delete the writer workload on the cluster
195281
if: always()
196-
run: python3 xpk.py workload delete --workload $FS_WRITE_WORKLOAD_CREATE --cluster ${{inputs.cluster-name}} --zone=${{inputs.zone}}
282+
run: python3 xpk.py workload delete --workload $FS_WRITE_WORKLOAD --cluster ${{inputs.cluster-name}} --zone=${{inputs.zone}}
197283
- name: Delete the reader workload on the cluster
198284
if: always()
199-
run: python3 xpk.py workload delete --workload $FS_READ_WORKLOAD_CREATE --cluster ${{inputs.cluster-name}} --zone=${{inputs.zone}}
285+
run: python3 xpk.py workload delete --workload $FS_READ_WORKLOAD --cluster ${{inputs.cluster-name}} --zone=${{inputs.zone}}
200286
- name: Delete the delete workload on the cluster
201287
if: always()
202-
run: python3 xpk.py workload delete --workload $FS_DELETE_WORKLOAD_CREATE --cluster ${{inputs.cluster-name}} --zone=${{inputs.zone}}
288+
run: python3 xpk.py workload delete --workload $FS_DELETE_WORKLOAD --cluster ${{inputs.cluster-name}} --zone=${{inputs.zone}}

0 commit comments

Comments
 (0)