Skip to content

Commit c919384

Browse files
scaliby authored and ycchenzheng committed
Merge branch 'develop' into chzheng/docker_image_flag
2 parents 813afa0 + b1637d4 commit c919384

File tree

101 files changed

+2104
-838
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

101 files changed

+2104
-838
lines changed

.dockerignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
*__pycache__*
55
tmp/
66
.pytype
7+
.mypy_cache
78
# Byte-compiled / optimized / DLL files
89
__pycache__/
910
*.py[cod]

.github/CODEOWNERS

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,2 +1 @@
1-
* @Obliviour @44past4 @sharabiani @pawloch00 @BluValor @gcie @RoshaniN @scaliby @jamOne- @SikaGrr @FIoannides @fatoshoti
2-
slice/ @mwysokin @mimowo @gabesaba @PBundyra @mwielgus @pajakd
1+
* @scaliby @jamOne- @SikaGrr @FIoannides

.github/workflows/build_tests.yaml

Lines changed: 6 additions & 121 deletions
Original file line numberDiff line numberDiff line change
@@ -24,8 +24,6 @@ on:
2424
type: choice
2525
options:
2626
- v4-8
27-
push:
28-
branches: ["main","develop"]
2927
pull_request: # By default this runs for types assigned, opened and synchronize.
3028

3129
jobs:
@@ -35,13 +33,6 @@ jobs:
3533
group: set-variables-${{ github.event.number}}
3634
cancel-in-progress: true
3735
outputs:
38-
cluster-name: ${{ steps.set-cluster-name.outputs.cluster-name }}
39-
cluster-name-dws: ${{ steps.set-cluster-name-dws.outputs.cluster-name-dws }}
40-
group-name: ${{ steps.set-group-name.outputs.group-name }}
41-
zone: ${{ steps.set-zone.outputs.zone }}
42-
tpu-type: ${{ steps.set-tpu-type.outputs.tpu-type }}
43-
tpu-type-topology: ${{ steps.set-tpu-type-topology.outputs.tpu-type-topology }}
44-
location: ${{steps.set-location.outputs.location}}
4536
run-id: ${{steps.set-run-id.outputs.run-id}}
4637
steps:
4738
- name: set run-id
@@ -57,34 +48,6 @@ jobs:
5748
RUN_ID="${{ github.event.number }}"
5849
fi
5950
echo run-id=$RUN_ID >> $GITHUB_OUTPUT
60-
- name: set cluster-name
61-
id: set-cluster-name
62-
run: |
63-
echo cluster-name=build-xpk-2-nodepools-${{steps.set-run-id.outputs.run-id}} >> $GITHUB_OUTPUT
64-
- name: set cluster-name-dws
65-
id: set-cluster-name-dws
66-
run: |
67-
echo cluster-name-dws=build-xpk-2-nodepools-dws-${{steps.set-run-id.outputs.run-id}} >> $GITHUB_OUTPUT
68-
- name: set group-name
69-
id: set-group-name
70-
run: |
71-
echo group-name=xpk-${{steps.set-run-id.outputs.run-id}} >> $GITHUB_OUTPUT
72-
- name: set zone
73-
id: set-zone
74-
run: |
75-
echo zone=us-central2-b >> $GITHUB_OUTPUT
76-
- name: set tpu-type
77-
id: set-tpu-type
78-
run: |
79-
echo tpu-type=v4-8 >> $GITHUB_OUTPUT
80-
- name: set tpu-type-topology
81-
id: set-tpu-type-topology
82-
run: |
83-
echo tpu-type-topology=v4-2x2x1 >> $GITHUB_OUTPUT
84-
- name: set location
85-
id: set-location
86-
run: |
87-
echo location=us-central2 >> $GITHUB_OUTPUT
8851
install-dependencies:
8952
needs: [set-variables]
9053
runs-on: ubuntu-22.04
@@ -112,7 +75,7 @@ jobs:
11275
lookup-only: true
11376
- name: install dependencies
11477
if : steps.check-cache.outputs.cache-hit != 'true'
115-
run: make install-lint && make install-dev && cp ./bin/kubectl-kueue /usr/local/bin/kubectl-kueue && cp ./bin/kubectl-kjob /usr/local/bin/kubectl-kjob
78+
run: make install-dev && cp ./bin/kubectl-kueue /usr/local/bin/kubectl-kueue && cp ./bin/kubectl-kjob /usr/local/bin/kubectl-kjob
11679
- name: Cache dependencies
11780
if : steps.check-cache.outputs.cache-hit != 'true'
11881
uses: actions/cache/save@v3
@@ -131,6 +94,11 @@ jobs:
13194
uses: ./.github/workflows/reusable_lint_and_format.yml
13295
with:
13396
run-id: '${{needs.set-variables.outputs.run-id}}'
97+
verify-goldens:
98+
needs: [install-dependencies, set-variables]
99+
uses: ./.github/workflows/reusable_goldens.yaml
100+
with:
101+
run-id: '${{needs.set-variables.outputs.run-id}}'
134102
run-unit-tests:
135103
needs: [install-dependencies, set-variables]
136104
uses: ./.github/workflows/reusable_unit_tests.yaml
@@ -139,86 +107,3 @@ jobs:
139107
concurrency: # We support one build or nightly test to run at a time currently.
140108
group: unit-tests-${{needs.set-variables.outputs.run-id}}
141109
cancel-in-progress: true
142-
run-integration-tests:
143-
needs: [install-dependencies, set-variables]
144-
uses: ./.github/workflows/reusable_integration_tests.yaml
145-
with:
146-
run-id: '${{needs.set-variables.outputs.run-id}}'
147-
concurrency: # We support one build or nightly test to run at a time currently.
148-
group: integration-tests-${{needs.set-variables.outputs.run-id}}
149-
cancel-in-progress: true
150-
secrets: inherit
151-
cluster-private:
152-
needs: [linter, run-unit-tests, run-integration-tests, set-variables]
153-
uses: ./.github/workflows/reusable_cluster_private.yaml
154-
concurrency: # We support one build or nightly test to run at a time currently.
155-
group: cluster-private-${{needs.set-variables.outputs.run-id}}
156-
cancel-in-progress: true
157-
with:
158-
run-id: '${{needs.set-variables.outputs.run-id}}'
159-
cluster-name: '${{needs.set-variables.outputs.cluster-name}}'
160-
tpu-type: '${{needs.set-variables.outputs.tpu-type-topology || inputs.tpu-type}}'
161-
zone: '${{needs.set-variables.outputs.zone}}'
162-
location: '${{needs.set-variables.outputs.location}}'
163-
secrets: inherit
164-
cluster-create:
165-
needs: [linter, run-unit-tests, run-integration-tests, set-variables]
166-
concurrency: # We support one build or nightly test to run at a time currently.
167-
group: cluster-create-${{needs.set-variables.outputs.run-id}}
168-
cancel-in-progress: true
169-
uses: ./.github/workflows/reusable_cluster_create.yaml
170-
with:
171-
cluster-name-dws: '${{needs.set-variables.outputs.cluster-name-dws}}'
172-
cluster-name: '${{needs.set-variables.outputs.cluster-name}}'
173-
tpu-type: '${{needs.set-variables.outputs.tpu-type-topology || inputs.tpu-type}}'
174-
zone: '${{needs.set-variables.outputs.zone}}'
175-
location: '${{needs.set-variables.outputs.location}}'
176-
run-id: '${{needs.set-variables.outputs.run-id}}'
177-
secrets: inherit
178-
workloads-tests:
179-
needs: [cluster-create, set-variables]
180-
uses: ./.github/workflows/reusable_workload_tests.yaml
181-
concurrency: # We support one build or nightly test to run at a time currently.
182-
group: workload-tests-${{needs.set-variables.outputs.run-id}}
183-
cancel-in-progress: true
184-
with:
185-
cluster-name: ${{needs.set-variables.outputs.cluster-name}}
186-
cluster-name-dws: '${{needs.set-variables.outputs.cluster-name-dws}}'
187-
tpu-type: ${{needs.set-variables.outputs.tpu-type}}
188-
tpu-type-topology: ${{needs.set-variables.outputs.tpu-type-topology}}
189-
zone: ${{needs.set-variables.outputs.zone}}
190-
run-id: '${{needs.set-variables.outputs.run-id}}'
191-
secrets: inherit
192-
batch-tests:
193-
needs: [cluster-create, set-variables]
194-
uses: ./.github/workflows/reusable_batch_tests.yaml
195-
concurrency: # We support one build or nightly test to run at a time currently.
196-
group: batch-tests-${{needs.set-variables.outputs.run-id}}
197-
cancel-in-progress: true
198-
with:
199-
cluster-name: ${{needs.set-variables.outputs.cluster-name}}
200-
zone: ${{needs.set-variables.outputs.zone}}
201-
run-id: ${{needs.set-variables.outputs.run-id}}
202-
secrets: inherit
203-
storage-tests:
204-
needs: [cluster-create, set-variables, batch-tests, workloads-tests]
205-
uses: ./.github/workflows/reusable_storage_tests.yaml
206-
concurrency: # We support one build or nightly test to run at a time currently.
207-
group: storage-tests-${{needs.set-variables.outputs.run-id}}
208-
cancel-in-progress: true
209-
with:
210-
cluster-name: ${{needs.set-variables.outputs.cluster-name}}
211-
tpu-type: ${{needs.set-variables.outputs.tpu-type}}
212-
zone: ${{needs.set-variables.outputs.zone}}
213-
run-id: ${{needs.set-variables.outputs.run-id}}
214-
secrets: inherit
215-
cluster-delete:
216-
if: always()
217-
needs: [set-variables, storage-tests]
218-
uses: ./.github/workflows/reusable_cluster_delete.yaml
219-
with:
220-
cluster-name-dws: ${{needs.set-variables.outputs.cluster-name-dws}}
221-
cluster-name: ${{needs.set-variables.outputs.cluster-name}}
222-
run-id: ${{needs.set-variables.outputs.run-id}}
223-
zone: ${{needs.set-variables.outputs.zone}}
224-
secrets: inherit

.github/workflows/nightly_tests.yaml

Lines changed: 180 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,12 +15,12 @@
1515
name: Nightly Tests
1616

1717
on:
18-
push:
19-
branches: ["develop"]
2018
workflow_dispatch:
2119
schedule: # Schedule the job run at 12AM PST daily.
2220
- cron: '0 8 * * *'
2321

22+
permissions:
23+
contents: read
2424

2525
env:
2626
CLUSTER_NETWORK_ARGUMENTS: "--network=${{secrets.NETWORK_NAME}} --subnetwork=${{secrets.SUBNETWORK_NAME}}"
@@ -230,3 +230,181 @@ jobs:
230230
- name: Delete the RayCluster-enabled XPK cluster
231231
if: always()
232232
run: python xpk.py cluster delete --cluster $TPU_CLUSTER_NAME --zone=us-central2-b
233+
234+
set-variables:
235+
runs-on: [ubuntu-22.04]
236+
concurrency:
237+
group: set-variables-${{ github.event.number}}
238+
cancel-in-progress: true
239+
outputs:
240+
cluster-name: ${{ steps.set-cluster-name.outputs.cluster-name }}
241+
cluster-name-dws: ${{ steps.set-cluster-name-dws.outputs.cluster-name-dws }}
242+
group-name: ${{ steps.set-group-name.outputs.group-name }}
243+
zone: ${{ steps.set-zone.outputs.zone }}
244+
tpu-type: ${{ steps.set-tpu-type.outputs.tpu-type }}
245+
tpu-type-topology: ${{ steps.set-tpu-type-topology.outputs.tpu-type-topology }}
246+
location: ${{steps.set-location.outputs.location}}
247+
run-id: ${{steps.set-run-id.outputs.run-id}}
248+
steps:
249+
- name: set run-id
250+
id: set-run-id
251+
run: |
252+
if [ "${{ github.event_name }}" == "workflow_dispatch" ]; then
253+
RUN_ID="dispatch"
254+
elif [ "${{ github.ref }}" == "refs/heads/main" ]; then
255+
RUN_ID="main"
256+
elif [ "${{ github.ref }}" == "refs/heads/develop" ]; then
257+
RUN_ID="develop"
258+
else
259+
RUN_ID="${{ github.event.number }}"
260+
fi
261+
echo run-id=$RUN_ID >> $GITHUB_OUTPUT
262+
- name: set cluster-name
263+
id: set-cluster-name
264+
run: |
265+
echo cluster-name=build-xpk-2-nodepools-${{steps.set-run-id.outputs.run-id}} >> $GITHUB_OUTPUT
266+
- name: set cluster-name-dws
267+
id: set-cluster-name-dws
268+
run: |
269+
echo cluster-name-dws=build-xpk-2-nodepools-dws-${{steps.set-run-id.outputs.run-id}} >> $GITHUB_OUTPUT
270+
- name: set group-name
271+
id: set-group-name
272+
run: |
273+
echo group-name=xpk-${{steps.set-run-id.outputs.run-id}} >> $GITHUB_OUTPUT
274+
- name: set zone
275+
id: set-zone
276+
run: |
277+
echo zone=us-central2-b >> $GITHUB_OUTPUT
278+
- name: set tpu-type
279+
id: set-tpu-type
280+
run: |
281+
echo tpu-type=v4-8 >> $GITHUB_OUTPUT
282+
- name: set tpu-type-topology
283+
id: set-tpu-type-topology
284+
run: |
285+
echo tpu-type-topology=v4-2x2x1 >> $GITHUB_OUTPUT
286+
- name: set location
287+
id: set-location
288+
run: |
289+
echo location=us-central2 >> $GITHUB_OUTPUT
290+
install-dependencies:
291+
needs: [set-variables]
292+
runs-on: ubuntu-22.04
293+
strategy:
294+
matrix:
295+
python-version: ["3.10", "3.11"]
296+
steps:
297+
- uses: actions/checkout@v4
298+
- uses: google-github-actions/setup-gcloud@v2
299+
with:
300+
version: '>= 363.0.0'
301+
install_components: 'beta, gke-gcloud-auth-plugin'
302+
- uses: actions/setup-python@v5
303+
with:
304+
python-version: ${{ matrix.python-version }}
305+
- name: Check if cache exists
306+
id: check-cache
307+
uses: actions/cache@v3
308+
with:
309+
path: |
310+
usr/local/bin/
311+
~/.cache/pip
312+
${{env.pythonLocation}}
313+
key: xpk-deps-${{ matrix.python-version }}-${{github.run_id}}-${{github.run_attempt}}
314+
lookup-only: true
315+
- name: install dependencies
316+
if : steps.check-cache.outputs.cache-hit != 'true'
317+
run: make install-dev && cp ./bin/kubectl-kueue /usr/local/bin/kubectl-kueue && cp ./bin/kubectl-kjob /usr/local/bin/kubectl-kjob
318+
- name: Cache dependencies
319+
if : steps.check-cache.outputs.cache-hit != 'true'
320+
uses: actions/cache/save@v3
321+
with:
322+
path: |
323+
/usr/local/bin/kubectl-kueue
324+
/usr/local/bin/kubectl-kjob
325+
~/.cache/pip
326+
${{env.pythonLocation}}
327+
key: xpk-deps-${{ matrix.python-version }}-${{github.run_id}}-${{github.run_attempt}}
328+
run-integration-tests:
329+
needs: [install-dependencies, set-variables]
330+
uses: ./.github/workflows/reusable_integration_tests.yaml
331+
with:
332+
run-id: '${{needs.set-variables.outputs.run-id}}'
333+
concurrency: # We support one build or nightly test to run at a time currently.
334+
group: integration-tests-${{needs.set-variables.outputs.run-id}}
335+
cancel-in-progress: true
336+
secrets: inherit
337+
cluster-private:
338+
needs: [run-integration-tests, set-variables]
339+
uses: ./.github/workflows/reusable_cluster_private.yaml
340+
concurrency: # We support one build or nightly test to run at a time currently.
341+
group: cluster-private-${{needs.set-variables.outputs.run-id}}
342+
cancel-in-progress: true
343+
with:
344+
run-id: '${{needs.set-variables.outputs.run-id}}'
345+
cluster-name: '${{needs.set-variables.outputs.cluster-name}}'
346+
tpu-type: '${{needs.set-variables.outputs.tpu-type-topology || inputs.tpu-type}}'
347+
zone: '${{needs.set-variables.outputs.zone}}'
348+
location: '${{needs.set-variables.outputs.location}}'
349+
secrets: inherit
350+
cluster-create:
351+
needs: [run-integration-tests, set-variables]
352+
concurrency: # We support one build or nightly test to run at a time currently.
353+
group: cluster-create-${{needs.set-variables.outputs.run-id}}
354+
cancel-in-progress: true
355+
uses: ./.github/workflows/reusable_cluster_create.yaml
356+
with:
357+
cluster-name-dws: '${{needs.set-variables.outputs.cluster-name-dws}}'
358+
cluster-name: '${{needs.set-variables.outputs.cluster-name}}'
359+
tpu-type: '${{needs.set-variables.outputs.tpu-type-topology || inputs.tpu-type}}'
360+
zone: '${{needs.set-variables.outputs.zone}}'
361+
location: '${{needs.set-variables.outputs.location}}'
362+
run-id: '${{needs.set-variables.outputs.run-id}}'
363+
secrets: inherit
364+
workloads-tests:
365+
needs: [cluster-create, set-variables]
366+
uses: ./.github/workflows/reusable_workload_tests.yaml
367+
concurrency: # We support one build or nightly test to run at a time currently.
368+
group: workload-tests-${{needs.set-variables.outputs.run-id}}
369+
cancel-in-progress: true
370+
with:
371+
cluster-name: ${{needs.set-variables.outputs.cluster-name}}
372+
cluster-name-dws: '${{needs.set-variables.outputs.cluster-name-dws}}'
373+
tpu-type: ${{needs.set-variables.outputs.tpu-type}}
374+
tpu-type-topology: ${{needs.set-variables.outputs.tpu-type-topology}}
375+
zone: ${{needs.set-variables.outputs.zone}}
376+
run-id: '${{needs.set-variables.outputs.run-id}}'
377+
secrets: inherit
378+
batch-tests:
379+
needs: [cluster-create, set-variables]
380+
uses: ./.github/workflows/reusable_batch_tests.yaml
381+
concurrency: # We support one build or nightly test to run at a time currently.
382+
group: batch-tests-${{needs.set-variables.outputs.run-id}}
383+
cancel-in-progress: true
384+
with:
385+
cluster-name: ${{needs.set-variables.outputs.cluster-name}}
386+
zone: ${{needs.set-variables.outputs.zone}}
387+
run-id: ${{needs.set-variables.outputs.run-id}}
388+
secrets: inherit
389+
storage-tests:
390+
needs: [cluster-create, set-variables, batch-tests, workloads-tests]
391+
uses: ./.github/workflows/reusable_storage_tests.yaml
392+
concurrency: # We support one build or nightly test to run at a time currently.
393+
group: storage-tests-${{needs.set-variables.outputs.run-id}}
394+
cancel-in-progress: true
395+
with:
396+
cluster-name: ${{needs.set-variables.outputs.cluster-name}}
397+
tpu-type: ${{needs.set-variables.outputs.tpu-type}}
398+
zone: ${{needs.set-variables.outputs.zone}}
399+
run-id: ${{needs.set-variables.outputs.run-id}}
400+
secrets: inherit
401+
cluster-delete:
402+
if: always()
403+
needs: [set-variables, storage-tests]
404+
uses: ./.github/workflows/reusable_cluster_delete.yaml
405+
with:
406+
cluster-name-dws: ${{needs.set-variables.outputs.cluster-name-dws}}
407+
cluster-name: ${{needs.set-variables.outputs.cluster-name}}
408+
run-id: ${{needs.set-variables.outputs.run-id}}
409+
zone: ${{needs.set-variables.outputs.zone}}
410+
secrets: inherit

0 commit comments

Comments
 (0)