Skip to content

Commit 9c37889

Browse files
authored
add 1K, 5K and 10K RayCluster/RayJob scalability tests (#2218)
Signed-off-by: Andrew Sy Kim <[email protected]>
1 parent f69885b commit 9c37889

29 files changed

+1125
-32
lines changed

benchmark/perf-tests/100-raycluster/config.yaml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,8 @@ steps:
4242
timeout: 30m
4343
command:
4444
- "bash"
45-
- "100-raycluster/wait-for-rayclusters.sh"
45+
- "common/wait-for-rayclusters.sh"
46+
- "100"
4647
- name: Wait for pods to be running
4748
measurements:
4849
- Identifier: WaitForControlledPodsRunning

benchmark/perf-tests/100-rayjob/config.yaml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,8 @@ steps:
6060
timeout: 30m
6161
command:
6262
- "bash"
63-
- "100-rayjob/wait-for-rayjobs.sh"
63+
- "common/wait-for-rayjobs.sh"
64+
- "100"
6465
- name: Wait for pods to be running
6566
measurements:
6667
- Identifier: WaitForControlledPodsRunning

benchmark/perf-tests/100-rayjob/wait-for-rayjobs.sh

Lines changed: 0 additions & 28 deletions
This file was deleted.
Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
name: kuberay
2+
namespace:
3+
number: 100
4+
tuningSets:
5+
- name: Uniform100qps
6+
qpsLoad:
7+
qps: 100
8+
steps:
9+
- name: Start measurements
10+
measurements:
11+
- Identifier: PodStartupLatency
12+
Method: PodStartupLatency
13+
Params:
14+
action: start
15+
labelSelector: app.kubernetes.io/created-by = kuberay-operator
16+
threshold: 30m
17+
- Identifier: WaitForControlledPodsRunning
18+
Method: WaitForControlledPodsRunning
19+
Params:
20+
action: start
21+
apiVersion: ray.io/v1
22+
kind: RayCluster
23+
labelSelector: app.kubernetes.io/created-by = kuberay-operator
24+
operationTimeout: 120s
25+
- name: Preload Images
26+
measurements:
27+
- Identifier: PreloadImages
28+
Method: Exec
29+
Params:
30+
timeout: 30m
31+
command:
32+
- "bash"
33+
- "common/preload-image.sh"
34+
- name: Creating Ray clusters
35+
phases:
36+
- namespaceRange:
37+
min: 1
38+
max: 100
39+
replicasPerNamespace: 10
40+
tuningSet: Uniform100qps
41+
objectBundle:
42+
- basename: raycluster
43+
objectTemplatePath: raycluster.yaml
44+
templateFillMap:
45+
Replicas: 3
46+
Image: "rayproject/ray:2.9.3"
47+
- name: Wait for RayClusters ready
48+
measurements:
49+
- Identifier: WaitForRayCluster
50+
Method: Exec
51+
Params:
52+
timeout: 30m
53+
command:
54+
- "bash"
55+
- "common/wait-for-rayclusters.sh"
56+
- "1000"
57+
- name: Measure wait for pods to be running
58+
measurements:
59+
- Identifier: WaitForControlledPodsRunning
60+
Method: WaitForControlledPodsRunning
61+
Params:
62+
action: gather
63+
- name: Measure pod startup latency
64+
measurements:
65+
- Identifier: PodStartupLatency
66+
Method: PodStartupLatency
67+
Params:
68+
action: gather
Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
apiVersion: ray.io/v1
2+
kind: RayCluster
3+
metadata:
4+
name: {{.Name}}
5+
labels:
6+
perf-test: ray-cluster
7+
spec:
8+
rayVersion: '2.9.3'
9+
headGroupSpec:
10+
serviceType: ClusterIP
11+
rayStartParams:
12+
dashboard-host: '0.0.0.0'
13+
disable-usage-stats: 'true'
14+
template:
15+
spec:
16+
containers:
17+
- name: ray-head
18+
image: {{.Image}}
19+
ports:
20+
- containerPort: 6379
21+
name: gcs
22+
- containerPort: 8265
23+
name: dashboard
24+
- containerPort: 10001
25+
name: client
26+
resources:
27+
limits:
28+
cpu: "1"
29+
requests:
30+
cpu: "10m"
31+
volumes:
32+
- name: ray-logs
33+
emptyDir: {}
34+
workerGroupSpecs:
35+
- replicas: {{.Replicas}}
36+
minReplicas: 1
37+
maxReplicas: 10
38+
# logical group name (here "small-group"); it may also describe the group's function
39+
groupName: small-group
40+
rayStartParams: {}
41+
template:
42+
spec:
43+
containers:
44+
- name: ray-worker
45+
image: {{.Image}}
46+
resources:
47+
limits:
48+
cpu: "1"
49+
requests:
50+
cpu: "10m"
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
<testsuite name="ClusterLoaderV2" tests="0" failures="0" errors="0" time="1644.719">
2+
<testcase name="kuberay overall (1000-raycluster/config.yaml)" classname="ClusterLoaderV2" time="647.5399098"/>
3+
<testcase name="kuberay: [step: 01] Start measurements [00] - PodStartupLatency" classname="ClusterLoaderV2" time="0.105058303"/>
4+
<testcase name="kuberay: [step: 01] Start measurements [01] - WaitForControlledPodsRunning" classname="ClusterLoaderV2" time="1.006024017"/>
5+
<testcase name="kuberay: [step: 02] Preload Images [00] - PreloadImages" classname="ClusterLoaderV2" time="309.500836837"/>
6+
<testcase name="kuberay: [step: 03] Creating Ray clusters" classname="ClusterLoaderV2" time="10.622250764"/>
7+
<testcase name="kuberay: [step: 04] Wait for RayClusters ready [00] - WaitForRayCluster" classname="ClusterLoaderV2" time="258.283033377"/>
8+
<testcase name="kuberay: [step: 05] Wait for pods to be running [00] - WaitForControlledPodsRunning" classname="ClusterLoaderV2" time="5.491021323"/>
9+
<testcase name="kuberay: [step: 06] Measure pod startup latency [00] - PodStartupLatency" classname="ClusterLoaderV2" time="1.513658548"/>
10+
</testsuite>
Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,83 @@
1+
name: kuberay
2+
namespace:
3+
number: 100
4+
tuningSets:
5+
- name: Uniform100qps
6+
qpsLoad:
7+
qps: 100
8+
steps:
9+
- name: Start measurements
10+
measurements:
11+
- Identifier: PodStartupLatency
12+
Method: PodStartupLatency
13+
Params:
14+
action: start
15+
labelSelector: app.kubernetes.io/created-by = kuberay-operator
16+
threshold: 30m
17+
- Identifier: WaitForControlledPodsRunning
18+
Method: WaitForControlledPodsRunning
19+
Params:
20+
action: start
21+
apiVersion: ray.io/v1
22+
kind: RayCluster
23+
labelSelector: app.kubernetes.io/created-by = kuberay-operator
24+
operationTimeout: 120s
25+
- Identifier: JobLifecycleLatency
26+
Method: JobLifecycleLatency
27+
Params:
28+
action: start
29+
labelSelector: app.kubernetes.io/created-by = kuberay-operator
30+
threshold: 10m
31+
- name: Creating RayJobs for PyTorch MNIST fine-tuning
32+
phases:
33+
- namespaceRange:
34+
min: 1
35+
max: 100
36+
replicasPerNamespace: 5
37+
tuningSet: Uniform100qps
38+
objectBundle:
39+
- basename: pytorch-mnist
40+
objectTemplatePath: pytorch-mnist-rayjob.yaml
41+
templateFillMap:
42+
Image: "rayproject/ray:2.9.3"
43+
- name: Creating RayJobs for Ray Data Image Resizing
44+
phases:
45+
- namespaceRange:
46+
min: 1
47+
max: 100
48+
replicasPerNamespace: 5
49+
tuningSet: Uniform100qps
50+
objectBundle:
51+
- basename: ray-data-image-resize
52+
objectTemplatePath: ray-data-image-resize.yaml
53+
templateFillMap:
54+
Image: "rayproject/ray:2.9.3"
55+
- name: Wait for RayJobs complete
56+
measurements:
57+
- Identifier: WaitForRayJob
58+
Method: Exec
59+
Params:
60+
timeout: 30m
61+
command:
62+
- "bash"
63+
- "common/wait-for-rayjobs.sh"
64+
- "500" # 500 per RayJob type; 1000 RayJobs in total, since we deploy two RayJob types with 500 instances each
65+
- name: Measure wait for pods to be running
66+
measurements:
67+
- Identifier: WaitForControlledPodsRunning
68+
Method: WaitForControlledPodsRunning
69+
Params:
70+
action: gather
71+
operationTimeout: 10m
72+
- name: Measure pod startup latency
73+
measurements:
74+
- Identifier: PodStartupLatency
75+
Method: PodStartupLatency
76+
Params:
77+
action: gather
78+
- name: Measure job finished
79+
measurements:
80+
- Identifier: JobLifecycleLatency
81+
Method: JobLifecycleLatency
82+
Params:
83+
action: gather
Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
apiVersion: ray.io/v1
2+
kind: RayJob
3+
metadata:
4+
name: {{.Name}}
5+
labels:
6+
perf-test: rayjob-pytorch-mnist
7+
spec:
8+
shutdownAfterJobFinishes: true
9+
entrypoint: python ray_train_pytorch_mnist.py
10+
submitterPodTemplate:
11+
spec:
12+
restartPolicy: Never
13+
containers:
14+
- name: submitter-job
15+
image: {{.Image}}
16+
command:
17+
- "sh"
18+
- "-c"
19+
args:
20+
- |
21+
#!/bin/sh
22+
23+
ray job logs $RAY_JOB_SUBMISSION_ID --address=http://$RAY_DASHBOARD_ADDRESS --follow || \
24+
ray job submit --address=http://$RAY_DASHBOARD_ADDRESS --submission-id=$RAY_JOB_SUBMISSION_ID --runtime-env-json '{"env_vars":{"NUM_WORKERS":"2","CPUS_PER_WORKER":"1","OMP_NUM_THREADS":"1"}}' -- python ray_train_pytorch_mnist.py
25+
resources:
26+
requests:
27+
cpu: "10m"
28+
rayClusterSpec:
29+
rayVersion: '2.9.3'
30+
headGroupSpec:
31+
rayStartParams:
32+
disable-usage-stats: 'true'
33+
template:
34+
spec:
35+
containers:
36+
- name: ray-head
37+
image: {{.Image}}
38+
ports:
39+
- containerPort: 6379
40+
name: gcs-server
41+
- containerPort: 8265
42+
name: dashboard
43+
- containerPort: 10001
44+
name: client
45+
resources:
46+
requests:
47+
cpu: "100m"
48+
memory: "4Gi"
49+
workerGroupSpecs:
50+
- replicas: 2
51+
minReplicas: 1
52+
maxReplicas: 5
53+
groupName: worker-group
54+
rayStartParams: {}
55+
template:
56+
spec:
57+
containers:
58+
- name: ray-worker
59+
image: {{.Image}}
60+
resources:
61+
requests:
62+
cpu: "100m"
63+
memory: "4Gi"
Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
apiVersion: ray.io/v1
2+
kind: RayJob
3+
metadata:
4+
name: {{.Name}}
5+
labels:
6+
perf-test: ray-data-image-resize
7+
spec:
8+
shutdownAfterJobFinishes: true
9+
entrypoint: python ray_data_image_resize.py
10+
submitterPodTemplate:
11+
spec:
12+
restartPolicy: Never
13+
containers:
14+
- name: submitter-job
15+
image: {{.Image}}
16+
command:
17+
- "sh"
18+
- "-c"
19+
args:
20+
- |
21+
#!/bin/sh
22+
23+
ray job logs $RAY_JOB_SUBMISSION_ID --address=http://$RAY_DASHBOARD_ADDRESS --follow || \
24+
ray job submit --address=http://$RAY_DASHBOARD_ADDRESS --submission-id=$RAY_JOB_SUBMISSION_ID --runtime-env-json '{"env_vars":{"BUCKET_NAME":"ray-images","BUCKET_PREFIX":"images"}}' -- python ray_data_image_resize.py
25+
resources:
26+
requests:
27+
cpu: "10m"
28+
rayClusterSpec:
29+
rayVersion: '2.9.3'
30+
headGroupSpec:
31+
rayStartParams:
32+
disable-usage-stats: 'true'
33+
template:
34+
spec:
35+
containers:
36+
- name: ray-head
37+
image: {{.Image}}
38+
ports:
39+
- containerPort: 6379
40+
name: gcs-server
41+
- containerPort: 8265
42+
name: dashboard
43+
- containerPort: 10001
44+
name: client
45+
resources:
46+
requests:
47+
cpu: "100m"
48+
memory: "2Gi"
49+
workerGroupSpecs:
50+
- replicas: 2
51+
minReplicas: 1
52+
maxReplicas: 5
53+
groupName: worker-group
54+
rayStartParams: {}
55+
template:
56+
spec:
57+
containers:
58+
- name: ray-worker
59+
image: {{.Image}}
60+
resources:
61+
requests:
62+
cpu: "100m"
63+
memory: "2Gi"

0 commit comments

Comments
 (0)