Skip to content

Commit f309b59

Browse files
committed
add dra job for 90% fill up and 10% churn
1 parent f581a11 commit f309b59

File tree

5 files changed

+281
-0
lines changed

5 files changed

+281
-0
lines changed

clusterloader2/testing/dra/README.md

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
### Usage
2+
3+
In order to test the workload here, use the [Getting Started](../../docs/GETTING_STARTED.md) guide
4+
to set up a kind cluster for the test.
5+
6+
#### Steady State DRA Test
7+
8+
This test scenario first fills the cluster to 90% utilization with long-running pods, then measures the performance of
9+
constantly scheduling short-lived pods at a steady rate.
10+
11+
1. Use the following env variables:
12+
```
13+
export CL2_MODE=Indexed
14+
export CL2_NODES_PER_NAMESPACE=1
15+
export CL2_LOAD_TEST_THROUGHPUT=20 # Fast initial fill
16+
export CL2_STEADY_STATE_QPS=5 # Controlled rate for measurement
17+
export CL2_JOB_RUNNING_TIME=30s # Short-lived pods runtime
18+
export CL2_LONG_JOB_RUNNING_TIME=1h # Long-running pods runtime (for cluster fill)
19+
export CL2_GPUS_PER_NODE=8 # GPUs per node
20+
export CL2_FILL_PERCENTAGE=90 # Cluster fill percentage
21+
```
22+
23+
2. Run the test with:
24+
```
25+
./run-e2e.sh cluster-loader2 \
26+
--provider=kind \
27+
--kubeconfig=/root/.kube/config \
28+
--report-dir=/tmp/clusterloader2-results \
29+
--testconfig=testing/dra/config.yaml \
30+
--nodes=5
31+
```
32+
33+
This test will:
34+
1. Create ResourceClaimTemplates in each namespace
35+
2. Fill the cluster to 90% utilization with long-running pods (each using 1 GPU)
36+
3. Measure performance while continuously creating short-lived pods at a steady rate
37+
4. Collect metrics on pod startup latency, job lifecycle latency, and scheduler metrics
Lines changed: 174 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,174 @@
# ---------------------------------------------------------------------------
# Template parameters. Each DefaultParam call names the CL2_* override it
# reads and the fallback value used for it.
# ---------------------------------------------------------------------------
{{$MODE := DefaultParam .CL2_MODE "Indexed"}}
# MinInt keeps the per-namespace node count from exceeding the cluster size.
{{$NODES_PER_NAMESPACE := MinInt .Nodes (DefaultParam .CL2_NODES_PER_NAMESPACE 100)}}
# Creation rate for the initial fill (FastFill tuning set).
{{$LOAD_TEST_THROUGHPUT := DefaultParam .CL2_LOAD_TEST_THROUGHPUT 10}}
# Creation rate for the churn jobs (SteadyState tuning set).
{{$STEADY_STATE_QPS := DefaultParam .CL2_STEADY_STATE_QPS 5}}
# Passed to the SchedulingMetrics measurements as their token parameter.
{{$token := .CL2_TOKEN }}

# Namespace count: cluster nodes divided by nodes per namespace.
{{$namespaces := DivideInt .Nodes $NODES_PER_NAMESPACE}}

# Node resource configuration
{{$gpusPerNode := DefaultParam .CL2_GPUS_PER_NODE 8}}
{{$totalGPUs := MultiplyInt $gpusPerNode .Nodes}}

# Fast fill job configuration - for the initial fill up.
# fillPodsCount is fillPercentage percent of all GPUs; it is then spread
# evenly over the namespaces.
{{$fillPercentage := DefaultParam .CL2_FILL_PERCENTAGE 90}}
{{$fillPodsCount := DivideInt (MultiplyInt $totalGPUs $fillPercentage) 100}}
{{$fillPodsPerNamespace := DivideInt $fillPodsCount $namespaces}}
# Passed as Replicas to the long-running job template.
{{$longJobSize := 1}}
{{$longJobRunningTime := DefaultParam .CL2_LONG_JOB_RUNNING_TIME "1h"}}

# Churn job configuration for steady state.
# The churn pool is whatever the fill jobs leave free: total GPUs minus the
# fill pods actually requested (per-namespace count times namespaces).
{{$smallJobPodsCount := SubtractInt $totalGPUs (MultiplyInt $fillPodsPerNamespace $namespaces)}}
{{$smallJobsPerNamespace := DivideInt $smallJobPodsCount $namespaces}}
# Passed as Replicas (parallelism) and CompletionReplicas (completions)
# to the short-lived job template.
{{$smallJobSize := 1}}
{{$smallJobCompletions := 10}}
{{$jobRunningTime := DefaultParam .CL2_JOB_RUNNING_TIME "30s"}}
26+
# Test name.
name: dra-steady-state

# Number of namespaces the test objects are spread over.
namespace:
  number: {{$namespaces}}

# Two creation rates: FastFill drives the ResourceClaimTemplate and fill-job
# phases, SteadyState drives the short-lived churn jobs.
tuningSets:
- name: FastFill
  qpsLoad:
    qps: {{$LOAD_TEST_THROUGHPUT}}
- name: SteadyState
  qpsLoad:
    qps: {{$STEADY_STATE_QPS}}
39+
# ---------------------------------------------------------------------------
# Fill phase: start measurements, create the claim templates, create the
# long-running jobs, wait for their pods, then gather the fill metrics.
# ---------------------------------------------------------------------------
steps:
# Start every measurement that must observe objects from creation onwards.
- name: Start measurements
  measurements:
  # Tracks the short-lived jobs; gathered after the churn phase.
  - Identifier: WaitForFinishedJobs
    Method: WaitForFinishedJobs
    Params:
      action: start
      labelSelector: job-type = short-lived
  # Tracks pods owned by the long-running fill jobs.
  - Identifier: WaitForControlledPodsRunning
    Method: WaitForControlledPodsRunning
    Params:
      action: start
      apiVersion: batch/v1
      kind: Job
      labelSelector: job-type = long-running
      operationTimeout: 120s
  - Identifier: FastFillSchedulingMetrics
    Method: SchedulingMetrics
    Params:
      action: start
      token: {{ $token }}
      endpoint: "localhost:10259"
  - Identifier: FastFillPodStartupLatency
    Method: PodStartupLatency
    Params:
      action: start
      labelSelector: job-type = long-running
      threshold: 20s
# Reset is issued in its own step, after the start step above.
- name: Clearing SchedulingMetrics
  measurements:
  - Identifier: FastFillSchedulingMetrics
    Method: SchedulingMetrics
    Params:
      action: reset
      token: {{ $token }}
      endpoint: "localhost:10259"
# One ResourceClaimTemplate per namespace; the job templates reference it
# by the name single-gpu-0.
- name: Create ResourceClaimTemplates in namespaces
  phases:
  - namespaceRange:
      min: 1
      max: {{$namespaces}}
    replicasPerNamespace: 1
    tuningSet: FastFill
    objectBundle:
    - basename: single-gpu
      objectTemplatePath: "resourceclaimtemplate.yaml"
# Long-running jobs that occupy the configured share of the GPUs.
- name: Fill cluster to {{$fillPercentage}}% utilization
  phases:
  - namespaceRange:
      min: 1
      max: {{$namespaces}}
    replicasPerNamespace: {{$fillPodsPerNamespace}}
    tuningSet: FastFill
    objectBundle:
    - basename: long-running
      objectTemplatePath: "long-running-job.yaml"
      templateFillMap:
        Replicas: {{$longJobSize}}
        Mode: {{$MODE}}
        Sleep: {{$longJobRunningTime}}
- name: Wait for fill pods to be running
  measurements:
  - Identifier: WaitForControlledPodsRunning
    Method: WaitForControlledPodsRunning
    Params:
      action: gather
      labelSelector: job-type = long-running
      # NOTE(review): the start action above sets operationTimeout (120s);
      # confirm that this measurement honours a `timeout` key on gather.
      timeout: 15m
- name: Gather measurements for long running pods
  measurements:
  - Identifier: FastFillSchedulingMetrics
    Method: SchedulingMetrics
    Params:
      action: gather
      token: {{ $token }}
      endpoint: "localhost:10259"
  - Identifier: FastFillPodStartupLatency
    Method: PodStartupLatency
    Params:
      action: gather
# ---------------------------------------------------------------------------
# Churn phase: start a second set of measurements, create the short-lived
# jobs at the SteadyState rate, wait for them, then gather.
# ---------------------------------------------------------------------------
# NOTE(review): start and reset of ChurnSchedulingMetrics share one step here,
# whereas the fill phase issues them in two separate steps - confirm the
# intended ordering is guaranteed within a single step.
- name: reset metrics for steady state churn
  measurements:
  - Identifier: ChurnSchedulingMetrics
    Method: SchedulingMetrics
    Params:
      action: start
      token: {{ $token }}
      endpoint: "localhost:10259"
  - Identifier: ChurnSchedulingMetrics
    Method: SchedulingMetrics
    Params:
      action: reset
      token: {{ $token }}
      endpoint: "localhost:10259"
  - Identifier: ChurnPodStartupLatency
    Method: PodStartupLatency
    Params:
      action: start
      labelSelector: job-type = short-lived
      threshold: 20s
# Short-lived jobs sized to the GPUs the fill jobs left free.
- name: Create steady state {{$MODE}} jobs
  phases:
  - namespaceRange:
      min: 1
      max: {{$namespaces}}
    replicasPerNamespace: {{$smallJobsPerNamespace}}
    tuningSet: SteadyState
    objectBundle:
    - basename: small
      objectTemplatePath: "job.yaml"
      templateFillMap:
        Replicas: {{$smallJobSize}}
        CompletionReplicas: {{$smallJobCompletions}}
        Mode: {{$MODE}}
        Sleep: {{$jobRunningTime}}
# Completes the WaitForFinishedJobs measurement started in the first step.
- name: Wait for short-lived jobs to finish
  measurements:
  - Identifier: WaitForFinishedJobs
    Method: WaitForFinishedJobs
    Params:
      action: gather
      labelSelector: job-type = short-lived
      timeout: 15m
- name: Measure scheduler metrics
  measurements:
  - Identifier: ChurnSchedulingMetrics
    Method: SchedulingMetrics
    Params:
      action: gather
      token: {{ $token }}
      endpoint: "localhost:10259"
  - Identifier: ChurnPodStartupLatency
    Method: PodStartupLatency
    Params:
      action: gather

clusterloader2/testing/dra/job.yaml

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
# Short-lived churn Job. Template inputs: Name, Replicas, CompletionReplicas,
# Mode, Sleep.
apiVersion: batch/v1
kind: Job
metadata:
  name: {{.Name}}
  labels:
    group: test-job
    # Label the short-lived selectors in config.yaml match on.
    job-type: short-lived
spec:
  parallelism: {{.Replicas}}
  completions: {{.CompletionReplicas}}
  completionMode: {{.Mode}}
  ttlSecondsAfterFinished: 300
  template:
    metadata:
      labels:
        group: test-pod
        job-type: short-lived
    spec:
      restartPolicy: Never
      containers:
      - name: {{.Name}}
        image: gcr.io/k8s-staging-perf-tests/sleep:v0.0.3
        # Sleep is the only argument handed to the container.
        args:
        - {{.Sleep}}
        resources:
          # Refers to the pod-level resourceClaims entry named gpu below.
          claims:
          - name: gpu
      resourceClaims:
      - name: gpu
        resourceClaimTemplateName: single-gpu-0
Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
# Long-running fill Job. Template inputs: Name, Replicas, Mode, Sleep.
apiVersion: batch/v1
kind: Job
metadata:
  name: {{.Name}}
  labels:
    group: test-job
    # Label the long-running selectors in config.yaml match on.
    job-type: long-running
spec:
  # Parallelism and completions are both set from Replicas.
  parallelism: {{.Replicas}}
  completions: {{.Replicas}}
  completionMode: {{.Mode}}
  activeDeadlineSeconds: 86400 # 24 hours
  template:
    metadata:
      labels:
        group: test-pod
        job-type: long-running
    spec:
      restartPolicy: Never
      containers:
      - name: {{.Name}}
        image: gcr.io/k8s-staging-perf-tests/sleep:v0.0.3
        # Sleep is the only argument handed to the container.
        args:
        - {{.Sleep}}
        resources:
          # Refers to the pod-level resourceClaims entry named gpu below.
          claims:
          - name: gpu
      resourceClaims:
      - name: gpu
        resourceClaimTemplateName: single-gpu-0
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
# ResourceClaimTemplate with a single device request named gpu against the
# gpu.example.com device class. Template input: Name.
apiVersion: resource.k8s.io/v1beta1
kind: ResourceClaimTemplate
metadata:
  name: {{.Name}}
spec:
  # The outer spec belongs to the template; the inner spec is the claim spec.
  spec:
    devices:
      requests:
      - name: gpu
        deviceClassName: gpu.example.com

0 commit comments

Comments
 (0)