# ClusterLoader2 test config: dra-steady-state
#
# Flow of the test, as defined by the steps below:
#   1. Start the job/pod waiters and the fast-fill scheduler and pod-startup
#      measurements.
#   2. Create one ResourceClaimTemplate per namespace.
#   3. Fill the cluster with long-running single-pod jobs up to
#      CL2_FILL_PERCENTAGE of the total GPU count and wait for them to run.
#   4. Gather the fast-fill measurements, then start a second set of
#      measurements for the churn part.
#   5. Create short-lived jobs in the remaining capacity, wait for them to
#      finish, and gather the churn measurements.
#
# Overridable parameters (CL2_* overrides, defaults in parentheses):
#   CL2_MODE (Indexed), CL2_NODES_PER_NAMESPACE (100),
#   CL2_LOAD_TEST_THROUGHPUT (10), CL2_STEADY_STATE_QPS (5),
#   CL2_GPUS_PER_NODE (8), CL2_FILL_PERCENTAGE (90),
#   CL2_LONG_JOB_RUNNING_TIME (1h), CL2_JOB_RUNNING_TIME (30s).
# CL2_TOKEN has no default and is passed to every SchedulingMetrics call.

{{$MODE := DefaultParam .CL2_MODE "Indexed"}}
{{$NODES_PER_NAMESPACE := MinInt .Nodes (DefaultParam .CL2_NODES_PER_NAMESPACE 100)}}
{{$LOAD_TEST_THROUGHPUT := DefaultParam .CL2_LOAD_TEST_THROUGHPUT 10}}
{{$STEADY_STATE_QPS := DefaultParam .CL2_STEADY_STATE_QPS 5}}
{{$token := .CL2_TOKEN}}

# NODES_PER_NAMESPACE is capped at the node count above, so this is at least 1
# for any cluster with one or more nodes.
# NOTE(review): a zero-node cluster would divide by zero here - confirm that
# case cannot occur.
{{$namespaces := DivideInt .Nodes $NODES_PER_NAMESPACE}}

# Node resource configuration
{{$gpusPerNode := DefaultParam .CL2_GPUS_PER_NODE 8}}
{{$totalGPUs := MultiplyInt $gpusPerNode .Nodes}}

# Fast-fill job configuration, used for the initial fill-up.
# Every fill job has a single pod (longJobSize is 1), so the number of jobs
# equals the number of pods.
{{$fillPercentage := DefaultParam .CL2_FILL_PERCENTAGE 90}}
{{$fillPodsCount := DivideInt (MultiplyInt $totalGPUs $fillPercentage) 100}}
{{$fillPodsPerNamespace := DivideInt $fillPodsCount $namespaces}}
{{$longJobSize := 1}}
{{$longJobRunningTime := DefaultParam .CL2_LONG_JOB_RUNNING_TIME "1h"}}

# Churn job configuration for the steady state.
# The churn pod budget is whatever is left of totalGPUs after the fill jobs
# that are actually created (per-namespace count times namespace count).
{{$smallJobPodsCount := SubtractInt $totalGPUs (MultiplyInt $fillPodsPerNamespace $namespaces)}}
{{$smallJobsPerNamespace := DivideInt $smallJobPodsCount $namespaces}}
{{$smallJobSize := 1}}
{{$smallJobCompletions := 10}}
{{$jobRunningTime := DefaultParam .CL2_JOB_RUNNING_TIME "30s"}}

name: dra-steady-state

namespace:
  number: {{$namespaces}}

tuningSets:
- name: FastFill
  qpsLoad:
    qps: {{$LOAD_TEST_THROUGHPUT}}
- name: SteadyState
  qpsLoad:
    qps: {{$STEADY_STATE_QPS}}

steps:
- name: Start measurements
  measurements:
  - Identifier: WaitForFinishedJobs
    Method: WaitForFinishedJobs
    Params:
      action: start
      labelSelector: job-type = short-lived
  - Identifier: WaitForControlledPodsRunning
    Method: WaitForControlledPodsRunning
    Params:
      action: start
      apiVersion: batch/v1
      kind: Job
      labelSelector: job-type = long-running
      operationTimeout: 120s
  - Identifier: FastFillSchedulingMetrics
    Method: SchedulingMetrics
    Params:
      action: start
      token: {{$token}}
      endpoint: "localhost:10259"
  - Identifier: FastFillPodStartupLatency
    Method: PodStartupLatency
    Params:
      action: start
      labelSelector: job-type = long-running
      threshold: 20s
# The reset is a separate step so that it runs after the start above.
- name: Clearing SchedulingMetrics
  measurements:
  - Identifier: FastFillSchedulingMetrics
    Method: SchedulingMetrics
    Params:
      action: reset
      token: {{$token}}
      endpoint: "localhost:10259"
- name: Create ResourceClaimTemplates in namespaces
  phases:
  - namespaceRange:
      min: 1
      max: {{$namespaces}}
    replicasPerNamespace: 1
    tuningSet: FastFill
    objectBundle:
    - basename: single-gpu
      objectTemplatePath: "resourceclaimtemplate.yaml"
- name: Fill cluster to {{$fillPercentage}}% utilization
  phases:
  - namespaceRange:
      min: 1
      max: {{$namespaces}}
    replicasPerNamespace: {{$fillPodsPerNamespace}}
    tuningSet: FastFill
    objectBundle:
    - basename: long-running
      objectTemplatePath: "long-running-job.yaml"
      templateFillMap:
        Replicas: {{$longJobSize}}
        Mode: {{$MODE}}
        Sleep: {{$longJobRunningTime}}
- name: Wait for fill pods to be running
  measurements:
  - Identifier: WaitForControlledPodsRunning
    Method: WaitForControlledPodsRunning
    Params:
      action: gather
      labelSelector: job-type = long-running
      timeout: 15m
- name: Gather measurements for long running pods
  measurements:
  - Identifier: FastFillSchedulingMetrics
    Method: SchedulingMetrics
    Params:
      action: gather
      token: {{$token}}
      endpoint: "localhost:10259"
  - Identifier: FastFillPodStartupLatency
    Method: PodStartupLatency
    Params:
      action: gather
# Start and reset of ChurnSchedulingMetrics are kept in two consecutive steps,
# mirroring the FastFillSchedulingMetrics start/reset pair above, so the two
# actions on the same identifier run in a fixed order instead of sharing one
# step.
# NOTE(review): start-then-reset follows the order used for fast fill; confirm
# this is the intended order for the SchedulingMetrics measurement.
- name: Start measurements for steady state churn
  measurements:
  - Identifier: ChurnSchedulingMetrics
    Method: SchedulingMetrics
    Params:
      action: start
      token: {{$token}}
      endpoint: "localhost:10259"
  - Identifier: ChurnPodStartupLatency
    Method: PodStartupLatency
    Params:
      action: start
      labelSelector: job-type = short-lived
      threshold: 20s
- name: reset metrics for steady state churn
  measurements:
  - Identifier: ChurnSchedulingMetrics
    Method: SchedulingMetrics
    Params:
      action: reset
      token: {{$token}}
      endpoint: "localhost:10259"
- name: Create steady state {{$MODE}} jobs
  phases:
  - namespaceRange:
      min: 1
      max: {{$namespaces}}
    replicasPerNamespace: {{$smallJobsPerNamespace}}
    tuningSet: SteadyState
    objectBundle:
    - basename: small
      objectTemplatePath: "job.yaml"
      templateFillMap:
        Replicas: {{$smallJobSize}}
        CompletionReplicas: {{$smallJobCompletions}}
        Mode: {{$MODE}}
        Sleep: {{$jobRunningTime}}
- name: Wait for short-lived jobs to finish
  measurements:
  - Identifier: WaitForFinishedJobs
    Method: WaitForFinishedJobs
    Params:
      action: gather
      labelSelector: job-type = short-lived
      timeout: 15m
- name: Measure scheduler metrics
  measurements:
  - Identifier: ChurnSchedulingMetrics
    Method: SchedulingMetrics
    Params:
      action: gather
      token: {{$token}}
      endpoint: "localhost:10259"
  - Identifier: ChurnPodStartupLatency
    Method: PodStartupLatency
    Params:
      action: gather