Skip to content

Commit 19c924b

Browse files
authored
Merge pull request #3368 from alaypatel07/dra-100-node-test
add installing dra driver as dependency and create 100 node test for DRA
2 parents f581a11 + dda44b5 commit 19c924b

14 files changed

+586
-0
lines changed

clusterloader2/cmd/clusterloader.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@ import (
4343
"k8s.io/perf-tests/clusterloader2/pkg/test"
4444
"k8s.io/perf-tests/clusterloader2/pkg/util"
4545

46+
_ "k8s.io/perf-tests/clusterloader2/pkg/dependency/dra"
4647
_ "k8s.io/perf-tests/clusterloader2/pkg/measurement/common"
4748
_ "k8s.io/perf-tests/clusterloader2/pkg/measurement/common/bundle"
4849
_ "k8s.io/perf-tests/clusterloader2/pkg/measurement/common/dns"
Lines changed: 132 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,132 @@
1+
/*
2+
Copyright 2025 The Kubernetes Authors.
3+
4+
Licensed under the Apache License, Version 2.0 (the "License");
5+
you may not use this file except in compliance with the License.
6+
You may obtain a copy of the License at
7+
8+
http://www.apache.org/licenses/LICENSE-2.0
9+
10+
Unless required by applicable law or agreed to in writing, software
11+
distributed under the License is distributed on an "AS IS" BASIS,
12+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
See the License for the specific language governing permissions and
14+
limitations under the License.
15+
*/
16+
17+
package dra
18+
19+
import (
20+
"context"
21+
"embed"
22+
"fmt"
23+
"time"
24+
25+
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
26+
"k8s.io/apimachinery/pkg/util/wait"
27+
"k8s.io/klog/v2"
28+
"k8s.io/perf-tests/clusterloader2/pkg/dependency"
29+
"k8s.io/perf-tests/clusterloader2/pkg/framework/client"
30+
"k8s.io/perf-tests/clusterloader2/pkg/util"
31+
)
32+
33+
const (
34+
draDependencyName = "DRATestDriver"
35+
// TODO: convert this into a parameter. We will not need this until the partitionable devices test.
36+
draNamespace = "dra-example-driver"
37+
draDaemonsetName = "dra-example-driver-kubeletplugin"
38+
checkDRAReadyInterval = 30 * time.Second
39+
defaultDRATimeout = 10 * time.Minute
40+
)
41+
42+
//go:embed manifests/*.yaml
43+
var manifestsFS embed.FS
44+
45+
func init() {
46+
if err := dependency.Register(draDependencyName, createDRADependency); err != nil {
47+
klog.Fatalf("Cannot register %s: %v", draDependencyName, err)
48+
}
49+
}
50+
51+
func createDRADependency() dependency.Dependency {
52+
return &draDependency{}
53+
}
54+
55+
// draDependency is the dependency registered under draDependencyName. It holds
// no state; its Setup installs the DRA example driver and its Teardown deletes
// the driver namespace.
type draDependency struct{}
56+
57+
func (d *draDependency) Setup(config *dependency.Config) error {
58+
klog.V(2).Infof("%s: Installing DRA example driver", d)
59+
if err := client.CreateNamespace(config.ClusterFramework.GetClientSets().GetClient(), draNamespace); err != nil {
60+
return fmt.Errorf("namespace %s creation error: %v", draNamespace, err)
61+
}
62+
63+
mapping := map[string]interface{}{
64+
"Namespace": draNamespace,
65+
}
66+
if err := config.ClusterFramework.ApplyTemplatedManifests(
67+
manifestsFS,
68+
"manifests/*.yaml",
69+
mapping,
70+
client.Retry(client.IsRetryableAPIError),
71+
); err != nil {
72+
return fmt.Errorf("applying DRA manifests error: %v", err)
73+
}
74+
timeout, err := util.GetDurationOrDefault(config.Params, "timeout", defaultDRATimeout)
75+
if err != nil {
76+
return err
77+
}
78+
klog.V(2).Infof("%s: checking if DRA driver %s is healthy", d, draDaemonsetName)
79+
if err := d.waitForDRADriverToBeHealthy(config, timeout); err != nil {
80+
return err
81+
}
82+
83+
klog.V(2).Infof("%s: DRA example driver installed successfully", d)
84+
return nil
85+
}
86+
87+
func (d *draDependency) Teardown(config *dependency.Config) error {
88+
klog.V(2).Infof("%s: Tearing down DRA example driver", d)
89+
90+
// Delete namespace (this will delete all resources in it)
91+
if err := client.DeleteNamespace(config.ClusterFramework.GetClientSets().GetClient(), draNamespace); err != nil {
92+
return fmt.Errorf("deleting %s namespace error: %v", draNamespace, err)
93+
}
94+
95+
if err := client.WaitForDeleteNamespace(config.ClusterFramework.GetClientSets().GetClient(), draNamespace, client.DefaultNamespaceDeletionTimeout); err != nil {
96+
return err
97+
}
98+
99+
klog.V(2).Infof("%s: DRA example driver uninstalled successfully", d)
100+
return nil
101+
}
102+
103+
func (d *draDependency) waitForDRADriverToBeHealthy(config *dependency.Config, timeout time.Duration) error {
104+
return wait.PollImmediate(
105+
checkDRAReadyInterval,
106+
timeout,
107+
func() (done bool, err error) {
108+
return d.isDRADriverReady(config)
109+
})
110+
}
111+
112+
func (d *draDependency) isDRADriverReady(config *dependency.Config) (done bool, err error) {
113+
ds, err := config.ClusterFramework.GetClientSets().
114+
GetClient().
115+
AppsV1().
116+
DaemonSets(draNamespace).
117+
Get(context.Background(), draDaemonsetName, metav1.GetOptions{})
118+
if err != nil {
119+
return false, fmt.Errorf("failed to get %s: %v", draDaemonsetName, err)
120+
}
121+
ready := ds.Status.NumberReady == ds.Status.DesiredNumberScheduled
122+
if !ready {
123+
klog.V(2).Infof("%s is not ready, "+
124+
"DesiredNumberScheduled: %d, NumberReady: %d", draDaemonsetName, ds.Status.DesiredNumberScheduled, ds.Status.NumberReady)
125+
}
126+
return ready, nil
127+
}
128+
129+
// String returns string representation of this dependency.
130+
func (d *draDependency) String() string {
131+
return draDependencyName
132+
}
Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
---
2+
# Source: dra-example-driver/templates/clusterrole.yaml
3+
apiVersion: rbac.authorization.k8s.io/v1
4+
kind: ClusterRole
5+
metadata:
6+
name: dra-example-driver-role
7+
rules:
8+
- apiGroups: ["resource.k8s.io"]
9+
resources: ["resourceclaims"]
10+
verbs: ["get"]
11+
- apiGroups: [""]
12+
resources: ["nodes"]
13+
verbs: ["get"]
14+
- apiGroups: ["resource.k8s.io"]
15+
resources: ["resourceslices"]
16+
verbs: ["get", "list", "watch", "create", "update", "patch", "delete"]
Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
---
2+
# Source: dra-example-driver/templates/clusterrolebinding.yaml
3+
apiVersion: rbac.authorization.k8s.io/v1
4+
kind: ClusterRoleBinding
5+
metadata:
6+
name: dra-example-driver-role-binding
7+
subjects:
8+
- kind: ServiceAccount
9+
name: dra-example-driver-service-account
10+
namespace: {{.Namespace}}
11+
roleRef:
12+
kind: ClusterRole
13+
name: dra-example-driver-role
14+
apiGroup: rbac.authorization.k8s.io
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
---
2+
# Source: dra-example-driver/templates/deviceclass.yaml
3+
apiVersion: resource.k8s.io/v1beta1
4+
kind: DeviceClass
5+
metadata:
6+
name: gpu.example.com
7+
spec:
8+
selectors:
9+
- cel:
10+
expression: "device.driver == 'gpu.example.com'"
Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,72 @@
1+
---
2+
# Source: dra-example-driver/templates/kubeletplugin.yaml
3+
apiVersion: apps/v1
4+
kind: DaemonSet
5+
metadata:
6+
name: dra-example-driver-kubeletplugin
7+
namespace: {{.Namespace}}
8+
labels:
9+
helm.sh/chart: dra-example-driver-0.1.3
10+
app.kubernetes.io/name: dra-example-driver
11+
app.kubernetes.io/instance: dra-example-driver
12+
app.kubernetes.io/version: "v0.1.0"
13+
app.kubernetes.io/component: kubeletplugin
14+
spec:
15+
selector:
16+
matchLabels:
17+
app.kubernetes.io/name: dra-example-driver
18+
app.kubernetes.io/instance: dra-example-driver
19+
app.kubernetes.io/component: kubeletplugin
20+
updateStrategy:
21+
type: RollingUpdate
22+
template:
23+
metadata:
24+
labels:
25+
app.kubernetes.io/name: dra-example-driver
26+
app.kubernetes.io/instance: dra-example-driver
27+
app.kubernetes.io/component: kubeletplugin
28+
spec:
29+
priorityClassName: system-node-critical
30+
serviceAccountName: dra-example-driver-service-account
31+
securityContext:
32+
{}
33+
containers:
34+
- name: plugin
35+
securityContext:
36+
privileged: true
37+
image: registry.k8s.io/dra-example-driver/dra-example-driver:v0.1.0
38+
imagePullPolicy: IfNotPresent
39+
command: ["dra-example-kubeletplugin"]
40+
resources:
41+
{}
42+
env:
43+
- name: CDI_ROOT
44+
value: /var/run/cdi
45+
- name: NODE_NAME
46+
valueFrom:
47+
fieldRef:
48+
fieldPath: spec.nodeName
49+
- name: NAMESPACE
50+
valueFrom:
51+
fieldRef:
52+
fieldPath: metadata.namespace
53+
# Simulated number of devices the example driver will pretend to have.
54+
- name: NUM_DEVICES
55+
value: "8"
56+
volumeMounts:
57+
- name: plugins-registry
58+
mountPath: /var/lib/kubelet/plugins_registry
59+
- name: plugins
60+
mountPath: /var/lib/kubelet/plugins
61+
- name: cdi
62+
mountPath: /var/run/cdi
63+
volumes:
64+
- name: plugins-registry
65+
hostPath:
66+
path: /var/lib/kubelet/plugins_registry
67+
- name: plugins
68+
hostPath:
69+
path: /var/lib/kubelet/plugins
70+
- name: cdi
71+
hostPath:
72+
path: /var/run/cdi
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
---
2+
# Source: dra-example-driver/templates/serviceaccount.yaml
3+
apiVersion: v1
4+
kind: ServiceAccount
5+
metadata:
6+
name: dra-example-driver-service-account
7+
namespace: {{.Namespace}}
8+
labels:
9+
helm.sh/chart: dra-example-driver-0.1.3
10+
app.kubernetes.io/name: dra-example-driver
11+
app.kubernetes.io/instance: dra-example-driver
12+
app.kubernetes.io/version: "v0.1.0"
Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
---
# Source: dra-example-driver/templates/validatingadmissionpolicy.yaml
# Restricts the driver's service account so that each kubelet-plugin pod may
# only create/update/delete ResourceSlices belonging to the node it runs on.
apiVersion: admissionregistration.k8s.io/v1
kind: ValidatingAdmissionPolicy
metadata:
  name: resourceslices-policy-dra-example-driver
spec:
  failurePolicy: Fail
  matchConstraints:
    resourceRules:
    - apiGroups: ["resource.k8s.io"]
      apiVersions: ["v1beta1"]
      operations: ["CREATE", "UPDATE", "DELETE"]
      resources: ["resourceslices"]
  matchConditions:
  # The namespace is templated (like the ServiceAccount, DaemonSet and
  # ClusterRoleBinding manifests) so the policy keeps matching the driver's
  # service account if the install namespace changes.
  - name: isRestrictedUser
    expression: >-
      request.userInfo.username == "system:serviceaccount:{{.Namespace}}:dra-example-driver-service-account"
  variables:
  - name: userNodeName
    expression: >-
      request.userInfo.extra[?'authentication.kubernetes.io/node-name'][0].orValue('')
  - name: objectNodeName
    expression: >-
      (request.operation == "DELETE" ? oldObject : object).spec.?nodeName.orValue("")
  validations:
  - expression: variables.userNodeName != ""
    message: >-
      no node association found for user, this user must run in a pod on a node and ServiceAccountTokenPodNodeInfo must be enabled
  - expression: variables.userNodeName == variables.objectNodeName
    messageExpression: >-
      "this user running on node '"+variables.userNodeName+"' may not modify " +
      (variables.objectNodeName == "" ?"cluster resourceslices" : "resourceslices on node '"+variables.objectNodeName+"'")
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
---
2+
# Source: dra-example-driver/templates/validatingadmissionpolicybinding.yaml
3+
apiVersion: admissionregistration.k8s.io/v1
4+
kind: ValidatingAdmissionPolicyBinding
5+
metadata:
6+
name: resourceslices-policy-dra-example-driver
7+
spec:
8+
policyName: resourceslices-policy-dra-example-driver
9+
validationActions: [Deny]
10+
# All ResourceSlices are matched.

clusterloader2/testing/dra/README.md

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
### Usage
2+
3+
In order to test the workload here, use the [Getting Started](../../docs/GETTING_STARTED.md) guide
4+
to set up a kind cluster for the test.
5+
6+
#### Steady State DRA Test
7+
8+
This test scenario first fills the cluster to 90% utilization with long-running pods, then measures the performance of
9+
constantly scheduling short-lived pods at a steady rate.
10+
11+
1. Use the following env variables:
12+
```
13+
export CL2_MODE=Indexed
14+
export CL2_NODES_PER_NAMESPACE=1
15+
export CL2_LOAD_TEST_THROUGHPUT=20 # Fast initial fill
16+
export CL2_STEADY_STATE_QPS=5 # Controlled rate for measurement
17+
export CL2_JOB_RUNNING_TIME=30s # Short-lived pods runtime
18+
export CL2_LONG_JOB_RUNNING_TIME=1h # Long-running pods runtime (for cluster fill)
19+
export CL2_GPUS_PER_NODE=8 # GPUs per node
20+
export CL2_FILL_PERCENTAGE=90 # Cluster fill percentage
21+
```
22+
23+
2. Run the test with:
24+
```
25+
./run-e2e.sh cluster-loader2 \
26+
--provider=kind \
27+
--kubeconfig=/root/.kube/config \
28+
--report-dir=/tmp/clusterloader2-results \
29+
--testconfig=testing/dra/config.yaml \
30+
--nodes=5
31+
```
32+
33+
This test will:
34+
1. Create ResourceClaimTemplates in each namespace
35+
2. Fill the cluster to 90% utilization with long-running pods (each using 1 GPU)
36+
3. Measure performance while continuously creating short-lived pods at a steady rate
37+
4. Collect metrics on pod startup latency, job lifecycle latency, and scheduler metrics

0 commit comments

Comments
 (0)