Skip to content

Commit b1e85a9

Browse files
authored
Eliranw/run 28124 utilization metrics (#148)
Add check for workloadKind to generate utilization metrics
1 parent ae308ef commit b1e85a9

File tree

2 files changed

+15
-27
lines changed

2 files changed

+15
-27
lines changed

internal/status-updater/handlers/pod/gpu_usage_calculator.go

Lines changed: 15 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -71,7 +71,7 @@ func calculateGpuUsageFromPodType(dynamicclient dynamic.Interface, pod *v1.Pod,
7171
switch podType {
7272
case "train":
7373
return generateGpuUsageStatus(topology.Range{Min: 80, Max: 100}, gpuFraction, totalGpuMemory, false)
74-
case "build", "interactive-preemptible":
74+
case "build", "interactive-preemptible", "interactive", "distributed":
7575
return generateGpuUsageStatus(topology.Range{Min: 0, Max: 0}, gpuFraction, totalGpuMemory, false)
7676
case "inference":
7777
return generateGpuUsageStatus(topology.Range{Min: 0, Max: 0}, gpuFraction, totalGpuMemory, true)
@@ -104,6 +104,20 @@ func calculateUtilizationFromAnnotation(annotationValue string) (*topology.Range
104104
}
105105

106106
func getPodType(dynamicClient dynamic.Interface, pod *v1.Pod) (string, error) {
107+
if workloadKind, ok := pod.Labels["workloadKind"]; ok {
108+
switch workloadKind {
109+
case "TrainingWorkload":
110+
return "train", nil
111+
case "DistributedWorkload":
112+
return "distributed", nil
113+
case "InferenceWorkload":
114+
return "inference", nil
115+
case "InteractiveWorkload":
116+
return "interactive", nil
117+
}
118+
}
119+
120+
// Fallback to existing PodGroup lookup
107121
podGroupName := pod.Annotations[constants.AnnotationPodGroupName]
108122
if podGroupName == "" {
109123
return "", fmt.Errorf("pod %s has no constants.PodGroupNameAnnotation annotation", pod.Name)

test/integration/kind-cluster-config.yaml

Lines changed: 0 additions & 26 deletions
Original file line number | Diff line number | Diff line change
@@ -3,13 +3,6 @@ apiVersion: kind.x-k8s.io/v1alpha4
33
name: dra-cluster
44
featureGates:
55
DynamicResourceAllocation: true
6-
containerdConfigPatches:
7-
# Enable CDI as described in
8-
# https://tags.cncf.io/container-device-interface#containerd-configuration
9-
- |-
10-
[plugins."io.containerd.grpc.v1.cri"]
11-
enable_cdi = true
12-
cdi_spec_dirs = ["/etc/cdi", "/var/run/cdi"]
136
nodes:
147
- role: control-plane
158
image: kindest/node:v1.34.0
@@ -19,22 +12,3 @@ nodes:
1912
apiServer:
2013
extraArgs:
2114
runtime-config: "resource.k8s.io/v1beta1=true"
22-
scheduler:
23-
extraArgs:
24-
v: "1"
25-
controllerManager:
26-
extraArgs:
27-
v: "1"
28-
- |
29-
kind: InitConfiguration
30-
nodeRegistration:
31-
kubeletExtraArgs:
32-
v: "4"
33-
- role: worker
34-
image: kindest/node:v1.34.0
35-
kubeadmConfigPatches:
36-
- |
37-
kind: JoinConfiguration
38-
nodeRegistration:
39-
kubeletExtraArgs:
40-
v: "4"

0 commit comments

Comments
 (0)