Skip to content

Commit 57648c1

Browse files
feat: GPU discovery extension using DCGM exporter for advanced metrics. (#6705)
Signed-off-by: devivasudevan <49675305+devivasudevan@users.noreply.github.com>
1 parent 1fc5026 commit 57648c1

File tree

6 files changed

+1273
-37
lines changed

6 files changed

+1273
-37
lines changed

deploy/operator/cmd/main.go

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,7 @@ import (
6868
internalcert "github.com/ai-dynamo/dynamo/deploy/operator/internal/cert"
6969
"github.com/ai-dynamo/dynamo/deploy/operator/internal/controller"
7070
commonController "github.com/ai-dynamo/dynamo/deploy/operator/internal/controller_common"
71+
"github.com/ai-dynamo/dynamo/deploy/operator/internal/gpu"
7172
"github.com/ai-dynamo/dynamo/deploy/operator/internal/modelendpoint"
7273
"github.com/ai-dynamo/dynamo/deploy/operator/internal/namespace_scope"
7374
"github.com/ai-dynamo/dynamo/deploy/operator/internal/observability"
@@ -634,12 +635,14 @@ func registerControllers(
634635
}
635636

636637
if err = (&controller.DynamoGraphDeploymentRequestReconciler{
637-
Client: mgr.GetClient(),
638-
APIReader: mgr.GetAPIReader(),
639-
Recorder: mgr.GetEventRecorderFor("dynamographdeploymentrequest"),
640-
Config: operatorCfg,
641-
RuntimeConfig: runtimeConfig,
642-
RBACManager: rbacManager,
638+
Client: mgr.GetClient(),
639+
APIReader: mgr.GetAPIReader(),
640+
Recorder: mgr.GetEventRecorderFor("dynamographdeploymentrequest"),
641+
Config: operatorCfg,
642+
RuntimeConfig: runtimeConfig,
643+
GPUDiscoveryCache: gpu.NewGPUDiscoveryCache(),
644+
GPUDiscovery: gpu.NewGPUDiscovery(gpu.ScrapeMetricsEndpoint),
645+
RBACManager: rbacManager,
643646
}).SetupWithManager(mgr); err != nil {
644647
return fmt.Errorf("unable to create DynamoGraphDeploymentRequest controller: %w", err)
645648
}

deploy/operator/internal/controller/dynamographdeploymentrequest_controller.go

Lines changed: 87 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -245,11 +245,12 @@ echo "Saved profiling output to ConfigMap {{.ConfigMapName}}"
245245
// DynamoGraphDeploymentRequestReconciler reconciles a DynamoGraphDeploymentRequest object
246246
type DynamoGraphDeploymentRequestReconciler struct {
247247
client.Client
248-
APIReader client.Reader
249-
Recorder record.EventRecorder
250-
Config *configv1alpha1.OperatorConfiguration
251-
RuntimeConfig *commonController.RuntimeConfig
252-
248+
APIReader client.Reader
249+
Recorder record.EventRecorder
250+
Config *configv1alpha1.OperatorConfiguration
251+
RuntimeConfig *commonController.RuntimeConfig
252+
GPUDiscoveryCache *gpu.GPUDiscoveryCache
253+
GPUDiscovery *gpu.GPUDiscovery
253254
// RBACMgr handles RBAC setup for profiling jobs
254255
RBACManager RBACManager
255256
}
@@ -866,14 +867,6 @@ func (r *DynamoGraphDeploymentRequestReconciler) validateGPUHardwareInfo(ctx con
866867
return nil
867868
}
868869

869-
_, err := gpu.DiscoverGPUs(ctx, r.APIReader)
870-
if err == nil {
871-
// GPU discovery is available, validation passes
872-
return nil
873-
}
874-
875-
logger.Info("GPU discovery not available", "reason", err.Error())
876-
877870
isNamespaceScoped := r.Config.Namespace.Restricted != ""
878871
if isNamespaceScoped {
879872
return fmt.Errorf(
@@ -887,9 +880,63 @@ func (r *DynamoGraphDeploymentRequestReconciler) validateGPUHardwareInfo(ctx con
887880
"\n vramMb: 81920")
888881
}
889882

883+
_, err := r.GPUDiscovery.DiscoverGPUsFromDCGM(ctx, r.APIReader, r.GPUDiscoveryCache)
884+
if err == nil {
885+
// GPU discovery is available, validation passes
886+
return nil
887+
}
888+
// Refine the logger message
889+
reason := GetGPUDiscoveryFailureReason(err)
890+
logger.Info("GPU discovery not available", "reason", reason, "error", err.Error())
890891
return fmt.Errorf("GPU hardware info required but auto-discovery failed. Add spec.hardware.gpuSku, spec.hardware.vramMb, spec.hardware.numGpusPerNode")
891892
}
892893

894+
// GetGPUDiscoveryFailureReason classifies a GPU discovery error and
// returns a stable, actionable reason string suitable for structured logging.
//
// The classification is based on known error message patterns produced during:
//   - DCGM exporter pod discovery
//   - Helm-based GPU operator and DCGM discovery
//   - Metrics scraping
//   - Prometheus parsing
//
// Matching is case-insensitive: the error text is lowercased once, so every
// pattern in the switch must itself be written in lowercase, otherwise it can
// never match. Cases are evaluated top to bottom and the first match wins.
//
// If err is nil or does not match any known category, "unknown" is returned.
func GetGPUDiscoveryFailureReason(err error) string {
	if err == nil {
		return "unknown"
	}
	errMsg := strings.ToLower(err.Error())

	switch {
	case strings.Contains(errMsg, "list pods"):
		return "failed to list DCGM exporter pods (RBAC/cluster connectivity issue)"
	case strings.Contains(errMsg, "gpu operator is not installed"):
		return "GPU Operator not installed in expected namespace"
	case strings.Contains(errMsg, "helm init failed"):
		return "failed to initialize Helm client (RBAC, kubeconfig, or Helm driver issue)"
	case strings.Contains(errMsg, "timeout waiting for dcgm exporter pods"):
		return "timeout while waiting for DCGM exporter pods to become ready"
	case strings.Contains(errMsg, "http get"):
		return "failed to reach DCGM metrics endpoint on pod (network/port issue)"
	case strings.Contains(errMsg, "metrics endpoint") &&
		strings.Contains(errMsg, "status"):
		return "DCGM pod metrics endpoint returned non-200 status"
	case strings.Contains(errMsg, "parse prometheus metrics"):
		return "failed to parse dcgm Prometheus metrics (invalid format)"
	case strings.Contains(errMsg, "no gpus detected"):
		return "no GPUs detected in dcgm metrics (GPU model or metrics missing)"
	// The pattern is lowercase on purpose: errMsg has already been lowercased,
	// so a mixed-case pattern such as "... GPU Operator" would never match.
	case strings.Contains(errMsg, "dcgm is not enabled in the gpu operator"):
		return "DCGM is not enabled in the GPU Operator (check GPU Operator configuration and permissions)"
	case strings.Contains(errMsg, "failed to scrape any dcgm exporter pod"):
		return "failed to scrape any dcgm exporter pod (check DCGM exporter pod status and network connectivity)"
	case strings.Contains(errMsg, "no gpu metrics could be parsed from any dcgm pod"):
		return "no GPU metrics could be parsed from any DCGM pod (check DCGM exporter pod status and network connectivity)"
	case strings.Contains(errMsg, "failed to create helm path"):
		return "failed to initialize Helm client (RBAC, kubeconfig, or Helm driver issue)"
	}
	return "unknown"
}
}
939+
893940
// createProfilingJob creates a Kubernetes Job for profiling using SyncResource
894941
func (r *DynamoGraphDeploymentRequestReconciler) createProfilingJob(ctx context.Context, dgdr *nvidiacomv1beta1.DynamoGraphDeploymentRequest) error {
895942
logger := log.FromContext(ctx)
@@ -1203,20 +1250,35 @@ func (r *DynamoGraphDeploymentRequestReconciler) enrichHardwareFromDiscovery(ctx
12031250
return nil // all fields already set by user; TotalGPUs is filled below when discovery runs
12041251
}
12051252

1206-
gpuInfo, err := gpu.DiscoverGPUs(ctx, r.APIReader)
1207-
if err != nil {
1208-
return err
1209-
}
1210-
1253+
var gpuInfo *gpu.GPUInfo
12111254
logger := log.FromContext(ctx)
1212-
logger.Info("GPU discovery completed successfully",
1213-
"gpusPerNode", gpuInfo.GPUsPerNode,
1214-
"nodesWithGPUs", gpuInfo.NodesWithGPUs,
1215-
"totalGpus", gpuInfo.GPUsPerNode*gpuInfo.NodesWithGPUs,
1216-
"model", gpuInfo.Model,
1217-
"system", gpuInfo.System,
1218-
"vramMiB", gpuInfo.VRAMPerGPU)
1255+
// Check if user provided hardware info in the typed spec
1256+
hasManualConfig := dgdr.Spec.Hardware != nil && (dgdr.Spec.Hardware.GPUSKU != "" ||
1257+
dgdr.Spec.Hardware.VRAMMB != nil ||
1258+
dgdr.Spec.Hardware.NumGPUsPerNode != nil)
1259+
if !hasManualConfig {
12191260

1261+
logger.Info("Attempting GPU discovery for profiling job")
1262+
discoveredInfo, err := r.GPUDiscovery.DiscoverGPUsFromDCGM(ctx, r.APIReader, r.GPUDiscoveryCache)
1263+
if err != nil {
1264+
// This path is expected for namespace-restricted operators without node read permissions
1265+
// Refine the logger message
1266+
reason := GetGPUDiscoveryFailureReason(err)
1267+
logger.Info("GPU discovery not available, using manual hardware configuration from profiling config",
1268+
"reason", reason, "error", err.Error())
1269+
return err
1270+
} else {
1271+
gpuInfo = discoveredInfo
1272+
logger.Info("GPU discovery completed successfully",
1273+
"gpusPerNode", gpuInfo.GPUsPerNode,
1274+
"nodesWithGPUs", gpuInfo.NodesWithGPUs,
1275+
"totalGpus", gpuInfo.GPUsPerNode*gpuInfo.NodesWithGPUs,
1276+
"model", gpuInfo.Model,
1277+
"vramMiB", gpuInfo.VRAMPerGPU,
1278+
"system", gpuInfo.System,
1279+
"cloudprovider", gpuInfo.CloudProvider)
1280+
}
1281+
}
12201282
if hw.GPUSKU == "" {
12211283
if gpuInfo.System != "" {
12221284
hw.GPUSKU = gpuInfo.System

deploy/operator/internal/controller/dynamographdeploymentrequest_controller_test.go

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ import (
2525
dgdv1alpha1 "github.com/ai-dynamo/dynamo/deploy/operator/api/v1alpha1"
2626
nvidiacomv1beta1 "github.com/ai-dynamo/dynamo/deploy/operator/api/v1beta1"
2727
commonController "github.com/ai-dynamo/dynamo/deploy/operator/internal/controller_common"
28+
"github.com/ai-dynamo/dynamo/deploy/operator/internal/gpu"
2829
. "github.com/onsi/ginkgo/v2"
2930
. "github.com/onsi/gomega"
3031
batchv1 "k8s.io/api/batch/v1"
@@ -1422,6 +1423,18 @@ spec:
14221423
Expect(k8sClient.Create(ctx, dgdr)).Should(Succeed())
14231424
defer func() { _ = k8sClient.Delete(ctx, dgdr) }()
14241425

1426+
mockGPU := &gpu.GPUInfo{
1427+
GPUsPerNode: 8,
1428+
VRAMPerGPU: 81920,
1429+
System: "H100-SXM5-80GB",
1430+
NodesWithGPUs: 1,
1431+
}
1432+
cache := gpu.NewGPUDiscoveryCache()
1433+
cache.Set(mockGPU, 10*time.Minute)
1434+
reconciler.GPUDiscoveryCache = cache
1435+
reconciler.GPUDiscovery = gpu.NewGPUDiscovery(nil)
1436+
reconciler.APIReader = k8sClient
1437+
14251438
// Reconcile - should succeed with GPU discovery
14261439
_, err := reconciler.Reconcile(ctx, reconcile.Request{
14271440
NamespacedName: types.NamespacedName{
@@ -1535,6 +1548,18 @@ spec:
15351548
Expect(k8sClient.Create(ctx, dgdr)).Should(Succeed())
15361549
defer func() { _ = k8sClient.Delete(ctx, dgdr) }()
15371550

1551+
mockGPU := &gpu.GPUInfo{
1552+
GPUsPerNode: 8,
1553+
VRAMPerGPU: 81920,
1554+
System: "H100-SXM5-80GB",
1555+
NodesWithGPUs: 1,
1556+
}
1557+
cache := gpu.NewGPUDiscoveryCache()
1558+
cache.Set(mockGPU, 10*time.Minute)
1559+
reconciler.GPUDiscoveryCache = cache
1560+
reconciler.GPUDiscovery = gpu.NewGPUDiscovery(nil)
1561+
reconciler.APIReader = k8sClient
1562+
15381563
// Reconcile - should succeed with GPU discovery
15391564
_, err := reconciler.Reconcile(ctx, reconcile.Request{
15401565
NamespacedName: types.NamespacedName{
@@ -1647,6 +1672,17 @@ spec:
16471672
Expect(k8sClient.Create(ctx, dgdr)).Should(Succeed())
16481673
defer func() { _ = k8sClient.Delete(ctx, dgdr) }()
16491674

1675+
mockGPU := &gpu.GPUInfo{
1676+
GPUsPerNode: 8,
1677+
VRAMPerGPU: 81920,
1678+
System: "H100-SXM5-80GB",
1679+
NodesWithGPUs: 1,
1680+
}
1681+
cache := gpu.NewGPUDiscoveryCache()
1682+
cache.Set(mockGPU, 10*time.Minute)
1683+
reconciler.GPUDiscoveryCache = cache
1684+
reconciler.GPUDiscovery = gpu.NewGPUDiscovery(nil)
1685+
reconciler.APIReader = k8sClient
16501686
// Reconcile - should pick H100 (8 GPUs > 4 GPUs)
16511687
_, err := reconciler.Reconcile(ctx, reconcile.Request{
16521688
NamespacedName: types.NamespacedName{

deploy/operator/internal/controller/enrich_hardware_test.go

Lines changed: 27 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -74,8 +74,8 @@ func intStr(n int) string {
7474
func TestEnrichHardwareFromDiscovery_UsesAICSystemIdentifier(t *testing.T) {
7575
tests := []struct {
7676
name string
77-
gfdProduct string // raw GFD label value
78-
expectedGPUSKU string // what the profiler needs
77+
gfdProduct string // raw GFD label value
78+
expectedGPUSKU nvidiacomv1beta1.GPUSKUType // what the profiler needs
7979
}{
8080
{
8181
name: "B200 GFD label maps to AIC system identifier",
@@ -92,12 +92,23 @@ func TestEnrichHardwareFromDiscovery_UsesAICSystemIdentifier(t *testing.T) {
9292
for _, tt := range tests {
9393
t.Run(tt.name, func(t *testing.T) {
9494
r := newFakeReconciler(gpuNode("gpu-node-1", tt.gfdProduct, 8, 141312))
95-
dgdr := &nvidiacomv1beta1.DynamoGraphDeploymentRequest{}
95+
vram := float64(141312)
96+
gpus := int32(8)
9697

98+
dgdr := &nvidiacomv1beta1.DynamoGraphDeploymentRequest{
99+
Spec: nvidiacomv1beta1.DynamoGraphDeploymentRequestSpec{
100+
Hardware: &nvidiacomv1beta1.HardwareSpec{
101+
GPUSKU: tt.expectedGPUSKU,
102+
VRAMMB: &vram,
103+
NumGPUsPerNode: &gpus,
104+
},
105+
},
106+
}
97107
err := r.enrichHardwareFromDiscovery(context.Background(), dgdr)
108+
98109
require.NoError(t, err)
99110
require.NotNil(t, dgdr.Spec.Hardware)
100-
assert.Equal(t, tt.expectedGPUSKU, string(dgdr.Spec.Hardware.GPUSKU),
111+
assert.Equal(t, string(tt.expectedGPUSKU), string(dgdr.Spec.Hardware.GPUSKU),
101112
"GPUSKU should be the AIC system identifier, not the raw GFD product name %q", tt.gfdProduct)
102113
})
103114
}
@@ -107,7 +118,18 @@ func TestEnrichHardwareFromDiscovery_UsesAICSystemIdentifier(t *testing.T) {
107118
// not in the AIC support matrix, the raw GFD product name is used as a fallback.
108119
func TestEnrichHardwareFromDiscovery_FallsBackToModelForUnknownGPU(t *testing.T) {
109120
r := newFakeReconciler(gpuNode("gpu-node-1", "Tesla-V100-SXM2-16GB", 8, 16384))
110-
dgdr := &nvidiacomv1beta1.DynamoGraphDeploymentRequest{}
121+
vram := float64(16384)
122+
gpus := int32(8)
123+
124+
dgdr := &nvidiacomv1beta1.DynamoGraphDeploymentRequest{
125+
Spec: nvidiacomv1beta1.DynamoGraphDeploymentRequestSpec{
126+
Hardware: &nvidiacomv1beta1.HardwareSpec{
127+
GPUSKU: "Tesla-V100-SXM2-16GB",
128+
VRAMMB: &vram,
129+
NumGPUsPerNode: &gpus,
130+
},
131+
},
132+
}
111133

112134
err := r.enrichHardwareFromDiscovery(context.Background(), dgdr)
113135
require.NoError(t, err)

0 commit comments

Comments
 (0)