@@ -245,11 +245,12 @@ echo "Saved profiling output to ConfigMap {{.ConfigMapName}}"
245245// DynamoGraphDeploymentRequestReconciler reconciles a DynamoGraphDeploymentRequest object
246246type DynamoGraphDeploymentRequestReconciler struct {
247247 client.Client
248- APIReader client.Reader
249- Recorder record.EventRecorder
250- Config * configv1alpha1.OperatorConfiguration
251- RuntimeConfig * commonController.RuntimeConfig
252-
248+ APIReader client.Reader
249+ Recorder record.EventRecorder
250+ Config * configv1alpha1.OperatorConfiguration
251+ RuntimeConfig * commonController.RuntimeConfig
252+ GPUDiscoveryCache * gpu.GPUDiscoveryCache // passed to GPUDiscovery.DiscoverGPUsFromDCGM on every discovery call
253+ GPUDiscovery * gpu.GPUDiscovery // provides DiscoverGPUsFromDCGM, used for GPU auto-discovery
253254 // RBACManager handles RBAC setup for profiling jobs
254255 RBACManager RBACManager
255256}
@@ -866,14 +867,6 @@ func (r *DynamoGraphDeploymentRequestReconciler) validateGPUHardwareInfo(ctx con
866867 return nil
867868 }
868869
869- _ , err := gpu .DiscoverGPUs (ctx , r .APIReader )
870- if err == nil {
871- // GPU discovery is available, validation passes
872- return nil
873- }
874-
875- logger .Info ("GPU discovery not available" , "reason" , err .Error ())
876-
877870 isNamespaceScoped := r .Config .Namespace .Restricted != ""
878871 if isNamespaceScoped {
879872 return fmt .Errorf (
@@ -887,9 +880,63 @@ func (r *DynamoGraphDeploymentRequestReconciler) validateGPUHardwareInfo(ctx con
887880 "\n vramMb: 81920" )
888881 }
889882
883+ _ , err := r .GPUDiscovery .DiscoverGPUsFromDCGM (ctx , r .APIReader , r .GPUDiscoveryCache )
884+ if err == nil {
885+ // GPU discovery is available, validation passes
886+ return nil
887+ }
888+ // Refine the logger message
889+ reason := GetGPUDiscoveryFailureReason (err )
890+ logger .Info ("GPU discovery not available" , "reason" , reason , "error" , err .Error ())
890891 return fmt .Errorf ("GPU hardware info required but auto-discovery failed. Add spec.hardware.gpuSku, spec.hardware.vramMb, spec.hardware.numGpusPerNode" )
891892}
892893
// GetGPUDiscoveryFailureReason classifies a GPU discovery error and
// returns a stable, actionable reason string suitable for structured logging.
//
// The classification is a case-insensitive substring match against known
// error message patterns produced during:
//   - DCGM exporter pod discovery
//   - Helm-based GPU operator and DCGM discovery
//   - Metrics scraping
//   - Prometheus parsing
//
// Cases are evaluated top to bottom and the first match wins, so the order of
// the cases is significant for messages that contain several patterns.
//
// If err is nil or does not match any known category, "unknown" is returned.
func GetGPUDiscoveryFailureReason(err error) string {
	if err == nil {
		return "unknown"
	}
	// The message is lowercased once, so every pattern below MUST be written
	// in lowercase; a pattern with an uppercase letter can never match.
	errMsg := strings.ToLower(err.Error())

	switch {
	case strings.Contains(errMsg, "list pods"):
		return "failed to list DCGM exporter pods (RBAC/cluster connectivity issue)"
	case strings.Contains(errMsg, "gpu operator is not installed"):
		return "GPU Operator not installed in expected namespace"
	case strings.Contains(errMsg, "helm init failed"):
		return "failed to initialize Helm client (RBAC, kubeconfig, or Helm driver issue)"
	case strings.Contains(errMsg, "timeout waiting for dcgm exporter pods"):
		return "timeout while waiting for DCGM exporter pods to become ready"
	case strings.Contains(errMsg, "http get"):
		return "failed to reach DCGM metrics endpoint on pod (network/port issue)"
	case strings.Contains(errMsg, "metrics endpoint") &&
		strings.Contains(errMsg, "status"):
		return "DCGM pod metrics endpoint returned non-200 status"
	case strings.Contains(errMsg, "parse prometheus metrics"):
		return "failed to parse dcgm Prometheus metrics (invalid format)"
	case strings.Contains(errMsg, "no gpus detected"):
		return "no GPUs detected in dcgm metrics (GPU model or metrics missing)"
	// Previously this pattern contained "GPU Operator" in mixed case and was
	// therefore unreachable against the lowercased message.
	case strings.Contains(errMsg, "dcgm is not enabled in the gpu operator"):
		return "DCGM is not enabled in the GPU Operator (check GPU Operator configuration and permissions)"
	case strings.Contains(errMsg, "failed to scrape any dcgm exporter pod"):
		return "failed to scrape any dcgm exporter pod (check DCGM exporter pod status and network connectivity)"
	case strings.Contains(errMsg, "no gpu metrics could be parsed from any dcgm pod"):
		return "no GPU metrics could be parsed from any DCGM pod (check DCGM exporter pod status and network connectivity)"
	case strings.Contains(errMsg, "failed to create helm path"):
		return "failed to initialize Helm client (RBAC, kubeconfig, or Helm driver issue)"
	}
	return "unknown"
}
939+
893940// createProfilingJob creates a Kubernetes Job for profiling using SyncResource
894941func (r * DynamoGraphDeploymentRequestReconciler ) createProfilingJob (ctx context.Context , dgdr * nvidiacomv1beta1.DynamoGraphDeploymentRequest ) error {
895942 logger := log .FromContext (ctx )
@@ -1203,20 +1250,35 @@ func (r *DynamoGraphDeploymentRequestReconciler) enrichHardwareFromDiscovery(ctx
12031250 return nil // all fields already set by user; TotalGPUs is filled below when discovery runs
12041251 }
12051252
1206- gpuInfo , err := gpu .DiscoverGPUs (ctx , r .APIReader )
1207- if err != nil {
1208- return err
1209- }
1210-
1253+ var gpuInfo * gpu.GPUInfo
12111254 logger := log .FromContext (ctx )
1212- logger .Info ("GPU discovery completed successfully" ,
1213- "gpusPerNode" , gpuInfo .GPUsPerNode ,
1214- "nodesWithGPUs" , gpuInfo .NodesWithGPUs ,
1215- "totalGpus" , gpuInfo .GPUsPerNode * gpuInfo .NodesWithGPUs ,
1216- "model" , gpuInfo .Model ,
1217- "system" , gpuInfo .System ,
1218- "vramMiB" , gpuInfo .VRAMPerGPU )
1255+ // Check if user provided hardware info in the typed spec
1256+ hasManualConfig := dgdr .Spec .Hardware != nil && (dgdr .Spec .Hardware .GPUSKU != "" ||
1257+ dgdr .Spec .Hardware .VRAMMB != nil ||
1258+ dgdr .Spec .Hardware .NumGPUsPerNode != nil )
1259+ if ! hasManualConfig {
12191260
1261+ logger .Info ("Attempting GPU discovery for profiling job" )
1262+ discoveredInfo , err := r .GPUDiscovery .DiscoverGPUsFromDCGM (ctx , r .APIReader , r .GPUDiscoveryCache )
1263+ if err != nil {
1264+ // This path is expected for namespace-restricted operators without node read permissions
1265+ // Refine the logger message
1266+ reason := GetGPUDiscoveryFailureReason (err )
1267+ logger .Info ("GPU discovery not available, using manual hardware configuration from profiling config" ,
1268+ "reason" , reason , "error" , err .Error ())
1269+ return err
1270+ } else {
1271+ gpuInfo = discoveredInfo
1272+ logger .Info ("GPU discovery completed successfully" ,
1273+ "gpusPerNode" , gpuInfo .GPUsPerNode ,
1274+ "nodesWithGPUs" , gpuInfo .NodesWithGPUs ,
1275+ "totalGpus" , gpuInfo .GPUsPerNode * gpuInfo .NodesWithGPUs ,
1276+ "model" , gpuInfo .Model ,
1277+ "vramMiB" , gpuInfo .VRAMPerGPU ,
1278+ "system" , gpuInfo .System ,
1279+ "cloudprovider" , gpuInfo .CloudProvider )
1280+ }
1281+ }
12201282 if hw .GPUSKU == "" {
12211283 if gpuInfo .System != "" {
12221284 hw .GPUSKU = gpuInfo .System
0 commit comments