@@ -2,6 +2,7 @@ package computing
22
33import (
44 "bufio"
5+ "bytes"
56 "context"
67 "encoding/base64"
78 "encoding/json"
@@ -295,6 +296,75 @@ func ReceiveJob(c *gin.Context) {
295296 c .JSON (http .StatusOK , util .CreateSuccessResponse (jobData ))
296297}
297298
299+ func GetResourceExporterMetrics (c * gin.Context ) {
300+ logs .GetLogger ().Info ("Starting GetResourceExporterMetrics function." )
301+ k8sService := NewK8sService ()
302+ if k8sService == nil {
303+ logs .GetLogger ().Info ("Failed to create k8s service client." )
304+ c .JSON (http .StatusInternalServerError , util .CreateErrorResponse (util .ServerError , "failed to create k8s service client" ))
305+ return
306+ }
307+ logs .GetLogger ().Info ("Successfully created k8s service client." )
308+
309+ logs .GetLogger ().Info ("Attempting to list resource-exporter pods in kube-system namespace." )
310+ podList , err := k8sService .k8sClient .CoreV1 ().Pods ("kube-system" ).List (context .Background (), metaV1.ListOptions {
311+ LabelSelector : "app=resource-exporter" ,
312+ })
313+ if err != nil {
314+ logs .GetLogger ().Errorf ("Failed to list resource-exporter pods, error: %v" , err )
315+ c .JSON (http .StatusInternalServerError , util .CreateErrorResponse (util .ServerError , fmt .Sprintf ("failed to list resource-exporter pods: %v" , err )))
316+ return
317+ }
318+ logs .GetLogger ().Infof ("Successfully listed %d resource-exporter pods." , len (podList .Items ))
319+
320+ if len (podList .Items ) == 0 {
321+ logs .GetLogger ().Info ("No resource-exporter pods found." )
322+ c .JSON (http .StatusNotFound , util .CreateErrorResponse (util .ServerError , "resource-exporter pod not found" ))
323+ return
324+ }
325+ logs .GetLogger ().Info ("Resource-exporter pod found." )
326+
327+ // Assuming we only need metrics from one resource-exporter pod, pick the first one
328+ resourceExporterPodName := podList .Items [0 ].Name
329+ if resourceExporterPodName == "" {
330+ logs .GetLogger ().Info ("Resource-exporter pod name is empty." )
331+ c .JSON (http .StatusInternalServerError , util .CreateErrorResponse (util .ServerError , "resource-exporter pod name not found" ))
332+ return
333+ }
334+ logs .GetLogger ().Infof ("Resource-exporter pod name: %s" , resourceExporterPodName )
335+
336+ var mergedMetrics bytes.Buffer
337+
338+ // Get /node/metrics using PodDoCommand
339+ logs .GetLogger ().Infof ("Attempting to get /node/metrics from resource-exporter pod %s." , resourceExporterPodName )
340+ // wget -q -O - localhost:9000/node/metrics
341+ nodeMetricsCmd := []string {"wget" , "-q" , "-O" , "-" , "localhost:9000/node/metrics" }
342+ nodeMetricsStdout , _ , err := k8sService .PodDoCommand ("kube-system" , resourceExporterPodName , "" , nodeMetricsCmd )
343+ if err != nil {
344+ logs .GetLogger ().Errorf ("Failed to get /node/metrics from resource-exporter pod %s: %v" , resourceExporterPodName , err )
345+ c .JSON (http .StatusInternalServerError , util .CreateErrorResponse (util .ServerError , fmt .Sprintf ("failed to get node metrics: %v" , err )))
346+ return
347+ }
348+ logs .GetLogger ().Info ("Successfully retrieved /node/metrics." )
349+ mergedMetrics .WriteString (nodeMetricsStdout )
350+ mergedMetrics .WriteString ("\n " )
351+
352+ // Get /dcgm/metrics using PodDoCommand
353+ logs .GetLogger ().Infof ("Attempting to get /dcgm/metrics from resource-exporter pod %s." , resourceExporterPodName )
354+ dcgmMetricsCmd := []string {"wget" , "-q" , "-O" , "-" , "localhost:9000/dcgm/metrics" }
355+ dcgmMetricsStdout , _ , err := k8sService .PodDoCommand ("kube-system" , resourceExporterPodName , "" , dcgmMetricsCmd )
356+ if err != nil {
357+ logs .GetLogger ().Errorf ("Failed to get /dcgm/metrics from resource-exporter pod %s: %v" , resourceExporterPodName , err )
358+ c .JSON (http .StatusInternalServerError , util .CreateErrorResponse (util .ServerError , fmt .Sprintf ("failed to get dcgm metrics: %v" , err )))
359+ return
360+ }
361+ logs .GetLogger ().Info ("Successfully retrieved /dcgm/metrics." )
362+ mergedMetrics .WriteString (dcgmMetricsStdout )
363+
364+ logs .GetLogger ().Info ("Returning merged resource-exporter metrics." )
365+ c .String (http .StatusOK , mergedMetrics .String ())
366+ }
367+
298368func submitJob (jobData * models.JobData ) error {
299369 cpRepoPath , ok := os .LookupEnv ("CP_PATH" )
300370 if ! ok {
@@ -1775,7 +1845,7 @@ func downloadModelUrl(namespace, jobUuid, serviceIp string, podCmd []string) {
17751845 return
17761846 }
17771847
1778- if err = k8sService .PodDoCommand (namespace , podName , "" , podCmd ); err != nil {
1848+ if _ , _ , err = k8sService .PodDoCommand (namespace , podName , "" , podCmd ); err != nil {
17791849 logs .GetLogger ().Error (err )
17801850 return
17811851 }
0 commit comments