@@ -41,6 +41,7 @@ import (
4141
4242 tensorfusionaiv1 "github.com/NexusGPU/tensor-fusion/api/v1"
4343 tfv1 "github.com/NexusGPU/tensor-fusion/api/v1"
44+ "github.com/NexusGPU/tensor-fusion/internal/config"
4445 "github.com/NexusGPU/tensor-fusion/internal/controller"
4546 "github.com/NexusGPU/tensor-fusion/internal/scheduler"
4647 "github.com/NexusGPU/tensor-fusion/internal/server"
@@ -69,6 +70,7 @@ func main() {
6970 var secureMetrics bool
7071 var enableHTTP2 bool
7172 var tlsOpts []func (* tls.Config )
73+ var gpuInfoConfig string
7274
7375 flag .StringVar (& metricsAddr , "metrics-bind-address" , "0" , "The address the metrics endpoint binds to. " +
7476 "Use :8443 for HTTPS or :8080 for HTTP, or leave as 0 to disable the metrics service." )
@@ -80,6 +82,8 @@ func main() {
8082 "If set, the metrics endpoint is served securely via HTTPS. Use --metrics-secure=false to use HTTP instead." )
8183 flag .BoolVar (& enableHTTP2 , "enable-http2" , false ,
8284 "If set, HTTP/2 will be enabled for the metrics and webhook servers" )
85+ flag .StringVar (& gpuInfoConfig , "gpu-info-config" ,
86+ "/etc/tensor-fusion/gpu-info.yaml" , "specify the path to gpuInfoConfig file" )
8387 opts := zap.Options {
8488 Development : true ,
8589 }
@@ -107,6 +111,12 @@ func main() {
107111 TLSOpts : tlsOpts ,
108112 })
109113
114+ gpuInfos , err := config .LoadGpuInfoFromFile (gpuInfoConfig )
115+ if err != nil {
116+ ctrl .Log .Error (err , "unable to read gpuInfoConfig file" )
117+ gpuInfos = make ([]config.GpuInfo , 0 )
118+ }
119+
110120 // Metrics endpoint is enabled in 'config/default/kustomization.yaml'. The Metrics options configure the server.
111121 // More info:
112122 // - https://pkg.go.dev/sigs.k8s.io/controller-runtime/pkg/metrics/server
@@ -260,6 +270,7 @@ func main() {
260270 Scheme : mgr .GetScheme (),
261271 Scheduler : scheduler ,
262272 Recorder : mgr .GetEventRecorderFor ("tensorfusionworkload" ),
273+ GpuInfos : gpuInfos ,
263274 }).SetupWithManager (mgr ); err != nil {
264275 setupLog .Error (err , "unable to create controller" , "controller" , "TensorFusionWorkload" )
265276 os .Exit (1 )
0 commit comments