99
1010 "github.com/NVIDIA/go-nvml/pkg/nvml"
1111 tfv1 "github.com/NexusGPU/tensor-fusion-operator/api/v1"
12+ "github.com/NexusGPU/tensor-fusion-operator/internal/config"
1213 "github.com/NexusGPU/tensor-fusion-operator/internal/reporter"
14+ "github.com/samber/lo"
1315 "k8s.io/apimachinery/pkg/api/resource"
1416 metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
1517 ctrl "sigs.k8s.io/controller-runtime"
@@ -19,9 +21,11 @@ import (
1921func main () {
2022 var dryRun bool
2123 var hostname string
22-
24+ var gpuInfoConfig string
2325 flag .BoolVar (& dryRun , "dry-run" , false , "dry run mode" )
2426 flag .StringVar (& hostname , "hostname" , "" , "hostname" )
27+ flag .StringVar (& gpuInfoConfig , "gpu-info-config" , "" , "specify the path to gpuInfoConfig file" )
28+
2529 if hostname == "" {
2630 hostname = os .Getenv ("HOSTNAME" )
2731 }
@@ -34,6 +38,12 @@ func main() {
3438 flag .Parse ()
3539 ctrl .SetLogger (zap .New (zap .UseFlagOptions (& opts )))
3640
41+ gpuinfos , err := config .LoadGpuInfoFromFile (gpuInfoConfig )
42+ if err != nil {
43+ ctrl .Log .Error (err , "unable to read gpuInfoConfig file" )
44+ os .Exit (1 )
45+ }
46+
3747 ret := nvml .Init ()
3848 if ret != nvml .SUCCESS {
3949 ctrl .Log .Error (errors .New (nvml .ErrorString (ret )), "unable to initialize NVML" )
@@ -90,15 +100,21 @@ func main() {
90100 ctrl .Log .Error (errors .New (nvml .ErrorString (ret )), "unable to get memory info of device" , "index" , i )
91101 os .Exit (1 )
92102 }
103+ info , ok := lo .Find (gpuinfos , func (info config.GpuInfo ) bool {
104+ return info .FullModelName == deviceName
105+ })
106+ tflops := info .Fp16TFlops
107+ if ! ok {
108+ tflops = resource .MustParse ("0" )
109+ }
93110 gpu := & tfv1.GPU {
94111 ObjectMeta : metav1.ObjectMeta {
95112 Name : uuid ,
96113 },
97114 Status : tfv1.GPUStatus {
98- Capacity : tfv1.Resource {
99- Vram : resource .MustParse (fmt .Sprintf ("%dKi" , memInfo .Total )),
100- // TODO: compute Tflops based on GPU model
101- Tflops : resource .MustParse ("100" ),
115+ Capacity : & tfv1.Resource {
116+ Vram : resource .MustParse (fmt .Sprintf ("%dKi" , memInfo .Total )),
117+ Tflops : tflops ,
102118 },
103119 UUID : uuid ,
104120 GPUModel : deviceName ,
@@ -112,7 +128,11 @@ func main() {
112128 // keep Available field
113129 available := gpu .Status .Available
114130 gpu .Status = gpuCopy .Status
115- gpu .Status .Available = available
131+ if available != nil {
132+ gpu .Status .Available = available
133+ } else {
134+ gpu .Status .Available = gpu .Status .Capacity
135+ }
116136 return nil
117137 }); err != nil {
118138 ctrl .Log .Error (err , "failed to report GPU" , "gpu" , gpu )
0 commit comments