@@ -8,6 +8,10 @@ import (
88 "os"
99 "path/filepath"
1010 "strings"
11+ "syscall"
12+ "time"
13+
14+ "github.com/shirou/gopsutil/mem"
1115
1216 "github.com/NVIDIA/go-nvml/pkg/nvml"
1317 tfv1 "github.com/NexusGPU/tensor-fusion-operator/api/v1"
@@ -43,7 +47,7 @@ func main() {
4347 k8sNodeName = os .Getenv ("HOSTNAME" )
4448 }
4549
46- k8sclient , err := kubeClient ()
50+ k8sClient , err := kubeClient ()
4751 if err != nil {
4852 ctrl .Log .Error (err , "unable to create kubeClient" )
4953 os .Exit (1 )
@@ -93,7 +97,7 @@ func main() {
9397 Name : gpuNodeName ,
9498 },
9599 }
96- if err := k8sclient .Get (ctx , client .ObjectKeyFromObject (gpunode ), gpunode ); err != nil {
100+ if err := k8sClient .Get (ctx , client .ObjectKeyFromObject (gpunode ), gpunode ); err != nil {
97101 ctrl .Log .Error (err , "unable to get gpuNode" )
98102 os .Exit (1 )
99103 }
@@ -103,6 +107,8 @@ func main() {
103107 availableTFlops := resource .MustParse ("0" )
104108 availableVRAM := resource .MustParse ("0Ki" )
105109
110+ allDeviceIDs := make ([]string , 0 )
111+
106112 for i := 0 ; i < count ; i ++ {
107113 device , ret := nvml .DeviceGetHandleByIndex (i )
108114 if ret != nvml .SUCCESS {
@@ -122,6 +128,8 @@ func main() {
122128 os .Exit (1 )
123129 }
124130
131+ allDeviceIDs = append (allDeviceIDs , uuid )
132+
125133 memInfo , ret := device .GetMemoryInfo_v2 ()
126134 if ret != nvml .SUCCESS {
127135 ctrl .Log .Error (errors .New (nvml .ErrorString (ret )), "unable to get memory info of device" , "index" , i )
@@ -137,6 +145,12 @@ func main() {
137145 gpu := & tfv1.GPU {
138146 ObjectMeta : metav1.ObjectMeta {
139147 Name : uuid ,
148+ Labels : map [string ]string {
149+ constants .LabelKeyOwner : gpunode .Name ,
150+ },
151+ Annotations : map [string ]string {
152+ constants .GPULastReportTimeAnnotationKey : time .Now ().Format (time .RFC3339 ),
153+ },
140154 },
141155 }
142156
@@ -157,7 +171,7 @@ func main() {
157171 "kubernetes.io/hostname" : k8sNodeName ,
158172 },
159173 }
160- _ , err = controllerutil .CreateOrUpdate (ctx , k8sclient , gpu , func () error { return nil })
174+ _ , err = controllerutil .CreateOrUpdate (ctx , k8sClient , gpu , func () error { return nil })
161175 if err != nil {
162176 ctrl .Log .Error (err , "failed to create GPU" , "gpu" , gpu )
163177 os .Exit (1 )
@@ -170,7 +184,7 @@ func main() {
170184 gpu .Status .Available = available
171185 }
172186
173- if err := k8sclient .Status ().Patch (ctx , gpu , client .Merge ); err != nil {
187+ if err := k8sClient .Status ().Patch (ctx , gpu , client .Merge ); err != nil {
174188 ctrl .Log .Error (err , "failed to update status of GPU" , "gpu" , gpu )
175189 os .Exit (1 )
176190 }
@@ -186,8 +200,13 @@ func main() {
186200 ns .TotalVRAM = totalVRAM
187201 ns .AvailableTFlops = availableTFlops
188202 ns .AvailableVRAM = availableVRAM
203+ ns .TotalGPUs = int32 (count )
204+ ns .ManagedGPUs = int32 (count )
205+ ns .ManagedGPUDeviceIDs = allDeviceIDs
206+ ns .NodeInfo .RAMSize = * resource .NewQuantity (getTotalHostRAM (), resource .DecimalSI )
207+ ns .NodeInfo .DataDiskSize = * resource .NewQuantity (getDiskInfo (constants .TFDataPath ), resource .DecimalSI )
189208 gpunode .Status = * ns
190- if err := k8sclient .Status ().Patch (ctx , gpunode , client .Merge ); err != nil {
209+ if err := k8sClient .Status ().Patch (ctx , gpunode , client .Merge ); err != nil {
191210 ctrl .Log .Error (err , "failed to update status of GPUNode" )
192211 os .Exit (1 )
193212 }
@@ -228,3 +247,43 @@ func kubeClient() (client.Client, error) {
228247 }
229248 return client , nil
230249}
250+
251+ func getTotalHostRAM () int64 {
252+ v , err := mem .VirtualMemory ()
253+ if err != nil {
254+ fmt .Printf ("error getting memory info: %v\n " , err )
255+ return 0
256+ }
257+ return int64 (v .Total )
258+ }
259+
// getDiskInfo returns the total size of the filesystem that contains path,
// computed as the statfs block count multiplied by the statfs block size.
//
// The path is made absolute first. If it does not exist yet, it is created
// (including parents, mode 0755) and the statfs query is retried once. Any
// failure is reported on stderr and results in a return value of 0. A product
// that would not fit in an int64 is clamped to the largest int64 value rather
// than wrapping around to a bogus or negative size.
func getDiskInfo(path string) (total int64) {
	const maxInt64 = 1<<63 - 1

	absPath, err := filepath.Abs(path)
	if err != nil {
		fmt.Fprintf(os.Stderr, "error getting disk path: %v\n", err)
		return 0
	}

	var stat syscall.Statfs_t
	err = syscall.Statfs(absPath, &stat)
	switch {
	case err == nil:
		// Path exists and was queried successfully.
	case errors.Is(err, syscall.ENOENT):
		// The directory is missing: create it, then query again.
		if err = os.MkdirAll(absPath, 0755); err != nil {
			fmt.Fprintf(os.Stderr, "error creating folder: %s, err: %v\n", absPath, err)
			return 0
		}
		if err = syscall.Statfs(absPath, &stat); err != nil {
			fmt.Fprintf(os.Stderr, "error getting disk stats after creation: %v\n", err)
			return 0
		}
	default:
		fmt.Fprintf(os.Stderr, "error getting disk stats: %v\n", err)
		return 0
	}

	blocks, blockSize := stat.Blocks, uint64(stat.Bsize)
	// Guard both the uint64 multiplication and the conversion to int64.
	if blockSize != 0 && blocks > maxInt64/blockSize {
		return maxInt64
	}
	return int64(blocks * blockSize)
}
0 commit comments