Skip to content

Commit d44959d

Browse files
author
sonic
committed
add log
1 parent cae6f6e commit d44959d

File tree

3 files changed

+16
-12
lines changed

3 files changed

+16
-12
lines changed

internal/computing/k8s_service.go

Lines changed: 5 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -717,15 +717,17 @@ type GpuData struct {
717717
UsedIndex []string
718718
}
719719

720-
func (s *K8sService) GetNodeGpuSummary(ctx context.Context) (map[string]map[string]GpuData, error) {
720+
func (s *K8sService) GetNodeGpuSummary(ctx context.Context) (map[string]map[string]GpuData, map[string]string, error) {
721721
nodeGpuInfoMap, err := s.GetResourceExporterPodLog(ctx)
722722
if err != nil {
723723
logs.GetLogger().Errorf("Collect cluster gpu info Failed, if have available gpu, please check resource-exporter. error: %+v", err)
724-
return nil, err
724+
return nil, nil, err
725725
}
726726

727+
var nodeMachineId = make(map[string]string)
727728
var nodeGpuSummary = make(map[string]map[string]GpuData)
728729
for nodeName, gpu := range nodeGpuInfoMap {
730+
nodeMachineId[nodeName] = gpu.MachineId + "&" + gpu.ProductUuid
729731
if gpu.Gpu.AttachedGpus > 0 {
730732
var nodeGpu = make(map[string]GpuData)
731733
for _, g := range gpu.Gpu.Details {
@@ -756,7 +758,7 @@ func (s *K8sService) GetNodeGpuSummary(ctx context.Context) (map[string]map[stri
756758
nodeGpuSummary[nodeName] = nodeGpu
757759
}
758760
}
759-
return nodeGpuSummary, nil
761+
return nodeGpuSummary, nodeMachineId, nil
760762
}
761763

762764
func (s *K8sService) GetAllActivePod(ctx context.Context) ([]coreV1.Pod, error) {

internal/computing/space_service.go

Lines changed: 7 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -1566,7 +1566,7 @@ func checkResourceAvailableForSpace(jobUuid string, jobType int, resourceConfig
15661566
return false, "", nil, 0, nil, err
15671567
}
15681568

1569-
nodeGpuSummary, err := k8sService.GetNodeGpuSummary(context.TODO())
1569+
nodeGpuSummary, nodeNameMachineId, err := k8sService.GetNodeGpuSummary(context.TODO())
15701570
if err != nil {
15711571
logs.GetLogger().Errorf("Failed collect k8s gpu, error: %+v", err)
15721572
return false, "", nil, 0, nil, err
@@ -1577,7 +1577,7 @@ func checkResourceAvailableForSpace(jobUuid string, jobType int, resourceConfig
15771577
needCpu := hardwareDetail.Cpu.Quantity
15781578
needMemory := float64(hardwareDetail.Memory.Quantity)
15791579
needStorage := float64(hardwareDetail.Storage.Quantity)
1580-
logs.GetLogger().Infof("checkResourceForSpace: needCpu: %d, needMemory: %.2f, needStorage: %.2f, needGpu: %s, gpuNum: %d", needCpu, needMemory, needStorage, gpuName, gpuNum)
1580+
logs.GetLogger().Infof("job_uuid: %s, checkResourceForSpace: needCpu: %d, needMemory: %.2f, needStorage: %.2f, needGpu: %s, gpuNum: %d", jobUuid, needCpu, needMemory, needStorage, gpuName, gpuNum)
15811581

15821582
type gpuData struct {
15831583
Total int
@@ -1603,7 +1603,7 @@ func checkResourceAvailableForSpace(jobUuid string, jobType int, resourceConfig
16031603
}
16041604
}
16051605

1606-
logs.GetLogger().Infof("checkResourceForSpace: nodeName: %s,remainingCpu: %d, remainingMemory: %.2f, remainingStorage: %.2f, remainingGpu: %+v", node.Name, remainderCpu, remainderMemory, remainderStorage, freeGpuMap)
1606+
logs.GetLogger().Infof("nodeName: %s, machineId&productUuid: %s, remainingCpu: %d, remainingMemory: %.2f, remainingStorage: %.2f, remainingGpu: %+v", node.Name, nodeNameMachineId[node.Name], remainderCpu, remainderMemory, remainderStorage, freeGpuMap)
16071607

16081608
if remainderCpu < needCpu {
16091609
noAvailableStr = append(noAvailableStr, fmt.Sprintf("cpu need: %d, remainder: %d", needCpu, remainderCpu))
@@ -1672,7 +1672,7 @@ func checkResourceAvailableForImage(jobUuid string, hardwareType string, resourc
16721672
return false, "", nil, nil, nil, err
16731673
}
16741674

1675-
nodeGpuSummary, err := k8sService.GetNodeGpuSummary(context.TODO())
1675+
nodeGpuSummary, nodeNameMachineId, err := k8sService.GetNodeGpuSummary(context.TODO())
16761676
if err != nil {
16771677
logs.GetLogger().Errorf("Failed collect k8s gpu, error: %+v", err)
16781678
return false, "", nil, nil, nil, err
@@ -1714,7 +1714,7 @@ func checkResourceAvailableForImage(jobUuid string, hardwareType string, resourc
17141714
}
17151715
}
17161716

1717-
logs.GetLogger().Infof("checkResourceForSpace: nodeName: %s,remainingCpu: %d, remainingMemory: %.2f, remainingStorage: %.2f, remainingGpu: %+v", node.Name, remainderCpu, remainderMemory, remainderStorage, freeGpuMap)
1717+
logs.GetLogger().Infof("nodeName: %s, machineId&productUuid: %s, remainingCpu: %d, remainingMemory: %.2f, remainingStorage: %.2f, remainingGpu: %+v", node.Name, nodeNameMachineId[node.Name], remainderCpu, remainderMemory, remainderStorage, freeGpuMap)
17181718

17191719
if remainderCpu < needCpu {
17201720
noAvailableStr = append(noAvailableStr, fmt.Sprintf("cpu need: %d, remainder: %d", needCpu, remainderCpu))
@@ -1810,7 +1810,7 @@ func checkResourceAvailableForUbi(taskId, taskType int, gpuName string, resource
18101810
return "", "", 0, 0, 0, nil, nil, err
18111811
}
18121812

1813-
nodeGpuSummary, err := k8sService.GetNodeGpuSummary(context.TODO())
1813+
nodeGpuSummary, nodeNameMachineId, err := k8sService.GetNodeGpuSummary(context.TODO())
18141814
if err != nil {
18151815
logs.GetLogger().Errorf("Failed collect k8s gpu, error: %+v", err)
18161816
return "", "", 0, 0, 0, nil, nil, err
@@ -1842,7 +1842,7 @@ func checkResourceAvailableForUbi(taskId, taskType int, gpuName string, resource
18421842
remainderStorage := float64(remainderResource[ResourceStorage] / 1024 / 1024 / 1024)
18431843

18441844
logs.GetLogger().Infof("checkResourceAvailableForUbi: needCpu: %d, needMemory: %.2f, needStorage: %.2f", needCpu, needMemory, needStorage)
1845-
logs.GetLogger().Infof("checkResourceAvailableForUbi: remainingCpu: %d, remainingMemory: %.2f, remainingStorage: %.2f", remainderCpu, remainderMemory, remainderStorage)
1845+
logs.GetLogger().Infof("nodeName: %s, machineId&productUuid: %s, remainingCpu: %d, remainingMemory: %.2f, remainingStorage: %.2f", node.Name, nodeNameMachineId[node.Name], remainderCpu, remainderMemory, remainderStorage)
18461846

18471847
if remainderCpu < needCpu {
18481848
noAvailableStr = append(noAvailableStr, fmt.Sprintf("cpu need: %d, remainder: %d", needCpu, remainderCpu))

internal/models/resources.go

Lines changed: 4 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -23,8 +23,10 @@ type NodeResource struct {
2323
}
2424

2525
type CollectNodeInfo struct {
26-
Gpu Gpu `json:"gpu"`
27-
CpuName string `json:"cpu_name"`
26+
Gpu Gpu `json:"gpu"`
27+
CpuName string `json:"cpu_name"`
28+
ProductUuid string `json:"product_uuid"`
29+
MachineId string `json:"machine_id"`
2830
}
2931
type Gpu struct {
3032
DriverVersion string `json:"driver_version"`

0 commit comments

Comments
 (0)