Skip to content

Commit 95b30d6

Browse files
committed
feat: get tflops from config file
1 parent 3c9d0cd commit 95b30d6

File tree

10 files changed

+83
-21
lines changed

10 files changed

+83
-21
lines changed

api/v1/gpu_types.go

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -26,8 +26,8 @@ type GPUStatus struct {
2626
// +kubebuilder:default=Pending
2727
Phase TensorFusionGPUPhase `json:"phase"`
2828

29-
Capacity Resource `json:"capacity"`
30-
Available Resource `json:"available"`
29+
Capacity *Resource `json:"capacity"`
30+
Available *Resource `json:"available"`
3131

3232
UUID string `json:"uuid"`
3333

api/v1/zz_generated.deepcopy.go

Lines changed: 10 additions & 2 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

cmd/nodediscovery/main.go

Lines changed: 26 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -9,7 +9,9 @@ import (
99

1010
"github.com/NVIDIA/go-nvml/pkg/nvml"
1111
tfv1 "github.com/NexusGPU/tensor-fusion-operator/api/v1"
12+
"github.com/NexusGPU/tensor-fusion-operator/internal/config"
1213
"github.com/NexusGPU/tensor-fusion-operator/internal/reporter"
14+
"github.com/samber/lo"
1315
"k8s.io/apimachinery/pkg/api/resource"
1416
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
1517
ctrl "sigs.k8s.io/controller-runtime"
@@ -19,9 +21,11 @@ import (
1921
func main() {
2022
var dryRun bool
2123
var hostname string
22-
24+
var gpuInfoConfig string
2325
flag.BoolVar(&dryRun, "dry-run", false, "dry run mode")
2426
flag.StringVar(&hostname, "hostname", "", "hostname")
27+
flag.StringVar(&gpuInfoConfig, "gpu-info-config", "", "specify the path to gpuInfoConfig file")
28+
2529
if hostname == "" {
2630
hostname = os.Getenv("HOSTNAME")
2731
}
@@ -34,6 +38,12 @@ func main() {
3438
flag.Parse()
3539
ctrl.SetLogger(zap.New(zap.UseFlagOptions(&opts)))
3640

41+
gpuinfos, err := config.LoadGpuInfoFromFile(gpuInfoConfig)
42+
if err != nil {
43+
ctrl.Log.Error(err, "unable to read gpuInfoConfig file")
44+
os.Exit(1)
45+
}
46+
3747
ret := nvml.Init()
3848
if ret != nvml.SUCCESS {
3949
ctrl.Log.Error(errors.New(nvml.ErrorString(ret)), "unable to initialize NVML")
@@ -90,15 +100,21 @@ func main() {
90100
ctrl.Log.Error(errors.New(nvml.ErrorString(ret)), "unable to get memory info of device", "index", i)
91101
os.Exit(1)
92102
}
103+
info, ok := lo.Find(gpuinfos, func(info config.GpuInfo) bool {
104+
return info.FullModelName == deviceName
105+
})
106+
tflops := info.Fp16TFlops
107+
if !ok {
108+
tflops = resource.MustParse("0")
109+
}
93110
gpu := &tfv1.GPU{
94111
ObjectMeta: metav1.ObjectMeta{
95112
Name: uuid,
96113
},
97114
Status: tfv1.GPUStatus{
98-
Capacity: tfv1.Resource{
99-
Vram: resource.MustParse(fmt.Sprintf("%dKi", memInfo.Total)),
100-
// TODO: compute Tflops based on GPU model
101-
Tflops: resource.MustParse("100"),
115+
Capacity: &tfv1.Resource{
116+
Vram: resource.MustParse(fmt.Sprintf("%dKi", memInfo.Total)),
117+
Tflops: tflops,
102118
},
103119
UUID: uuid,
104120
GPUModel: deviceName,
@@ -112,7 +128,11 @@ func main() {
112128
// keep Available field
113129
available := gpu.Status.Available
114130
gpu.Status = gpuCopy.Status
115-
gpu.Status.Available = available
131+
if available != nil {
132+
gpu.Status.Available = available
133+
} else {
134+
gpu.Status.Available = gpu.Status.Capacity
135+
}
116136
return nil
117137
}); err != nil {
118138
ctrl.Log.Error(err, "failed to report GPU", "gpu", gpu)

internal/config/gpu_info.go

Lines changed: 29 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,29 @@
1+
package config
2+
3+
import (
4+
"os"
5+
6+
"k8s.io/apimachinery/pkg/api/resource"
7+
"sigs.k8s.io/yaml"
8+
)
9+
10+
type GpuInfo struct {
11+
Model string `json:"model"`
12+
Vendor string `json:"vendor"`
13+
CostPerHour float64 `json:"costPerHour"`
14+
Fp16TFlops resource.Quantity `json:"fp16TFlops"`
15+
FullModelName string `json:"fullModelName"`
16+
}
17+
18+
func LoadGpuInfoFromFile(filename string) ([]GpuInfo, error) {
19+
infos := make([]GpuInfo, 0)
20+
data, err := os.ReadFile(filename)
21+
if err != nil {
22+
return infos, err
23+
}
24+
err = yaml.Unmarshal(data, &infos)
25+
if err != nil {
26+
return nil, err
27+
}
28+
return infos, nil
29+
}

internal/constants/constants.go

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -12,8 +12,8 @@ const (
1212

1313
LabelKeyOwner = Domain + "/managed-by"
1414

15-
GPUNodePoolIdentifierLabelPrefix = Domain + "/pool/"
16-
GPUNodePoolIdentifierLabelFormat = Domain + "/pool/%s"
15+
GPUNodePoolIdentifierLabelPrefix = Domain + "/pool-"
16+
GPUNodePoolIdentifierLabelFormat = Domain + "/pool-%s"
1717
NodeDeletionMark = Domain + "/should-delete"
1818

1919
TensorFusionEnabledLabelKey = Domain + "/enabled"

internal/controller/gpupool_controller.go

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -86,7 +86,9 @@ func (r *GPUPoolReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ct
8686
}
8787

8888
// For provisioning mode, check if need to scale up GPUNodes upon AvailableCapacity changed
89-
isProvisioningMode := pool.Spec.NodeManagerConfig.NodeProvisioner != nil && pool.Spec.NodeManagerConfig.NodeProvisioner.Mode == tfv1.NodeProvisionerModeNative
89+
isProvisioningMode := pool.Spec.NodeManagerConfig != nil &&
90+
pool.Spec.NodeManagerConfig.NodeProvisioner != nil &&
91+
pool.Spec.NodeManagerConfig.NodeProvisioner.Mode == tfv1.NodeProvisionerModeNative
9092
if isProvisioningMode {
9193
if err := r.reconcilePoolCapacityWithProvisioner(ctx, pool); err != nil {
9294
return ctrl.Result{}, err

internal/controller/node_controller.go

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -93,7 +93,7 @@ func (r *NodeReconciler) generateGPUNode(ctx context.Context, node *corev1.Node,
9393
ObjectMeta: metav1.ObjectMeta{
9494
Name: node.Name,
9595
Labels: map[string]string{
96-
fmt.Sprint(constants.GPUNodePoolIdentifierLabelFormat, poolName): "true",
96+
fmt.Sprintf(constants.GPUNodePoolIdentifierLabelFormat, poolName): "true",
9797
},
9898
},
9999
Spec: tfv1.GPUNodeSpec{

internal/controller/tensorfusionconnection_controller_test.go

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -86,11 +86,11 @@ var _ = Describe("TensorFusionConnection Controller", func() {
8686
NodeSelector: map[string]string{
8787
"kubernetes.io/hostname": "mock-node",
8888
},
89-
Capacity: tfv1.Resource{
89+
Capacity: &tfv1.Resource{
9090
Tflops: resource.MustParse("2"),
9191
Vram: resource.MustParse("2Gi"),
9292
},
93-
Available: tfv1.Resource{
93+
Available: &tfv1.Resource{
9494
Tflops: resource.MustParse("2"),
9595
Vram: resource.MustParse("2Gi"),
9696
},

internal/reporter/reporter.go

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -27,6 +27,9 @@ func NewDryRunReporter() Reporter {
2727

2828
func (r *DryRunReporter) Report(ctx context.Context, obj client.Object, f controllerutil.MutateFn) error {
2929
log := log.FromContext(ctx)
30+
if err := f(); err != nil {
31+
return err
32+
}
3033
objYaml, err := yaml.Marshal(obj)
3134
if err != nil {
3235
return err

internal/scheduler/naive_test.go

Lines changed: 5 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -14,7 +14,7 @@ func createGPU(name string, tflops, vram string) *tfv1.GPU {
1414
Name: name,
1515
},
1616
Status: tfv1.GPUStatus{
17-
Available: tfv1.Resource{
17+
Available: &tfv1.Resource{
1818
Tflops: resource.MustParse(tflops),
1919
Vram: resource.MustParse(vram),
2020
},
@@ -189,11 +189,11 @@ func TestNaiveScheduler_Release(t *testing.T) {
189189
Name: "gpu1",
190190
},
191191
Status: tfv1.GPUStatus{
192-
Capacity: tfv1.Resource{
192+
Capacity: &tfv1.Resource{
193193
Tflops: resource.MustParse("100"),
194194
Vram: resource.MustParse("16Gi"),
195195
},
196-
Available: tfv1.Resource{
196+
Available: &tfv1.Resource{
197197
Tflops: resource.MustParse("100"),
198198
Vram: resource.MustParse("16Gi"),
199199
},
@@ -218,11 +218,11 @@ func TestNaiveScheduler_Release(t *testing.T) {
218218
Name: "gpu1",
219219
},
220220
Status: tfv1.GPUStatus{
221-
Capacity: tfv1.Resource{
221+
Capacity: &tfv1.Resource{
222222
Tflops: resource.MustParse("100"),
223223
Vram: resource.MustParse("16Gi"),
224224
},
225-
Available: tfv1.Resource{
225+
Available: &tfv1.Resource{
226226
Tflops: resource.MustParse("100"),
227227
Vram: resource.MustParse("16Gi"),
228228
},

0 commit comments

Comments
 (0)