Skip to content

Commit e4916bb

Browse files
authored
Fix provisioning mode bugs (#54)
* fix: helm release longer than 63 char issue * fix: simplify the full name * fix: provisioning mode issue * fix: lint in node discovery
1 parent 69fe42e commit e4916bb

File tree

18 files changed

+236
-90
lines changed

18 files changed

+236
-90
lines changed

.vscode/launch.json

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,14 @@
4848
"ENABLE_WEBHOOKS": "false"
4949
},
5050
"program": "${workspaceFolder}/cmd/operator/main.go",
51+
},
52+
{
53+
"name": "Debug Test Code",
54+
"type": "go",
55+
"request": "launch",
56+
"mode": "auto",
57+
"console": "integratedTerminal",
58+
"program": "${workspaceFolder}/cmd/tmp/main.go",
5159
}
5260
]
5361
}

.vscode/settings.json

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
"Finalizers",
2727
"goconst",
2828
"golint",
29+
"gopsutil",
2930
"gosec",
3031
"gpunode",
3132
"gpunodeclasses",
@@ -56,6 +57,7 @@
5657
"schedulingconfigtemplate",
5758
"schedulingconfigtemplates",
5859
"schedulingcorev",
60+
"shirou",
5961
"subresource",
6062
"tensorfusion",
6163
"tensorfusionaiv",

api/v1/tensorfusioncluster_funcs.go

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -114,6 +114,9 @@ func (tfc *TensorFusionCluster) RefreshStatus(ownedPools []GPUPool) {
114114
tfc.Status.AvailableTFlops = resource.Quantity{}
115115
tfc.Status.AvailableVRAM = resource.Quantity{}
116116

117+
tfc.Status.VirtualTFlops = resource.Quantity{}
118+
tfc.Status.VirtualVRAM = resource.Quantity{}
119+
117120
for i, gpuPool := range ownedPools {
118121
if gpuPool.Status.Phase != constants.PhaseRunning {
119122
tfc.Status.NotReadyGPUPools = append(tfc.Status.NotReadyGPUPools, gpuPool.Name)
@@ -126,5 +129,8 @@ func (tfc *TensorFusionCluster) RefreshStatus(ownedPools []GPUPool) {
126129
tfc.Status.TotalVRAM.Add(gpuPool.Status.TotalVRAM)
127130
tfc.Status.AvailableTFlops.Add(gpuPool.Status.AvailableTFlops)
128131
tfc.Status.AvailableVRAM.Add(gpuPool.Status.AvailableVRAM)
132+
133+
tfc.Status.VirtualTFlops.Add(gpuPool.Status.VirtualTFlops)
134+
tfc.Status.VirtualVRAM.Add(gpuPool.Status.VirtualVRAM)
129135
}
130136
}

charts/tensor-fusion/Chart.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ type: application
1515
# This is the chart version. This version number should be incremented each time you make changes
1616
# to the chart and its templates, including the app version.
1717
# Versions are expected to follow Semantic Versioning (https://semver.org/)
18-
version: 1.2.4
18+
version: 1.2.5
1919

2020
# This is the version number of the application being deployed. This version number should be
2121
# incremented each time you make changes to the application. Versions are not expected to

charts/tensor-fusion/templates/_helpers.tpl

Lines changed: 1 addition & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -15,16 +15,7 @@ We truncate at 63 chars because some Kubernetes name fields are limited to this
1515
If release name contains chart name it will be used as a full name.
1616
*/}}
1717
{{- define "tensor-fusion.fullname" -}}
18-
{{- if .Values.fullnameOverride }}
19-
{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }}
20-
{{- else }}
21-
{{- $name := default .Chart.Name .Values.nameOverride }}
22-
{{- if contains $name .Release.Name }}
23-
{{- .Release.Name | trunc 63 | trimSuffix "-" }}
24-
{{- else }}
25-
{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" }}
26-
{{- end }}
27-
{{- end }}
18+
{{- default .Release.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }}
2819
{{- end }}
2920

3021
{{/*

charts/tensor-fusion/templates/admission-webhooks/job.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
apiVersion: batch/v1
22
kind: Job
33
metadata:
4-
name: {{ include "tensor-fusion.fullname" . }}-create-admission-webhook-secret
4+
name: {{ include "tensor-fusion.fullname" . }}-add-hook-crt
55
namespace: {{ include "tensor-fusion.namespace" . }}
66
annotations:
77
"helm.sh/hook": pre-install,pre-upgrade
@@ -13,7 +13,7 @@ spec:
1313
{{- end }}
1414
template:
1515
metadata:
16-
name: {{ include "tensor-fusion.fullname" . }}-create-admission-webhook-secret
16+
name: {{ include "tensor-fusion.fullname" . }}-add-hook-crt
1717
spec:
1818
containers:
1919
- name: create

cmd/nodediscovery/main.go

Lines changed: 64 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,10 @@ import (
88
"os"
99
"path/filepath"
1010
"strings"
11+
"syscall"
12+
"time"
13+
14+
"github.com/shirou/gopsutil/mem"
1115

1216
"github.com/NVIDIA/go-nvml/pkg/nvml"
1317
tfv1 "github.com/NexusGPU/tensor-fusion-operator/api/v1"
@@ -43,7 +47,7 @@ func main() {
4347
k8sNodeName = os.Getenv("HOSTNAME")
4448
}
4549

46-
k8sclient, err := kubeClient()
50+
k8sClient, err := kubeClient()
4751
if err != nil {
4852
ctrl.Log.Error(err, "unable to create kubeClient")
4953
os.Exit(1)
@@ -93,7 +97,7 @@ func main() {
9397
Name: gpuNodeName,
9498
},
9599
}
96-
if err := k8sclient.Get(ctx, client.ObjectKeyFromObject(gpunode), gpunode); err != nil {
100+
if err := k8sClient.Get(ctx, client.ObjectKeyFromObject(gpunode), gpunode); err != nil {
97101
ctrl.Log.Error(err, "unable to get gpuNode")
98102
os.Exit(1)
99103
}
@@ -103,6 +107,8 @@ func main() {
103107
availableTFlops := resource.MustParse("0")
104108
availableVRAM := resource.MustParse("0Ki")
105109

110+
allDeviceIDs := make([]string, 0)
111+
106112
for i := 0; i < count; i++ {
107113
device, ret := nvml.DeviceGetHandleByIndex(i)
108114
if ret != nvml.SUCCESS {
@@ -122,6 +128,8 @@ func main() {
122128
os.Exit(1)
123129
}
124130

131+
allDeviceIDs = append(allDeviceIDs, uuid)
132+
125133
memInfo, ret := device.GetMemoryInfo_v2()
126134
if ret != nvml.SUCCESS {
127135
ctrl.Log.Error(errors.New(nvml.ErrorString(ret)), "unable to get memory info of device", "index", i)
@@ -137,6 +145,12 @@ func main() {
137145
gpu := &tfv1.GPU{
138146
ObjectMeta: metav1.ObjectMeta{
139147
Name: uuid,
148+
Labels: map[string]string{
149+
constants.LabelKeyOwner: gpunode.Name,
150+
},
151+
Annotations: map[string]string{
152+
constants.GPULastReportTimeAnnotationKey: time.Now().Format(time.RFC3339),
153+
},
140154
},
141155
}
142156

@@ -157,7 +171,7 @@ func main() {
157171
"kubernetes.io/hostname": k8sNodeName,
158172
},
159173
}
160-
_, err = controllerutil.CreateOrUpdate(ctx, k8sclient, gpu, func() error { return nil })
174+
_, err = controllerutil.CreateOrUpdate(ctx, k8sClient, gpu, func() error { return nil })
161175
if err != nil {
162176
ctrl.Log.Error(err, "failed to create GPU", "gpu", gpu)
163177
os.Exit(1)
@@ -170,7 +184,7 @@ func main() {
170184
gpu.Status.Available = available
171185
}
172186

173-
if err := k8sclient.Status().Patch(ctx, gpu, client.Merge); err != nil {
187+
if err := k8sClient.Status().Patch(ctx, gpu, client.Merge); err != nil {
174188
ctrl.Log.Error(err, "failed to update status of GPU", "gpu", gpu)
175189
os.Exit(1)
176190
}
@@ -186,8 +200,13 @@ func main() {
186200
ns.TotalVRAM = totalVRAM
187201
ns.AvailableTFlops = availableTFlops
188202
ns.AvailableVRAM = availableVRAM
203+
ns.TotalGPUs = int32(count)
204+
ns.ManagedGPUs = int32(count)
205+
ns.ManagedGPUDeviceIDs = allDeviceIDs
206+
ns.NodeInfo.RAMSize = *resource.NewQuantity(getTotalHostRAM(), resource.DecimalSI)
207+
ns.NodeInfo.DataDiskSize = *resource.NewQuantity(getDiskInfo(constants.TFDataPath), resource.DecimalSI)
189208
gpunode.Status = *ns
190-
if err := k8sclient.Status().Patch(ctx, gpunode, client.Merge); err != nil {
209+
if err := k8sClient.Status().Patch(ctx, gpunode, client.Merge); err != nil {
191210
ctrl.Log.Error(err, "failed to update status of GPUNode")
192211
os.Exit(1)
193212
}
@@ -228,3 +247,43 @@ func kubeClient() (client.Client, error) {
228247
}
229248
return client, nil
230249
}
250+
251+
func getTotalHostRAM() int64 {
252+
v, err := mem.VirtualMemory()
253+
if err != nil {
254+
fmt.Printf("error getting memory info: %v\n", err)
255+
return 0
256+
}
257+
return int64(v.Total)
258+
}
259+
260+
// getDiskInfo returns the total size of the filesystem that holds path,
// computed as Blocks * Bsize from the statfs result for that path.
//
// path is first converted to an absolute path. If statfs reports that the
// path does not exist (ENOENT), the directory is created with mode 0755 and
// statfs is retried once, so a not-yet-provisioned data directory still
// yields the size of the volume it lives on.
//
// The function never returns an error: every failure is printed to stdout
// and reported as 0.
func getDiskInfo(path string) (total int64) {
	absPath, err := filepath.Abs(path)
	if err != nil {
		fmt.Printf("error getting disk path: %v\n", err)
		return 0
	}

	var stat syscall.Statfs_t
	err = syscall.Statfs(absPath, &stat)
	switch {
	case err == nil:
		// stat is already populated; nothing else to do.
	case errors.Is(err, syscall.ENOENT):
		// The path is missing: create it, then query the filesystem again.
		if err = os.MkdirAll(absPath, 0755); err != nil {
			fmt.Printf("error creating folder: %s, err: %v\n", absPath, err)
			return 0
		}
		if err = syscall.Statfs(absPath, &stat); err != nil {
			fmt.Printf("error getting disk stats after creation: %v\n", err)
			return 0
		}
	default:
		fmt.Printf("error getting disk stats: %v\n", err)
		return 0
	}

	// Bsize has a platform-dependent integer type, hence the explicit
	// conversion before multiplying.
	return int64(stat.Blocks * uint64(stat.Bsize))
}

go.mod

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -88,6 +88,7 @@ require (
8888
github.com/prometheus/client_model v0.6.1 // indirect
8989
github.com/prometheus/common v0.61.0 // indirect
9090
github.com/prometheus/procfs v0.15.1 // indirect
91+
github.com/shirou/gopsutil v3.21.11+incompatible // indirect
9192
github.com/spf13/cobra v1.8.1 // indirect
9293
github.com/spf13/pflag v1.0.5 // indirect
9394
github.com/stoewer/go-strcase v1.3.0 // indirect

go.sum

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -188,6 +188,8 @@ github.com/rogpeppe/go-internal v1.13.1/go.mod h1:uMEvuHeurkdAXX61udpOXGD/AzZDWN
188188
github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM=
189189
github.com/samber/lo v1.47.0 h1:z7RynLwP5nbyRscyvcD043DWYoOcYRv3mV8lBeqOCLc=
190190
github.com/samber/lo v1.47.0/go.mod h1:RmDH9Ct32Qy3gduHQuKJ3gW1fMHAnE/fAzQuf6He5cU=
191+
github.com/shirou/gopsutil v3.21.11+incompatible h1:+1+c1VGhc88SSonWP6foOcLhvnKlUeu/erjjvaPEYiI=
192+
github.com/shirou/gopsutil v3.21.11+incompatible/go.mod h1:5b4v6he4MtMOwMlS0TUMTu2PcXUg8+E1lC7eC3UO/RA=
191193
github.com/spf13/cobra v1.8.1 h1:e5/vxKd/rZsfSJMUX1agtjeTDf+qv1/JdBF8gg5k9ZM=
192194
github.com/spf13/cobra v1.8.1/go.mod h1:wHxEcudfqmLYa8iTfL+OuZPbBZkmvliBWKIezN3kD9Y=
193195
github.com/spf13/pflag v1.0.5 h1:iy+VFUOCP1a+8yFto/drg2CJ5u0yRoB7fZw3DKv/JXA=

internal/cloudprovider/alibaba/ecs.go

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -120,7 +120,7 @@ func (p AlibabaGPUNodeProvider) CreateNode(ctx context.Context, param *types.Nod
120120
}
121121

122122
return &types.GPUNodeStatus{
123-
InstanceID: response.RequestId,
123+
InstanceID: response.InstanceIdSets.InstanceIdSet[0],
124124
CreatedAt: time.Now(),
125125
}, nil
126126
}
@@ -129,6 +129,7 @@ func (p AlibabaGPUNodeProvider) TerminateNode(ctx context.Context, param *types.
129129
request := ecs.CreateDeleteInstanceRequest()
130130
request.InstanceId = param.InstanceID
131131
request.RegionId = param.Region
132+
request.Force = requests.NewBoolean(true)
132133
response, err := p.client.DeleteInstance(request)
133134
if err != nil {
134135
return fmt.Errorf("failed to terminate instance: %w", err)

0 commit comments

Comments
 (0)