Skip to content

Commit e4fab68

Browse files
authored
fix: add log collection config and anti affinity for operator, assign port for worker/lab-pod (#208)
* fix: add log collection config and anti affinity for operator, assign component label for log grouping
* fix: port allocator issues
* fix: hypervisor permission issue; assign port from leader
* fix: metrics and port allocator bugs
* fix: typo
* fix: optimize tests
* fix: move global mutex of portallocator to struct
* fix: multiple gpu metrics
* fix: merge conflict
* chore: merge code unit test issues
* fix: unit test issues, workload count bug
* fix: gpu sync time in testing mode
* fix: gpu allocator test case bug
1 parent 5cbfd14 commit e4fab68

File tree

70 files changed

+2407
-639
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

70 files changed

+2407
-639
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ Dockerfile.cross
1212

1313
# Output of the go coverage tool, specifically when used with LiteIDE
1414
*.out
15+
cover.out.*
1516

1617
# Go workspace file
1718
go.work

.vscode/launch.json

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,10 @@
5555
"type": "go",
5656
"request": "launch",
5757
"mode": "test",
58-
"program": "${workspaceFolder}",
58+
"env": {
59+
"GO_TESTING": "true"
60+
},
61+
"program": "${workspaceFolder}/internal/controller",
5962
"console": "integratedTerminal"
6063
}
6164
]

.vscode/settings.json

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,15 @@
11
{
22
"cSpell.words": [
3+
"alertmanager",
34
"alicloud",
45
"Aliyun",
56
"AMDCDNA",
67
"AMDRDNA",
78
"apimachinery",
9+
"automount",
810
"AWSGPU",
911
"batchv",
12+
"burstable",
1013
"CDNA",
1114
"certificaterequests",
1215
"certmanager",
@@ -39,6 +42,7 @@
3942
"greptime",
4043
"greptimedb",
4144
"healthz",
45+
"iface",
4246
"karpenter",
4347
"kubebuilder",
4448
"KUBECONFIG",
@@ -51,6 +55,7 @@
5155
"NVML",
5256
"omitempty",
5357
"onsi",
58+
"portallocator",
5459
"printcolumn",
5560
"prometheusagents",
5661
"prometheuses",
@@ -62,11 +67,13 @@
6267
"schedulingconfigtemplates",
6368
"schedulingcorev",
6469
"shirou",
70+
"strategicpatches",
6571
"subresource",
6672
"tensorfusion",
6773
"tensorfusionaiv",
6874
"tensorfusioncluster",
6975
"tensorfusionclusters",
76+
"tensorfusionworkload",
7077
"Tera",
7178
"tflops",
7279
"Tmpl",

Makefile

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -62,13 +62,8 @@ vet: ## Run go vet against code.
6262

6363
.PHONY: test
6464
test: manifests generate fmt vet envtest ## Run tests.
65-
KUBEBUILDER_ASSETS="$(shell $(ENVTEST) use $(ENVTEST_K8S_VERSION) --bin-dir $(LOCALBIN) -p path)" go test $$(go list ./... | grep -v /e2e) -timeout 0 -coverprofile cover.out
65+
KUBEBUILDER_ASSETS="$(shell $(ENVTEST) use $(ENVTEST_K8S_VERSION) --bin-dir $(LOCALBIN) -p path)" GO_TESTING=true go run github.com/onsi/ginkgo/v2/ginkgo -p -timeout 0 -cover -coverprofile cover.out -r --skip-file ./test/e2e
6666

67-
# TODO(user): To use a different vendor for e2e tests, modify the setup under 'tests/e2e'.
68-
# The default setup assumes Kind is pre-installed and builds/loads the Manager Docker image locally.
69-
# Prometheus and CertManager are installed by default; skip with:
70-
# - PROMETHEUS_INSTALL_SKIP=true
71-
# - CERT_MANAGER_INSTALL_SKIP=true
7267
.PHONY: test-e2e
7368
test-e2e: manifests generate fmt vet ## Run the e2e tests. Expected an isolated environment using Kind.
7469
@command -v kind >/dev/null 2>&1 || { \

api/v1/gpu_types.go

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,18 @@ type GPUStatus struct {
3636
GPUModel string `json:"gpuModel"`
3737

3838
Message string `json:"message"`
39+
40+
// +optional
41+
RunningApps []*RunningAppDetail `json:"runningApps,omitempty"`
42+
}
43+
44+
type RunningAppDetail struct {
45+
// Workload name and namespace
46+
Name string `json:"name,omitempty"`
47+
Namespace string `json:"namespace,omitempty"`
48+
49+
// Worker count
50+
Count int `json:"count"`
3951
}
4052

4153
// +kubebuilder:validation:Enum=Pending;Provisioning;Running;Unknown;Destroying;Migrating

api/v1/gpunode_funcs.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ func (node *GPUNode) InitializeStatus(initTFlops, initVRAM resource.Quantity, in
1212
TotalTFlops: initTFlops,
1313
TotalVRAM: initVRAM,
1414
TotalGPUs: initGPUs,
15-
AllocationDetails: &[]GPUNodeAllocationDetails{},
15+
AllocationInfo: []*RunningAppDetail{},
1616
LoadedModels: &[]string{},
1717
ManagedGPUDeviceIDs: []string{},
1818
ObservedGeneration: node.Generation,

api/v1/gpunode_types.go

Lines changed: 1 addition & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -94,20 +94,8 @@ type GPUNodeStatus struct {
9494

9595
ObservedGeneration int64 `json:"observedGeneration,omitempty"`
9696

97-
// Allocation details is for node compaction, and calculate used apps
9897
// +optional
99-
AllocationDetails *[]GPUNodeAllocationDetails `json:"allocationDetails,omitempty"`
100-
}
101-
102-
type GPUNodeAllocationDetails struct {
103-
PodID string `json:"podID,omitempty"`
104-
PodName string `json:"podName,omitempty"`
105-
Namespace string `json:"namespace"`
106-
WorkloadName string `json:"workload,omitempty"`
107-
108-
Requests GPUResourceUnit `json:"requests"`
109-
Limits GPUResourceUnit `json:"limits"`
110-
QoS QoSLevel `json:"qos,omitempty"`
98+
AllocationInfo []*RunningAppDetail `json:"allocationInfo,omitempty"`
11199
}
112100

113101
// +kubebuilder:validation:Enum=Pending;Provisioning;Migrating;Running;Succeeded;Failed;Unknown;Destroying

api/v1/gpupool_types.go

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -293,7 +293,7 @@ type QosPricing struct {
293293

294294
Requests GPUResourcePricingUnit `json:"requests,omitempty"`
295295

296-
// Default requests and limitsOverRequests are same, indicates normal on-demand serverless GPU usage, in hands-on lab low QoS case, limitsOverRequests should be cheaper, for example Low QoS, ratio should be 0.5
296+
// By default, requests and limitsOverRequests are priced the same (ratio 1), which indicates normal on-demand serverless GPU usage; in the hands-on lab low-QoS case, the limitsOverRequests ratio should be lower, so that users can get burstable GPU resources at very low cost
297297
// +kubebuilder:default="1"
298298
LimitsOverRequestsChargingRatio string `json:"limitsOverRequests,omitempty"`
299299
}
@@ -372,6 +372,8 @@ type GPUPoolStatus struct {
372372
AvailableTFlops resource.Quantity `json:"availableTFlops"`
373373
AvailableVRAM resource.Quantity `json:"availableVRAM"`
374374

375+
RunningAppsCnt int32 `json:"runningAppsCnt,omitempty"`
376+
375377
// +optional
376378
VirtualAvailableTFlops *resource.Quantity `json:"virtualAvailableTFlops,omitempty"`
377379
// +optional

api/v1/zz_generated.deepcopy.go

Lines changed: 34 additions & 25 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

charts/tensor-fusion/Chart.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,10 +15,10 @@ type: application
1515
# This is the chart version. This version number should be incremented each time you make changes
1616
# to the chart and its templates, including the app version.
1717
# Versions are expected to follow Semantic Versioning (https://semver.org/)
18-
version: 1.2.22
18+
version: 1.3.2
1919

2020
# This is the version number of the application being deployed. This version number should be
2121
# incremented each time you make changes to the application. Versions are not expected to
2222
# follow Semantic Versioning. They should reflect the version the application is using.
2323
# It is recommended to use it with quotes.
24-
appVersion: "1.12.1"
24+
appVersion: "1.30.3"

0 commit comments

Comments
 (0)