Skip to content

Commit ca47ebc

Browse files
authored
feat: refactor tensor fusion scheduler to k8s scheduler framework, support ngpu mode (#241)
* feat: refactor to k8s scheduler framework
* fix: add scheduler framework and basic tests, align log to klog
* fix: convert scheduler kubeconfig and other args
* fix: new scheduler issues
* fix: remove submodule
* fix: typo
* fix: scheduler refactor
* fix: scheduler scoring
* fix: new scheduler bugs
* fix: scheduler bugs
* fix: refactor quota impl
* fix: test case
* fix: schedule unit test and bug fix
* chore: lint issue
* fix: refactor namespaced level quota store
* fix: unit test bugs
* fix: unit test bugs
* fix: quota store unit test issue
* fix: assign namespace level gpu resource defaults, unit test issues
* fix: scheduler refactor issues
* fix: unit test issues, gpu informer update handler bug
* fix: tf connection ut bugs
* chore: typo
* fix: restore mock pool rolling update interval
* fix: rolling update ut bug
* fix: debounce reconcile bug, change to ratelimiter, add hypervisor config env
* fix: remove unused files and vars, use k8s ptr pkg
1 parent 8ed9024 commit ca47ebc

File tree

86 files changed

+5409
-3908
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

86 files changed

+5409
-3908
lines changed

.gitignore

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,4 +33,6 @@ cmd/*/__debug*
3333

3434
prompts/*
3535

36-
tmp*
36+
tmp*
37+
38+
__debug*

.vscode/launch.json

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,8 +15,9 @@
1515
"program": "${workspaceFolder}/cmd/main.go",
1616
"args": [
1717
"--gpu-info-config", "${workspaceFolder}/config/samples/gpu-info-config.yaml",
18-
"--alert-rule-config", "${workspaceFolder}/config/samples/dynamic-config.yaml",
19-
"--enable-alert", "true"
18+
"--dynamic-config", "${workspaceFolder}/config/samples/dynamic-config.yaml",
19+
"--scheduler-config", "${workspaceFolder}/config/samples/scheduler-config.yaml",
20+
// "--enable-alert"
2021
]
2122
},
2223
{
@@ -60,6 +61,7 @@
6061
"request": "launch",
6162
"mode": "test",
6263
"env": {
64+
"DEBUG_MODE": "true",
6365
"GO_TESTING": "true"
6466
},
6567
"program": "${workspaceFolder}/internal/controller",

.vscode/settings.json

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,16 @@
11
{
22
"cSpell.words": [
3+
"admissionv",
34
"alertmanager",
45
"alertname",
56
"alicloud",
67
"Aliyun",
78
"AMDCDNA",
89
"AMDRDNA",
910
"apimachinery",
11+
"apimachineryruntime",
12+
"apiruntime",
13+
"apiutil",
1014
"automount",
1115
"AWSGPU",
1216
"batchv",
@@ -16,23 +20,30 @@
1620
"certificaterequests",
1721
"certmanager",
1822
"clientgoscheme",
23+
"clientset",
1924
"cloudnative",
2025
"cloudprovider",
2126
"clusterissuers",
27+
"componentconfig",
28+
"configz",
2229
"controllerutil",
2330
"corev",
2431
"crds",
2532
"CUDA",
2633
"cycjimmy",
34+
"datanode",
35+
"defaultbinder",
2736
"dylib",
2837
"envtest",
2938
"essd",
3039
"Eventf",
3140
"finalizer",
3241
"Finalizers",
42+
"frameworkruntime",
3343
"FULLTEXT",
3444
"goconst",
3545
"gocyclo",
46+
"goerrors",
3647
"golint",
3748
"Gomega",
3849
"gopsutil",
@@ -44,23 +55,36 @@
4455
"gpunodes",
4556
"gpupool",
4657
"gpupools",
58+
"gpuresourcequotas",
59+
"gpuresources",
4760
"GPUT",
61+
"gputopo",
4862
"GPUVRAM",
4963
"greptime",
5064
"greptimedb",
5165
"healthz",
5266
"iface",
67+
"imageutils",
68+
"jsonpatch",
5369
"karpenter",
70+
"klog",
71+
"Klogr",
5472
"kubebuilder",
5573
"KUBECONFIG",
5674
"Kubelet",
75+
"kubescheduler",
76+
"kubeschedulerconfig",
5777
"kustomization",
5878
"metav",
5979
"metricsserver",
80+
"Milli",
6081
"mito",
82+
"ngpu",
6183
"nindent",
84+
"noderesources",
6285
"nolint",
6386
"NVML",
87+
"objs",
6488
"omitempty",
6589
"onsi",
6690
"portallocator",
@@ -69,16 +93,21 @@
6993
"prometheusagents",
7094
"prometheuses",
7195
"prometheusrules",
96+
"queuesort",
7297
"RDNA",
7398
"readyz",
7499
"replicaset",
75100
"runbook",
76101
"runpod",
77102
"samber",
103+
"sched",
104+
"schedulerserverconfig",
78105
"schedulingconfigtemplate",
79106
"schedulingconfigtemplates",
80107
"schedulingcorev",
81108
"shirou",
109+
"shortuuid",
110+
"strategicpatch",
82111
"strategicpatches",
83112
"subresource",
84113
"Tabler",
@@ -87,13 +116,19 @@
87116
"tensorfusioncluster",
88117
"tensorfusionclusters",
89118
"tensorfusionworkload",
119+
"tensorfusionworkloads",
90120
"Tera",
121+
"testutil",
91122
"tflops",
92123
"timberio",
93124
"Tmpl",
94125
"Tolerations",
126+
"utilerrors",
95127
"utilruntime",
128+
"vgpu",
96129
"webhookcorev",
130+
"workloadprofiles",
131+
"workqueue",
97132
"Xlarge"
98133
]
99134
}

Makefile

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,10 @@ vet: ## Run go vet against code.
6464
test: manifests generate fmt vet envtest ## Run tests.
6565
KUBEBUILDER_ASSETS="$(shell $(ENVTEST) use $(ENVTEST_K8S_VERSION) --bin-dir $(LOCALBIN) -p path)" GO_TESTING=true go run github.com/onsi/ginkgo/v2/ginkgo -p -timeout 0 -cover -coverprofile cover.out -r --skip-file ./test/e2e
6666

67+
.PHONY: ut
68+
ut: manifests generate ## Run unit tests by make ut F=<focus-file>
69+
# The env assignments must prefix the ginkgo command itself: placed before `cd` they apply only to `cd` and never reach the test process.
cd internal/controller && KUBEBUILDER_ASSETS="$(shell $(ENVTEST) use $(ENVTEST_K8S_VERSION) --bin-dir $(LOCALBIN) -p path)" GO_TESTING=true go run github.com/onsi/ginkgo/v2/ginkgo -p -timeout 0 --focus-file $(F)
70+
6771
.PHONY: test-e2e
6872
test-e2e: manifests generate fmt vet ## Run the e2e tests. Expected an isolated environment using Kind.
6973
@command -v kind >/dev/null 2>&1 || { \

api/v1/base_types.go

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,12 @@
11
package v1
22

3+
import "fmt"
4+
35
type NameNamespace struct {
46
Name string `json:"name,omitempty"`
57
Namespace string `json:"namespace,omitempty"`
68
}
9+
10+
func (n NameNamespace) String() string {
11+
return fmt.Sprintf("%s/%s", n.Namespace, n.Name)
12+
}

api/v1/gpupool_types.go

Lines changed: 7 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -105,16 +105,15 @@ type ProvisioningMode string
105105

106106
const (
107107
ProvisioningModeProvisioned ProvisioningMode = "Provisioned"
108-
ProvisioningModeAutoSelect ProvisioningMode = "AutoSelect"
108+
109+
ProvisioningModeAutoSelect ProvisioningMode = "AutoSelect"
110+
111+
ProvisioningModeKarpenter ProvisioningMode = "Karpenter"
109112
)
110113

111114
// NodeProvisioner and NodeSelector are mutually exclusive.
112115
// NodeSelector is for existing GPUs, NodeProvisioner is for Karpenter-like auto management.
113116
type NodeProvisioner struct {
114-
// Mode could be Karpenter or Native, for Karpenter mode, node provisioner will start dummy nodes to provision and warmup GPU nodes, do nothing for CPU nodes, for Native mode, provisioner will create or compact GPU & CPU nodes based on current pods
115-
// +kubebuilder:default=Native
116-
Mode NodeProvisionerMode `json:"mode,omitempty"`
117-
118117
NodeClass string `json:"nodeClass,omitempty"`
119118

120119
// +optional
@@ -159,14 +158,6 @@ const (
159158
BudgetExceedStrategyAlertAndTerminateVM BudgetExceedStrategy = "AlertAndTerminateVM"
160159
)
161160

162-
// +kubebuilder:validation:Enum=Native;Karpenter
163-
type NodeProvisionerMode string
164-
165-
const (
166-
NodeProvisionerModeNative NodeProvisionerMode = "Native"
167-
NodeProvisionerModeKarpenter NodeProvisionerMode = "Karpenter"
168-
)
169-
170161
type Requirement struct {
171162
Key NodeRequirementKey `json:"key,omitempty"`
172163

@@ -347,6 +338,9 @@ type ClientConfig struct {
347338

348339
// +optional
349340
PatchToContainer *runtime.RawExtension `json:"patchToContainer,omitempty"`
341+
342+
// +optional
343+
PatchEmbeddedWorkerToPod *runtime.RawExtension `json:"patchEmbeddedWorkerToPod,omitempty"`
350344
}
351345

352346
// GPUPoolStatus defines the observed state of GPUPool.

0 commit comments

Comments
 (0)