Skip to content

Commit 55e9d42

Browse files
committed
fix: bump chart version, update CRDs
1 parent 1bc2394 commit 55e9d42

33 files changed

+1681
-365
lines changed

.gitignore

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,4 +30,6 @@ go.work
3030
cmd/__debug*
3131
cmd/*/__debug*
3232

33-
prompts/*
33+
prompts/*
34+
35+
tmp*

.vscode/launch.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
"type": "go",
2020
"request": "launch",
2121
"mode": "auto",
22+
"console": "integratedTerminal",
2223
"env": {
2324
"KUBECONFIG": "~/.kube/config-tf-dev",
2425
"ENABLE_WEBHOOKS": "false"

Makefile

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,8 @@ help: ## Display this help.
4545

4646
.PHONY: manifests
4747
manifests: controller-gen ## Generate WebhookConfiguration, ClusterRole and CustomResourceDefinition objects.
48-
$(CONTROLLER_GEN) rbac:roleName=manager-role crd webhook paths="./..." output:crd:artifacts:config=config/crd/bases
48+
$(CONTROLLER_GEN) rbac:roleName=manager-role crd webhook paths="./..." output:crd:artifacts:config=config/crd/bases && \
49+
cp -r config/crd/bases/ ./charts/tensor-fusion/crds/
4950

5051
.PHONY: generate
5152
generate: controller-gen ## Generate code containing DeepCopy, DeepCopyInto, and DeepCopyObject method implementations.

api/v1/gpunode_types.go

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,9 @@ type GPUNodeStatus struct {
6262
TotalTFlops resource.Quantity `json:"totalTFlops"`
6363
TotalVRAM resource.Quantity `json:"totalVRAM"`
6464

65+
VirtualTFlops resource.Quantity `json:"virtualTFlops"`
66+
VirtualVRAM resource.Quantity `json:"virtualVRAM"`
67+
6568
AvailableTFlops resource.Quantity `json:"availableTFlops"`
6669
AvailableVRAM resource.Quantity `json:"availableVRAM"`
6770

@@ -119,6 +122,10 @@ type GPUNodeInfo struct {
119122
GPUCount int32 `json:"gpuCount,omitempty"`
120123
OperatingSystem string `json:"operatingSystem,omitempty"`
121124
Architecture string `json:"architecture,omitempty"`
125+
126+
// Additional space for L1/L2 VRAM buffer
127+
RAMSize resource.Quantity `json:"ramSize,omitempty"`
128+
DataDiskSize resource.Quantity `json:"dataDiskSize,omitempty"`
122129
}
123130

124131
type NodeHypervisorStatus struct {
@@ -130,7 +137,14 @@ type NodeHypervisorStatus struct {
130137
// +kubebuilder:object:root=true
131138
// +kubebuilder:subresource:status
132139
// +kubebuilder:resource:scope=Cluster
133-
140+
// +kubebuilder:printcolumn:name="Phase",type="string",JSONPath=".status.phase"
141+
// +kubebuilder:printcolumn:name="Total TFlops",type="string",JSONPath=".status.totalTFlops"
142+
// +kubebuilder:printcolumn:name="Total VRAM",type="string",JSONPath=".status.totalVRAM"
143+
// +kubebuilder:printcolumn:name="Virtual TFlops",type="string",JSONPath=".status.virtualTFlops"
144+
// +kubebuilder:printcolumn:name="Virtual VRAM",type="string",JSONPath=".status.virtualVRAM"
145+
// +kubebuilder:printcolumn:name="Available TFlops",type="string",JSONPath=".status.availableTFlops"
146+
// +kubebuilder:printcolumn:name="Available VRAM",type="string",JSONPath=".status.availableVRAM"
147+
// +kubebuilder:printcolumn:name="GPU Count",type="integer",JSONPath=".status.totalGPUs"
134148
// GPUNode is the Schema for the gpunodes API.
135149
type GPUNode struct {
136150
metav1.TypeMeta `json:",inline"`

api/v1/gpupool_types.go

Lines changed: 7 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -169,22 +169,23 @@ type Requirement struct {
169169
Values []string `json:"values,omitempty"`
170170
}
171171

172-
// +kubebuilder:validation:Enum=node.kubernetes.io/instance-type;kubernetes.io/arch;kubernetes.io/os;topology.kubernetes.io/zone;karpenter.sh/capacity-type;tensor-fusion.ai/gpu-arch;tensor-fusion.ai/gpu-instance-family;tensor-fusion.ai/gpu-instance-size
172+
// +kubebuilder:validation:Enum=node.kubernetes.io/instance-type;kubernetes.io/arch;kubernetes.io/os;topology.kubernetes.io/region;topology.kubernetes.io/zone;karpenter.sh/capacity-type;tensor-fusion.ai/gpu-arch;tensor-fusion.ai/gpu-instance-family;tensor-fusion.ai/gpu-instance-size
173173
type NodeRequirementKey string
174174

175175
const (
176176
NodeRequirementKeyInstanceType NodeRequirementKey = "node.kubernetes.io/instance-type"
177177
NodeRequirementKeyArchitecture NodeRequirementKey = "kubernetes.io/arch"
178178
NodeRequirementKeyGPUArchitecture NodeRequirementKey = "tensor-fusion.ai/gpu-arch"
179179

180-
NodeRequirementKeyOS NodeRequirementKey = "kubernetes.io/os"
181-
NodeRequirementKeyZone NodeRequirementKey = "topology.kubernetes.io/zone"
180+
NodeRequirementKeyOS NodeRequirementKey = "kubernetes.io/os"
181+
NodeRequirementKeyRegion NodeRequirementKey = "topology.kubernetes.io/region"
182+
NodeRequirementKeyZone NodeRequirementKey = "topology.kubernetes.io/zone"
182183

183184
// capacity-type is the charging method; it can be spot/preemptible or on-demand
184185
NodeRequirementKeyCapacityType NodeRequirementKey = "karpenter.sh/capacity-type"
185186

186187
NodeRequirementKeyInstanceFamily NodeRequirementKey = "tensor-fusion.ai/gpu-instance-family"
187-
NodeRequirementKeyInstanceSize NodeRequirementKey = "karpenter.k8s.aws/gpu-instance-size"
188+
NodeRequirementKeyInstanceSize NodeRequirementKey = "tensor-fusion.ai/gpu-instance-size"
188189
)
189190

190191
type Taint struct {
@@ -363,11 +364,6 @@ type GPUPoolStatus struct {
363364
AvailableTFlops resource.Quantity `json:"availableTFlops"`
364365
AvailableVRAM resource.Quantity `json:"availableVRAM"`
365366

366-
// If using provisioner, GPU nodes could be outside of the K8S cluster.
367-
// The GPUNodes custom resource will be created and deleted automatically.
368-
// ProvisioningStatus is to track the status of those outside GPU nodes.
369-
ProvisioningStatus PoolProvisioningStatus `json:"provisioningStatus"`
370-
371367
// when updating any component version or config, pool controller will perform rolling update.
372368
// the status will be updated periodically, default to 5s, progress will be 0-100.
373369
// when the progress is 100, the component version or config is fully updated.
@@ -388,6 +384,7 @@ type GPUPoolStatus struct {
388384
// +kubebuilder:default=""
389385
// If the budget is exceeded, this is set to a comma-separated string indicating which period(s) caused the overrun.
390386
// If this field is not empty, scheduler will not schedule new AI workloads and stop scaling-up check.
387+
// TODO not implemented yet
391388
BudgetExceeded string `json:"budgetExceeded,omitempty"`
392389
}
393390

@@ -427,6 +424,7 @@ type PoolComponentStatus struct {
427424
// +kubebuilder:subresource:status
428425
// +kubebuilder:resource:scope=Cluster
429426

427+
// +kubebuilder:printcolumn:name="Phase",type="string",JSONPath=".status.phase"
430428
// +kubebuilder:printcolumn:name="TFlops Oversubscription",type="string",JSONPath=".spec.capacityConfig.oversubscription.tflopsOversellRatio"
431429
// +kubebuilder:printcolumn:name="Mode",type="string",JSONPath=".status.mode"
432430
// +kubebuilder:printcolumn:name="Default Scheduling Strategy",type="string",JSONPath=".spec.schedulingConfigTemplate"

api/v1/tensorfusioncluster_types.go

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -143,16 +143,21 @@ const (
143143
)
144144

145145
type ComputingVendorParams struct {
146+
// +optional
146147
DefaultRegion string `json:"defaultRegion,omitempty"` // Region for the computing vendor.
147148

148-
// the secret of access key and secret key, must be mounted as env var or file path
149-
AccessKeyPath string `json:"accessKeyPath,omitempty"`
150-
SecretKeyPath string `json:"secretKeyPath,omitempty"`
151-
AccessKeyEnvVar string `json:"accessKeyEnvVar,omitempty"`
152-
SecretKeyEnvVar string `json:"secretKeyEnvVar,omitempty"`
149+
// The secrets holding the access key, secret key, or config file; each must be mounted as a file path
150+
// +optional
151+
AccessKeyPath string `json:"accessKeyPath,omitempty"`
152+
// +optional
153+
SecretKeyPath string `json:"secretKeyPath,omitempty"`
153154

154155
// An IAM role is preferred since it's more secure
156+
// +optional
155157
IAMRole string `json:"iamRole,omitempty"`
158+
159+
// +optional
160+
ConfigFile string `json:"configFile,omitempty"`
156161
}
157162

158163
// StorageVendorConfig defines Postgres database with extensions for timeseries storage and other resource aggregation results, system events and diagnostics reports etc.

api/v1/zz_generated.deepcopy.go

Lines changed: 5 additions & 2 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

charts/tensor-fusion/Chart.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,10 +15,10 @@ type: application
1515
# This is the chart version. This version number should be incremented each time you make changes
1616
# to the chart and its templates, including the app version.
1717
# Versions are expected to follow Semantic Versioning (https://semver.org/)
18-
version: 1.0.3
18+
version: 1.1.0
1919

2020
# This is the version number of the application being deployed. This version number should be
2121
# incremented each time you make changes to the application. Versions are not expected to
2222
# follow Semantic Versioning. They should reflect the version the application is using.
2323
# It is recommended to use it with quotes.
24-
appVersion: "1.10.1"
24+
appVersion: "1.12.0"

charts/tensor-fusion/crds/tensor-fusion.ai_gpunodeclasses.yaml

Lines changed: 65 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -47,8 +47,10 @@ spec:
4747
ebs:
4848
properties:
4949
deleteOnTermination:
50+
default: true
5051
type: boolean
5152
encrypted:
53+
default: false
5254
type: boolean
5355
volumeSize:
5456
type: string
@@ -58,39 +60,98 @@ spec:
5860
type: object
5961
type: array
6062
instanceProfile:
63+
description: The instance profile to use, assign IAM role and permissions
64+
for EC2 instances
6165
type: string
66+
launchTemplate:
67+
description: The launch template to use for VM instances, if set,
68+
all other fields could be skipped
69+
properties:
70+
id:
71+
description: The item ID
72+
type: string
73+
name:
74+
description: The item name
75+
type: string
76+
tags:
77+
additionalProperties:
78+
type: string
79+
description: Query by tags
80+
type: object
81+
type: object
6282
metadataOptions:
83+
description: for AWS only, IMDSv2 metadata service options
6384
properties:
6485
httpEndpoint:
65-
type: string
86+
default: true
87+
type: boolean
6688
httpProtocolIPv6:
67-
type: string
89+
default: false
90+
type: boolean
6891
httpPutResponseHopLimit:
92+
default: 1
6993
type: integer
7094
httpTokens:
95+
default: required
7196
type: string
7297
type: object
7398
osImageSelectorTerms:
99+
description: the OS image identifier string, default to use first
100+
one, if not found, fallback to others
74101
items:
75102
properties:
76-
name:
103+
id:
104+
description: The item ID
77105
type: string
78-
owner:
106+
name:
107+
description: The item name
79108
type: string
109+
tags:
110+
additionalProperties:
111+
type: string
112+
description: Query by tags
113+
type: object
80114
type: object
81115
type: array
116+
osImageType:
117+
default: Private
118+
description: Could be private or public, varies in different cloud
119+
vendor, define where to query the OSImageID
120+
enum:
121+
- Private
122+
- Public
123+
- System
124+
type: string
82125
securityGroupSelectorTerms:
83126
items:
84127
properties:
85128
id:
129+
description: The item ID
130+
type: string
131+
name:
132+
description: The item name
86133
type: string
134+
tags:
135+
additionalProperties:
136+
type: string
137+
description: Query by tags
138+
type: object
87139
type: object
88140
type: array
89141
subnetSelectorTerms:
90142
items:
91143
properties:
92144
id:
145+
description: The item ID
146+
type: string
147+
name:
148+
description: The item name
93149
type: string
150+
tags:
151+
additionalProperties:
152+
type: string
153+
description: Query by tags
154+
type: object
94155
type: object
95156
type: array
96157
tags:

0 commit comments

Comments
 (0)