NexusGPU
diff --git a/‎.gitignore‎
Lines changed: 3 additions & 1 deletion b/‎.gitignore‎
Lines changed: 3 additions & 1 deletion
diff --git a/‎.vscode/launch.json‎
Lines changed: 1 addition & 0 deletions b/‎.vscode/launch.json‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎Makefile‎
Lines changed: 2 additions & 1 deletion b/‎Makefile‎
Lines changed: 2 additions & 1 deletion
diff --git a/‎api/v1/gpunode_types.go‎
Lines changed: 15 additions & 1 deletion b/‎api/v1/gpunode_types.go‎
Lines changed: 15 additions & 1 deletion
diff --git a/‎api/v1/gpupool_types.go‎
Lines changed: 7 additions & 9 deletions b/‎api/v1/gpupool_types.go‎
Lines changed: 7 additions & 9 deletions
diff --git a/‎api/v1/tensorfusioncluster_types.go‎
Lines changed: 10 additions & 5 deletions b/‎api/v1/tensorfusioncluster_types.go‎
Lines changed: 10 additions & 5 deletions
diff --git a/‎api/v1/zz_generated.deepcopy.go‎
Lines changed: 5 additions & 2 deletions b/‎api/v1/zz_generated.deepcopy.go‎
Lines changed: 5 additions & 2 deletions
diff --git a/‎charts/tensor-fusion/Chart.yaml‎
Lines changed: 2 additions & 2 deletions b/‎charts/tensor-fusion/Chart.yaml‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎charts/tensor-fusion/crds/tensor-fusion.ai_gpunodeclasses.yaml‎
Lines changed: 65 additions & 4 deletions b/‎charts/tensor-fusion/crds/tensor-fusion.ai_gpunodeclasses.yaml‎
Lines changed: 65 additions & 4 deletions
@@ -30,4 +30,6 @@ go.work
 cmd/__debug*
 cmd/*/__debug*
 
-prompts/*
+prompts/*
+
+tmp*
@@ -19,6 +19,7 @@
             "type": "go",
             "request": "launch",
             "mode": "auto",
+            "console": "integratedTerminal",
             "env": {
                 "KUBECONFIG": "~/.kube/config-tf-dev",
                 "ENABLE_WEBHOOKS": "false"
 
@@ -45,7 +45,9 @@ help: ## Display this help.
 
 .PHONY: manifests
 manifests: controller-gen ## Generate WebhookConfiguration, ClusterRole and CustomResourceDefinition objects.
-	$(CONTROLLER_GEN) rbac:roleName=manager-role crd webhook paths="./..." output:crd:artifacts:config=config/crd/bases
+# Sync generated CRDs into the Helm chart. "bases/." copies the directory CONTENTS with both GNU and BSD cp; a bare "bases/" makes GNU cp create crds/bases/ instead.
+	$(CONTROLLER_GEN) rbac:roleName=manager-role crd webhook paths="./..." output:crd:artifacts:config=config/crd/bases && \
+	cp -r config/crd/bases/. ./charts/tensor-fusion/crds/
 
 .PHONY: generate
 generate: controller-gen ## Generate code containing DeepCopy, DeepCopyInto, and DeepCopyObject method implementations.
 
@@ -62,6 +62,9 @@ type GPUNodeStatus struct {
 	TotalTFlops resource.Quantity `json:"totalTFlops"`
 	TotalVRAM   resource.Quantity `json:"totalVRAM"`
 
+	VirtualTFlops resource.Quantity `json:"virtualTFlops"`
+	VirtualVRAM   resource.Quantity `json:"virtualVRAM"`
+
 	AvailableTFlops resource.Quantity `json:"availableTFlops"`
 	AvailableVRAM   resource.Quantity `json:"availableVRAM"`
 
@@ -119,6 +122,10 @@ type GPUNodeInfo struct {
 	GPUCount         int32  `json:"gpuCount,omitempty"`
 	OperatingSystem  string `json:"operatingSystem,omitempty"`
 	Architecture     string `json:"architecture,omitempty"`
+
+	// Additional space for L1/L2 VRAM buffer
+	RAMSize      resource.Quantity `json:"ramSize,omitempty"`
+	DataDiskSize resource.Quantity `json:"dataDiskSize,omitempty"`
 }
 
 type NodeHypervisorStatus struct {
@@ -130,7 +137,14 @@ type NodeHypervisorStatus struct {
 // +kubebuilder:object:root=true
 // +kubebuilder:subresource:status
 // +kubebuilder:resource:scope=Cluster
-
+// +kubebuilder:printcolumn:name="Phase",type="string",JSONPath=".status.phase"
+// +kubebuilder:printcolumn:name="Total TFlops",type="string",JSONPath=".status.totalTFlops"
+// +kubebuilder:printcolumn:name="Total VRAM",type="string",JSONPath=".status.totalVRAM"
+// +kubebuilder:printcolumn:name="Virtual TFlops",type="string",JSONPath=".status.virtualTFlops"
+// +kubebuilder:printcolumn:name="Virtual VRAM",type="string",JSONPath=".status.virtualVRAM"
+// +kubebuilder:printcolumn:name="Available TFlops",type="string",JSONPath=".status.availableTFlops"
+// +kubebuilder:printcolumn:name="Available VRAM",type="string",JSONPath=".status.availableVRAM"
+// +kubebuilder:printcolumn:name="GPU Count",type="integer",JSONPath=".status.totalGPUs"
 // GPUNode is the Schema for the gpunodes API.
 type GPUNode struct {
 	metav1.TypeMeta   `json:",inline"`
 
@@ -169,22 +169,23 @@ type Requirement struct {
 	Values []string `json:"values,omitempty"`
 }
 
-// +kubebuilder:validation:Enum=node.kubernetes.io/instance-type;kubernetes.io/arch;kubernetes.io/os;topology.kubernetes.io/zone;karpenter.sh/capacity-type;tensor-fusion.ai/gpu-arch;tensor-fusion.ai/gpu-instance-family;tensor-fusion.ai/gpu-instance-size
+// +kubebuilder:validation:Enum=node.kubernetes.io/instance-type;kubernetes.io/arch;kubernetes.io/os;topology.kubernetes.io/region;topology.kubernetes.io/zone;karpenter.sh/capacity-type;tensor-fusion.ai/gpu-arch;tensor-fusion.ai/gpu-instance-family;tensor-fusion.ai/gpu-instance-size
 type NodeRequirementKey string
 
 const (
 	NodeRequirementKeyInstanceType    NodeRequirementKey = "node.kubernetes.io/instance-type"
 	NodeRequirementKeyArchitecture    NodeRequirementKey = "kubernetes.io/arch"
 	NodeRequirementKeyGPUArchitecture NodeRequirementKey = "tensor-fusion.ai/gpu-arch"
 
-	NodeRequirementKeyOS   NodeRequirementKey = "kubernetes.io/os"
-	NodeRequirementKeyZone NodeRequirementKey = "topology.kubernetes.io/zone"
+	NodeRequirementKeyOS     NodeRequirementKey = "kubernetes.io/os"
+	NodeRequirementKeyRegion NodeRequirementKey = "topology.kubernetes.io/region"
+	NodeRequirementKeyZone   NodeRequirementKey = "topology.kubernetes.io/zone"
 
 	// capacity-type is charging method, can be spot/preemptive or on-demand
 	NodeRequirementKeyCapacityType NodeRequirementKey = "karpenter.sh/capacity-type"
 
 	NodeRequirementKeyInstanceFamily NodeRequirementKey = "tensor-fusion.ai/gpu-instance-family"
-	NodeRequirementKeyInstanceSize   NodeRequirementKey = "karpenter.k8s.aws/gpu-instance-size"
+	NodeRequirementKeyInstanceSize   NodeRequirementKey = "tensor-fusion.ai/gpu-instance-size"
 )
 
 type Taint struct {
@@ -363,11 +364,6 @@ type GPUPoolStatus struct {
 	AvailableTFlops resource.Quantity `json:"availableTFlops"`
 	AvailableVRAM   resource.Quantity `json:"availableVRAM"`
 
-	// If using provisioner, GPU nodes could be outside of the K8S cluster.
-	// The GPUNodes custom resource will be created and deleted automatically.
-	// ProvisioningStatus is to track the status of those outside GPU nodes.
-	ProvisioningStatus PoolProvisioningStatus `json:"provisioningStatus"`
-
 	// when updating any component version or config, pool controller will perform rolling update.
 	// the status will be updated periodically, default to 5s, progress will be 0-100.
 	// when the progress is 100, the component version or config is fully updated.
@@ -388,6 +384,7 @@ type GPUPoolStatus struct {
 	// +kubebuilder:default=""
 	// If the budget is exceeded, the set value in comma separated string to indicate which period caused the exceeding.
 	// If this field is not empty, scheduler will not schedule new AI workloads and stop scaling-up check.
+	// TODO not implemented yet
 	BudgetExceeded string `json:"budgetExceeded,omitempty"`
 }
 
@@ -427,6 +424,7 @@ type PoolComponentStatus struct {
 // +kubebuilder:subresource:status
 // +kubebuilder:resource:scope=Cluster
 
+// +kubebuilder:printcolumn:name="Phase",type="string",JSONPath=".status.phase"
 // +kubebuilder:printcolumn:name="TFlops Oversubscription",type="string",JSONPath=".spec.capacityConfig.oversubscription.tflopsOversellRatio"
 // +kubebuilder:printcolumn:name="Mode",type="string",JSONPath=".status.mode"
 // +kubebuilder:printcolumn:name="Default Scheduling Strategy",type="string",JSONPath=".spec.schedulingConfigTemplate"
 
@@ -143,16 +143,21 @@ const (
 )
 
 type ComputingVendorParams struct {
+	// +optional
 	DefaultRegion string `json:"defaultRegion,omitempty"` // Region for the computing vendor.
 
-	// the secret of access key and secret key, must be mounted as env var or file path
-	AccessKeyPath   string `json:"accessKeyPath,omitempty"`
-	SecretKeyPath   string `json:"secretKeyPath,omitempty"`
-	AccessKeyEnvVar string `json:"accessKeyEnvVar,omitempty"`
-	SecretKeyEnvVar string `json:"secretKeyEnvVar,omitempty"`
+	// Paths to the access key and secret key (or config file); the secret must be mounted as a file.
+	// +optional
+	AccessKeyPath string `json:"accessKeyPath,omitempty"`
+	// +optional
+	SecretKeyPath string `json:"secretKeyPath,omitempty"`
 
 	// preferred IAM role since it's more secure
+	// +optional
 	IAMRole string `json:"iamRole,omitempty"`
+
+	// +optional
+	ConfigFile string `json:"configFile,omitempty"`
 }
 
 // StorageVendorConfig defines Postgres database with extensions for timeseries storage and other resource aggregation results, system events and diagnostics reports etc.
 
@@ -15,10 +15,10 @@ type: application
 # This is the chart version. This version number should be incremented each time you make changes
 # to the chart and its templates, including the app version.
 # Versions are expected to follow Semantic Versioning (https://semver.org/)
-version: 1.0.3
+version: 1.1.0
 
 # This is the version number of the application being deployed. This version number should be
 # incremented each time you make changes to the application. Versions are not expected to
 # follow Semantic Versioning. They should reflect the version the application is using.
 # It is recommended to use it with quotes.
-appVersion: "1.10.1"
+appVersion: "1.12.0"
@@ -47,8 +47,10 @@ spec:
                     ebs:
                       properties:
                         deleteOnTermination:
+                          default: true
                           type: boolean
                         encrypted:
+                          default: false
                           type: boolean
                         volumeSize:
                           type: string
@@ -58,39 +60,98 @@ spec:
                   type: object
                 type: array
               instanceProfile:
+                description: The instance profile to use, assign IAM role and permissions
+                  for EC2 instances
                 type: string
+              launchTemplate:
+                description: The launch template to use for VM instances, if set,
+                  all other fields could be skipped
+                properties:
+                  id:
+                    description: The item ID
+                    type: string
+                  name:
+                    description: The item name
+                    type: string
+                  tags:
+                    additionalProperties:
+                      type: string
+                    description: Query by tags
+                    type: object
+                type: object
               metadataOptions:
+                description: for AWS only, IMDSv2 metadata service options
                 properties:
                   httpEndpoint:
-                    type: string
+                    default: true
+                    type: boolean
                   httpProtocolIPv6:
-                    type: string
+                    default: false
+                    type: boolean
                   httpPutResponseHopLimit:
+                    default: 1
                     type: integer
                   httpTokens:
+                    default: required
                     type: string
                 type: object
               osImageSelectorTerms:
+                description: the OS image identifier string, default to use first
+                  one, if not found, fallback to others
                 items:
                   properties:
-                    name:
+                    id:
+                      description: The item ID
                       type: string
-                    owner:
+                    name:
+                      description: The item name
                       type: string
+                    tags:
+                      additionalProperties:
+                        type: string
+                      description: Query by tags
+                      type: object
                   type: object
                 type: array
+              osImageType:
+                default: Private
+                description: Could be private or public, varies in different cloud
+                  vendor, define where to query the OSImageID
+                enum:
+                - Private
+                - Public
+                - System
+                type: string
               securityGroupSelectorTerms:
                 items:
                   properties:
                     id:
+                      description: The item ID
+                      type: string
+                    name:
+                      description: The item name
                       type: string
+                    tags:
+                      additionalProperties:
+                        type: string
+                      description: Query by tags
+                      type: object
                   type: object
                 type: array
               subnetSelectorTerms:
                 items:
                   properties:
                     id:
+                      description: The item ID
+                      type: string
+                    name:
+                      description: The item name
                       type: string
+                    tags:
+                      additionalProperties:
+                        type: string
+                      description: Query by tags
+                      type: object
                   type: object
                 type: array
               tags: