NexusGPU
diff --git a/‎.gitignore‎
Lines changed: 3 additions & 1 deletion b/‎.gitignore‎
Lines changed: 3 additions & 1 deletion
diff --git a/‎.vscode/settings.json‎
Lines changed: 29 additions & 1 deletion b/‎.vscode/settings.json‎
Lines changed: 29 additions & 1 deletion
diff --git a/‎api/v1/gpunode_types.go‎
Lines changed: 4 additions & 0 deletions b/‎api/v1/gpunode_types.go‎
Lines changed: 4 additions & 0 deletions
diff --git a/‎api/v1/gpupool_types.go‎
Lines changed: 14 additions & 10 deletions b/‎api/v1/gpupool_types.go‎
Lines changed: 14 additions & 10 deletions
diff --git a/‎api/v1/tensorfusioncluster_types.go‎
Lines changed: 45 additions & 9 deletions b/‎api/v1/tensorfusioncluster_types.go‎
Lines changed: 45 additions & 9 deletions
diff --git a/‎charts/tensor-fusion/templates/gpu-cost-config.yaml‎
Lines changed: 7 additions & 1 deletion b/‎charts/tensor-fusion/templates/gpu-cost-config.yaml‎
Lines changed: 7 additions & 1 deletion
diff --git a/‎cmd/operator/main.go‎
Lines changed: 1 addition & 0 deletions b/‎cmd/operator/main.go‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎config/crd/bases/tensor-fusion.ai_gpunodes.yaml‎
Lines changed: 3 additions & 0 deletions b/‎config/crd/bases/tensor-fusion.ai_gpunodes.yaml‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎config/crd/bases/tensor-fusion.ai_gpupools.yaml‎
Lines changed: 5 additions & 3 deletions b/‎config/crd/bases/tensor-fusion.ai_gpupools.yaml‎
Lines changed: 5 additions & 3 deletions
diff --git a/‎config/crd/bases/tensor-fusion.ai_tensorfusionclusters.yaml‎
Lines changed: 33 additions & 7 deletions b/‎config/crd/bases/tensor-fusion.ai_tensorfusionclusters.yaml‎
Lines changed: 33 additions & 7 deletions
@@ -28,4 +28,6 @@ go.work
 .DS_Store
 
 cmd/__debug*
-cmd/*/__debug*
+cmd/*/__debug*
+
+prompts/*
@@ -1,40 +1,68 @@
 {
     "cSpell.words": [
+        "Aliyun",
+        "AMDCDNA",
+        "AMDRDNA",
         "apimachinery",
+        "AWSGPU",
+        "batchv",
+        "CDNA",
         "certificaterequests",
         "certmanager",
         "clientgoscheme",
+        "cloudnative",
+        "cloudprovider",
         "clusterissuers",
         "controllerutil",
+        "corev",
         "crds",
+        "CUDA",
         "cycjimmy",
+        "dylib",
+        "Eventf",
         "finalizer",
         "Finalizers",
         "goconst",
         "golint",
         "gosec",
+        "gpunode",
+        "gpunodeclasses",
+        "gpunodes",
+        "gpupool",
+        "gpupools",
         "healthz",
         "karpenter",
         "kubebuilder",
+        "KUBECONFIG",
         "Kubelet",
         "kustomization",
         "metav",
         "metricsserver",
+        "nindent",
         "nolint",
+        "NVML",
+        "omitempty",
         "onsi",
+        "printcolumn",
         "prometheusagents",
         "prometheuses",
         "prometheusrules",
+        "RDNA",
         "readyz",
+        "runpod",
         "schedulingconfigtemplate",
         "schedulingconfigtemplates",
+        "schedulingcorev",
         "subresource",
         "tensorfusion",
         "tensorfusionaiv",
         "tensorfusioncluster",
         "tensorfusionclusters",
+        "Tera",
+        "tflops",
         "Tmpl",
         "utilruntime",
-        "webhookcorev"
+        "webhookcorev",
+        "Xlarge"
     ]
 }
@@ -106,6 +106,10 @@ const (
 )
 
 type GPUNodeInfo struct {
+	// +optional
+	// only set when node is managed by TensorFusion
+	InstanceID string `json:"instanceID,omitempty"`
+
 	Hostname         string `json:"hostname,omitempty"`
 	IP               string `json:"ip,omitempty"`
 	KernelVersion    string `json:"kernelVersion,omitempty"`
 
@@ -69,21 +69,21 @@ type Oversubscription struct {
 	// +kubebuilder:default=50
 	// +kubebuilder:validation:Minimum=0
 	// +kubebuilder:validation:Maximum=100
-	VramExpandToHostMem int32 `json:"vramExpandToHostMem,omitempty"`
+	VRAMExpandToHostMem int32 `json:"vramExpandToHostMem,omitempty"`
 
 	// the percentage of Host Disk appending to GPU VRAM, default to 70%
 	// +optional
 	// +kubebuilder:default=70
 	// +kubebuilder:validation:Minimum=0
 	// +kubebuilder:validation:Maximum=100
-	VramExpandToHostDisk int32 `json:"vramExpandToHostDisk,omitempty"`
+	VRAMExpandToHostDisk int32 `json:"vramExpandToHostDisk,omitempty"`
 
-	// The multipler of TFlops to oversell, default to 500%, indicates 5 times oversell
+	// The multiplier of TFlops to oversell, default to 500%, indicates 5 times oversell
 	// +optional
 	// +kubebuilder:default=500
 	// +kubebuilder:validation:Minimum=100
 	// +kubebuilder:validation:Maximum=100000
-	TflopsOversellRatio int32 `json:"tflopsOversellRatio,omitempty"`
+	TFlopsOversellRatio int32 `json:"tflopsOversellRatio,omitempty"`
 }
 
 type NodeManagerConfig struct {
@@ -171,14 +171,18 @@ type Requirement struct {
 	Values []string `json:"values,omitempty"`
 }
 
-// +kubebuilder:validation:Enum=node.kubernetes.io/instance-type;kubernetes.io/arch;kubernetes.io/os;topology.kubernetes.io/zone;karpenter.sh/capacity-type
+// +kubebuilder:validation:Enum=node.kubernetes.io/instance-type;kubernetes.io/arch;kubernetes.io/os;topology.kubernetes.io/zone;karpenter.sh/capacity-type;tensor-fusion.ai/gpu-arch
 type NodeRequirementKey string
 
 const (
-	NodeRequirementKeyInstanceType NodeRequirementKey = "node.kubernetes.io/instance-type"
-	NodeRequirementKeyArchitecture NodeRequirementKey = "kubernetes.io/arch"
-	NodeRequirementKeyOS           NodeRequirementKey = "kubernetes.io/os"
-	NodeRequirementKeyZone         NodeRequirementKey = "topology.kubernetes.io/zone"
+	NodeRequirementKeyInstanceType    NodeRequirementKey = "node.kubernetes.io/instance-type"
+	NodeRequirementKeyArchitecture    NodeRequirementKey = "kubernetes.io/arch"
+	NodeRequirementKeyGPUArchitecture NodeRequirementKey = "tensor-fusion.ai/gpu-arch"
+
+	NodeRequirementKeyOS   NodeRequirementKey = "kubernetes.io/os"
+	NodeRequirementKeyZone NodeRequirementKey = "topology.kubernetes.io/zone"
+
+	// capacity-type is the charging method; it can be spot/preemptible or on-demand
 	NodeRequirementKeyCapacityType NodeRequirementKey = "karpenter.sh/capacity-type"
 )
 
@@ -363,7 +367,7 @@ type GPUPoolStatus struct {
 	// ProvisioningStatus is to track the status of those outside GPU nodes.
 	ProvisioningStatus PoolProvisioningStatus `json:"provisioningStatus"`
 
-	// when updating any component version or config, poolcontroller will perform rolling update.
+	// when updating any component version or config, pool controller will perform rolling update.
 	// the status will be updated periodically, default to 5s, progress will be 0-100.
 	// when the progress is 100, the component version or config is fully updated.
 	ComponentStatus PoolComponentStatus `json:"componentStatus"`
 
@@ -101,22 +101,58 @@ type GPUPoolDefinition struct {
 
 // ComputingVendorConfig defines the Cloud vendor connection such as AWS, GCP, Azure etc.
 type ComputingVendorConfig struct {
-	Name     string `json:"name,omitempty"`     // Name of the computing vendor.
-	Type     string `json:"type,omitempty"`     // Type of the computing vendor (e.g., aws, lambdalabs, gcp, azure, together.ai).
-	AuthType string `json:"authType,omitempty"` // Authentication type (e.g., accessKey, serviceAccount).
+	Name string `json:"name,omitempty"`
+
+	// support popular cloud providers
+	Type ComputingVendorName `json:"type,omitempty"`
+
+	AuthType AuthTypeEnum `json:"authType,omitempty"` // Authentication type (accessKey or serviceAccountRole).
 
 	// +optional
+	// +kubebuilder:default=true
 	Enable *bool `json:"enable,omitempty"` // Enable or disable the computing vendor.
 
-	GPUNodeControllerType string                `json:"gpuNodeControllerType,omitempty"` // Type of GPU node controller (e.g., asg, karpenter, native).
-	Params                ComputingVendorParams `json:"params,omitempty"`
+	Params ComputingVendorParams `json:"params,omitempty"`
 }
 
+// +kubebuilder:validation:Enum=accessKey;serviceAccountRole
+type AuthTypeEnum string
+
+const (
+	AuthTypeAccessKey          AuthTypeEnum = "accessKey"
+	AuthTypeServiceAccountRole AuthTypeEnum = "serviceAccountRole"
+)
+
+// +kubebuilder:validation:Enum=aws;lambda-labs;gcp;azure;oracle-oci;ibm;openshift;vultr;together-ai;aliyun;nvidia;tencent;runpod
+type ComputingVendorName string
+
+const (
+	ComputingVendorAWS        ComputingVendorName = "aws"
+	ComputingVendorGCP        ComputingVendorName = "gcp"
+	ComputingVendorAzure      ComputingVendorName = "azure"
+	ComputingVendorOracle     ComputingVendorName = "oracle-oci"
+	ComputingVendorIBM        ComputingVendorName = "ibm"
+	ComputingVendorOpenShift  ComputingVendorName = "openshift"
+	ComputingVendorVultr      ComputingVendorName = "vultr"
+	ComputingVendorTogetherAI ComputingVendorName = "together-ai"
+	ComputingVendorLambdaLabs ComputingVendorName = "lambda-labs"
+	ComputingVendorAliyun     ComputingVendorName = "aliyun"
+	ComputingVendorNvidia     ComputingVendorName = "nvidia"
+	ComputingVendorTencent    ComputingVendorName = "tencent"
+	ComputingVendorRunPod     ComputingVendorName = "runpod"
+)
+
 type ComputingVendorParams struct {
-	Region    string `json:"region,omitempty"`    // Region for the computing vendor.
-	AccessKey string `json:"accessKey,omitempty"` // Access key for the computing vendor.
-	SecretKey string `json:"secretKey,omitempty"` // Secret key for the computing vendor.
-	IAMRole   string `json:"iamRole,omitempty"`   // IAM role for the computing vendor like AWS
+	DefaultRegion string `json:"defaultRegion,omitempty"` // Default region for the computing vendor.
+
+	// the secret of access key and secret key, must be mounted as env var or file path
+	AccessKeyPath   string `json:"accessKeyPath,omitempty"`
+	SecretKeyPath   string `json:"secretKeyPath,omitempty"`
+	AccessKeyEnvVar string `json:"accessKeyEnvVar,omitempty"`
+	SecretKeyEnvVar string `json:"secretKeyEnvVar,omitempty"`
+
+	// preferred IAM role since it's more secure
+	IAMRole string `json:"iamRole,omitempty"`
 }
 
 // StorageVendorConfig defines Postgres database with extensions for timeseries storage and other resource aggregation results, system events and diagnostics reports etc.
 
@@ -37,9 +37,15 @@ data:
       costPerHour: 1.64
       fp16TFlops: 312
 
+    - model: A10
+      vendor: NVIDIA
+      costPerHour: 0.9
+      fp16TFlops: 125
+
+    # A10G has fewer CUDA cores than A10, but has RT cores for rendering use cases
     - model: A10G
       vendor: NVIDIA
-      costPerHour: 0.4
+      costPerHour: 0.75 # from lambda labs
       fp16TFlops: 125
 
     - model: A40
 
@@ -199,6 +199,7 @@ func main() {
 	GPUPoolReconciler := &controller.GPUPoolReconciler{
 		Client:       mgr.GetClient(),
 		Scheme:       mgr.GetScheme(),
+		Recorder:     mgr.GetEventRecorderFor("GPUPool"),
 		GpuPoolState: gpuPoolState,
 		GpuNodeState: gpuNodeState,
 	}
 
@@ -224,6 +224,9 @@ spec:
                     type: string
                   hostname:
                     type: string
+                  instanceID:
+                    description: only set when node is managed by TensorFusion
+                    type: string
                   ip:
                     type: string
                   kernelVersion:
 
@@ -130,8 +130,8 @@ spec:
                     properties:
                       tflopsOversellRatio:
                         default: 500
-                        description: The multipler of TFlops to oversell, default
-                          to 500%, indicates 5 times oversell
+                        description: The multiplier of TFlops to oversell, default
+                          to 500%, indicates 5 times oversell
                         format: int32
                         maximum: 100000
                         minimum: 100
@@ -293,6 +293,7 @@ spec:
                               - kubernetes.io/os
                               - topology.kubernetes.io/zone
                               - karpenter.sh/capacity-type
+                              - tensor-fusion.ai/gpu-arch
                               type: string
                             operator:
                               default: In
@@ -343,6 +344,7 @@ spec:
                               - kubernetes.io/os
                               - topology.kubernetes.io/zone
                               - karpenter.sh/capacity-type
+                              - tensor-fusion.ai/gpu-arch
                               type: string
                             operator:
                               default: In
@@ -750,7 +752,7 @@ spec:
                 type: string
               componentStatus:
                 description: |-
-                  when updating any component version or config, poolcontroller will perform rolling update.
+                  when updating any component version or config, pool controller will perform rolling update.
                   the status will be updated periodically, default to 5s, progress will be 0-100.
                   when the progress is 100, the component version or config is fully updated.
                 properties:
 
@@ -70,25 +70,49 @@ spec:
                   such as AWS, GCP, Azure etc.
                 properties:
                   authType:
+                    enum:
+                    - accessKey
+                    - serviceAccountRole
                     type: string
                   enable:
+                    default: true
                     type: boolean
-                  gpuNodeControllerType:
-                    type: string
                   name:
                     type: string
                   params:
                     properties:
-                      accessKey:
+                      accessKeyEnvVar:
+                        type: string
+                      accessKeyPath:
+                        description: the secret of access key and secret key, must
+                          be mounted as env var or file path
+                        type: string
+                      defaultRegion:
                         type: string
                       iamRole:
+                        description: preferred IAM role since it's more secure
                         type: string
-                      region:
+                      secretKeyEnvVar:
                         type: string
-                      secretKey:
+                      secretKeyPath:
                         type: string
                     type: object
                   type:
+                    description: support popular cloud providers
+                    enum:
+                    - aws
+                    - lambda-labs
+                    - gcp
+                    - azure
+                    - oracle-oci
+                    - ibm
+                    - openshift
+                    - vultr
+                    - together-ai
+                    - aliyun
+                    - nvidia
+                    - tencent
+                    - runpod
                     type: string
                 type: object
               dataPipelines:
@@ -204,8 +228,8 @@ spec:
                               properties:
                                 tflopsOversellRatio:
                                   default: 500
-                                  description: The multipler of TFlops to oversell,
-                                    default to 500%, indicates 5 times oversell
+                                  description: The multiplier of TFlops to oversell,
+                                    default to 500%, indicates 5 times oversell
                                   format: int32
                                   maximum: 100000
                                   minimum: 100
@@ -368,6 +392,7 @@ spec:
                                         - kubernetes.io/os
                                         - topology.kubernetes.io/zone
                                         - karpenter.sh/capacity-type
+                                        - tensor-fusion.ai/gpu-arch
                                         type: string
                                       operator:
                                         default: In
@@ -418,6 +443,7 @@ spec:
                                         - kubernetes.io/os
                                         - topology.kubernetes.io/zone
                                         - karpenter.sh/capacity-type
+                                        - tensor-fusion.ai/gpu-arch
                                         type: string
                                       operator:
                                         default: In
Original file line number	Diff line number	Diff line change
`@@ -199,6 +199,7 @@ func main() {`
`199`	`199`	`GPUPoolReconciler := &controller.GPUPoolReconciler{`
`200`	`200`	`Client: mgr.GetClient(),`
`201`	`201`	`Scheme: mgr.GetScheme(),`
	`202`	`+ Recorder: mgr.GetEventRecorderFor("GPUPool"),`
`202`	`203`	`GpuPoolState: gpuPoolState,`
`203`	`204`	`GpuNodeState: gpuNodeState,`
`204`	`205`	`}`