NexusGPU
diff --git a/.mirrord/mirrord.json b/.mirrord/mirrord.json
Lines changed: 1 addition & 5 deletions
diff --git a/api/v1/gpupool_types.go b/api/v1/gpupool_types.go
Lines changed: 7 additions & 15 deletions
diff --git a/api/v1/tensorfusionconnection_types.go b/api/v1/tensorfusionconnection_types.go
Lines changed: 1 addition & 0 deletions
diff --git a/api/v1/zz_generated.deepcopy.go b/api/v1/zz_generated.deepcopy.go
Lines changed: 3 additions & 7 deletions
diff --git a/charts/tensor-fusion/templates/gpupool-sample.yaml b/charts/tensor-fusion/templates/gpupool-sample.yaml
Lines changed: 69 additions & 0 deletions
diff --git a/charts/tensor-fusion/templates/tensor-fusion-config.yaml b/charts/tensor-fusion/templates/tensor-fusion-config.yaml
Lines changed: 0 additions & 71 deletions
diff --git a/charts/tensor-fusion/templates/vector.yaml b/charts/tensor-fusion/templates/vector.yaml
Lines changed: 10 additions & 3 deletions
diff --git a/charts/tensor-fusion/values.yaml b/charts/tensor-fusion/values.yaml
Lines changed: 2 additions & 2 deletions
diff --git a/cmd/main.go b/cmd/main.go
Lines changed: 12 additions & 21 deletions
diff --git a/config/crd/bases/tensor-fusion.ai_gpupools.yaml b/config/crd/bases/tensor-fusion.ai_gpupools.yaml
Lines changed: 12 additions & 14 deletions
@@ -8,10 +8,6 @@
         "env": true
     },
     "target": {
-        "namespace": "tensor-fusion",
-        "path": {
-            "deployment": "tensor-fusion-operator-controller-manager",
-            "container": "manager"
-        }
+        "namespace": "tensor-fusion"
     }
 }
@@ -157,27 +157,19 @@ type ComponentConfig struct {
 	Client     ClientConfig     `json:"client,omitempty"`
 }
 
-type WorkerConfig struct {
-	Image             string               `json:"image,omitempty"` // "stable" | "latest" | "nightly"
-	Port              int                  `json:"port,omitempty"`
-	HostNetwork       *bool                `json:"hostNetwork,omitempty"`
-	WorkerPodTemplate runtime.RawExtension `json:"workerPodTemplate,omitempty"` // Mixin extra spec.
-}
-
 type HypervisorConfig struct {
 	Image                       string               `json:"image,omitempty"`
 	HypervisorDaemonSetTemplate runtime.RawExtension `json:"hypervisorDaemonSetTemplate,omitempty"` // Mixin extra spec.
 }
 
-// TODO: client mutation webhook need TLS cert, need check using cert-manager or other ways
-type ClientConfig struct {
-	Image    string `json:"image,omitempty"`
-	Protocol string `json:"protocol,omitempty"`
-	Port     int    `json:"port,omitempty"`
+type WorkerConfig struct {
+	PodTemplate runtime.RawExtension `json:"podTemplate"`
+}
 
-	// +optional
-	// define how to inject the client pod
-	PodTemplateMergePatch runtime.RawExtension `json:"podTemplateMergePatch,omitempty"` // Add other things to the original pod.
+type ClientConfig struct {
+	OperatorEndpoint string               `json:"operatorEndpoint"`
+	PatchToPod       runtime.RawExtension `json:"patchToPod"`
+	PatchToContainer runtime.RawExtension `json:"patchToContainer"`
 }
 
 // GPUPoolStatus defines the observed state of GPUPool.
 
@@ -33,6 +33,7 @@ type Resources struct {
 
 // TensorFusionConnectionSpec defines the desired state of TensorFusionConnection.
 type TensorFusionConnectionSpec struct {
+	PoolName  string    `json:"poolName"`
 	Resources Resources `json:"resources"`
 }
 
 
@@ -0,0 +1,69 @@
+apiVersion: tensor-fusion.ai/v1
+kind: GPUPool
+metadata:
+  name: {{ include "tensor-fusion.fullname" . }}-gpupool-sample
+spec:
+  componentConfig:
+    worker:
+      podTemplate:
+        template:
+          spec:
+            terminationGracePeriodSeconds: 0
+            runtimeClassName: nvidia
+            volumes:
+              - name: worker-sock
+                hostPath:
+                  path: /tensor-fusion/worker/sock
+                  type: DirectoryOrCreate
+            hostNetwork: true
+            hostPID: true
+            containers:
+              - name: tensor-fusion-worker
+                image: tensorfusion/tensor-fusion-worker:latest
+                env:
+                  - name: POD_NAME
+                    valueFrom:
+                      fieldRef:
+                        apiVersion: v1
+                        fieldPath: metadata.name
+                  - name: TF_ENABLE_LOG
+                    value: '1'
+                volumeMounts:
+                  - name: worker-sock
+                    mountPath: /tensor-fusion/worker/sock
+                command:
+                  - /home/app/tensor-fusion-worker
+                  - -n
+                  - native
+                  - -p
+                  - '$(TENSOR_FUSION_WORKER_PORT)'
+                  - -a
+                  - '0x1129'
+                  - -l
+                  - /tensor-fusion/worker/sock/$(POD_NAME).sock
+    client:
+      operatorEndpoint: http://{{ include "tensor-fusion.fullname" . }}.{{ include "tensor-fusion.namespace" . }}:8080
+      patchToContainer:
+        volumeMounts:
+          - mountPath: /tensor-fusion
+            name: tf-libs
+        env:
+          - name: LD_PRELOAD
+            value: /tensor-fusion/libcuda.so
+          - name: TF_ENABLE_LOG
+            value: '1'
+      patchToPod:
+        spec:
+          volumes:
+            - name: tf-libs
+              emptyDir: {}
+          initContainers:
+            - name: inject-lib
+              image: tensorfusion/tensor-fusion-client:latest
+              command:
+                - sh
+                - -c
+                - cp /home/app/*.so /tensor-fusion/ && cp nvidia-smi-linux-amd64-550.54.15 /tensor-fusion/nvidia-smi
+              volumeMounts:
+                - mountPath: /tensor-fusion
+                  name: tf-libs
@@ -39,16 +39,23 @@ data:
           - prepare_metrics
         all_metrics: true
         metrics: []
+
+      prepare_controller_metrics:
+        type: remap
+        inputs: controller_metrics
+        source: |
+          .namespace = "tf"
+
     sinks:
       sink_greptimedb_hypervisor_metrics:
         type: greptimedb_metrics
         inputs:
           - log_to_metric
         new_naming: false
-        endpoint: {{ .Values.hypervisor.greptimedbEendpoint }}
+        endpoint: {{ .Values.hypervisor.greptimedbEndpoint }}
 
       sink_greptimedb_controller_metrics:
         type: prometheus_remote_write
         inputs:
-          - controller_metrics
-        endpoint: {{ .Values.controller.greptimedbEendpoint }}
+          - prepare_controller_metrics
+        endpoint: {{ .Values.controller.greptimedbEndpoint }}
@@ -20,7 +20,7 @@ serviceAccount:
   annotations: {}
 
 hypervisor:
-  greptimedbEendpoint: greptimedb-standalone.greptimedb.svc.cluster.local:4001
+  greptimedbEndpoint: greptimedb-standalone.greptimedb.svc.cluster.local:4001
   image:
     repository: tensorfusion/tensor-fusion-hypervisor
     # Overrides the image tag whose default is the chart appVersion.
@@ -51,7 +51,7 @@ controller:
   tolerations: []
   affinity: {}
 
-  greptimedbEendpoint: http://greptimedb-standalone.greptimedb.svc.cluster.local:4000/v1/prometheus/write?db=public
+  greptimedbEndpoint: http://greptimedb-standalone.greptimedb.svc.cluster.local:4000/v1/prometheus/write?db=public
   admissionWebhooks:
     failurePolicy: Fail
     secretName: tensor-fusion-webhook-secret
 
@@ -26,14 +26,13 @@ import (
 	// to ensure that exec-entrypoint and run can make use of them.
 	_ "k8s.io/client-go/plugin/pkg/client/auth"
 
-	tensorfusionaiv1 "github.com/NexusGPU/tensor-fusion-operator/api/v1"
+	tfv1 "github.com/NexusGPU/tensor-fusion-operator/api/v1"
 	"github.com/NexusGPU/tensor-fusion-operator/internal/config"
 	"github.com/NexusGPU/tensor-fusion-operator/internal/controller"
 	"github.com/NexusGPU/tensor-fusion-operator/internal/scheduler"
 	"github.com/NexusGPU/tensor-fusion-operator/internal/server"
 	"github.com/NexusGPU/tensor-fusion-operator/internal/server/router"
 	webhookcorev1 "github.com/NexusGPU/tensor-fusion-operator/internal/webhook/v1"
-	"github.com/NexusGPU/tensor-fusion-operator/internal/worker"
 	"k8s.io/apimachinery/pkg/runtime"
 	utilruntime "k8s.io/apimachinery/pkg/util/runtime"
 	clientgoscheme "k8s.io/client-go/kubernetes/scheme"
@@ -55,7 +54,7 @@ var (
 func init() {
 	utilruntime.Must(clientgoscheme.AddToScheme(scheme))
 
-	utilruntime.Must(tensorfusionaiv1.AddToScheme(scheme))
+	utilruntime.Must(tfv1.AddToScheme(scheme))
 	// +kubebuilder:scaffold:scheme
 }
 
@@ -65,10 +64,8 @@ func main() {
 	var probeAddr string
 	var secureMetrics bool
 	var enableHTTP2 bool
-	var configFile string
 	var tlsOpts []func(*tls.Config)
 
-	flag.StringVar(&configFile, "config", "/etc/tensor-fusion/config.yaml", "Config file of tensor-fusion-operator")
 	flag.StringVar(&metricsAddr, "metrics-bind-address", "0", "The address the metrics endpoint binds to. "+
 		"Use :8443 for HTTPS or :8080 for HTTP, or leave as 0 to disable the metrics service.")
 	flag.StringVar(&probeAddr, "health-probe-bind-address", ":8081", "The address the probe endpoint binds to.")
@@ -154,21 +151,14 @@ func main() {
 	}
 
 	ctx := context.Background()
-	config, err := config.LoadConfig(configFile)
-	if os.IsNotExist(err) {
-		setupLog.Info("config file is not exists, use default config", "configFile", configFile)
-	} else if err != nil {
-		setupLog.Error(err, "unable to load config", "configFile", configFile, "err", err)
-		os.Exit(1)
-	}
+	gpuPoolState := config.NewGpuPoolStateImpl()
+
 	scheduler := scheduler.NewNaiveScheduler()
 	if err = (&controller.TensorFusionConnectionReconciler{
-		Client:    mgr.GetClient(),
-		Scheme:    mgr.GetScheme(),
-		Scheduler: scheduler,
-		WorkerGenerator: &worker.WorkerGenerator{
-			WorkerConfig: &config.Worker,
-		},
+		Client:       mgr.GetClient(),
+		Scheme:       mgr.GetScheme(),
+		Scheduler:    scheduler,
+		GpuPoolState: gpuPoolState,
 	}).SetupWithManager(mgr); err != nil {
 		setupLog.Error(err, "unable to create controller", "controller", "TensorFusionConnection")
 		os.Exit(1)
@@ -185,7 +175,7 @@ func main() {
 
 	// nolint:goconst
 	if os.Getenv("ENABLE_WEBHOOKS") != "false" {
-		if err = webhookcorev1.SetupPodWebhookWithManager(mgr, &config.PodMutation); err != nil {
+		if err = webhookcorev1.SetupPodWebhookWithManager(mgr, gpuPoolState); err != nil {
 			setupLog.Error(err, "unable to create webhook", "webhook", "Pod")
 			os.Exit(1)
 		}
@@ -199,8 +189,9 @@ func main() {
 		os.Exit(1)
 	}
 	if err = (&controller.GPUPoolReconciler{
-		Client: mgr.GetClient(),
-		Scheme: mgr.GetScheme(),
+		Client:       mgr.GetClient(),
+		Scheme:       mgr.GetScheme(),
+		GpuPoolState: gpuPoolState,
 	}).SetupWithManager(mgr); err != nil {
 		setupLog.Error(err, "unable to create controller", "controller", "GPUPool")
 		os.Exit(1)
 
@@ -89,16 +89,18 @@ spec:
                 properties:
                   client:
                     properties:
-                      image:
+                      operatorEndpoint:
                         type: string
-                      podTemplateMergePatch:
-                        description: define how to inject the client pod
+                      patchToContainer:
                         type: object
                         x-kubernetes-preserve-unknown-fields: true
-                      port:
-                        type: integer
-                      protocol:
-                        type: string
+                      patchToPod:
+                        type: object
+                        x-kubernetes-preserve-unknown-fields: true
+                    required:
+                    - operatorEndpoint
+                    - patchToContainer
+                    - patchToPod
                     type: object
                   hypervisor:
                     properties:
@@ -110,15 +112,11 @@ spec:
                     type: object
                   worker:
                     properties:
-                      hostNetwork:
-                        type: boolean
-                      image:
-                        type: string
-                      port:
-                        type: integer
-                      workerPodTemplate:
+                      podTemplate:
                         type: object
                         x-kubernetes-preserve-unknown-fields: true
+                    required:
+                    - podTemplate
                     type: object
                 type: object
               nodeManagerConfig:
Original file line number	Diff line number	Diff line change
`@@ -8,10 +8,6 @@`
`8`	`8`	`"env": true`
`9`	`9`	`},`
`10`	`10`	`"target": {`
`11`		`- "namespace": "tensor-fusion",`
`12`		`- "path": {`
`13`		`- "deployment": "tensor-fusion-operator-controller-manager",`
`14`		`- "container": "manager"`
`15`		`- }`
	`11`	`+ "namespace": "tensor-fusion"`
`16`	`12`	`}`
`17`	`13`	`}`
Original file line number	Diff line number	Diff line change
`@@ -33,6 +33,7 @@ type Resources struct {`
`33`	`33`
`34`	`34`	`// TensorFusionConnectionSpec defines the desired state of TensorFusionConnection.`
`35`	`35`	`type TensorFusionConnectionSpec struct {`
	`36`	``+ PoolName string `json:"poolName"` ``
`36`	`37`	``Resources Resources `json:"resources"` ``
`37`	`38`	`}`
`38`	`39`