NexusGPU
diff --git a/‎cmd/operator/main.go‎
Lines changed: 1 addition & 1 deletion b/‎cmd/operator/main.go‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎go.mod‎
Lines changed: 5 additions & 1 deletion b/‎go.mod‎
Lines changed: 5 additions & 1 deletion
diff --git a/‎go.sum‎
Lines changed: 5 additions & 0 deletions b/‎go.sum‎
Lines changed: 5 additions & 0 deletions
diff --git a/‎internal/constants/constants.go‎
Lines changed: 2 additions & 1 deletion b/‎internal/constants/constants.go‎
Lines changed: 2 additions & 1 deletion
diff --git a/‎internal/controller/gpu_controller.go‎
Lines changed: 61 additions & 21 deletions b/‎internal/controller/gpu_controller.go‎
Lines changed: 61 additions & 21 deletions
diff --git a/‎internal/controller/gpu_controller_test.go‎
Lines changed: 21 additions & 5 deletions b/‎internal/controller/gpu_controller_test.go‎
Lines changed: 21 additions & 5 deletions
diff --git a/‎internal/controller/tensorfusionconnection_controller.go‎
Lines changed: 2 additions & 2 deletions b/‎internal/controller/tensorfusionconnection_controller.go‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎internal/controller/tensorfusionconnection_controller_test.go‎
Lines changed: 4 additions & 3 deletions b/‎internal/controller/tensorfusionconnection_controller_test.go‎
Lines changed: 4 additions & 3 deletions
diff --git a/‎internal/scheduler/filter.go‎
Lines changed: 78 additions & 0 deletions b/‎internal/scheduler/filter.go‎
Lines changed: 78 additions & 0 deletions
@@ -154,7 +154,7 @@ func main() {
 
 	ctx := context.Background()
 
-	scheduler := scheduler.NewNaiveScheduler()
+	scheduler := scheduler.NewScheduler(mgr.GetClient())
 	if err = (&controller.TensorFusionConnectionReconciler{
 		Client:    mgr.GetClient(),
 		Scheme:    mgr.GetScheme(),
 
@@ -14,6 +14,8 @@ require (
 	github.com/onsi/gomega v1.36.2
 	github.com/prometheus/client_golang v1.20.5
 	github.com/samber/lo v1.47.0
+	github.com/shirou/gopsutil v3.21.11+incompatible
+	github.com/stretchr/testify v1.10.0
 	golang.org/x/exp v0.0.0-20241217172543-b2144cdd0a67
 	gomodules.xyz/jsonpatch/v2 v2.4.0
 	k8s.io/api v0.32.1
@@ -53,6 +55,7 @@ require (
 	github.com/go-logr/logr v1.4.2 // indirect
 	github.com/go-logr/stdr v1.2.2 // indirect
 	github.com/go-logr/zapr v1.3.0 // indirect
+	github.com/go-ole/go-ole v1.2.6 // indirect
 	github.com/go-openapi/jsonpointer v0.21.0 // indirect
 	github.com/go-openapi/jsonreference v0.21.0 // indirect
 	github.com/go-openapi/swag v0.23.0 // indirect
@@ -85,16 +88,17 @@ require (
 	github.com/opentracing/opentracing-go v1.2.1-0.20220228012449-10b1cf09e00b // indirect
 	github.com/pelletier/go-toml/v2 v2.2.3 // indirect
 	github.com/pkg/errors v0.9.1 // indirect
+	github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect
 	github.com/prometheus/client_model v0.6.1 // indirect
 	github.com/prometheus/common v0.61.0 // indirect
 	github.com/prometheus/procfs v0.15.1 // indirect
-	github.com/shirou/gopsutil v3.21.11+incompatible // indirect
 	github.com/spf13/cobra v1.8.1 // indirect
 	github.com/spf13/pflag v1.0.5 // indirect
 	github.com/stoewer/go-strcase v1.3.0 // indirect
 	github.com/twitchyliquid64/golang-asm v0.15.1 // indirect
 	github.com/ugorji/go/codec v1.2.12 // indirect
 	github.com/x448/float16 v0.8.4 // indirect
+	github.com/yusufpapurcu/wmi v1.2.4 // indirect
 	go.opentelemetry.io/auto/sdk v1.1.0 // indirect
 	go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.58.0 // indirect
 	go.opentelemetry.io/otel v1.33.0 // indirect
 
@@ -78,6 +78,8 @@ github.com/go-logr/stdr v1.2.2 h1:hSWxHoqTgW2S2qGc0LTAI563KZ5YKYRhT3MFKZMbjag=
 github.com/go-logr/stdr v1.2.2/go.mod h1:mMo/vtBO5dYbehREoey6XUKy/eSumjCCveDpRre4VKE=
 github.com/go-logr/zapr v1.3.0 h1:XGdV8XW8zdwFiwOA2Dryh1gj2KRQyOOoNmBy4EplIcQ=
 github.com/go-logr/zapr v1.3.0/go.mod h1:YKepepNBd1u/oyhd/yQmtjVXmm9uML4IXUgMOwR8/Gg=
+github.com/go-ole/go-ole v1.2.6 h1:/Fpf6oFPoeFik9ty7siob0G6Ke8QvQEuVcuChpwXzpY=
+github.com/go-ole/go-ole v1.2.6/go.mod h1:pprOEPIfldk/42T2oK7lQ4v4JSDwmV0As9GaiUsvbm0=
 github.com/go-openapi/jsonpointer v0.21.0 h1:YgdVicSA9vH5RiHs9TZW5oyafXZFc6+2Vc1rr/O9oNQ=
 github.com/go-openapi/jsonpointer v0.21.0/go.mod h1:IUyH9l/+uyhIYQ/PXVA41Rexl+kOkAPDdXEYns6fzUY=
 github.com/go-openapi/jsonreference v0.21.0 h1:Rs+Y7hSXT83Jacb7kFyjn4ijOuVGSvOdF2+tg1TRrwQ=
@@ -218,6 +220,8 @@ github.com/x448/float16 v0.8.4 h1:qLwI1I70+NjRFUR3zs1JPUCgaCXSh3SW62uAKT1mSBM=
 github.com/x448/float16 v0.8.4/go.mod h1:14CWIYCyZA/cWjXOioeEpHeN/83MdbZDRQHoFcYsOfg=
 github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74=
 github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74=
+github.com/yusufpapurcu/wmi v1.2.4 h1:zFUKzehAFReQwLys1b/iSMl+JQGSCSjtVqQn9bBrPo0=
+github.com/yusufpapurcu/wmi v1.2.4/go.mod h1:SBZ9tNy3G9/m5Oi98Zks0QjeHVDvuK0qfxQmPyzfmi0=
 go.opentelemetry.io/auto/sdk v1.1.0 h1:cH53jehLUN6UFLY71z+NDOiNJqDdPRaXzTel0sJySYA=
 go.opentelemetry.io/auto/sdk v1.1.0/go.mod h1:3wSPjt5PWp2RhlCcmmOial7AvC4DQqZb7a7wCow3W8A=
 go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.58.0 h1:yd02MEjBdJkG3uabWP9apV+OuWRIXGDuJEUJbOHmCFU=
@@ -284,6 +288,7 @@ golang.org/x/sync v0.10.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk=
 golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
 golang.org/x/sys v0.0.0-20190312061237-fead79001313/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
 golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
+golang.org/x/sys v0.0.0-20190916202348-b4ddaad3f8a3/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
 golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
 golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
 golang.org/x/sys v0.28.0 h1:Fksou7UEQUWlKvIdsqzJmUmCX3cZuD2+P3XyyzwMhlA=
 
@@ -23,8 +23,9 @@ const (
 
 	GPULastReportTimeAnnotationKey = Domain + "/last-sync"
 
+	GpuPoolKey = Domain + "/gpupool"
+
 	// Annotation key constants
-	GpuPoolAnnotationKey = Domain + "/gpupool"
 	// %s -> container_name
 	TFLOPSRequestAnnotationFormat = Domain + "/tflops-request-%s"
 	VRAMRequestAnnotationFormat   = Domain + "/vram-request-%s"
 
@@ -18,15 +18,18 @@ package controller
 
 import (
 	"context"
+	"fmt"
+	"strings"
 
+	tfv1 "github.com/NexusGPU/tensor-fusion-operator/api/v1"
+	"github.com/NexusGPU/tensor-fusion-operator/internal/constants"
+	scheduler "github.com/NexusGPU/tensor-fusion-operator/internal/scheduler"
+	"github.com/samber/lo"
+	"k8s.io/apimachinery/pkg/api/errors"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/apimachinery/pkg/runtime"
 	ctrl "sigs.k8s.io/controller-runtime"
 	"sigs.k8s.io/controller-runtime/pkg/client"
-	"sigs.k8s.io/controller-runtime/pkg/event"
-	"sigs.k8s.io/controller-runtime/pkg/predicate"
-
-	tfv1 "github.com/NexusGPU/tensor-fusion-operator/api/v1"
-	scheduler "github.com/NexusGPU/tensor-fusion-operator/internal/scheduler"
 )
 
 // GPUReconciler reconciles a GPU object
@@ -43,6 +46,59 @@ type GPUReconciler struct {
 // Reconcile is part of the main kubernetes reconciliation loop which aims to
 // move the current state of the cluster closer to the desired state.
 func (r *GPUReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
+	gpu := &tfv1.GPU{}
+	if err := r.Get(ctx, req.NamespacedName, gpu); err != nil {
+		if errors.IsNotFound(err) {
+			return ctrl.Result{}, nil
+		}
+		return ctrl.Result{}, err
+	}
+
+	// Find the owner reference whose kind and apiVersion match a registered GPUNode GVK
+	kgvs, _, err := r.Scheme.ObjectKinds(&tfv1.GPUNode{})
+	if err != nil {
+		return ctrl.Result{}, fmt.Errorf("get object kinds for GPUNode: %w", err)
+	}
+	owner, ok := lo.Find(gpu.OwnerReferences, func(or metav1.OwnerReference) bool {
+		for _, kvg := range kgvs {
+			if kvg.Kind == or.Kind && fmt.Sprintf("%s/%s", kvg.Group, kvg.Version) == or.APIVersion {
+				return true
+			}
+		}
+		return false
+	})
+	if !ok {
+		return ctrl.Result{}, fmt.Errorf("owner node %s not found", gpu.Name)
+	}
+
+	gpunode := &tfv1.GPUNode{}
+	if err := r.Get(ctx, client.ObjectKey{Name: owner.Name}, gpunode); err != nil {
+		return ctrl.Result{}, fmt.Errorf("get node %s: %w", owner.Name, err)
+	}
+
+	// Pool name = suffix of a label key with the pool prefix; map order is random if several match
+	var poolName string
+	for labelKey := range gpunode.Labels {
+		if after, ok := strings.CutPrefix(labelKey, constants.GPUNodePoolIdentifierLabelPrefix); ok {
+			poolName = after
+			break
+		}
+	}
+	if poolName == "" {
+		return ctrl.Result{}, fmt.Errorf("node %s is not assigned to any pool", gpunode.Name)
+	}
+
+	// Skip the write when the label is already correct (indexing a nil map is safe)
+	if gpu.Labels[constants.GpuPoolKey] == poolName {
+		return ctrl.Result{}, nil
+	}
+	if gpu.Labels == nil {
+		gpu.Labels = make(map[string]string)
+	}
+	gpu.Labels[constants.GpuPoolKey] = poolName
+	if err := r.Update(ctx, gpu); err != nil {
+		return ctrl.Result{}, fmt.Errorf("update gpu %s: %w", gpu.Name, err)
+	}
 	return ctrl.Result{}, nil
 }
 
@@ -51,21 +107,5 @@ func (r *GPUReconciler) SetupWithManager(ctx context.Context, mgr ctrl.Manager)
 	return ctrl.NewControllerManagedBy(mgr).
 		For(&tfv1.GPU{}).
 		Named("gpu").
-		WithEventFilter(
-			predicate.Funcs{
-				CreateFunc: func(e event.CreateEvent) bool {
-					r.Scheduler.OnAdd(e.Object.(*tfv1.GPU))
-					return true
-				},
-				UpdateFunc: func(e event.UpdateEvent) bool {
-					r.Scheduler.OnUpdate(e.ObjectOld.(*tfv1.GPU), e.ObjectNew.(*tfv1.GPU))
-					return true
-				},
-				DeleteFunc: func(e event.DeleteEvent) bool {
-					r.Scheduler.OnDelete(e.Object.(*tfv1.GPU))
-					return true
-				},
-			},
-		).
 		Complete(r)
 }
@@ -18,13 +18,17 @@ package controller
 
 import (
 	"context"
+	"fmt"
 
 	tfv1 "github.com/NexusGPU/tensor-fusion-operator/api/v1"
+	"github.com/NexusGPU/tensor-fusion-operator/internal/constants"
 	. "github.com/onsi/ginkgo/v2"
 	. "github.com/onsi/gomega"
 	"k8s.io/apimachinery/pkg/api/errors"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/apimachinery/pkg/types"
+	"k8s.io/client-go/kubernetes/scheme"
+	"sigs.k8s.io/controller-runtime/pkg/controller/controllerutil"
 	"sigs.k8s.io/controller-runtime/pkg/reconcile"
 )
 
@@ -36,11 +40,23 @@ var _ = Describe("GPU Controller", func() {
 
 		typeNamespacedName := types.NamespacedName{
 			Name:      resourceName,
-			Namespace: "default", // TODO(user):Modify as needed
+			Namespace: "default",
 		}
 		gpu := &tfv1.GPU{}
+		gpunode := &tfv1.GPUNode{}
 
 		BeforeEach(func() {
+			By("creating the custom resource for the Kind GPUNode")
+			gpunode = &tfv1.GPUNode{
+				ObjectMeta: metav1.ObjectMeta{
+					Name: resourceName + "-node",
+					Labels: map[string]string{
+						fmt.Sprintf(constants.GPUNodePoolIdentifierLabelFormat, "mock"): "true",
+					},
+				},
+			}
+			Expect(k8sClient.Create(ctx, gpunode)).To(Succeed())
+
 			By("creating the custom resource for the Kind GPU")
 			err := k8sClient.Get(ctx, typeNamespacedName, gpu)
 			if err != nil && errors.IsNotFound(err) {
@@ -49,20 +65,22 @@ var _ = Describe("GPU Controller", func() {
 						Name:      resourceName,
 						Namespace: "default",
 					},
-					// TODO(user): Specify other spec details if needed.
 				}
+				Expect(controllerutil.SetControllerReference(gpunode, resource, scheme.Scheme)).To(Succeed())
 				Expect(k8sClient.Create(ctx, resource)).To(Succeed())
 			}
 		})
 
 		AfterEach(func() {
-			// TODO(user): Cleanup logic after each test, like removing the resource instance.
 			resource := &tfv1.GPU{}
 			err := k8sClient.Get(ctx, typeNamespacedName, resource)
 			Expect(err).NotTo(HaveOccurred())
 
 			By("Cleanup the specific resource instance GPU")
 			Expect(k8sClient.Delete(ctx, resource)).To(Succeed())
+
+			By("Cleanup the specific resource instance GPUNode")
+			Expect(k8sClient.Delete(ctx, gpunode)).To(Succeed())
 		})
 		It("should successfully reconcile the resource", func() {
 			By("Reconciling the created resource")
@@ -75,8 +93,6 @@ var _ = Describe("GPU Controller", func() {
 				NamespacedName: typeNamespacedName,
 			})
 			Expect(err).NotTo(HaveOccurred())
-			// TODO(user): Add more specific assertions depending on your controller's reconciliation logic.
-			// Example: If you expect a certain status condition after reconciliation, verify it here.
 		})
 	})
 })
@@ -81,7 +81,7 @@ func (r *TensorFusionConnectionReconciler) Reconcile(ctx context.Context, req ct
 	if connection.Status.Phase == "" || connection.Status.Phase == tfv1.TensorFusionConnectionPending {
 		// Try to get an available gpu from scheduler
 		var err error
-		gpu, err = r.Scheduler.Schedule(connection.Spec.Resources.Requests)
+		gpu, err = r.Scheduler.Schedule(ctx, connection.Spec.PoolName, connection.Spec.Resources.Requests)
 		if err != nil {
 			log.Error(err, "Failed to schedule gpu instance")
 			connection.Status.Phase = tfv1.TensorFusionConnectionPending
@@ -186,7 +186,7 @@ func (r *TensorFusionConnectionReconciler) handleDeletion(ctx context.Context, c
 	}
 
 	// Release the resources
-	if err := r.Scheduler.Release(connection.Spec.Resources.Requests, gpu); err != nil {
+	if err := r.Scheduler.Release(ctx, connection.Spec.Resources.Requests, gpu); err != nil {
 		return false, err
 	}
 
 
@@ -44,10 +44,12 @@ var _ = Describe("TensorFusionConnection Controller", func() {
 			Name:      resourceName,
 			Namespace: "default",
 		}
-		scheduler := scheduler.NewNaiveScheduler()
 		gpu := &tfv1.GPU{
 			ObjectMeta: metav1.ObjectMeta{
 				Name: "mock-gpu",
+				Labels: map[string]string{
+					constants.GpuPoolKey: "mock",
+				},
 			},
 		}
 		BeforeEach(func() {
@@ -77,7 +79,6 @@ var _ = Describe("TensorFusionConnection Controller", func() {
 				Expect(k8sClient.Create(ctx, resource)).To(Succeed())
 			}
 
-			scheduler.OnAdd(gpu)
 			Expect(k8sClient.Create(ctx, gpu)).To(Succeed())
 			gpu.Status = tfv1.GPUStatus{
 				Phase: tfv1.TensorFusionGPUPhaseRunning,
@@ -111,7 +112,7 @@ var _ = Describe("TensorFusionConnection Controller", func() {
 			controllerReconciler := &TensorFusionConnectionReconciler{
 				Client:    k8sClient,
 				Scheme:    k8sClient.Scheme(),
-				Scheduler: scheduler,
+				Scheduler: scheduler.NewScheduler(k8sClient),
 			}
 			_, err := controllerReconciler.Reconcile(ctx, reconcile.Request{
 				NamespacedName: typeNamespacedName,
 
@@ -0,0 +1,78 @@
+package scheduler
+
+import (
+	"context"
+
+	tfv1 "github.com/NexusGPU/tensor-fusion-operator/api/v1"
+)
+
+// GPUFilter is implemented by anything that can narrow down a list of candidate GPUs.
+type GPUFilter interface {
+	// Filter returns the GPUs from gpus that pass this filter's criteria, or an error.
+	// Implementations should not modify the input slice.
+	Filter(ctx context.Context, gpus []tfv1.GPU) ([]tfv1.GPU, error)
+}
+
+// FilterRegistry is a chain of GPU filters: each registry holds the filters added at
+// its own level plus a link to the registry it was derived from via With.
+type FilterRegistry struct {
+	parent  *FilterRegistry // Registry this one was derived from; nil for a root registry
+	filters []GPUFilter     // Filters added at this level only (not the parent's)
+}
+
+// NewFilterRegistry returns an empty root registry: it has no parent and no filters.
+func NewFilterRegistry() *FilterRegistry {
+	return &FilterRegistry{
+		parent:  nil,
+		filters: []GPUFilter{},
+	}
+}
+
+// With returns a new FilterRegistry that applies fr's filters followed by the given ones.
+// fr is left untouched; when no filters are passed, fr itself is returned.
+func (fr *FilterRegistry) With(filters ...GPUFilter) *FilterRegistry {
+	if len(filters) == 0 {
+		return fr
+	}
+
+	// Copy the variadic slice: a caller passing s... may mutate s later, which must not alter the registry
+	return &FilterRegistry{
+		parent:  fr,
+		filters: append([]GPUFilter(nil), filters...),
+	}
+}
+
+// Apply runs gpus through the parent chain's filters first, then through this level's filters,
+// returning the survivors; it stops at the first filter error or as soon as no GPUs remain.
+func (fr *FilterRegistry) Apply(ctx context.Context, gpus []tfv1.GPU) ([]tfv1.GPU, error) {
+	// Run the ancestors' filters first (recursively), so parent filters precede this level's
+	filteredGPUs := gpus
+	var err error
+
+	if fr.parent != nil {
+		filteredGPUs, err = fr.parent.Apply(ctx, filteredGPUs)
+		if err != nil {
+			return nil, err
+		}
+
+		// Nothing survived the parent filters: skip this level entirely
+		if len(filteredGPUs) == 0 {
+			return filteredGPUs, nil
+		}
+	}
+
+	// Then run the filters held at this level, in slice order
+	for _, filter := range fr.filters {
+		filteredGPUs, err = filter.Filter(ctx, filteredGPUs)
+		if err != nil {
+			return nil, err
+		}
+
+		// Nothing left to filter: stop before calling the remaining filters
+		if len(filteredGPUs) == 0 {
+			return filteredGPUs, nil
+		}
+	}
+
+	return filteredGPUs, nil
+}