kube-nexus
diff --git a/‎pkg/plugins/backfill/backfill.go‎
Lines changed: 85 additions & 2 deletions b/‎pkg/plugins/backfill/backfill.go‎
Lines changed: 85 additions & 2 deletions
diff --git a/‎pkg/plugins/numatopology/numatopology.go‎
Lines changed: 117 additions & 1 deletion b/‎pkg/plugins/numatopology/numatopology.go‎
Lines changed: 117 additions & 1 deletion
diff --git a/‎pkg/plugins/preemption/gang_preemption.go‎
Lines changed: 31 additions & 1 deletion b/‎pkg/plugins/preemption/gang_preemption.go‎
Lines changed: 31 additions & 1 deletion
@@ -29,6 +29,9 @@ import (
 	"github.com/kube-nexus/kubenexus-scheduler/pkg/plugins/profileclassifier"
 )
 
+// GPUResourceName is the resource key used to look up GPU capacity in a node's
+// allocatable resources and GPU requests in container resource requests.
+const GPUResourceName = "nvidia.com/gpu"
+
 // BackfillScoring implements opportunistic scheduling to maximize cluster utilization
 // by allowing low-priority "backfill" pods to use idle resources that would otherwise
 // be wasted.
@@ -128,6 +131,7 @@ func (b *BackfillScoring) Score(ctx context.Context, state framework.CycleState,
 	// Calculate node resource utilization
 	allocatableCPU := float64(node.Status.Allocatable.Cpu().MilliValue())
 	allocatableMemory := float64(node.Status.Allocatable.Memory().Value())
+	allocatableGPU := node.Status.Allocatable[v1.ResourceName(GPUResourceName)]
 
 	if allocatableCPU == 0 || allocatableMemory == 0 {
 		// Node has no allocatable resources, return neutral score
@@ -151,13 +155,17 @@ func (b *BackfillScoring) Score(ctx context.Context, state framework.CycleState,
 
 	requestedCPU := float64(0)
 	requestedMemory := float64(0)
+	requestedGPU := float64(0)
 
 	// Only sum pods that are scheduled on THIS specific node
 	for _, podOnNode := range allPods {
 		if podOnNode.Spec.NodeName == node.Name {
 			for _, container := range podOnNode.Spec.Containers {
 				requestedCPU += float64(container.Resources.Requests.Cpu().MilliValue())
 				requestedMemory += float64(container.Resources.Requests.Memory().Value())
+				if gpu, ok := container.Resources.Requests[v1.ResourceName(GPUResourceName)]; ok {
+					requestedGPU += float64(gpu.Value())
+				}
 			}
 		}
 	}
@@ -166,8 +174,29 @@ func (b *BackfillScoring) Score(ctx context.Context, state framework.CycleState,
 	cpuUtilization := (requestedCPU / allocatableCPU) * 100.0
 	memoryUtilization := (requestedMemory / allocatableMemory) * 100.0
 
-	// Weighted average: 60% CPU, 40% Memory (CPU is typically more constrained)
-	utilization := (cpuUtilization * 0.6) + (memoryUtilization * 0.4)
+	// GPU utilization calculation (only if node has GPUs)
+	gpuUtilization := 0.0
+	if allocatableGPU.Value() > 0 {
+		gpuUtilization = (requestedGPU / float64(allocatableGPU.Value())) * 100.0
+	}
+
+	// Cap individual utilizations at 100%
+	if cpuUtilization > 100.0 {
+		cpuUtilization = 100.0
+	}
+	if memoryUtilization > 100.0 {
+		memoryUtilization = 100.0
+	}
+	if gpuUtilization > 100.0 {
+		gpuUtilization = 100.0
+	}
+
+	// Weighted average: 35% CPU, 35% Memory, 30% GPU (critical in GPU clusters)
+	// For nodes without GPUs, GPU utilization is 0 and doesn't affect score
+	utilization := (cpuUtilization * 0.35) + (memoryUtilization * 0.35)
+	if allocatableGPU.Value() > 0 {
+		utilization += (gpuUtilization * 0.30)
+	}
 
 	// Cap at 100% to handle overcommitted nodes
 	if utilization > 100.0 {
@@ -180,6 +209,14 @@ func (b *BackfillScoring) Score(ctx context.Context, state framework.CycleState,
 	// Determine if this is a backfill pod
 	isBackfillPod := b.getPreemptibilityFromProfile(state, pod)
 
+	// Get pod's tenant tier for tenant-aware scoring
+	tenantTier := b.getTenantTierFromProfile(state, pod)
+
+	// Apply tenant-aware adjustments
+	// Silver/Bronze backfill should avoid Gold-reserved resources
+	tenantAdjustment := b.calculateTenantAdjustment(tenantTier, node)
+	utilization += tenantAdjustment
+
 	if isBackfillPod {
 		// BACKFILL POD STRATEGY: Prefer nodes with MORE idle resources
 		// Score = idle% (0-100)
@@ -291,6 +328,52 @@ func (b *BackfillScoring) isBackfillEligible(pod *v1.Pod) bool {
 	return false
 }
 
+// getTenantTierFromProfile gets pod's tenant tier from ProfileClassifier
+func (b *BackfillScoring) getTenantTierFromProfile(state framework.CycleState, pod *v1.Pod) string {
+	profile, err := profileclassifier.GetProfile(&state)
+	if err == nil && profile != nil {
+		return string(profile.TenantTier)
+	}
+	// Default to bronze if ProfileClassifier not available
+	return "bronze"
+}
+
+// calculateTenantAdjustment applies tenant-aware penalty to backfill pods
+// Silver/Bronze backfill pods get penalty for using Gold-reserved resources
+func (b *BackfillScoring) calculateTenantAdjustment(tenantTier string, node *v1.Node) float64 {
+	// Check if node is reserved for a specific tenant tier
+	if node.Labels == nil {
+		return 0
+	}
+
+	reservedTier, ok := node.Labels["tenant.kubenexus.io/reserved-tier"]
+	if !ok {
+		return 0 // No reservation, no adjustment
+	}
+
+	// Tenant hierarchy: gold > silver > bronze
+	tierPriority := map[string]int{
+		"gold":   3,
+		"silver": 2,
+		"bronze": 1,
+	}
+
+	podPriority := tierPriority[tenantTier]
+	nodePriority := tierPriority[reservedTier]
+
+	// If pod tier is lower than node's reserved tier, apply penalty
+	if podPriority < nodePriority {
+		penalty := float64(nodePriority-podPriority) * 15.0 // 15-30 point penalty
+		klog.V(4).InfoS("BackfillScoring: applying tenant tier penalty",
+			"podTier", tenantTier,
+			"nodeReservedTier", reservedTier,
+			"penalty", penalty)
+		return -penalty
+	}
+
+	return 0 // No penalty if tier matches or exceeds
+}
+
 // New initializes a new BackfillScoring plugin and returns it.
 func New(_ context.Context, _ runtime.Object, handle framework.Handle) (framework.Plugin, error) {
 	podLister := handle.SharedInformerFactory().Core().V1().Pods().Lister()
 
@@ -134,6 +134,10 @@ const (
 	WeightMemoryBandwidth = 0.25 // 25% weight for memory bandwidth availability
 	WeightNUMADistance    = 0.20 // 20% weight for NUMA distance/latency
 	WeightGangAffinity    = 0.15 // 15% weight for gang member affinity
+
+	// GPU-NUMA co-alignment
+	BonusGPUNUMACoLocation = 15 // Bonus when GPU and CPU are on same NUMA
+	PenaltyGPUNUMAMismatch = 25 // Penalty when GPU and CPU on different NUMA
 )
 
 // NUMANode represents a single NUMA node on a server
@@ -215,9 +219,19 @@ func (n *NUMATopology) Filter(ctx context.Context, state framework.CycleState, p
 	// Calculate pod resource requirements
 	podCPU, podMemory := n.getPodResourceRequests(pod)
 
+	// Get GPU-NUMA mapping if pod requests GPUs
+	gpuNUMAMapping := n.getGPUNUMAMapping(node, pod)
+
 	// Check if pod fits in any single NUMA node
 	for _, numa := range numaNodes {
 		if numa.AvailableCPUs >= int(podCPU) && numa.AvailableMemory >= podMemory {
+			// If pod requests GPUs, verify co-alignment
+			if len(gpuNUMAMapping) > 0 && !n.canFitGPUsInNUMA(pod, &numa, gpuNUMAMapping) {
+				klog.V(5).InfoS("NUMATopology: skipping NUMA due to GPU-NUMA mismatch",
+					"pod", klog.KObj(pod), "numaNode", numa.ID, "node", node.Name)
+				continue
+			}
+
 			klog.V(4).InfoS("NUMATopology: pod fits in NUMA node",
 				"pod", klog.KObj(pod), "cpu", podCPU, "memoryGB", podMemory/(1024*1024*1024), "numaNode", numa.ID, "node", node.Name)
 			return framework.NewStatus(framework.Success, "")
@@ -282,6 +296,9 @@ func (n *NUMATopology) Score(ctx context.Context, state framework.CycleState, po
 	// Get NUMA affinity preferences
 	preferredNUMAs, avoidNUMAs := n.getNUMAAffinityPreferences(pod)
 
+	// Get GPU-NUMA mapping
+	gpuNUMAMapping := n.getGPUNUMAMapping(node, pod)
+
 	// Find best NUMA node fit
 	var bestScore float64
 	bestNUMAID := -1
@@ -345,11 +362,22 @@ func (n *NUMATopology) Score(ctx context.Context, state framework.CycleState, po
 		// 4. GANG AFFINITY SCORE (15%)
 		gangScore = n.calculateGangAffinityScore(pod, numa, node)
 
+		// 5. GPU-NUMA CO-ALIGNMENT BONUS (applied as adjustment)
+		gpuBonus := float64(n.calculateGPUNUMABonus(pod, &numa, gpuNUMAMapping))
+
 		// Calculate weighted total score
 		totalScore := (fitScore * WeightNUMAFit) +
 			(memBandwidthScore * WeightMemoryBandwidth) +
 			(distanceScore * WeightNUMADistance) +
-			(gangScore * WeightGangAffinity)
+			(gangScore * WeightGangAffinity) +
+			gpuBonus
+
+		// Cap score at 100
+		if totalScore > 100.0 {
+			totalScore = 100.0
+		} else if totalScore < 0 {
+			totalScore = 0
+		}
 
 		if totalScore > bestScore {
 			bestScore = totalScore
@@ -808,3 +836,91 @@ func categorizePressure(utilization float64) string {
 		return "low"
 	}
 }
+
+// getGPUNUMAMapping extracts GPU-to-NUMA node mapping from node labels
+// Returns map[gpuIndex]numaNodeID
+// Supports labels like: gpu.kubenexus.io/numa-node-0=0, gpu.kubenexus.io/numa-node-1=1
+func (n *NUMATopology) getGPUNUMAMapping(node *v1.Node, pod *v1.Pod) map[int]int {
+	mapping := make(map[int]int)
+
+	if node.Labels == nil {
+		return mapping
+	}
+
+	// Try to extract GPU count and NUMA mappings
+	for i := 0; i < 16; i++ { // Support up to 16 GPUs
+		label := fmt.Sprintf("gpu.kubenexus.io/numa-node-%d", i)
+		if numaStr, ok := node.Labels[label]; ok {
+			if numaID, err := strconv.Atoi(numaStr); err == nil {
+				mapping[i] = numaID
+			}
+		}
+	}
+
+	return mapping
+}
+
+// canFitGPUsInNUMA checks if requested GPUs can be placed on the target NUMA node
+func (n *NUMATopology) canFitGPUsInNUMA(pod *v1.Pod, numa *NUMANode, gpuNUMAMapping map[int]int) bool {
+	// Get GPU request count
+	gpusRequested := 0
+	for _, container := range pod.Spec.Containers {
+		if gpu, ok := container.Resources.Requests[v1.ResourceName("nvidia.com/gpu")]; ok {
+			gpusRequested += int(gpu.Value())
+		}
+	}
+
+	if gpusRequested == 0 {
+		return true // No GPU request, allow
+	}
+
+	if len(gpuNUMAMapping) == 0 {
+		// No GPU-NUMA mapping info, allow (assume kubelet will handle)
+		return true
+	}
+
+	// Count how many requested GPUs can fit on target NUMA node
+	gpusOnNUMA := 0
+	for _, numaID := range gpuNUMAMapping {
+		if numaID == numa.ID {
+			gpusOnNUMA++
+		}
+	}
+
+	// Check if we have enough GPUs on this NUMA node
+	return gpusOnNUMA >= gpusRequested
+}
+
+// calculateGPUNUMABonus calculates bonus/penalty for GPU-NUMA alignment
+func (n *NUMATopology) calculateGPUNUMABonus(pod *v1.Pod, numa *NUMANode, gpuNUMAMapping map[int]int) int64 {
+	gpusRequested := 0
+	for _, container := range pod.Spec.Containers {
+		if gpu, ok := container.Resources.Requests[v1.ResourceName("nvidia.com/gpu")]; ok {
+			gpusRequested += int(gpu.Value())
+		}
+	}
+
+	if gpusRequested == 0 {
+		return 0 // No GPU request, no bonus
+	}
+
+	if len(gpuNUMAMapping) == 0 {
+		return 0 // No GPU-NUMA mapping info, no bonus
+	}
+
+	// Count GPUs on target NUMA node
+	gpusOnNUMA := 0
+	for _, numaID := range gpuNUMAMapping {
+		if numaID == numa.ID {
+			gpusOnNUMA++
+		}
+	}
+
+	// If all requested GPUs are on same NUMA as CPUs, give bonus
+	if gpusOnNUMA >= gpusRequested {
+		return BonusGPUNUMACoLocation
+	}
+
+	// If some GPUs are on different NUMA, apply penalty
+	return -PenaltyGPUNUMAMismatch
+}
@@ -100,14 +100,20 @@ func (gp *GangPreemption) PostFilter(ctx context.Context, state framework.CycleS
 
 	klog.V(3).InfoS("GangPreemption: found victim pods to preempt for gang", "victimCount", len(victims), "namespace", pod.Namespace, "podGroup", podGroupName)
 
+	// Mark victims with preemption annotation for ResourceReservation coordination
+	// This ensures atomicity by preventing other pods from stealing freed capacity
+	for _, victim := range victims {
+		gp.markVictimForPreemption(victim, podGroupName)
+	}
+
 	// Create the preemption result
 	nominatedNodeName := gp.selectNominatedNode(victims, nodeInfos)
 
 	return &framework.PostFilterResult{
 		NominatingInfo: &framework.NominatingInfo{
 			NominatedNodeName: nominatedNodeName,
 		},
-	}, framework.NewStatus(framework.Success, fmt.Sprintf("preempting %d pods", len(victims)))
+	}, framework.NewStatus(framework.Success, fmt.Sprintf("preempting %d pods to benefit gang %s", len(victims), podGroupName))
 }
 
 // ResourceRequirements represents the total resources needed by a gang
@@ -378,6 +384,30 @@ func getTierPriority(tier string) int {
 	}
 }
 
+// markVictimForPreemption records that pod is being preempted on behalf of the
+// gang named ganGroupName.
+//
+// The preemption annotations are written to a deep copy of the pod, and that
+// copy is discarded when the function returns. The caller's pod is not
+// modified and nothing is sent to the API server, so the only observable
+// effect today is a V(5) log line. A nil pod is ignored. Pods without any
+// existing annotations are handled like any other victim.
+func (gp *GangPreemption) markVictimForPreemption(pod *v1.Pod, ganGroupName string) {
+	if pod == nil {
+		return
+	}
+
+	// Work on a copy so the shared informer object is never mutated.
+	podCopy := pod.DeepCopy()
+	if podCopy.Annotations == nil {
+		podCopy.Annotations = make(map[string]string)
+	}
+
+	// Annotations intended for ResourceReservation coordination.
+	podCopy.Annotations["scheduling.kubenexus.io/preemption-for-gang"] = ganGroupName
+	podCopy.Annotations["scheduling.kubenexus.io/preemption-timestamp"] = fmt.Sprintf("%d", time.Now().Unix())
+
+	// Note: In a real implementation, we would patch the pod in the API server
+	// For now, we just log this for observability
+	klog.V(5).InfoS("Marked victim pod for preemption",
+		"pod", klog.KObj(pod),
+		"gang", ganGroupName)
+}
+
 // New creates a new GangPreemption plugin
 func New(_ context.Context, _ runtime.Object, handle framework.Handle) (framework.Plugin, error) {
 	podLister := handle.SharedInformerFactory().Core().V1().Pods().Lister()