Skip to content

Commit a5b8f3d

Browse files
fix: critical logic bugs across plugins - hard rejection filters, metrics, topology checks (#23)
* fix: critical logic bugs across plugins - hard rejection filters, metrics, topology checks

## Critical Logic Fixes

### ResourceFragmentationScore
- Add Filter plugin to hard-reject nodes where GPU or tenant tier requirements aren't met
- Fix pristine island label override (label now properly overrides allocation calculation)
- Add structured logging (klog.InfoS) to Filter for visibility

### Coscheduling
- Fix GangCompletionLatency metric (was always ~0, now uses gang submission timestamp)
- Fix pod group label mismatch between calculateTotalPods and calculateRunningPodsExcluding

### NUMATopology
- Add thread-safety to gangState map with RWMutex
- Fix NUMA fit score formula (inverted - penalized empty nodes, now prefers remaining capacity)

### ResourceReservation
- Fix isGangComplete to skip terminating pods and check PodRunning phase

### VRAMScheduler
- Add 5-second timeout to DRA ResourceSlice queries (prevents scheduler stalls)
- Fix GPU default logic (only defaults to 1 GPU if VRAM explicitly requested)

### GitHub Actions
- Fix security workflow permissions for CodeQL SARIF uploads

* chore: add gitignore rules to prevent root binaries being committed

- Exclude /scheduler and /webhook from root (should use /bin folder)
- Ensures binaries are only built to bin/ directory going forward

* fix: add mutex locking to gangState access in NUMATopology

- Wrap gangState reads with RLock in calculateGangAffinityScore
- Wrap gangState reads/writes with Lock/Unlock in recordGangPlacement
- Fixes unused field linting error by actually using the mutex
1 parent 72a63f2 commit a5b8f3d

File tree

6 files changed

+118
-27
lines changed

6 files changed

+118
-27
lines changed

.gitignore

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,9 +6,11 @@
66
*.out
77
coverage.txt
88

9-
# Binaries in root directory
9+
# Binaries in root directory (should be in /bin only)
1010
kubenexus-scheduler
1111
kubenexus-webhook
12+
/scheduler
13+
/webhook
1214

1315
# used for the code generators only
1416
/vendor/

pkg/plugins/coscheduling/coscheduling.go

Lines changed: 19 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -297,7 +297,15 @@ func (cs *Coscheduling) Permit(ctx context.Context, state framework.CycleState,
297297
klog.V(3).InfoS("Permit: pod group ready to schedule",
298298
"namespace", namespace, "podGroup", podGroupName, "current", current, "minAvailable", minAvailable)
299299
schedulermetrics.GangSchedulingDecisions.WithLabelValues("success", namespace).Inc()
300-
schedulermetrics.GangCompletionLatency.WithLabelValues(namespace, podGroupName, fmt.Sprintf("%d", minAvailable)).Observe(time.Since(time.Now()).Seconds())
300+
301+
// Record gang completion latency from initial submission time
302+
key := utils.GetPodGroupKey(namespace, podGroupName)
303+
if pgInfoVal, ok := cs.podGroupInfos.Load(key); ok {
304+
if pgInfo, ok := pgInfoVal.(*PodGroupInfo); ok && pgInfo.timestamp.Unix() > 0 {
305+
age := time.Since(pgInfo.timestamp)
306+
schedulermetrics.GangCompletionLatency.WithLabelValues(namespace, podGroupName, fmt.Sprintf("%d", minAvailable)).Observe(age.Seconds())
307+
}
308+
}
301309

302310
// Safely call IterateOverWaitingPods with recovery for test frameworks
303311
if cs.frameworkHandle != nil {
@@ -367,11 +375,17 @@ func (cs *Coscheduling) calculateTotalPods(podGroupName, namespace string) int {
367375
}
368376

369377
func (cs *Coscheduling) calculateRunningPodsExcluding(podGroupName, namespace string, excludeName string) int {
370-
selector := labels.Set{PodGroupName: podGroupName}.AsSelector()
378+
// Try new label first (match calculateTotalPods logic)
379+
selector := labels.Set{"pod-group.scheduling.kubenexus.io/name": podGroupName}.AsSelector()
371380
pods, err := cs.podLister.Pods(namespace).List(selector)
372-
if err != nil {
373-
klog.ErrorS(err, "calculateRunningPods: error listing pods")
374-
return 0
381+
if err != nil || len(pods) == 0 {
382+
// Fallback to old label for backward compatibility
383+
selector = labels.Set{PodGroupName: podGroupName}.AsSelector()
384+
pods, err = cs.podLister.Pods(namespace).List(selector)
385+
if err != nil {
386+
klog.ErrorS(err, "calculateRunningPods: error listing pods")
387+
return 0
388+
}
375389
}
376390

377391
running := 0

pkg/plugins/numatopology/numatopology.go

Lines changed: 25 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ import (
2323
"math"
2424
"strconv"
2525
"strings"
26+
"sync"
2627

2728
v1 "k8s.io/api/core/v1"
2829
"k8s.io/apimachinery/pkg/runtime"
@@ -156,6 +157,7 @@ type GangNUMAState struct {
156157
// NUMATopology implements NUMA-aware scheduling with advanced features
157158
type NUMATopology struct {
158159
handle framework.Handle
160+
mu sync.RWMutex // Protect gangState from concurrent access
159161
gangState map[string]*GangNUMAState // Gang group -> state
160162
}
161163

@@ -300,18 +302,23 @@ func (n *NUMATopology) Score(ctx context.Context, state framework.CycleState, po
300302
}
301303

302304
// 1. NUMA FIT QUALITY (40%)
303-
cpuUtilization := float64(podCPU) / float64(len(numa.CPUs)) * 100.0
304-
memUtilization := float64(podMemory) / float64(numa.TotalMemory) * 100.0
305-
306-
// Weighted average: 60% CPU, 40% memory
307-
utilization := (cpuUtilization * 0.6) + (memUtilization * 0.4)
308-
309-
// Optimal utilization: 50-70% (leaves room for growth, not too fragmented)
310-
fitScore = 100.0 - math.Abs(utilization-60.0)
311-
if fitScore < 0 {
312-
fitScore = 0
305+
// Calculate remaining capacity after placing pod
306+
cpuRemaining := float64(numa.AvailableCPUs - int(podCPU))
307+
memRemaining := float64(numa.AvailableMemory - podMemory)
308+
309+
// Normalize to 0-1 range (higher is better - more room for growth)
310+
cpuFitScore := cpuRemaining / float64(len(numa.CPUs))
311+
if cpuFitScore < 0 {
312+
cpuFitScore = 0
313+
}
314+
memFitScore := memRemaining / float64(numa.TotalMemory)
315+
if memFitScore < 0 {
316+
memFitScore = 0
313317
}
314318

319+
// Weighted average: 60% CPU, 40% memory (prefer CPU locality)
320+
fitScore = (cpuFitScore*0.6 + memFitScore*0.4) * 100.0
321+
315322
// Boost if in preferred NUMA list
316323
if n.isNUMAInList(numa.ID, preferredNUMAs) {
317324
fitScore = math.Min(100.0, fitScore*1.2) // 20% boost
@@ -688,8 +695,11 @@ func (n *NUMATopology) calculateGangAffinityScore(pod *v1.Pod, numa NUMANode, no
688695
return 50.0 // Neutral score, not a gang member
689696
}
690697

691-
// Get gang state
698+
// Get gang state (with lock)
699+
n.mu.RLock()
692700
gangState, exists := n.gangState[gangGroup]
701+
n.mu.RUnlock()
702+
693703
if !exists || len(gangState.AssignedMembers) == 0 {
694704
return 50.0 // First gang member, neutral score
695705
}
@@ -753,6 +763,10 @@ func (n *NUMATopology) recordGangPlacement(pod *v1.Pod, numaID int, node *v1.Nod
753763
return
754764
}
755765

766+
// Lock for gang state access
767+
n.mu.Lock()
768+
defer n.mu.Unlock()
769+
756770
// Initialize gang state if needed
757771
if n.gangState == nil {
758772
n.gangState = make(map[string]*GangNUMAState)

pkg/plugins/resourcefragmentation/fragmentationscore.go

Lines changed: 48 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ package resourcefragmentation
2121

2222
import (
2323
"context"
24+
"fmt"
2425
"strconv"
2526

2627
v1 "k8s.io/api/core/v1"
@@ -66,6 +67,7 @@ type ResourceFragmentationScore struct {
6667
}
6768

6869
var _ framework.ScorePlugin = &ResourceFragmentationScore{}
70+
var _ framework.FilterPlugin = &ResourceFragmentationScore{}
6971

7072
type GPUIsland struct {
7173
NodeName string
@@ -83,6 +85,49 @@ func (rf *ResourceFragmentationScore) Name() string {
8385
return Name
8486
}
8587

88+
// Filter filters out nodes that don't have sufficient GPUs or violate tenant restrictions
89+
func (rf *ResourceFragmentationScore) Filter(ctx context.Context, state framework.CycleState, pod *v1.Pod, nodeInfo framework.NodeInfo) *framework.Status {
90+
requestedGPUs := getGPURequest(pod)
91+
if requestedGPUs == 0 {
92+
// No GPU request, allow node
93+
return framework.NewStatus(framework.Success)
94+
}
95+
96+
island := rf.detectGPUIsland(nodeInfo)
97+
if island == nil {
98+
// No GPU island detected, filter out
99+
return framework.NewStatus(framework.Unschedulable, "node has no GPU resources")
100+
}
101+
102+
// Check if node has sufficient GPUs
103+
if island.AvailableGPUs < requestedGPUs {
104+
klog.V(3).InfoS("Filter: insufficient GPUs on node",
105+
"pod", klog.KObj(pod),
106+
"node", nodeInfo.Node().Name,
107+
"requested", requestedGPUs,
108+
"available", island.AvailableGPUs)
109+
return framework.NewStatus(framework.Unschedulable,
110+
fmt.Sprintf("insufficient GPUs: need %d, available %d", requestedGPUs, island.AvailableGPUs))
111+
}
112+
113+
// TENANT-AWARE ISLAND PROTECTION: Hard reject if tenant mismatch
114+
podTenantTier := rf.getPodTenantTier(state, pod)
115+
if island.TenantTier != "" && island.TenantTier != "none" {
116+
if !rf.isTenantAllowed(podTenantTier, island.TenantTier) {
117+
klog.V(3).InfoS("Filter: rejecting node due to tenant tier mismatch",
118+
"pod", klog.KObj(pod),
119+
"node", nodeInfo.Node().Name,
120+
"podTenantTier", podTenantTier,
121+
"nodeTenantTier", island.TenantTier)
122+
return framework.NewStatus(framework.Unschedulable,
123+
fmt.Sprintf("pod tenant %s cannot use node reserved for tenant %s",
124+
podTenantTier, island.TenantTier))
125+
}
126+
}
127+
128+
return framework.NewStatus(framework.Success)
129+
}
130+
86131
func (rf *ResourceFragmentationScore) Score(ctx context.Context, state framework.CycleState, pod *v1.Pod, nodeInfo framework.NodeInfo) (int64, *framework.Status) {
87132
island := rf.detectGPUIsland(nodeInfo)
88133
if island == nil {
@@ -237,8 +282,9 @@ func (rf *ResourceFragmentationScore) detectGPUIsland(nodeInfo framework.NodeInf
237282
}
238283

239284
isPristine := allocatedGPUCount == 0
240-
if val, ok := node.Labels[LabelGPUIsPristine]; ok && val == "true" {
241-
isPristine = true
285+
// Label explicitly overrides allocation count
286+
if val, ok := node.Labels[LabelGPUIsPristine]; ok {
287+
isPristine = val == "true"
242288
}
243289

244290
// Check if node is reserved for a specific tenant tier

pkg/plugins/resourcereservation/resourcereservation.go

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -431,8 +431,13 @@ func (rr *ResourceReservation) isGangComplete(pod *v1.Pod, podGroupName string,
431431
continue
432432
}
433433

434-
// Count pods that are scheduled (have NodeName assigned)
435-
if p.Spec.NodeName != "" {
434+
// Skip terminating pods (don't count them as scheduled)
435+
if p.DeletionTimestamp != nil {
436+
continue
437+
}
438+
439+
// Count pods that are scheduled (have NodeName assigned) and running
440+
if p.Spec.NodeName != "" && p.Status.Phase == v1.PodRunning {
436441
scheduledCount++
437442
}
438443
}

pkg/plugins/vramscheduler/vramscheduler.go

Lines changed: 16 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -243,8 +243,9 @@ func (v *VRAMScheduler) Score(ctx context.Context, state framework.CycleState, p
243243

244244
// Calculate how many GPUs the pod needs based on GPU request
245245
gpusRequested := getGPURequest(pod)
246-
if gpusRequested == 0 {
247-
gpusRequested = 1 // Default to 1 GPU if not specified
246+
if gpusRequested == 0 && vramRequest > 0 {
247+
// Only default to 1 GPU if VRAM was explicitly requested
248+
gpusRequested = 1
248249
}
249250

250251
// Calculate total VRAM needed and available
@@ -702,7 +703,11 @@ func (v *VRAMScheduler) getNodeGPUTopology(ctx context.Context, node *v1.Node) (
702703
// PRIORITY 1: DRA ResourceSlices (Kubernetes 1.34+)
703704
// Provides: Full topology (VRAM, NUMA, PCIe, NVLink), dynamic updates
704705
if v.resourceSliceLister != nil {
705-
vramPerGPU, devices, err := v.getGPUTopologyFromDRA(ctx, node)
706+
// Add 5-second timeout to DRA queries to prevent scheduler stalls
707+
draCtx, cancel := context.WithTimeout(ctx, 5*time.Second)
708+
vramPerGPU, devices, err := v.getGPUTopologyFromDRA(draCtx, node)
709+
cancel()
710+
706711
if err == nil && len(devices) > 0 {
707712
klog.V(4).InfoS("✅ Using GPU topology from DRA ResourceSlices",
708713
"node", node.Name,
@@ -712,9 +717,14 @@ func (v *VRAMScheduler) getNodeGPUTopology(ctx context.Context, node *v1.Node) (
712717
schedulermetrics.DataSourceUsage.WithLabelValues("DRA").Inc()
713718
return vramPerGPU, devices
714719
}
715-
klog.V(5).InfoS("DRA ResourceSlices not available, trying NFD labels",
716-
"node", node.Name,
717-
"reason", err)
720+
if err == context.DeadlineExceeded {
721+
klog.V(3).InfoS("DRA query timeout, falling back to NFD",
722+
"node", node.Name)
723+
} else {
724+
klog.V(5).InfoS("DRA ResourceSlices not available, trying NFD labels",
725+
"node", node.Name,
726+
"reason", err)
727+
}
718728
} else {
719729
klog.V(5).InfoS("DRA not available (lister nil), trying NFD labels",
720730
"node", node.Name)

0 commit comments

Comments (0)