implement cortex support for kubernetes native gang scheduling

kruegercharles · kruegercharles · commit 8f674eaea87d · 2025-12-20T14:57:12.000+01:00
diff --git a/Tiltfile b/Tiltfile
@@ -186,6 +186,8 @@ if 'pods' in ACTIVE_DEPLOYMENTS:
     # Deploy example resources
     k8s_yaml('samples/pods/node.yaml')
     k8s_yaml('samples/pods/pod.yaml')
+    k8s_yaml('samples/pods/gang-scheduling.yaml')
+    k8s_resource('gang-pod-1', labels=['Cortex-Pods'])
     k8s_resource('test-pod', labels=['Cortex-Pods'])
 
 ########### Dev Dependencies
diff --git a/api/delegation/pods/messages.go b/api/delegation/pods/messages.go
@@ -12,6 +12,8 @@ import (
 type PodPipelineRequest struct {
 	// The available nodes.
 	Nodes []corev1.Node `json:"nodes"`
+	// The pod to schedule.
+	Pod *corev1.Pod `json:"pod"`
 }
 
 func (r PodPipelineRequest) GetSubjects() []string {
diff --git a/helm/bundles/cortex-pods/templates/pipelines.yaml b/helm/bundles/cortex-pods/templates/pipelines.yaml
@@ -10,5 +10,7 @@ spec:
   type: filter-weigher
   createDecisions: true
   steps:
+    - ref: { name: pods-gang }
+      mandatory: true
     - ref: { name: pods-noop }
       mandatory: false
diff --git a/helm/bundles/cortex-pods/templates/steps.yaml b/helm/bundles/cortex-pods/templates/steps.yaml
@@ -11,3 +11,16 @@ spec:
     This is only a passthrough step which lets all pod candidates through.
     It is used as a placeholder step in the pods scheduler pipeline.
   knowledges: []
+---
+apiVersion: cortex.cloud/v1alpha1
+kind: Step
+metadata:
+  name: pods-gang
+spec:
+  operator: cortex
+  type: filter
+  impl: gang
+  description: |
+    This filter ensures that pods referencing a Workload or belonging to a
+    PodGroup are only scheduled if the referenced gang resource exists.
+  knowledges: []
diff --git a/internal/scheduling/decisions/pods/gang_filter.go b/internal/scheduling/decisions/pods/gang_filter.go
@@ -0,0 +1,130 @@
+package pods
+
+import (
+	"context"
+	"fmt"
+	"log/slog"
+
+	"github.com/cobaltcore-dev/cortex/api/delegation/pods"
+	"github.com/cobaltcore-dev/cortex/api/v1alpha1"
+	"github.com/cobaltcore-dev/cortex/internal/scheduling/lib"
+	"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
+	"k8s.io/apimachinery/pkg/runtime/schema"
+	"sigs.k8s.io/controller-runtime/pkg/client"
+)
+
+// GangFilter is a pipeline step for pods that take part in gang scheduling.
+//
+// A pod is treated as a gang member if it names a Workload in
+// spec.workloadRef.name or, failing that, carries the
+// "pod-group.scheduling.k8s.io/name" label. Gang members are only let through
+// if the referenced Workload or PodGroup can be fetched from the cluster; all
+// other pods pass unfiltered.
+type GangFilter struct {
+	// client reads pods and gang resources from the cluster.
+	client client.Client
+}
+
+// Init stores the client used for lookups. The context and the step
+// configuration are not used by this filter.
+func (f *GangFilter) Init(ctx context.Context, cl client.Client, step v1alpha1.Step) error {
+	f.client = cl
+	return nil
+}
+
+// Run activates every node in the request when scheduling is allowed and no
+// node when the gang resource referenced by the pod cannot be fetched. It
+// returns an error only if the request carries no pod.
+func (f *GangFilter) Run(traceLog *slog.Logger, request pods.PodPipelineRequest) (*lib.StepResult, error) {
+	pod := request.Pod
+	if pod == nil {
+		traceLog.Warn("gang-filter: pod is nil in request")
+		return nil, fmt.Errorf("pod is nil in request")
+	}
+
+	// Run receives no context from the pipeline, so lookups are not cancellable.
+	ctx := context.Background()
+	namespace := pod.Namespace
+
+	// Preferred: the Workload API reference on the pod spec.
+	podKey := client.ObjectKey{Name: pod.Name, Namespace: namespace}
+	if workloadName := f.workloadRefName(ctx, traceLog, podKey); workloadName != "" {
+		traceLog.Info("gang-filter: checking for workload", "workloadName", workloadName)
+		key := client.ObjectKey{Name: workloadName, Namespace: namespace}
+		if err := f.fetchGangResource(ctx, "Workload", key); err != nil {
+			traceLog.Error("gang-filter: failed to fetch workload", "error", err)
+			// Deny all nodes if the gang resource is missing or cannot be fetched.
+			return f.result(request, false), nil
+		}
+		traceLog.Info("gang-filter: workload found, allowing scheduling")
+		return f.result(request, true), nil
+	}
+
+	// Fallback: gang membership expressed through the pod group label.
+	gangName, ok := pod.Labels["pod-group.scheduling.k8s.io/name"]
+	if !ok {
+		// Not a gang pod, allow it.
+		return f.result(request, true), nil
+	}
+
+	traceLog.Info("gang-filter: checking for pod group", "gangName", gangName)
+	key := client.ObjectKey{Name: gangName, Namespace: namespace}
+	if err := f.fetchGangResource(ctx, "PodGroup", key); err != nil {
+		traceLog.Error("gang-filter: failed to fetch pod group", "error", err)
+		// Deny all nodes if the gang resource is missing or cannot be fetched.
+		return f.result(request, false), nil
+	}
+
+	// Existence is the only requirement for now; minMember and status fields
+	// of the PodGroup are not evaluated.
+	traceLog.Info("gang-filter: pod group found, allowing scheduling")
+	return f.result(request, true), nil
+}
+
+// workloadRefName returns spec.workloadRef.name of the stored pod, or "" if
+// the pod cannot be fetched or has no such field. The pod is read as
+// unstructured because the typed Pod struct may predate the field.
+func (f *GangFilter) workloadRefName(ctx context.Context, traceLog *slog.Logger, podKey client.ObjectKey) string {
+	stored := &unstructured.Unstructured{}
+	stored.SetGroupVersionKind(schema.GroupVersionKind{Version: "v1", Kind: "Pod"})
+	if err := f.client.Get(ctx, podKey, stored); err != nil {
+		// Best effort: fall back to the label check, but leave a trace.
+		traceLog.Warn("gang-filter: failed to fetch pod for workload lookup", "error", err)
+		return ""
+	}
+	name, _, err := unstructured.NestedString(stored.Object, "spec", "workloadRef", "name")
+	if err != nil {
+		traceLog.Warn("gang-filter: workloadRef name is not a string", "error", err)
+		return ""
+	}
+	return name
+}
+
+// fetchGangResource fetches the scheduling.k8s.io/v1alpha1 object of the given
+// kind. Unstructured is used because these types may not be compiled into
+// this binary.
+func (f *GangFilter) fetchGangResource(ctx context.Context, kind string, key client.ObjectKey) error {
+	obj := &unstructured.Unstructured{}
+	obj.SetGroupVersionKind(schema.GroupVersionKind{
+		Group:   "scheduling.k8s.io",
+		Version: "v1alpha1",
+		Kind:    kind,
+	})
+	return f.client.Get(ctx, key, obj)
+}
+
+// result builds the step result: every node gets activation 1.0 when allow is
+// true, and no node is activated otherwise.
+func (f *GangFilter) result(request pods.PodPipelineRequest, allow bool) *lib.StepResult {
+	activations := make(map[string]float64, len(request.Nodes))
+	if allow {
+		// Index instead of ranging by value to avoid copying each Node.
+		for i := range request.Nodes {
+			activations[request.Nodes[i].Name] = 1.0
+		}
+	}
+	return &lib.StepResult{
+		Activations: activations,
+		Statistics:  make(map[string]lib.StepStatistics),
+	}
+}
diff --git a/internal/scheduling/decisions/pods/pipeline_controller.go b/internal/scheduling/decisions/pods/pipeline_controller.go
@@ -125,6 +125,20 @@ func (c *DecisionPipelineController) process(ctx context.Context, decision *v1al
 		return errors.New("pipeline not found or not ready")
 	}
 
+	// Fetch the pod to schedule.
+	pod := &corev1.Pod{}
+	if err := c.Get(ctx, client.ObjectKey{
+		Name:      decision.Spec.PodRef.Name,
+		Namespace: decision.Spec.PodRef.Namespace,
+	}, pod); err != nil {
+		log.Error(err, "failed to fetch pod for decision")
+		return err
+	}
+	if pod.Spec.NodeName != "" {
+		log.Info("pod is already assigned to a node", "node", pod.Spec.NodeName)
+		return nil
+	}
+
 	// Find all available nodes.
 	nodes := &corev1.NodeList{}
 	if err := c.List(ctx, nodes); err != nil {
@@ -135,7 +149,7 @@ func (c *DecisionPipelineController) process(ctx context.Context, decision *v1al
 	}
 
 	// Execute the scheduling pipeline.
-	request := pods.PodPipelineRequest{Nodes: nodes.Items}
+	request := pods.PodPipelineRequest{Nodes: nodes.Items, Pod: pod}
 	result, err := pipeline.Run(request)
 	if err != nil {
 		log.V(1).Error(err, "failed to run scheduler pipeline")
@@ -145,20 +159,6 @@ func (c *DecisionPipelineController) process(ctx context.Context, decision *v1al
 	decision.Status.Took = metav1.Duration{Duration: time.Since(startedAt)}
 	log.Info("decision processed successfully", "duration", time.Since(startedAt))
 
-	// Check if the pod is already assigned to a node.
-	pod := &corev1.Pod{}
-	if err := c.Get(ctx, client.ObjectKey{
-		Name:      decision.Spec.PodRef.Name,
-		Namespace: decision.Spec.PodRef.Namespace,
-	}, pod); err != nil {
-		log.Error(err, "failed to fetch pod for decision")
-		return err
-	}
-	if pod.Spec.NodeName != "" {
-		log.Info("pod is already assigned to a node", "node", pod.Spec.NodeName)
-		return nil
-	}
-
 	// Assign the first node returned by the pipeline using a Binding.
 	binding := &corev1.Binding{
 		ObjectMeta: metav1.ObjectMeta{
diff --git a/internal/scheduling/decisions/pods/supported_steps.go b/internal/scheduling/decisions/pods/supported_steps.go
@@ -14,4 +14,5 @@ type PodStep = lib.Step[pods.PodPipelineRequest]
 // The steps actually used by the scheduler are defined through the configuration file.
 var supportedSteps = map[string]func() PodStep{
 	"noop": func() PodStep { return &NoopFilter{} },
+	"gang": func() PodStep { return &GangFilter{} },
 }
diff --git a/samples/pods/gang-scheduling.yaml b/samples/pods/gang-scheduling.yaml
@@ -0,0 +1,28 @@
+# Workload describing the gang. It must live in the same namespace as the
+# pods that reference it, because the gang filter looks it up there.
+apiVersion: scheduling.k8s.io/v1alpha1
+kind: Workload
+metadata:
+  name: test-workload
+  namespace: cortex-system
+spec:
+  podGroups:
+    - name: test-group
+      policy:
+        gang:
+          minCount: 2
+---
+# Gang member pod; the name matches the 'gang-pod-1' resource in the Tiltfile.
+apiVersion: v1
+kind: Pod
+metadata:
+  name: gang-pod-1
+  namespace: cortex-system
+spec:
+  schedulerName: cortex
+  workloadRef:
+    name: test-workload
+    podGroup: test-group
+  containers:
+    - name: nginx
+      image: nginx:latest

Original file line number	Diff line number	Diff line change
`@@ -12,6 +12,8 @@ import (`
`12`	`12`	`type PodPipelineRequest struct {`
`13`	`13`	`// The available nodes.`
`14`	`14`	Nodes []corev1.Node `json:"nodes"`
	`15`	`+ // The pod to schedule.`
	`16`	+ Pod *corev1.Pod `json:"pod"`
`15`	`17`	`}`
`16`	`18`
`17`	`19`	`func (r PodPipelineRequest) GetSubjects() []string {`
Original file line number	Diff line number	Diff line change
`@@ -14,4 +14,5 @@ type PodStep = lib.Step[pods.PodPipelineRequest]`
`14`	`14`	`// The steps actually used by the scheduler are defined through the configuration file.`
`15`	`15`	`var supportedSteps = map[string]func() PodStep{`
`16`	`16`	`"noop": func() PodStep { return &NoopFilter{} },`
	`17`	`+ "gang": func() PodStep { return &GangFilter{} },`
`17`	`18`	`}`