13 changes: 11 additions & 2 deletions pkg/controllers/provisioning/scheduling/topology.go
@@ -109,6 +109,13 @@ func buildDomainGroups(nodePools []*v1.NodePool, instanceTypes map[string][]*clo
domainGroups := map[string]TopologyDomainGroup{}
for npName, its := range instanceTypes {
np := nodePoolIndex[npName]

// Build the base NodePool requirements from user-defined labels and requirements
// This represents what a pod needs to match to be scheduled on this NodePool
baseNodePoolRequirements := scheduling.NewRequirements()
baseNodePoolRequirements.Add(scheduling.NewLabelRequirements(np.Spec.Template.Labels).Values()...)
baseNodePoolRequirements.Add(scheduling.NewNodeSelectorRequirementsWithMinValues(np.Spec.Template.Spec.Requirements...).Values()...)

for _, it := range its {
// We need to intersect the instance type requirements with the current nodePool requirements. This
// ensures that something like zones from an instance type don't expand the universe of valid domains.
@@ -121,7 +128,9 @@ func buildDomainGroups(nodePools []*v1.NodePool, instanceTypes map[string][]*clo
domainGroups[topologyKey] = NewTopologyDomainGroup()
}
for _, domain := range requirement.Values() {
domainGroups[topologyKey].Insert(domain, np.Spec.Template.Spec.Taints...)
// Store the base NodePool requirements with each domain
// This allows us to later filter domains based on pod requirements
domainGroups[topologyKey].Insert(domain, baseNodePoolRequirements, np.Spec.Template.Spec.Taints...)
}
}
}
@@ -134,7 +143,7 @@ func buildDomainGroups(nodePools []*v1.NodePool, instanceTypes map[string][]*clo
domainGroups[key] = NewTopologyDomainGroup()
}
for _, value := range requirement.Values() {
domainGroups[key].Insert(value, np.Spec.Template.Spec.Taints...)
domainGroups[key].Insert(value, baseNodePoolRequirements, np.Spec.Template.Spec.Taints...)
}
}
}
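// Illustrative sketch, not part of the diff: for a NodePool whose template carries a user label and a zone
// requirement, the base requirements built above combine both sources, and every domain the NodePool
// contributes is inserted together with them. The helpers and fields are the ones used in this hunk; the
// concrete label and zone values are hypothetical.
//
// base := scheduling.NewRequirements()
// base.Add(scheduling.NewLabelRequirements(map[string]string{"team": "analytics"}).Values()...)
// base.Add(scheduling.NewNodeSelectorRequirementsWithMinValues(np.Spec.Template.Spec.Requirements...).Values()...)
// // base now pins team=analytics and restricts topology.kubernetes.io/zone to the NodePool's zones,
// // so a pod whose own requirements conflict with either will not count this NodePool's domains.
// domainGroups[corev1.LabelTopologyZone].Insert("us-west-2a", base, np.Spec.Template.Spec.Taints...)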
190 changes: 190 additions & 0 deletions pkg/controllers/provisioning/scheduling/topology_test.go
@@ -1922,6 +1922,196 @@ var _ = Describe("Topology", func() {
})
})

// Issue #2227: https://github.com/kubernetes-sigs/karpenter/issues/2227
// Test that topology spread constraints only consider domains from NodePools compatible with pod requirements
Context("NodePool Domain Filtering (Issue #2227)", func() {
var restrictedNodePool, unrestrictedNodePool *v1.NodePool

BeforeEach(func() {
// NodePool with specific zone restrictions (zone-1 and zone-2 only)
restrictedNodePool = test.NodePool(v1.NodePool{
ObjectMeta: metav1.ObjectMeta{Name: "restricted"},
Spec: v1.NodePoolSpec{
Template: v1.NodeClaimTemplate{
Spec: v1.NodeClaimTemplateSpec{
Requirements: []v1.NodeSelectorRequirementWithMinValues{
{
NodeSelectorRequirement: corev1.NodeSelectorRequirement{
Key: corev1.LabelTopologyZone,
Operator: corev1.NodeSelectorOpIn,
Values: []string{"test-zone-1", "test-zone-2"},
},
},
},
},
},
},
})

// NodePool with only zone-3 available
// This creates a clear distinction: pods targeting zone-1/2 use restrictedNodePool,
// pods targeting zone-3 use unrestrictedNodePool
unrestrictedNodePool = test.NodePool(v1.NodePool{
ObjectMeta: metav1.ObjectMeta{Name: "unrestricted"},
Spec: v1.NodePoolSpec{
Template: v1.NodeClaimTemplate{
Spec: v1.NodeClaimTemplateSpec{
Requirements: []v1.NodeSelectorRequirementWithMinValues{
{
NodeSelectorRequirement: corev1.NodeSelectorRequirement{
Key: corev1.LabelTopologyZone,
Operator: corev1.NodeSelectorOpIn,
Values: []string{"test-zone-3"},
},
},
},
},
},
},
})
})

It("should only consider zones from NodePool matching pod's nodeSelector", func() {
topology := []corev1.TopologySpreadConstraint{{
TopologyKey: corev1.LabelTopologyZone,
WhenUnsatisfiable: corev1.DoNotSchedule,
LabelSelector: &metav1.LabelSelector{MatchLabels: labels},
MaxSkew: 1,
}}

ExpectApplied(ctx, env.Client, restrictedNodePool, unrestrictedNodePool)

// Pods with nodeSelector targeting zone-2
// Only restrictedNodePool offers zone-2, so topology spread should only
// consider domains from NodePools compatible with the pod
pods := test.UnschedulablePods(test.PodOptions{
ObjectMeta: metav1.ObjectMeta{Labels: labels},
TopologySpreadConstraints: topology,
NodeSelector: map[string]string{corev1.LabelTopologyZone: "test-zone-2"},
}, 2)

ExpectProvisioned(ctx, env.Client, cluster, cloudProvider, prov, pods...)
for _, pod := range pods {
ExpectScheduled(ctx, env.Client, pod)
}

// All pods are in zone-2 because of nodeSelector
ExpectSkew(ctx, env.Client, "default", &topology[0]).To(ConsistOf(2))

// Verify all pods were scheduled in test-zone-2
nodes := &corev1.NodeList{}
Expect(env.Client.List(ctx, nodes)).To(Succeed())
for _, node := range nodes.Items {
Expect(node.Labels[corev1.LabelTopologyZone]).To(Equal("test-zone-2"))
}
})

It("should only consider zones from NodePool matching pod's nodeAffinity", func() {
topology := []corev1.TopologySpreadConstraint{{
TopologyKey: corev1.LabelTopologyZone,
WhenUnsatisfiable: corev1.DoNotSchedule,
LabelSelector: &metav1.LabelSelector{MatchLabels: labels},
MaxSkew: 1,
}}

ExpectApplied(ctx, env.Client, restrictedNodePool, unrestrictedNodePool)

// Pods with nodeAffinity targeting zone-1 or zone-2 will select restrictedNodePool
pods := test.UnschedulablePods(test.PodOptions{
ObjectMeta: metav1.ObjectMeta{Labels: labels},
TopologySpreadConstraints: topology,
NodeRequirements: []corev1.NodeSelectorRequirement{
{
Key: corev1.LabelTopologyZone,
Operator: corev1.NodeSelectorOpIn,
Values: []string{"test-zone-1", "test-zone-2"},
},
},
}, 4)

ExpectProvisioned(ctx, env.Client, cluster, cloudProvider, prov, pods...)
for _, pod := range pods {
ExpectScheduled(ctx, env.Client, pod)
}

// Verify pods are spread across only zone-1 and zone-2
ExpectSkew(ctx, env.Client, "default", &topology[0]).To(ConsistOf(2, 2))
})

It("should handle pods without NodePool restrictions using all zones", func() {
topology := []corev1.TopologySpreadConstraint{{
TopologyKey: corev1.LabelTopologyZone,
WhenUnsatisfiable: corev1.DoNotSchedule,
LabelSelector: &metav1.LabelSelector{MatchLabels: labels},
MaxSkew: 1,
}}

ExpectApplied(ctx, env.Client, restrictedNodePool, unrestrictedNodePool)

// Pods targeting zone-3 will use unrestrictedNodePool
pods := test.UnschedulablePods(test.PodOptions{
ObjectMeta: metav1.ObjectMeta{Labels: labels},
TopologySpreadConstraints: topology,
NodeSelector: map[string]string{corev1.LabelTopologyZone: "test-zone-3"},
}, 2)

ExpectProvisioned(ctx, env.Client, cluster, cloudProvider, prov, pods...)
for _, pod := range pods {
ExpectScheduled(ctx, env.Client, pod)
}

// All pods should be in zone-3
ExpectSkew(ctx, env.Client, "default", &topology[0]).To(ConsistOf(2))
})

It("should not fail due to zones outside NodePool scope", func() {
topology := []corev1.TopologySpreadConstraint{{
TopologyKey: corev1.LabelTopologyZone,
WhenUnsatisfiable: corev1.DoNotSchedule,
LabelSelector: &metav1.LabelSelector{MatchLabels: labels},
MaxSkew: 1,
}}

ExpectApplied(ctx, env.Client, restrictedNodePool, unrestrictedNodePool)

// This test verifies the fix for Issue #2227:
// Before fix: Karpenter evaluated all cluster zones (zone-1, 2, 3) even though
// the pod targets restrictedNodePool which only has zone-1 and zone-2.
// This caused unnecessary scheduling failures.
// After fix: Karpenter only evaluates zones available in the targeted NodePool.
//
// Setup: restrictedNodePool has zone-1, zone-2; unrestrictedNodePool has zone-3
// Pod targets zone-1 or zone-2, which only matches restrictedNodePool
// With maxSkew=1 and 3 pods across 2 zones: 2-1 distribution is valid
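// Worked skew check for this scenario (skew = pods in a domain minus the fewest pods in any eligible domain):
// after the fix the eligible domains are only test-zone-1 and test-zone-2, so a 2-1 split gives skew 2-1 = 1 <= maxSkew(1).
// Before the fix, test-zone-3 was also counted with 0 pods, making the skew 2-0 = 2 > maxSkew and failing scheduling.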
pods := test.UnschedulablePods(test.PodOptions{
ObjectMeta: metav1.ObjectMeta{Labels: labels},
TopologySpreadConstraints: topology,
NodeRequirements: []corev1.NodeSelectorRequirement{
{
Key: corev1.LabelTopologyZone,
Operator: corev1.NodeSelectorOpIn,
Values: []string{"test-zone-1", "test-zone-2"},
},
},
}, 3)

ExpectProvisioned(ctx, env.Client, cluster, cloudProvider, prov, pods...)

// All pods should successfully schedule
for _, pod := range pods {
ExpectScheduled(ctx, env.Client, pod)
}

// Verify the spread is 2-1 across the two zones (zone-1 and zone-2)
// This would have failed before the fix if Karpenter incorrectly
// tried to evaluate zone-3 as well
ExpectSkew(ctx, env.Client, "default", &topology[0]).To(ConsistOf(2, 1))

// Verify no pods were scheduled in test-zone-3
nodes := &corev1.NodeList{}
Expect(env.Client.List(ctx, nodes)).To(Succeed())
for _, node := range nodes.Items {
Expect(node.Labels[corev1.LabelTopologyZone]).To(BeElementOf("test-zone-1", "test-zone-2"))
}
})
})

Context("Pod Affinity/Anti-Affinity", func() {
It("should schedule a pod with empty pod affinity and anti-affinity", func() {
ExpectApplied(ctx, env.Client)
143 changes: 113 additions & 30 deletions pkg/controllers/provisioning/scheduling/topologydomaingroup.go
@@ -22,51 +22,134 @@ import (
"sigs.k8s.io/karpenter/pkg/scheduling"
)

// TopologyDomainGroup tracks the domains for a single topology. Additionally, it tracks the taints associated with
// each of these domains. This enables us to determine which domains should be considered by a pod if its
// NodeTaintPolicy is honor.
type TopologyDomainGroup map[string][][]v1.Taint
// DomainSource tracks the requirements and taints for a specific domain provided by a NodePool.
// This allows us to determine if a pod can use this domain based on its nodeSelector and nodeAffinity.
type DomainSource struct {
// NodePoolRequirements contains the combined requirements from the NodePool's template labels and
// its node requirements (e.g. a zone restriction such as topology.kubernetes.io/zone In [test-zone-1, test-zone-2]).
NodePoolRequirements scheduling.Requirements
// Taints are the taints associated with this domain from the NodePool.
Taints []v1.Taint
}

// TopologyDomainGroup tracks the domains for a single topology. Additionally, it tracks the requirements and taints
// associated with each of these domains from the NodePools that provide them. This enables us to determine which
// domains should be considered by a pod based on its nodeSelector, nodeAffinity, and tolerations.
type TopologyDomainGroup struct {
// domains maps each domain to the list of NodePool sources that can provide it
domains map[string][]DomainSource
}
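// Shape sketch, illustrative only, using the two NodePools from this PR's tests (neither defines taints):
//
// domains: map[string][]DomainSource{
//   "test-zone-1": {{NodePoolRequirements: <restricted NodePool requirements>, Taints: nil}},
//   "test-zone-2": {{NodePoolRequirements: <restricted NodePool requirements>, Taints: nil}},
//   "test-zone-3": {{NodePoolRequirements: <unrestricted NodePool requirements>, Taints: nil}},
// }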

func NewTopologyDomainGroup() TopologyDomainGroup {
return map[string][][]v1.Taint{}
return TopologyDomainGroup{
domains: map[string][]DomainSource{},
}
}

// Insert either adds a new domain to the TopologyDomainGroup or updates an existing domain.
func (t TopologyDomainGroup) Insert(domain string, taints ...v1.Taint) {
// If the domain is not currently tracked, insert it with the associated taints. Additionally, if there are no taints
// provided, override the taints associated with the domain. Generally, we could remove any sets of taints for which
// the provided set is a proper subset. This is because if a pod tolerates the supersets, it will also tolerate the
// proper subset, and removing the superset reduces the number of taint sets we need to traverse. For now we only
// implement the simplest case, the empty set, but we could do additional performance testing to determine if
// implementing the general case is worth the precomputation cost.
if _, ok := t[domain]; !ok || len(taints) == 0 {
t[domain] = [][]v1.Taint{taints}
// Insert adds a domain to the TopologyDomainGroup with its associated NodePool requirements and taints.
// This tracks which NodePools can provide this domain, allowing us to filter domains based on pod requirements.
func (t TopologyDomainGroup) Insert(domain string, nodePoolRequirements scheduling.Requirements, taints ...v1.Taint) {
if t.domains[domain] == nil {
t.domains[domain] = []DomainSource{}
}

// Create a new domain source for this NodePool
source := DomainSource{
NodePoolRequirements: nodePoolRequirements,
Taints: taints,
}

// Taint optimization: If there are no taints, this NodePool makes the domain available to all pods
// (regarding taints), so we can replace all existing sources. This is the simplest case of the
// general optimization where we could remove any sets of taints for which the provided set is a
// proper subset. We implement only the empty set case for now, as it covers the most common scenario
// and avoids the computational cost of subset checking. Further performance testing could determine
// if implementing the general case (removing supersets when a subset is added) is beneficial.
if len(taints) == 0 {
t.domains[domain] = []DomainSource{source}
return
}
if len(t[domain][0]) == 0 {
// This is the base case, where we're already tracking the empty set of taints for the domain. Pods will always
// be eligible for NodeClaims with this domain (based on taints), so there is no need to track additional taints.

// If we already track a taint-free source for this domain, every pod is already eligible for the
// domain with respect to taints, so tracking this additional tainted source adds nothing
if len(t.domains[domain]) > 0 && len(t.domains[domain][0].Taints) == 0 {
return
}
t[domain] = append(t[domain], taints)

// Add this source to the list
t.domains[domain] = append(t.domains[domain], source)
}
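// Usage sketch, assumed behavior rather than part of the diff (reqsA and reqsB stand for arbitrary
// scheduling.Requirements): inserting the same domain first from a tainted NodePool and then from an
// untainted one collapses the tracked sources to the single taint-free entry, because that source
// already admits every pod with respect to taints.
//
// tdg := NewTopologyDomainGroup()
// tdg.Insert("test-zone-1", reqsA, v1.Taint{Key: "dedicated", Effect: v1.TaintEffectNoSchedule})
// tdg.Insert("test-zone-1", reqsB) // no taints: replaces the tainted source for this domain
// // tdg.domains["test-zone-1"] now holds exactly one DomainSource with no taints.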

// ForEachDomain calls f on each domain tracked by the topology group. If the taintHonorPolicy is honor, only domains
// available on nodes tolerated by the provided pod will be included.
func (t TopologyDomainGroup) ForEachDomain(pod *v1.Pod, taintHonorPolicy v1.NodeInclusionPolicy, f func(domain string)) {
for domain, taintGroups := range t {
if taintHonorPolicy == v1.NodeInclusionPolicyIgnore {
// ForEachDomain calls f on each domain tracked by the topology group that is compatible with the pod's requirements.
// A domain is included only if at least one of its sources satisfies both conditions:
// 1. the pod's requirements do not conflict with the NodePool's requirements
// 2. the pod tolerates the NodePool's taints (when taintHonorPolicy is honor)
func (t TopologyDomainGroup) ForEachDomain(pod *v1.Pod, podRequirements scheduling.Requirements, taintHonorPolicy v1.NodeInclusionPolicy, f func(domain string)) {
for domain, sources := range t.domains {
// Check if any source for this domain is compatible with the pod
isCompatible := false

for _, source := range sources {
// Check if pod requirements don't conflict with NodePool requirements
// We use Intersects to check if they can coexist (no conflicts)
if err := source.NodePoolRequirements.Intersects(podRequirements); err != nil {
continue
}

// If taint policy is ignore, we don't need to check taints
if taintHonorPolicy == v1.NodeInclusionPolicyIgnore {
isCompatible = true
break
}

// Check if pod tolerates this NodePool's taints
if err := scheduling.Taints(source.Taints).ToleratesPod(pod); err == nil {
isCompatible = true
break
}
}

if isCompatible {
f(domain)
continue
}
// Since the taint policy is honor, we should only call f if there is a set of taints associated with the domain which
// the pod tolerates.
// Perf Note: We could consider hashing the pod's tolerations and using that to look up a set of tolerated domains.
for _, taints := range taintGroups {
if err := scheduling.Taints(taints).ToleratesPod(pod); err == nil {
f(domain)
}
}

// ForEachDomainWithMultipleRequirements calls f on each domain that is compatible with ANY of the provided requirement sets.
// This is used when a pod has multiple NodeSelectorTerms (which are OR'd together).
// A domain is included if it's compatible with at least one requirement set.
func (t TopologyDomainGroup) ForEachDomainWithMultipleRequirements(pod *v1.Pod, podRequirementsList []scheduling.Requirements, taintHonorPolicy v1.NodeInclusionPolicy, f func(domain string)) {
for domain, sources := range t.domains {
// Check if any source for this domain is compatible with ANY of the pod's requirement sets
isCompatible := false

for _, podRequirements := range podRequirementsList {
for _, source := range sources {
// Check if pod requirements don't conflict with NodePool requirements
if err := source.NodePoolRequirements.Intersects(podRequirements); err != nil {
continue
}

// If taint policy is ignore, we don't need to check taints
if taintHonorPolicy == v1.NodeInclusionPolicyIgnore {
isCompatible = true
break
}

// Check if pod tolerates this NodePool's taints
if err := scheduling.Taints(source.Taints).ToleratesPod(pod); err == nil {
isCompatible = true
break
}
}
if isCompatible {
break
}
}

if isCompatible {
f(domain)
}
}
}
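// Caller sketch, assumed call site not shown in this diff (tdg, pod, and podRequirements are placeholders):
// collect the domains a pod may spread over, honoring both its scheduling requirements and its tolerations.
//
// var eligible []string
// tdg.ForEachDomain(pod, podRequirements, v1.NodeInclusionPolicyHonor, func(domain string) {
//     eligible = append(eligible, domain)
// })
// // With the NodePools from the tests above, a pod requiring test-zone-1 or test-zone-2 never sees
// // test-zone-3 here, which is the behavior change targeted by Issue #2227.
// // ForEachDomainWithMultipleRequirements works the same way, but accepts one Requirements per
// // NodeSelectorTerm and includes a domain if any of them is compatible.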