Skip to content

Commit 1baef05

Browse files
Merge pull request #2060 from ingvagabund/1990-rebase
OCPBUGS-50992: Help with must-gather scheduling through suggesting preference of nodes
2 parents 68a8590 + b50b11f commit 1baef05

File tree

2 files changed

+356
-4
lines changed

2 files changed

+356
-4
lines changed

pkg/cli/admin/mustgather/mustgather.go

Lines changed: 156 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ import (
1010
"os"
1111
"path"
1212
"regexp"
13+
"sort"
1314
"strconv"
1415
"strings"
1516
"sync"
@@ -39,6 +40,7 @@ import (
3940
"k8s.io/kubectl/pkg/util/templates"
4041
admissionapi "k8s.io/pod-security-admission/api"
4142
"k8s.io/utils/exec"
43+
utilptr "k8s.io/utils/ptr"
4244

4345
configclient "github.com/openshift/client-go/config/clientset/versioned"
4446
imagev1client "github.com/openshift/client-go/image/clientset/versioned/typed/image/v1"
@@ -51,7 +53,10 @@ import (
5153
)
5254

5355
const (
54-
gatherContainerName = "gather"
56+
gatherContainerName = "gather"
57+
unreachableTaintKey = "node.kubernetes.io/unreachable"
58+
notReadyTaintKey = "node.kubernetes.io/not-ready"
59+
controlPlaneNodeRoleLabel = "node-role.kubernetes.io/control-plane"
5560
)
5661

5762
var (
@@ -99,6 +104,30 @@ sleep 5
99104
done`
100105
)
101106

107+
// buildNodeAffinity builds a node affinity from the provided node hostnames.
108+
func buildNodeAffinity(nodeHostnames []string) *corev1.Affinity {
109+
if len(nodeHostnames) == 0 {
110+
return nil
111+
}
112+
return &corev1.Affinity{
113+
NodeAffinity: &corev1.NodeAffinity{
114+
RequiredDuringSchedulingIgnoredDuringExecution: &corev1.NodeSelector{
115+
NodeSelectorTerms: []corev1.NodeSelectorTerm{
116+
{
117+
MatchExpressions: []corev1.NodeSelectorRequirement{
118+
{
119+
Key: "kubernetes.io/hostname",
120+
Operator: corev1.NodeSelectorOpIn,
121+
Values: nodeHostnames,
122+
},
123+
},
124+
},
125+
},
126+
},
127+
},
128+
}
129+
}
130+
102131
const (
103132
// number of concurrent must-gather Pods to run if --all-images or multiple --image are provided
104133
concurrentMG = 4
@@ -404,6 +433,125 @@ func (o *MustGatherOptions) Validate() error {
404433
return nil
405434
}
406435

436+
func getNodeLastHeartbeatTime(node corev1.Node) *metav1.Time {
437+
for _, cond := range node.Status.Conditions {
438+
if cond.Type == corev1.NodeReady {
439+
if !cond.LastHeartbeatTime.IsZero() {
440+
return utilptr.To[metav1.Time](cond.LastHeartbeatTime)
441+
}
442+
return nil
443+
}
444+
}
445+
return nil
446+
}
447+
448+
func isNodeReadyByCondition(node corev1.Node) bool {
449+
for _, cond := range node.Status.Conditions {
450+
if cond.Type == corev1.NodeReady && cond.Status == corev1.ConditionTrue {
451+
return true
452+
}
453+
}
454+
return false
455+
}
456+
457+
func isNodeReadyAndReachableByTaint(node corev1.Node) bool {
458+
for _, taint := range node.Spec.Taints {
459+
if taint.Key == unreachableTaintKey || taint.Key == notReadyTaintKey {
460+
return false
461+
}
462+
}
463+
return true
464+
}
465+
466+
// getCandidateNodeNames identifies suitable nodes for constructing a list of
467+
// node names for a node affinity expression.
468+
func getCandidateNodeNames(nodes *corev1.NodeList, hasMaster bool) []string {
469+
// Identify ready and reacheable nodes
470+
var controlPlaneNodes, allControlPlaneNodes, workerNodes, unschedulableNodes, remainingNodes, selectedNodes []corev1.Node
471+
for _, node := range nodes.Items {
472+
if _, ok := node.Labels[controlPlaneNodeRoleLabel]; ok {
473+
allControlPlaneNodes = append(allControlPlaneNodes, node)
474+
}
475+
if !isNodeReadyByCondition(node) || !isNodeReadyAndReachableByTaint(node) {
476+
remainingNodes = append(remainingNodes, node)
477+
continue
478+
}
479+
if node.Spec.Unschedulable {
480+
unschedulableNodes = append(unschedulableNodes, node)
481+
continue
482+
}
483+
if _, ok := node.Labels[controlPlaneNodeRoleLabel]; ok {
484+
controlPlaneNodes = append(controlPlaneNodes, node)
485+
} else {
486+
workerNodes = append(workerNodes, node)
487+
}
488+
}
489+
490+
// INFO(ingvagabund): a single target node should be enough. Yet,
491+
// it might help to provide more to allow the scheduler choose a more
492+
// suitable node based on the cluster scheduling capabilities.
493+
494+
// hasMaster cause the must-gather pod to set a node selector targeting all
495+
// control plane node. So there's no point of processing other than control
496+
// plane nodes
497+
if hasMaster {
498+
if len(controlPlaneNodes) > 0 {
499+
selectedNodes = controlPlaneNodes
500+
} else {
501+
selectedNodes = allControlPlaneNodes
502+
}
503+
} else {
504+
// For hypershift case
505+
506+
// Order of preference:
507+
// - ready and reachable control plane nodes first
508+
// - then ready and reachable worker nodes
509+
// - then ready and reachable unschedulable nodes
510+
// - then any other node
511+
selectedNodes = controlPlaneNodes // this will be very likely empty for hypershift
512+
if len(selectedNodes) == 0 {
513+
selectedNodes = workerNodes
514+
}
515+
if len(selectedNodes) == 0 {
516+
// unschedulable nodes might still be better candidates than the remaining
517+
// not ready and not reacheable nodes
518+
selectedNodes = unschedulableNodes
519+
}
520+
if len(selectedNodes) == 0 {
521+
// whatever is left for the last resort
522+
selectedNodes = remainingNodes
523+
}
524+
}
525+
526+
// Sort nodes based on the cond.LastHeartbeatTime.Time to prefer nodes that
527+
// reported their status most recently
528+
sort.SliceStable(selectedNodes, func(i, j int) bool {
529+
iTime := getNodeLastHeartbeatTime(selectedNodes[i])
530+
jTime := getNodeLastHeartbeatTime(selectedNodes[j])
531+
// iTime has no effect since all is sorted right
532+
if jTime == nil {
533+
return true
534+
}
535+
if iTime == nil {
536+
return false
537+
}
538+
return jTime.Before(iTime)
539+
})
540+
541+
// Limit the number of nodes to 10 at most to provide a sane list of nodes
542+
nodeNames := []string{}
543+
var nodeNamesSize = 0
544+
for _, n := range selectedNodes {
545+
nodeNames = append(nodeNames, n.Name)
546+
nodeNamesSize++
547+
if nodeNamesSize >= 10 {
548+
break
549+
}
550+
}
551+
552+
return nodeNames
553+
}
554+
407555
// Run creates and runs a must-gather pod
408556
func (o *MustGatherOptions) Run() error {
409557
var errs []error
@@ -476,6 +624,8 @@ func (o *MustGatherOptions) Run() error {
476624
}
477625
}
478626

627+
affinity := buildNodeAffinity(getCandidateNodeNames(nodes, hasMaster))
628+
479629
// ... and create must-gather pod(s)
480630
var pods []*corev1.Pod
481631
for _, image := range o.Images {
@@ -497,7 +647,7 @@ func (o *MustGatherOptions) Run() error {
497647
return err
498648
}
499649
for _, node := range nodes.Items {
500-
pods = append(pods, o.newPod(node.Name, image, hasMaster))
650+
pods = append(pods, o.newPod(node.Name, image, hasMaster, affinity))
501651
}
502652
} else {
503653
if o.NodeName != "" {
@@ -507,7 +657,7 @@ func (o *MustGatherOptions) Run() error {
507657
return err
508658
}
509659
}
510-
pods = append(pods, o.newPod(o.NodeName, image, hasMaster))
660+
pods = append(pods, o.newPod(o.NodeName, image, hasMaster, affinity))
511661
}
512662
}
513663

@@ -925,7 +1075,8 @@ func newClusterRoleBinding(ns *corev1.Namespace) *rbacv1.ClusterRoleBinding {
9251075
// newPod creates a pod with 2 containers with a shared volume mount:
9261076
// - gather: init containers that run gather command
9271077
// - copy: no-op container we can exec into
928-
func (o *MustGatherOptions) newPod(node, image string, hasMaster bool) *corev1.Pod {
1078+
func (o *MustGatherOptions) newPod(node, image string, hasMaster bool, affinity *corev1.Affinity) *corev1.Pod {
1079+
9291080
zero := int64(0)
9301081

9311082
nodeSelector := map[string]string{
@@ -1025,6 +1176,7 @@ func (o *MustGatherOptions) newPod(node, image string, hasMaster bool) *corev1.P
10251176
Operator: "Exists",
10261177
},
10271178
},
1179+
Affinity: affinity,
10281180
},
10291181
}
10301182
if o.HostNetwork {

0 commit comments

Comments
 (0)