Skip to content

Commit 2cc3c1b

Browse files
Merge pull request #238 from novasbc/configurable_minimum_worker_nodecount_2024-10-02
Configurable minimum worker nodecount
2 parents 16052be + 194acec commit 2cc3c1b

15 files changed

+153
-38
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ bin
1919

2020
# editor and IDE paraphernalia
2121
.idea
22+
*.iml
2223
*.swp
2324
*.swo
2425
*~

Makefile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -160,7 +160,7 @@ manifests: controller-gen ## Generate WebhookConfiguration, ClusterRole and Cust
160160
.PHONY: generate
161161
generate: controller-gen protoc ## Generate code containing DeepCopy, DeepCopyInto, and DeepCopyObject method implementations. Also generate protoc / gRPC code
162162
$(CONTROLLER_GEN) object:headerFile="hack/boilerplate.go.txt" paths="./..."
163-
PATH=$(PATH):$(shell pwd)/bin/proto/bin && $(PROTOC) --go_out=. --go-grpc_out=. pkg/peerhealth/peerhealth.proto
163+
PATH='$(PATH)':$(shell pwd)/bin/proto/bin && $(PROTOC) --go_out=. --go-grpc_out=. pkg/peerhealth/peerhealth.proto
164164

165165
.PHONY: fmt
166166
fmt: ## Run go fmt against code.

api/v1alpha1/selfnoderemediationconfig_types.go

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ const (
2828
ConfigCRName = "self-node-remediation-config"
2929
defaultWatchdogPath = "/dev/watchdog"
3030
defaultIsSoftwareRebootEnabled = true
31+
defaultMinPeersForRemediation = 1
3132
)
3233

3334
// SelfNodeRemediationConfigSpec defines the desired state of SelfNodeRemediationConfig
@@ -127,6 +128,15 @@ type SelfNodeRemediationConfigSpec struct {
127128
// CustomDsTolerations allows adding custom tolerations to the SNR agents running in the DaemonSet, in order to support remediation for different types of nodes.
128129
// +optional
129130
CustomDsTolerations []v1.Toleration `json:"customDsTolerations,omitempty"`
131+
132+
// MinPeersForRemediation is the minimum number of peer worker/control-plane nodes to attempt to contact before deciding whether this node is unhealthy.
133+
// If set to zero, no other peers are required to be present for the remediation action to occur when this
134+
// node has lost API server access. If an insufficient number of peers is found, we will not attempt to ask
135+
// any peer nodes (if present) whether they see that the current node has been marked unhealthy with a
136+
// SelfNodeRemediation CR.
137+
// +kubebuilder:default:=1
138+
// +kubebuilder:validation:Minimum=0
139+
MinPeersForRemediation int `json:"minPeersForRemediation,omitempty"`
130140
}
131141

132142
// SelfNodeRemediationConfigStatus defines the observed state of SelfNodeRemediationConfig

bundle/manifests/self-node-remediation.medik8s.io_selfnoderemediationconfigs.yaml

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -127,6 +127,17 @@ spec:
127127
its peers.
128128
minimum: 1
129129
type: integer
130+
minPeersForRemediation:
131+
default: 1
132+
description: "Minimum number of peer workers/control nodes to attempt
133+
to contact before deciding if node is unhealthy or not\n\tif set
134+
to zero, no other peers will be required to be present for remediation
135+
action to occur when this\n\tnode has lost API server access. If
136+
an insufficient number of peers are found, we will not attempt to
137+
ask\n\tany peer nodes (if present) whether they see that the current
138+
node has been marked unhealthy with a\n\tSelfNodeRemediation CR"
139+
minimum: 0
140+
type: integer
130141
peerApiServerTimeout:
131142
default: 5s
132143
description: |-

config/crd/bases/self-node-remediation.medik8s.io_selfnoderemediationconfigs.yaml

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -125,6 +125,17 @@ spec:
125125
its peers.
126126
minimum: 1
127127
type: integer
128+
minPeersForRemediation:
129+
default: 1
130+
description: "Minimum number of peer workers/control nodes to attempt
131+
to contact before deciding if node is unhealthy or not\n\tif set
132+
to zero, no other peers will be required to be present for remediation
133+
action to occur when this\n\tnode has lost API server access. If
134+
an insufficient number of peers are found, we will not attempt to
135+
ask\n\tany peer nodes (if present) whether they see that the current
136+
node has been marked unhealthy with a\n\tSelfNodeRemediation CR"
137+
minimum: 0
138+
type: integer
128139
peerApiServerTimeout:
129140
default: 5s
130141
description: |-

controllers/selfnoderemediationconfig_controller.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -153,6 +153,7 @@ func (r *SelfNodeRemediationConfigReconciler) syncConfigDaemonSet(ctx context.Co
153153
data.Data["PeerRequestTimeout"] = snrConfig.Spec.PeerRequestTimeout.Nanoseconds()
154154
data.Data["MaxApiErrorThreshold"] = snrConfig.Spec.MaxApiErrorThreshold
155155
data.Data["EndpointHealthCheckUrl"] = snrConfig.Spec.EndpointHealthCheckUrl
156+
data.Data["MinPeersForRemediation"] = snrConfig.Spec.MinPeersForRemediation
156157
data.Data["HostPort"] = snrConfig.Spec.HostPort
157158
data.Data["IsSoftwareRebootEnabled"] = fmt.Sprintf("\"%t\"", snrConfig.Spec.IsSoftwareRebootEnabled)
158159

controllers/tests/config/selfnoderemediationconfig_controller_test.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -238,6 +238,7 @@ var _ = Describe("SNR Config Test", func() {
238238
Expect(createdConfig.Spec.ApiServerTimeout.Seconds()).To(BeEquivalentTo(5))
239239
Expect(createdConfig.Spec.ApiCheckInterval.Seconds()).To(BeEquivalentTo(15))
240240
Expect(createdConfig.Spec.PeerUpdateInterval.Seconds()).To(BeEquivalentTo(15 * 60))
241+
Expect(createdConfig.Spec.MinPeersForRemediation).To(BeEquivalentTo(1))
241242
})
242243
})
243244

controllers/tests/config/suite_test.go

Lines changed: 9 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -120,14 +120,16 @@ var _ = BeforeSuite(func() {
120120
Expect(err).ToNot(HaveOccurred())
121121

122122
certReader = certificates.NewSecretCertStorage(k8sClient, ctrl.Log.WithName("SecretCertStorage"), shared.Namespace)
123+
123124
apiConnectivityCheckConfig := &apicheck.ApiConnectivityCheckConfig{
124-
Log: ctrl.Log.WithName("api-check"),
125-
MyNodeName: shared.UnhealthyNodeName,
126-
CheckInterval: shared.ApiCheckInterval,
127-
MaxErrorsThreshold: shared.MaxErrorThreshold,
128-
Peers: peers,
129-
Cfg: cfg,
130-
CertReader: certReader,
125+
Log: ctrl.Log.WithName("api-check"),
126+
MyNodeName: shared.UnhealthyNodeName,
127+
CheckInterval: shared.ApiCheckInterval,
128+
MaxErrorsThreshold: shared.MaxErrorThreshold,
129+
MinPeersForRemediation: shared.MinPeersForRemediation,
130+
Peers: peers,
131+
Cfg: cfg,
132+
CertReader: certReader,
131133
}
132134
apiCheck := apicheck.New(apiConnectivityCheckConfig, nil)
133135
err = k8sManager.Add(apiCheck)

controllers/tests/controller/selfnoderemediation_controller_test.go

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,10 @@ var _ = Describe("SNR Controller", func() {
6666
k8sClient.ShouldSimulateFailure = false
6767
k8sClient.ShouldSimulatePodDeleteFailure = false
6868
isAdditionalSetupNeeded = false
69+
70+
By("Restore default settings for api connectivity check")
71+
apiConnectivityCheckConfig.MinPeersForRemediation = shared.MinPeersForRemediation
72+
6973
deleteRemediations()
7074
deleteSelfNodeRemediationPod()
7175
// Clear the node's state; this is important to remove taints, labels, etc.
@@ -451,6 +455,19 @@ var _ = Describe("SNR Controller", func() {
451455
verifyWatchdogNotTriggered()
452456
})
453457
})
458+
459+
Context("no peer found and MinPeersForRemediation is configured to 0", func() {
460+
BeforeEach(func() {
461+
By("Set MinPeersForRemedation to zero which should trigger the watchdog before the test")
462+
apiConnectivityCheckConfig.MinPeersForRemediation = 0
463+
})
464+
465+
It("Does not receive peer communication and since configured to need zero peers, initiates a reboot",
466+
func() {
467+
verifyWatchdogTriggered()
468+
})
469+
})
470+
454471
})
455472

456473
Context("Configuration is missing", func() {

controllers/tests/controller/suite_test.go

Lines changed: 17 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -53,13 +53,14 @@ import (
5353
// http://onsi.github.io/ginkgo/ to learn more about Ginkgo.
5454

5555
var (
56-
testEnv *envtest.Environment
57-
dummyDog watchdog.FakeWatchdog
58-
unhealthyNode, peerNode = &v1.Node{}, &v1.Node{}
59-
cancelFunc context.CancelFunc
60-
k8sClient *shared.K8sClientWrapper
61-
fakeRecorder *record.FakeRecorder
62-
snrConfig *selfnoderemediationv1alpha1.SelfNodeRemediationConfig
56+
testEnv *envtest.Environment
57+
dummyDog watchdog.FakeWatchdog
58+
unhealthyNode, peerNode = &v1.Node{}, &v1.Node{}
59+
cancelFunc context.CancelFunc
60+
k8sClient *shared.K8sClientWrapper
61+
fakeRecorder *record.FakeRecorder
62+
snrConfig *selfnoderemediationv1alpha1.SelfNodeRemediationConfig
63+
apiConnectivityCheckConfig *apicheck.ApiConnectivityCheckConfig
6364
)
6465

6566
var unhealthyNodeNamespacedName = client.ObjectKey{
@@ -152,14 +153,15 @@ var _ = BeforeSuite(func() {
152153
Expect(err).ToNot(HaveOccurred())
153154

154155
rebooter := reboot.NewWatchdogRebooter(dummyDog, ctrl.Log.WithName("rebooter"))
155-
apiConnectivityCheckConfig := &apicheck.ApiConnectivityCheckConfig{
156-
Log: ctrl.Log.WithName("api-check"),
157-
MyNodeName: shared.UnhealthyNodeName,
158-
CheckInterval: shared.ApiCheckInterval,
159-
MaxErrorsThreshold: shared.MaxErrorThreshold,
160-
Peers: peers,
161-
Rebooter: rebooter,
162-
Cfg: cfg,
156+
apiConnectivityCheckConfig = &apicheck.ApiConnectivityCheckConfig{
157+
Log: ctrl.Log.WithName("api-check"),
158+
MyNodeName: shared.UnhealthyNodeName,
159+
CheckInterval: shared.ApiCheckInterval,
160+
MaxErrorsThreshold: shared.MaxErrorThreshold,
161+
Peers: peers,
162+
Rebooter: rebooter,
163+
Cfg: cfg,
164+
MinPeersForRemediation: shared.MinPeersForRemediation,
163165
}
164166
apiCheck := apicheck.New(apiConnectivityCheckConfig, nil)
165167
err = k8sManager.Add(apiCheck)

0 commit comments

Comments
 (0)