Commit 8d93a2a

operator: Restart rolling update when Statefulset changes
This PR enhances the detection mechanism for Node Pool rolling upgrades.

Previously, when a Node Pool rolling upgrade was detected, all Pods were marked with the `ClusterUpdate` Pod Status Condition, and the `restarting` Node Pool field in the Cluster Custom Resource Status was set to `true`. If the Cluster Custom Resource was modified during an ongoing rolling upgrade (e.g., a change to the Redpanda container tag), that modification could trigger an update to the StatefulSet resource. In such cases, Pods that had already been restarted within the affected Node Pool were not restarted again, resulting in inconsistent state propagation (e.g., mismatched Redpanda container tags).

With this change, updates to the StatefulSet are explicitly distinguished from Node Pool rolling upgrades. When a StatefulSet update is required, all associated Pods are marked (or re-marked) with the `ClusterUpdate` Pod Status Condition to ensure a consistent rollout of the updated specification.
1 parent 83eaa93 commit 8d93a2a
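
The diff below calls MarkPodsForUpdate but does not show its body, so here is a minimal, hypothetical sketch of what "marking Pods for update" amounts to: appending a ClusterUpdate condition to a Pod's status so the reconciler can tell which Pods still await a rolling restart. The helper name markPodForUpdate and the inline condition type are assumptions, not the operator's actual code.

package sketch

import (
	corev1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

// markPodForUpdate is a hypothetical illustration of the marking step; the
// real MarkPodsForUpdate presumably also lists the Node Pool's Pods and
// persists the updated status through the API server.
func markPodForUpdate(pod *corev1.Pod, reason string) {
	pod.Status.Conditions = append(pod.Status.Conditions, corev1.PodCondition{
		Type:               corev1.PodConditionType("ClusterUpdate"), // custom condition type (assumed literal)
		Status:             corev1.ConditionTrue,
		Reason:             reason,
		LastTransitionTime: metav1.Now(),
	})
}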

File tree

7 files changed: +142, -18 lines changed

operator/pkg/resources/statefulset_update.go

Lines changed: 27 additions & 11 deletions
@@ -94,15 +94,23 @@ func (r *StatefulSetResource) runUpdate(
 	// Check if we should run update on this specific STS.
 
 	log.V(logger.DebugLevel).Info("Checking that we should update")
-	update, err := r.shouldUpdate(current, modified)
+	stsChanged, nodePoolRestarting, err := r.shouldUpdate(current, modified)
 	if err != nil {
 		return fmt.Errorf("unable to determine the update procedure: %w", err)
 	}
 
-	if !update {
+	if !stsChanged && !nodePoolRestarting {
 		return nil
 	}
 
+	// When a StatefulSet change is detected, mark all Node Pool Pods as requiring a Pod rolling restart
+	if stsChanged {
+		log.V(logger.DebugLevel).Info("Setting ClusterUpdate condition on pods")
+		if err = r.MarkPodsForUpdate(ctx, ClusterUpdateReasonVersion); err != nil {
+			return fmt.Errorf("unable to mark pods for update: %w", err)
+		}
+	}
+
 	// At this point, we have seen a diff and want to update the StatefulSet.
 	npStatus := r.getNodePoolStatus()
 
@@ -612,17 +620,19 @@ func (r *StatefulSetResource) updateStatefulSet(
 	return nil
 }
 
-// shouldUpdate returns true if changes on the CR require update
+// shouldUpdate compares the generated appsv1.StatefulSet from the StatefulSetResource.obj
+// function (referred to as `modified`) with appsv1.StatefulSet retrieved from
+// the Kubernetes API (referred to as `current`). If a difference is detected, the function returns
+// `true` as the first value. Certain fields are ignored because they are not relevant to a Pod
+// rolling upgrade (e.g. `Status`, `VolumeClaimTemplate`, or `Annotations`).
+//
+// If no differences are found, the function returns `false` as first value. The second value
+// depends on the Node Pool Status, indicating whether rolling restart is still in progress.
 func (r *StatefulSetResource) shouldUpdate(
 	current, modified *appsv1.StatefulSet,
-) (bool, error) {
+) (bool, bool, error) {
 	log := r.logger.WithName("shouldUpdate")
 
-	npStatus := r.getNodePoolStatus()
-
-	if npStatus.Restarting || r.pandaCluster.Status.Restarting {
-		return true, nil
-	}
 	prepareResourceForPatch(current, modified)
 	opts := []patch.CalculateOption{
 		patch.IgnoreStatusFields(),
@@ -632,10 +642,16 @@ func (r *StatefulSetResource) shouldUpdate(
 	}
 	patchResult, err := patch.NewPatchMaker(patch.NewAnnotator(redpandaAnnotatorKey), &patch.K8sStrategicMergePatcher{}, &patch.BaseJSONMergePatcher{}).Calculate(current, modified, opts...)
 	if err != nil || patchResult.IsEmpty() {
-		return false, err
+		npStatus := r.getNodePoolStatus()
+
+		if npStatus.Restarting || r.pandaCluster.Status.Restarting {
+			return false, true, nil
+		}
+
+		return false, false, err
 	}
 	log.Info("Detected diff", "patchResult", string(patchResult.Patch))
-	return true, nil
+	return true, false, nil
 }
 
 func (r *StatefulSetResource) getRestartingStatus() bool {
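
Taken together, shouldUpdate's new three-value return gives callers a clean decision table. A minimal sketch (it mirrors the runUpdate hunk above rather than adding new operator logic) of how the two booleans are interpreted:

// Sketch: consuming the (stsChanged, nodePoolRestarting, err) contract.
stsChanged, nodePoolRestarting, err := r.shouldUpdate(current, modified)
switch {
case err != nil:
	// the patch calculation itself failed
case stsChanged:
	// spec drift detected: (re-)mark every Pod with ClusterUpdate and roll the StatefulSet
case nodePoolRestarting:
	// no new diff, but a previously started rolling restart is still in flight
default:
	// nothing to do
}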

operator/pkg/resources/statefulset_update_test.go

Lines changed: 26 additions & 4 deletions
@@ -163,14 +163,36 @@ func TestShouldUpdate_AnnotationChange(t *testing.T) {
 			},
 		},
 	}
-	update, err := ssres.shouldUpdate(sts, stsWithAnnotation)
+	stsChange, _, err := ssres.shouldUpdate(sts, stsWithAnnotation)
 	require.NoError(t, err)
-	require.True(t, update)
+	require.True(t, stsChange)
 
 	// same statefulset with same annotation
-	update, err = ssres.shouldUpdate(stsWithAnnotation, stsWithAnnotation)
+	stsChange, _, err = ssres.shouldUpdate(stsWithAnnotation, stsWithAnnotation)
 	require.NoError(t, err)
-	require.False(t, update)
+	require.False(t, stsChange)
+
+	ssres = StatefulSetResource{
+		nodePool: vectorizedv1alpha1.NodePoolSpecWithDeleted{
+			NodePoolSpec: vectorizedv1alpha1.NodePoolSpec{
+				Name: "default",
+			},
+		},
+		pandaCluster: &vectorizedv1alpha1.Cluster{
+			Status: vectorizedv1alpha1.ClusterStatus{
+				Restarting: false,
+				NodePools: map[string]vectorizedv1alpha1.NodePoolStatus{
+					"default": {
+						Restarting: true,
+					},
+				},
+			},
+		},
+	}
+	// same statefulset with same annotation, but with node pool status restarting
+	_, nodePoolRestarting, err := ssres.shouldUpdate(stsWithAnnotation, stsWithAnnotation)
+	require.NoError(t, err)
+	require.True(t, nodePoolRestarting)
 }
 
 func TestPutInMaintenanceMode(t *testing.T) {
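
To exercise just this test locally, something like the following should work (run from the operator module; the package path is inferred from the file tree above):

cd operator && go test ./pkg/resources -run TestShouldUpdate_AnnotationChange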

operator/tests/e2e/additional-configuration/00-redpanda-cluster.yaml

Lines changed: 1 addition & 1 deletion
@@ -4,7 +4,7 @@ metadata:
   name: additional-configuration
 spec:
   image: "redpandadata/redpanda"
-  version: "v24.2.5"
+  version: "v25.1.11"
   replicas: 1
   resources:
     requests:
Lines changed: 1 addition & 1 deletion
@@ -1,5 +1,5 @@
 apiVersion: kuttl.dev/v1beta1
 kind: TestStep
 commands:
-  - command: ./verify-config.sh -r 10 -b v24.2
+  - command: ./verify-config.sh -r 10 -b v25.1
     namespaced: true
Lines changed: 1 addition & 1 deletion
@@ -1,5 +1,5 @@
 apiVersion: kuttl.dev/v1beta1
 kind: TestStep
 commands:
-  - command: ./verify-config.sh -r 11 -b v24.2
+  - command: ./verify-config.sh -r 11 -b v25.1
     namespaced: true

operator/tests/e2e/additional-configuration/verify-config-dev.sh

Lines changed: 2 additions & 0 deletions
@@ -79,6 +79,8 @@ cloud_storage_segment_max_upload_interval_sec: 1800
 default_topic_partitions: 3
 enable_idempotence: true
 enable_rack_awareness: true
+internal_topic_replication_factor: 3
+kafka_nodelete_topics: [_internal_connectors_configs _internal_connectors_offsets _internal_connectors_status _audit __consumer_offsets _redpanda_e2e_probe _schemas]
 log_segment_size: 536870912
 EOF
 )
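
As a usage hint (not part of this commit), the two new bootstrap properties can be read back from a live cluster with rpk:

rpk cluster config get internal_topic_replication_factor
rpk cluster config get kafka_nodelete_topics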
Lines changed: 84 additions & 0 deletions
@@ -0,0 +1,84 @@
+#!/usr/bin/env bash
+
+expected=$(
+cat <<EOF
+redpanda:
+    data_directory: /var/lib/redpanda/data
+    empty_seed_starts_cluster: false
+    seed_servers:
+        - host:
+            address: additional-configuration-0.additional-configuration.${NAMESPACE}.svc.cluster.local.
+            port: 33145
+    rpc_server:
+        address: 0.0.0.0
+        port: 33145
+    kafka_api:
+        - address: 0.0.0.0
+          port: 9092
+          name: kafka
+          authentication_method: none
+    admin:
+        - address: 0.0.0.0
+          port: 9644
+          name: admin
+    advertised_rpc_api:
+        address: additional-configuration-0.additional-configuration.${NAMESPACE}.svc.cluster.local.
+        port: 33145
+    advertised_kafka_api:
+        - address: additional-configuration-0.additional-configuration.${NAMESPACE}.svc.cluster.local.
+          port: 9092
+          name: kafka
+    developer_mode: true
+    auto_create_topics_enabled: true
+    fetch_reads_debounce_timeout: 10
+    group_initial_rebalance_delay: 0
+    group_topic_partitions: 3
+    log_segment_size_min: 1
+    storage_min_free_bytes: 10485760
+    topic_partitions_per_shard: 1000
+    write_caching_default: "true"
+rpk:
+    overprovisioned: true
+    tune_network: true
+    tune_disk_scheduler: true
+    tune_disk_nomerges: true
+    tune_disk_write_cache: true
+    tune_disk_irq: true
+    tune_cpu: true
+    tune_aio_events: true
+    tune_clocksource: true
+    tune_swappiness: true
+    coredump_dir: /var/lib/redpanda/coredump
+    tune_ballast_file: true
+pandaproxy:
+    pandaproxy_api:
+        - address: 0.0.0.0
+          port: 8082
+          name: proxy
+    advertised_pandaproxy_api:
+        - address: additional-configuration-0.additional-configuration.${NAMESPACE}.svc.cluster.local.
+          port: 8082
+          name: proxy
+pandaproxy_client:
+    brokers:
+        - address: additional-configuration.${NAMESPACE}.svc.cluster.local.
+          port: 9092
+    retries: ${PANDAPROXY_RETRIES}
+schema_registry:
+    schema_registry_api:
+        - address: 0.0.0.0
+          port: 8081
+          name: external
+EOF
+)
+
+expected_bootstrap=$(
+cat <<EOF
+auto_create_topics_enabled: false
+cloud_storage_segment_max_upload_interval_sec: 1800
+default_topic_partitions: 3
+enable_idempotence: true
+enable_rack_awareness: true
+log_segment_size: 536870912
+EOF
+)
