Skip to content

Commit 9166590

Browse files
authored
fix: remove usage of the pre-stop hook (#896)
* fix: remove usage of the pre-stop hook * fix test switcharoo * rename constant
1 parent 8df689b commit 9166590

File tree

2 files changed

+47
-16
lines changed

2 files changed

+47
-16
lines changed

rust/operator-binary/src/config/command.rs

Lines changed: 43 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -137,6 +137,48 @@ fn broker_start_command(
137137
}
138138
}
139139

140+
// During a namespace or stacklet delete the Kafka controllers shut down too fast leaving the brokers
141+
// in a bad state.
142+
// Brokers try to connect to controllers before gracefully shutting down but by that time, all
143+
// controllers are already gone.
144+
// The broker pods are then kept alive until the value of `gracefulShutdownTimeout` is reached.
145+
// The environment variable `PRE_STOP_CONTROLLER_SLEEP_SECONDS` delays the termination of the
146+
// controller processes to give the brokers more time to offload data and shutdown gracefully.
147+
// Kubernetes has a built in `pre-stop` hook feature that is not yet generally available on all platforms
148+
// supported by the operator.
149+
const BASH_TRAP_FUNCTIONS: &str = r#"
150+
prepare_signal_handlers()
151+
{
152+
unset term_child_pid
153+
unset term_kill_needed
154+
trap 'handle_term_signal' TERM
155+
}
156+
157+
handle_term_signal()
158+
{
159+
if [ "${term_child_pid}" ]; then
160+
[ -n "$PRE_STOP_CONTROLLER_SLEEP_SECONDS" ] && sleep "$PRE_STOP_CONTROLLER_SLEEP_SECONDS"
161+
kill -TERM "${term_child_pid}" 2>/dev/null
162+
else
163+
term_kill_needed="yes"
164+
fi
165+
}
166+
167+
wait_for_termination()
168+
{
169+
set +e
170+
term_child_pid=$1
171+
if [[ -v term_kill_needed ]]; then
172+
[ -n "$PRE_STOP_CONTROLLER_SLEEP_SECONDS" ] && sleep "$PRE_STOP_CONTROLLER_SLEEP_SECONDS"
173+
kill -TERM "${term_child_pid}" 2>/dev/null
174+
fi
175+
wait ${term_child_pid} 2>/dev/null
176+
trap - TERM
177+
wait ${term_child_pid} 2>/dev/null
178+
set -e
179+
}
180+
"#;
181+
140182
pub fn controller_kafka_container_command(
141183
cluster_id: &str,
142184
controller_descriptors: Vec<KafkaPodDescriptor>,
@@ -152,7 +194,7 @@ pub fn controller_kafka_container_command(
152194
// - use config-utils for proper replacements?
153195
// - should we print the adapted properties file at startup?
154196
formatdoc! {"
155-
{COMMON_BASH_TRAP_FUNCTIONS}
197+
{BASH_TRAP_FUNCTIONS}
156198
{remove_vector_shutdown_file_command}
157199
prepare_signal_handlers
158200
containerdebug --output={STACKABLE_LOG_DIR}/containerdebug-state.json --loop &

rust/operator-binary/src/resource/statefulset.rs

Lines changed: 4 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -23,8 +23,8 @@ use stackable_operator::{
2323
apps::v1::{StatefulSet, StatefulSetSpec, StatefulSetUpdateStrategy},
2424
core::v1::{
2525
ConfigMapKeySelector, ConfigMapVolumeSource, ContainerPort, EnvVar, EnvVarSource,
26-
ExecAction, Lifecycle, LifecycleHandler, ObjectFieldSelector, PodSpec, Probe,
27-
ServiceAccount, SleepAction, TCPSocketAction, Volume,
26+
ExecAction, ObjectFieldSelector, PodSpec, Probe, ServiceAccount, TCPSocketAction,
27+
Volume,
2828
},
2929
},
3030
apimachinery::pkg::{apis::meta::v1::LabelSelector, util::intstr::IntOrString},
@@ -658,6 +658,7 @@ pub fn build_controller_rolegroup_statefulset(
658658
kafka_security,
659659
&resolved_product_image.product_version,
660660
)])
661+
.add_env_var("PRE_STOP_CONTROLLER_SLEEP_SECONDS", "5")
661662
.add_env_var(
662663
"EXTRA_ARGS",
663664
kafka_role
@@ -756,19 +757,7 @@ pub fn build_controller_rolegroup_statefulset(
756757
)
757758
.context(AddVolumesAndVolumeMountsSnafu)?;
758759

759-
// Currently, Controllers shutdown very fast, too fast in most times (flakyness) for the Brokers
760-
// to off load properly. The Brokers then try to connect to any controllers until the
761-
// `gracefulShutdownTimeout` is reached and the pod is finally killed.
762-
// The `pre-stop` hook will delay the kill signal to the Controllers to provide the Brokers more
763-
// time to offload data.
764-
let mut kafka_container = cb_kafka.build();
765-
kafka_container.lifecycle = Some(Lifecycle {
766-
pre_stop: Some(LifecycleHandler {
767-
sleep: Some(SleepAction { seconds: 10 }),
768-
..Default::default()
769-
}),
770-
..Default::default()
771-
});
760+
let kafka_container = cb_kafka.build();
772761

773762
pod_builder
774763
.metadata(metadata)

0 commit comments

Comments
 (0)