Skip to content

Commit d40910b

Browse files
chore: Reduce severity of Pod eviction errors (#372)
* chore: Reduce severity of Pod eviciton errors * clippy * changelog * Update slab to fix RUSTSEC-2025-0047 * Regenerate nix lockfile * Add the pod to the warning message * Filter for specific error message * Use k8s.object_ref * Update rust/operator-binary/src/restart_controller/pod.rs Co-authored-by: Nick <[email protected]> --------- Co-authored-by: Nick <[email protected]>
1 parent 7d99f8a commit d40910b

File tree

6 files changed

+71
-6
lines changed

6 files changed

+71
-6
lines changed

CHANGELOG.md

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,14 @@ All notable changes to this project will be documented in this file.
44

55
## [Unreleased]
66

7+
### Changed
8+
9+
- Reduce severity of Pod eviction errors. Previously, the operator would produce lots of
10+
`Cannot evict pod as it would violate the pod's disruption budget` errors. With this fix, the
11+
message is logged at info level instead of as an error ([#372]).
12+
13+
[#372]: https://github.com/stackabletech/commons-operator/pull/372
14+
715
## [25.7.0] - 2025-07-23
816

917
## [25.7.0-rc1] - 2025-07-18

Cargo.lock

Lines changed: 1 addition & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.nix

Lines changed: 4 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ anyhow = "1.0"
1616
built = { version = "0.8", features = ["chrono", "git2"] }
1717
clap = "4.5"
1818
futures = { version = "0.3", features = ["compat"] }
19+
http = "1.3"
1920
serde = { version = "1.0", features = ["derive"] }
2021
serde_json = "1.0"
2122
snafu = "0.8"

rust/operator-binary/Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ stackable-operator.workspace = true
1313

1414
anyhow.workspace = true
1515
clap.workspace = true
16+
http.workspace = true
1617
futures.workspace = true
1718
serde.workspace = true
1819
serde_json.workspace = true

rust/operator-binary/src/restart_controller/pod.rs

Lines changed: 56 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
use std::{sync::Arc, time::Duration};
22

33
use futures::StreamExt;
4+
use http::StatusCode;
45
use snafu::{OptionExt, ResultExt, Snafu};
56
use stackable_operator::{
67
client::Client,
@@ -11,10 +12,10 @@ use stackable_operator::{
1112
kube::{
1213
self,
1314
api::{EvictParams, PartialObjectMeta},
14-
core::DynamicObject,
15+
core::{DynamicObject, ErrorResponse},
1516
runtime::{
1617
Controller,
17-
controller::Action,
18+
controller::{self, Action},
1819
events::{Recorder, Reporter},
1920
reflector::ObjectRef,
2021
watcher,
@@ -96,10 +97,7 @@ pub async fn start(client: &Client, watch_namespace: &WatchNamespace) {
9697
// The event_recorder needs to be shared across all invocations, so that
9798
// events are correctly aggregated
9899
let event_recorder = event_recorder.clone();
99-
async move {
100-
report_controller_reconciled(&event_recorder, FULL_CONTROLLER_NAME, &result)
101-
.await;
102-
}
100+
async move { report_result(result, event_recorder).await }
103101
},
104102
)
105103
.await;
@@ -192,6 +190,58 @@ async fn reconcile(pod: Arc<PartialObjectMeta<Pod>>, ctx: Arc<Ctx>) -> Result<Ac
192190
}
193191
}
194192

193+
/// Reports the result of reconciliation.
194+
///
195+
/// The Pod restart controller has special handling, as it produced lots of error messages like the one quoted below.
196+
/// They are expected, as we intentionally use the `Evict` API to restart Pods before e.g. the
197+
/// certificate expires. We roll out PDBs by default. If we try to restart multiple Pods that are
198+
/// part of a PDB, we get these errors.
199+
/// Because of this, we don't emit an error for this case, but only produce an INFO trace.
200+
///
201+
/// `ERROR stackable_operator::logging::controller: Failed to reconcile object controller.name="pod.restarter.commons.stackable.tech" error=reconciler for object Pod.v1./trino-worker-default-0.default failed error.sources=[failed to evict Pod, ApiError: Cannot evict pod as it would violate the pod's disruption budget.: TooManyRequests (ErrorResponse { status: "Failure", message: "Cannot evict pod as it would violate the pod's disruption budget.", reason: "TooManyRequests", code: 429 }), Cannot evict pod as it would violate the pod's disruption budget.: TooManyRequests]`
202+
#[allow(clippy::type_complexity)] // The result type complexity comes from kube-rs and is what it is
203+
async fn report_result(
204+
result: Result<
205+
(ObjectRef<PartialObjectMeta<Pod>>, Action),
206+
controller::Error<Error, watcher::Error>,
207+
>,
208+
event_recorder: Arc<Recorder>,
209+
) {
210+
if let Err(controller::Error::ReconcilerFailed(
211+
Error::EvictPod {
212+
source: evict_pod_error,
213+
},
214+
pod_ref,
215+
)) = &result
216+
{
217+
const TOO_MANY_REQUESTS_HTTP_CODE: u16 = StatusCode::TOO_MANY_REQUESTS.as_u16();
218+
// We cannot blanket-silence all 429 responses, as it could be something else.
219+
// E.g. I have seen "storage is re-initializing" in the past.
220+
const EVICT_ERROR_MESSAGE: &str =
221+
"Cannot evict pod as it would violate the pod's disruption budget.";
222+
223+
if let kube::Error::Api(ErrorResponse {
224+
code: TOO_MANY_REQUESTS_HTTP_CODE,
225+
message: error_message,
226+
..
227+
}) = evict_pod_error
228+
// TODO: We need Rust 1.88 and 2024 edition for if-let-chains
229+
// && error_message == EVICT_ERROR_MESSAGE
230+
{
231+
if error_message == EVICT_ERROR_MESSAGE {
232+
tracing::info!(
233+
k8s.object.ref = %pod_ref,
234+
error = %evict_pod_error,
235+
"Tried to evict Pod, but wasn't allowed to do so, as it would violate the Pod's disruption budget. Retrying later"
236+
);
237+
return;
238+
}
239+
}
240+
}
241+
242+
report_controller_reconciled(&event_recorder, FULL_CONTROLLER_NAME, &result).await;
243+
}
244+
195245
fn error_policy(_obj: Arc<PartialObjectMeta<Pod>>, _error: &Error, _ctx: Arc<Ctx>) -> Action {
196246
Action::requeue(Duration::from_secs(5))
197247
}

0 commit comments

Comments
 (0)