Skip to content

Commit 88c388c

Browse files
committed
feat(pod-failure-watcher): add analysis ID generation and annotation to prevent duplicate analysis of pod failures
1 parent ee9169e commit 88c388c

File tree

1 file changed

+122
-0
lines changed

1 file changed

+122
-0
lines changed

src/main/java/com/redhat/podmortem/operator/service/PodFailureWatcher.java

Lines changed: 122 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,14 +20,17 @@
2020
import java.net.http.HttpClient;
2121
import java.net.http.HttpRequest;
2222
import java.net.http.HttpResponse;
23+
import java.nio.charset.StandardCharsets;
2324
import java.time.Duration;
2425
import java.time.Instant;
2526
import java.util.ArrayList;
2627
import java.util.Arrays;
28+
import java.util.HashMap;
2729
import java.util.List;
2830
import java.util.Map;
2931
import java.util.Optional;
3032
import java.util.Set;
33+
import java.util.UUID;
3134
import java.util.concurrent.ConcurrentHashMap;
3235
import java.util.concurrent.ConcurrentLinkedQueue;
3336
import java.util.concurrent.CopyOnWriteArrayList;
@@ -82,6 +85,9 @@ public class PodFailureWatcher {
8285
private final ConcurrentLinkedQueue<String> pendingFailureQueue = new ConcurrentLinkedQueue<>();
8386
private final Set<String> queuedFailureKeys =
8487
java.util.Collections.newSetFromMap(new ConcurrentHashMap<>());
88+
// Pod annotations
89+
private static final String ANNOTATION_PREFIX = "podmortem.io/";
90+
private static final String ANALYSIS_ID_ANNOTATION = ANNOTATION_PREFIX + "analysis-id";
8591

8692
/**
8793
* Initializes the pod failure watcher on application startup.
@@ -401,6 +407,26 @@ private boolean hasPodFailed(Pod pod) {
401407
private void handlePodFailure(Pod pod) {
402408
String podKey = pod.getMetadata().getNamespace() + "/" + pod.getMetadata().getName();
403409

410+
// Skip if an analysis-id annotation already exists (already analyzed or in-progress)
411+
try {
412+
Pod latest =
413+
client.pods()
414+
.inNamespace(pod.getMetadata().getNamespace())
415+
.withName(pod.getMetadata().getName())
416+
.get();
417+
if (latest != null
418+
&& latest.getMetadata().getAnnotations() != null
419+
&& latest.getMetadata().getAnnotations().containsKey(ANALYSIS_ID_ANNOTATION)) {
420+
log.debug(
421+
"Skipping analysis for {}: analysis-id already present: {}",
422+
podKey,
423+
latest.getMetadata().getAnnotations().get(ANALYSIS_ID_ANNOTATION));
424+
return;
425+
}
426+
} catch (Exception e) {
427+
log.warn("Failed to check existing analysis-id for {}: {}", podKey, e.getMessage());
428+
}
429+
404430
// find matching Podmortem resources first; if none, ignore silently
405431
List<Podmortem> podmortemResources = findMatchingPodmortemResources(pod);
406432
if (podmortemResources.isEmpty()) {
@@ -425,12 +451,108 @@ private void handlePodFailure(Pod pod) {
425451
processedFailures.put(podKey, failureTime);
426452
}
427453

454+
// Assign and annotate a deterministic analysis ID before starting analysis to prevent
455+
// duplicates
456+
String analysisId = generateAnalysisId(pod, failureTime);
457+
boolean annotated = annotateAnalysisIdWithRetry(pod, analysisId, 0, 5, 100);
458+
if (!annotated) {
459+
log.warn(
460+
"Proceeding without analysis-id annotation for {} (id: {})",
461+
podKey,
462+
analysisId);
463+
}
464+
428465
for (Podmortem podmortem : podmortemResources) {
429466
eventService.emitFailureDetected(pod, podmortem);
430467
processPodFailureForPodmortem(podmortem, pod);
431468
}
432469
}
433470

471+
/**
472+
* Generates a deterministic analysis ID based on pod UID and failure timestamp.
473+
*
474+
* <p>Uses name-based UUID so the same pod failure yields the same ID across restarts. Falls
475+
* back to a random UUID if the failure time is unavailable.
476+
*/
477+
private String generateAnalysisId(Pod pod, Instant failureTime) {
478+
try {
479+
String uid =
480+
pod.getMetadata().getUid() != null
481+
? pod.getMetadata().getUid()
482+
: pod.getMetadata().getName();
483+
String failureComponent = failureTime != null ? failureTime.toString() : "no-time";
484+
String seed = uid + ":" + failureComponent;
485+
return UUID.nameUUIDFromBytes(seed.getBytes(StandardCharsets.UTF_8)).toString();
486+
} catch (Exception e) {
487+
return UUID.randomUUID().toString();
488+
}
489+
}
490+
491+
/**
492+
* Adds the analysis-id annotation to the pod with retry and backoff, if not already present.
493+
*/
494+
private boolean annotateAnalysisIdWithRetry(
495+
Pod pod, String analysisId, int attempt, int maxRetries, long delayMs) {
496+
if (attempt > 0) {
497+
try {
498+
Thread.sleep(delayMs);
499+
} catch (InterruptedException e) {
500+
Thread.currentThread().interrupt();
501+
return false;
502+
}
503+
}
504+
505+
try {
506+
Pod latest =
507+
client.pods()
508+
.inNamespace(pod.getMetadata().getNamespace())
509+
.withName(pod.getMetadata().getName())
510+
.get();
511+
if (latest == null) {
512+
return false;
513+
}
514+
515+
Map<String, String> annotations = latest.getMetadata().getAnnotations();
516+
if (annotations == null) {
517+
annotations = new HashMap<>();
518+
}
519+
520+
String key = ANALYSIS_ID_ANNOTATION;
521+
if (annotations.containsKey(key)) {
522+
return true;
523+
}
524+
525+
annotations.put(key, analysisId);
526+
latest.getMetadata().setAnnotations(annotations);
527+
client.pods()
528+
.inNamespace(latest.getMetadata().getNamespace())
529+
.withName(latest.getMetadata().getName())
530+
.patch(latest);
531+
return true;
532+
533+
} catch (io.fabric8.kubernetes.client.KubernetesClientException e) {
534+
if (e.getCode() == 409 && attempt < maxRetries) {
535+
return annotateAnalysisIdWithRetry(
536+
pod, analysisId, attempt + 1, maxRetries, delayMs * 2);
537+
} else if (e.getCode() == 403) {
538+
log.warn(
539+
"Forbidden to set analysis-id annotation for {} - check RBAC permissions: {}",
540+
pod.getMetadata().getName(),
541+
e.getMessage());
542+
return false;
543+
} else {
544+
log.debug(
545+
"Failed to set analysis-id annotation (attempt {}): {}",
546+
attempt + 1,
547+
e.getMessage());
548+
return false;
549+
}
550+
} catch (Exception e) {
551+
log.debug("Unexpected error setting analysis-id annotation: {}", e.getMessage());
552+
return false;
553+
}
554+
}
555+
434556
/**
435557
* Extracts the failure timestamp from a pod's container status.
436558
*

0 commit comments

Comments
 (0)