Skip to content

Commit 3fb568e

Browse files
committed
Resiliency for namespace listing and additional liveness checks
1 parent 8a81c87 commit 3fb568e

File tree

9 files changed

+57
-32
lines changed

9 files changed

+57
-32
lines changed

operator/src/main/java/oracle/kubernetes/operator/BaseMain.java

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,9 +17,11 @@
1717
import java.security.cert.CertificateException;
1818
import java.security.spec.InvalidKeySpecException;
1919
import java.time.OffsetDateTime;
20+
import java.util.Collection;
2021
import java.util.Optional;
2122
import java.util.Properties;
2223
import java.util.concurrent.ScheduledExecutorService;
24+
import java.util.concurrent.ScheduledFuture;
2325
import java.util.concurrent.Semaphore;
2426
import java.util.concurrent.ThreadFactory;
2527
import java.util.concurrent.TimeUnit;
@@ -159,14 +161,14 @@ public void run() {
159161
}
160162
}
161163

162-
void markReadyAndStartLivenessThread() {
164+
void markReadyAndStartLivenessThread(Collection<ScheduledFuture<?>> futures) {
163165
try {
164166
new DeploymentReady(delegate).create();
165167

166168
logStartingLivenessMessage();
167169
// every five seconds we need to update the last modified time on the liveness file
168170
wrappedExecutorService.scheduleWithFixedDelay(
169-
new DeploymentLiveness(delegate), 5, 5, TimeUnit.SECONDS);
171+
new DeploymentLiveness(futures, delegate), 5, 5, TimeUnit.SECONDS);
170172
} catch (IOException io) {
171173
LOGGER.severe(MessageKeys.EXCEPTION, io);
172174
}

operator/src/main/java/oracle/kubernetes/operator/DeploymentLiveness.java

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,13 @@
1-
// Copyright (c) 2017, 2022, Oracle and/or its affiliates.
1+
// Copyright (c) 2017, 2025, Oracle and/or its affiliates.
22
// Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl.
33

44
package oracle.kubernetes.operator;
55

66
import java.io.File;
77
import java.io.IOException;
8+
import java.util.Collection;
89
import java.util.Date;
10+
import java.util.concurrent.ScheduledFuture;
911

1012
import oracle.kubernetes.common.logging.MessageKeys;
1113
import oracle.kubernetes.operator.logging.LoggingFacade;
@@ -18,9 +20,11 @@
1820
public class DeploymentLiveness implements Runnable {
1921

2022
private static final LoggingFacade LOGGER = LoggingFactory.getLogger("Operator", "Operator");
23+
private final Collection<ScheduledFuture<?>> futures;
2124
private final File livenessFile;
2225

23-
public DeploymentLiveness(CoreDelegate delegate) {
26+
public DeploymentLiveness(Collection<ScheduledFuture<?>> futures, CoreDelegate delegate) {
27+
this.futures = futures;
2428
livenessFile = new File(delegate.getProbesHome(), ".alive");
2529
}
2630

@@ -33,7 +37,8 @@ public void run() {
3337
} catch (IOException ioe) {
3438
LOGGER.warning(MessageKeys.COULD_NOT_CREATE_LIVENESS_FILE);
3539
}
36-
if (livenessFile.setLastModified(new Date().getTime())) {
40+
if (futures.stream().filter(ScheduledFuture::isDone).findAny().isEmpty()
41+
&& livenessFile.setLastModified(new Date().getTime())) {
3742
LOGGER.fine("Liveness file last modified time set");
3843
}
3944
}

operator/src/main/java/oracle/kubernetes/operator/DomainRecheck.java

Lines changed: 16 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
// Copyright (c) 2020, 2023, Oracle and/or its affiliates.
1+
// Copyright (c) 2020, 2025, Oracle and/or its affiliates.
22
// Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl.
33

44
package oracle.kubernetes.operator;
@@ -35,6 +35,8 @@
3535
import oracle.kubernetes.operator.work.Packet;
3636
import oracle.kubernetes.operator.work.Step;
3737

38+
import static oracle.kubernetes.operator.KubernetesConstants.HTTP_GONE;
39+
import static oracle.kubernetes.operator.calls.AsyncRequestStep.FIBER_TIMEOUT;
3840
import static oracle.kubernetes.operator.helpers.EventHelper.EventItem.NAMESPACE_WATCHING_STARTED;
3941
import static oracle.kubernetes.operator.helpers.NamespaceHelper.getOperatorNamespace;
4042
import static oracle.kubernetes.operator.logging.ThreadLoggingContext.setThreadContext;
@@ -161,7 +163,14 @@ private NamespaceListResponseStep() {
161163
protected NextAction onFailureNoRetry(Packet packet, CallResponse<V1NamespaceList> callResponse) {
162164
return useBackupStrategy(callResponse)
163165
? doNext(createStartNamespacesStep(Namespaces.getConfiguredDomainNamespaces()), packet)
164-
: super.onFailureNoRetry(packet, callResponse);
166+
: onFailureNoRetryCheckForGone(packet, callResponse);
167+
}
168+
169+
protected NextAction onFailureNoRetryCheckForGone(Packet packet, CallResponse<V1NamespaceList> callResponse) {
170+
if (Optional.ofNullable(callResponse).map(CallResponse::getStatusCode).orElse(FIBER_TIMEOUT) == HTTP_GONE) {
171+
return doEnd(packet);
172+
}
173+
return super.onFailureNoRetry(packet, callResponse);
165174
}
166175

167176
// Returns true if the failure wasn't due to authorization, and we have a list of namespaces to manage.
@@ -172,12 +181,13 @@ private boolean useBackupStrategy(CallResponse<V1NamespaceList> callResponse) {
172181
@Override
173182
public NextAction onSuccess(Packet packet, CallResponse<V1NamespaceList> callResponse) {
174183
final Set<String> namespacesToStart = getNamespacesToStart(callResponse.getResult());
175-
Namespaces.getFoundDomainNamespaces(packet).addAll(namespacesToStart);
184+
Collection<String> foundDomainNamespaces = Namespaces.getFoundDomainNamespaces(packet);
185+
foundDomainNamespaces.addAll(namespacesToStart);
176186

177-
return doContinueListOrNext(callResponse, packet, createNextSteps(namespacesToStart));
187+
return doContinueListOrNext(callResponse, packet, () -> createNextSteps(foundDomainNamespaces));
178188
}
179189

180-
private Step createNextSteps(Set<String> namespacesToStartNow) {
190+
private Step createNextSteps(Collection<String> namespacesToStartNow) {
181191
if (!namespacesToStartNow.isEmpty()) {
182192
List<Step> nextSteps = new ArrayList<>();
183193
nextSteps.add(createStartNamespacesStep(namespacesToStartNow));
@@ -190,7 +200,7 @@ private Step createNextSteps(Set<String> namespacesToStartNow) {
190200
return current;
191201
}
192202

193-
private Step createNamespaceReviewStep(Set<String> namespacesToStartNow) {
203+
private Step createNamespaceReviewStep(Collection<String> namespacesToStartNow) {
194204
return RunInParallel.perNamespace(namespacesToStartNow, DomainRecheck.this::createNamespaceReview);
195205
}
196206

operator/src/main/java/oracle/kubernetes/operator/JobWatcher.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
// Copyright (c) 2018, 2023, Oracle and/or its affiliates.
1+
// Copyright (c) 2018, 2025, Oracle and/or its affiliates.
22
// Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl.
33

44
package oracle.kubernetes.operator;
@@ -317,7 +317,7 @@ public NextAction onSuccess(Packet packet, CallResponse<V1PodList> callResponse)
317317

318318
if (jobPod == null) {
319319
terminationState.remove(packet);
320-
return doContinueListOrNext(callResponse, packet, getNext());
320+
return doContinueListOrNext(callResponse, packet);
321321
} else {
322322
terminationState.setFromPod(jobPod);
323323
return doNext(getNext(), packet);

operator/src/main/java/oracle/kubernetes/operator/OperatorMain.java

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
// Copyright (c) 2017, 2023, Oracle and/or its affiliates.
1+
// Copyright (c) 2017, 2025, Oracle and/or its affiliates.
22
// Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl.
33

44
package oracle.kubernetes.operator;
@@ -13,9 +13,11 @@
1313
import java.time.OffsetDateTime;
1414
import java.util.Collection;
1515
import java.util.Collections;
16+
import java.util.List;
1617
import java.util.Optional;
1718
import java.util.Properties;
1819
import java.util.concurrent.ScheduledExecutorService;
20+
import java.util.concurrent.ScheduledFuture;
1921
import java.util.concurrent.ThreadFactory;
2022
import java.util.concurrent.TimeUnit;
2123
import java.util.concurrent.atomic.AtomicBoolean;
@@ -301,10 +303,11 @@ void completeBegin() {
301303
// start periodic retry and recheck
302304
int recheckInterval = TuningParameters.getInstance().getDomainNamespaceRecheckIntervalSeconds();
303305
int stuckPodInterval = TuningParameters.getInstance().getStuckPodRecheckSeconds();
304-
mainDelegate.scheduleWithFixedDelay(recheckDomains(), recheckInterval, recheckInterval, TimeUnit.SECONDS);
305-
mainDelegate.scheduleWithFixedDelay(checkStuckPods(), stuckPodInterval, stuckPodInterval, TimeUnit.SECONDS);
306+
Collection<ScheduledFuture<?>> futures = List.of(
307+
mainDelegate.scheduleWithFixedDelay(recheckDomains(), recheckInterval, recheckInterval, TimeUnit.SECONDS),
308+
mainDelegate.scheduleWithFixedDelay(checkStuckPods(), stuckPodInterval, stuckPodInterval, TimeUnit.SECONDS));
306309

307-
markReadyAndStartLivenessThread();
310+
markReadyAndStartLivenessThread(futures);
308311

309312
} catch (Throwable e) {
310313
LOGGER.warning(MessageKeys.EXCEPTION, e);

operator/src/main/java/oracle/kubernetes/operator/WebhookMain.java

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,13 @@
1-
// Copyright (c) 2022, 2023, Oracle and/or its affiliates.
1+
// Copyright (c) 2022, 2025, Oracle and/or its affiliates.
22
// Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl.
33

44
package oracle.kubernetes.operator;
55

6+
import java.util.List;
67
import java.util.Optional;
78
import java.util.Properties;
89
import java.util.concurrent.ScheduledExecutorService;
10+
import java.util.concurrent.ScheduledFuture;
911
import java.util.concurrent.TimeUnit;
1012
import java.util.concurrent.atomic.AtomicInteger;
1113

@@ -134,9 +136,10 @@ void completeBegin() {
134136

135137
// start periodic recheck of CRD
136138
int recheckInterval = TuningParameters.getInstance().getDomainNamespaceRecheckIntervalSeconds();
137-
delegate.scheduleWithFixedDelay(recheckCrd(), recheckInterval, recheckInterval, TimeUnit.SECONDS);
139+
ScheduledFuture<?> future =
140+
delegate.scheduleWithFixedDelay(recheckCrd(), recheckInterval, recheckInterval, TimeUnit.SECONDS);
138141

139-
markReadyAndStartLivenessThread();
142+
markReadyAndStartLivenessThread(List.of(future));
140143

141144
} catch (Exception e) {
142145
LOGGER.warning(MessageKeys.EXCEPTION, e);

operator/src/main/java/oracle/kubernetes/operator/helpers/JobHelper.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
// Copyright (c) 2018, 2023, Oracle and/or its affiliates.
1+
// Copyright (c) 2018, 2025, Oracle and/or its affiliates.
22
// Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl.
33

44
package oracle.kubernetes.operator.helpers;
@@ -818,7 +818,7 @@ public NextAction onSuccess(Packet packet, CallResponse<V1PodList> callResponse)
818818
.orElse(null);
819819

820820
if (jobPod == null) {
821-
return doContinueListOrNext(callResponse, packet, processIntrospectorPodLog(getNext()));
821+
return doContinueListOrNext(callResponse, packet, () -> processIntrospectorPodLog(getNext()));
822822
} else if (hasImagePullError(jobPod) || initContainersHaveImagePullError(jobPod)) {
823823
return doNext(cleanUpAndReintrospect(getNext()), packet);
824824
} else if (isJobPodTimedOut(jobPod)) {

operator/src/main/java/oracle/kubernetes/operator/helpers/ResponseStep.java

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
// Copyright (c) 2018, 2024, Oracle and/or its affiliates.
1+
// Copyright (c) 2018, 2025, Oracle and/or its affiliates.
22
// Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl.
33

44
package oracle.kubernetes.operator.helpers;
@@ -7,6 +7,7 @@
77
import java.net.SocketTimeoutException;
88
import java.util.Collections;
99
import java.util.Optional;
10+
import java.util.function.Supplier;
1011
import java.util.stream.Collectors;
1112
import javax.annotation.Nonnull;
1213

@@ -140,7 +141,7 @@ private NextAction getPotentialRetryAction(Packet packet) {
140141
* @return Next action for list continue
141142
*/
142143
protected final NextAction doContinueListOrNext(CallResponse<T> callResponse, Packet packet) {
143-
return doContinueListOrNext(callResponse, packet, getNext());
144+
return doContinueListOrNext(callResponse, packet, this::getNext);
144145
}
145146

146147
/**
@@ -149,10 +150,10 @@ protected final NextAction doContinueListOrNext(CallResponse<T> callResponse, Pa
149150
*
150151
* @param callResponse Call response
151152
* @param packet Packet
152-
* @param next Next step, if no continuation
153+
* @param next Supplier of next step, if no continuation
153154
* @return Next action for list continue
154155
*/
155-
protected final NextAction doContinueListOrNext(CallResponse<T> callResponse, Packet packet, Step next) {
156+
protected final NextAction doContinueListOrNext(CallResponse<T> callResponse, Packet packet, Supplier<Step> next) {
156157
String cont = accessContinue(callResponse.getResult());
157158
if (cont != null) {
158159
packet.put(CONTINUE, cont);
@@ -161,9 +162,9 @@ protected final NextAction doContinueListOrNext(CallResponse<T> callResponse, Pa
161162
return resetRetryStrategyAndReinvokeRequest(packet);
162163
}
163164
if (callResponse.getResult() instanceof KubernetesListObject kubernetesListObject) {
164-
return doNext(next, packet).withDebugComment(kubernetesListObject, this::toComment);
165+
return doNext(next.get(), packet).withDebugComment(kubernetesListObject, this::toComment);
165166
} else {
166-
return doNext(next, packet);
167+
return doNext(next.get(), packet);
167168
}
168169
}
169170

operator/src/test/java/oracle/kubernetes/operator/DeploymentLivenessTest.java

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
// Copyright (c) 2022, Oracle and/or its affiliates.
1+
// Copyright (c) 2022, 2025, Oracle and/or its affiliates.
22
// Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl.
33

44
package oracle.kubernetes.operator;
@@ -8,6 +8,7 @@
88
import java.nio.file.Files;
99
import java.nio.file.Path;
1010
import java.util.ArrayList;
11+
import java.util.Collections;
1112
import java.util.Comparator;
1213
import java.util.List;
1314
import java.util.logging.Level;
@@ -56,7 +57,7 @@ public void tearDown() throws Exception {
5657

5758
@Test
5859
void whenNoExistingLivenessFile_fileCreated() {
59-
DeploymentLiveness deploymentLiveness = new DeploymentLiveness(coreDelegate);
60+
DeploymentLiveness deploymentLiveness = new DeploymentLiveness(Collections.emptyList(), coreDelegate);
6061
deploymentLiveness.run();
6162

6263
File aliveFile = new File(coreDelegate.probesHome, ".alive");
@@ -72,7 +73,7 @@ void whenExistingLivenessFile_onlyLogLastModifiedUpdated() throws IOException {
7273
File aliveFile = new File(coreDelegate.probesHome, ".alive");
7374
assertTrue(aliveFile.createNewFile());
7475

75-
DeploymentLiveness deploymentLiveness = new DeploymentLiveness(coreDelegate);
76+
DeploymentLiveness deploymentLiveness = new DeploymentLiveness(Collections.emptyList(), coreDelegate);
7677
deploymentLiveness.run();
7778

7879
assertThat(coreDelegate.probesHome, anExistingDirectory());
@@ -86,7 +87,7 @@ void whenExistingLivenessFile_onlyLogLastModifiedUpdated() throws IOException {
8687
void whenCantCreateLivenessFile_logWarning() throws IOException {
8788
assertTrue(coreDelegate.probesHome.setWritable(false, false));
8889

89-
DeploymentLiveness deploymentLiveness = new DeploymentLiveness(coreDelegate);
90+
DeploymentLiveness deploymentLiveness = new DeploymentLiveness(Collections.emptyList(), coreDelegate);
9091
deploymentLiveness.run();
9192

9293
assertThat(coreDelegate.probesHome, anExistingDirectory());

0 commit comments

Comments
 (0)