Skip to content

Commit 90b48ba

Browse files
authored
OWLS-89106 - Potential fix for pod startup issue in GBU CNE environment after node drain/repave operation (#2398)
* Fix for pod startup issue after node drain/repave operation.
1 parent cfa4486 commit 90b48ba

File tree

13 files changed

+285
-46
lines changed

13 files changed

+285
-46
lines changed

operator/src/main/java/oracle/kubernetes/operator/DomainProcessorImpl.java

Lines changed: 17 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -746,7 +746,7 @@ private void addServerToMaps(Map<String, ServerHealth> serverHealthMap,
746746
*/
747747
class MakeRightDomainOperationImpl implements MakeRightDomainOperation {
748748

749-
private final DomainPresenceInfo liveInfo;
749+
private DomainPresenceInfo liveInfo;
750750
private boolean explicitRecheck;
751751
private boolean deleting;
752752
private boolean willInterrupt;
@@ -851,6 +851,22 @@ public void setInspectionRun() {
851851
inspectionRun = true;
852852
}
853853

854+
/**
 * Replaces the domain presence info held by this make-right operation.
 *
 * @param info the presence info to store in {@code liveInfo}; no null check is performed here
 */
@Override
855+
public void setLiveInfo(DomainPresenceInfo info) {
856+
this.liveInfo = info;
857+
}
858+
859+
@Override
860+
public void clear() {
861+
this.liveInfo = null;
862+
this.eventData = null;
863+
this.explicitRecheck = false;
864+
this.deleting = false;
865+
this.willInterrupt = false;
866+
this.inspectionRun = false;
867+
}
868+
869+
854870
@Override
855871
public boolean wasInspectionRun() {
856872
return inspectionRun;

operator/src/main/java/oracle/kubernetes/operator/JobWatcher.java

Lines changed: 24 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -12,6 +12,7 @@
1212
import java.util.Optional;
1313
import java.util.concurrent.ConcurrentHashMap;
1414
import java.util.concurrent.ThreadFactory;
15+
import java.util.concurrent.TimeUnit;
1516
import java.util.concurrent.atomic.AtomicBoolean;
1617
import java.util.function.Consumer;
1718
import javax.annotation.Nonnull;
@@ -25,13 +26,16 @@
2526
import io.kubernetes.client.util.Watchable;
2627
import oracle.kubernetes.operator.TuningParameters.WatchTuning;
2728
import oracle.kubernetes.operator.builders.WatchBuilder;
29+
import oracle.kubernetes.operator.calls.CallResponse;
2830
import oracle.kubernetes.operator.helpers.CallBuilder;
2931
import oracle.kubernetes.operator.helpers.KubernetesUtils;
3032
import oracle.kubernetes.operator.helpers.ResponseStep;
3133
import oracle.kubernetes.operator.logging.LoggingFacade;
3234
import oracle.kubernetes.operator.logging.LoggingFactory;
3335
import oracle.kubernetes.operator.logging.MessageKeys;
36+
import oracle.kubernetes.operator.steps.DefaultResponseStep;
3437
import oracle.kubernetes.operator.watcher.WatchListener;
38+
import oracle.kubernetes.operator.work.NextAction;
3539
import oracle.kubernetes.operator.work.Packet;
3640
import oracle.kubernetes.operator.work.Step;
3741
import oracle.kubernetes.utils.SystemClock;
@@ -243,6 +247,11 @@ boolean isReady(V1Job job) {
243247
return isComplete(job) || isFailed(job);
244248
}
245249

250+
/**
 * Job variant of the "cached resource exists but explicit read found nothing" hook.
 * Always returns {@code false}; both arguments are ignored.
 *
 * @param cachedJob the cached job (unused)
 * @param isNotFoundOnRead whether the explicit read returned no job (unused)
 * @return {@code false} unconditionally
 */
@Override
251+
boolean onReadNotFoundForCachedResource(V1Job cachedJob, boolean isNotFoundOnRead) {
252+
return false;
253+
}
254+
246255
// Ignore modified callbacks from different jobs (identified by having different creation times) or those
247256
// where the job is not yet ready.
248257
@Override
@@ -302,6 +311,21 @@ Throwable createTerminationException(V1Job job) {
302311
void logWaiting(String name) {
303312
// Logs at FINE level using the WAITING_FOR_JOB_READY message key, with the given name as its argument.
LOGGER.fine(MessageKeys.WAITING_FOR_JOB_READY, name);
304313
}
314+
315+
@Override
316+
protected DefaultResponseStep<V1Job> resumeIfReady(Callback callback) {
317+
return new DefaultResponseStep<>(null) {
318+
@Override
319+
public NextAction onSuccess(Packet packet, CallResponse<V1Job> callResponse) {
320+
if (isReady(callResponse.getResult()) || callback.didResumeFiber()) {
321+
callback.proceedFromWait(callResponse.getResult());
322+
return doNext(packet);
323+
}
324+
return doDelay(createReadAndIfReadyCheckStep(callback), packet,
325+
getWatchBackstopRecheckDelaySeconds(), TimeUnit.SECONDS);
326+
}
327+
};
328+
}
305329
}
306330

307331
static class DeadlineExceededException extends Exception {

operator/src/main/java/oracle/kubernetes/operator/MakeRightDomainOperation.java

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -36,6 +36,10 @@ public interface MakeRightDomainOperation {
3636

3737
void setInspectionRun();
3838

39+
/**
 * Sets the domain presence info that this make-right operation holds as its live info.
 *
 * @param info the domain presence info to use
 */
void setLiveInfo(DomainPresenceInfo info);
40+
41+
/**
 * Clears the state held by this make-right operation.
 * NOTE(review): the exact set of fields reset is defined by the implementation - confirm there.
 */
void clear();
42+
3943
boolean wasInspectionRun();
4044

4145
private static boolean wasInspectionRun(Packet packet) {

operator/src/main/java/oracle/kubernetes/operator/PodWatcher.java

Lines changed: 85 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -11,6 +11,7 @@
1111
import java.util.Map;
1212
import java.util.Optional;
1313
import java.util.concurrent.ThreadFactory;
14+
import java.util.concurrent.TimeUnit;
1415
import java.util.concurrent.atomic.AtomicBoolean;
1516
import java.util.function.Consumer;
1617
import javax.annotation.Nonnull;
@@ -28,17 +29,26 @@
2829
import io.kubernetes.client.util.Watchable;
2930
import oracle.kubernetes.operator.TuningParameters.WatchTuning;
3031
import oracle.kubernetes.operator.builders.WatchBuilder;
32+
import oracle.kubernetes.operator.calls.CallResponse;
3133
import oracle.kubernetes.operator.helpers.CallBuilder;
34+
import oracle.kubernetes.operator.helpers.DomainPresenceInfo;
3235
import oracle.kubernetes.operator.helpers.KubernetesUtils;
3336
import oracle.kubernetes.operator.helpers.LegalNames;
3437
import oracle.kubernetes.operator.helpers.PodHelper;
3538
import oracle.kubernetes.operator.helpers.ResponseStep;
3639
import oracle.kubernetes.operator.logging.LoggingFacade;
3740
import oracle.kubernetes.operator.logging.LoggingFactory;
3841
import oracle.kubernetes.operator.logging.MessageKeys;
42+
import oracle.kubernetes.operator.steps.DefaultResponseStep;
3943
import oracle.kubernetes.operator.watcher.WatchListener;
44+
import oracle.kubernetes.operator.work.NextAction;
45+
import oracle.kubernetes.operator.work.Packet;
4046
import oracle.kubernetes.operator.work.Step;
4147

48+
import static oracle.kubernetes.operator.ProcessingConstants.SERVER_NAME;
49+
import static oracle.kubernetes.operator.logging.MessageKeys.EXECUTE_MAKE_RIGHT_DOMAIN;
50+
import static oracle.kubernetes.operator.logging.MessageKeys.LOG_WAITING_COUNT;
51+
4252
/**
4353
* Watches for changes to pods.
4454
*/
@@ -305,6 +315,8 @@ public Step waitForDelete(V1Pod pod, Step next) {
305315

306316
private abstract static class WaitForPodStatusStep extends WaitForReadyStep<V1Pod> {
307317

318+
public static final int RECHECK_DEBUG_COUNT = 10;
319+
308320
// Passes the pod and the next step straight through to the WaitForReadyStep constructor.
private WaitForPodStatusStep(V1Pod pod, Step next) {
309321
super(pod, next);
310322
}
@@ -322,6 +334,67 @@ V1ObjectMeta getMetadata(V1Pod pod) {
322334
Step createReadAsyncStep(String name, String namespace, String domainUid, ResponseStep<V1Pod> responseStep) {
323335
return new CallBuilder().readPodAsync(name, namespace, domainUid, responseStep);
324336
}
337+
338+
protected DefaultResponseStep<V1Pod> resumeIfReady(Callback callback) {
339+
return new DefaultResponseStep<>(getNext()) {
340+
@Override
341+
public NextAction onSuccess(Packet packet, CallResponse<V1Pod> callResponse) {
342+
343+
DomainPresenceInfo info = packet.getSpi(DomainPresenceInfo.class);
344+
String serverName = (String)packet.get(SERVER_NAME);
345+
String resource = initialResource == null ? resourceName : getMetadata(initialResource).getName();
346+
if ((info != null) && (callResponse != null)) {
347+
Optional.ofNullable(callResponse.getResult()).ifPresent(result ->
348+
info.setServerPodFromEvent(getPodLabel(result), result));
349+
if (onReadNotFoundForCachedResource(getServerPod(info, serverName), isNotFoundOnRead(callResponse))) {
350+
LOGGER.fine(EXECUTE_MAKE_RIGHT_DOMAIN, serverName, callback.getRecheckCount());
351+
removeCallback(resource, callback);
352+
return doNext(NEXT_STEP_FACTORY.createMakeDomainRightStep(callback, info, getNext()), packet);
353+
}
354+
}
355+
356+
if (isReady(callResponse.getResult()) || callback.didResumeFiber()) {
357+
callback.proceedFromWait(callResponse.getResult());
358+
return null;
359+
}
360+
361+
if (shouldWait()) {
362+
if ((callback.getRecheckCount() % RECHECK_DEBUG_COUNT) == 0) {
363+
LOGGER.fine(LOG_WAITING_COUNT, serverName, callback.getRecheckCount());
364+
}
365+
// Watch backstop recheck count is less than or equal to the configured recheck count, delay.
366+
return doDelay(createReadAndIfReadyCheckStep(callback), packet,
367+
getWatchBackstopRecheckDelaySeconds(), TimeUnit.SECONDS);
368+
} else {
369+
LOGGER.fine(EXECUTE_MAKE_RIGHT_DOMAIN, serverName, callback.getRecheckCount());
370+
removeCallback(resource, callback);
371+
// Watch backstop recheck count is more than configured recheck count, proceed to make-right step.
372+
return doNext(NEXT_STEP_FACTORY.createMakeDomainRightStep(callback, info, getNext()), packet);
373+
}
374+
}
375+
376+
private String getPodLabel(V1Pod pod) {
377+
return Optional.ofNullable(pod)
378+
.map(V1Pod::getMetadata)
379+
.map(V1ObjectMeta::getLabels)
380+
.map(m -> m.get(LabelConstants.SERVERNAME_LABEL))
381+
.orElse(null);
382+
}
383+
384+
private V1Pod getServerPod(DomainPresenceInfo info, String serverName) {
385+
return Optional.ofNullable(serverName).map(info::getServerPod).orElse(null);
386+
}
387+
388+
private boolean isNotFoundOnRead(CallResponse callResponse) {
389+
return callResponse.getResult() == null;
390+
}
391+
392+
private boolean shouldWait() {
393+
return callback.incrementAndGetRecheckCount() <= getWatchBackstopRecheckCount();
394+
}
395+
};
396+
}
397+
325398
}
326399

327400
private class WaitForPodReadyStep extends WaitForPodStatusStep {
@@ -360,13 +433,25 @@ protected void removeCallback(String podName, Consumer<V1Pod> callback) {
360433
protected void logWaiting(String name) {
361434
// Logs at FINE level using the WAITING_FOR_POD_READY message key, with the given name as its argument.
LOGGER.fine(MessageKeys.WAITING_FOR_POD_READY, name);
362435
}
436+
437+
@Override
438+
protected boolean onReadNotFoundForCachedResource(V1Pod cachedPod, boolean isNotFoundOnRead) {
439+
// Return true if cached pod is not null but pod not found in explicit read, false otherwise.
440+
return (cachedPod != null) && isNotFoundOnRead;
441+
}
442+
363443
}
364444

365445
private class WaitForPodDeleteStep extends WaitForPodStatusStep {
366446
// Passes the pod and the next step straight through to the WaitForPodStatusStep constructor.
private WaitForPodDeleteStep(V1Pod pod, Step next) {
367447
super(pod, next);
368448
}
369449

450+
/**
 * Delete-wait variant of the "cached pod exists but explicit read found nothing" hook.
 * Always returns {@code false}; both arguments are ignored, so a not-found read never
 * triggers the cached-resource mismatch handling while waiting for deletion.
 *
 * @param cachedPod the cached pod (unused)
 * @param isNotFoundOnRead whether the explicit read returned no pod (unused)
 * @return {@code false} unconditionally
 */
@Override
451+
protected boolean onReadNotFoundForCachedResource(V1Pod cachedPod, boolean isNotFoundOnRead) {
452+
return false;
453+
}
454+
370455
// A pod is considered deleted when reading its value from Kubernetes returns null.
371456
@Override
372457
protected boolean isReady(V1Pod result) {

operator/src/main/java/oracle/kubernetes/operator/TuningParameters.java

Lines changed: 4 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -194,17 +194,20 @@ class WatchTuning {
194194
public final int watchLifetime;
195195
public final int watchMinimumDelay;
196196
public final int watchBackstopRecheckDelay;
197+
public final int watchBackstopRecheckCount;
197198

198199
/**
199200
* Create watch tuning.
200201
* @param watchLifetime Watch lifetime
201202
* @param watchMinimumDelay Minimum delay before accepting new events to prevent hot loops
202203
* @param watchBackstopRecheckDelay Recheck delay for get while waiting for a status to backstop missed watch events
* @param watchBackstopRecheckCount Recheck count for the backstop get while waiting for a status
*     (presumably the maximum number of rechecks before the wait is abandoned - confirm against the wait step)
203204
*/
204-
public WatchTuning(int watchLifetime, int watchMinimumDelay, int watchBackstopRecheckDelay) {
205+
public WatchTuning(int watchLifetime, int watchMinimumDelay, int watchBackstopRecheckDelay,
206+
int watchBackstopRecheckCount) {
205207
this.watchLifetime = watchLifetime;
206208
this.watchMinimumDelay = watchMinimumDelay;
207209
this.watchBackstopRecheckDelay = watchBackstopRecheckDelay;
210+
this.watchBackstopRecheckCount = watchBackstopRecheckCount;
208211
}
209212

210213
@Override

operator/src/main/java/oracle/kubernetes/operator/TuningParametersImpl.java

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -75,7 +75,8 @@ private void update() {
7575
new WatchTuning(
7676
(int) readTuningParameter("watchLifetime", 300),
7777
(int) readTuningParameter("watchMinimumDelay", 5),
78-
(int) readTuningParameter("watchBackstopRecheckDelaySeconds", 5));
78+
(int) readTuningParameter("watchBackstopRecheckDelaySeconds", 5),
79+
(int) readTuningParameter("watchBackstopRecheckCount", 60));
7980

8081
PodTuning pod =
8182
new PodTuning(

0 commit comments

Comments
 (0)