Skip to content

Commit ec2b601

Browse files
authored
Merge pull request #255 from oracle/feature/retry
Retry and recheck mechanism
2 parents 5aa8e63 + 34961a9 commit ec2b601

File tree

10 files changed

+133
-70
lines changed

10 files changed

+133
-70
lines changed

operator/src/main/java/oracle/kubernetes/operator/Main.java

Lines changed: 45 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -259,6 +259,14 @@ public void onThrowable(Packet packet, Throwable throwable) {
259259
}
260260
}
261261
}
262+
263+
// start periodic retry and recheck
264+
MainTuning main = tuningAndConfig.getMainTuning();
265+
engine.getExecutor().scheduleWithFixedDelay(() -> {
266+
for (DomainPresenceInfo info : domains.values()) {
267+
checkAndCreateDomainPresence(info, false);
268+
}
269+
}, main.domainPresenceRecheckIntervalSeconds, main.domainPresenceRecheckIntervalSeconds, TimeUnit.SECONDS);
262270
} catch (Throwable e) {
263271
LOGGER.warning(MessageKeys.EXCEPTION, e);
264272
} finally {
@@ -405,6 +413,7 @@ private static void doCheckAndCreateDomainPresence(Domain dom) {
405413
private static void doCheckAndCreateDomainPresence(Domain dom, boolean explicitRecheck) {
406414
doCheckAndCreateDomainPresence(dom, explicitRecheck, false, null, null);
407415
}
416+
408417

409418
private static void doCheckAndCreateDomainPresence(Domain dom, boolean explicitRecheck, boolean explicitRestartAdmin,
410419
List<String> explicitRestartServers, List<String> explicitRestartClusters) {
@@ -433,6 +442,31 @@ private static void doCheckAndCreateDomainPresence(Domain dom, boolean explicitR
433442
}
434443
info.setDomain(dom);
435444
}
445+
446+
if (explicitRestartAdmin) {
447+
LOGGER.info(MessageKeys.RESTART_ADMIN_STARTING, domainUID);
448+
info.getExplicitRestartAdmin().set(true);
449+
}
450+
if (explicitRestartServers != null) {
451+
LOGGER.info(MessageKeys.RESTART_SERVERS_STARTING, domainUID, explicitRestartServers);
452+
info.getExplicitRestartServers().addAll(explicitRestartServers);
453+
}
454+
if (explicitRestartClusters != null) {
455+
LOGGER.info(MessageKeys.ROLLING_CLUSTERS_STARTING, domainUID, explicitRestartClusters);
456+
info.getExplicitRestartClusters().addAll(explicitRestartClusters);
457+
}
458+
459+
checkAndCreateDomainPresence(info);
460+
}
461+
462+
private static void checkAndCreateDomainPresence(DomainPresenceInfo info) {
463+
checkAndCreateDomainPresence(info, true);
464+
}
465+
466+
private static void checkAndCreateDomainPresence(DomainPresenceInfo info, boolean isCausedByWatch) {
467+
Domain dom = info.getDomain();
468+
DomainSpec spec = dom.getSpec();
469+
String domainUID = spec.getDomainUID();
436470

437471
String ns = dom.getMetadata().getNamespace();
438472
if (initialized.getOrDefault(ns, Boolean.FALSE) && !stopping.get()) {
@@ -449,34 +483,10 @@ private static void doCheckAndCreateDomainPresence(Domain dom, boolean explicitR
449483
p.getComponents().put(ProcessingConstants.DOMAIN_COMPONENT_NAME, Component.createFor(info, version, pw));
450484
p.put(ProcessingConstants.PRINCIPAL, principal);
451485

452-
if (explicitRestartAdmin) {
453-
p.put(ProcessingConstants.EXPLICIT_RESTART_ADMIN, Boolean.TRUE);
454-
}
455-
p.put(ProcessingConstants.EXPLICIT_RESTART_SERVERS, explicitRestartServers);
456-
p.put(ProcessingConstants.EXPLICIT_RESTART_CLUSTERS, explicitRestartClusters);
457-
458-
if (explicitRestartAdmin) {
459-
LOGGER.info(MessageKeys.RESTART_ADMIN_STARTING, domainUID);
460-
}
461-
if (explicitRestartServers != null) {
462-
LOGGER.info(MessageKeys.RESTART_SERVERS_STARTING, domainUID, explicitRestartServers);
463-
}
464-
if (explicitRestartClusters != null) {
465-
LOGGER.info(MessageKeys.ROLLING_CLUSTERS_STARTING, domainUID, explicitRestartClusters);
466-
}
467-
468-
domainUpdaters.startFiber(domainUID, strategy, p, new CompletionCallback() {
486+
CompletionCallback cc = new CompletionCallback() {
469487
@Override
470488
public void onCompletion(Packet packet) {
471-
if (explicitRestartAdmin) {
472-
LOGGER.info(MessageKeys.RESTART_ADMIN_COMPLETE, domainUID);
473-
}
474-
if (explicitRestartServers != null) {
475-
LOGGER.info(MessageKeys.RESTART_SERVERS_COMPLETE, domainUID, explicitRestartServers);
476-
}
477-
if (explicitRestartClusters != null) {
478-
LOGGER.info(MessageKeys.ROLLING_CLUSTERS_COMPLETE, domainUID, explicitRestartClusters);
479-
}
489+
info.complete();
480490
}
481491

482492
@Override
@@ -496,9 +506,16 @@ public void onThrowable(Packet packet, Throwable throwable) {
496506
}
497507
});
498508

499-
// TODO: consider retrying domain update after a delay
509+
engine.getExecutor().schedule(() -> { checkAndCreateDomainPresence(info, false); },
510+
tuningAndConfig.getMainTuning().domainPresenceFailureRetrySeconds, TimeUnit.SECONDS);
500511
}
501-
});
512+
};
513+
514+
if (isCausedByWatch) {
515+
domainUpdaters.startFiber(domainUID, strategy, p, cc);
516+
} else {
517+
domainUpdaters.startFiberIfNoCurrentFiber(domainUID, strategy, p, cc);
518+
}
502519

503520
scheduleDomainStatusUpdating(info);
504521
}

operator/src/main/java/oracle/kubernetes/operator/ProcessingConstants.java

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -26,9 +26,6 @@ public interface ProcessingConstants {
2626
public static final String NETWORK_ACCESS_POINT = "nap";
2727

2828
public static final String SERVERS_TO_ROLL = "roll";
29-
public static final String EXPLICIT_RESTART_ADMIN = "explicitRestartAdmin";
30-
public static final String EXPLICIT_RESTART_SERVERS = "explicitRestartServers";
31-
public static final String EXPLICIT_RESTART_CLUSTERS = "explicitRestartClusters";
3229

3330
public static final String SCRIPT_CONFIG_MAP = "scriptConfigMap";
3431
public static final String SERVER_STATE_MAP = "serverStateMap";

operator/src/main/java/oracle/kubernetes/operator/TuningParameters.java

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,13 +19,18 @@ public static TuningParameters getInstance() {
1919
}
2020

2121
public static class MainTuning {
22+
public final int domainPresenceFailureRetrySeconds;
23+
public final int domainPresenceRecheckIntervalSeconds;
2224
public final int statusUpdateTimeoutSeconds;
2325
public final int unchangedCountToDelayStatusRecheck;
2426
public final long initialShortDelay;
2527
public final long eventualLongDelay;
2628

27-
public MainTuning(int statusUpdateTimeoutSeconds, int unchangedCountToDelayStatusRecheck,
29+
public MainTuning(int domainPresenceFailureRetrySeconds, int domainPresenceRecheckIntervalSeconds,
30+
int statusUpdateTimeoutSeconds, int unchangedCountToDelayStatusRecheck,
2831
long initialShortDelay, long eventualLongDelay) {
32+
this.domainPresenceFailureRetrySeconds = domainPresenceFailureRetrySeconds;
33+
this.domainPresenceRecheckIntervalSeconds = domainPresenceRecheckIntervalSeconds;
2934
this.statusUpdateTimeoutSeconds = statusUpdateTimeoutSeconds;
3035
this.unchangedCountToDelayStatusRecheck = unchangedCountToDelayStatusRecheck;
3136
this.initialShortDelay = initialShortDelay;

operator/src/main/java/oracle/kubernetes/operator/TuningParametersImpl.java

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,8 @@ private void update() {
4949
LOGGER.info(MessageKeys.TUNING_PARAMETERS);
5050

5151
MainTuning main = new MainTuning(
52+
(int) readTuningParameter("domainPresenceFailureRetrySeconds", 30),
53+
(int) readTuningParameter("domainPresenceRecheckIntervalSeconds", 300),
5254
(int) readTuningParameter("statusUpdateTimeoutSeconds", 10),
5355
(int) readTuningParameter("statusUpdateUnchangedCountToDelayStatusRecheck", 10),
5456
readTuningParameter("statusUpdateInitialShortDelay", 3),

operator/src/main/java/oracle/kubernetes/operator/calls/AsyncRequestStep.java

Lines changed: 3 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -86,7 +86,6 @@ public NextAction apply(Packet packet) {
8686
LOGGER.fine(MessageKeys.ASYNC_REQUEST, requestParams.call, requestParams.namespace, requestParams.name, requestParams.body, fieldSelector, labelSelector, resourceVersion);
8787

8888
AtomicBoolean didResume = new AtomicBoolean(false);
89-
AtomicBoolean didRecycle = new AtomicBoolean(false);
9089
ApiClient client = helper.take();
9190
return doSuspend((fiber) -> {
9291
ApiCallback<T> callback = new BaseApiCallback<T>() {
@@ -97,9 +96,7 @@ public void onFailure(ApiException e, int statusCode, Map<String, List<String>>
9796
LOGGER.info(MessageKeys.ASYNC_FAILURE, e, statusCode, responseHeaders, requestParams.call, requestParams.namespace, requestParams.name, requestParams.body, fieldSelector, labelSelector, resourceVersion);
9897
}
9998

100-
if (didRecycle.compareAndSet(false, true)) {
101-
helper.recycle(client);
102-
}
99+
helper.recycle(client);
103100
packet.getComponents().put(RESPONSE_COMPONENT_NAME, Component.createFor(RetryStrategy.class, _retry, new CallResponse<Void>(null, e, statusCode, responseHeaders)));
104101
fiber.resume(packet);
105102
}
@@ -110,24 +107,18 @@ public void onSuccess(T result, int statusCode, Map<String, List<String>> respon
110107
if (didResume.compareAndSet(false, true)) {
111108
LOGGER.fine(MessageKeys.ASYNC_SUCCESS, result, statusCode, responseHeaders);
112109

113-
if (didRecycle.compareAndSet(false, true)) {
114-
helper.recycle(client);
115-
}
110+
helper.recycle(client);
116111
packet.getComponents().put(RESPONSE_COMPONENT_NAME, Component.createFor(new CallResponse<>(result, null, statusCode, responseHeaders)));
117112
fiber.resume(packet);
118113
}
119114
}
120115
};
121-
116+
122117
try {
123118
CancellableCall c = factory.generate(requestParams, client, _continue, callback);
124119

125120
// timeout handling
126121
fiber.owner.getExecutor().schedule(() -> {
127-
if (didRecycle.compareAndSet(false, true)) {
128-
// don't recycle on timeout because state is unknown
129-
// usage.recycle();
130-
}
131122
if (didResume.compareAndSet(false, true)) {
132123
try {
133124
c.cancel();
@@ -140,10 +131,6 @@ public void onSuccess(T result, int statusCode, Map<String, List<String>> respon
140131
}, timeoutSeconds, TimeUnit.SECONDS);
141132
} catch (Throwable t) {
142133
LOGGER.warning(MessageKeys.ASYNC_FAILURE, t, 0, null, requestParams, requestParams.namespace, requestParams.name, requestParams.body, fieldSelector, labelSelector, resourceVersion);
143-
if (didRecycle.compareAndSet(false, true)) {
144-
// don't recycle on throwable because state is unknown
145-
// usage.recycle();
146-
}
147134
if (didResume.compareAndSet(false, true)) {
148135
packet.getComponents().put(RESPONSE_COMPONENT_NAME, Component.createFor(RetryStrategy.class, _retry));
149136
fiber.resume(packet);

operator/src/main/java/oracle/kubernetes/operator/helpers/DomainPresenceInfo.java

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,9 +5,12 @@
55

66
import java.util.Collection;
77
import java.util.List;
8+
import java.util.Set;
89
import java.util.concurrent.ConcurrentHashMap;
910
import java.util.concurrent.ConcurrentMap;
11+
import java.util.concurrent.CopyOnWriteArraySet;
1012
import java.util.concurrent.ScheduledFuture;
13+
import java.util.concurrent.atomic.AtomicBoolean;
1114
import java.util.concurrent.atomic.AtomicReference;
1215

1316
import org.joda.time.DateTime;
@@ -37,11 +40,16 @@ public class DomainPresenceInfo {
3740
private final ConcurrentMap<String, ServerKubernetesObjects> servers = new ConcurrentHashMap<>();
3841
private final ConcurrentMap<String, V1Service> clusters = new ConcurrentHashMap<>();
3942
private final ConcurrentMap<String, V1beta1Ingress> ingresses = new ConcurrentHashMap<>();
43+
44+
private final AtomicBoolean explicitRestartAdmin = new AtomicBoolean(false);
45+
private final Set<String> explicitRestartServers = new CopyOnWriteArraySet<>();
46+
private final Set<String> explicitRestartClusters = new CopyOnWriteArraySet<>();
4047

4148
private V1PersistentVolumeClaimList claims = null;
4249

4350
private WlsDomainConfig domainConfig;
4451
private DateTime lastScanTime;
52+
private DateTime lastCompletionTime;
4553

4654
/**
4755
* Create presence for a domain
@@ -113,6 +121,21 @@ public void setLastScanTime(DateTime lastScanTime) {
113121
this.lastScanTime = lastScanTime;
114122
}
115123

124+
/**
125+
* Last completion time
126+
* @return Last completion time
127+
*/
128+
public DateTime getLastCompletionTime() {
129+
return lastCompletionTime;
130+
}
131+
132+
/**
133+
* Sets the last completion time to now
134+
*/
135+
public void complete() {
136+
this.lastCompletionTime = new DateTime();
137+
}
138+
116139
/**
117140
* Gets the domain. Except the instance to change frequently based on status updates
118141
* @return Domain
@@ -161,6 +184,30 @@ public ConcurrentMap<String, V1beta1Ingress> getIngresses() {
161184
return ingresses;
162185
}
163186

187+
/**
188+
* Control for if domain has outstanding restart admin server pending
189+
* @return Control for pending admin server restart
190+
*/
191+
public AtomicBoolean getExplicitRestartAdmin() {
192+
return explicitRestartAdmin;
193+
}
194+
195+
/**
196+
* Control list for outstanding server restarts
197+
* @return Control list for outstanding server restarts
198+
*/
199+
public Set<String> getExplicitRestartServers() {
200+
return explicitRestartServers;
201+
}
202+
203+
/**
204+
* Control list for outstanding cluster restarts
205+
* @return Control list for outstanding cluster restarts
206+
*/
207+
public Set<String> getExplicitRestartClusters() {
208+
return explicitRestartClusters;
209+
}
210+
164211
/**
165212
* Server objects (Pods and Services) for admin server
166213
* @return Server objects for admin server

operator/src/main/java/oracle/kubernetes/operator/helpers/PodHelper.java

Lines changed: 17 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -91,13 +91,8 @@ public NextAction apply(Packet packet) {
9191

9292
DomainPresenceInfo info = packet.getSPI(DomainPresenceInfo.class);
9393

94-
Boolean explicitRestartAdmin = (Boolean) packet.get(ProcessingConstants.EXPLICIT_RESTART_ADMIN);
95-
@SuppressWarnings("unchecked")
96-
List<String> explicitRestartServers = (List<String>) packet.get(ProcessingConstants.EXPLICIT_RESTART_SERVERS);
97-
9894
boolean isExplicitRestartThisServer =
99-
(Boolean.TRUE.equals(explicitRestartAdmin)) ||
100-
(explicitRestartServers != null && explicitRestartServers.contains(asName));
95+
info.getExplicitRestartAdmin().get() || info.getExplicitRestartServers().contains(asName);
10196

10297
ServerKubernetesObjects sko = skoFactory.getOrCreate(info, asName);
10398

@@ -116,6 +111,8 @@ public NextAction onFailure(Packet packet, ApiException e, int statusCode,
116111
public NextAction onSuccess(Packet packet, V1Pod result, int statusCode,
117112
Map<String, List<String>> responseHeaders) {
118113
if (result == null) {
114+
info.getExplicitRestartAdmin().set(false);
115+
info.getExplicitRestartServers().remove(asName);
119116
Step create = factory.create().createPodAsync(namespace, adminPod, new ResponseStep<V1Pod>(next) {
120117
@Override
121118
public NextAction onFailure(Packet packet, ApiException e, int statusCode,
@@ -145,7 +142,7 @@ public NextAction onSuccess(Packet packet, V1Pod result, int statusCode,
145142
Step replace = new CyclePodStep(
146143
AdminPodStep.this,
147144
podName, namespace, adminPod, MessageKeys.ADMIN_POD_REPLACED,
148-
weblogicDomainUID, asName, sko, next);
145+
weblogicDomainUID, asName, info, sko, next);
149146
return doNext(replace, packet);
150147
}
151148
}
@@ -332,9 +329,12 @@ private static class CyclePodStep extends Step {
332329
private final String messageKey;
333330
private final String weblogicDomainUID;
334331
private final String serverName;
332+
private final DomainPresenceInfo info;
335333
private final ServerKubernetesObjects sko;
336334

337-
public CyclePodStep(Step conflictStep, String podName, String namespace, V1Pod newPod, String messageKey, String weblogicDomainUID, String serverName, ServerKubernetesObjects sko, Step next) {
335+
public CyclePodStep(Step conflictStep, String podName, String namespace, V1Pod newPod,
336+
String messageKey, String weblogicDomainUID, String serverName, DomainPresenceInfo info,
337+
ServerKubernetesObjects sko, Step next) {
338338
super(next);
339339
this.conflictStep = conflictStep;
340340
this.podName = podName;
@@ -343,6 +343,7 @@ public CyclePodStep(Step conflictStep, String podName, String namespace, V1Pod n
343343
this.messageKey = messageKey;
344344
this.weblogicDomainUID = weblogicDomainUID;
345345
this.serverName = serverName;
346+
this.info = info;
346347
this.sko = sko;
347348
}
348349

@@ -365,6 +366,10 @@ public NextAction onFailure(Packet packet, ApiException e, int statusCode,
365366
@Override
366367
public NextAction onSuccess(Packet packet, V1Status result, int statusCode,
367368
Map<String, List<String>> responseHeaders) {
369+
if (conflictStep instanceof AdminPodStep) {
370+
info.getExplicitRestartAdmin().set(false);
371+
}
372+
info.getExplicitRestartServers().contains(serverName);
368373
Step create = factory.create().createPodAsync(namespace, newPod, new ResponseStep<V1Pod>(next) {
369374
@Override
370375
public NextAction onFailure(Packet packet, ApiException e, int statusCode,
@@ -497,14 +502,9 @@ public NextAction apply(Packet packet) {
497502

498503
DomainPresenceInfo info = packet.getSPI(DomainPresenceInfo.class);
499504

500-
@SuppressWarnings("unchecked")
501-
List<String> explicitRestartServers = (List<String>) packet.get(ProcessingConstants.EXPLICIT_RESTART_SERVERS);
502-
@SuppressWarnings("unchecked")
503-
List<String> explicitRestartClusters = (List<String>) packet.get(ProcessingConstants.EXPLICIT_RESTART_CLUSTERS);
504-
505505
boolean isExplicitRestartThisServer =
506-
(explicitRestartServers != null && explicitRestartServers.contains(weblogicServerName)) ||
507-
(explicitRestartClusters != null && weblogicClusterName != null && explicitRestartClusters.contains(weblogicClusterName));
506+
info.getExplicitRestartServers().contains(weblogicServerName) ||
507+
(weblogicClusterName != null && info.getExplicitRestartClusters().contains(weblogicClusterName));
508508

509509
ServerKubernetesObjects sko = skoFactory.getOrCreate(info, weblogicServerName);
510510

@@ -523,6 +523,7 @@ public NextAction onFailure(Packet packet, ApiException e, int statusCode,
523523
public NextAction onSuccess(Packet packet, V1Pod result, int statusCode,
524524
Map<String, List<String>> responseHeaders) {
525525
if (result == null) {
526+
info.getExplicitRestartServers().remove(weblogicServerName);
526527
Step create = factory.create().createPodAsync(namespace, pod, new ResponseStep<V1Pod>(next) {
527528
@Override
528529
public NextAction onFailure(Packet packet, ApiException e, int statusCode,
@@ -553,7 +554,7 @@ public NextAction onSuccess(Packet packet, V1Pod result, int statusCode,
553554
Step replace = new CyclePodStep(
554555
ManagedPodStep.this,
555556
podName, namespace, pod, MessageKeys.MANAGED_POD_REPLACED,
556-
weblogicDomainUID, weblogicServerName, sko, next);
557+
weblogicDomainUID, weblogicServerName, info, sko, next);
557558
synchronized (packet) {
558559
@SuppressWarnings("unchecked")
559560
Map<String, StepAndPacket> rolling = (Map<String, StepAndPacket>) packet.get(ProcessingConstants.SERVERS_TO_ROLL);

operator/src/main/java/oracle/kubernetes/operator/logging/MessageKeys.java

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -119,9 +119,6 @@ private MessageKeys() {}
119119
public static final String NULL_TOKEN_REVIEW_STATUS = "WLSKO-0109";
120120
public static final String NULL_USER_INFO = "WLSKO-0110";
121121
public static final String RESOURCE_BUNDLE_NOT_FOUND = "WLSKO-0111";
122-
public static final String RESTART_ADMIN_COMPLETE = "WLSKO-0112";
123-
public static final String RESTART_SERVERS_COMPLETE = "WLSKO-0113";
124-
public static final String ROLLING_CLUSTERS_COMPLETE = "WLSKO-0114";
125122
public static final String RESTART_ADMIN_STARTING = "WLSKO-0115";
126123
public static final String RESTART_SERVERS_STARTING = "WLSKO-0116";
127124
public static final String ROLLING_CLUSTERS_STARTING = "WLSKO-0117";

0 commit comments

Comments
 (0)