Skip to content

Commit 4b4ca04

Browse files
authored
Backport OWLS-96896 Restart evicted pods to release/3.4 (#2981)
* backport owls-96896 to release/3.4 * updated createCyclePodEventStep to show 'Pod Evicted' in event
1 parent 6b7f646 commit 4b4ca04

File tree

12 files changed

+264
-11
lines changed

12 files changed

+264
-11
lines changed

operator/src/main/java/oracle/kubernetes/operator/DomainProcessorImpl.java

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -87,6 +87,8 @@
8787
import static oracle.kubernetes.operator.helpers.EventHelper.EventItem.DOMAIN_PROCESSING_ABORTED;
8888
import static oracle.kubernetes.operator.helpers.EventHelper.EventItem.DOMAIN_PROCESSING_RETRYING;
8989
import static oracle.kubernetes.operator.helpers.LegalNames.toJobIntrospectorName;
90+
import static oracle.kubernetes.operator.helpers.PodHelper.getPodName;
91+
import static oracle.kubernetes.operator.helpers.PodHelper.getPodStatusMessage;
9092

9193
public class DomainProcessorImpl implements DomainProcessor {
9294

@@ -415,7 +417,15 @@ private void processServerPodWatch(V1Pod pod, String watchType) {
415417
info.setServerPodBeingDeleted(serverName, Boolean.FALSE);
416418
// fall through
417419
case "MODIFIED":
418-
info.setServerPodFromEvent(serverName, pod);
420+
boolean podPreviouslyEvicted = info.setServerPodFromEvent(serverName, pod, PodHelper::isEvicted);
421+
if (PodHelper.isEvicted(pod) && !podPreviouslyEvicted) {
422+
if (PodHelper.shouldRestartEvictedPod(pod)) {
423+
LOGGER.info(MessageKeys.POD_EVICTED, getPodName(pod), getPodStatusMessage(pod));
424+
createMakeRightOperation(info).interrupt().withExplicitRecheck().execute();
425+
} else {
426+
LOGGER.info(MessageKeys.POD_EVICTED_NO_RESTART, getPodName(pod), getPodStatusMessage(pod));
427+
}
428+
}
419429
break;
420430
case "DELETED":
421431
boolean removed = info.deleteServerPodFromEvent(serverName, pod);

operator/src/main/java/oracle/kubernetes/operator/KubernetesConstants.java

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@ public interface KubernetesConstants {
4343
String OPERATOR_POD_UID_ENV = "OPERATOR_POD_UID";
4444
String NAMESPACE = "Namespace";
4545
String POD = "Pod";
46+
String EVICTED_REASON = "Evicted";
4647

4748
int DEFAULT_EXPORTER_SIDECAR_PORT = 8080;
4849

operator/src/main/java/oracle/kubernetes/operator/TuningParameters.java

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -256,12 +256,13 @@ class PodTuning {
256256
public final int livenessProbeSuccessThreshold;
257257
public final int livenessProbeFailureThreshold;
258258
public final long introspectorJobActiveDeadlineSeconds;
259+
public final boolean restartEvictedPods;
259260

260261
/**
261262
* create pod tuning.
262263
* @param readinessProbeInitialDelaySeconds readiness probe initial delay
263264
* @param readinessProbeTimeoutSeconds readiness probe timeout
264-
* @param readinessProbePeriodSeconds rediness probe period
265+
* @param readinessProbePeriodSeconds readiness probe period
265266
* @param readinessProbeSuccessThreshold readiness probe success threshold
266267
* @param readinessProbeFailureThreshold readiness probe failure threshold
267268
* @param livenessProbeInitialDelaySeconds liveness probe initial delay
@@ -270,6 +271,7 @@ class PodTuning {
270271
* @param livenessProbeSuccessThreshold liveness probe success threshold
271272
* @param livenessProbeFailureThreshold liveness probe failure threshold
272273
* @param introspectorJobActiveDeadlineSeconds introspector job active deadline
274+
* @param restartEvictedPods whether evicted server pods should be restarted
273275
*/
274276
public PodTuning(
275277
int readinessProbeInitialDelaySeconds,
@@ -282,7 +284,8 @@ public PodTuning(
282284
int livenessProbePeriodSeconds,
283285
int livenessProbeSuccessThreshold,
284286
int livenessProbeFailureThreshold,
285-
long introspectorJobActiveDeadlineSeconds) {
287+
long introspectorJobActiveDeadlineSeconds,
288+
boolean restartEvictedPods) {
286289
this.readinessProbeInitialDelaySeconds = readinessProbeInitialDelaySeconds;
287290
this.readinessProbeTimeoutSeconds = readinessProbeTimeoutSeconds;
288291
this.readinessProbePeriodSeconds = readinessProbePeriodSeconds;
@@ -294,6 +297,7 @@ public PodTuning(
294297
this.livenessProbeSuccessThreshold = livenessProbeSuccessThreshold;
295298
this.livenessProbeFailureThreshold = livenessProbeFailureThreshold;
296299
this.introspectorJobActiveDeadlineSeconds = introspectorJobActiveDeadlineSeconds;
300+
this.restartEvictedPods = restartEvictedPods;
297301
}
298302

299303
@Override
@@ -309,6 +313,7 @@ public String toString() {
309313
.append("livenessProbePeriodSeconds", livenessProbePeriodSeconds)
310314
.append("livenessProbeSuccessThreshold", livenessProbeSuccessThreshold)
311315
.append("livenessProbeFailureThreshold", livenessProbeFailureThreshold)
316+
.append("restartEvictedPods", restartEvictedPods)
312317
.toString();
313318
}
314319

@@ -325,6 +330,7 @@ public int hashCode() {
325330
.append(livenessProbePeriodSeconds)
326331
.append(livenessProbeSuccessThreshold)
327332
.append(livenessProbeFailureThreshold)
333+
.append(restartEvictedPods)
328334
.toHashCode();
329335
}
330336

@@ -348,6 +354,7 @@ public boolean equals(Object o) {
348354
.append(livenessProbePeriodSeconds, pt.livenessProbePeriodSeconds)
349355
.append(livenessProbeSuccessThreshold, pt.livenessProbeSuccessThreshold)
350356
.append(livenessProbeFailureThreshold, pt.livenessProbeFailureThreshold)
357+
.append(restartEvictedPods, pt.restartEvictedPods)
351358
.isEquals();
352359
}
353360
}

operator/src/main/java/oracle/kubernetes/operator/TuningParametersImpl.java

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -93,7 +93,8 @@ private void update() {
9393
(int) readTuningParameter("livenessProbePeriodSeconds", 45),
9494
(int) readTuningParameter("livenessProbeSuccessThreshold", 1),
9595
(int) readTuningParameter("livenessProbeFailureThreshold", 1),
96-
readTuningParameter("introspectorJobActiveDeadlineSeconds", 120));
96+
readTuningParameter("introspectorJobActiveDeadlineSeconds", 120),
97+
readBooleanTuningParameter("restartEvictedPods", true));
9798

9899
FeatureGates featureGates =
99100
new FeatureGates(generateFeatureGates(get("featureGates")));

operator/src/main/java/oracle/kubernetes/operator/helpers/ConfigMapConsumer.java

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,16 @@ public long readTuningParameter(String parameter, long defaultValue) {
8282
return defaultValue;
8383
}
8484

85+
/**
86+
* read boolean tuning parameter.
87+
* @param parameter parameter
88+
* @param defaultValue default value
89+
* @return parameter value
90+
*/
91+
public boolean readBooleanTuningParameter(String parameter, boolean defaultValue) {
92+
return Optional.ofNullable(get(parameter)).map(Boolean::parseBoolean).orElse(defaultValue);
93+
}
94+
8595
@Override
8696
public int size() {
8797
return Optional.ofNullable(mountPointDir.list()).map(list -> list.length).orElse(0);

operator/src/main/java/oracle/kubernetes/operator/helpers/DomainPresenceInfo.java

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -260,6 +260,20 @@ public void setServerPodFromEvent(String serverName, V1Pod event) {
260260
getSko(serverName).getPod().accumulateAndGet(event, this::getNewerPod);
261261
}
262262

263+
/**
264+
* Applies an add or modify event for a server pod. If the current pod is newer than the one
265+
* associated with the event, ignores the event.
266+
*
267+
* @param serverName the name of the server associated with the event
268+
* @param event the pod associated with the event
269+
* @param podPredicate predicate to be applied to the original pod
270+
* @return boolean result from applying the original pod to the podFunction provided
271+
*/
272+
public boolean setServerPodFromEvent(String serverName, V1Pod event, @Nonnull Predicate<V1Pod> podPredicate) {
273+
updateStatus(serverName, event);
274+
return podPredicate.test(getSko(serverName).getPod().getAndAccumulate(event, this::getNewerPod));
275+
}
276+
263277
private void updateStatus(String serverName, V1Pod event) {
264278
getSko(serverName)
265279
.getLastKnownStatus()

operator/src/main/java/oracle/kubernetes/operator/helpers/PodHelper.java

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@
3939
import oracle.kubernetes.weblogic.domain.model.ServerSpec;
4040
import oracle.kubernetes.weblogic.domain.model.Shutdown;
4141

42+
import static oracle.kubernetes.operator.KubernetesConstants.EVICTED_REASON;
4243
import static oracle.kubernetes.operator.LabelConstants.CLUSTERNAME_LABEL;
4344
import static oracle.kubernetes.operator.LabelConstants.SERVERNAME_LABEL;
4445
import static oracle.kubernetes.operator.ProcessingConstants.SERVERS_TO_ROLL;
@@ -192,6 +193,38 @@ public static boolean isFailed(V1Pod pod) {
192193
return false;
193194
}
194195

196+
/**
197+
* Check if pod is in failed state with "Evicted" as the reason.
198+
* @param pod pod
199+
* @return true, if pod is in failed state with "Evicted" as the reason.
200+
*/
201+
public static boolean isEvicted(V1Pod pod) {
202+
return Optional.ofNullable(pod)
203+
.map(V1Pod::getStatus)
204+
.map(PodHelper::isEvicted)
205+
.orElse(false);
206+
}
207+
208+
/**
209+
* Chcek if the pod status shows that the pod is evicted.
210+
* @param status Pod status to be checked
211+
* @return True if the pod status shows that the pod is evicted, false otherwise
212+
*/
213+
public static boolean isEvicted(@Nonnull V1PodStatus status) {
214+
return "Failed".equals(status.getPhase())
215+
&& EVICTED_REASON.equals(status.getReason());
216+
}
217+
218+
/**
219+
* Return true if pod was evicted and operator is configured to restart evicted pods.
220+
* @param pod pod
221+
* @return true, if pod was evicted and operator is configured to restart evicted pods
222+
*
223+
*/
224+
public static boolean shouldRestartEvictedPod(V1Pod pod) {
225+
return isEvicted(pod) && TuningParameters.getInstance().getPodTuning().restartEvictedPods;
226+
}
227+
195228
/**
196229
* get pod domain UID.
197230
* @param pod pod
@@ -216,6 +249,25 @@ public static String getPodServerName(V1Pod pod) {
216249
return null;
217250
}
218251

252+
/**
253+
* Returns the Kubernetes name of the specified pod.
254+
* @param pod the pod
255+
*/
256+
public static String getPodName(@Nonnull V1Pod pod) {
257+
return Optional.of(pod).map(V1Pod::getMetadata).map(V1ObjectMeta::getName).orElse("");
258+
}
259+
260+
/**
261+
* Get the message from the pod's status.
262+
* @param pod pod
263+
* @return Message string from the pod's status
264+
*/
265+
public static String getPodStatusMessage(V1Pod pod) {
266+
return Optional.ofNullable(pod)
267+
.map(V1Pod::getStatus)
268+
.map(V1PodStatus::getMessage)
269+
.orElse(null);
270+
}
219271

220272
/**
221273
* Factory for {@link Step} that creates admin server pod.

operator/src/main/java/oracle/kubernetes/operator/helpers/PodStepContext.java

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -96,6 +96,7 @@
9696
import static oracle.kubernetes.operator.helpers.EventHelper.EventItem.POD_CYCLE_STARTING;
9797
import static oracle.kubernetes.operator.helpers.FluentdHelper.addFluentdContainer;
9898
import static oracle.kubernetes.operator.helpers.LegalNames.LEGAL_CONTAINER_PORT_NAME_MAX_LENGTH;
99+
import static oracle.kubernetes.operator.logging.MessageKeys.CYCLING_POD_EVICTED;
99100
import static oracle.kubernetes.utils.OperatorUtils.isNullOrEmpty;
100101
import static oracle.kubernetes.weblogic.domain.model.Model.DEFAULT_WDT_INSTALL_HOME;
101102
import static oracle.kubernetes.weblogic.domain.model.Model.DEFAULT_WDT_MODEL_HOME;
@@ -570,8 +571,12 @@ private boolean haveReasonsToRoll(String domainIncompatibility) {
570571

571572
abstract String getPodReplacedMessageKey();
572573

574+
Step cycleEvictedPodStep(V1Pod pod, Step next) {
575+
return new CyclePodStep(pod, next, LOGGER.formatMessage(CYCLING_POD_EVICTED));
576+
}
577+
573578
Step createCyclePodStep(V1Pod pod, Step next) {
574-
return new CyclePodStep(pod, next);
579+
return new CyclePodStep(pod, next, null);
575580
}
576581

577582
private boolean mustPatchPod(V1Pod currentPod) {
@@ -1181,10 +1186,12 @@ private Step getConflictStep() {
11811186

11821187
public class CyclePodStep extends BaseStep {
11831188
private final V1Pod pod;
1189+
private final String message;
11841190

1185-
CyclePodStep(V1Pod pod, Step next) {
1191+
CyclePodStep(V1Pod pod, Step next, String message) {
11861192
super(next);
11871193
this.pod = pod;
1194+
this.message = message;
11881195
}
11891196

11901197
@Override
@@ -1195,7 +1202,7 @@ public NextAction apply(Packet packet) {
11951202
}
11961203

11971204
private Step createCyclePodEventStep(Step next) {
1198-
String reason = getReasonToRecycle(pod, CompatibilityScope.POD);
1205+
String reason = Optional.ofNullable(message).orElse(getReasonToRecycle(pod, CompatibilityScope.POD));
11991206
LOGGER.info(
12001207
MessageKeys.CYCLING_POD,
12011208
Objects.requireNonNull(pod.getMetadata()).getName(),
@@ -1224,6 +1231,8 @@ public NextAction apply(Packet packet) {
12241231
return doNext(createNewPod(getNext()), packet);
12251232
} else if (!canUseCurrentPod(currentPod)) {
12261233
return doNext(replaceCurrentPod(currentPod, getNext()), packet);
1234+
} else if (PodHelper.shouldRestartEvictedPod(currentPod)) {
1235+
return doNext(cycleEvictedPodStep(currentPod, getNext()), packet);
12271236
} else if (mustPatchPod(currentPod)) {
12281237
return doNext(patchCurrentPod(currentPod, getNext()), packet);
12291238
} else {

operator/src/main/java/oracle/kubernetes/operator/logging/MessageKeys.java

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -148,6 +148,8 @@ public class MessageKeys {
148148
public static final String MISSING_ELASTIC_SEARCH_SECRET = "WLSKO-0223";
149149
public static final String FLUENTD_CONFIGMAP_CREATED = "WLSKO-0224";
150150
public static final String FLUENTD_CONFIGMAP_REPLACED = "WLSKO-0225";
151+
public static final String POD_EVICTED = "WLSKO-0226";
152+
public static final String POD_EVICTED_NO_RESTART = "WLSKO-0227";
151153

152154
// domain status messages
153155
public static final String DUPLICATE_SERVER_NAME_FOUND = "WLSDO-0001";
@@ -182,6 +184,8 @@ public class MessageKeys {
182184
public static final String RESERVED_CONTAINER_NAME = "WLSDO-0030";
183185
public static final String ILLEGAL_CONTAINER_PORT_NAME_LENGTH = "WLSDO-0031";
184186
public static final String ILLEGAL_NETWORK_CHANNEL_NAME_LENGTH = "WLSDO-0032";
187+
public static final String CYCLING_POD_EVICTED = "WLSDO-0042";
188+
public static final String CYCLING_POD_SPEC_CHANGED = "WLSDO-0043";
185189

186190
private MessageKeys() {
187191
}

operator/src/main/resources/Operator.properties

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -151,6 +151,8 @@ WLSKO-0223=When fluentdSpecification is specified in the domain spec, a secret c
151151
must be specified in {0}
152152
WLSKO-0224=Fluentd configmap created.
152152
WLSKO-0225=Fluentd configmap replaced.
154+
WLSKO-0226=Pod {0} was evicted due to {1}; validating domain
155+
WLSKO-0227=Pod {0} was evicted due to {1} but the operator is configured not to restart it.
154156

155157
# Domain status messages
156158

@@ -196,6 +198,8 @@ WLSDO-0030=The container name ''{0}'' specified under ''{1}'' is reserved for us
196198
WLSDO-0031=Container port name ''{2}'' for domain with domainUID ''{0}'' and container name ''{1}'' exceeds maximum allowed length ''{3}''.
197199
WLSDO-0032=Network channel name ''{2}'' for domain with domainUID ''{0}'' and server with \
198200
name ''{1}'' exceeds maximum allowed length ''{3}''. Please specify a shorter channel name.
201+
WLSDO-0042=Pod was evicted
202+
WLSDO-0043=Pod spec has changed
199203

200204
oneEnvVar=variable
201205
multipleEnvVars=variables

0 commit comments

Comments
 (0)