Skip to content

Commit 052162c

Browse files
jshum2479 and rjeberhard
authored and committed
backport owls118470 to support reporting error conditions for...
1 parent 0b6757b commit 052162c

File tree

7 files changed

+109
-1
lines changed

7 files changed

+109
-1
lines changed

common/src/main/java/oracle/kubernetes/common/logging/MessageKeys.java

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -233,6 +233,8 @@ public class MessageKeys {
233233
public static final String NO_MATCH_VOLUME_WITH_PVC = "WLSDO-0063";
234234
public static final String NO_VOLUME_WITH_PVC = "WLSDO-0064";
235235
public static final String WALLET_KEY_NOT_FOUND = "WLSDO-0065";
236+
public static final String POD_UNSCHEDULABLE = "WLSDO-0066";
237+
public static final String POD_UNSCHEDULABLE_MESSAGE = "WLSDO-0067";
236238

237239
// domain event messages
238240
public static final String DOMAIN_AVAILABLE_EVENT_PATTERN = "WLSEO-0001";

common/src/main/resources/Operator.properties

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -278,6 +278,8 @@ WLSDO-0064=When ''spec.configuration.initializeDomainOnPV'' is specified to auto
278278
but not the PersistentVolumeClaim (PVC), at least one of the volumes in ''spec.serverPod.volumes'' should contain a PVC.
279279
WLSDO-0065=The OPSS wallet password secret ''{0}'' is specified but the required ''walletPassword'' key is missing \
280280
in the specified secret ''{1}''.
281+
WLSDO-0066=Pod ''{0}'' is Unschedulable, reason: ''{1}''
282+
WLSDO-0067=One or more pods in the domain cannot be scheduled. Please check individual pod status for details.
281283

282284
oneEnvVar=variable
283285
multipleEnvVars=variables

operator/src/main/java/oracle/kubernetes/operator/DomainProcessorImpl.java

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,9 @@
2525
import io.kubernetes.client.openapi.models.V1PersistentVolumeClaim;
2626
import io.kubernetes.client.openapi.models.V1PersistentVolumeClaimStatus;
2727
import io.kubernetes.client.openapi.models.V1Pod;
28+
import io.kubernetes.client.openapi.models.V1PodCondition;
2829
import io.kubernetes.client.openapi.models.V1PodDisruptionBudget;
30+
import io.kubernetes.client.openapi.models.V1PodStatus;
2931
import io.kubernetes.client.openapi.models.V1Service;
3032
import io.kubernetes.client.util.Watch;
3133
import oracle.kubernetes.common.logging.LoggingFilter;
@@ -67,9 +69,12 @@
6769
import oracle.kubernetes.weblogic.domain.model.ServerStatus;
6870
import org.jetbrains.annotations.NotNull;
6971

72+
import static oracle.kubernetes.common.logging.MessageKeys.POD_UNSCHEDULABLE;
7073
import static oracle.kubernetes.common.logging.MessageKeys.PVC_NOT_BOUND_ERROR;
7174
import static oracle.kubernetes.operator.DomainStatusUpdater.createInternalFailureSteps;
7275
import static oracle.kubernetes.operator.DomainStatusUpdater.createIntrospectionFailureSteps;
76+
import static oracle.kubernetes.operator.KubernetesConstants.POD_SCHEDULED;
77+
import static oracle.kubernetes.operator.KubernetesConstants.UNSCHEDULABLE_REASON;
7378
import static oracle.kubernetes.operator.ProcessingConstants.SERVER_HEALTH_MAP;
7479
import static oracle.kubernetes.operator.ProcessingConstants.SERVER_STATE_MAP;
7580
import static oracle.kubernetes.operator.helpers.EventHelper.EventItem.CLUSTER_CHANGED;
@@ -604,6 +609,10 @@ private void processServerPodWatch(V1Pod pod, String watchType) {
604609
LOGGER.info(MessageKeys.POD_EVICTED_NO_RESTART, getPodName(pod), getPodStatusMessage(pod));
605610
}
606611
}
612+
boolean isUnschedulable = PodHelper.hasUnSchedulableCondition(pod);
613+
if (isUnschedulable) {
614+
LOGGER.info(POD_UNSCHEDULABLE, getPodName(pod), getUnSchedulableConditionMessage(pod));
615+
}
607616
break;
608617
case DELETED:
609618
boolean removed = info.deleteServerPodFromEvent(serverName, pod);
@@ -618,6 +627,25 @@ private void processServerPodWatch(V1Pod pod, String watchType) {
618627
}
619628
}
620629

630+
/**
631+
* If a pod is unschedulable, return the condition's message.
632+
* @param pod Kubernetes V1Pod
633+
* @return message for the unschedulable pod condition
634+
*/
635+
public static String getUnSchedulableConditionMessage(V1Pod pod) {
636+
return Optional.ofNullable(pod)
637+
.filter(PodHelper::isPending)
638+
.map(V1Pod::getStatus)
639+
.map(V1PodStatus::getConditions)
640+
.orElse(Collections.emptyList())
641+
.stream()
642+
.filter(condition -> POD_SCHEDULED.equals(condition.getType())
643+
&& UNSCHEDULABLE_REASON.equals(condition.getReason()))
644+
.map(V1PodCondition::getMessage)
645+
.findFirst().orElse(null);
646+
}
647+
648+
621649
private String getPodLabel(V1Pod pod, String labelName) {
622650
return Optional.ofNullable(pod)
623651
.map(V1Pod::getMetadata)

operator/src/main/java/oracle/kubernetes/operator/DomainStatusUpdater.java

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,7 @@
7676
import static oracle.kubernetes.common.logging.MessageKeys.PODS_FAILED;
7777
import static oracle.kubernetes.common.logging.MessageKeys.PODS_NOT_READY;
7878
import static oracle.kubernetes.common.logging.MessageKeys.PODS_NOT_RUNNING;
79+
import static oracle.kubernetes.common.logging.MessageKeys.POD_UNSCHEDULABLE_MESSAGE;
7980
import static oracle.kubernetes.operator.ClusterResourceStatusUpdater.createClusterResourceStatusUpdaterStep;
8081
import static oracle.kubernetes.operator.KubernetesConstants.HTTP_NOT_FOUND;
8182
import static oracle.kubernetes.operator.KubernetesConstants.MINIMUM_CLUSTER_COUNT;
@@ -751,6 +752,11 @@ private void setStatusConditions(DomainStatus status) {
751752
if (isHasFailedPod()) {
752753
addFailure(status, new DomainCondition(FAILED).withReason(SERVER_POD)
753754
.withFailureInfo(getDomain().getSpec()).withMessage(getPodFailedMessage()));
755+
} else if (isPodUnSchedulable()) {
756+
if (!alreadyReportedPodUnSchedulableCondition(status)) {
757+
addFailure(status, new DomainCondition(FAILED).withReason(SERVER_POD)
758+
.withFailureInfo(getDomain().getSpec()).withMessage(getPodUnSchedulableMessage()));
759+
}
754760
} else if (hasPodNotRunningInTime()) {
755761
addFailure(status, new DomainCondition(FAILED).withReason(SERVER_POD)
756762
.withFailureInfo(getDomain().getSpec()).withMessage(getPodNotRunningMessage()));
@@ -777,6 +783,10 @@ private String getPodNotReadyMessage() {
777783
return LOGGER.formatMessage(PODS_NOT_READY);
778784
}
779785

786+
private String getPodUnSchedulableMessage() {
787+
return LOGGER.formatMessage(POD_UNSCHEDULABLE_MESSAGE);
788+
}
789+
780790
private String getPodFailedMessage() {
781791
return LOGGER.formatMessage(PODS_FAILED);
782792
}
@@ -1258,6 +1268,11 @@ private boolean stillHasPodPendingRestart(DomainStatus status) {
12581268
.anyMatch(m -> m.containsKey(LabelConstants.MII_UPDATED_RESTART_REQUIRED_LABEL));
12591269
}
12601270

1271+
private boolean alreadyReportedPodUnSchedulableCondition(DomainStatus status) {
1272+
return status.getConditions().stream().anyMatch(
1273+
condition -> getPodUnSchedulableMessage().equals(condition.getMessage()));
1274+
}
1275+
12611276
private V1Pod getServerPod(ServerStatus serverStatus) {
12621277
return getInfo().getServerPod(serverStatus.getServerName());
12631278
}
@@ -1325,6 +1340,14 @@ private boolean hasPodNotRunningInTime() {
13251340
return getInfo().getServerPods().anyMatch(this::isNotRunningInTime);
13261341
}
13271342

1343+
private boolean isPodUnSchedulable() {
1344+
return getInfo().getServerPods().anyMatch(this::isPodUnSchedulable);
1345+
}
1346+
1347+
private boolean isPodUnSchedulable(V1Pod pod) {
1348+
return PodHelper.isPending(pod) && PodHelper.hasUnSchedulableCondition(pod);
1349+
}
1350+
13281351
private boolean isNotRunningInTime(V1Pod pod) {
13291352
return PodHelper.isPending(pod) && hasBeenPendingExceededWaitTime(pod);
13301353
}

operator/src/main/java/oracle/kubernetes/operator/KubernetesConstants.java

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,8 @@ public interface KubernetesConstants {
5656
String NAMESPACE = "Namespace";
5757
String POD = "Pod";
5858
String EVICTED_REASON = "Evicted";
59-
59+
String UNSCHEDULABLE_REASON = "Unschedulable";
60+
String POD_SCHEDULED = "PodScheduled";
6061
int DEFAULT_EXPORTER_SIDECAR_PORT = 8080;
6162

6263
//---------- HTTP statuses returned from Kubernetes ----------

operator/src/main/java/oracle/kubernetes/operator/helpers/PodHelper.java

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,8 @@
4949
import oracle.kubernetes.weblogic.domain.model.Shutdown;
5050

5151
import static oracle.kubernetes.operator.KubernetesConstants.EVICTED_REASON;
52+
import static oracle.kubernetes.operator.KubernetesConstants.POD_SCHEDULED;
53+
import static oracle.kubernetes.operator.KubernetesConstants.UNSCHEDULABLE_REASON;
5254
import static oracle.kubernetes.operator.LabelConstants.CLUSTERNAME_LABEL;
5355
import static oracle.kubernetes.operator.LabelConstants.SERVERNAME_LABEL;
5456
import static oracle.kubernetes.operator.ProcessingConstants.SERVERS_TO_ROLL;
@@ -214,6 +216,11 @@ private static boolean isReadyCondition(V1PodCondition condition) {
214216
return "Ready".equals(condition.getType()) && "True".equals(condition.getStatus());
215217
}
216218

219+
private static boolean isUnSchedulableTheReason(V1PodCondition condition) {
220+
return POD_SCHEDULED.equals(condition.getType()) && "False".equals(condition.getStatus())
221+
&& UNSCHEDULABLE_REASON.equals(condition.getReason());
222+
}
223+
217224
private static boolean isReadyNotTrueCondition(V1PodCondition condition) {
218225
return "Ready".equals(condition.getType()) && !"True".equals(condition.getStatus());
219226
}
@@ -299,6 +306,21 @@ public static boolean shouldRestartEvictedPod(V1Pod pod) {
299306
return isEvicted(pod) && TuningParameters.getInstance().isRestartEvictedPods();
300307
}
301308

309+
/**
310+
* Return true if the pod has unschedulable condition.
311+
* @param pod Kubernetes Pod
312+
* @return true if the pod is unschedulable
313+
*/
314+
public static boolean hasUnSchedulableCondition(V1Pod pod) {
315+
return Optional.ofNullable(pod)
316+
.filter(PodHelper::isPending)
317+
.map(V1Pod::getStatus)
318+
.map(V1PodStatus::getConditions)
319+
.orElse(Collections.emptyList())
320+
.stream()
321+
.anyMatch(PodHelper::isUnSchedulableTheReason);
322+
}
323+
302324
/**
303325
* Returns the domain UID associated with the specified pod.
304326
* @param pod the pod

operator/src/test/java/oracle/kubernetes/operator/DomainStatusUpdateTestBase.java

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -239,6 +239,22 @@ private V1Pod getPod(String serverName) {
239239
return info.getServerPod(serverName);
240240
}
241241

242+
private void setPodUnschedulable(String serverName) {
243+
V1Pod pod = getPod(serverName);
244+
V1PodStatus status = pod.getStatus();
245+
if (status == null) {
246+
status = new V1PodStatus();
247+
}
248+
pod.setStatus(status
249+
.startTime(SystemClock.now())
250+
.phase("Pending")
251+
.addConditionsItem(new V1PodCondition().type("PodScheduled").status("False").reason("Unschedulable")
252+
.message("0/1 nodes are available: 1 node(s) didn't match pod topology"
253+
+ " spread constraints (missing required label). preemption: 0/1 nodes are available: "
254+
+ "1 Preemption is not helpful for scheduling.."))
255+
);
256+
}
257+
242258
@Test
243259
void statusStep_usesServerFromWlsConfig() {
244260
defineScenario()
@@ -1033,6 +1049,20 @@ void whenPodPendingWithinTimeLimit_removePreviousServerPodFailures() {
10331049
assertThat(getRecordedDomain(), not(hasCondition(FAILED)));
10341050
}
10351051

1052+
@Test
1053+
void whenPodIsUnschedulable_reportServerPodFailure() {
1054+
defineScenario()
1055+
.withServerState("server1", new V1ContainerStateWaiting().reason(null))
1056+
.build();
1057+
setPodUnschedulable("server1");
1058+
1059+
SystemClockTestSupport.increment(21);
1060+
updateDomainStatus();
1061+
assertThat(getRecordedDomain(), hasCondition(FAILED).withReason(SERVER_POD)
1062+
.withMessageContaining("One or more pods in the domain cannot be scheduled."));
1063+
}
1064+
1065+
10361066
// todo remove server pod failures when OK
10371067

10381068
@Test

0 commit comments

Comments
 (0)