Skip to content

Commit 4e57b3a

Browse files
jshum2479 authored and rjeberhard committed
Owls 118470 topology spread constraints error report
1 parent 0470b61 commit 4e57b3a

File tree

7 files changed

+109
-1
lines changed

7 files changed

+109
-1
lines changed

common/src/main/java/oracle/kubernetes/common/logging/MessageKeys.java

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -222,6 +222,8 @@ public class MessageKeys {
222222
public static final String NO_MATCH_VOLUME_WITH_PVC = "WLSDO-0063";
223223
public static final String NO_VOLUME_WITH_PVC = "WLSDO-0064";
224224
public static final String WALLET_KEY_NOT_FOUND = "WLSDO-0065";
225+
public static final String POD_UNSCHEDULABLE = "WLSDO-0066";
226+
public static final String POD_UNSCHEDULABLE_MESSAGE = "WLSDO-0067";
225227

226228
// domain event messages
227229
public static final String DOMAIN_AVAILABLE_EVENT_PATTERN = "WLSEO-0001";

common/src/main/resources/Operator.properties

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -262,6 +262,8 @@ WLSDO-0064=When ''spec.configuration.initializeDomainOnPV'' is specified to auto
262262
but not the PersistentVolumeClaim (PVC), at least one of the volumes in ''spec.serverPod.volumes'' should contain a PVC.
263263
WLSDO-0065=The OPSS wallet password secret ''{0}'' is specified but the required ''walletPassword'' key is missing \
264264
in the specified secret ''{1}''.
265+
WLSDO-0066=Pod ''{0}'' is Unschedulable, reason: ''{1}''
266+
WLSDO-0067=One or more pods in the domain cannot be scheduled. Please check individual pod status for details.
265267

266268
oneEnvVar=variable
267269
multipleEnvVars=variables

operator/src/main/java/oracle/kubernetes/operator/DomainProcessorImpl.java

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,9 @@
2424
import io.kubernetes.client.openapi.models.V1PersistentVolumeClaim;
2525
import io.kubernetes.client.openapi.models.V1PersistentVolumeClaimStatus;
2626
import io.kubernetes.client.openapi.models.V1Pod;
27+
import io.kubernetes.client.openapi.models.V1PodCondition;
2728
import io.kubernetes.client.openapi.models.V1PodDisruptionBudget;
29+
import io.kubernetes.client.openapi.models.V1PodStatus;
2830
import io.kubernetes.client.openapi.models.V1Service;
2931
import io.kubernetes.client.util.Watch;
3032
import oracle.kubernetes.common.logging.LoggingFilter;
@@ -64,9 +66,12 @@
6466
import oracle.kubernetes.weblogic.domain.model.ServerStatus;
6567
import org.jetbrains.annotations.NotNull;
6668

69+
import static oracle.kubernetes.common.logging.MessageKeys.POD_UNSCHEDULABLE;
6770
import static oracle.kubernetes.common.logging.MessageKeys.PVC_NOT_BOUND_ERROR;
6871
import static oracle.kubernetes.operator.DomainStatusUpdater.createInternalFailureSteps;
6972
import static oracle.kubernetes.operator.DomainStatusUpdater.createIntrospectionFailureSteps;
73+
import static oracle.kubernetes.operator.KubernetesConstants.POD_SCHEDULED;
74+
import static oracle.kubernetes.operator.KubernetesConstants.UNSCHEDULABLE_REASON;
7075
import static oracle.kubernetes.operator.ProcessingConstants.SERVER_HEALTH_MAP;
7176
import static oracle.kubernetes.operator.ProcessingConstants.SERVER_STATE_MAP;
7277
import static oracle.kubernetes.operator.helpers.EventHelper.EventItem.CLUSTER_CHANGED;
@@ -582,6 +587,10 @@ private void processServerPodWatch(V1Pod pod, String watchType) {
582587
if ((isEvicted || isReady != isLabedlForShutdown || PodHelper.isFailed(pod)) && !PodHelper.isDeleting(pod)) {
583588
createMakeRightOperation(info).interrupt().withExplicitRecheck().execute();
584589
}
590+
boolean isUnschedulable = PodHelper.hasUnSchedulableCondition(pod);
591+
if (isUnschedulable) {
592+
LOGGER.info(POD_UNSCHEDULABLE, getPodName(pod), getUnSchedulableConditionMessage(pod));
593+
}
585594
break;
586595
case DELETED:
587596
boolean removed = info.deleteServerPodFromEvent(serverName, pod);
@@ -596,6 +605,25 @@ private void processServerPodWatch(V1Pod pod, String watchType) {
596605
}
597606
}
598607

608+
/**
609+
* If a pod is unschedulable, return the condition's message.
610+
* @param pod Kubernetes V1Pod
611+
* @return message for the unschedulable pod condition
612+
*/
613+
public static String getUnSchedulableConditionMessage(V1Pod pod) {
614+
return Optional.ofNullable(pod)
615+
.filter(PodHelper::isPending)
616+
.map(V1Pod::getStatus)
617+
.map(V1PodStatus::getConditions)
618+
.orElse(Collections.emptyList())
619+
.stream()
620+
.filter(condition -> POD_SCHEDULED.equals(condition.getType())
621+
&& UNSCHEDULABLE_REASON.equals(condition.getReason()))
622+
.map(V1PodCondition::getMessage)
623+
.findFirst().orElse(null);
624+
}
625+
626+
599627
private String getPodLabel(V1Pod pod, String labelName) {
600628
return Optional.ofNullable(pod)
601629
.map(V1Pod::getMetadata)

operator/src/main/java/oracle/kubernetes/operator/DomainStatusUpdater.java

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,7 @@
7474
import static oracle.kubernetes.common.logging.MessageKeys.PODS_FAILED;
7575
import static oracle.kubernetes.common.logging.MessageKeys.PODS_NOT_READY;
7676
import static oracle.kubernetes.common.logging.MessageKeys.PODS_NOT_RUNNING;
77+
import static oracle.kubernetes.common.logging.MessageKeys.POD_UNSCHEDULABLE_MESSAGE;
7778
import static oracle.kubernetes.operator.ClusterResourceStatusUpdater.createClusterResourceStatusUpdaterStep;
7879
import static oracle.kubernetes.operator.KubernetesConstants.HTTP_NOT_FOUND;
7980
import static oracle.kubernetes.operator.KubernetesConstants.MINIMUM_CLUSTER_COUNT;
@@ -744,6 +745,11 @@ private void setStatusConditions(DomainStatus status) {
744745
if (isHasFailedPod()) {
745746
addFailure(status, new DomainCondition(FAILED).withReason(SERVER_POD)
746747
.withFailureInfo(getDomain().getSpec()).withMessage(getPodFailedMessage()));
748+
} else if (isPodUnSchedulable()) {
749+
if (!alreadyReportedPodUnSchedulableCondition(status)) {
750+
addFailure(status, new DomainCondition(FAILED).withReason(SERVER_POD)
751+
.withFailureInfo(getDomain().getSpec()).withMessage(getPodUnSchedulableMessage()));
752+
}
747753
} else if (hasPodNotRunningInTime()) {
748754
addFailure(status, new DomainCondition(FAILED).withReason(SERVER_POD)
749755
.withFailureInfo(getDomain().getSpec()).withMessage(getPodNotRunningMessage()));
@@ -770,6 +776,10 @@ private String getPodNotReadyMessage() {
770776
return LOGGER.formatMessage(PODS_NOT_READY);
771777
}
772778

779+
private String getPodUnSchedulableMessage() {
780+
return LOGGER.formatMessage(POD_UNSCHEDULABLE_MESSAGE);
781+
}
782+
773783
private String getPodFailedMessage() {
774784
return LOGGER.formatMessage(PODS_FAILED);
775785
}
@@ -1251,6 +1261,11 @@ private boolean stillHasPodPendingRestart(DomainStatus status) {
12511261
.anyMatch(m -> m.containsKey(LabelConstants.MII_UPDATED_RESTART_REQUIRED_LABEL));
12521262
}
12531263

1264+
private boolean alreadyReportedPodUnSchedulableCondition(DomainStatus status) {
1265+
return status.getConditions().stream().anyMatch(
1266+
condition -> getPodUnSchedulableMessage().equals(condition.getMessage()));
1267+
}
1268+
12541269
private V1Pod getServerPod(ServerStatus serverStatus) {
12551270
return getInfo().getServerPod(serverStatus.getServerName());
12561271
}
@@ -1318,6 +1333,14 @@ private boolean hasPodNotRunningInTime() {
13181333
return getInfo().getServerPodsNotBeingDeleted().anyMatch(this::isNotRunningInTime);
13191334
}
13201335

1336+
private boolean isPodUnSchedulable() {
1337+
return getInfo().getServerPodsNotBeingDeleted().anyMatch(this::isPodUnSchedulable);
1338+
}
1339+
1340+
private boolean isPodUnSchedulable(V1Pod pod) {
1341+
return PodHelper.isPending(pod) && PodHelper.hasUnSchedulableCondition(pod);
1342+
}
1343+
13211344
private boolean isNotRunningInTime(V1Pod pod) {
13221345
return PodHelper.isPending(pod) && hasBeenPendingExceededWaitTime(pod);
13231346
}

operator/src/main/java/oracle/kubernetes/operator/KubernetesConstants.java

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,8 @@ public interface KubernetesConstants {
5656
String NAMESPACE = "Namespace";
5757
String POD = "Pod";
5858
String EVICTED_REASON = "Evicted";
59-
59+
String UNSCHEDULABLE_REASON = "Unschedulable";
60+
String POD_SCHEDULED = "PodScheduled";
6061
int DEFAULT_EXPORTER_SIDECAR_PORT = 8080;
6162

6263
//---------- HTTP statuses returned from Kubernetes ----------

operator/src/main/java/oracle/kubernetes/operator/helpers/PodHelper.java

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,8 @@
5151

5252
import static oracle.kubernetes.operator.KubernetesConstants.EVICTED_REASON;
5353
import static oracle.kubernetes.operator.KubernetesConstants.HTTP_NOT_FOUND;
54+
import static oracle.kubernetes.operator.KubernetesConstants.POD_SCHEDULED;
55+
import static oracle.kubernetes.operator.KubernetesConstants.UNSCHEDULABLE_REASON;
5456
import static oracle.kubernetes.operator.LabelConstants.CLUSTERNAME_LABEL;
5557
import static oracle.kubernetes.operator.LabelConstants.SERVERNAME_LABEL;
5658
import static oracle.kubernetes.operator.ProcessingConstants.SERVERS_TO_ROLL;
@@ -214,6 +216,11 @@ private static boolean isReadyCondition(V1PodCondition condition) {
214216
return "Ready".equals(condition.getType()) && "True".equals(condition.getStatus());
215217
}
216218

219+
private static boolean isUnSchedulableTheReason(V1PodCondition condition) {
220+
return POD_SCHEDULED.equals(condition.getType()) && "False".equals(condition.getStatus())
221+
&& UNSCHEDULABLE_REASON.equals(condition.getReason());
222+
}
223+
217224
private static boolean isReadyNotTrueCondition(V1PodCondition condition) {
218225
return "Ready".equals(condition.getType()) && !"True".equals(condition.getStatus());
219226
}
@@ -299,6 +306,21 @@ public static boolean shouldRestartEvictedPod(V1Pod pod) {
299306
return isEvicted(pod) && TuningParameters.getInstance().isRestartEvictedPods();
300307
}
301308

309+
/**
310+
* Return true if the pod has unschedulable condition.
311+
* @param pod Kubernetes Pod
312+
* @return true if the pod is unschedulable
313+
*/
314+
public static boolean hasUnSchedulableCondition(V1Pod pod) {
315+
return Optional.ofNullable(pod)
316+
.filter(PodHelper::isPending)
317+
.map(V1Pod::getStatus)
318+
.map(V1PodStatus::getConditions)
319+
.orElse(Collections.emptyList())
320+
.stream()
321+
.anyMatch(PodHelper::isUnSchedulableTheReason);
322+
}
323+
302324
/**
303325
* Returns the domain UID associated with the specified pod.
304326
* @param pod the pod

operator/src/test/java/oracle/kubernetes/operator/DomainStatusUpdateTestBase.java

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -238,6 +238,22 @@ private V1Pod getPod(String serverName) {
238238
return info.getServerPod(serverName);
239239
}
240240

241+
private void setPodUnschedulable(String serverName) {
242+
V1Pod pod = getPod(serverName);
243+
V1PodStatus status = pod.getStatus();
244+
if (status == null) {
245+
status = new V1PodStatus();
246+
}
247+
pod.setStatus(status
248+
.startTime(SystemClock.now())
249+
.phase("Pending")
250+
.addConditionsItem(new V1PodCondition().type("PodScheduled").status("False").reason("Unschedulable")
251+
.message("0/1 nodes are available: 1 node(s) didn't match pod topology"
252+
+ " spread constraints (missing required label). preemption: 0/1 nodes are available: "
253+
+ "1 Preemption is not helpful for scheduling.."))
254+
);
255+
}
256+
241257
@Test
242258
void statusStep_usesServerFromWlsConfig() {
243259
defineScenario()
@@ -1032,6 +1048,20 @@ void whenPodPendingWithinTimeLimit_removePreviousServerPodFailures() {
10321048
assertThat(getRecordedDomain(), not(hasCondition(FAILED)));
10331049
}
10341050

1051+
@Test
1052+
void whenPodIsUnschedulable_reportServerPodFailure() {
1053+
defineScenario()
1054+
.withServerState("server1", new V1ContainerStateWaiting().reason(null))
1055+
.build();
1056+
setPodUnschedulable("server1");
1057+
1058+
SystemClockTestSupport.increment(21);
1059+
updateDomainStatus();
1060+
assertThat(getRecordedDomain(), hasCondition(FAILED).withReason(SERVER_POD)
1061+
.withMessageContaining("One or more pods in the domain cannot be scheduled."));
1062+
}
1063+
1064+
10351065
// todo remove server pod failures when OK
10361066

10371067
@Test

0 commit comments

Comments
 (0)