Skip to content

Commit 4391ebe

Browse files
authored
Merge pull request #1149 from oracle/owls-74263
OWLS-74263 - NPE in operator logs during 50 domains test case
2 parents 3c861c8 + 0718fe2 commit 4391ebe

File tree

7 files changed

+252
-56
lines changed

7 files changed

+252
-56
lines changed

operator/src/main/java/oracle/kubernetes/operator/DomainStatusUpdater.java

Lines changed: 87 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
import java.util.Objects;
1212
import java.util.Optional;
1313
import java.util.Set;
14+
import java.util.function.Consumer;
1415
import java.util.function.Function;
1516
import java.util.stream.Collectors;
1617
import java.util.stream.Stream;
@@ -226,37 +227,45 @@ static class StatusUpdateStep extends Step {
226227
public NextAction apply(Packet packet) {
227228
LOGGER.entering();
228229

229-
StatusUpdateContext context = new StatusUpdateContext(packet);
230+
final StatusUpdateContext context = new StatusUpdateContext(packet);
231+
230232
DomainStatus status = context.getStatus();
231-
DomainStatus currentStatus = new DomainStatus(status);
232233

233-
if (context.getDomain() != null) {
234-
if (context.getDomainConfig().isPresent()) {
235-
status.setServers(new ArrayList<>(context.getServerStatuses().values()));
236-
status.setClusters(new ArrayList<>(context.getClusterStatuses().values()));
237-
status.setReplicas(context.getReplicaSetting());
238-
}
234+
boolean isStatusModified =
235+
modifyDomainStatus(
236+
status,
237+
s -> {
238+
if (context.getDomain() != null) {
239+
if (context.getDomainConfig().isPresent()) {
240+
s.setServers(new ArrayList<>(context.getServerStatuses().values()));
241+
s.setClusters(new ArrayList<>(context.getClusterStatuses().values()));
242+
s.setReplicas(context.getReplicaSetting());
243+
}
239244

240-
if (context.isHasFailedPod()) {
241-
status.removeConditionIf(c -> c.getType() == Available);
242-
status.removeConditionIf(c -> c.getType() == Progressing);
243-
status.addCondition(new DomainCondition(Failed).withStatus(TRUE).withReason("PodFailed"));
244-
} else {
245-
status.removeConditionIf(c -> c.getType() == Failed);
246-
if (context.allIntendedServersRunning()) {
247-
status.removeConditionIf(c -> c.getType() == Progressing);
248-
status.addCondition(
249-
new DomainCondition(Available).withStatus(TRUE).withReason(SERVERS_READY_REASON));
250-
}
251-
}
252-
}
245+
if (context.isHasFailedPod()) {
246+
s.removeConditionIf(c -> c.getType() == Available);
247+
s.removeConditionIf(c -> c.getType() == Progressing);
248+
s.addCondition(
249+
new DomainCondition(Failed).withStatus(TRUE).withReason("PodFailed"));
250+
} else {
251+
s.removeConditionIf(c -> c.getType() == Failed);
252+
if (context.allIntendedServersRunning()) {
253+
s.removeConditionIf(c -> c.getType() == Progressing);
254+
s.addCondition(
255+
new DomainCondition(Available)
256+
.withStatus(TRUE)
257+
.withReason(SERVERS_READY_REASON));
258+
}
259+
}
260+
}
261+
});
253262

254-
if (!status.equals(currentStatus)) {
263+
if (isStatusModified) {
255264
LOGGER.info(MessageKeys.DOMAIN_STATUS, context.getInfo().getDomainUid(), status);
256265
}
257266
LOGGER.exiting();
258267

259-
return !status.equals(currentStatus)
268+
return isStatusModified
260269
? doDomainUpdate(
261270
context.getDomain(), context.getInfo(), packet, StatusUpdateStep.this, getNext())
262271
: doNext(packet);
@@ -434,19 +443,24 @@ public NextAction apply(Packet packet) {
434443
LOGGER.entering();
435444

436445
DomainConditionStepContext context = new DomainConditionStepContext(packet);
437-
final DomainStatus status = context.getStatus();
438-
final DomainStatus currentStatus = new DomainStatus(status);
446+
DomainStatus status = context.getStatus();
439447

440-
status.addCondition(new DomainCondition(Progressing).withStatus(TRUE).withReason(reason));
441-
status.removeConditionIf(c -> c.getType() == Failed);
442-
if (!isPreserveAvailable) {
443-
status.removeConditionIf(c -> c.getType() == Available);
444-
}
448+
boolean isStatusModified =
449+
modifyDomainStatus(
450+
status,
451+
s -> {
452+
s.addCondition(
453+
new DomainCondition(Progressing).withStatus(TRUE).withReason(reason));
454+
s.removeConditionIf(c -> c.getType() == Failed);
455+
if (!isPreserveAvailable) {
456+
s.removeConditionIf(c -> c.getType() == Available);
457+
}
458+
});
445459

446460
LOGGER.info(MessageKeys.DOMAIN_STATUS, context.getDomain().getDomainUid(), status);
447461
LOGGER.exiting();
448462

449-
return !status.equals(currentStatus)
463+
return isStatusModified
450464
? doDomainUpdate(
451465
context.getDomain(), context.getInfo(), packet, ProgressingStep.this, getNext())
452466
: doNext(packet);
@@ -464,15 +478,19 @@ public NextAction apply(Packet packet) {
464478
LOGGER.entering();
465479

466480
DomainConditionStepContext context = new DomainConditionStepContext(packet);
467-
final DomainStatus status = context.getStatus();
468-
final DomainStatus currentStatus = new DomainStatus(status);
481+
DomainStatus status = context.getStatus();
469482

470-
status.removeConditionIf(c -> c.getType() == Progressing && TRUE.equals(c.getStatus()));
483+
boolean isStatusModified =
484+
modifyDomainStatus(
485+
status,
486+
s ->
487+
s.removeConditionIf(
488+
c -> c.getType() == Progressing && TRUE.equals(c.getStatus())));
471489

472490
LOGGER.info(MessageKeys.DOMAIN_STATUS, context.getDomain().getDomainUid(), status);
473491
LOGGER.exiting();
474492

475-
return !status.equals(currentStatus)
493+
return isStatusModified
476494
? doDomainUpdate(
477495
context.getDomain(), context.getInfo(), packet, EndProgressingStep.this, getNext())
478496
: doNext(packet);
@@ -492,21 +510,34 @@ public NextAction apply(Packet packet) {
492510
LOGGER.entering();
493511

494512
DomainConditionStepContext context = new DomainConditionStepContext(packet);
495-
final DomainStatus status = context.getStatus();
496-
final DomainStatus currentStatus = new DomainStatus(status);
513+
DomainStatus status = context.getStatus();
497514

498-
status.addCondition(new DomainCondition(Available).withStatus(TRUE).withReason(reason));
499-
status.removeConditionIf(c -> c.getType() == Failed);
515+
boolean isStatusModified =
516+
modifyDomainStatus(
517+
status,
518+
s -> {
519+
s.addCondition(new DomainCondition(Available).withStatus(TRUE).withReason(reason));
520+
s.removeConditionIf(c -> c.getType() == Failed);
521+
});
500522

501523
LOGGER.info(MessageKeys.DOMAIN_STATUS, context.getDomain().getDomainUid(), status);
502524
LOGGER.exiting();
503-
return !status.equals(currentStatus)
525+
526+
return isStatusModified
504527
? doDomainUpdate(
505528
context.getDomain(), context.getInfo(), packet, AvailableStep.this, getNext())
506529
: doNext(packet);
507530
}
508531
}
509532

533+
private static boolean modifyDomainStatus(DomainStatus domainStatus, Consumer<DomainStatus> statusUpdateConsumer) {
534+
final DomainStatus currentStatus = new DomainStatus(domainStatus);
535+
synchronized (domainStatus) {
536+
statusUpdateConsumer.accept(domainStatus);
537+
return !domainStatus.equals(currentStatus);
538+
}
539+
}
540+
510541
private static class FailedStep extends Step {
511542
private final Throwable throwable;
512543

@@ -521,21 +552,26 @@ public NextAction apply(Packet packet) {
521552

522553
DomainConditionStepContext context = new DomainConditionStepContext(packet);
523554
final DomainStatus status = context.getStatus();
524-
final DomainStatus currentStatus = new DomainStatus(status);
525-
526-
status.addCondition(
527-
new DomainCondition(Failed)
528-
.withStatus(TRUE)
529-
.withReason("Exception")
530-
.withMessage(throwable.getMessage()));
531-
if (status.hasConditionWith(c -> c.hasType(Progressing))) {
532-
status.addCondition(new DomainCondition(Progressing).withStatus(FALSE));
533-
}
555+
556+
boolean isStatusModified =
557+
modifyDomainStatus(
558+
status,
559+
s -> {
560+
s.addCondition(
561+
new DomainCondition(Failed)
562+
.withStatus(TRUE)
563+
.withReason("Exception")
564+
.withMessage(throwable.getMessage()));
565+
if (s.hasConditionWith(c -> c.hasType(Progressing))) {
566+
s.addCondition(new DomainCondition(Progressing).withStatus(FALSE));
567+
}
568+
});
569+
534570

535571
LOGGER.info(MessageKeys.DOMAIN_STATUS, context.getDomain().getDomainUid(), status);
536572
LOGGER.exiting();
537573

538-
return !status.equals(currentStatus)
574+
return isStatusModified
539575
? doDomainUpdate(
540576
context.getDomain(), context.getInfo(), packet, FailedStep.this, getNext())
541577
: doNext(packet);

operator/src/main/java/oracle/kubernetes/operator/JobWatcher.java

Lines changed: 48 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -100,7 +100,7 @@ static void defineFactory(
100100

101101
public static boolean isComplete(V1Job job) {
102102
V1JobStatus status = job.getStatus();
103-
LOGGER.info(MessageKeys.JOB_IS_COMPLETE, job.getMetadata().getName(), status);
103+
LOGGER.fine("JobWatcher.isComplete status of job " + job.getMetadata().getName() + ": " + status);
104104
if (status != null) {
105105
List<V1JobCondition> conds = status.getConditions();
106106
if (conds != null) {
@@ -129,6 +129,18 @@ public static boolean isFailed(V1Job job) {
129129
return false;
130130
}
131131

132+
public static String getFailedReason(V1Job job) {
133+
V1JobStatus status = job.getStatus();
134+
if (status != null && status.getConditions() != null) {
135+
for (V1JobCondition cond : status.getConditions()) {
136+
if ("Failed".equals(cond.getType()) && "True".equals(cond.getStatus())) {
137+
return cond.getReason();
138+
}
139+
}
140+
}
141+
return null;
142+
}
143+
132144
@Override
133145
public WatchI<V1Job> initiateWatch(WatchBuilder watchBuilder) throws ApiException {
134146
return watchBuilder
@@ -150,7 +162,7 @@ public void receivedResponse(Watch.Response<V1Job> item) {
150162
if (isComplete || isFailed) {
151163
Complete complete = completeCallbackRegistrations.get(jobName);
152164
if (complete != null) {
153-
complete.isComplete(job);
165+
complete.isComplete(job, isFailed);
154166
}
155167
}
156168
break;
@@ -175,7 +187,7 @@ public Step waitForReady(V1Job job, Step next) {
175187

176188
@FunctionalInterface
177189
private interface Complete {
178-
void isComplete(V1Job job);
190+
void isComplete(V1Job job, boolean isJobFailed);
179191
}
180192

181193
static class JobWatcherFactory {
@@ -231,14 +243,21 @@ public NextAction apply(Packet packet) {
231243
return doSuspend(
232244
(fiber) -> {
233245
Complete complete =
234-
(V1Job job) -> {
246+
(V1Job job, boolean isJobFailed) -> {
235247
if (!shouldProcessJob(job)) {
236248
return;
237249
}
238250
completeCallbackRegistrations.remove(job.getMetadata().getName());
239251
if (didResume.compareAndSet(false, true)) {
240252
LOGGER.fine("Job status: " + job.getStatus());
241253
packet.put(ProcessingConstants.DOMAIN_INTROSPECTOR_JOB, job);
254+
// Do not proceed to next step such as ReadDomainIntrospectorPodLog if job
255+
// failed due to DeadlineExceeded, as the pod container would likely not
256+
// be available for reading
257+
if (isJobFailed && "DeadlineExceeded".equals(getFailedReason(job))) {
258+
fiber.terminate(
259+
new DeadlineExceededException(job), packet);
260+
}
242261
fiber.resume(packet);
243262
}
244263
};
@@ -283,4 +302,29 @@ public NextAction onSuccess(
283302
});
284303
}
285304
}
305+
306+
static class DeadlineExceededException extends Exception {
307+
final V1Job job;
308+
309+
public DeadlineExceededException(V1Job job) {
310+
super();
311+
this.job = job;
312+
}
313+
314+
public String toString() {
315+
return LOGGER.getFormattedMessage(
316+
MessageKeys.JOB_DEADLINE_EXCEEDED_MESSAGE,
317+
job.getMetadata().getName(),
318+
job.getSpec().getActiveDeadlineSeconds(),
319+
getJobStartedSeconds(),
320+
DomainPresence.getDomainPresenceFailureRetryMaxCount());
321+
}
322+
323+
private long getJobStartedSeconds() {
324+
if (job.getStatus() != null && job.getStatus().getStartTime() != null) {
325+
return (System.currentTimeMillis() - job.getStatus().getStartTime().getMillis()) / 1000;
326+
}
327+
return -1;
328+
}
329+
}
286330
}

operator/src/main/java/oracle/kubernetes/operator/logging/LoggingFacade.java

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44

55
package oracle.kubernetes.operator.logging;
66

7+
import java.text.MessageFormat;
78
import java.util.Arrays;
89
import java.util.logging.ConsoleHandler;
910
import java.util.logging.Handler;
@@ -632,6 +633,21 @@ public void trace(String msg, Object... args) {
632633
finer(TRACE + msg, args);
633634
}
634635

636+
/**
637+
* Returns a formatted message.
638+
*
639+
* @param msg the key used to look up the message pattern in the logger's resource bundle
640+
* @param args parameters to the message
641+
* @return A formatted message
642+
*/
643+
public String getFormattedMessage(String msg, Object... args) {
644+
try {
645+
return MessageFormat.format(logger.getResourceBundle().getString(msg), args);
646+
} catch (Exception ex) {
647+
return msg;
648+
}
649+
}
650+
635651
/**
636652
* Obtains caller details, class name and method, to be provided to the actual Logger. This code
637653
* is adapted from ODLLogRecord, which should yield consistency in reporting using PlatformLogger

operator/src/main/java/oracle/kubernetes/operator/logging/MessageKeys.java

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -153,6 +153,7 @@ public class MessageKeys {
153153
public static final String EXTERNAL_CHANNEL_SERVICE_REPLACED = "WLSKO-0151";
154154
public static final String EXTERNAL_CHANNEL_SERVICE_EXISTS = "WLSKO-0152";
155155
public static final String WLS_HEALTH_READ_FAILED_NO_HTTPCLIENT = "WLSKO-0153";
156+
public static final String JOB_DEADLINE_EXCEEDED_MESSAGE = "WLSKO-0154";
156157

157158
private MessageKeys() {
158159
}

operator/src/main/resources/Operator.properties

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -151,3 +151,13 @@ WLSKO-0150=Creating external channel service for WebLogic domain with UID: {0}.
151151
WLSKO-0151=Replacing external channel service for WebLogic domain with UID: {0}.
152152
WLSKO-0152=Existing external channel service is correct for WebLogic domain with UID: {0}.
153153
WLSKO-0153=Failed to read health information from server {0}. Unable to connect to server.
154+
WLSKO-0154=Job {0} failed due to reason: DeadlineExceeded. \
155+
ActiveDeadlineSeconds of the job is configured with {1} seconds. \
156+
The job was started {2} seconds ago. \
157+
Ensure all domain dependencies have been deployed \
158+
(any secrets, config-maps, PVs, and PVCs that the domain resource references). \
159+
Use kubectl describe for the job and its pod for more job failure information. \
160+
The job may be retried by the operator up to {3} \
161+
times with longer ActiveDeadlineSeconds value in each subsequent retry. \
162+
Use tuning parameter 'domainPresenceFailureRetryMaxCount' to configure max retries.
163+

0 commit comments

Comments
 (0)