Skip to content

Commit c5a975d

Browse files
authored
Merge pull request #625 from oracle/owls-70659
OWLS-70659 Provide ability to retry introspection due to DeadlineExceeded status
2 parents aba633d + 386c40f commit c5a975d

File tree

10 files changed

+79
-6
lines changed

10 files changed

+79
-6
lines changed

operator/src/main/java/oracle/kubernetes/operator/DomainPresence.java

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,10 +8,17 @@
88

99
/**
 * Package-level accessors for the domain presence failure retry settings. Each accessor reads its
 * value from the main tuning section of {@link TuningParameters} and falls back to a built-in
 * default when {@code TuningParameters.getInstance()} returns {@code null}.
 */
public class DomainPresence {
1010
// Fallback retry delay, in seconds, used when no TuningParameters instance is available.
// NOTE(review): TuningParametersImpl appears to read "domainPresenceFailureRetrySeconds" with a
// default of 10, so the two defaults differ -- confirm this is intended.
private static final int DEFAULT_TIMEOUT_SECONDS = 5;
11+
// Fallback maximum number of retries, used when no TuningParameters instance is available.
private static final int DEFAULT_RETRY_MAX_COUNT = 5;
1112

1213
/**
 * Returns the configured {@code domainPresenceFailureRetrySeconds} value, or
 * {@code DEFAULT_TIMEOUT_SECONDS} when no {@link TuningParameters} instance exists.
 *
 * <p>NOTE(review): a non-null instance whose {@code getMainTuning()} returns {@code null} would
 * throw inside the mapping function rather than fall back -- confirm that cannot happen.
 *
 * @return the retry delay in seconds
 */
static int getDomainPresenceFailureRetrySeconds() {
1314
return Optional.ofNullable(TuningParameters.getInstance())
1415
.map(parameters -> parameters.getMainTuning().domainPresenceFailureRetrySeconds)
1516
.orElse(DEFAULT_TIMEOUT_SECONDS);
1617
}
18+
19+
/**
 * Returns the configured {@code domainPresenceFailureRetryMaxCount} value, or
 * {@code DEFAULT_RETRY_MAX_COUNT} when no {@link TuningParameters} instance exists.
 *
 * @return the maximum number of retries
 */
static int getDomainPresenceFailureRetryMaxCount() {
20+
return Optional.ofNullable(TuningParameters.getInstance())
21+
.map(parameters -> parameters.getMainTuning().domainPresenceFailureRetryMaxCount)
22+
.orElse(DEFAULT_RETRY_MAX_COUNT);
23+
}
1724
}

operator/src/main/java/oracle/kubernetes/operator/DomainProcessorImpl.java

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -843,7 +843,23 @@ public void onThrowable(Packet packet, Throwable throwable) {
843843
DomainPresenceInfo existing = getExistingDomainPresenceInfo(ns, domainUID);
844844
if (existing != null) {
845845
existing.setPopulated(false);
846-
makeRightDomainPresence(existing, true, isDeleting, false);
846+
// proceed only if we have not already retried max number of times
847+
int retryCount = existing.incrementAndGetFailureCount();
848+
LOGGER.fine(
849+
"Failure count for DomainPresenceInfo: "
850+
+ existing
851+
+ " is now: "
852+
+ retryCount);
853+
if (retryCount <= DomainPresence.getDomainPresenceFailureRetryMaxCount()) {
854+
makeRightDomainPresence(existing, true, isDeleting, false);
855+
} else {
856+
LOGGER.severe(
857+
MessageKeys.CANNOT_START_DOMAIN_AFTER_MAX_RETRIES,
858+
domainUID,
859+
ns,
860+
DomainPresence.getDomainPresenceFailureRetryMaxCount(),
861+
throwable);
862+
}
847863
}
848864
},
849865
DomainPresence.getDomainPresenceFailureRetrySeconds(),

operator/src/main/java/oracle/kubernetes/operator/JobWatcher.java

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -83,10 +83,11 @@ public void receivedResponse(Watch.Response<V1Job> item) {
8383
case "ADDED":
8484
case "MODIFIED":
8585
V1Job job = item.object;
86-
Boolean isComplete = isComplete(job); // isReady(job);
86+
Boolean isComplete = isComplete(job);
87+
Boolean isFailed = isFailed(job);
8788
String jobName = job.getMetadata().getName();
88-
if (isComplete) {
89-
Complete complete = completeCallbackRegistrations.remove(jobName);
89+
if (isComplete || isFailed) {
90+
Complete complete = completeCallbackRegistrations.get(jobName);
9091
if (complete != null) {
9192
complete.isComplete(job);
9293
}
@@ -150,6 +151,11 @@ private WaitForJobReadyStep(V1Job job, Step next) {
150151
this.job = job;
151152
}
152153

154+
/**
 * Tells whether the supplied job should be handled by this step, by comparing its creation
 * timestamp (in milliseconds) with that of the job this step was constructed with.
 *
 * <p>NOTE(review): the metadata and creation timestamp of both jobs are dereferenced without
 * null checks -- confirm they are always populated for jobs reaching this point.
 *
 * @param job the job to compare against this step's job
 * @return {@code true} when both creation timestamps are equal to the millisecond
 */
boolean shouldProcessJob(V1Job job) {
155+
return (this.job.getMetadata().getCreationTimestamp().getMillis()
156+
== job.getMetadata().getCreationTimestamp().getMillis());
157+
}
158+
153159
@Override
154160
public NextAction apply(Packet packet) {
155161
if (isComplete(job)) {
@@ -165,6 +171,10 @@ public NextAction apply(Packet packet) {
165171
(fiber) -> {
166172
Complete complete =
167173
(V1Job job) -> {
174+
if (!shouldProcessJob(job)) {
175+
return;
176+
}
177+
completeCallbackRegistrations.remove(job.getMetadata().getName());
168178
if (didResume.compareAndSet(false, true)) {
169179
LOGGER.fine("Job status: " + job.getStatus());
170180
packet.put(ProcessingConstants.DOMAIN_INTROSPECTOR_JOB, job);

operator/src/main/java/oracle/kubernetes/operator/TuningParameters.java

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ public static TuningParameters getInstance() {
2121

2222
public static class MainTuning {
2323
public final int domainPresenceFailureRetrySeconds;
24+
public final int domainPresenceFailureRetryMaxCount;
2425
public final int domainPresenceRecheckIntervalSeconds;
2526
public final int targetNamespaceRecheckIntervalSeconds;
2627
public final int statusUpdateTimeoutSeconds;
@@ -30,13 +31,15 @@ public static class MainTuning {
3031

3132
public MainTuning(
3233
int domainPresenceFailureRetrySeconds,
34+
int domainPresenceFailureRetryMaxCount,
3335
int domainPresenceRecheckIntervalSeconds,
3436
int targetNamespaceRecheckIntervalSeconds,
3537
int statusUpdateTimeoutSeconds,
3638
int unchangedCountToDelayStatusRecheck,
3739
long initialShortDelay,
3840
long eventualLongDelay) {
3941
this.domainPresenceFailureRetrySeconds = domainPresenceFailureRetrySeconds;
42+
this.domainPresenceFailureRetryMaxCount = domainPresenceFailureRetryMaxCount;
4043
this.domainPresenceRecheckIntervalSeconds = domainPresenceRecheckIntervalSeconds;
4144
this.targetNamespaceRecheckIntervalSeconds = targetNamespaceRecheckIntervalSeconds;
4245
this.statusUpdateTimeoutSeconds = statusUpdateTimeoutSeconds;

operator/src/main/java/oracle/kubernetes/operator/TuningParametersImpl.java

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,7 @@ private void update() {
5252
MainTuning main =
5353
new MainTuning(
5454
(int) readTuningParameter("domainPresenceFailureRetrySeconds", 10),
55+
(int) readTuningParameter("domainPresenceFailureRetryMaxCount", 5),
5556
(int) readTuningParameter("domainPresenceRecheckIntervalSeconds", 120),
5657
(int) readTuningParameter("targetNamespaceRecheckIntervalSeconds", 3),
5758
(int) readTuningParameter("statusUpdateTimeoutSeconds", 10),

operator/src/main/java/oracle/kubernetes/operator/helpers/DomainPresenceInfo.java

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
import java.util.concurrent.ConcurrentHashMap;
1313
import java.util.concurrent.ConcurrentMap;
1414
import java.util.concurrent.atomic.AtomicBoolean;
15+
import java.util.concurrent.atomic.AtomicInteger;
1516
import java.util.concurrent.atomic.AtomicReference;
1617
import oracle.kubernetes.operator.wlsconfig.WlsServerConfig;
1718
import oracle.kubernetes.weblogic.domain.v2.Domain;
@@ -28,6 +29,7 @@ public class DomainPresenceInfo {
2829
private final AtomicReference<Domain> domain;
2930
private final AtomicBoolean isDeleting = new AtomicBoolean(false);
3031
private final AtomicBoolean isPopulated = new AtomicBoolean(false);
32+
private final AtomicInteger retryCount = new AtomicInteger(0);
3133
private final AtomicReference<Collection<ServerStartupInfo>> serverStartupInfo;
3234

3335
private final ConcurrentMap<String, ServerKubernetesObjects> servers = new ConcurrentHashMap<>();
@@ -75,6 +77,18 @@ public void setPopulated(boolean populated) {
7577
isPopulated.set(populated);
7678
}
7779

80+
/** Resets the retry counter held in {@code retryCount} to zero. */
public void resetFailureCount() {
81+
retryCount.set(0);
82+
}
83+
84+
/**
 * Atomically adds one to the retry counter held in {@code retryCount}.
 *
 * @return the counter value after the increment
 */
public int incrementAndGetFailureCount() {
85+
return retryCount.incrementAndGet();
86+
}
87+
88+
/**
 * Reads the retry counter without modifying it. This is the same counter that
 * {@code incrementAndGetFailureCount()} increments and {@code resetFailureCount()} clears.
 *
 * @return the current counter value
 */
public int getRetryCount() {
89+
return retryCount.get();
90+
}
91+
7892
/**
7993
* Last completion time
8094
*
@@ -87,6 +101,7 @@ public DateTime getLastCompletionTime() {
87101
/** Sets the last completion time to now and resets the failure (retry) counter to zero. */
88102
public void complete() {
89103
this.lastCompletionTime = new DateTime();
104+
resetFailureCount();
90105
}
91106

92107
/**

operator/src/main/java/oracle/kubernetes/operator/helpers/JobStepContext.java

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,8 @@ public abstract class JobStepContext implements StepContextConstants {
2121

2222
private final DomainPresenceInfo info;
2323
private V1Job jobModel;
24+
final long DEFAULT_ACTIVE_DEADLINE_SECONDS = 120L;
25+
final long DEFAULT_ACTIVE_DEADLINE_INCREMENT_SECONDS = 60L;
2426

2527
JobStepContext(Packet packet) {
2628
info = packet.getSPI(DomainPresenceInfo.class);
@@ -165,9 +167,22 @@ V1ObjectMeta createMetadata() {
165167
return metadata;
166168
}
167169

170+
/**
 * Computes the active deadline for the job: {@code DEFAULT_ACTIVE_DEADLINE_SECONDS} plus
 * {@code DEFAULT_ACTIVE_DEADLINE_INCREMENT_SECONDS} for each retry currently recorded on
 * {@code info}, so the deadline grows as the retry count grows.
 *
 * @return the deadline in seconds
 */
private long getActiveDeadlineSeconds() {
171+
return DEFAULT_ACTIVE_DEADLINE_SECONDS
172+
+ (DEFAULT_ACTIVE_DEADLINE_INCREMENT_SECONDS * info.getRetryCount());
173+
}
174+
168175
/**
 * Builds the job spec: a backoff limit of 0, an active deadline taken from
 * {@code getActiveDeadlineSeconds()}, and the pod template produced by
 * {@code createPodTemplateSpec(tuningParameters)}. The job name and chosen deadline are logged
 * at FINE level before the spec is built.
 *
 * @param tuningParameters passed through to {@code createPodTemplateSpec}
 * @return the populated job spec
 */
protected V1JobSpec createJobSpec(TuningParameters tuningParameters) {
176+
LOGGER.fine(
177+
"Creating job "
178+
+ getJobName()
179+
+ " with activeDeadlineSeconds = "
180+
+ getActiveDeadlineSeconds());
169181
V1JobSpec jobSpec =
170-
new V1JobSpec().backoffLimit(0).template(createPodTemplateSpec(tuningParameters));
182+
new V1JobSpec()
183+
.backoffLimit(0)
184+
.activeDeadlineSeconds(getActiveDeadlineSeconds())
185+
.template(createPodTemplateSpec(tuningParameters));
171186

172187
return jobSpec;
173188
}

operator/src/main/java/oracle/kubernetes/operator/logging/MessageKeys.java

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -144,4 +144,5 @@ private MessageKeys() {}
144144
public static final String CANNOT_PARSE_TOPOLOGY = "WLSKO-0141";
145145
public static final String CANNOT_PARSE_INTROSPECTOR_RESULT = "WLSKO-0142";
146146
public static final String CANNOT_PARSE_INTROSPECTOR_FILE = "WLSKO-0143";
147+
public static final String CANNOT_START_DOMAIN_AFTER_MAX_RETRIES = "WLSKO-0144";
147148
}

operator/src/main/resources/Operator.properties

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -142,3 +142,4 @@ WLSKO-0140=Job {0} is completed with status: {1}
142142
WLSKO-0141=Failed to parse WebLogic Domain topology due to exception: {0}
143143
WLSKO-0142=Failed to parse results from domain introspector for domain {0} due to exception: {1}
144144
WLSKO-0143=Failed to parse file {0} from domain introspector for domain {1} due to exception: {2}
145+
WLSKO-0144=Unable to start domain with domainUID {0} in namespace {1} after {2} attempts due to exception: {3}

operator/src/main/resources/scripts/startNodeManager.sh

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -113,9 +113,12 @@ createFolder ${NODEMGR_LOG_HOME}
113113

114114
nodemgr_log_file=${NODEMGR_LOG_HOME}/${SERVER_NAME}_nodemanager.log
115115
nodemgr_out_file=${NODEMGR_LOG_HOME}/${SERVER_NAME}_nodemanager.out
116+
nodemgr_lck_file=${NODEMGR_LOG_HOME}/${SERVER_NAME}_nodemanager.log.lck
116117

117-
checkEnv NODEMGR_LOG_HOME nodemgr_log_file nodemgr_out_file
118+
checkEnv NODEMGR_LOG_HOME nodemgr_log_file nodemgr_out_file nodemgr_lck_file
118119

120+
trace "remove nodemanager .lck file"
121+
rm -f ${nodemgr_lck_file}
119122

120123

121124
###############################################################################
@@ -209,6 +212,7 @@ if [ ! "${SERVER_NAME}" = "introspector" ]; then
209212
rm -f ${wl_state_file} || exit 1
210213
fi
211214

215+
212216
cat <<EOF > ${wl_props_file}
213217
# Server startup properties
214218
AutoRestart=true

0 commit comments

Comments
 (0)