Skip to content

Commit b9c255d

Browse files
committed
limit number of retries in DomainProcessorImpl.runDomainPlan
1 parent fbbc6e0 commit b9c255d

File tree

9 files changed: 56 additions and 14 deletions.

operator/src/main/java/oracle/kubernetes/operator/DomainPresence.java

Lines changed: 7 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -8,10 +8,17 @@
88

99
public class DomainPresence {
1010
private static final int DEFAULT_TIMEOUT_SECONDS = 5;
11+
private static final int DEFAULT_RETRY_MAX_COUNT = 5;
1112

1213
static int getDomainPresenceFailureRetrySeconds() {
1314
return Optional.ofNullable(TuningParameters.getInstance())
1415
.map(parameters -> parameters.getMainTuning().domainPresenceFailureRetrySeconds)
1516
.orElse(DEFAULT_TIMEOUT_SECONDS);
1617
}
18+
19+
static int getDomainPresenceFailureRetryMaxCount() {
20+
return Optional.ofNullable(TuningParameters.getInstance())
21+
.map(parameters -> parameters.getMainTuning().domainPresenceFailureRetryMaxCount)
22+
.orElse(DEFAULT_RETRY_MAX_COUNT);
23+
}
1724
}

operator/src/main/java/oracle/kubernetes/operator/DomainProcessorImpl.java

Lines changed: 27 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -813,7 +813,10 @@ void runDomainPlan(
813813
new CompletionCallback() {
814814
@Override
815815
public void onCompletion(Packet packet) {
816-
// no-op
816+
DomainPresenceInfo existing = getExistingDomainPresenceInfo(ns, domainUID);
817+
if (existing != null) {
818+
existing.resetFailureCount();
819+
}
817820
}
818821

819822
@Override
@@ -837,17 +840,29 @@ public void onThrowable(Packet packet, Throwable throwable) {
837840
}
838841
});
839842

840-
gate.getExecutor()
841-
.schedule(
842-
() -> {
843-
DomainPresenceInfo existing = getExistingDomainPresenceInfo(ns, domainUID);
844-
if (existing != null) {
845-
existing.setPopulated(false);
846-
makeRightDomainPresence(existing, true, isDeleting, false);
847-
}
848-
},
849-
DomainPresence.getDomainPresenceFailureRetrySeconds(),
850-
TimeUnit.SECONDS);
843+
DomainPresenceInfo existing = getExistingDomainPresenceInfo(ns, domainUID);
844+
if (existing != null) {
845+
int failureCount = existing.incrementAndGetFailureCount();
846+
LOGGER.finer(
847+
"Failure count for DomainPresenceInfo: " + existing + " is now: " + failureCount);
848+
if (failureCount > DomainPresence.getDomainPresenceFailureRetryMaxCount()) {
849+
LOGGER.warning(
850+
MessageKeys.CANNOT_START_DOMAIN_AFTER_MAX_RETRIES,
851+
domainUID,
852+
ns,
853+
DomainPresence.getDomainPresenceFailureRetryMaxCount(),
854+
throwable);
855+
} else {
856+
gate.getExecutor()
857+
.schedule(
858+
() -> {
859+
existing.setPopulated(false);
860+
makeRightDomainPresence(existing, true, isDeleting, false);
861+
},
862+
DomainPresence.getDomainPresenceFailureRetrySeconds(),
863+
TimeUnit.SECONDS);
864+
}
865+
}
851866
}
852867
};
853868

operator/src/main/java/oracle/kubernetes/operator/JobWatcher.java

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -83,7 +83,7 @@ public void receivedResponse(Watch.Response<V1Job> item) {
8383
case "ADDED":
8484
case "MODIFIED":
8585
V1Job job = item.object;
86-
Boolean isComplete = isComplete(job); // isReady(job);
86+
Boolean isComplete = isComplete(job) || isFailed(job);
8787
String jobName = job.getMetadata().getName();
8888
if (isComplete) {
8989
Complete complete = completeCallbackRegistrations.remove(jobName);

operator/src/main/java/oracle/kubernetes/operator/TuningParameters.java

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -21,6 +21,7 @@ public static TuningParameters getInstance() {
2121

2222
public static class MainTuning {
2323
public final int domainPresenceFailureRetrySeconds;
24+
public final int domainPresenceFailureRetryMaxCount;
2425
public final int domainPresenceRecheckIntervalSeconds;
2526
public final int statusUpdateTimeoutSeconds;
2627
public final int unchangedCountToDelayStatusRecheck;
@@ -29,12 +30,14 @@ public static class MainTuning {
2930

3031
public MainTuning(
3132
int domainPresenceFailureRetrySeconds,
33+
int domainPresenceFailureRetryMaxCount,
3234
int domainPresenceRecheckIntervalSeconds,
3335
int statusUpdateTimeoutSeconds,
3436
int unchangedCountToDelayStatusRecheck,
3537
long initialShortDelay,
3638
long eventualLongDelay) {
3739
this.domainPresenceFailureRetrySeconds = domainPresenceFailureRetrySeconds;
40+
this.domainPresenceFailureRetryMaxCount = domainPresenceFailureRetryMaxCount;
3841
this.domainPresenceRecheckIntervalSeconds = domainPresenceRecheckIntervalSeconds;
3942
this.statusUpdateTimeoutSeconds = statusUpdateTimeoutSeconds;
4043
this.unchangedCountToDelayStatusRecheck = unchangedCountToDelayStatusRecheck;

operator/src/main/java/oracle/kubernetes/operator/TuningParametersImpl.java

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -52,6 +52,7 @@ private void update() {
5252
MainTuning main =
5353
new MainTuning(
5454
(int) readTuningParameter("domainPresenceFailureRetrySeconds", 10),
55+
(int) readTuningParameter("domainPresenceFailureRetryMaxCount", 5),
5556
(int) readTuningParameter("domainPresenceRecheckIntervalSeconds", 120),
5657
(int) readTuningParameter("statusUpdateTimeoutSeconds", 10),
5758
(int) readTuningParameter("statusUpdateUnchangedCountToDelayStatusRecheck", 10),

operator/src/main/java/oracle/kubernetes/operator/helpers/DomainPresenceInfo.java

Lines changed: 10 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -12,6 +12,7 @@
1212
import java.util.concurrent.ConcurrentHashMap;
1313
import java.util.concurrent.ConcurrentMap;
1414
import java.util.concurrent.atomic.AtomicBoolean;
15+
import java.util.concurrent.atomic.AtomicInteger;
1516
import java.util.concurrent.atomic.AtomicReference;
1617
import oracle.kubernetes.operator.wlsconfig.WlsServerConfig;
1718
import oracle.kubernetes.weblogic.domain.v2.Domain;
@@ -28,6 +29,7 @@ public class DomainPresenceInfo {
2829
private final AtomicReference<Domain> domain;
2930
private final AtomicBoolean isDeleting = new AtomicBoolean(false);
3031
private final AtomicBoolean isPopulated = new AtomicBoolean(false);
32+
private final AtomicInteger failureCount = new AtomicInteger(0);
3133
private final AtomicReference<Collection<ServerStartupInfo>> serverStartupInfo;
3234

3335
private final ConcurrentMap<String, ServerKubernetesObjects> servers = new ConcurrentHashMap<>();
@@ -75,6 +77,14 @@ public void setPopulated(boolean populated) {
7577
isPopulated.set(populated);
7678
}
7779

80+
public void resetFailureCount() {
81+
failureCount.set(0);
82+
}
83+
84+
public int incrementAndGetFailureCount() {
85+
return failureCount.incrementAndGet();
86+
}
87+
7888
/**
7989
* Last completion time
8090
*

operator/src/main/java/oracle/kubernetes/operator/logging/MessageKeys.java

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -144,4 +144,5 @@ private MessageKeys() {}
144144
public static final String CANNOT_PARSE_TOPOLOGY = "WLSKO-0141";
145145
public static final String CANNOT_PARSE_INTROSPECTOR_RESULT = "WLSKO-0142";
146146
public static final String CANNOT_PARSE_INTROSPECTOR_FILE = "WLSKO-0143";
147+
public static final String CANNOT_START_DOMAIN_AFTER_MAX_RETRIES = "WLSKO-0144";
147148
}

operator/src/main/resources/Operator.properties

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -142,3 +142,4 @@ WLSKO-0140=Job {0} is completed with status: {1}
142142
WLSKO-0141=Failed to parse WebLogic Domain topology due to exception: {0}
143143
WLSKO-0142=Failed to parse results from domain introspector for domain {0} due to exception: {1}
144144
WLSKO-0143=Failed to parse file {0} from domain introspector for domain {1} due to exception: {2}
145+
WLSKO-0144=Unable to start domain with domainUID {0} in namespace {1} after {2} attempts due to exception: {3}

operator/src/main/resources/scripts/startNodeManager.sh

Lines changed: 5 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -113,9 +113,12 @@ createFolder ${NODEMGR_LOG_HOME}
113113

114114
nodemgr_log_file=${NODEMGR_LOG_HOME}/${SERVER_NAME}_nodemanager.log
115115
nodemgr_out_file=${NODEMGR_LOG_HOME}/${SERVER_NAME}_nodemanager.out
116+
nodemgr_lck_file=${NODEMGR_LOG_HOME}/${SERVER_NAME}_nodemanager.log.lck
116117

117-
checkEnv NODEMGR_LOG_HOME nodemgr_log_file nodemgr_out_file
118+
checkEnv NODEMGR_LOG_HOME nodemgr_log_file nodemgr_out_file nodemgr_lck_file
118119

120+
trace "remove nodemanager .lck file"
121+
rm -f ${nodemgr_lck_file}
119122

120123

121124
###############################################################################
@@ -209,6 +212,7 @@ if [ ! "${SERVER_NAME}" = "introspector" ]; then
209212
rm -f ${wl_state_file} || exit 1
210213
fi
211214

215+
212216
cat <<EOF > ${wl_props_file}
213217
# Server startup properties
214218
AutoRestart=true

0 commit comments