Skip to content

Commit d34be8f

Browse files
committed
Add Coherence StatusHA check for server shutdown
1 parent a2869d8 commit d34be8f

File tree

1 file changed

+219
-30
lines changed

1 file changed

+219
-30
lines changed

operator/src/main/resources/scripts/stop-server.py

Lines changed: 219 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -2,23 +2,200 @@
22
# Licensed under the Universal Permissive License v 1.0 as shown at
33
# http://oss.oracle.com/licenses/upl.
44

5+
# This is the preStop hook script (stopServer.sh) that is specified by the domain resource.
6+
# This script will be called by Kubernetes before it kills a pod. Kubernetes will kill
7+
# the pod once this script returns OR when the grace period timeout expires
8+
# (see Operator shutdown.timeoutSeconds)
59
#
6-
# This is a WLST script for starting WL server via the node manager.
10+
# This script shuts down the local server running in the container (admin or managed).
11+
# There are 2 main scenarios, a domain with a Coherence cluster and one without.
12+
# If there is no Coherence cluster, then the code will attempt to connect to the
13+
# local server and do a shutdown. If that doesn't work it will attempt to shutdown
14+
# using NodeManager. If that fails it just exits and kubernetes will kill the pod.
15+
# If a Coherence cluster exists, then we must wait for the Coherence services to be
16+
# safe before the server is shutdown, or we risk losing data. This requires a
17+
# connection to the admin server and inspection of the Coherence MBeans. The code
18+
# will not shutdown the server until Coherence is safe. If Coherence never becomes safe,
19+
# then, eventually, the grace period will expire and Kubernetes will kill the pod.
720
#
821
# It uses key and data files that were generated by the introspector
922
# for its nmConnect credentials.
1023
#
1124

1225
import sys
26+
import traceback
1327
import base64
28+
import time as systime
1429

30+
# Get an ENV var
1531
def getEnvVar(var):
    """Return the value of the environment variable named *var*.

    If the variable is not set, an error line is printed and the script
    terminates through sys.exit(1).
    """
    # Imported here because the file-level imports visible in this script
    # (sys, traceback, base64, time) do not bring `os` into scope.
    import os

    val = os.environ.get(var)
    if val is None:
        # Emits the same text as the former comma-separated print statement
        # (which inserted a space between items), but as a single expression
        # so it parses the same way under every Python/Jython level.
        print("ERROR: Env var  " + str(var) + "  not set.")
        sys.exit(1)
    return val
2137

38+
39+
# Connect to Node Manager and shut down the local server
40+
#
41+
def shutdownUsingNodeManager(domainName, domainDir):
    """Kill the local server through the Node Manager at 127.0.0.1:5556.

    Connects with the Node Manager user config/key files, then calls nmKill
    for the module-level server_name.

    domainName -- passed to nmConnect as domainName
    domainDir  -- passed to nmConnect as domainDir

    Any exception is logged (stack trace on stdout) and re-raised so the
    caller can decide how to exit.
    """
    config_file = '/weblogic-operator/introspector/userConfigNodeManager.secure'
    key_file = '/tmp/userKeyNodeManager.secure.bin'

    try:
        print('Shutdown: Attempting shutdown using NodeManager')
        nmConnect(userConfigFile=config_file,
                  userKeyFile=key_file,
                  host='127.0.0.1', port='5556',
                  domainName=domainName,
                  domainDir=domainDir,
                  nmType='plain')
        print('Shutdown: Successfully connected to NodeManager')

        nmKill(server_name)
        print('Shutdown: Successfully killed server using NodeManager')
    except Exception:
        traceback.print_exc(file=sys.stdout)
        print('Shutdown: Failed to kill server using NodeManager')
        # Propagate the original failure to the caller.
        raise
57+
58+
59+
# Check if Coherence exists using wlst offline
60+
def doesCoherenceExist():
    """Report whether the domain at domain_path has a Coherence cluster.

    Reads the domain offline and delegates the check to
    checkCoherenceClusterExist().  If reading or checking raises, the
    function returns True, i.e. it assumes Coherence exists.
    """
    try:
        readDomain(domain_path)
        found = checkCoherenceClusterExist()
    except:
        print('Shutdown: Exception reading domain offline, assume Coherence exists')
        return True

    # Closing the domain is best effort; a failure here does not change
    # the answer that was already computed.
    try:
        closeDomain()
    except:
        pass

    return found
72+
73+
# Check if there is a CoherenceClusterSystemResource. This will indicate that the domain is
74+
# using Coherence
75+
def checkCoherenceClusterExist():
    """Return True when a CoherenceClusterSystemResource is configured.

    Looks at the listing of '/' and then at the listing of the
    CoherenceClusterSystemResource folder.  Returns False when the folder
    is not listed, when its listing is None or blank, or when any call
    raises (the failure is logged first).
    """
    resource = 'CoherenceClusterSystemResource'
    try:
        cd('/')
        if ls().find(resource) < 0:
            return False

        cd(resource)
        listing = ls()
        if listing is not None and len(listing.strip()) > 0:
            print('Shutdown: This domain has CoherenceClusterSystemResource ' + listing)
            return True

        print('Shutdown: This domain does not have a CoherenceClusterSystemResource')
        return False
    except:
        # Exception will occur if CoherenceClusterSystemResource is missing
        traceback.print_exc(file=sys.stdout)
        dumpStack()
        print('Shutdown: Exception getting CoherenceClusterSystemResource')
        return False
94+
95+
96+
# Coherence exists and we cannot connect to the admin server. To be on
97+
# the safe side, loop forever until the Operator kills the pod otherwise
98+
# there is a risk to lose data
99+
def waitForeverBecauseCoherence():
    """Block indefinitely, logging once per 60 second sleep.

    This function never returns; the loop has no exit condition, so the
    script only ends when the process is terminated from outside.
    """
    print('Shutdown: Waiting until Operator kills the pod since this domain has Coherence')
    while True:
        print('Shutdown: Sleeping 60 seconds ...')
        systime.sleep(60)
104+
105+
106+
# If there is a Coherence cluster then we wait until it is safe to shutdown
107+
# all distributed services. Each distributed cache (with backup count > 0) has data
108+
# that is held both in a primary partition on one cluster node (i.e. WLS server) and
109+
# a backup partition on a different node.
110+
# During rolling restart, Coherence rebalances the cluster and moves both primary and
111+
# backup partitions across the nodes. During this time the distributed services affected
112+
# are considered ENDANGERED. For example, in a 3 node cluster, if node 1 was being restarted
113+
# and we shut down node 2 before Coherence was safe then we may lose data.
114+
# NOTE: The overall timeout used to control the termination of this container is
115+
# controlled by the Kubernetes graceful timeout value. So even
116+
# if this code looped forever, Kubernetes will kill the pod once the timeout expires.
117+
# The user must set that timeout to a large enough value to give Coherence time to get safe.
118+
def waitUntilCoherenceSafe():
    """Block until every Coherence service MBean reports it is safe.

    Switches to the domainRuntime tree, queries all MBeans matching
    'Coherence:type=Service,name=*,*' and calls
    waitUntilServiceSafeToShutdown() for each one.  If anything raises, the
    failure is logged, the function sleeps 30 seconds and the whole query is
    repeated; there is no retry limit.
    """
    print('Shutdown: getting all service Coherence MBeans')

    domainRuntime()
    service_pattern = 'Coherence:type=Service,name=*,*'

    # Keep trying until one full pass over all services completes without
    # an exception; only then is it considered safe to shut down.
    while True:
        try:
            service_beans = list(mbs.queryMBeans(ObjectName(service_pattern), None))
            print("Shutdown: Coherence service bean count " + str(len(service_beans)))

            # Each call blocks until that particular service is safe.
            for bean in service_beans:
                waitUntilServiceSafeToShutdown(bean.getObjectName())

            print('Shutdown: It is safe to shutdown Coherence')
            break
        except:
            print("Shutdown: Exception checking a service Coherence statusHA, retrying...")
            traceback.print_exc(file=sys.stdout)
            dumpStack()
            systime.sleep(30)
144+
145+
146+
# Wait until the specified Coherence service is safe to shutdown.
147+
# If the cluster is a single node cluster then the service will always
148+
# be ENDANGERED, therefore it is the responsibility of the user to
149+
# set Coherence backup count to 0, or to set the terminate grace period
150+
# to a low number since this method will just wait until the operator kills the
151+
# pod.
152+
def waitUntilServiceSafeToShutdown(objectName):
    """Block until the Coherence service *objectName* may be shut down.

    objectName -- MBean object name whose BackupCount and StatusHA
                  attributes are read through mbs.getAttribute

    Returns as soon as one of these holds:
      * BackupCount cannot be read, or is less than 1
      * StatusHA is None
      * StatusHA is anything other than "ENDANGERED"
    While StatusHA is "ENDANGERED" the check repeats every 5 seconds.  Any
    other exception is logged and the check repeats after 30 seconds; there
    is no retry limit.
    """
    print("Shutdown: checking Coherence service " + str(objectName))

    # NOTE: return when it is safe to shutdown, otherwise keep looping
    while True:
        try:
            # A BackupCount > 0 means the user wants HA with backed up
            # partitions; below 1 there is nothing to wait for.
            # NOTE: if this is NOT a partitioned service the attribute read
            # raises; that is treated like a missing value.
            try:
                backup_count = mbs.getAttribute(objectName, "BackupCount")
            except:
                print("Shutdown: Coherence BackupCount attribute missing")
                backup_count = None

            if backup_count is None or int(backup_count) < 1:
                print("Shutdown: Coherence skipping status check for this service since BackupCount < 1. Value is " + str(backup_count))
                return

            ha_status = mbs.getAttribute(objectName, "StatusHA")
            if ha_status is None:
                print("Shutdown: None returned for Coherence StatusHA")
                return

            print('Shutdown: Coherence StatusHA is ' + ha_status)
            if ha_status != "ENDANGERED":
                return

            # Coherence caches are ENDANGERED meaning that we may lose data
            print('Shutdown: Waiting until it is safe to shutdown Coherence server ...')
            systime.sleep(5)
        except:
            print('Shutdown: An exception occurred getting Coherence MBeans, staying in loop checking for safe')
            traceback.print_exc(file=sys.stdout)
            dumpStack()
            systime.sleep(30)
193+
194+
195+
#----------------------------------
196+
# Main script
197+
#----------------------------------
198+
print ("Shutdown: main script")
22199
domain_uid = getEnvVar('DOMAIN_UID')
23200
server_name = getEnvVar('SERVER_NAME')
24201
domain_name = getEnvVar('DOMAIN_NAME')
@@ -29,15 +206,14 @@ def getEnvVar(var):
29206
timeout = getEnvVar('SHUTDOWN_TIMEOUT_ARG')
30207
ignore_sessions = getEnvVar('SHUTDOWN_IGNORE_SESSIONS_ARG')
31208
shutdown_type = getEnvVar('SHUTDOWN_TYPE_ARG')
209+
admin_port = getEnvVar('ADMIN_PORT')
210+
admin_host = getEnvVar('AS_SERVICE_NAME')
32211

33212
force = 'false'
34213
if shutdown_type.lower() == 'forced':
35214
force = 'true'
36215

37-
connect_url = local_admin_protocol + '://' + service_name + ':' + local_admin_port
38-
39216
# Convert b64 encoded user key into binary
40-
41217
file = open('/weblogic-operator/introspector/userKeyNodeManager.secure', 'r')
42218
contents = file.read()
43219
file.close()
@@ -47,42 +223,55 @@ def getEnvVar(var):
47223
file.write(decoded)
48224
file.close()
49225

50-
def shutdownUsingNodeManager(domainName, domainDir):
51-
try:
52-
nmConnect(userConfigFile='/weblogic-operator/introspector/userConfigNodeManager.secure',
53-
host='127.0.0.1',port='5556',
54-
domainName=domainName,
55-
domainDir=domainDir,
56-
nmType='plain')
57-
except Exception, e:
58-
print e
59-
print('Failed to connect to the node manager')
60-
exit(exitcode=2)
226+
# check if Coherence cluster exists in this domain
227+
cohExists = doesCoherenceExist()
61228

62-
# Connect to the server and request that it shuts down
229+
# If Coherence exists then we need to connect to admin server, else local server
230+
if (cohExists):
231+
print ('Shutdown: Coherence cluster exists')
232+
connect_url = local_admin_protocol + '://' + admin_host + ':' + admin_port
233+
else:
234+
print ('Shutdown: Coherence cluster does not exist')
235+
connect_url = local_admin_protocol + '://' + service_name + ':' + local_admin_port
63236

64-
try:
65-
connect(userConfigFile='/weblogic-operator/introspector/userConfigNodeManager.secure',
237+
# Stay in loop until the server is shutdown if Coherence exists. For non-Coherence
238+
# just make a best effort
239+
stayInConnectLoop = True
240+
cohSafe = False
241+
while (stayInConnectLoop):
242+
try:
243+
stayInConnectLoop = False
244+
print ('Shutdown: Connecting to server at ' + connect_url)
245+
connect(userConfigFile='/weblogic-operator/introspector/userConfigNodeManager.secure',
66246
userKeyFile='/tmp/userKeyNodeManager.secure.bin',
67247
url=connect_url,
68248
domainName=domain_name,
69249
domainDir=domain_path,
70250
nmType='plain')
71251

72-
print('Connected to the server - attempting to issue shutdown command')
73-
except Exception, e:
74-
print e
75-
print('Failed to connect to the server; trying node manager')
76-
shutdownUsingNodeManager(domain_name, domain_path)
252+
print('Shutdown: Successfully connected to server. Calling server shutdown')
253+
254+
if (cohExists):
255+
waitUntilCoherenceSafe()
256+
cohSafe = True
77257

78-
# shutdown the server
258+
shutdown(server_name, 'Server', ignoreSessions=ignore_sessions, timeOut=int(timeout), block='true', force=force)
259+
print('Shutdown: Successfully shutdown the server')
260+
261+
except Exception, e:
262+
print e
263+
print('Shutdown: Exception in connect or shutdown')
264+
if (cohExists and not cohSafe):
265+
print('Shutdown: Coherence not safe to shutdown. Sleeping before connect retry ...')
266+
stayInConnectLoop = True
267+
systime.sleep(30)
268+
else:
269+
try:
270+
shutdownUsingNodeManager(domain_name, domain_path)
271+
exit()
272+
except:
273+
exit(2)
79274

80-
try:
81-
shutdown(server_name, 'Server', ignoreSessions=ignore_sessions, timeOut=int(timeout), block='true', force=force)
82-
except Exception, e:
83-
print e
84-
print('Connected to the server, but failed to stop it; trying node manager')
85-
shutdownUsingNodeManager(domain_name, domain_path)
86275

87276
# Exit WLST
88277
exit()

0 commit comments

Comments
 (0)