2
2
# Licensed under the Universal Permissive License v 1.0 as shown at
3
3
# http://oss.oracle.com/licenses/upl.
4
4
5
+ # This is the preStop hook script (stopServer.sh) that is specified by the domain resource.
6
+ # This script will be called by Kubernetes before it kills a pod. Kubernetes will kill
7
+ # the pod once this script returns OR when the grace period timeout expires
8
+ # (see Operator shutdown.timeoutSeconds)
5
9
#
6
- # This is a WLST script for starting WL server via the node manager.
10
+ # This script shuts down the local server running in the container (admin or managed).
11
+ # There are 2 main scenarios, a domain with a Coherence cluster and one without.
12
+ # If there is no Coherence cluster, then the code will attempt to connect to the
13
+ # local server and do a shutdown. If that doesn't work it will attempt to shutdown
14
+ # using NodeManager. If that fails it just exits and kubernetes will kill the pod.
15
+ # If a Coherence cluster exists, then we must wait for the Coherence services to be
16
+ # safe before the server is shutdown, or we risk losing data. This requires a
17
+ # connection to the admin server and inspection of the Coherence MBeans. The code
18
+ # will not shutdown the server until Coherence is safe. If Coherence never becomes safe,
19
+ # then, eventually, the grace period will expire and Kubernetes will kill the pod.
7
20
#
8
21
# It uses key and data files that were generated by the introspector
9
22
# for its nmConnect credentials.
10
23
#
11
24
12
25
import sys
26
+ import traceback
13
27
import base64
28
+ import time as systime
14
29
30
+ # Get an ENV var
15
31
def getEnvVar(var):
    # Return the value of the environment variable named by `var`.
    # If the variable is not set, print an error and terminate the script
    # with exit status 1 (sys.exit), so callers never receive None.
    #
    # `os` is not part of this file's visible import block, so it is imported
    # locally to keep the function self-contained.
    import os

    val = os.environ.get(var)
    if val is None:
        # A single pre-formatted string prints the same text whether print is
        # a statement (Jython / Python 2) or a function. The doubled spaces
        # reproduce what the original comma-separated print statement emitted.
        print("ERROR: Env var  %s  not set." % (var,))
        sys.exit(1)
    return val
21
37
38
+
39
+ # Connect to Node Manager and shut down the local server
40
+ #
41
def shutdownUsingNodeManager(domainName, domainDir):
    # Fallback shutdown path: connect to the NodeManager at 127.0.0.1:5556
    # (plain protocol) with the introspector-generated credential files, then
    # ask it to kill the server named by the module-level `server_name`.
    #
    # domainName / domainDir are passed straight through to nmConnect.
    # On any failure the stack trace is written to stdout, a failure message
    # is logged, and the exception is re-raised for the caller to handle.
    nmConfigFile = '/weblogic-operator/introspector/userConfigNodeManager.secure'
    nmKeyFile = '/tmp/userKeyNodeManager.secure.bin'
    try:
        print('Shutdown: Attempting shutdown using NodeManager')
        nmConnect(userConfigFile=nmConfigFile,
                  userKeyFile=nmKeyFile,
                  host='127.0.0.1', port='5556',
                  domainName=domainName,
                  domainDir=domainDir,
                  nmType='plain')
        print('Shutdown: Successfully connected to NodeManager')
        nmKill(server_name)
        print('Shutdown: Successfully killed server using NodeManager')
    except Exception:
        traceback.print_exc(file=sys.stdout)
        print('Shutdown: Failed to kill server using NodeManager')
        raise
57
+
58
+
59
+ # Check if Coherence exists using wlst offline
60
def doesCoherenceExist():
    # Read the domain at `domain_path` offline and report whether it defines
    # a Coherence cluster (see checkCoherenceClusterExist).
    #
    # Returns True when the domain cannot be read: assuming Coherence exists
    # is the conservative answer, because the caller then waits for Coherence
    # to be safe instead of shutting down immediately.
    try:
        readDomain(domain_path)
        cohFound = checkCoherenceClusterExist()
    except:
        # NOTE(review): bare except kept as in the original; presumably it is
        # meant to also cover Java-side WLST errors -- confirm before narrowing.
        print('Shutdown: Exception reading domain offline, assume Coherence exists')
        return True
    else:
        # Closing the offline domain is best effort; a failure here does not
        # change the answer.
        try:
            closeDomain()
        except:
            pass
        return cohFound
72
+
73
+ # Check if there is a CoherenceClusterSystemResource. This will indicate that the domain is
74
+ # using Coherence
75
def checkCoherenceClusterExist():
    # Return True when the currently opened domain contains at least one
    # CoherenceClusterSystemResource entry, False otherwise.
    # Any exception while navigating the domain tree is logged (Python stack
    # trace plus WLST dumpStack) and treated as "no Coherence cluster".
    resourceType = 'CoherenceClusterSystemResource'
    try:
        cd('/')
        rootListing = ls()
        # str.find is used rather than the `in` operator, as in the original.
        if rootListing.find(resourceType) < 0:
            return False

        cd(resourceType)
        resources = ls()
        if resources is not None and len(resources.strip()) > 0:
            print('Shutdown: This domain has CoherenceClusterSystemResource ' + resources)
            return True

        print('Shutdown: This domain does not have a CoherenceClusterSystemResource')
        return False
    except:
        # Exception will occur if CoherenceClusterSystemResource is missing
        traceback.print_exc(file=sys.stdout)
        dumpStack()
        print('Shutdown: Exception getting CoherenceClusterSystemResource')
        return False
94
+
95
+
96
+ # Coherence exists and we cannot connect to the admin server. To be on
97
+ # the safe side, loop forever until the Operator kills the pod otherwise
98
+ # there is a risk of losing data
99
def waitForeverBecauseCoherence():
    # Never returns: logs once, then sleeps in 60 second intervals until the
    # process is terminated from outside (the pod being killed).
    sleepSeconds = 60
    print('Shutdown: Waiting until Operator kills the pod since this domain has Coherence')
    while True:
        print('Shutdown: Sleeping 60 seconds ...')
        systime.sleep(sleepSeconds)
104
+
105
+
106
+ # If there is a Coherence cluster then we wait until it is safe to shutdown
107
+ # all distributed services. Each distributed cache (with backup count > 0) has data
108
+ # that is both in a primary partition on one cluster node (i.e. WLS server) and
109
+ # a backup partition on a different node.
110
+ # During rolling restart, Coherence rebalances the cluster and moves both primary and
111
+ # backup partitions across the nodes. During this time the distributed services affected
112
+ # are considered ENDANGERED. For example, in a 3 node cluster, if node 1 was being restarted
113
+ # and we shut down node 2 before Coherence was safe then we may lose data.
114
+ # NOTE: The overall timeout used to control the termination of this container is
115
+ # controlled by the Kubernetes graceful timeout value. So even
116
+ # if this code looped forever, Kubernetes will kill the pod once the timeout expires.
117
+ # The user must set that timeout to a large enough value to give Coherence time to get safe.
118
def waitUntilCoherenceSafe():
    # Block until every Coherence service MBean visible through the domain
    # runtime reports that it is safe to shut down
    # (see waitUntilServiceSafeToShutdown).
    #
    # There is no timeout here: on any exception the whole query is retried
    # after 30 seconds, and the function only returns after one complete pass
    # over all service MBeans succeeded.
    print('Shutdown: getting all service Coherence MBeans')

    domainRuntime()
    serviceQuery = 'Coherence:type=Service,name=*,*'

    # Wait forever until we get positive ack that it is ok to shutdown this server.
    allServicesSafe = False
    while not allServicesSafe:
        try:
            serviceBeans = list(mbs.queryMBeans(ObjectName(serviceQuery), None))
            print("Shutdown: Coherence service bean count " + str(len(serviceBeans)))

            # Each call blocks until that particular service is safe.
            for bean in serviceBeans:
                waitUntilServiceSafeToShutdown(bean.getObjectName())

            allServicesSafe = True
            print('Shutdown: It is safe to shutdown Coherence')
        except:
            print("Shutdown: Exception checking a service Coherence statusHA, retrying...")
            traceback.print_exc(file=sys.stdout)
            dumpStack()
            systime.sleep(30)
144
+
145
+
146
+ # Wait until the specified Coherence service is safe to shutdown.
147
+ # If the cluster is a single node cluster then the service will always
148
+ # be ENDANGERED, therefore it is the responsibility of the user to
149
+ # set Coherence backup count to 0, or to set the terminate grace period
150
+ # to a low number since this method will just wait until the operator kills the
151
+ # pod.
152
def waitUntilServiceSafeToShutdown(objectName):
    # Block until the Coherence service MBean `objectName` is safe to shut
    # down. Returns (None) as soon as one of these holds:
    #   - the BackupCount attribute is missing or < 1 (no HA requested),
    #   - StatusHA is None,
    #   - StatusHA is anything other than "ENDANGERED".
    # While StatusHA is "ENDANGERED" it polls every 5 seconds; after an
    # unexpected exception it logs and retries after 30 seconds. There is no
    # timeout.
    print("Shutdown: checking Coherence service " + str(objectName))

    while True:
        try:
            # A BackupCount > 0 means the user wants HA with backed up
            # partitions; a value < 1 means HA does not matter, so no waiting.
            # A service that is NOT partitioned raises here, which is treated
            # the same way.
            # NOTE(review): this inner handler catches every exception, so a
            # lost connection would presumably also be reported as "attribute
            # missing" and end the wait -- confirm whether that is intended.
            try:
                backupCount = mbs.getAttribute(objectName, "BackupCount")
            except:
                print("Shutdown: Coherence BackupCount attribute missing")
                backupCount = None

            if backupCount is None or int(backupCount) < 1:
                print("Shutdown: Coherence skipping status check for this service since BackupCount < 1. Value is " + str(backupCount))
                return

            statusHA = mbs.getAttribute(objectName, "StatusHA")
            if statusHA is None:
                print("Shutdown: None returned for Coherence StatusHA")
                return

            print('Shutdown: Coherence StatusHA is ' + statusHA)
            if statusHA != "ENDANGERED":
                return

            # Coherence caches are ENDANGERED meaning that we may lose data
            print('Shutdown: Waiting until it is safe to shutdown Coherence server ...')
            systime.sleep(5)
        except:
            print('Shutdown: An exception occurred getting Coherence MBeans, staying in loop checking for safe')
            traceback.print_exc(file=sys.stdout)
            dumpStack()
            systime.sleep(30)
193
+
194
+
195
+ #----------------------------------
196
+ # Main script
197
+ #----------------------------------
198
+ print ("Shutdown: main script" )
22
199
domain_uid = getEnvVar ('DOMAIN_UID' )
23
200
server_name = getEnvVar ('SERVER_NAME' )
24
201
domain_name = getEnvVar ('DOMAIN_NAME' )
@@ -29,15 +206,14 @@ def getEnvVar(var):
29
206
timeout = getEnvVar ('SHUTDOWN_TIMEOUT_ARG' )
30
207
ignore_sessions = getEnvVar ('SHUTDOWN_IGNORE_SESSIONS_ARG' )
31
208
shutdown_type = getEnvVar ('SHUTDOWN_TYPE_ARG' )
209
+ admin_port = getEnvVar ('ADMIN_PORT' )
210
+ admin_host = getEnvVar ('AS_SERVICE_NAME' )
32
211
33
212
force = 'false'
34
213
if shutdown_type .lower () == 'forced' :
35
214
force = 'true'
36
215
37
- connect_url = local_admin_protocol + '://' + service_name + ':' + local_admin_port
38
-
39
216
# Convert b64 encoded user key into binary
40
-
41
217
file = open ('/weblogic-operator/introspector/userKeyNodeManager.secure' , 'r' )
42
218
contents = file .read ()
43
219
file .close ()
@@ -47,42 +223,55 @@ def getEnvVar(var):
47
223
file .write (decoded )
48
224
file .close ()
49
225
50
- def shutdownUsingNodeManager (domainName , domainDir ):
51
- try :
52
- nmConnect (userConfigFile = '/weblogic-operator/introspector/userConfigNodeManager.secure' ,
53
- host = '127.0.0.1' ,port = '5556' ,
54
- domainName = domainName ,
55
- domainDir = domainDir ,
56
- nmType = 'plain' )
57
- except Exception , e :
58
- print e
59
- print ('Failed to connect to the node manager' )
60
- exit (exitcode = 2 )
226
# Check (using WLST offline) whether a Coherence cluster exists in this domain.
cohExists = doesCoherenceExist()

# With Coherence the script connects to the admin server, because
# waitUntilCoherenceSafe() inspects the Coherence MBeans through the domain
# runtime; without Coherence it connects to the local server.
if cohExists:
    print('Shutdown: Coherence cluster exists')
    connect_url = local_admin_protocol + '://' + admin_host + ':' + admin_port
else:
    print('Shutdown: Coherence cluster does not exist')
    connect_url = local_admin_protocol + '://' + service_name + ':' + local_admin_port

# Stay in the loop until the server is shut down if Coherence exists.
# For non-Coherence domains just make a best effort: one connect/shutdown
# attempt, then fall back to NodeManager.
stayInConnectLoop = True
cohSafe = False
while stayInConnectLoop:
    try:
        # Assume this attempt is the last one; only the Coherence retry
        # branch below re-arms the loop.
        stayInConnectLoop = False
        print('Shutdown: Connecting to server at ' + connect_url)
        connect(userConfigFile='/weblogic-operator/introspector/userConfigNodeManager.secure',
                userKeyFile='/tmp/userKeyNodeManager.secure.bin',
                url=connect_url,
                domainName=domain_name,
                domainDir=domain_path,
                nmType='plain')

        print('Shutdown: Successfully connected to server. Calling server shutdown')

        if cohExists:
            # Blocks until every Coherence service reports it is safe.
            waitUntilCoherenceSafe()
            cohSafe = True

        shutdown(server_name, 'Server', ignoreSessions=ignore_sessions, timeOut=int(timeout), block='true', force=force)
        print('Shutdown: Successfully shutdown the server')

    except Exception:
        # sys.exc_info() prints the same text as `print e` without needing
        # the Python-2-only `except Exception, e` binding.
        print(sys.exc_info()[1])
        print('Shutdown: Exception in connect or shutdown')
        if cohExists and not cohSafe:
            # Never fall back to killing the server while Coherence may
            # still be ENDANGERED; retry the connection instead.
            print('Shutdown: Coherence not safe to shutdown. Sleeping before connect retry ...')
            stayInConnectLoop = True
            systime.sleep(30)
        else:
            try:
                shutdownUsingNodeManager(domain_name, domain_path)
            except:
                # WLST's exit() takes defaultAnswer as its first positional
                # parameter, so the exit status must be passed by keyword
                # (a bare exit(2) would not set the exit code).
                exit(exitcode=2)
            else:
                # Kept outside the try so the bare except above only covers
                # the NodeManager call itself.
                exit()
79
274
80
- try :
81
- shutdown (server_name , 'Server' , ignoreSessions = ignore_sessions , timeOut = int (timeout ), block = 'true' , force = force )
82
- except Exception , e :
83
- print e
84
- print ('Connected to the server, but failed to stop it; trying node manager' )
85
- shutdownUsingNodeManager (domain_name , domain_path )
86
275
87
276
# Exit WLST
88
277
exit ()
0 commit comments