Add Coherence StatusHA check for server shutdown

pfmackin · pfmackin · commit d34be8f4e8c8 · 2019-06-28T15:40:36.000-04:00
diff --git a/operator/src/main/resources/scripts/stop-server.py b/operator/src/main/resources/scripts/stop-server.py
@@ -2,23 +2,200 @@
 # Licensed under the Universal Permissive License v 1.0 as shown at
 # http://oss.oracle.com/licenses/upl.
 
+# This is the preStop hook script (stopServer.sh) that is specified by the domain resource.
+# This script will be called by Kubernetes before it kills a pod.  Kubernetes will kill
+# the pod once this script returns OR when the grace period timeout expires
+# (see Operator shutdown.timeoutSeconds)
 #
-# This is a WLST script for starting WL server via the node manager.
+# This script shuts down the local server running in the container (admin or managed).
+# There are 2 main scenarios, a domain with a Coherence cluster and one without.
+# If there is no Coherence cluster, then the code will attempt to connect to the
+# local server and do a shutdown.  If that doesn't work it will attempt to shut down
+# using NodeManager.  If that fails it just exits and Kubernetes will kill the pod.
+# If a Coherence cluster exists, then we must wait for the Coherence services to be
+# safe before the server is shut down, or we risk losing data.  This requires
+# a connection to the admin server and inspection of the Coherence MBeans.  The code
+# will not shut down the server until Coherence is safe.  If Coherence never becomes safe,
+# then, eventually, the grace period will expire and Kubernetes will kill the pod.
 #
 # It users key and data files that were generated by the introspector
 # for its nmConnect credentials.
 #
 
 import sys
+import traceback
 import base64
+import time as systime
 
+# Return the value of env var `var`; if it is not set, print an error and exit with status 1
 def getEnvVar(var):
   val=os.environ.get(var)
   if val==None:
     print "ERROR: Env var ",var, " not set."
     sys.exit(1)
   return val
 
+
+# Connect to Node Manager and shut down the local server
+# (nmKill on the global server_name).  Any failure is logged and re-raised to the caller.
+def shutdownUsingNodeManager(domainName, domainDir):
+  try:
+    print('Shutdown: Attempting shutdown using NodeManager')
+    nmConnect(userConfigFile='/weblogic-operator/introspector/userConfigNodeManager.secure',
+              userKeyFile='/tmp/userKeyNodeManager.secure.bin',
+              host='127.0.0.1',port='5556',
+              domainName=domainName,
+              domainDir=domainDir,
+              nmType='plain')
+    print('Shutdown: Successfully connected to NodeManager')
+    nmKill(server_name)  # server_name is a module-level global, not a parameter
+    print('Shutdown: Successfully killed server using NodeManager')
+  except Exception, e:
+    traceback.print_exc(file=sys.stdout)
+    print('Shutdown: Failed to kill server using NodeManager')
+    raise  # propagate so the caller can react to the failure
+
+
+# Check if Coherence exists using WLST offline.  Returns True when it exists or when the domain cannot be read.
+def doesCoherenceExist():
+  try:
+    readDomain(domain_path)  # domain_path is a module-level name, not a parameter
+    exists = checkCoherenceClusterExist()
+  except:
+    print('Shutdown: Exception reading domain offline, assume Coherence exists')
+    return True  # fail safe: assume Coherence exists so the caller takes the cautious path
+  try:
+    closeDomain()
+  except:
+    pass  # best effort: a failed close does not change the result
+  return exists
+
+# Check if there is a CoherenceClusterSystemResource. This will indicate that the domain is
+# using Coherence.  Returns True only when the resource folder exists and its listing is non-empty.
+def checkCoherenceClusterExist():
+  try:
+    cd('/')
+    if  ls().find('CoherenceClusterSystemResource') == -1:  # text search of the root listing
+      return False
+    cd('CoherenceClusterSystemResource')
+    val = ls()  # listing of the folder; None or blank is treated as "no resource configured"
+    if (val is None) or len(val.strip()) == 0:
+      print('Shutdown: This domain does not have a CoherenceClusterSystemResource')
+      return False
+    else:
+      print('Shutdown: This domain has CoherenceClusterSystemResource ' + val)
+      return True
+  except:
+    # Exception will occur if CoherenceClusterSystemResource is missing
+    traceback.print_exc(file=sys.stdout)
+    dumpStack()
+    print('Shutdown: Exception getting CoherenceClusterSystemResource')
+    return False  # NOTE(review): any error here means "no Coherence", unlike doesCoherenceExist's fail-safe True - confirm intended
+
+
+# Coherence exists and we cannot connect to the admin server.  To be on
+# the safe side, loop forever until the Operator kills the pod otherwise
+# there is a risk of losing data
+def waitForeverBecauseCoherence():  # NOTE(review): no call site is visible in this patch - confirm it is used
+  print('Shutdown: Waiting until Operator kills the pod since this domain has Coherence')
+  while (True):
+    print('Shutdown: Sleeping 60 seconds ...')
+    systime.sleep(60)
+
+
+# If there is a Coherence cluster then we wait until it is safe to shutdown
+# all distributed services.  Each distributed cache (with backup count > 0) has data
+# that is both in a primary partition on one cluster node (i.e. WLS server) and
+# a backup partition on a different node.
+# During rolling restart, Coherence rebalances the cluster and moves both primary and
+# backup partitions across the nodes.  During this time the distributed services affected
+# are considered ENDANGERED.  For example, in a 3 node cluster, if node 1 was being restarted
+# and we shut down node 2 before Coherence was safe then we may lose data.
+# NOTE: The overall timeout used to control the termination of this container is
+# controlled by the Kubernetes graceful timeout value. So even
+# if this code looped forever, Kubernetes will kill the pod once the timeout expires.
+# The user must set that timeout to a large enough value to give Coherence time to get safe.
+def waitUntilCoherenceSafe():
+  print ('Shutdown: getting all service Coherence MBeans')
+
+  domainRuntime()
+  query='Coherence:type=Service,name=*,*'  # object-name pattern for the Coherence service MBeans
+
+  # Wait forever until we get positive ack that it is ok to shutdown this server.
+  done = False
+  while (not done):
+    try:
+      beans = list(mbs.queryMBeans(ObjectName(query), None))
+      print("Shutdown: Coherence service bean count " + str(len(beans)))
+
+      # Loop waiting for each service to be safe
+      for serviceBean in beans:
+        objectName = serviceBean.getObjectName()
+        waitUntilServiceSafeToShutdown(objectName)
+      done = True  # reached only when every service check returned without raising
+      print ('Shutdown: It is safe to shutdown Coherence')
+
+    except:
+      print ("Shutdown: Exception checking a service Coherence statusHA, retrying...")
+      traceback.print_exc(file=sys.stdout)
+      dumpStack()
+      systime.sleep(30)  # back off, then re-query the MBeans and re-check every service
+      pass
+
+
+# Wait until the specified Coherence service is safe to shutdown.
+# If the cluster is a single node cluster then the service will always
+# be ENDANGERED, therefore it is the responsibility of the user to
+# set Coherence backup count to 0, or to set the terminate grace period
+# to a low number since this method will just wait until the operator kills the
+# pod.
+def waitUntilServiceSafeToShutdown(objectName):
+
+  print ("Shutdown: checking Coherence service " + str(objectName) )
+
+  # NOTE: break loop when it is safe to shutdown else stay in loop forever
+  while (True):
+    try:
+      # If the BackupCount is > 0 then the user intention is to have
+      # HA with backed up partitions, otherwise if value is < 1 then the user
+      # doesn't care about HA so no need to wait.
+      # NOTE: if this is NOT a partitioned service we will get exception, ignore and exit loop
+      try:
+        val = mbs.getAttribute(objectName,"BackupCount")
+      except:
+        print ("Shutdown: Coherence BackupCount attribute missing")
+        val = None  # None takes the same "skip the status check" branch as BackupCount < 1
+        pass
+
+      if (val is None) or int(val) < 1:
+        print ("Shutdown: Coherence skipping status check for this service since BackupCount < 1.  Value is " + str(val))
+        break
+
+      status = mbs.getAttribute(objectName,"StatusHA")
+      if (status is None):
+        print ("Shutdown: None returned for Coherence StatusHA")
+        break
+
+      print ('Shutdown: Coherence StatusHA is ' + status)
+      if status != "ENDANGERED":  # any status other than ENDANGERED is treated as safe
+        break
+
+      # Coherence caches are ENDANGERED meaning that we may lose data
+      print ('Shutdown: Waiting until it is safe to shutdown Coherence server ...')
+      systime.sleep(5)
+
+    except:
+      print ('Shutdown: An exception occurred getting Coherence MBeans, staying in loop checking for safe')
+      traceback.print_exc(file=sys.stdout)
+      dumpStack()
+      systime.sleep(30)  # longer pause after an error than the 5 second poll above
+      pass
+
+
+#----------------------------------
+# Main script
+#----------------------------------
+print ("Shutdown: main script")
 domain_uid = getEnvVar('DOMAIN_UID')
 server_name = getEnvVar('SERVER_NAME')
 domain_name = getEnvVar('DOMAIN_NAME')
@@ -29,15 +206,14 @@ def getEnvVar(var):
 timeout = getEnvVar('SHUTDOWN_TIMEOUT_ARG')
 ignore_sessions = getEnvVar('SHUTDOWN_IGNORE_SESSIONS_ARG')
 shutdown_type = getEnvVar('SHUTDOWN_TYPE_ARG')
+admin_port = getEnvVar('ADMIN_PORT')
+admin_host = getEnvVar('AS_SERVICE_NAME')
 
 force = 'false'
 if shutdown_type.lower() == 'forced':
   force = 'true'
 
-connect_url = local_admin_protocol + '://' + service_name + ':' + local_admin_port
-
 # Convert b64 encoded user key into binary
-
 file = open('/weblogic-operator/introspector/userKeyNodeManager.secure', 'r')
 contents = file.read()
 file.close()
@@ -47,42 +223,55 @@ def getEnvVar(var):
 file.write(decoded)
 file.close()
 
-def shutdownUsingNodeManager(domainName, domainDir):
-  try:
-    nmConnect(userConfigFile='/weblogic-operator/introspector/userConfigNodeManager.secure',
-              host='127.0.0.1',port='5556',
-              domainName=domainName,
-              domainDir=domainDir,
-              nmType='plain')
-  except Exception, e:
-    print e
-    print('Failed to connect to the node manager')
-    exit(exitcode=2)
+# check if Coherence cluster exists in this domain
+cohExists = doesCoherenceExist()
 
-# Connect to the server and request that it shuts down
+# If Coherence exists then we need to connect to admin server, else local server
+if (cohExists):
+  print ('Shutdown: Coherence cluster exists')
+  connect_url = local_admin_protocol + '://' + admin_host + ':' + admin_port  # admin_host/admin_port come from AS_SERVICE_NAME/ADMIN_PORT
+else:
+  print ('Shutdown: Coherence cluster does not exist')
+  connect_url = local_admin_protocol + '://' + service_name + ':' + local_admin_port  # these names are set outside the visible hunks
 
-try:
-  connect(userConfigFile='/weblogic-operator/introspector/userConfigNodeManager.secure',
+# Stay in loop until the server is shut down if Coherence exists.  For non-Coherence
+# just make a best effort (one connect/shutdown attempt, then the NodeManager fallback)
+stayInConnectLoop = True
+cohSafe = False
+while (stayInConnectLoop):
+  try:
+    stayInConnectLoop = False  # assume a single pass; the except branch re-arms the loop for a retry
+    print ('Shutdown: Connecting to server at ' + connect_url)
+    connect(userConfigFile='/weblogic-operator/introspector/userConfigNodeManager.secure',
             userKeyFile='/tmp/userKeyNodeManager.secure.bin',
             url=connect_url,
             domainName=domain_name,
             domainDir=domain_path,
             nmType='plain')
 
-  print('Connected to the server - attempting to issue shutdown command')
-except Exception, e:
-  print e
-  print('Failed to connect to the server; trying node manager')
-  shutdownUsingNodeManager(domain_name, domain_path)
+    print('Shutdown: Successfully connected to server. Calling server shutdown')
+
+    if (cohExists):
+      waitUntilCoherenceSafe()
+      cohSafe = True  # from here on a failure falls back to NodeManager instead of retrying
 
-# shutdown the server
+    shutdown(server_name, 'Server', ignoreSessions=ignore_sessions, timeOut=int(timeout), block='true', force=force)
+    print('Shutdown: Successfully shutdown the server')
+
+  except Exception, e:
+    print e
+    print('Shutdown: Exception in connect or shutdown')
+    if (cohExists and not cohSafe):  # Coherence not yet confirmed safe: never kill, only retry
+      print('Shutdown: Coherence not safe to shutdown. Sleeping before connect retry ...')
+      stayInConnectLoop = True
+      systime.sleep(30)
+    else:
+      try:
+        shutdownUsingNodeManager(domain_name, domain_path)
+        exit()  # NOTE(review): if WLST exit() raises (e.g. SystemExit) the bare except below turns success into exit(2) - confirm
+      except:
+        exit(2)  # NodeManager kill failed as well
 
-try:
-  shutdown(server_name, 'Server', ignoreSessions=ignore_sessions, timeOut=int(timeout), block='true', force=force)
-except Exception, e:
-  print e
-  print('Connected to the server, but failed to stop it; trying node manager')
-  shutdownUsingNodeManager(domain_name, domain_path)
 
 # Exit WLST
 exit()