
Commit cf5f80a

Merge pull request #166 from oci-hpc/2.10.5_ds_prompt_for_resize
Add prompt for resize.sh when using remove or remove_unreachable
2 parents 5d3a98e + 2d8e3ba commit cf5f80a
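In short: `resize.sh remove` and `resize.sh remove_unreachable` now pause for an explicit confirmation before terminating nodes, and a new `--quiet` flag restores the old non-interactive behaviour. A minimal sketch of the two modes, reusing the `compute-1-hpc` cluster name from the README example further down:

```
# Prompts with the contents of bin/remove_nodes_prompt.txt and waits for 1 (Yes) or 2 (No)
/opt/oci-hpc/bin/resize.sh remove 3 --cluster_name compute-1-hpc

# Skips the prompt and the data-loss reminder
/opt/oci-hpc/bin/resize.sh remove 3 --cluster_name compute-1-hpc --quiet
```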

6 files changed: +166 -112 lines changed


README.md

Lines changed: 17 additions & 6 deletions
@@ -104,12 +104,12 @@ optional arguments:
   --ansible_crucial     If present during reconfiguration, only crucial
                         ansible playbooks will be executed on the live nodes.
                         Non live nodes will be removed
-  --remove_unreachable  If present, nodes that are not sshable will be removed
-                        from the config. They will however not be removed from
-                        Slurm to avoid losing track of the down nodes. If you
-                        need to remove them from Slurm after terminating the
-                        nodes in the console. Run sudo scontrol update
-                        nodename=name state=Future
+  --remove_unreachable  If present, nodes that are not sshable will be terminated
+                        before running the action that was requested
+                        (Example Adding a node)
+  --quiet               If present, the script will not prompt for a response when
+                        removing nodes and will not give a reminder to save data
+                        from nodes that are being removed
 ```
 
 **Add nodes**
@@ -161,6 +161,13 @@ Remove 3 nodes randomly from compute-1-hpc:
 ```
 /opt/oci-hpc/bin/resize.sh remove 3 --cluster_name compute-1-hpc
 
+```
+or
+Remove 3 nodes randomly from compute-1-hpc but do not prompt for a response when removing the nodes and do not give a reminder to save data
+from nodes that are being removed :
+```
+/opt/oci-hpc/bin/resize.sh remove 3 --cluster_name compute-1-hpc --quiet
+
 ```
 
 **Reconfigure nodes**
@@ -208,6 +215,10 @@ Uncomment the line in `crontab -e`:
 ```
 * * * * * /opt/oci-hpc/autoscaling/crontab/autoscale_slurm.sh >> /opt/oci-hpc/logs/crontab_slurm.log 2>&1
 ```
+And in /etc/ansible/hosts, below value should be true
+```
+autoscaling = true
+```
 
 # Submit
 How to submit jobs:
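The last hunk ties the autoscaling crontab entry to a flag in /etc/ansible/hosts. A quick way to check that flag on the controller before enabling the cron line (a sketch; the `autoscaling = true` format is taken from the hunk above):

```
# Verify that autoscaling is enabled before uncommenting the crontab entry
grep 'autoscaling =' /etc/ansible/hosts
# expected: autoscaling = true
```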

autoscaling/crontab/autoscale_slurm.sh

Lines changed: 120 additions & 104 deletions
@@ -91,8 +91,8 @@ def getIdleTime(node):
     return ( datetime.datetime.now() - right_time ).total_seconds()
 
 # Get the last time a node state was changed. This is used to get how long a cluster has been idle for
-def getQueueConf(file):
-    with open(queues_conf_file) as file:
+def getQueueConf(queue_file):
+    with open(queue_file) as file:
         try:
             data = yaml.load(file,Loader=yaml.FullLoader)
         except:
@@ -328,109 +328,125 @@ def getstatus_slurm():
             cluster_destroying.append(clusterName)
     return cluster_to_build,cluster_to_destroy,nodes_to_destroy,cluster_building,cluster_destroying,used_index,current_nodes,building_nodes
 
-if os.path.isfile(lockfile):
-    print( "Lockfile "+lockfile + " is present, exiting" )
-    exit()
-open(lockfile,'w').close()
-try:
-    path = os.path.dirname(os.path.dirname(os.path.realpath(sys.argv[0])))
-    clusters_path = os.path.join(path,'clusters')
-    config = getQueueConf(queues_conf_file)
-
-    cluster_to_build,cluster_to_destroy,nodes_to_destroy,cluster_building,cluster_destroying,used_index,current_nodes,building_nodes=getstatus_slurm()
-
-    print (time.strftime("%Y-%m-%d %H:%M:%S"))
-    print (cluster_to_build,'cluster_to_build')
-    print (cluster_to_destroy,'cluster_to_destroy')
-    print (nodes_to_destroy,'nodes_to_destroy')
-    print (cluster_building,'cluster_building')
-    print (cluster_destroying,'cluster_destroying')
-    print (current_nodes,'current_nodes')
-    print (building_nodes,'building_nodes')
-
-    for i in cluster_building:
-        for j in cluster_to_build:
-            if i[0]==j[0] and i[1]==j[1] and i[2]==j[2]:
-                cluster_to_build.remove(j)
-                break
-    for cluster in cluster_to_destroy:
-        cluster_name=cluster[0]
-        print ("Deleting cluster "+cluster_name)
-        subprocess.Popen([script_path+'/delete_cluster.sh',cluster_name])
-        time.sleep(5)
-
-    for cluster_name in nodes_to_destroy.keys():
-        print ("Resizing cluster "+cluster_name)
-        initial_nodes=[]
-        unreachable_nodes=[]
-        if cluster_name == "NOCLUSTERFOUND":
-            subprocess.Popen([script_path+'/resize.sh','remove_unreachable','--nodes']+nodes_to_destroy[cluster_name])
-            continue
-        for node in nodes_to_destroy[cluster_name]:
+def getAutoscaling():
+    out = subprocess.Popen(["cat /etc/ansible/hosts | grep 'autoscaling =' | awk -F '= ' '{print $2}'"],stdout=subprocess.PIPE, stderr=subprocess.STDOUT,shell=True,universal_newlines=True)
+    stdout,stderr = out.communicate()
+    output = stdout.split("\n")
+    autoscaling_value=False
+    for i in range(0,len(output)-1):
+        autoscaling_value=output[i]
+    return autoscaling_value
+
+autoscaling = getAutoscaling()
+
+if autoscaling == "true":
+
+    if os.path.isfile(lockfile):
+        print( "Lockfile "+lockfile + " is present, exiting" )
+        exit()
+    open(lockfile,'w').close()
+    try:
+        path = os.path.dirname(os.path.dirname(os.path.realpath(sys.argv[0])))
+        clusters_path = os.path.join(path,'clusters')
+        config = getQueueConf(queues_conf_file)
+
+        cluster_to_build,cluster_to_destroy,nodes_to_destroy,cluster_building,cluster_destroying,used_index,current_nodes,building_nodes=getstatus_slurm()
+
+        print (time.strftime("%Y-%m-%d %H:%M:%S"))
+        print (cluster_to_build,'cluster_to_build')
+        print (cluster_to_destroy,'cluster_to_destroy')
+        print (nodes_to_destroy,'nodes_to_destroy')
+        print (cluster_building,'cluster_building')
+        print (cluster_destroying,'cluster_destroying')
+        print (current_nodes,'current_nodes')
+        print (building_nodes,'building_nodes')
+
+        for i in cluster_building:
+            for j in cluster_to_build:
+                if i[0]==j[0] and i[1]==j[1] and i[2]==j[2]:
+                    cluster_to_build.remove(j)
+                    break
+        for cluster in cluster_to_destroy:
+            cluster_name=cluster[0]
+            print ("Deleting cluster "+cluster_name)
+            subprocess.Popen([script_path+'/delete_cluster.sh',cluster_name])
+            time.sleep(5)
+
+        for cluster_name in nodes_to_destroy.keys():
+            print ("Resizing cluster "+cluster_name)
+            initial_nodes=[]
+            unreachable_nodes=[]
+            if cluster_name == "NOCLUSTERFOUND":
+                subprocess.Popen([script_path+'/resize.sh','remove_unreachable','--nodes']+nodes_to_destroy[cluster_name]+['--quiet'])
+                continue
+            for node in nodes_to_destroy[cluster_name]:
+                try:
+                    alt_names=subprocess.check_output(["cat /etc/hosts | grep "+node],shell=True,universal_newlines=True)
+                    for alt_name in alt_names.split("\n")[0].split():
+                        if alt_name.startswith('inst-'):
+                            initial_nodes.append(alt_name)
+                            break
+                except:
+                    unreachable_nodes.append(node)
+            if len(initial_nodes) > 0:
+                subprocess.Popen([script_path+'/resize.sh','--force','--cluster_name',cluster_name,'remove','--remove_unreachable','--nodes']+initial_nodes+['--quiet'])
+            if len(unreachable_nodes) > 0:
+                subprocess.Popen([script_path+'/resize.sh','--cluster_name',cluster_name,'remove_unreachable','--nodes']+unreachable_nodes+['--quiet'])
+            time.sleep(1)
+
+        for index,cluster in enumerate(cluster_to_build):
+            nodes=cluster[0]
+            instance_type = cluster[1]
+            queue=cluster[2]
+            jobID=str(cluster[3])
+            user=str(cluster[4])
+            jobconfig=getJobConfig(config,queue,instance_type)
+            limits=getQueueLimits(config,queue,instance_type)
             try:
-                alt_names=subprocess.check_output(["cat /etc/hosts | grep "+node],shell=True,universal_newlines=True)
-                for alt_name in alt_names.split("\n")[0].split():
-                    if alt_name.startswith('inst-'):
-                        initial_nodes.append(alt_name)
-                        break
+                clusterCount=len(used_index[queue][instance_type])
             except:
-                unreachable_nodes.append(node)
-        if len(initial_nodes) > 0:
-            subprocess.Popen([script_path+'/resize.sh','--force','--cluster_name',cluster_name,'remove','--remove_unreachable','--nodes']+initial_nodes)
-        if len(unreachable_nodes) > 0:
-            subprocess.Popen([script_path+'/resize.sh','--cluster_name',cluster_name,'remove_unreachable','--nodes']+unreachable_nodes)
-        time.sleep(1)
-
-    for index,cluster in enumerate(cluster_to_build):
-        nodes=cluster[0]
-        instance_type = cluster[1]
-        queue=cluster[2]
-        jobID=str(cluster[3])
-        user=str(cluster[4])
-        jobconfig=getJobConfig(config,queue,instance_type)
-        limits=getQueueLimits(config,queue,instance_type)
-        try:
-            clusterCount=len(used_index[queue][instance_type])
-        except:
-            clusterCount=0
-        if clusterCount>=limits["max_cluster_count"]:
-            print ("This would go over the number of running clusters, you have reached the max number of clusters")
-            continue
-        nextIndex=None
-        if clusterCount==0:
-            if queue in used_index.keys():
-                used_index[queue][instance_type]=[1]
+                clusterCount=0
+            if clusterCount>=limits["max_cluster_count"]:
+                print ("This would go over the number of running clusters, you have reached the max number of clusters")
+                continue
+            nextIndex=None
+            if clusterCount==0:
+                if queue in used_index.keys():
+                    used_index[queue][instance_type]=[1]
+                else:
+                    used_index[queue]={instance_type:[1]}
+                nextIndex=1
             else:
-                used_index[queue]={instance_type:[1]}
-            nextIndex=1
-        else:
-            for i in range(1,10000):
-                if not i in used_index[queue][instance_type]:
-                    nextIndex=i
-                    used_index[queue][instance_type].append(i)
-                    break
-        clusterName=queue+'-'+str(nextIndex)+'-'+jobconfig["instance_keyword"]
-        if not queue in current_nodes.keys():
-            current_nodes[queue]={instance_type:0}
-        else:
-            if not instance_type in current_nodes[queue].keys():
-                current_nodes[queue][instance_type]=0
-        if not queue in building_nodes.keys():
-            building_nodes[queue]={instance_type:0}
-        else:
-            if not instance_type in building_nodes[queue].keys():
-                building_nodes[queue][instance_type]=0
-        if nodes > limits["max_cluster_size"]:
-            print ("Cluster "+clusterName+" won't be created, it would go over the total number of nodes per cluster limit")
-        elif current_nodes[queue][instance_type] + building_nodes[queue][instance_type] + nodes > limits["max_number_nodes"]:
-            print ("Cluster "+clusterName+" won't be created, it would go over the total number of nodes limit")
-        else:
-            current_nodes[queue][instance_type]+=nodes
-            clusterCount+=1
-            print ("Creating cluster "+clusterName+" with "+str(nodes)+" nodes")
-            subprocess.Popen([script_path+'/create_cluster.sh',str(nodes),clusterName,instance_type,queue,jobID,user])
-            time.sleep(5)
+                for i in range(1,10000):
+                    if not i in used_index[queue][instance_type]:
+                        nextIndex=i
+                        used_index[queue][instance_type].append(i)
+                        break
+            clusterName=queue+'-'+str(nextIndex)+'-'+jobconfig["instance_keyword"]
+            if not queue in current_nodes.keys():
+                current_nodes[queue]={instance_type:0}
+            else:
+                if not instance_type in current_nodes[queue].keys():
+                    current_nodes[queue][instance_type]=0
+            if not queue in building_nodes.keys():
+                building_nodes[queue]={instance_type:0}
+            else:
+                if not instance_type in building_nodes[queue].keys():
+                    building_nodes[queue][instance_type]=0
+            if nodes > limits["max_cluster_size"]:
+                print ("Cluster "+clusterName+" won't be created, it would go over the total number of nodes per cluster limit")
+            elif current_nodes[queue][instance_type] + building_nodes[queue][instance_type] + nodes > limits["max_number_nodes"]:
+                print ("Cluster "+clusterName+" won't be created, it would go over the total number of nodes limit")
+            else:
+                current_nodes[queue][instance_type]+=nodes
+                clusterCount+=1
+                print ("Creating cluster "+clusterName+" with "+str(nodes)+" nodes")
+                subprocess.Popen([script_path+'/create_cluster.sh',str(nodes),clusterName,instance_type,queue,jobID,user])
+                time.sleep(5)
 
-except Exception:
-    traceback.print_exc()
-os.remove(lockfile)
+    except Exception:
+        traceback.print_exc()
+    os.remove(lockfile)
+else:
+    print("Autoscaling is false")
+    exit()
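The new `getAutoscaling()` helper shells out to read that flag, and the whole main block is skipped when it is not `true`. The equivalent check from a shell, handy when debugging why the cron job only prints "Autoscaling is false" (a sketch mirroring the grep/awk pipeline the script runs via subprocess):

```
# Same pipeline getAutoscaling() executes
autoscaling=$(cat /etc/ansible/hosts | grep 'autoscaling =' | awk -F '= ' '{print $2}')
if [ "$autoscaling" != "true" ]
then
  echo "Autoscaling is false"
  exit 0
fi
echo "Autoscaling is enabled, autoscale_slurm.sh will run its normal resize logic"
```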

bin/delete_cluster.sh

Lines changed: 1 addition & 1 deletion
@@ -103,7 +103,7 @@ else
   for node in `scontrol show hostname $nodes 2>&1`
   do
     echo "Cleaning up node " $node
-    /opt/oci-hpc/bin/resize.sh remove_unreachable --nodes $node
+    /opt/oci-hpc/bin/resize.sh remove_unreachable --nodes $node --quiet
   done
 fi
 cd
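The cleanup loop depends on `scontrol show hostname` expanding a Slurm hostlist expression into individual node names, one per line; for example (hypothetical node names):

```
# Expand a hostlist the same way the loop above does
scontrol show hostname compute-1-hpc-node-[1-3]
# compute-1-hpc-node-1
# compute-1-hpc-node-2
# compute-1-hpc-node-3
```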

bin/remove_nodes_prompt.txt

Lines changed: 4 additions & 0 deletions
@@ -0,0 +1,4 @@
+Does your cluster run any file system like Ceph, NFS, etc. on the GPU/HPC nodes itself using local NVMe SSDs?
+If yes, terminating nodes which store your data can result in permanent data loss, so before proceeding make sure any important data is copied to a persistent file system outside of the cluster such as to object storage, file storage, etc.
+Once data is backed up or migrated, come back and run the script. Select 2 to exit.
+Remember, once the nodes are terminated, all the data is lost forever and you won't be able to recover it.
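The prompt asks operators to copy anything stored on node-local NVMe to persistent storage first. One possible way to do that, assuming the OCI CLI is configured and an Object Storage bucket named `cluster-backup` already exists (the bucket name and source path are placeholders):

```
# Copy a node-local scratch directory to Object Storage before terminating the node
oci os object bulk-upload --bucket-name cluster-backup --src-dir /mnt/localdisk/project-data
```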

bin/resize.py

Lines changed: 1 addition & 0 deletions
@@ -577,6 +577,7 @@ def getLaunchInstanceDetails(instance,comp_ocid,cn_ocid,max_previous_index,index
 parser.add_argument('--force', help='If present. Nodes will be removed even if the destroy playbook failed',action='store_true',default=False)
 parser.add_argument('--ansible_crucial', help='If present during reconfiguration, only crucial ansible playbooks will be executed on the live nodes. Non live nodes will be removed',action='store_true',default=False)
 parser.add_argument('--remove_unreachable', help='If present, nodes that are not sshable will be terminated before running the action that was requested (Example Adding a node) ',action='store_true',default=False)
+parser.add_argument('--quiet', help='If present, the script will not prompt for a response when removing nodes and will not give a reminder to save data from nodes that are being removed ',action='store_true',default=False)
 
 args = parser.parse_args()
 
bin/resize.sh

Lines changed: 23 additions & 1 deletion
@@ -15,6 +15,12 @@ then
   exit
 fi
 
+if [ $USER != "ubuntu" ] && [ $USER != "opc" ]
+then
+  echo "Run this script as opc or ubuntu"
+  exit
+fi
+
 if [ $# -eq 0 ]
 then
   python3 $folder/resize.py --help
@@ -26,6 +32,7 @@ permanent=1
 controllerName=`hostname`
 cluster_name=${controllerName/-controller/}
 nodes=NULL
+quietMode=False
 for (( i=1; i<=$#; i++)); do
   if [ ${!i} == "--cluster_name" ]
   then
@@ -48,9 +55,24 @@ for (( i=1; i<=$#; i++)); do
   then
     j=$((i+1))
    nodes=${@:j}
+  elif [ ${!i} == "--quiet" ]
+  then
+    quietMode=True
   fi
 done
 
+if [ $resize_type == "remove" ] || [ $resize_type == "remove_unreachable" ] && [ $quietMode == "False" ]
+then
+  echo "$(cat $folder/remove_nodes_prompt.txt)"
+  echo "Do you confirm you have done all of the above steps and wish to proceed for the termination of the nodes? Enter 1 for Yes and 2 for No (to exit)."
+  select yn in "Yes" "No"; do
+    case $yn in
+      Yes ) break;;
+      No ) exit;;
+    esac
+  done
+fi
+
 if [ $resize_type != "default" ]
 then
   if [ $permanent -eq 0 ]
@@ -148,5 +170,5 @@ then
     rm currently_resizing
   fi
 else
-  python3 $folder/resize.py ${@}
+  python3 $folder/resize.py ${@} &
 fi
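With the guard in place, an interactive removal looks roughly like this (a sketch: the menu numbering and the `#?` prompt come from bash's `select` builtin, the text from bin/remove_nodes_prompt.txt):

```
$ /opt/oci-hpc/bin/resize.sh remove 3 --cluster_name compute-1-hpc
Does your cluster run any file system like Ceph, NFS, etc. on the GPU/HPC nodes itself using local NVMe SSDs?
...
Do you confirm you have done all of the above steps and wish to proceed for the termination of the nodes? Enter 1 for Yes and 2 for No (to exit).
1) Yes
2) No
#? 1
```

Passing `--quiet` skips the prompt entirely, which is what delete_cluster.sh and the autoscaler now do when they call resize.sh.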
