
Commit cf5f80a

Merge pull request #166 from oci-hpc/2.10.5_ds_prompt_for_resize
Add prompt for resize.sh when using remove or remove_unreachable
2 parents 5d3a98e + 2d8e3ba commit cf5f80a
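In short: `resize.sh remove` and `resize.sh remove_unreachable` now pause for an explicit confirmation before terminating nodes, and a new `--quiet` flag restores the old non-interactive behaviour. A minimal sketch of the two modes, reusing the `compute-1-hpc` cluster name from the README example further down:

```
# Prompts with the contents of bin/remove_nodes_prompt.txt and waits for 1 (Yes) or 2 (No)
/opt/oci-hpc/bin/resize.sh remove 3 --cluster_name compute-1-hpc

# Skips the prompt and the data-loss reminder
/opt/oci-hpc/bin/resize.sh remove 3 --cluster_name compute-1-hpc --quiet
```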

6 files changed: +166 -112 lines changed


README.md

Lines changed: 17 additions & 6 deletions
@@ -104,12 +104,12 @@ optional arguments:
   --ansible_crucial     If present during reconfiguration, only crucial
                         ansible playbooks will be executed on the live nodes.
                         Non live nodes will be removed
-  --remove_unreachable  If present, nodes that are not sshable will be removed
-                        from the config. They will however not be removed from
-                        Slurm to avoid losing track of the down nodes. If you
-                        need to remove them from Slurm after terminating the
-                        nodes in the console. Run sudo scontrol update
-                        nodename=name state=Future
+  --remove_unreachable  If present, nodes that are not sshable will be terminated
+                        before running the action that was requested
+                        (Example Adding a node)
+  --quiet               If present, the script will not prompt for a response when
+                        removing nodes and will not give a reminder to save data
+                        from nodes that are being removed
 ```
 
 **Add nodes**
@@ -161,6 +161,13 @@ Remove 3 nodes randomly from compute-1-hpc:
 ```
 /opt/oci-hpc/bin/resize.sh remove 3 --cluster_name compute-1-hpc
 
+```
+or
+Remove 3 nodes randomly from compute-1-hpc but do not prompt for a response when removing the nodes and do not give a reminder to save data
+from nodes that are being removed :
+```
+/opt/oci-hpc/bin/resize.sh remove 3 --cluster_name compute-1-hpc --quiet
+
 ```
 
 **Reconfigure nodes**
@@ -208,6 +215,10 @@ Uncomment the line in `crontab -e`:
 ```
 * * * * * /opt/oci-hpc/autoscaling/crontab/autoscale_slurm.sh >> /opt/oci-hpc/logs/crontab_slurm.log 2>&1
 ```
+And in /etc/ansible/hosts, below value should be true
+```
+autoscaling = true
+```
 
 # Submit
 How to submit jobs:
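The last hunk ties the autoscaling crontab entry to a flag in /etc/ansible/hosts. A quick way to check that flag on the controller before enabling the cron line (a sketch; the `autoscaling = true` format is taken from the hunk above):

```
# Verify that autoscaling is enabled before uncommenting the crontab entry
grep 'autoscaling =' /etc/ansible/hosts
# expected: autoscaling = true
```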

autoscaling/crontab/autoscale_slurm.sh

Lines changed: 120 additions & 104 deletions
@@ -91,8 +91,8 @@ def getIdleTime(node):
     return ( datetime.datetime.now() - right_time ).total_seconds()
 
 # Get the last time a node state was changed. This is used to get how long a cluster has been idle for
-def getQueueConf(file):
-    with open(queues_conf_file) as file:
+def getQueueConf(queue_file):
+    with open(queue_file) as file:
         try:
             data = yaml.load(file,Loader=yaml.FullLoader)
         except:
@@ -328,109 +328,125 @@ def getstatus_slurm():
             cluster_destroying.append(clusterName)
     return cluster_to_build,cluster_to_destroy,nodes_to_destroy,cluster_building,cluster_destroying,used_index,current_nodes,building_nodes
 
-if os.path.isfile(lockfile):
-    print( "Lockfile "+lockfile + " is present, exiting" )
-    exit()
-open(lockfile,'w').close()
-try:
-    path = os.path.dirname(os.path.dirname(os.path.realpath(sys.argv[0])))
-    clusters_path = os.path.join(path,'clusters')
-    config = getQueueConf(queues_conf_file)
-
-    cluster_to_build,cluster_to_destroy,nodes_to_destroy,cluster_building,cluster_destroying,used_index,current_nodes,building_nodes=getstatus_slurm()
-
-    print (time.strftime("%Y-%m-%d %H:%M:%S"))
-    print (cluster_to_build,'cluster_to_build')
-    print (cluster_to_destroy,'cluster_to_destroy')
-    print (nodes_to_destroy,'nodes_to_destroy')
-    print (cluster_building,'cluster_building')
-    print (cluster_destroying,'cluster_destroying')
-    print (current_nodes,'current_nodes')
-    print (building_nodes,'building_nodes')
-
-    for i in cluster_building:
-        for j in cluster_to_build:
-            if i[0]==j[0] and i[1]==j[1] and i[2]==j[2]:
-                cluster_to_build.remove(j)
-                break
-    for cluster in cluster_to_destroy:
-        cluster_name=cluster[0]
-        print ("Deleting cluster "+cluster_name)
-        subprocess.Popen([script_path+'/delete_cluster.sh',cluster_name])
-        time.sleep(5)
-
-    for cluster_name in nodes_to_destroy.keys():
-        print ("Resizing cluster "+cluster_name)
-        initial_nodes=[]
-        unreachable_nodes=[]
-        if cluster_name == "NOCLUSTERFOUND":
-            subprocess.Popen([script_path+'/resize.sh','remove_unreachable','--nodes']+nodes_to_destroy[cluster_name])
-            continue
-        for node in nodes_to_destroy[cluster_name]:
+def getAutoscaling():
+    out = subprocess.Popen(["cat /etc/ansible/hosts | grep 'autoscaling =' | awk -F '= ' '{print $2}'"],stdout=subprocess.PIPE, stderr=subprocess.STDOUT,shell=True,universal_newlines=True)
+    stdout,stderr = out.communicate()
+    output = stdout.split("\n")
+    autoscaling_value=False
+    for i in range(0,len(output)-1):
+        autoscaling_value=output[i]
+    return autoscaling_value
+
+autoscaling = getAutoscaling()
+
+if autoscaling == "true":
+
+    if os.path.isfile(lockfile):
+        print( "Lockfile "+lockfile + " is present, exiting" )
+        exit()
+    open(lockfile,'w').close()
+    try:
+        path = os.path.dirname(os.path.dirname(os.path.realpath(sys.argv[0])))
+        clusters_path = os.path.join(path,'clusters')
+        config = getQueueConf(queues_conf_file)
+
+        cluster_to_build,cluster_to_destroy,nodes_to_destroy,cluster_building,cluster_destroying,used_index,current_nodes,building_nodes=getstatus_slurm()
+
+        print (time.strftime("%Y-%m-%d %H:%M:%S"))
+        print (cluster_to_build,'cluster_to_build')
+        print (cluster_to_destroy,'cluster_to_destroy')
+        print (nodes_to_destroy,'nodes_to_destroy')
+        print (cluster_building,'cluster_building')
+        print (cluster_destroying,'cluster_destroying')
+        print (current_nodes,'current_nodes')
+        print (building_nodes,'building_nodes')
+
+        for i in cluster_building:
+            for j in cluster_to_build:
+                if i[0]==j[0] and i[1]==j[1] and i[2]==j[2]:
+                    cluster_to_build.remove(j)
+                    break
+        for cluster in cluster_to_destroy:
+            cluster_name=cluster[0]
+            print ("Deleting cluster "+cluster_name)
+            subprocess.Popen([script_path+'/delete_cluster.sh',cluster_name])
+            time.sleep(5)
+
+        for cluster_name in nodes_to_destroy.keys():
+            print ("Resizing cluster "+cluster_name)
+            initial_nodes=[]
+            unreachable_nodes=[]
+            if cluster_name == "NOCLUSTERFOUND":
+                subprocess.Popen([script_path+'/resize.sh','remove_unreachable','--nodes']+nodes_to_destroy[cluster_name]+['--quiet'])
+                continue
+            for node in nodes_to_destroy[cluster_name]:
+                try:
+                    alt_names=subprocess.check_output(["cat /etc/hosts | grep "+node],shell=True,universal_newlines=True)
+                    for alt_name in alt_names.split("\n")[0].split():
+                        if alt_name.startswith('inst-'):
+                            initial_nodes.append(alt_name)
+                            break
+                except:
+                    unreachable_nodes.append(node)
+            if len(initial_nodes) > 0:
+                subprocess.Popen([script_path+'/resize.sh','--force','--cluster_name',cluster_name,'remove','--remove_unreachable','--nodes']+initial_nodes+['--quiet'])
+            if len(unreachable_nodes) > 0:
+                subprocess.Popen([script_path+'/resize.sh','--cluster_name',cluster_name,'remove_unreachable','--nodes']+unreachable_nodes+['--quiet'])
+            time.sleep(1)
+
+        for index,cluster in enumerate(cluster_to_build):
+            nodes=cluster[0]
+            instance_type = cluster[1]
+            queue=cluster[2]
+            jobID=str(cluster[3])
+            user=str(cluster[4])
+            jobconfig=getJobConfig(config,queue,instance_type)
+            limits=getQueueLimits(config,queue,instance_type)
             try:
-                alt_names=subprocess.check_output(["cat /etc/hosts | grep "+node],shell=True,universal_newlines=True)
-                for alt_name in alt_names.split("\n")[0].split():
-                    if alt_name.startswith('inst-'):
-                        initial_nodes.append(alt_name)
-                        break
+                clusterCount=len(used_index[queue][instance_type])
             except:
-                unreachable_nodes.append(node)
-        if len(initial_nodes) > 0:
-            subprocess.Popen([script_path+'/resize.sh','--force','--cluster_name',cluster_name,'remove','--remove_unreachable','--nodes']+initial_nodes)
-        if len(unreachable_nodes) > 0:
-            subprocess.Popen([script_path+'/resize.sh','--cluster_name',cluster_name,'remove_unreachable','--nodes']+unreachable_nodes)
-        time.sleep(1)
-
-    for index,cluster in enumerate(cluster_to_build):
-        nodes=cluster[0]
-        instance_type = cluster[1]
-        queue=cluster[2]
-        jobID=str(cluster[3])
-        user=str(cluster[4])
-        jobconfig=getJobConfig(config,queue,instance_type)
-        limits=getQueueLimits(config,queue,instance_type)
-        try:
-            clusterCount=len(used_index[queue][instance_type])
-        except:
-            clusterCount=0
-        if clusterCount>=limits["max_cluster_count"]:
-            print ("This would go over the number of running clusters, you have reached the max number of clusters")
-            continue
-        nextIndex=None
-        if clusterCount==0:
-            if queue in used_index.keys():
-                used_index[queue][instance_type]=[1]
+                clusterCount=0
+            if clusterCount>=limits["max_cluster_count"]:
+                print ("This would go over the number of running clusters, you have reached the max number of clusters")
+                continue
+            nextIndex=None
+            if clusterCount==0:
+                if queue in used_index.keys():
+                    used_index[queue][instance_type]=[1]
+                else:
+                    used_index[queue]={instance_type:[1]}
+                nextIndex=1
             else:
-                used_index[queue]={instance_type:[1]}
-            nextIndex=1
-        else:
-            for i in range(1,10000):
-                if not i in used_index[queue][instance_type]:
-                    nextIndex=i
-                    used_index[queue][instance_type].append(i)
-                    break
-        clusterName=queue+'-'+str(nextIndex)+'-'+jobconfig["instance_keyword"]
-        if not queue in current_nodes.keys():
-            current_nodes[queue]={instance_type:0}
-        else:
-            if not instance_type in current_nodes[queue].keys():
-                current_nodes[queue][instance_type]=0
-        if not queue in building_nodes.keys():
-            building_nodes[queue]={instance_type:0}
-        else:
-            if not instance_type in building_nodes[queue].keys():
-                building_nodes[queue][instance_type]=0
-        if nodes > limits["max_cluster_size"]:
-            print ("Cluster "+clusterName+" won't be created, it would go over the total number of nodes per cluster limit")
-        elif current_nodes[queue][instance_type] + building_nodes[queue][instance_type] + nodes > limits["max_number_nodes"]:
-            print ("Cluster "+clusterName+" won't be created, it would go over the total number of nodes limit")
-        else:
-            current_nodes[queue][instance_type]+=nodes
-            clusterCount+=1
-            print ("Creating cluster "+clusterName+" with "+str(nodes)+" nodes")
-            subprocess.Popen([script_path+'/create_cluster.sh',str(nodes),clusterName,instance_type,queue,jobID,user])
-            time.sleep(5)
+                for i in range(1,10000):
+                    if not i in used_index[queue][instance_type]:
+                        nextIndex=i
+                        used_index[queue][instance_type].append(i)
+                        break
+            clusterName=queue+'-'+str(nextIndex)+'-'+jobconfig["instance_keyword"]
+            if not queue in current_nodes.keys():
+                current_nodes[queue]={instance_type:0}
+            else:
+                if not instance_type in current_nodes[queue].keys():
+                    current_nodes[queue][instance_type]=0
+            if not queue in building_nodes.keys():
+                building_nodes[queue]={instance_type:0}
+            else:
+                if not instance_type in building_nodes[queue].keys():
+                    building_nodes[queue][instance_type]=0
+            if nodes > limits["max_cluster_size"]:
+                print ("Cluster "+clusterName+" won't be created, it would go over the total number of nodes per cluster limit")
+            elif current_nodes[queue][instance_type] + building_nodes[queue][instance_type] + nodes > limits["max_number_nodes"]:
+                print ("Cluster "+clusterName+" won't be created, it would go over the total number of nodes limit")
+            else:
+                current_nodes[queue][instance_type]+=nodes
+                clusterCount+=1
+                print ("Creating cluster "+clusterName+" with "+str(nodes)+" nodes")
+                subprocess.Popen([script_path+'/create_cluster.sh',str(nodes),clusterName,instance_type,queue,jobID,user])
+                time.sleep(5)
 
-except Exception:
-    traceback.print_exc()
-os.remove(lockfile)
+    except Exception:
+        traceback.print_exc()
+    os.remove(lockfile)
+else:
+    print("Autoscaling is false")
+    exit()
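The new `getAutoscaling()` helper shells out to read that flag, and the whole main block is skipped when it is not `true`. The equivalent check from a shell, handy when debugging why the cron job only prints "Autoscaling is false" (a sketch mirroring the grep/awk pipeline the script runs via subprocess):

```
# Same pipeline getAutoscaling() executes
autoscaling=$(cat /etc/ansible/hosts | grep 'autoscaling =' | awk -F '= ' '{print $2}')
if [ "$autoscaling" != "true" ]
then
  echo "Autoscaling is false"
  exit 0
fi
echo "Autoscaling is enabled, autoscale_slurm.sh will run its normal resize logic"
```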

bin/delete_cluster.sh

Lines changed: 1 addition & 1 deletion
@@ -103,7 +103,7 @@ else
   for node in `scontrol show hostname $nodes 2>&1`
   do
     echo "Cleaning up node " $node
-    /opt/oci-hpc/bin/resize.sh remove_unreachable --nodes $node
+    /opt/oci-hpc/bin/resize.sh remove_unreachable --nodes $node --quiet
   done
 fi
 cd
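The cleanup loop depends on `scontrol show hostname` expanding a Slurm hostlist expression into individual node names, one per line; for example (hypothetical node names):

```
# Expand a hostlist the same way the loop above does
scontrol show hostname compute-1-hpc-node-[1-3]
# compute-1-hpc-node-1
# compute-1-hpc-node-2
# compute-1-hpc-node-3
```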

bin/remove_nodes_prompt.txt

Lines changed: 4 additions & 0 deletions
@@ -0,0 +1,4 @@
+Does your cluster run any file system like Ceph, NFS, etc. on the GPU/HPC nodes itself using local NVMe SSDs?
+If yes, terminating nodes which store your data can result in permanent data loss, so before proceeding make sure any important data is copied to a persistent file system outside of the cluster such as to object storage, file storage, etc.
+Once data is backed up or migrated, come back and run the script. Select 2 to exit.
+Remember, once the nodes are terminated, all the data is lost forever and you won't be able to recover it.
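The prompt asks operators to copy anything stored on node-local NVMe to persistent storage first. One possible way to do that, assuming the OCI CLI is configured and an Object Storage bucket named `cluster-backup` already exists (the bucket name and source path are placeholders):

```
# Copy a node-local scratch directory to Object Storage before terminating the node
oci os object bulk-upload --bucket-name cluster-backup --src-dir /mnt/localdisk/project-data
```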

bin/resize.py

Lines changed: 1 addition & 0 deletions
@@ -577,6 +577,7 @@ def getLaunchInstanceDetails(instance,comp_ocid,cn_ocid,max_previous_index,index
 parser.add_argument('--force', help='If present. Nodes will be removed even if the destroy playbook failed',action='store_true',default=False)
 parser.add_argument('--ansible_crucial', help='If present during reconfiguration, only crucial ansible playbooks will be executed on the live nodes. Non live nodes will be removed',action='store_true',default=False)
 parser.add_argument('--remove_unreachable', help='If present, nodes that are not sshable will be terminated before running the action that was requested (Example Adding a node) ',action='store_true',default=False)
+parser.add_argument('--quiet', help='If present, the script will not prompt for a response when removing nodes and will not give a reminder to save data from nodes that are being removed ',action='store_true',default=False)
 
 args = parser.parse_args()
 
bin/resize.sh

Lines changed: 23 additions & 1 deletion
@@ -15,6 +15,12 @@ then
   exit
 fi
 
+if [ $USER != "ubuntu" ] && [ $USER != "opc" ]
+then
+  echo "Run this script as opc or ubuntu"
+  exit
+fi
+
 if [ $# -eq 0 ]
 then
   python3 $folder/resize.py --help
@@ -26,6 +32,7 @@ permanent=1
 controllerName=`hostname`
 cluster_name=${controllerName/-controller/}
 nodes=NULL
+quietMode=False
 for (( i=1; i<=$#; i++)); do
   if [ ${!i} == "--cluster_name" ]
   then
@@ -48,9 +55,24 @@ for (( i=1; i<=$#; i++)); do
   then
     j=$((i+1))
    nodes=${@:j}
+  elif [ ${!i} == "--quiet" ]
+  then
+    quietMode=True
   fi
 done
 
+if [ $resize_type == "remove" ] || [ $resize_type == "remove_unreachable" ] && [ $quietMode == "False" ]
+then
+  echo "$(cat $folder/remove_nodes_prompt.txt)"
+  echo "Do you confirm you have done all of the above steps and wish to proceed for the termination of the nodes? Enter 1 for Yes and 2 for No (to exit)."
+  select yn in "Yes" "No"; do
+    case $yn in
+      Yes ) break;;
+      No ) exit;;
+    esac
+  done
+fi
+
 if [ $resize_type != "default" ]
 then
   if [ $permanent -eq 0 ]
@@ -148,5 +170,5 @@ then
     rm currently_resizing
   fi
 else
-  python3 $folder/resize.py ${@}
+  python3 $folder/resize.py ${@} &
 fi
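With the guard in place, an interactive removal looks roughly like this (a sketch: the menu numbering and the `#?` prompt come from bash's `select` builtin, the text from bin/remove_nodes_prompt.txt):

```
$ /opt/oci-hpc/bin/resize.sh remove 3 --cluster_name compute-1-hpc
Does your cluster run any file system like Ceph, NFS, etc. on the GPU/HPC nodes itself using local NVMe SSDs?
...
Do you confirm you have done all of the above steps and wish to proceed for the termination of the nodes? Enter 1 for Yes and 2 for No (to exit).
1) Yes
2) No
#? 1
```

Passing `--quiet` skips the prompt entirely, which is what delete_cluster.sh and the autoscaler now do when they call resize.sh.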
