Skip to content

Commit 39b1db4

Browse files
committed
added the check for autoscaling value in /etc/ansible/hosts
1 parent 251a3d4 commit 39b1db4

File tree

1 file changed

+120
-104
lines changed

1 file changed

+120
-104
lines changed

autoscaling/crontab/autoscale_slurm.sh

Lines changed: 120 additions & 104 deletions
Original file line numberDiff line numberDiff line change
@@ -91,8 +91,8 @@ def getIdleTime(node):
9191
return ( datetime.datetime.now() - right_time ).total_seconds()
9292

9393
# Get the last time a node state was changed. This is used to get how long a cluster has been idle for
94-
def getQueueConf(file):
95-
with open(queues_conf_file) as file:
94+
def getQueueConf(queue_file):
95+
with open(queue_file) as file:
9696
try:
9797
data = yaml.load(file,Loader=yaml.FullLoader)
9898
except:
@@ -328,109 +328,125 @@ def getstatus_slurm():
328328
cluster_destroying.append(clusterName)
329329
return cluster_to_build,cluster_to_destroy,nodes_to_destroy,cluster_building,cluster_destroying,used_index,current_nodes,building_nodes
330330

331-
if os.path.isfile(lockfile):
332-
print( "Lockfile "+lockfile + " is present, exiting" )
333-
exit()
334-
open(lockfile,'w').close()
335-
try:
336-
path = os.path.dirname(os.path.dirname(os.path.realpath(sys.argv[0])))
337-
clusters_path = os.path.join(path,'clusters')
338-
config = getQueueConf(queues_conf_file)
339-
340-
cluster_to_build,cluster_to_destroy,nodes_to_destroy,cluster_building,cluster_destroying,used_index,current_nodes,building_nodes=getstatus_slurm()
341-
342-
print (time.strftime("%Y-%m-%d %H:%M:%S"))
343-
print (cluster_to_build,'cluster_to_build')
344-
print (cluster_to_destroy,'cluster_to_destroy')
345-
print (nodes_to_destroy,'nodes_to_destroy')
346-
print (cluster_building,'cluster_building')
347-
print (cluster_destroying,'cluster_destroying')
348-
print (current_nodes,'current_nodes')
349-
print (building_nodes,'building_nodes')
350-
351-
for i in cluster_building:
352-
for j in cluster_to_build:
353-
if i[0]==j[0] and i[1]==j[1] and i[2]==j[2]:
354-
cluster_to_build.remove(j)
355-
break
356-
for cluster in cluster_to_destroy:
357-
cluster_name=cluster[0]
358-
print ("Deleting cluster "+cluster_name)
359-
subprocess.Popen([script_path+'/delete_cluster.sh',cluster_name])
360-
time.sleep(5)
361-
362-
for cluster_name in nodes_to_destroy.keys():
363-
print ("Resizing cluster "+cluster_name)
364-
initial_nodes=[]
365-
unreachable_nodes=[]
366-
if cluster_name == "NOCLUSTERFOUND":
367-
subprocess.Popen([script_path+'/resize.sh','remove_unreachable','--nodes']+nodes_to_destroy[cluster_name],'--quiet')
368-
continue
369-
for node in nodes_to_destroy[cluster_name]:
331+
def getAutoscaling(hosts_file="/etc/ansible/hosts"):
    """Return the autoscaling setting found in the Ansible inventory file.

    Every line containing the text ``autoscaling =`` is considered, and the
    text between the first and second ``= `` separators of the *last* such
    line wins (the same field the previous ``grep | awk -F '= '`` pipeline
    selected).

    Args:
        hosts_file: Path of the inventory file to inspect. Defaults to
            ``/etc/ansible/hosts``, the path that used to be hard-coded.

    Returns:
        The value as a string (for example ``"true"``), an empty string when
        the matching line carries no value, or ``False`` when the file cannot
        be read or has no matching line.
    """
    try:
        # Read the file directly instead of spawning cat/grep/awk through a
        # shell; undecodable bytes are replaced so odd content cannot crash
        # the cron job.
        with open(hosts_file, errors="replace") as inventory:
            lines = inventory.read().splitlines()
    except OSError:
        # A missing or unreadable inventory means autoscaling is not enabled.
        return False

    autoscaling_value = False
    for line in lines:
        if "autoscaling =" not in line:
            continue
        fields = line.split("= ")
        # Surrounding whitespace is stripped so that a trailing space or a
        # carriage return does not defeat the caller's == "true" comparison.
        autoscaling_value = fields[1].strip() if len(fields) > 1 else ""
    return autoscaling_value
339+
340+
autoscaling = getAutoscaling()
341+
342+
if autoscaling == "true":
343+
344+
if os.path.isfile(lockfile):
345+
print( "Lockfile "+lockfile + " is present, exiting" )
346+
exit()
347+
open(lockfile,'w').close()
348+
try:
349+
path = os.path.dirname(os.path.dirname(os.path.realpath(sys.argv[0])))
350+
clusters_path = os.path.join(path,'clusters')
351+
config = getQueueConf(queues_conf_file)
352+
353+
cluster_to_build,cluster_to_destroy,nodes_to_destroy,cluster_building,cluster_destroying,used_index,current_nodes,building_nodes=getstatus_slurm()
354+
355+
print (time.strftime("%Y-%m-%d %H:%M:%S"))
356+
print (cluster_to_build,'cluster_to_build')
357+
print (cluster_to_destroy,'cluster_to_destroy')
358+
print (nodes_to_destroy,'nodes_to_destroy')
359+
print (cluster_building,'cluster_building')
360+
print (cluster_destroying,'cluster_destroying')
361+
print (current_nodes,'current_nodes')
362+
print (building_nodes,'building_nodes')
363+
364+
for i in cluster_building:
365+
for j in cluster_to_build:
366+
if i[0]==j[0] and i[1]==j[1] and i[2]==j[2]:
367+
cluster_to_build.remove(j)
368+
break
369+
for cluster in cluster_to_destroy:
370+
cluster_name=cluster[0]
371+
print ("Deleting cluster "+cluster_name)
372+
subprocess.Popen([script_path+'/delete_cluster.sh',cluster_name])
373+
time.sleep(5)
374+
375+
for cluster_name in nodes_to_destroy.keys():
    # Remove the nodes flagged for destruction, one resize.sh call per cluster.
    print ("Resizing cluster "+cluster_name)
    initial_nodes=[]
    unreachable_nodes=[]
    if cluster_name == "NOCLUSTERFOUND":
        # Nodes that could not be matched to a cluster are all handed to
        # remove_unreachable in one call.
        # '--quiet' has to be part of the argv list: as a second positional
        # argument it is taken as Popen's bufsize and raises TypeError.
        # NOTE(review): assumes resize.sh accepts --quiet after the node list - confirm.
        subprocess.Popen([script_path+'/resize.sh','remove_unreachable','--nodes']+nodes_to_destroy[cluster_name]+['--quiet'])
        continue
    for node in nodes_to_destroy[cluster_name]:
        try:
            # Look the node up in /etc/hosts; grep exits non-zero when nothing
            # matches, which check_output reports as CalledProcessError.
            alt_names=subprocess.check_output(["cat /etc/hosts | grep "+node],shell=True,universal_newlines=True)
        except (subprocess.CalledProcessError, OSError):
            # No /etc/hosts entry: treat the node as unreachable.
            unreachable_nodes.append(node)
            continue
        # Only the first matching line is used; keep its first 'inst-' alias.
        for alt_name in alt_names.split("\n")[0].split():
            if alt_name.startswith('inst-'):
                initial_nodes.append(alt_name)
                break
    if initial_nodes:
        subprocess.Popen([script_path+'/resize.sh','--force','--cluster_name',cluster_name,'remove','--remove_unreachable','--nodes']+initial_nodes+['--quiet'])
    if unreachable_nodes:
        subprocess.Popen([script_path+'/resize.sh','--cluster_name',cluster_name,'remove_unreachable','--nodes']+unreachable_nodes+['--quiet'])
    # Short pause before handling the next cluster.
    time.sleep(1)
396+
397+
for index,cluster in enumerate(cluster_to_build):
398+
nodes=cluster[0]
399+
instance_type = cluster[1]
400+
queue=cluster[2]
401+
jobID=str(cluster[3])
402+
user=str(cluster[4])
403+
jobconfig=getJobConfig(config,queue,instance_type)
404+
limits=getQueueLimits(config,queue,instance_type)
370405
try:
371-
alt_names=subprocess.check_output(["cat /etc/hosts | grep "+node],shell=True,universal_newlines=True)
372-
for alt_name in alt_names.split("\n")[0].split():
373-
if alt_name.startswith('inst-'):
374-
initial_nodes.append(alt_name)
375-
break
406+
clusterCount=len(used_index[queue][instance_type])
376407
except:
377-
unreachable_nodes.append(node)
378-
if len(initial_nodes) > 0:
379-
subprocess.Popen([script_path+'/resize.sh','--force','--cluster_name',cluster_name,'remove','--remove_unreachable','--nodes']+initial_nodes,'--quiet')
380-
if len(unreachable_nodes) > 0:
381-
subprocess.Popen([script_path+'/resize.sh','--cluster_name',cluster_name,'remove_unreachable','--nodes']+unreachable_nodes,'--quiet')
382-
time.sleep(1)
383-
384-
for index,cluster in enumerate(cluster_to_build):
385-
nodes=cluster[0]
386-
instance_type = cluster[1]
387-
queue=cluster[2]
388-
jobID=str(cluster[3])
389-
user=str(cluster[4])
390-
jobconfig=getJobConfig(config,queue,instance_type)
391-
limits=getQueueLimits(config,queue,instance_type)
392-
try:
393-
clusterCount=len(used_index[queue][instance_type])
394-
except:
395-
clusterCount=0
396-
if clusterCount>=limits["max_cluster_count"]:
397-
print ("This would go over the number of running clusters, you have reached the max number of clusters")
398-
continue
399-
nextIndex=None
400-
if clusterCount==0:
401-
if queue in used_index.keys():
402-
used_index[queue][instance_type]=[1]
408+
clusterCount=0
409+
if clusterCount>=limits["max_cluster_count"]:
410+
print ("This would go over the number of running clusters, you have reached the max number of clusters")
411+
continue
412+
nextIndex=None
413+
if clusterCount==0:
414+
if queue in used_index.keys():
415+
used_index[queue][instance_type]=[1]
416+
else:
417+
used_index[queue]={instance_type:[1]}
418+
nextIndex=1
403419
else:
404-
used_index[queue]={instance_type:[1]}
405-
nextIndex=1
406-
else:
407-
for i in range(1,10000):
408-
if not i in used_index[queue][instance_type]:
409-
nextIndex=i
410-
used_index[queue][instance_type].append(i)
411-
break
412-
clusterName=queue+'-'+str(nextIndex)+'-'+jobconfig["instance_keyword"]
413-
if not queue in current_nodes.keys():
414-
current_nodes[queue]={instance_type:0}
415-
else:
416-
if not instance_type in current_nodes[queue].keys():
417-
current_nodes[queue][instance_type]=0
418-
if not queue in building_nodes.keys():
419-
building_nodes[queue]={instance_type:0}
420-
else:
421-
if not instance_type in building_nodes[queue].keys():
422-
building_nodes[queue][instance_type]=0
423-
if nodes > limits["max_cluster_size"]:
424-
print ("Cluster "+clusterName+" won't be created, it would go over the total number of nodes per cluster limit")
425-
elif current_nodes[queue][instance_type] + building_nodes[queue][instance_type] + nodes > limits["max_number_nodes"]:
426-
print ("Cluster "+clusterName+" won't be created, it would go over the total number of nodes limit")
427-
else:
428-
current_nodes[queue][instance_type]+=nodes
429-
clusterCount+=1
430-
print ("Creating cluster "+clusterName+" with "+str(nodes)+" nodes")
431-
subprocess.Popen([script_path+'/create_cluster.sh',str(nodes),clusterName,instance_type,queue,jobID,user])
432-
time.sleep(5)
420+
for i in range(1,10000):
421+
if not i in used_index[queue][instance_type]:
422+
nextIndex=i
423+
used_index[queue][instance_type].append(i)
424+
break
425+
clusterName=queue+'-'+str(nextIndex)+'-'+jobconfig["instance_keyword"]
426+
if not queue in current_nodes.keys():
427+
current_nodes[queue]={instance_type:0}
428+
else:
429+
if not instance_type in current_nodes[queue].keys():
430+
current_nodes[queue][instance_type]=0
431+
if not queue in building_nodes.keys():
432+
building_nodes[queue]={instance_type:0}
433+
else:
434+
if not instance_type in building_nodes[queue].keys():
435+
building_nodes[queue][instance_type]=0
436+
if nodes > limits["max_cluster_size"]:
437+
print ("Cluster "+clusterName+" won't be created, it would go over the total number of nodes per cluster limit")
438+
elif current_nodes[queue][instance_type] + building_nodes[queue][instance_type] + nodes > limits["max_number_nodes"]:
439+
print ("Cluster "+clusterName+" won't be created, it would go over the total number of nodes limit")
440+
else:
441+
current_nodes[queue][instance_type]+=nodes
442+
clusterCount+=1
443+
print ("Creating cluster "+clusterName+" with "+str(nodes)+" nodes")
444+
subprocess.Popen([script_path+'/create_cluster.sh',str(nodes),clusterName,instance_type,queue,jobID,user])
445+
time.sleep(5)
433446

434-
except Exception:
435-
traceback.print_exc()
436-
os.remove(lockfile)
447+
except Exception:
448+
traceback.print_exc()
449+
os.remove(lockfile)
450+
else:
451+
print("Autoscaling is false")
452+
exit()

0 commit comments

Comments
 (0)