@@ -91,8 +91,8 @@ def getIdleTime(node):
9191 return ( datetime .datetime .now () - right_time ).total_seconds ()
9292
# Load the queue configuration by parsing the given YAML file.
94- def getQueueConf (file ):
95- with open (queues_conf_file ) as file :
94+ def getQueueConf (queue_file ):
95+ with open (queue_file ) as file :
9696 try :
9797 data = yaml .load (file ,Loader = yaml .FullLoader )
9898 except :
@@ -328,109 +328,125 @@ def getstatus_slurm():
328328 cluster_destroying .append (clusterName )
329329 return cluster_to_build ,cluster_to_destroy ,nodes_to_destroy ,cluster_building ,cluster_destroying ,used_index ,current_nodes ,building_nodes
330330
331- if os .path .isfile (lockfile ):
332- print ( "Lockfile " + lockfile + " is present, exiting" )
333- exit ()
334- open (lockfile ,'w' ).close ()
335- try :
336- path = os .path .dirname (os .path .dirname (os .path .realpath (sys .argv [0 ])))
337- clusters_path = os .path .join (path ,'clusters' )
338- config = getQueueConf (queues_conf_file )
339-
340- cluster_to_build ,cluster_to_destroy ,nodes_to_destroy ,cluster_building ,cluster_destroying ,used_index ,current_nodes ,building_nodes = getstatus_slurm ()
341-
342- print (time .strftime ("%Y-%m-%d %H:%M:%S" ))
343- print (cluster_to_build ,'cluster_to_build' )
344- print (cluster_to_destroy ,'cluster_to_destroy' )
345- print (nodes_to_destroy ,'nodes_to_destroy' )
346- print (cluster_building ,'cluster_building' )
347- print (cluster_destroying ,'cluster_destroying' )
348- print (current_nodes ,'current_nodes' )
349- print (building_nodes ,'building_nodes' )
350-
351- for i in cluster_building :
352- for j in cluster_to_build :
353- if i [0 ]== j [0 ] and i [1 ]== j [1 ] and i [2 ]== j [2 ]:
354- cluster_to_build .remove (j )
355- break
356- for cluster in cluster_to_destroy :
357- cluster_name = cluster [0 ]
358- print ("Deleting cluster " + cluster_name )
359- subprocess .Popen ([script_path + '/delete_cluster.sh' ,cluster_name ])
360- time .sleep (5 )
361-
362- for cluster_name in nodes_to_destroy .keys ():
363- print ("Resizing cluster " + cluster_name )
364- initial_nodes = []
365- unreachable_nodes = []
366- if cluster_name == "NOCLUSTERFOUND" :
367- subprocess .Popen ([script_path + '/resize.sh' ,'remove_unreachable' ,'--nodes' ]+ nodes_to_destroy [cluster_name ])
368- continue
369- for node in nodes_to_destroy [cluster_name ]:
def getAutoscaling(hosts_file="/etc/ansible/hosts"):
    """Return the autoscaling setting found in the Ansible inventory file.

    Every line containing the text ``autoscaling =`` is considered. The value
    is the text between the first and second ``"= "`` separators on that line
    (an empty string when the line has no ``"= "`` separator). When several
    lines match, the last one wins.

    Args:
        hosts_file: Path of the inventory file to scan. Defaults to
            ``/etc/ansible/hosts``.

    Returns:
        The raw value as a string (for example ``"true"``), or ``False`` when
        no line matches or the file cannot be read.
    """
    autoscaling_value = False
    try:
        # Read the file directly instead of spawning a cat | grep | awk
        # pipeline; undecodable bytes are replaced rather than raising.
        with open(hosts_file, errors="replace") as inventory:
            for line in inventory:
                if 'autoscaling =' not in line:
                    continue
                fields = line.rstrip("\n").split("= ")
                # Second field after splitting on "= "; empty when absent.
                autoscaling_value = fields[1] if len(fields) > 1 else ""
    except OSError:
        # Missing or unreadable inventory: report "no setting found".
        return False
    return autoscaling_value
339+
autoscaling = getAutoscaling()

# Top-level driver: only act when the inventory setting is exactly "true".
if autoscaling == "true":

    # The lockfile marks a run in progress; if it exists, exit without acting.
    if os.path.isfile(lockfile):
        print("Lockfile " + lockfile + " is present, exiting")
        exit()
    open(lockfile, 'w').close()
    try:
        # Parent of the directory that holds this script.
        path = os.path.dirname(os.path.dirname(os.path.realpath(sys.argv[0])))
        clusters_path = os.path.join(path, 'clusters')
        config = getQueueConf(queues_conf_file)

        cluster_to_build, cluster_to_destroy, nodes_to_destroy, cluster_building, cluster_destroying, used_index, current_nodes, building_nodes = getstatus_slurm()

        print(time.strftime("%Y-%m-%d %H:%M:%S"))
        print(cluster_to_build, 'cluster_to_build')
        print(cluster_to_destroy, 'cluster_to_destroy')
        print(nodes_to_destroy, 'nodes_to_destroy')
        print(cluster_building, 'cluster_building')
        print(cluster_destroying, 'cluster_destroying')
        print(current_nodes, 'current_nodes')
        print(building_nodes, 'building_nodes')

        # Drop one pending build request for each cluster already being built
        # whose first three fields match. Breaking right after the removal
        # keeps the in-loop mutation of cluster_to_build safe.
        for i in cluster_building:
            for j in cluster_to_build:
                if i[0] == j[0] and i[1] == j[1] and i[2] == j[2]:
                    cluster_to_build.remove(j)
                    break

        for cluster in cluster_to_destroy:
            cluster_name = cluster[0]
            print("Deleting cluster " + cluster_name)
            subprocess.Popen([script_path + '/delete_cluster.sh', cluster_name])
            time.sleep(5)

        for cluster_name in nodes_to_destroy.keys():
            print("Resizing cluster " + cluster_name)
            initial_nodes = []
            unreachable_nodes = []
            if cluster_name == "NOCLUSTERFOUND":
                # '--quiet' must be part of the argv list: the second
                # positional parameter of Popen is bufsize, and passing a
                # string there raises TypeError.
                subprocess.Popen([script_path + '/resize.sh', 'remove_unreachable', '--nodes'] + nodes_to_destroy[cluster_name] + ['--quiet'])
                continue
            for node in nodes_to_destroy[cluster_name]:
                try:
                    alt_names = subprocess.check_output(["cat /etc/hosts | grep " + node], shell=True, universal_newlines=True)
                    # Use the first name starting with 'inst-' on the first
                    # matching /etc/hosts line.
                    for alt_name in alt_names.split("\n")[0].split():
                        if alt_name.startswith('inst-'):
                            initial_nodes.append(alt_name)
                            break
                except (subprocess.CalledProcessError, OSError):
                    # The pipeline exits non-zero when grep finds no match;
                    # such nodes go to the unreachable list.
                    unreachable_nodes.append(node)
            if len(initial_nodes) > 0:
                subprocess.Popen([script_path + '/resize.sh', '--force', '--cluster_name', cluster_name, 'remove', '--remove_unreachable', '--nodes'] + initial_nodes + ['--quiet'])
            if len(unreachable_nodes) > 0:
                subprocess.Popen([script_path + '/resize.sh', '--cluster_name', cluster_name, 'remove_unreachable', '--nodes'] + unreachable_nodes + ['--quiet'])
            time.sleep(1)

        for index, cluster in enumerate(cluster_to_build):
            nodes = cluster[0]
            instance_type = cluster[1]
            queue = cluster[2]
            jobID = str(cluster[3])
            user = str(cluster[4])
            jobconfig = getJobConfig(config, queue, instance_type)
            limits = getQueueLimits(config, queue, instance_type)
            try:
                clusterCount = len(used_index[queue][instance_type])
            except (KeyError, TypeError):
                # No index recorded yet for this queue / instance type.
                clusterCount = 0
            if clusterCount >= limits["max_cluster_count"]:
                print("This would go over the number of running clusters, you have reached the max number of clusters")
                continue
            nextIndex = None
            if clusterCount == 0:
                if queue in used_index.keys():
                    used_index[queue][instance_type] = [1]
                else:
                    used_index[queue] = {instance_type: [1]}
                nextIndex = 1
            else:
                # Pick the lowest index in 1..9999 that is not already used.
                for i in range(1, 10000):
                    if not i in used_index[queue][instance_type]:
                        nextIndex = i
                        used_index[queue][instance_type].append(i)
                        break
            clusterName = queue + '-' + str(nextIndex) + '-' + jobconfig["instance_keyword"]
            # Make sure both node counters have an entry for this
            # queue / instance type before they are read below.
            if not queue in current_nodes.keys():
                current_nodes[queue] = {instance_type: 0}
            else:
                if not instance_type in current_nodes[queue].keys():
                    current_nodes[queue][instance_type] = 0
            if not queue in building_nodes.keys():
                building_nodes[queue] = {instance_type: 0}
            else:
                if not instance_type in building_nodes[queue].keys():
                    building_nodes[queue][instance_type] = 0
            if nodes > limits["max_cluster_size"]:
                print("Cluster " + clusterName + " won't be created, it would go over the total number of nodes per cluster limit")
            elif current_nodes[queue][instance_type] + building_nodes[queue][instance_type] + nodes > limits["max_number_nodes"]:
                print("Cluster " + clusterName + " won't be created, it would go over the total number of nodes limit")
            else:
                current_nodes[queue][instance_type] += nodes
                clusterCount += 1
                print("Creating cluster " + clusterName + " with " + str(nodes) + " nodes")
                subprocess.Popen([script_path + '/create_cluster.sh', str(nodes), clusterName, instance_type, queue, jobID, user])
                time.sleep(5)

    except Exception:
        traceback.print_exc()
    finally:
        # Always release the lock, including on KeyboardInterrupt/SystemExit,
        # so a later run is not blocked by a stale lockfile.
        os.remove(lockfile)
else:
    print("Autoscaling is false")
    exit()
0 commit comments