@@ -582,8 +582,6 @@ def run(self):
582582 cmd2 = "source /etc/profile; source " + path_install_dir + "/scripts/load.sh " + path_install_dir + " " + param_machine + "; " + get_variables_exported (
583583 exported_variables ) + " mkdir -p " + execution_folder + "; cd " + path_install_dir + "/scripts/" + machine_folder + "/; source app.sh " + userMachine + " " + str (
584584 self .name ) + " " + workflow_folder + " " + execution_folder + " " + self .numNodes + " " + self .execTime + " " + self .qos + " " + machine_found .installDir + " " + self .branch + " " + machine_found .dataDir + " " + self .gOPTION + " " + self .tOPTION + " " + self .dOPTION
585- log .info ("COMMAND" )
586- log .info (cmd2 )
587585 stdin , stdout , stderr = ssh .exec_command (cmd2 )
588586 stdout = stdout .readlines ()
589587 stderr = stderr .readlines ()
@@ -599,8 +597,6 @@ def run(self):
599597 self .request .session ['jobID' ] = jobID
600598 self .request .session ['execution_folder' ] = execution_folder
601599 os .remove ("documents/" + str (self .name ))
602- if self .auto_restart_bool :
603- monitor_checkpoint (var , self .request , self .execTime , machine_found .id )
604600 return
605601
606602
@@ -634,7 +630,7 @@ def run_sim(request):
634630 if form .is_valid ():
635631 branch = request .POST .get ('branchChoice' )
636632 bash_script_path = "/var/www/API_REST/documents/delete_old_files.sh"
637- # execute_bash_script(bash_script_path)
633+ execute_bash_script (bash_script_path )
638634 for filename , file in request .FILES .items ():
639635 uniqueID = uuid .uuid4 ()
640636 nameE = (str (file ).split ("." )[0 ]) + "_" + str (uniqueID ) + "." + str (file ).split ("." )[1 ]
@@ -993,13 +989,16 @@ def executions(request):
993989
994990 except :
995991 return False
992+
996993 request .session ["content" ] = content
997994 request .session ['machine_chosen' ] = machine_found .id
998995 c = Connection ()
999996 c .user = request .user
1000997 c .status = "Active"
1001998 c .save ()
1002999 request .session ["idConn" ] = c .idConn_id
1000+ threadUpdate = updateExecutions (request , c .idConn_id )
1001+ threadUpdate .start ()
10031002 checkConnBool = checkConnection (request )
10041003 if not checkConnBool :
10051004 machines_done = populate_executions_machines (request )
@@ -1008,8 +1007,6 @@ def executions(request):
10081007 request .session ["checkConn" ] = "Required"
10091008 return render (request , 'accounts/executions.html' ,
10101009 {'machines' : machines_done , 'checkConn' : "no" })
1011- threadUpdate = updateExecutions (request )
1012- threadUpdate .start ()
10131010 machine_connected = Machine .objects .get (id = request .session ["machine_chosen" ])
10141011 executions = Execution .objects .all ().filter (author = request .user , machine = machine_connected ).filter (
10151012 Q (status = "PENDING" ) | Q (status = "RUNNING" ) | Q (status = "INITIALIZING" ))
@@ -1105,23 +1102,28 @@ def populate_executions_machines(request):
11051102
11061103
class updateExecutions(threading.Thread):
    """Background thread that periodically refreshes the executions table.

    The thread calls ``update_table(request)`` every 10 seconds until one of
    the following happens:

    * the configured timeout (120 minutes) elapses,
    * the tracked ``Connection`` row is marked ``"Disconnect"`` or no longer
      exists,
    * ``update_table`` returns a falsy value.

    When the loop ends the connection is marked ``"Disconnect"`` and
    ``render_right(request)`` is called.

    Args:
        request: The request object whose session and user are handed to
            ``update_table`` and ``render_right``.
        connectionID: Value matched against ``Connection.idConn_id``. When
            omitted, it is read once from ``request.session["idConn"]`` so the
            older single-argument call form keeps working.
    """

    def __init__(self, request, connectionID=None):
        super().__init__()
        self.request = request
        # Maximum lifetime of the refresh loop: 120 minutes, in seconds.
        self.timeout = 120 * 60
        if connectionID is None:
            # Resolve the ID here, in the creating thread, rather than
            # re-reading the session from inside run().
            connectionID = request.session["idConn"]
        self.connectionID = connectionID

    def run(self):
        """Poll until timeout, disconnect, or a failed table update."""
        # monotonic() is immune to wall-clock adjustments, so the timeout
        # cannot be shortened or stretched by a system time change.
        deadline = time.monotonic() + self.timeout
        try:
            while time.monotonic() < deadline:
                # filter().first() yields None for a missing row instead of
                # raising DoesNotExist, which would kill the thread before
                # the connection could be marked as disconnected.
                status = (
                    Connection.objects
                    .filter(idConn_id=self.connectionID)
                    .values_list("status", flat=True)
                    .first()
                )
                if status is None or status == "Disconnect":
                    break
                if not update_table(self.request):
                    break
                time.sleep(10)
        finally:
            # Always leave the connection in the "Disconnect" state, even when
            # update_table raised and the thread is about to terminate.
            Connection.objects.filter(idConn_id=self.connectionID).update(status="Disconnect")
        render_right(self.request)
        return
11231124
11241125
1126+
11251127def update_table (request ):
11261128 machine_found = Machine .objects .get (id = request .session ['machine_chosen' ])
11271129 machineID = machine_found .id
@@ -1143,6 +1145,10 @@ def update_table(request):
11431145 if not (str (values [4 ]) == "FAILED" and executionE .status == "INITIALIZING" ):
11441146 Execution .objects .filter (jobID = executionE .jobID ).update (status = values [4 ], time = values [3 ],
11451147 nodes = int (values [2 ]))
1148+ executionTimeout = Execution .objects .all ().filter (author = request .user , autorestart = True , status = "TIMEOUT" )
1149+ for executionT in executionTimeout :
1150+ executionT .status = "CONTINUE"
1151+ checkpointing (executionT .jobID , request , executionT .machine_id )
11461152 return True
11471153
11481154
@@ -1185,110 +1191,77 @@ def stopExecution(eIDstop, request):
11851191 {'form' : form , 'executions' : executions , 'executionsDone' : executionsDone ,
11861192 'executionsFailed' : executionsFailed , 'executionsTimeout' : executionTimeout })
11871193
1188-
1189- class auto_restart_thread (threading .Thread ):
1190- def __init__ (self , jobID , request , time , machine_id ):
1191- threading .Thread .__init__ (self )
1192- self .jobID = jobID
1193- self .request = request
1194- self .time = time
1195- self .machine_id = machine_id
1196-
1197- def run (self ):
1198- time .sleep (int (self .time ) * 60 )
1199- wait_timeout_new (self .jobID , self .request , self .machine_id )
1200- return
1201-
1202-
1203- def wait_timeout_new (jobID , request , machine_id ):
1204- execution = Execution .objects .get (jobID = jobID )
1205- if execution .status != "TIMEOUT" :
1206- time .sleep (15 )
1207- wait_timeout_new (jobID , request )
1208- else :
1209- checkpointing (jobID , request , machine_id )
1210- return
1211-
1212-
1213- def monitor_checkpoint (jobID , request , execTime , machine ):
1214- auto_restart_obj = auto_restart_thread (jobID , request , execTime , machine )
1215- auto_restart_obj .start ()
1216- return
1217-
1218-
def checkpointing(jobIDCheckpoint, request, machine_id):
    """Resubmit an execution from its checkpoint and record the new job.

    Runs ``checkpoint_script.sh`` from the execution's working directory on
    the remote machine, looks for a ``"Submitted batch job <id>"`` line in the
    command output and, when one is found, stores a new ``Execution`` in the
    ``"PENDING"`` state that copies the settings of the checkpointed one and
    points back to it through ``checkpoint``. The checkpointed execution is
    then marked ``"CONTINUE"``.

    Args:
        jobIDCheckpoint: ``jobID`` of the execution to restart.
        request: Request providing ``session['content']`` (passed to
            ``connection``) and ``user`` (owner of the executions).
        machine_id: Identifier of the machine, passed to ``connection``.

    Raises:
        Execution.DoesNotExist: If ``request.user`` has no execution with the
            given ``jobID`` (raised by the ORM ``get`` call).
    """
    # Function-scope import: the module's import block is not visible here.
    import re

    ssh = connection(request.session['content'], machine_id)
    checkpointID = Execution.objects.all().get(author=request.user, jobID=jobIDCheckpoint)
    command = "source /etc/profile; cd " + checkpointID.wdir + "; source checkpoint_script.sh;"
    _stdin, stdout, _stderr = ssh.exec_command(command)
    # The output is read once into a list. The previous code then slept in a
    # loop while that list was empty; nothing could ever refill it, so empty
    # output meant an endless loop. No waiting is done here anymore.
    # NOTE(review): presumably a paramiko-style channel where readlines()
    # already blocks until the remote command finishes -- confirm.
    output_lines = stdout.readlines()

    # Accept the submission message on any line, including when it is the only
    # line of output (the old `len(stdout) > 1` guard skipped that case), and
    # tolerate trailing text after the job number.
    submitted = re.compile(r"Submitted batch job\s+(\d+)")
    new_job_id = None
    for line in output_lines:
        match = submitted.search(line)
        if match:
            new_job_id = int(match.group(1))
            break

    if new_job_id is not None:
        form = Execution()
        form.jobID = new_job_id
        form.eID = uuid.uuid4()
        form.author = request.user
        form.status = "PENDING"
        # Link the new job back to the execution it continues.
        form.checkpoint = checkpointID.jobID
        form.time = "00:00:00"
        form.execution_time = int(checkpointID.execution_time)
        # Settings carried over unchanged from the checkpointed execution.
        for field in (
            "machine_id",
            "user",
            "nodes",
            "wdir",
            "workflow_path",
            "name_workflow",
            "qos",
            "name_sim",
            "autorestart",
            "checkpointBool",
            "d_bool",
            "t_bool",
            "g_bool",
            "branch",
        ):
            setattr(form, field, getattr(checkpointID, field))
        form.save()

    # NOTE(review): the status is switched to "CONTINUE" even when no job ID
    # was found in the output, mirroring the previous flow as far as it can be
    # read from the change -- confirm this is intended.
    checkpointID.status = "CONTINUE"
    # Write only the status column so concurrent updates to other fields of
    # this row are not overwritten with stale values.
    checkpointID.save(update_fields=["status"])
    return
12601238
12611239
12621240def checkpointing_noAutorestart (jobIDCheckpoint , request ):
12631241 ssh = connection (request .session ['content' ], request .session ['machine_chosen' ])
12641242 checkpointID = Execution .objects .all ().get (author = request .user , jobID = jobIDCheckpoint )
12651243 machine_connected = Machine .objects .get (id = request .session ['machine_chosen' ])
1266- machine_folder = extract_substring (machine_connected .fqdn )
1267- path_install_dir = os .path .join (machine_connected .installDir , checkpointID .branch )
1268- param_machine = remove_numbers (machine_connected .fqdn )
12691244 command = "source /etc/profile; cd " + checkpointID .wdir + "; source checkpoint_script.sh;"
1270- log .info ("CHECKPOINT START" )
1271- log .info (command )
12721245 stdin , stdout , stderr = ssh .exec_command (command )
12731246 stdout = stdout .readlines ()
12741247 s = "Submitted batch job"
1275- log .info ("READ CHECKPOINT" )
12761248 while (len (stdout ) == 0 ):
12771249 import time
12781250 time .sleep (1 )
12791251 if (len (stdout ) > 1 ):
1280- log .info (stdout )
12811252 for line in stdout :
12821253 if (s in line ):
12831254 jobID = int (line .replace (s , "" ))
12841255 request .session ['jobID' ] = jobID
12851256 form = Execution ()
1286- form .jobID = request .session ['jobID' ]
1257+ form .jobID = jobID
1258+ form .eID = uuid .uuid4 ()
1259+ form .machine_id = checkpointID .machine_id
12871260 form .user = checkpointID .user
12881261 form .author = request .user
12891262 form .nodes = checkpointID .nodes
12901263 form .status = "PENDING"
1291- form .checkpoint = jobIDCheckpoint
1264+ form .checkpoint = checkpointID . jobID
12921265 form .time = "00:00:00"
12931266 form .wdir = checkpointID .wdir
12941267 form .workflow_path = checkpointID .workflow_path
0 commit comments