import time
from collections import OrderedDict
import traceback
+import json

from graphviz import Digraph

from ibllib.misc import version
-import one.params
from ibllib.oneibl import data_handlers
-
+import one.params
+from one.api import ONE

_logger = logging.getLogger('ibllib')


class Task(abc.ABC):
-    log = ""
-    cpu = 1
-    gpu = 0
+    log = ""  # placeholder to keep the log of the task for registration
+    cpu = 1  # CPU resources
+    gpu = 0  # GPU resources: as of now, either 0 or 1
    io_charge = 5  # integer percentage
    priority = 30  # integer percentage, 100 means highest priority
    ram = 4  # RAM needed to run (GB)
    one = None  # one instance (optional)
-    level = 0
-    outputs = None
+    level = 0  # level in the pipeline hierarchy: level 0 means there is no parent task
+    outputs = None  # placeholder for the list of output file Paths
    time_elapsed_secs = None
-    time_out_secs = None
+    time_out_secs = 3600 * 2  # time-out after which a task is considered dead
    version = version.ibllib()
    signature = {'input_files': [], 'output_files': []}  # list of tuples (filename, collection, required_flag)
    force = False  # whether to re-download missing input files when running on a local server
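For context, a concrete task overrides these class attributes and the signature in a subclass. Below is a minimal, hypothetical sketch; the class, collection and file names are illustrative and not part of this change:

class ExampleGpuTask(Task):
    cpu = 4
    gpu = 1  # requests the GPU lock machinery added to run() below
    priority = 60
    ram = 16  # GB
    signature = {'input_files': [('_example_raw.data.bin', 'raw_data', True)],
                 'output_files': [('_example_processed.table.pqt', 'alf', True)]}

    def _run(self):
        # return the list of Paths of the files produced by the task
        return []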
@@ -69,6 +70,11 @@ def run(self, **kwargs):
        wraps the _run() method with
        - error management
        - logging to variable
+        - writing a lock file if the GPU is used
+        - setting the status property of the object, with the following values:
+            0: Complete
+            -1: Errored
+            -2: Didn't run as a lock was encountered
        """
        # if taskid or one properties are not available, local run only without alyx
        use_alyx = self.one is not None and self.taskid is not None
@@ -91,17 +97,20 @@ def run(self, **kwargs):
        # setup
        setup = self.setUp(**kwargs)
        _logger.info(f"Setup value is: {setup}")
+        self.status = 0
        if not setup:
            # case where outputs are present but the input files to rerun the task are not available locally
            # label task as complete
-            self.status = 0
            _, self.outputs = self.assert_expected_outputs()
-
        else:
            # run task
-            self.status = 0
            start_time = time.time()
            try:
+                if self.gpu >= 1:
+                    if not self._creates_lock():
+                        self.status = -2
+                        _logger.info(f"Job {self.__class__} exited as a lock was found")
+                        return
                self.outputs = self._run(**kwargs)
                _logger.info(f"Job {self.__class__} complete")
            except BaseException:
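To illustrate the status codes documented in the docstring above, here is a hedged usage sketch from the caller's side, assuming the hypothetical ExampleGpuTask from earlier, a session_path and an ONE instance named one:

task = ExampleGpuTask(session_path, one=one)
task.run()
if task.status == 0:
    print('task complete, outputs:', task.outputs)
elif task.status == -2:
    print('another GPU task holds the lock, re-queue for later')
else:
    print('task errored, see task.log')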
@@ -169,7 +178,6 @@ def setUp(self, **kwargs):
        :param kwargs:
        :return:
        """
-
        if self.location == 'server':
            self.get_signatures(**kwargs)

@@ -196,7 +204,6 @@ def setUp(self, **kwargs):
                # TODO in the future this should raise an error if the correct files are still missing after downloading
                self.assert_expected_inputs(raise_error=False)
                return True
-
        else:
            self.data_handler = self.get_data_handler()
            self.data_handler.setUp()
@@ -206,9 +213,10 @@ def setUp(self, **kwargs):

    def tearDown(self):
        """
-        Function to optionally overload to check results
+        Function executed after run()
        """
-        pass
+        if self.gpu >= 1:
+            self._lock_file_path().unlink()

    def cleanUp(self):
        """
@@ -270,7 +278,9 @@ def get_data_handler(self, location=None):
        :return:
        """
        location = location or self.location
-
+        if location == 'local':
+            return data_handlers.LocalDataHandler(self.session_path, self.signature, one=self.one)
+        self.one = self.one or ONE()
        if location == 'server':
            dhandler = data_handlers.ServerDataHandler(self.session_path, self.signature, one=self.one)
        elif location == 'serverglobus':
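The new 'local' branch above means a task can be set up without any Alyx connection, whereas every other location now instantiates ONE() on demand. A sketch of the offline case, assuming the location keyword is accepted by the Task constructor (an assumption, not shown in this diff):

offline_task = ExampleGpuTask(session_path, one=None, location='local')
offline_task.run()  # setUp uses LocalDataHandler; no ONE instance is created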
@@ -281,9 +291,49 @@ def get_data_handler(self, location=None):
            dhandler = data_handlers.RemoteAwsDataHandler(self.session_path, self.signature, one=self.one)
        elif location == 'SDSC':
            dhandler = data_handlers.SDSCDataHandler(self, self.session_path, self.signature, one=self.one)
-
        return dhandler

+    @staticmethod
+    def make_lock_file(taskname="", time_out_secs=7200):
298+ """Creates a GPU lock file with a timeout of"""
+        d = {'start': time.time(), 'name': taskname, 'time_out_secs': time_out_secs}
+        with open(Task._lock_file_path(), 'w+') as fid:
+            json.dump(d, fid)
+        return d
+
+    @staticmethod
+    def _lock_file_path():
+        """the lock file is in ~/.one/gpu.lock"""
+        folder = Path.home().joinpath('.one')
+        folder.mkdir(exist_ok=True)
+        return folder.joinpath('gpu.lock')
+
+    def _make_lock_file(self):
+        """creates a lock file with the current time"""
+        return Task.make_lock_file(self.name, self.time_out_secs)
+
+    def is_locked(self):
+        """Checks if there is a lock file for this given task"""
+        lock_file = self._lock_file_path()
+        if not lock_file.exists():
+            return False
+
+        with open(lock_file) as fid:
+            d = json.load(fid)
+        now = time.time()
+        if (now - d['start']) > d['time_out_secs']:
+            lock_file.unlink()
+            return False
+        else:
+            return True
+
+    def _creates_lock(self):
+        if self.is_locked():
+            return False
+        else:
+            self._make_lock_file()
+            return True
+

class Pipeline(abc.ABC):
    """
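The lock protocol added above reduces to a small JSON file at ~/.one/gpu.lock holding a start time, a task name and a time-out; a stale lock is removed on the next check. As a standalone sketch, the file written by make_lock_file() could be inspected outside of a Task like this (gpu_lock_info is a hypothetical helper, not part of ibllib):

import json
import time
from pathlib import Path


def gpu_lock_info():
    """Return the gpu.lock contents plus an 'expired' flag, or None if absent."""
    lock_file = Path.home().joinpath('.one', 'gpu.lock')
    if not lock_file.exists():
        return None
    d = json.loads(lock_file.read_text())
    d['expired'] = (time.time() - d['start']) > d['time_out_secs']
    return d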