1
+ # -*- coding: utf-8 -*-
2
+ """ Single and multi-threaded executors."""
1
3
import os
2
4
import tempfile
3
5
import threading
10
12
import six
11
13
from six import string_types
12
14
15
+ import psutil
16
+
13
17
from .builder import Builder # pylint: disable=unused-import
14
18
from .errors import WorkflowException
15
19
from .loghandler import _logger
@@ -44,32 +48,33 @@ def run_jobs(self,
44
48
process , # type: Process
45
49
job_order_object , # type: Dict[Text, Any]
46
50
logger ,
47
- runtimeContext # type: RuntimeContext
51
+ runtime_context # type: RuntimeContext
48
52
): # type: (...) -> None
49
53
""" Execute the jobs for the given Process. """
50
54
pass
51
55
52
56
def execute (self ,
53
57
process , # type: Process
54
58
job_order_object , # type: Dict[Text, Any]
55
- runtimeContext , # type: RuntimeContext
59
+ runtime_context , # type: RuntimeContext
56
60
logger = _logger ,
57
61
): # type: (...) -> Tuple[Optional[Dict[Text, Any]], Text]
58
62
""" Execute the process. """
59
63
60
- if not runtimeContext .basedir :
64
+ if not runtime_context .basedir :
61
65
raise WorkflowException ("Must provide 'basedir' in runtimeContext" )
62
66
63
67
finaloutdir = None # Type: Optional[Text]
64
- original_outdir = runtimeContext .outdir
68
+ original_outdir = runtime_context .outdir
65
69
if isinstance (original_outdir , string_types ):
66
70
finaloutdir = os .path .abspath (original_outdir )
67
- runtimeContext = runtimeContext .copy ()
68
- runtimeContext .outdir = tempfile .mkdtemp (
69
- prefix = getdefault (runtimeContext .tmp_outdir_prefix , DEFAULT_TMP_PREFIX ))
70
- self .output_dirs .add (runtimeContext .outdir )
71
- runtimeContext .mutation_manager = MutationManager ()
72
- runtimeContext .toplevel = True
71
+ runtime_context = runtime_context .copy ()
72
+ runtime_context .outdir = tempfile .mkdtemp (
73
+ prefix = getdefault (runtime_context .tmp_outdir_prefix , DEFAULT_TMP_PREFIX ))
74
+ self .output_dirs .add (runtime_context .outdir )
75
+ runtime_context .mutation_manager = MutationManager ()
76
+ runtime_context .toplevel = True
77
+ runtime_context .workflow_eval_lock = threading .Condition (threading .RLock ())
73
78
74
79
job_reqs = None
75
80
if "cwl:requirements" in job_order_object :
@@ -81,20 +86,20 @@ def execute(self,
81
86
for req in job_reqs :
82
87
process .requirements .append (req )
83
88
84
- self .run_jobs (process , job_order_object , logger , runtimeContext )
89
+ self .run_jobs (process , job_order_object , logger , runtime_context )
85
90
86
91
if self .final_output and self .final_output [0 ] and finaloutdir :
87
92
self .final_output [0 ] = relocateOutputs (
88
93
self .final_output [0 ], finaloutdir , self .output_dirs ,
89
- runtimeContext .move_outputs , runtimeContext .make_fs_access ("" ),
90
- getdefault (runtimeContext .compute_checksum , True ))
94
+ runtime_context .move_outputs , runtime_context .make_fs_access ("" ),
95
+ getdefault (runtime_context .compute_checksum , True ))
91
96
92
- if runtimeContext .rm_tmpdir :
97
+ if runtime_context .rm_tmpdir :
93
98
cleanIntermediate (self .output_dirs )
94
99
95
100
if self .final_output and self .final_status :
96
101
97
- if runtimeContext .research_obj is not None and \
102
+ if runtime_context .research_obj is not None and \
98
103
isinstance (process , (JobBase , Process , WorkflowJobStep ,
99
104
WorkflowJob )) and process .parent_wf :
100
105
process_run_id = None
@@ -115,45 +120,46 @@ def run_jobs(self,
115
120
process , # type: Process
116
121
job_order_object , # type: Dict[Text, Any]
117
122
logger ,
118
- runtimeContext # type: RuntimeContext
123
+ runtime_context # type: RuntimeContext
119
124
): # type: (...) -> None
120
125
121
126
process_run_id = None # type: Optional[str]
122
127
reference_locations = {} # type: Dict[Text,Text]
123
128
124
129
# define provenance profile for single commandline tool
125
130
if not isinstance (process , Workflow ) \
126
- and runtimeContext .research_obj is not None :
127
- orcid = runtimeContext .orcid
128
- full_name = runtimeContext .cwl_full_name
131
+ and runtime_context .research_obj is not None :
132
+ orcid = runtime_context .orcid
133
+ full_name = runtime_context .cwl_full_name
129
134
process .provenance_object = CreateProvProfile (
130
- runtimeContext .research_obj , orcid , full_name )
135
+ runtime_context .research_obj , orcid , full_name )
131
136
process .parent_wf = process .provenance_object
132
- jobiter = process .job (job_order_object , self .output_callback , runtimeContext )
137
+ jobiter = process .job (job_order_object , self .output_callback ,
138
+ runtime_context )
133
139
134
140
try :
135
141
for job in jobiter :
136
142
if job :
137
- if runtimeContext .builder is not None :
138
- job .builder = runtimeContext .builder
143
+ if runtime_context .builder is not None :
144
+ job .builder = runtime_context .builder
139
145
if job .outdir :
140
146
self .output_dirs .add (job .outdir )
141
- if runtimeContext .research_obj is not None :
147
+ if runtime_context .research_obj is not None :
142
148
if not isinstance (process , Workflow ):
143
- runtimeContext .prov_obj = process .provenance_object
149
+ runtime_context .prov_obj = process .provenance_object
144
150
else :
145
- runtimeContext .prov_obj = job .prov_obj
146
- assert runtimeContext .prov_obj
151
+ runtime_context .prov_obj = job .prov_obj
152
+ assert runtime_context .prov_obj
147
153
process_run_id , reference_locations = \
148
- runtimeContext .prov_obj .evaluate (
149
- process , job , job_order_object ,
150
- runtimeContext .make_fs_access ,
151
- runtimeContext )
152
- runtimeContext = runtimeContext .copy ()
153
- runtimeContext .process_run_id = process_run_id
154
- runtimeContext .reference_locations = \
154
+ runtime_context .prov_obj .evaluate (
155
+ process , job , job_order_object ,
156
+ runtime_context .make_fs_access ,
157
+ runtime_context )
158
+ runtime_context = runtime_context .copy ()
159
+ runtime_context .process_run_id = process_run_id
160
+ runtime_context .reference_locations = \
155
161
reference_locations
156
- job .run (runtimeContext )
162
+ job .run (runtime_context )
157
163
else :
158
164
logger .error ("Workflow cannot make any more progress." )
159
165
break
@@ -168,60 +174,130 @@ class MultithreadedJobExecutor(JobExecutor):
168
174
"""
169
175
Experimental multi-threaded CWL executor.
170
176
171
- Can easily overload a system as it does not do resource accounting.
177
+ Does simple resource accounting, will not start a job unless it
178
+ has cores / ram available, but does not make any attempt to
179
+ optimize usage.
172
180
"""
181
+
173
182
def __init__(self):  # type: () -> None
    """Initialise thread bookkeeping and the resource budget.

    The budget is sampled once, at construction time, from psutil:
    the currently available memory and the CPU count.  Allocation
    counters start at zero.
    """
    super(MultithreadedJobExecutor, self).__init__()
    self.threads = set()  # type: Set[threading.Thread]
    self.exceptions = []  # type: List[WorkflowException]
    # Jobs accepted but not yet started because resources were short.
    self.pending_jobs = []  # type: List[JobBase]
    self.pending_jobs_lock = threading.Lock()

    # Floor division keeps the budget an int (whole MiB) on both
    # Python 2 and Python 3; true division would yield a float on 3.
    self.max_ram = psutil.virtual_memory().available // 2 ** 20
    # psutil.cpu_count() returns None when the count cannot be
    # determined; fall back to a single core so comparisons still work.
    self.max_cores = psutil.cpu_count() or 1
    self.allocated_ram = 0
    self.allocated_cores = 0
193
+
194
def select_resources(self, request, runtime_context):  # pylint: disable=unused-argument
    # type: (Dict[str, int], RuntimeContext) -> Dict[str, int]
    """Clamp a resource request to what this executor can offer.

    For each of "cores" and "ram", the request's ``<name>Min`` must not
    exceed the executor's maximum, otherwise a WorkflowException is
    raised.  The granted amount is ``<name>Max`` capped at the maximum.
    """
    granted = {}  # type: Dict[str, int]
    for name, ceiling in (("cores", self.max_cores), ("ram", self.max_ram)):
        minimum = request[name + "Min"]
        if minimum > ceiling:
            raise WorkflowException(
                "Requested at least %d %s but only %d available" %
                (minimum, name, ceiling))
        # min() keeps the ceiling unless the requested maximum is
        # strictly smaller.
        granted[name] = min(ceiling, request[name + "Max"])
    return granted
177
213
178
214
def run_job(self,
            job,             # type: JobBase
            runtime_context  # type: RuntimeContext
           ):  # type: (...) -> None
    """Queue ``job`` and start as many pending jobs as resources allow.

    ``job`` may be None, in which case nothing is queued and only the
    already pending jobs are retried.  Jobs are started strictly in
    queue order, each in its own daemon thread; the method returns as
    soon as the job at the head of the queue does not fit into the
    remaining ram/cores budget.
    """

    def runner(my_job, my_runtime_context):
        """ Job running thread. """
        try:
            my_job.run(my_runtime_context)
        except WorkflowException as err:
            _logger.exception("Got workflow error")
            self.exceptions.append(err)
        except Exception as err:  # pylint: disable=broad-except
            _logger.exception("Got workflow error")
            self.exceptions.append(WorkflowException(Text(err)))
        finally:
            # Give the resources back and wake up whoever is waiting
            # on the condition, all under the workflow lock.
            with my_runtime_context.workflow_eval_lock:
                self.threads.remove(threading.current_thread())
                if isinstance(my_job, JobBase):
                    self.allocated_ram -= my_job.builder.resources["ram"]
                    self.allocated_cores -= my_job.builder.resources["cores"]
                # notify_all is the supported spelling; notifyAll is a
                # deprecated alias.
                my_runtime_context.workflow_eval_lock.notify_all()

    if job is not None:
        with self.pending_jobs_lock:
            self.pending_jobs.append(job)

    # NOTE(review): the bookkeeping below is not guarded by a lock of
    # its own; presumably the caller holds workflow_eval_lock -- confirm.
    while self.pending_jobs:
        with self.pending_jobs_lock:
            job = self.pending_jobs[0]
            if isinstance(job, JobBase):
                needed_ram = job.builder.resources["ram"]
                needed_cores = job.builder.resources["cores"]
                if (self.allocated_ram + needed_ram > self.max_ram or
                        self.allocated_cores + needed_cores > self.max_cores):
                    # Head of the queue does not fit; try again later.
                    return
            self.pending_jobs.pop(0)

        thread = threading.Thread(
            target=runner, args=(job, runtime_context))
        thread.daemon = True
        self.threads.add(thread)
        # Reserve the resources before the thread starts running.
        if isinstance(job, JobBase):
            self.allocated_ram += job.builder.resources["ram"]
            self.allocated_cores += job.builder.resources["cores"]
        thread.start()
261
+
262
+
263
def wait_for_next_completion(self, runtimeContext):  # type: (RuntimeContext) -> None
    """Wait on the workflow lock (if any), then re-raise the first job error.

    When ``runtimeContext.workflow_eval_lock`` is None no waiting is
    done.  If ``self.exceptions`` is non-empty afterwards, its first
    entry is raised.
    """
    eval_lock = runtimeContext.workflow_eval_lock
    if eval_lock is not None:
        eval_lock.wait()
    if not self.exceptions:
        return
    raise self.exceptions[0]
202
269
203
270
def run_jobs(self,
             process,           # type: Process
             job_order_object,  # type: Dict[Text, Any]
             logger,
             runtime_context    # type: RuntimeContext
            ):  # type: (...) -> None
    """Run every job produced by ``process``, using worker threads.

    Iterates ``process.job(...)``; each item (including None) is handed
    to ``self.run_job``.  A None item means nothing new is runnable:
    the method then waits for a running thread to finish, or logs an
    error and stops iterating if no thread is running.  After the
    iterator ends, still-pending jobs are started and all threads are
    waited for.

    Raises WorkflowException if ``runtime_context.workflow_eval_lock``
    is None; errors raised by ``wait_for_next_completion`` propagate.
    """
    jobiter = process.job(job_order_object, self.output_callback,
                          runtime_context)

    if runtime_context.workflow_eval_lock is None:
        raise WorkflowException(
            "runtimeContext.workflow_eval_lock must not be None")

    # The context manager guarantees the lock is released even when a
    # job error is re-raised from wait_for_next_completion().
    with runtime_context.workflow_eval_lock:
        for job in jobiter:
            if job is not None:
                if runtime_context.builder is not None:
                    job.builder = runtime_context.builder
                if job.outdir:
                    self.output_dirs.add(job.outdir)

            # Called for None as well, so that queued jobs get retried.
            self.run_job(job, runtime_context)

            if job is None:
                if self.threads:
                    self.wait_for_next_completion(runtime_context)
                else:
                    logger.error("Workflow cannot make any more progress.")
                    break

        # Start jobs still queued for resources; without this they
        # would never run once the iterator is exhausted.
        self.run_job(None, runtime_context)
        while self.threads:
            self.wait_for_next_completion(runtime_context)
            self.run_job(None, runtime_context)
0 commit comments