3333
3434_local = threading .local ()
3535
36- RETRY_COUNT = 0
36+ DEFAULT_RETRY_COUNT = 0
3737
3838TASK_BUNCH_SIZE = 20
3939
6060 'gce_region' ,
6161 'priority' ,
6262 'max_run_duration' ,
63+ 'retry' ,
6364])
6465
6566
@@ -139,7 +140,13 @@ def _get_task_spec(batch_workload_spec):
139140 runnable .container .volumes = ['/var/scratch0:/mnt/scratch0' ]
140141 task_spec = batch .TaskSpec ()
141142 task_spec .runnables = [runnable ]
142- task_spec .max_retry_count = RETRY_COUNT
143+ if batch_workload_spec .retry :
144+ # Tasks in general have 6 hours to run (except pruning which has 24).
145+ # Our signed URLs last 24 hours. Therefore, the maximum number of retries
146+ # is 4. This is a temporary solution anyway.
147+ task_spec .max_retry_count = 4
148+ else :
149+ task_spec .max_retry_count = DEFAULT_RETRY_COUNT
143150 task_spec .max_run_duration = batch_workload_spec .max_run_duration
144151 return task_spec
145152
@@ -282,6 +289,7 @@ def _get_spec_from_config(command, job_name):
282289 project_name = batch_config .get ('project' )
283290 docker_image = instance_spec ['docker_image' ]
284291 user_data = instance_spec ['user_data' ]
292+ should_retry = instance_spec .get ('retry' , False )
285293 clusterfuzz_release = instance_spec .get ('clusterfuzz_release' , 'prod' )
286294
287295 # Lower numbers are lower priority. From:
@@ -290,6 +298,8 @@ def _get_spec_from_config(command, job_name):
290298 priority = 0 if low_priority else 1
291299
292300 max_run_duration = f'{ _get_task_duration (command )} s'
301+ if command == 'corpus_pruning' :
302+ should_retry = False # It is naturally retried the next day.
293303
294304 spec = BatchWorkloadSpec (
295305 clusterfuzz_release = clusterfuzz_release ,
@@ -309,5 +319,6 @@ def _get_spec_from_config(command, job_name):
309319 machine_type = instance_spec ['machine_type' ],
310320 priority = priority ,
311321 max_run_duration = max_run_duration ,
322+ retry = should_retry ,
312323 )
313324 return spec
0 commit comments