Skip to content

Commit 10461e6

Browse files
Retry failing tasks in batch. (#4468)
Except for fuzz and corpus_pruning.
1 parent ecb90a1 commit 10461e6

File tree

3 files changed

+16
-2
lines changed

3 files changed

+16
-2
lines changed

configs/test/batch/batch.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@ mapping:
3939
subnetwork: 'projects/google.com:clusterfuzz/regions/gce-region/subnetworks/subnetworkname'
4040
preemptible: false
4141
machine_type: n1-standard-1
42+
retry: true
4243
LINUX-PREEMPTIBLE:
4344
clusterfuzz_release: 'prod'
4445
docker_image: 'gcr.io/clusterfuzz-images/base:a2f4dd6-202202070654'

src/clusterfuzz/_internal/google_cloud_utils/batch.py

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@
3333

3434
_local = threading.local()
3535

36-
RETRY_COUNT = 0
36+
DEFAULT_RETRY_COUNT = 0
3737

3838
TASK_BUNCH_SIZE = 20
3939

@@ -60,6 +60,7 @@
6060
'gce_region',
6161
'priority',
6262
'max_run_duration',
63+
'retry',
6364
])
6465

6566

@@ -139,7 +140,13 @@ def _get_task_spec(batch_workload_spec):
139140
runnable.container.volumes = ['/var/scratch0:/mnt/scratch0']
140141
task_spec = batch.TaskSpec()
141142
task_spec.runnables = [runnable]
142-
task_spec.max_retry_count = RETRY_COUNT
143+
if batch_workload_spec.retry:
144+
# Tasks in general have 6 hours to run (except pruning which has 24).
145+
# Our signed URLs last 24 hours. Therefore, the maximum number of retries
146+
# is 4. This is a temporary solution anyway.
147+
task_spec.max_retry_count = 4
148+
else:
149+
task_spec.max_retry_count = DEFAULT_RETRY_COUNT
143150
task_spec.max_run_duration = batch_workload_spec.max_run_duration
144151
return task_spec
145152

@@ -282,6 +289,7 @@ def _get_spec_from_config(command, job_name):
282289
project_name = batch_config.get('project')
283290
docker_image = instance_spec['docker_image']
284291
user_data = instance_spec['user_data']
292+
should_retry = instance_spec.get('retry', False)
285293
clusterfuzz_release = instance_spec.get('clusterfuzz_release', 'prod')
286294

287295
# Lower numbers are lower priority. From:
@@ -290,6 +298,8 @@ def _get_spec_from_config(command, job_name):
290298
priority = 0 if low_priority else 1
291299

292300
max_run_duration = f'{_get_task_duration(command)}s'
301+
if command == 'corpus_pruning':
302+
should_retry = False # It is naturally retried the next day.
293303

294304
spec = BatchWorkloadSpec(
295305
clusterfuzz_release=clusterfuzz_release,
@@ -309,5 +319,6 @@ def _get_spec_from_config(command, job_name):
309319
machine_type=instance_spec['machine_type'],
310320
priority=priority,
311321
max_run_duration=max_run_duration,
322+
retry=should_retry,
312323
)
313324
return spec

src/clusterfuzz/_internal/tests/core/google_cloud_utils/batch_test.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,7 @@ def test_nonpreemptible(self):
5151
preemptible=False,
5252
machine_type='n1-standard-1',
5353
priority=1,
54+
retry=True,
5455
max_run_duration='21600s',
5556
)
5657

@@ -77,6 +78,7 @@ def test_fuzz_get_spec_from_config(self):
7778
preemptible=True,
7879
machine_type='n1-standard-1',
7980
priority=0,
81+
retry=False,
8082
max_run_duration='21600s',
8183
)
8284

0 commit comments

Comments
 (0)