Skip to content

Commit c26e353

Browse files
Add rate limiting (#4365)
Add rate limiting for tasks. Don't allow a task to be run too many times within a 6-hour window, especially if it's erroring. Also includes a minor refactor of commands.py.
1 parent da28b8b commit c26e353

File tree

18 files changed

+276
-34
lines changed

18 files changed

+276
-34
lines changed

src/appengine/handlers/upload_testcase.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,9 +26,9 @@
2626
from clusterfuzz._internal import fuzzing
2727
from clusterfuzz._internal.base import external_users
2828
from clusterfuzz._internal.base import memoize
29-
from clusterfuzz._internal.base import task_utils
3029
from clusterfuzz._internal.base import tasks
3130
from clusterfuzz._internal.base import utils
31+
from clusterfuzz._internal.base.tasks import task_utils
3232
from clusterfuzz._internal.crash_analysis.stack_parsing import stack_analyzer
3333
from clusterfuzz._internal.datastore import data_handler
3434
from clusterfuzz._internal.datastore import data_types

src/clusterfuzz/_internal/base/tasks.py renamed to src/clusterfuzz/_internal/base/tasks/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,8 +24,8 @@
2424

2525
from clusterfuzz._internal.base import external_tasks
2626
from clusterfuzz._internal.base import persistent_cache
27-
from clusterfuzz._internal.base import task_utils
2827
from clusterfuzz._internal.base import utils
28+
from clusterfuzz._internal.base.tasks import task_utils
2929
from clusterfuzz._internal.config import local_config
3030
from clusterfuzz._internal.datastore import data_types
3131
from clusterfuzz._internal.datastore import ndb_utils
Lines changed: 102 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,102 @@
1+
# Copyright 2024 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
"""Task rate limiting."""
15+
16+
import datetime
17+
18+
from clusterfuzz._internal.datastore import data_types
19+
from clusterfuzz._internal.datastore import ndb_utils
20+
from clusterfuzz._internal.metrics import logs
21+
from clusterfuzz._internal.system import environment
22+
23+
24+
def _get_datetime_now() -> datetime.datetime:
  """Returns the current time as a naive datetime.

  Kept as a separate module-level function so the notion of "now" used by the
  rate limiter lives in one place.
  """
  # NOTE(review): datetime.now() is naive local time. The window query compares
  # it against WindowRateLimitTask.timestamp, whose population is not visible
  # here -- confirm both sides use the same clock (e.g. UTC).
  return datetime.datetime.now()
26+
27+
28+
# Names that commands.py sometimes runs as if they were tasks, but which are
# really just portions (stages) of actual tasks. The rate limiter neither
# records nor limits these.
_UTASK_PSEUDO_TASKS = {'uworker_main', 'postprocess', 'preprocess'}
31+
32+
33+
class TaskRateLimiter:
  """Rate limiter for tasks.

  A task is identified by its (task_name, task_argument, job_name) triple.
  Within TASK_RATE_LIMIT_WINDOW (6 hours) a task is allowed at most
  TASK_RATE_LIMIT_MAX_ERRORS (100) erroneous runs and at most
  TASK_RATE_LIMIT_MAX_COMPLETIONS (2000) recorded runs in total, successful or
  not. Runs are only known to the limiter if record_task is called at the end
  of every task.
  """
  TASK_RATE_LIMIT_WINDOW = datetime.timedelta(hours=6)
  TASK_RATE_LIMIT_MAX_ERRORS = 100
  # TODO(metzman): Reevaluate this number, it's probably too high.
  TASK_RATE_LIMIT_MAX_COMPLETIONS = 2000

  def __init__(self, task_name, task_argument, job_name):
    """Stores the triple that identifies the task being limited."""
    self.task_name = task_name
    self.task_argument = task_argument
    self.job_name = job_name

  def __str__(self) -> str:
    """Returns 'task_name task_argument job_name' for log messages."""
    # Coerce every part: str.join raises TypeError on non-str items, and this
    # is called while logging that a task was rate limited, where a non-string
    # argument or a missing job name must not turn into a crash.
    parts = (self.task_name, self.task_argument, self.job_name)
    return ' '.join(str(part) for part in parts)

  def record_task(self, success: bool) -> None:
    """Records a finished run of this task and whether it succeeded.

    Stores a WindowRateLimitTask entity whose status is TaskState.FINISHED when
    |success| is true and TaskState.ERROR otherwise. Does nothing for uworker
    pseudo tasks.
    """
    if self.task_name in _UTASK_PSEUDO_TASKS:
      # Don't rate limit these fake uworker tasks.
      return
    if success:
      status = data_types.TaskState.FINISHED
    else:
      status = data_types.TaskState.ERROR
    # No timestamp is passed here; the window query in is_rate_limited relies
    # on the model filling it in. NOTE(review): confirm against
    # data_types.WindowRateLimitTask.
    window_task = data_types.WindowRateLimitTask(
        task_name=self.task_name,
        task_argument=self.task_argument,
        job_name=self.job_name,
        status=status)
    window_task.put()

  def is_rate_limited(self) -> bool:
    """Checks whether this task has exceeded its limits in the current window.

    Returns:
      True if more than TASK_RATE_LIMIT_MAX_COMPLETIONS runs, or more than
      TASK_RATE_LIMIT_MAX_ERRORS erroring runs, were recorded for this task
      since the start of the window. False otherwise, and always False for
      uworker pseudo tasks or when COMMAND_OVERRIDE is set in the environment.
    """
    if self.task_name in _UTASK_PSEUDO_TASKS:
      # Don't rate limit these fake tasks.
      return False
    if environment.get_value('COMMAND_OVERRIDE'):
      # A user wants to run this task.
      return False
    window_start = _get_datetime_now() - self.TASK_RATE_LIMIT_WINDOW
    query = data_types.WindowRateLimitTask.query(
        data_types.WindowRateLimitTask.task_name == self.task_name,
        data_types.WindowRateLimitTask.task_argument == self.task_argument,
        data_types.WindowRateLimitTask.job_name == self.job_name,
        data_types.WindowRateLimitTask.timestamp >= window_start)
    tasks = ndb_utils.get_all_from_query(query)
    completed_count = 0
    error_count = 0
    for task in tasks:
      # Limit based on completions. Every recorded run counts here, including
      # runs that errored.
      completed_count += 1
      if completed_count > self.TASK_RATE_LIMIT_MAX_COMPLETIONS:
        logs.warning(
            f'{str(self)} rate limited. '
            f'It ran at least {self.TASK_RATE_LIMIT_MAX_COMPLETIONS} in window.'
        )
        return True

      # Limit based on errors.
      if task.status == data_types.TaskState.ERROR:
        error_count += 1
        if error_count > self.TASK_RATE_LIMIT_MAX_ERRORS:
          logs.warning(
              f'{str(self)} rate limited. '
              f'It errored at least {self.TASK_RATE_LIMIT_MAX_ERRORS} in window.')
          return True

    return False
File renamed without changes.

src/clusterfuzz/_internal/bot/tasks/commands.py

Lines changed: 21 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
from clusterfuzz._internal.base import errors
2323
from clusterfuzz._internal.base import tasks
2424
from clusterfuzz._internal.base import utils
25+
from clusterfuzz._internal.base.tasks import task_rate_limiting
2526
from clusterfuzz._internal.bot.tasks import blame_task
2627
from clusterfuzz._internal.bot.tasks import impact_task
2728
from clusterfuzz._internal.bot.tasks import task_types
@@ -190,12 +191,8 @@ def start_web_server_if_needed():
190191
logs.error('Failed to start web server, skipping.')
191192

192193

193-
def run_command(task_name,
194-
task_argument,
195-
job_name,
196-
uworker_env,
197-
preprocess=False):
198-
"""Run the command."""
194+
def run_command(task_name, task_argument, job_name, uworker_env):
195+
"""Runs the command."""
199196
task = COMMAND_MAP.get(task_name)
200197
if not task:
201198
logs.error("Unknown command '%s'" % task_name)
@@ -211,23 +208,32 @@ def run_command(task_name,
211208
raise AlreadyRunningError
212209

213210
result = None
211+
rate_limiter = task_rate_limiting.TaskRateLimiter(task_name, task_argument,
212+
job_name)
213+
if rate_limiter.is_rate_limited():
214+
logs.error(f'Rate limited task: {task_name} {task_argument} {job_name}')
215+
if task_name == 'fuzz':
216+
# Wait 10 seconds. We don't want to try again immediately because if we
217+
# tried to run a fuzz task then there is no other task to run.
218+
time.sleep(environment.get_value('FAIL_WAIT'))
219+
return None
214220
try:
215-
if not preprocess:
216-
result = task.execute(task_argument, job_name, uworker_env)
217-
else:
218-
result = task.preprocess(task_argument, job_name, uworker_env)
221+
result = task.execute(task_argument, job_name, uworker_env)
219222
except errors.InvalidTestcaseError:
220223
# It is difficult to try to handle the case where a test case is deleted
221224
# during processing. Rather than trying to catch by checking every point
222225
# where a test case is reloaded from the datastore, just abort the task.
223226
logs.warning('Test case %s no longer exists.' % task_argument)
227+
rate_limiter.record_task(success=False)
224228
except BaseException:
225229
# On any other exceptions, update state to reflect error and re-raise.
226230
if should_update_task_status(task_name):
227231
data_handler.update_task_status(task_state_name,
228232
data_types.TaskState.ERROR)
229-
233+
rate_limiter.record_task(success=False)
230234
raise
235+
else:
236+
rate_limiter.record_task(success=True)
231237

232238
# Task completed successfully.
233239
if should_update_task_status(task_name):
@@ -254,12 +260,8 @@ def _get_task_id(task_name, task_argument, job_name):
254260
# pylint: disable=too-many-nested-blocks
255261
# TODO(mbarbella): Rewrite this function to avoid nesting issues.
256262
@set_task_payload
257-
def process_command_impl(task_name,
258-
task_argument,
259-
job_name,
260-
high_end,
261-
is_command_override,
262-
preprocess=False):
263+
def process_command_impl(task_name, task_argument, job_name, high_end,
264+
is_command_override):
263265
"""Implementation of process_command."""
264266
uworker_env = None
265267
environment.set_value('TASK_NAME', task_name)
@@ -320,8 +322,7 @@ def process_command_impl(task_name,
320322
logs.error('Failed to fix platform and re-add task.')
321323

322324
# Add a wait interval to avoid overflowing task creation.
323-
failure_wait_interval = environment.get_value('FAIL_WAIT')
324-
time.sleep(failure_wait_interval)
325+
time.sleep(environment.get_value('FAIL_WAIT'))
325326
return None
326327

327328
if task_name != 'fuzz':
@@ -441,8 +442,7 @@ def process_command_impl(task_name,
441442
start_web_server_if_needed()
442443

443444
try:
444-
return run_command(task_name, task_argument, job_name, uworker_env,
445-
preprocess)
445+
return run_command(task_name, task_argument, job_name, uworker_env)
446446
finally:
447447
# Final clean up.
448448
cleanup_task_state()

src/clusterfuzz/_internal/bot/tasks/setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,9 +22,9 @@
2222

2323
from clusterfuzz._internal.base import dates
2424
from clusterfuzz._internal.base import errors
25-
from clusterfuzz._internal.base import task_utils
2625
from clusterfuzz._internal.base import tasks
2726
from clusterfuzz._internal.base import utils
27+
from clusterfuzz._internal.base.tasks import task_utils
2828
from clusterfuzz._internal.bot import testcase_manager
2929
from clusterfuzz._internal.bot.tasks.utasks import uworker_handle_errors
3030
from clusterfuzz._internal.bot.tasks.utasks import uworker_io

src/clusterfuzz/_internal/bot/tasks/task_creation.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,9 +13,9 @@
1313
# limitations under the License.
1414
"""Common functions for task creation for test cases."""
1515
from clusterfuzz._internal.base import bisection
16-
from clusterfuzz._internal.base import task_utils
1716
from clusterfuzz._internal.base import tasks
1817
from clusterfuzz._internal.base import utils
18+
from clusterfuzz._internal.base.tasks import task_utils
1919
from clusterfuzz._internal.bot.tasks import task_types
2020
from clusterfuzz._internal.build_management import build_manager
2121
from clusterfuzz._internal.datastore import data_handler

src/clusterfuzz/_internal/bot/tasks/task_types.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,8 +15,8 @@
1515
base/tasks.py depends on this module and many things commands.py imports depend
1616
on base/tasks.py (i.e. avoiding circular imports)."""
1717
from clusterfuzz._internal import swarming
18-
from clusterfuzz._internal.base import task_utils
1918
from clusterfuzz._internal.base import tasks
19+
from clusterfuzz._internal.base.tasks import task_utils
2020
from clusterfuzz._internal.bot.tasks import utasks
2121
from clusterfuzz._internal.google_cloud_utils import batch
2222
from clusterfuzz._internal.metrics import logs

src/clusterfuzz/_internal/bot/tasks/utasks/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@
2222
from google.protobuf import timestamp_pb2
2323

2424
from clusterfuzz._internal import swarming
25-
from clusterfuzz._internal.base import task_utils
25+
from clusterfuzz._internal.base.tasks import task_utils
2626
from clusterfuzz._internal.bot.tasks.utasks import uworker_io
2727
from clusterfuzz._internal.bot.webserver import http_server
2828
from clusterfuzz._internal.metrics import logs

src/clusterfuzz/_internal/bot/tasks/utasks/uworker_io.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@
2525
from google.protobuf import timestamp_pb2
2626
import google.protobuf.message
2727

28-
from clusterfuzz._internal.base import task_utils
28+
from clusterfuzz._internal.base.tasks import task_utils
2929
from clusterfuzz._internal.google_cloud_utils import storage
3030
from clusterfuzz._internal.metrics import logs
3131
from clusterfuzz._internal.protos import uworker_msg_pb2

0 commit comments

Comments
 (0)