Add rate limiting (#4365)

jonathanmetzman · web-flow · commit c26e353f2307 · 2024-11-07T07:43:06.000-05:00
Add rate limiting for tasks.
Don't allow a task to be run too many times within a 6 hour window,
especially if it's erroring.
Also minor refactor of commands.py
diff --git a/src/appengine/handlers/upload_testcase.py b/src/appengine/handlers/upload_testcase.py
@@ -26,9 +26,9 @@
 from clusterfuzz._internal import fuzzing
 from clusterfuzz._internal.base import external_users
 from clusterfuzz._internal.base import memoize
-from clusterfuzz._internal.base import task_utils
 from clusterfuzz._internal.base import tasks
 from clusterfuzz._internal.base import utils
+from clusterfuzz._internal.base.tasks import task_utils
 from clusterfuzz._internal.crash_analysis.stack_parsing import stack_analyzer
 from clusterfuzz._internal.datastore import data_handler
 from clusterfuzz._internal.datastore import data_types
diff --git a/src/clusterfuzz/_internal/base/tasks/__init__.py b/src/clusterfuzz/_internal/base/tasks/__init__.py
@@ -24,8 +24,8 @@
 
 from clusterfuzz._internal.base import external_tasks
 from clusterfuzz._internal.base import persistent_cache
-from clusterfuzz._internal.base import task_utils
 from clusterfuzz._internal.base import utils
+from clusterfuzz._internal.base.tasks import task_utils
 from clusterfuzz._internal.config import local_config
 from clusterfuzz._internal.datastore import data_types
 from clusterfuzz._internal.datastore import ndb_utils
diff --git a/src/clusterfuzz/_internal/base/tasks/task_rate_limiting.py b/src/clusterfuzz/_internal/base/tasks/task_rate_limiting.py
@@ -0,0 +1,102 @@
+# Copyright 2024 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Task rate limiting."""
+
+import datetime
+
+from clusterfuzz._internal.datastore import data_types
+from clusterfuzz._internal.datastore import ndb_utils
+from clusterfuzz._internal.metrics import logs
+from clusterfuzz._internal.system import environment
+
+
+def _get_datetime_now():
+  return datetime.datetime.now()
+
+
+# Things that are sometimes run as tasks by commands.py but are really portions
+# of actual tasks.
+_UTASK_PSEUDO_TASKS = {'uworker_main', 'postprocess', 'preprocess'}
+
+
+class TaskRateLimiter:
+  """Rate limiter for tasks. This limits tasks to 100 erroneous runs or 2000
+  succesful runs in 6 hours. It keeps track of task completion when record_task
+  is called at the end of every task."""
+  TASK_RATE_LIMIT_WINDOW = datetime.timedelta(hours=6)
+  TASK_RATE_LIMIT_MAX_ERRORS = 100
+  # TODO(metzman): Reevaluate this number, it's probably too high.
+  TASK_RATE_LIMIT_MAX_COMPLETIONS = 2000
+
+  def __init__(self, task_name, task_argument, job_name):
+    self.task_name = task_name
+    self.task_argument = task_argument
+    self.job_name = job_name
+
+  def __str__(self):
+    return ' '.join([self.task_name, self.task_argument, self.job_name])
+
+  def record_task(self, success: bool) -> None:
+    """Records a task and whether it completed succesfully."""
+    if self.task_name in _UTASK_PSEUDO_TASKS:
+      # Don't rate limit these fake uworker tasks.
+      return
+    if success:
+      status = data_types.TaskState.FINISHED
+    else:
+      status = data_types.TaskState.ERROR
+    window_task = data_types.WindowRateLimitTask(
+        task_name=self.task_name,
+        task_argument=self.task_argument,
+        job_name=self.job_name,
+        status=status)
+    window_task.put()
+
+  def is_rate_limited(self) -> bool:
+    """Checks if the given task is rate limited."""
+    if self.task_name in _UTASK_PSEUDO_TASKS:
+      # Don't rate limit these fake tasks.
+      return False
+    if environment.get_value('COMMAND_OVERRIDE'):
+      # A user wants to run this task.
+      return False
+    window_start = _get_datetime_now() - self.TASK_RATE_LIMIT_WINDOW
+    query = data_types.WindowRateLimitTask.query(
+        data_types.WindowRateLimitTask.task_name == self.task_name,
+        data_types.WindowRateLimitTask.task_argument == self.task_argument,
+        data_types.WindowRateLimitTask.job_name == self.job_name,
+        data_types.WindowRateLimitTask.timestamp >= window_start)
+    tasks = ndb_utils.get_all_from_query(query)
+    completed_count = 0
+    error_count = 0
+    for task in tasks:
+      # Limit based on completions.
+      completed_count += 1
+      if completed_count > self.TASK_RATE_LIMIT_MAX_COMPLETIONS:
+        logs.warning(
+            f'{str(self)} rate limited. '
+            f'It ran at least {self.TASK_RATE_LIMIT_MAX_COMPLETIONS} in window.'
+        )
+        return True
+
+      # Limit based on errors.
+      if task.status == data_types.TaskState.ERROR:
+        error_count += 1
+      if error_count > self.TASK_RATE_LIMIT_MAX_ERRORS:
+        logs.warning(
+            f'{str(self)} rate limited. '
+            f'It errored at least {self.TASK_RATE_LIMIT_MAX_ERRORS} in window.')
+        return True
+
+    return False
diff --git a/src/clusterfuzz/_internal/base/tasks/task_utils.py b/src/clusterfuzz/_internal/base/tasks/task_utils.py
diff --git a/src/clusterfuzz/_internal/bot/tasks/commands.py b/src/clusterfuzz/_internal/bot/tasks/commands.py
@@ -22,6 +22,7 @@
 from clusterfuzz._internal.base import errors
 from clusterfuzz._internal.base import tasks
 from clusterfuzz._internal.base import utils
+from clusterfuzz._internal.base.tasks import task_rate_limiting
 from clusterfuzz._internal.bot.tasks import blame_task
 from clusterfuzz._internal.bot.tasks import impact_task
 from clusterfuzz._internal.bot.tasks import task_types
@@ -190,12 +191,8 @@ def start_web_server_if_needed():
     logs.error('Failed to start web server, skipping.')
 
 
-def run_command(task_name,
-                task_argument,
-                job_name,
-                uworker_env,
-                preprocess=False):
-  """Run the command."""
+def run_command(task_name, task_argument, job_name, uworker_env):
+  """Runs the command."""
   task = COMMAND_MAP.get(task_name)
   if not task:
     logs.error("Unknown command '%s'" % task_name)
@@ -211,23 +208,32 @@ def run_command(task_name,
       raise AlreadyRunningError
 
   result = None
+  rate_limiter = task_rate_limiting.TaskRateLimiter(task_name, task_argument,
+                                                    job_name)
+  if rate_limiter.is_rate_limited():
+    logs.error(f'Rate limited task: {task_name} {task_argument} {job_name}')
+    if task_name == 'fuzz':
+      # Wait 10 seconds. We don't want to try again immediately because if we
+      # tried to run a fuzz task then there is no other task to run.
+      time.sleep(environment.get_value('FAIL_WAIT'))
+    return None
   try:
-    if not preprocess:
-      result = task.execute(task_argument, job_name, uworker_env)
-    else:
-      result = task.preprocess(task_argument, job_name, uworker_env)
+    result = task.execute(task_argument, job_name, uworker_env)
   except errors.InvalidTestcaseError:
     # It is difficult to try to handle the case where a test case is deleted
     # during processing. Rather than trying to catch by checking every point
     # where a test case is reloaded from the datastore, just abort the task.
     logs.warning('Test case %s no longer exists.' % task_argument)
+    rate_limiter.record_task(success=False)
   except BaseException:
     # On any other exceptions, update state to reflect error and re-raise.
     if should_update_task_status(task_name):
       data_handler.update_task_status(task_state_name,
                                       data_types.TaskState.ERROR)
-
+      rate_limiter.record_task(success=False)
     raise
+  else:
+    rate_limiter.record_task(success=True)
 
   # Task completed successfully.
   if should_update_task_status(task_name):
@@ -254,12 +260,8 @@ def _get_task_id(task_name, task_argument, job_name):
 # pylint: disable=too-many-nested-blocks
 # TODO(mbarbella): Rewrite this function to avoid nesting issues.
 @set_task_payload
-def process_command_impl(task_name,
-                         task_argument,
-                         job_name,
-                         high_end,
-                         is_command_override,
-                         preprocess=False):
+def process_command_impl(task_name, task_argument, job_name, high_end,
+                         is_command_override):
   """Implementation of process_command."""
   uworker_env = None
   environment.set_value('TASK_NAME', task_name)
@@ -320,8 +322,7 @@ def process_command_impl(task_name,
           logs.error('Failed to fix platform and re-add task.')
 
       # Add a wait interval to avoid overflowing task creation.
-      failure_wait_interval = environment.get_value('FAIL_WAIT')
-      time.sleep(failure_wait_interval)
+      time.sleep(environment.get_value('FAIL_WAIT'))
       return None
 
     if task_name != 'fuzz':
@@ -441,8 +442,7 @@ def process_command_impl(task_name,
   start_web_server_if_needed()
 
   try:
-    return run_command(task_name, task_argument, job_name, uworker_env,
-                       preprocess)
+    return run_command(task_name, task_argument, job_name, uworker_env)
   finally:
     # Final clean up.
     cleanup_task_state()
diff --git a/src/clusterfuzz/_internal/bot/tasks/setup.py b/src/clusterfuzz/_internal/bot/tasks/setup.py
@@ -22,9 +22,9 @@
 
 from clusterfuzz._internal.base import dates
 from clusterfuzz._internal.base import errors
-from clusterfuzz._internal.base import task_utils
 from clusterfuzz._internal.base import tasks
 from clusterfuzz._internal.base import utils
+from clusterfuzz._internal.base.tasks import task_utils
 from clusterfuzz._internal.bot import testcase_manager
 from clusterfuzz._internal.bot.tasks.utasks import uworker_handle_errors
 from clusterfuzz._internal.bot.tasks.utasks import uworker_io
diff --git a/src/clusterfuzz/_internal/bot/tasks/task_creation.py b/src/clusterfuzz/_internal/bot/tasks/task_creation.py
@@ -13,9 +13,9 @@
 # limitations under the License.
 """Common functions for task creation for test cases."""
 from clusterfuzz._internal.base import bisection
-from clusterfuzz._internal.base import task_utils
 from clusterfuzz._internal.base import tasks
 from clusterfuzz._internal.base import utils
+from clusterfuzz._internal.base.tasks import task_utils
 from clusterfuzz._internal.bot.tasks import task_types
 from clusterfuzz._internal.build_management import build_manager
 from clusterfuzz._internal.datastore import data_handler
diff --git a/src/clusterfuzz/_internal/bot/tasks/task_types.py b/src/clusterfuzz/_internal/bot/tasks/task_types.py
@@ -15,8 +15,8 @@
 base/tasks.py depends on this module and many things commands.py imports depend
 on base/tasks.py (i.e. avoiding circular imports)."""
 from clusterfuzz._internal import swarming
-from clusterfuzz._internal.base import task_utils
 from clusterfuzz._internal.base import tasks
+from clusterfuzz._internal.base.tasks import task_utils
 from clusterfuzz._internal.bot.tasks import utasks
 from clusterfuzz._internal.google_cloud_utils import batch
 from clusterfuzz._internal.metrics import logs
diff --git a/src/clusterfuzz/_internal/bot/tasks/utasks/__init__.py b/src/clusterfuzz/_internal/bot/tasks/utasks/__init__.py
@@ -22,7 +22,7 @@
 from google.protobuf import timestamp_pb2
 
 from clusterfuzz._internal import swarming
-from clusterfuzz._internal.base import task_utils
+from clusterfuzz._internal.base.tasks import task_utils
 from clusterfuzz._internal.bot.tasks.utasks import uworker_io
 from clusterfuzz._internal.bot.webserver import http_server
 from clusterfuzz._internal.metrics import logs
diff --git a/src/clusterfuzz/_internal/bot/tasks/utasks/uworker_io.py b/src/clusterfuzz/_internal/bot/tasks/utasks/uworker_io.py
@@ -25,7 +25,7 @@
 from google.protobuf import timestamp_pb2
 import google.protobuf.message
 
-from clusterfuzz._internal.base import task_utils
+from clusterfuzz._internal.base.tasks import task_utils
 from clusterfuzz._internal.google_cloud_utils import storage
 from clusterfuzz._internal.metrics import logs
 from clusterfuzz._internal.protos import uworker_msg_pb2
diff --git a/src/clusterfuzz/_internal/datastore/data_types.py b/src/clusterfuzz/_internal/datastore/data_types.py
@@ -1114,6 +1114,19 @@ class TaskStatus(Model):
   time = ndb.DateTimeProperty()
 
 
+class WindowRateLimitTask(Model):
+  """Records the completion of a task. This cannot be part of TaskStatus because
+  it will have a different lifecycle (it's not needed after the window
+  completes). This should have a TTL as TASK_RATE_LIMIT_WINDOW in
+  task_rate_limiting.py (6 hours)."""
+  # TODO(metzman): Consider using task_id.
+  timestamp = ndb.DateTimeProperty(auto_now_add=True, indexed=True)
+  task_name = ndb.StringProperty(indexed=True)
+  task_argument = ndb.StringProperty(indexed=True)
+  job_name = ndb.StringProperty(indexed=True)
+  status = ndb.StringProperty(choices=[TaskState.ERROR, TaskState.FINISHED])
+
+
 class BuildMetadata(Model):
   """Metadata associated with a particular archived build."""
   # Job type that this build belongs to.
diff --git a/src/clusterfuzz/_internal/google_cloud_utils/batch.py b/src/clusterfuzz/_internal/google_cloud_utils/batch.py
@@ -21,8 +21,8 @@
 from google.cloud import batch_v1 as batch
 
 from clusterfuzz._internal.base import retry
-from clusterfuzz._internal.base import task_utils
 from clusterfuzz._internal.base import utils
+from clusterfuzz._internal.base.tasks import task_utils
 from clusterfuzz._internal.config import local_config
 from clusterfuzz._internal.datastore import data_types
 from clusterfuzz._internal.metrics import logs
diff --git a/src/clusterfuzz/_internal/tests/core/base/tasks/__init__.py b/src/clusterfuzz/_internal/tests/core/base/tasks/__init__.py
@@ -0,0 +1,13 @@
+# Copyright 2024 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/src/clusterfuzz/_internal/tests/core/base/tasks/task_rate_limiting_test.py b/src/clusterfuzz/_internal/tests/core/base/tasks/task_rate_limiting_test.py
diff --git a/src/clusterfuzz/_internal/tests/core/base/tasks/task_utils_test.py b/src/clusterfuzz/_internal/tests/core/base/tasks/task_utils_test.py
diff --git a/src/clusterfuzz/_internal/tests/core/base/tasks/tasks_test.py b/src/clusterfuzz/_internal/tests/core/base/tasks/tasks_test.py
diff --git a/src/clusterfuzz/_internal/tests/core/bot/tasks/task_types_test.py b/src/clusterfuzz/_internal/tests/core/bot/tasks/task_types_test.py
diff --git a/src/python/bot/startup/run_bot.py b/src/python/bot/startup/run_bot.py