Commit 0a41390
[tworker] Speed up fuzz_task preprocess (#4422)
Attempt to make the tworkers faster so they can scale up to OSS-Fuzz. Based on profiling, make the following changes:

1. Sign URLs in parallel.
2. Don't do postprocess tasks. Pulling from a queue that yields nothing is relatively expensive.
3. Don't clean up after tasks, since preprocess doesn't change the state of the machine.
4. Don't sign delete URLs for the corpus in fuzz task; we don't delete in fuzz task anyway.
5. Limit the number of corpus upload URLs in fuzz task to the number we will actually use.
6. Memoize to avoid parsing YAML repeatedly.
7. (Unrelated to preprocess) Remove an overly conservative, useless limitation on async downloads.
8. Remove an unneeded call to last_updated.
9. Remove an unnecessary sleep when a task is rate limited. This sleep only makes sense for oss-fuzz hosts.
10. Don't download more than 25k testcases. This limit is probably good in general, but for now it's needed because many oss-fuzz projects appear to have no pruning jobs (maybe because they are in zone2).
1 parent 9eed2e2 commit 0a41390

File tree: 7 files changed (+83, -23 lines)


src/clusterfuzz/_internal/base/concurrency.py

Lines changed: 23 additions & 2 deletions

@@ -21,11 +21,32 @@
 POOL_SIZE = multiprocessing.cpu_count()


+class SingleThreadPool:
+  """Single thread pool for when it's not worth using Python's thread
+  implementation."""
+
+  def __init__(self, size):
+    del size
+
+  def map(self, f, l):
+    return list(map(f, l))
+
+
 @contextlib.contextmanager
-def make_pool(pool_size=POOL_SIZE):
+def make_pool(pool_size=POOL_SIZE, cpu_bound=False, max_pool_size=None):
+  """Returns a pool that can (usually) execute tasks concurrently."""
+  if max_pool_size is not None:
+    pool_size = max(pool_size, max_pool_size)
+
   # Don't use processes on Windows and unittests to avoid hangs.
   if (environment.get_value('PY_UNITTESTS') or
       environment.platform() == 'WINDOWS'):
-    yield futures.ThreadPoolExecutor(pool_size)
+    if cpu_bound:
+      yield SingleThreadPool(pool_size)
+    else:
+      yield futures.ThreadPoolExecutor(pool_size)
   else:
     yield futures.ProcessPoolExecutor(pool_size)
+
+
+# TODO(metzman): Find out if batching makes things even faster.
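A hedged usage sketch of the new API (the toy square function is ours, not ClusterFuzz's): with cpu_bound=True on a platform restricted to threads, GIL-bound work runs on SingleThreadPool rather than paying ThreadPoolExecutor overhead for no real parallelism.

from clusterfuzz._internal.base import concurrency

def square(x):
  return x * x

if __name__ == '__main__':
  # SingleThreadPool on Windows/unittests; elsewhere a ProcessPoolExecutor,
  # which does give real parallelism for CPU-bound work like URL signing.
  with concurrency.make_pool(cpu_bound=True, max_pool_size=2) as pool:
    print(list(pool.map(square, range(8))))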

src/clusterfuzz/_internal/base/tasks/__init__.py

Lines changed: 5 additions & 4 deletions

@@ -317,10 +317,11 @@ def get_preprocess_task():

 def tworker_get_task():
   assert environment.is_tworker()
-  task = get_postprocess_task()
-  if task:
-    return task
-
+  # TODO(metzman): Pulling tasks is relatively expensive compared to
+  # preprocessing. It's too expensive to pull twice (once from the postprocess
+  # queue that is probably empty) to do a single preprocess. Investigate
+  # combining preprocess and postprocess queues and allowing pulling of
+  # multiple messages.
   return get_preprocess_task()

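The TODO above points at batched pulls. A hedged sketch of that direction, assuming the task queues are Pub/Sub subscriptions (the project and subscription names here are hypothetical): one pull RPC can lease several messages, amortizing the per-pull round trip that made checking the usually empty postprocess queue expensive.

from google.cloud import pubsub_v1

def lease_tasks(project_id: str, subscription_name: str, count: int = 10):
  # Hypothetical helper, not ClusterFuzz code.
  subscriber = pubsub_v1.SubscriberClient()
  path = subscriber.subscription_path(project_id, subscription_name)
  # A single round trip may return up to `count` messages (often fewer).
  response = subscriber.pull(subscription=path, max_messages=count)
  return response.received_messages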

src/clusterfuzz/_internal/bot/tasks/commands.py

Lines changed: 4 additions & 2 deletions

@@ -233,7 +233,8 @@ def run_command(task_name, task_argument, job_name, uworker_env):
       'argument': task_argument,
   })
   logs.error(f'Rate limited task: {task_name} {task_argument} {job_name}')
-  if task_name == 'fuzz':
+  if task_name == 'fuzz' and not environment.is_tworker():
+    # TODO(b/377885331): Get rid of this when oss-fuzz is migrated.
     # Wait 10 seconds. We don't want to try again immediately because if we
     # tried to run a fuzz task then there is no other task to run.
     time.sleep(environment.get_value('FAIL_WAIT'))

@@ -467,6 +468,7 @@ def process_command_impl(task_name, task_argument, job_name, high_end,
     return run_command(task_name, task_argument, job_name, uworker_env)
   finally:
     # Final clean up.
-    cleanup_task_state()
+    if not environment.is_tworker():
+      cleanup_task_state()
     if 'CF_TASK_ID' in os.environ:
       del os.environ['CF_TASK_ID']
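A simplified sketch of why the sleep is host-only (this loop is illustrative, not ClusterFuzz's actual scheduler): a host bot that is rate limited on fuzz has nothing else to run, so backing off prevents a hot retry loop, while a tworker's queue holds plenty of other work to move on to.

import time

def run_loop(get_task, run_task, is_rate_limited, is_tworker, fail_wait=10.0):
  # All callables here are hypothetical stand-ins.
  while True:
    task = get_task()
    if is_rate_limited(task):
      if task.name == 'fuzz' and not is_tworker:
        # Nothing else to do on a host; avoid spinning on the same task.
        time.sleep(fail_wait)
      continue  # A tworker just pulls the next (different) task.
    run_task(task)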

src/clusterfuzz/_internal/bot/tasks/utasks/fuzz_task.py

Lines changed: 16 additions & 3 deletions

@@ -127,6 +127,15 @@ def get(self) -> uworker_msg_pb2.BlobUploadUrl:
     return url


+def _get_max_testcases() -> int:
+  return environment.get_value('MAX_TESTCASES', 1)
+
+
+def _get_max_corpus_uploads_per_task():
+  number_of_fuzzer_runs = _get_max_testcases()
+  return MAX_NEW_CORPUS_FILES * number_of_fuzzer_runs
+
+
 class Crash:
   """Represents a crash (before creating a testcase)."""

@@ -1497,7 +1506,7 @@ def do_engine_fuzzing(self, engine_impl):

     self.fuzz_task_output.app_revision = environment.get_value('APP_REVISION')
     # Do the actual fuzzing.
-    for fuzzing_round in range(environment.get_value('MAX_TESTCASES', 1)):
+    for fuzzing_round in range(_get_max_testcases()):
       logs.info(f'Fuzzing round {fuzzing_round}.')
       try:
         with _TrackFuzzTime(self.fully_qualified_fuzzer_name,

@@ -1572,7 +1581,7 @@ def do_blackbox_fuzzing(self, fuzzer, fuzzer_directory, job_type):
     thread_timeout = test_timeout

     # Determine number of testcases to process.
-    testcase_count = environment.get_value('MAX_TESTCASES')
+    testcase_count = _get_max_testcases()

     # For timeout multiplier greater than 1, we need to decrease testcase count
     # to prevent exceeding task lease time.

@@ -2023,7 +2032,11 @@ def utask_preprocess(fuzzer_name, job_type, uworker_env):
       uworker_io.entity_to_protobuf(fuzz_target))
   fuzz_task_input.corpus.CopyFrom(
       corpus_manager.get_fuzz_target_corpus(
-          fuzzer_name, fuzz_target.project_qualified_name()).serialize())
+          fuzzer_name,
+          fuzz_target.project_qualified_name(),
+          include_delete_urls=False,
+          max_upload_urls=_get_max_corpus_uploads_per_task(),
+          max_download_urls=25000).serialize())

   for _ in range(MAX_CRASHES_UPLOADED):
     url = fuzz_task_input.crash_upload_urls.add()
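A worked example of the new upload budget (the MAX_NEW_CORPUS_FILES value below is made up; the real constant lives in fuzz_task.py): each fuzzing round can add at most MAX_NEW_CORPUS_FILES corpus files and a task runs MAX_TESTCASES rounds, so their product bounds how many signed upload URLs the task can actually consume.

MAX_NEW_CORPUS_FILES = 1000  # Hypothetical value for illustration.

def max_corpus_uploads_per_task(max_testcases: int) -> int:
  # Mirrors _get_max_corpus_uploads_per_task(): rounds * files-per-round.
  return MAX_NEW_CORPUS_FILES * max_testcases

# With MAX_TESTCASES=2, preprocess signs 2000 upload URLs instead of
# falling back to get_fuzz_target_corpus's flat default of 10000.
print(max_corpus_uploads_per_task(2))  # -> 2000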

src/clusterfuzz/_internal/fuzzing/corpus_manager.py

Lines changed: 21 additions & 5 deletions

@@ -13,6 +13,7 @@
 # limitations under the License.
 """Functions for corpus synchronization with GCS."""

+import itertools
 import os
 import re
 import shutil

@@ -641,15 +642,27 @@ def sync_data_bundle_corpus_to_disk(data_bundle_corpus, directory):
   return len(fails) < MAX_SYNC_ERRORS


+def _last_updated(*args, **kwargs):
+  if environment.is_tworker():
+    return None
+  return storage.last_updated(*args, **kwargs)
+
+
 def get_proto_corpus(bucket_name,
                      bucket_path,
                      max_upload_urls,
-                     include_delete_urls=False):
+                     include_delete_urls=False,
+                     max_download_urls=None):
   """Returns a proto representation of a corpus."""
   gcs_url = _get_gcs_url(bucket_name, bucket_path)
   # TODO(metzman): Allow this step to be skipped by trusted fuzzers.
   urls = (f'{storage.GS_PREFIX}/{bucket_name}/{url}'
           for url in storage.list_blobs(gcs_url))
+
+  if max_download_urls is not None:
+    urls = itertools.islice(urls, max_download_urls)
+    # TODO(metzman): Stop limiting URLs when pruning works on oss-fuzz
+    # again.
   corpus_urls = dict(
       storage.sign_urls_for_existing_files(urls, include_delete_urls))

@@ -660,7 +673,7 @@
       upload_urls=upload_urls,
       gcs_url=gcs_url,
   )
-  last_updated = storage.last_updated(_get_gcs_url(bucket_name, bucket_path))
+  last_updated = _last_updated(_get_gcs_url(bucket_name, bucket_path))
   if last_updated:
     timestamp = timestamp_pb2.Timestamp()  # pylint: disable=no-member
     timestamp.FromDatetime(last_updated)

@@ -688,7 +701,8 @@ def get_fuzz_target_corpus(engine,
                            quarantine=False,
                            include_regressions=False,
                            include_delete_urls=False,
-                           max_upload_urls=10000):
+                           max_upload_urls=10000,
+                           max_download_urls=None):
   """Copies the corpus from gcs to disk. Can run on uworker."""
   fuzz_target_corpus = uworker_msg_pb2.FuzzTargetCorpus()  # pylint: disable=no-member
   bucket_name, bucket_path = get_target_bucket_and_path(

@@ -697,7 +711,8 @@
       bucket_name,
       bucket_path,
       include_delete_urls=include_delete_urls,
-      max_upload_urls=max_upload_urls)
+      max_upload_urls=max_upload_urls,
+      max_download_urls=max_download_urls)
   fuzz_target_corpus.corpus.CopyFrom(corpus)

   assert not (include_regressions and quarantine)

@@ -707,7 +722,8 @@
         bucket_name,
         regressions_bucket_path,
         max_upload_urls=0,  # This is never uploaded to using this mechanism.
-        include_delete_urls=False)  # This is never deleted from.
+        include_delete_urls=False,  # This is never deleted from.
+        max_download_urls=max_download_urls)
   fuzz_target_corpus.regressions_corpus.CopyFrom(regressions_corpus)

   return ProtoFuzzTargetCorpus(engine, project_qualified_target_name,
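A self-contained sketch of the islice trick (the fake blob lister is ours): storage.list_blobs yields lazily, so itertools.islice caps the download URLs without ever materializing the full listing of a huge corpus.

import itertools

def fake_list_blobs():
  # Stand-in for storage.list_blobs() over a very large corpus bucket.
  i = 0
  while True:
    yield f'corpus/testcase-{i:08d}'
    i += 1

urls = (f'gs://my-bucket/{name}' for name in fake_list_blobs())
urls = itertools.islice(urls, 25000)  # At most 25k download URLs signed.
print(sum(1 for _ in urls))  # -> 25000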

src/clusterfuzz/_internal/google_cloud_utils/storage.py

Lines changed: 14 additions & 5 deletions

@@ -1135,6 +1135,7 @@ def get_object_size(cloud_storage_file_path):
   return int(gcs_object['size'])


+@memoize.wrap(memoize.FifoInMemory(1))
 def blobs_bucket():
   """Get the blobs bucket name."""
   # Allow tests to override blobs bucket name safely.

@@ -1351,18 +1352,24 @@ def _sign_urls_for_existing_file(
   return (download_url, delete_url)


+def _mappable_sign_urls_for_existing_file(url_and_include_delete_urls):
+  url, include_delete_urls = url_and_include_delete_urls
+  return _sign_urls_for_existing_file(url, include_delete_urls)
+
+
 def sign_urls_for_existing_files(urls,
                                  include_delete_urls) -> List[Tuple[str, str]]:
   logs.info('Signing URLs for existing files.')
-  result = [
-      _sign_urls_for_existing_file(url, include_delete_urls) for url in urls
-  ]
+  args = ((url, include_delete_urls) for url in urls)
+  with concurrency.make_pool(cpu_bound=True, max_pool_size=2) as pool:
+    result = pool.map(_mappable_sign_urls_for_existing_file, args)
   logs.info('Done signing URLs for existing files.')
   return result


 def get_arbitrary_signed_upload_url(remote_directory):
-  return get_arbitrary_signed_upload_urls(remote_directory, num_uploads=1)[0]
+  return list(
+      get_arbitrary_signed_upload_urls(remote_directory, num_uploads=1))[0]

@@ -1390,6 +1397,8 @@ def get_arbitrary_signed_upload_urls(remote_directory: str,

   urls = (f'{base_path}-{idx}' for idx in range(num_uploads))
   logs.info('Signing URLs for arbitrary uploads.')
-  result = [get_signed_upload_url(url) for url in urls]
+  with concurrency.make_pool(
+      _POOL_SIZE, cpu_bound=True, max_pool_size=2) as pool:
+    result = list(pool.map(get_signed_upload_url, urls))
   logs.info('Done signing URLs for arbitrary uploads.')
   return result
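A toy re-implementation of what the new memoization buys (functools.lru_cache stands in here for ClusterFuzz's memoize.FifoInMemory(1)): blobs_bucket() resolves its answer from parsed project config, so caching the single result avoids re-reading YAML on every signed-URL operation.

import functools

@functools.lru_cache(maxsize=1)
def blobs_bucket_cached():
  # Stand-in for the config/YAML lookup the real blobs_bucket() performs.
  print('parsing config...')  # Printed once; later calls hit the cache.
  return 'my-blobs-bucket'

blobs_bucket_cached()  # Parses config.
blobs_bucket_cached()  # Served from the cache.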

src/clusterfuzz/_internal/system/fast_http.py

Lines changed: 0 additions & 2 deletions

@@ -36,8 +36,6 @@ def download_urls(urls_and_filepaths: List[Tuple[str, str]]) -> List[bool]:
   batch_size = len(urls_and_filepaths) // concurrency.POOL_SIZE
   # Avoid issues with range when urls is less than _POOL_SIZE.
   batch_size = max(batch_size, len(urls_and_filepaths))
-  # Avoid OOMs by limiting the amount of concurrent downloads.
-  batch_size = min(5, batch_size)

   for idx in range(0, len(urls_and_filepaths), batch_size):
     batch = urls_and_filepaths[idx:idx + batch_size]
