Refactor base_os_version Logic for Task Scheduling Performance (#5031)

hunsche · web-flow · commit 922f21fe7267 · 2025-11-17T15:20:37.000-03:00
## Motivation

The previous implementation for determining the `base_os_version` for
new tasks introduced a significant performance bottleneck. The
`add_task` function (and its underlying `bulk_add_tasks` wrapper)
queried the Datastore for the `Job` and `OssFuzzProject` entities *for
each individual task* being created.

In high-throughput scenarios, such as the `schedule_fuzz.py` cron job
which can schedule upwards of 300,000 tasks at once, this behavior
results in at least one Datastore query per task (a `Job` lookup, plus
an `OssFuzzProject` lookup when `utils.is_oss_fuzz()` is true). This
per-task query pattern, akin to the "N+1" query problem, leads to
extreme slowness, high operational costs, and a significant risk of
timeouts and failed task creation.

An alternative approach using a single batch query with an `IN` clause
was considered. However, this is also not scalable for a very large
number of entities and could hit Datastore limits or result in an
unacceptably slow query.

This PR refactors the logic to be far more efficient and scalable.

## Solution

The core idea of this change is to move the responsibility of
determining the `base_os_version` to the point where the necessary
information is already available, thus eliminating redundant Datastore
lookups.

1.  **Logic moved to `schedule_fuzz.py`:**
The `schedule_fuzz.py` cron job already queries for all `Job` and
`OssFuzzProject` entities to perform its scheduling calculations. We now
leverage these in-memory entities to determine the correct
`base_os_version` *before* the `Task` object is created.

2.  **`base_os_version` Precedence:**
The logic for selecting the OS version is now explicitly handled within
the schedulers (`OssfuzzFuzzTaskScheduler` and
`ChromeFuzzTaskScheduler`) with the following precedence:
    - Use `OssFuzzProject.base_os_version` if it exists
      (`OssfuzzFuzzTaskScheduler` only; `ChromeFuzzTaskScheduler` does
      not consult projects and starts at the next step).
    - Otherwise, use `Job.base_os_version` if it exists.
    - Otherwise, the value is `None`.

3.  **Simplified Task Creation:**
The determined `base_os_version` is passed directly into the `Task`
constructor via the `extra_info` dictionary (the scheduler always sets
the `base_os_version` key, with a value of `None` when no version
applies). This makes the `bulk_add_tasks` function in
`tasks/__init__.py` "dumb" in this regard; it no longer performs any
Datastore queries for this purpose and simply publishes the tasks it is
given. `add_task`, which creates a single task, still resolves the
version itself.

4.  **Reverting to `add_task`:**
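The single-task lookup has been consolidated back into the `add_task`
function, and the per-task lookup loop has been removed from
`bulk_add_tasks` (the function itself remains) to simplify the call
chain. `add_task` now resolves `base_os_version` for its one task (a
`Job` query, plus an `OssFuzzProject` query when `utils.is_oss_fuzz()`
is true), stores it in `extra_info` only when a version is set, and
continues to use `job.is_external()` for dispatching to
`external_tasks`.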

## Benefits

* **Drastic Performance Improvement:** Reduces the number of per-task
Datastore queries during the scheduling of fuzz tasks from potentially
hundreds of thousands to zero; the scheduler reuses the `Job` and
`OssFuzzProject` entities it already loads in bulk.
* **Enhanced Scalability:** The system can now schedule extremely large
batches of tasks efficiently without overwhelming the Datastore or
risking timeouts.
* **Improved Code Cohesion:** The logic for determining task properties
now resides within the scheduler, where the necessary context is already
present. This makes the `bulk_add_tasks` function simpler and more
focused on its core responsibility of publishing tasks.
diff --git a/src/clusterfuzz/_internal/base/tasks/__init__.py b/src/clusterfuzz/_internal/base/tasks/__init__.py
@@ -809,23 +809,6 @@ def bulk_add_tasks(tasks, queue=None, eta_now=False):
     for task in tasks:
       task.eta = now
 
-  for task in tasks:
-    # Determine base_os_version.
-    job = data_types.Job.query(data_types.Job.name == task.job).get()
-    if not job:
-      logs.warning(f"Job {task.job} not found for bulk task.", task=task)
-      continue
-
-    task.extra_info = task.extra_info or {}
-    if job.base_os_version:
-      task.extra_info['base_os_version'] = job.base_os_version
-
-    if utils.is_oss_fuzz():
-      oss_fuzz_project = data_types.OssFuzzProject.query(
-          data_types.OssFuzzProject.name == job.project).get()
-      if oss_fuzz_project and oss_fuzz_project.base_os_version:
-        task.extra_info['base_os_version'] = oss_fuzz_project.base_os_version
-
   pubsub_client = pubsub.PubSubClient()
   pubsub_messages = [task.to_pubsub_message() for task in tasks]
   topic_name = pubsub.topic_name(utils.get_application_id(), queue)
@@ -843,17 +826,32 @@ def add_task(command,
   if wait_time is None:
     wait_time = random.randint(1, TASK_CREATION_WAIT_INTERVAL)
 
+  base_os_version = None
   if job_type != 'none':
     job = data_types.Job.query(data_types.Job.name == job_type).get()
     if not job:
       raise Error(f'Job {job_type} not found.')
 
+    if utils.is_oss_fuzz():
+      project = data_types.OssFuzzProject.query(
+          data_types.OssFuzzProject.name == job.project).get()
+      if project and project.base_os_version:
+        base_os_version = project.base_os_version
+      elif job.base_os_version:
+        base_os_version = job.base_os_version
+    else:
+      if job.base_os_version:
+        base_os_version = job.base_os_version
+
     if job.is_external():
       external_tasks.add_external_task(command, argument, job)
       return
 
   # Add the task.
   eta = utils.utcnow() + datetime.timedelta(seconds=wait_time)
+  extra_info = extra_info or {}
+  if base_os_version:
+    extra_info['base_os_version'] = base_os_version
   task = Task(command, argument, job_type, eta=eta, extra_info=extra_info)
 
   bulk_add_tasks([task], queue=queue)
diff --git a/src/clusterfuzz/_internal/cron/schedule_fuzz.py b/src/clusterfuzz/_internal/cron/schedule_fuzz.py
@@ -135,18 +135,25 @@ class FuzzTaskCandidate:
   Something like this would probably not be needed if we were using SQL and
   could use joins."""
 
-  def __init__(self, job, project, fuzzer=None, weight=None):
+  def __init__(self,
+               job,
+               project,
+               fuzzer=None,
+               weight=None,
+               base_os_version=None):
     self.job = job
     self.project = project
     self.fuzzer = fuzzer
     self.weight = weight
+    self.base_os_version = base_os_version
 
   def copy(self):
     return FuzzTaskCandidate(
         job=self.job,
         project=self.project,
         fuzzer=self.fuzzer,
-        weight=self.weight)
+        weight=self.weight,
+        base_os_version=self.base_os_version)
 
 
 class OssfuzzFuzzTaskScheduler(BaseFuzzTaskScheduler):
@@ -166,13 +173,22 @@ def get_fuzz_tasks(self) -> Dict[str, tasks.Task]:
       project_weight = project.cpu_weight / total_cpu_weight
       project_weights[project.name] = project_weight
 
+    projects_by_name = {project.name: project for project in projects}
+
     # Then get FuzzTaskCandidate weights.
     logs.info('Getting jobs.')
     # TODO(metzman): Handle cases where jobs are fuzzed by multiple fuzzers.
     candidates_by_job = {}
     for job in ndb_utils.get_all_from_query(data_types.Job.query()):
+      project = projects_by_name.get(job.project)
+      base_os_version = None
+      if project and project.base_os_version:
+        base_os_version = project.base_os_version
+      elif job.base_os_version:
+        base_os_version = job.base_os_version
+
       candidates_by_job[job.name] = FuzzTaskCandidate(
-          job=job.name, project=job.project)
+          job=job.name, project=job.project, base_os_version=base_os_version)
 
     fuzzer_job_weight_by_project = collections.defaultdict(int)
     fuzz_task_candidates = []
@@ -213,7 +229,11 @@ def get_fuzz_tasks(self) -> Dict[str, tasks.Task]:
     choices = random.choices(
         fuzz_task_candidates, weights=weights, k=num_instances)
     fuzz_tasks = [
-        tasks.Task('fuzz', fuzz_task_candidate.fuzzer, fuzz_task_candidate.job)
+        tasks.Task(
+            'fuzz',
+            fuzz_task_candidate.fuzzer,
+            fuzz_task_candidate.job,
+            extra_info={'base_os_version': fuzz_task_candidate.base_os_version})
         for fuzz_task_candidate in choices
     ]
     # TODO(metzman): Use number of targets even though weight
@@ -236,8 +256,12 @@ def get_fuzz_tasks(self) -> List[tasks.Task]:
     # Only consider linux jobs for chrome fuzzing.
     job_query = data_types.Job.query(data_types.Job.platform == 'LINUX')
     for job in ndb_utils.get_all_from_query(job_query):
+      base_os_version = None
+      if job.base_os_version:
+        base_os_version = job.base_os_version
+
       candidates_by_job[job.name] = FuzzTaskCandidate(
-          job=job.name, project=job.project)
+          job=job.name, project=job.project, base_os_version=base_os_version)
 
     fuzz_task_candidates = []
     fuzzer_job_query = ndb_utils.get_all_from_query(
@@ -261,7 +285,11 @@ def get_fuzz_tasks(self) -> List[tasks.Task]:
     choices = random.choices(
         fuzz_task_candidates, weights=weights, k=num_instances)
     fuzz_tasks = [
-        tasks.Task('fuzz', candidate.fuzzer, candidate.job)
+        tasks.Task(
+            'fuzz',
+            candidate.fuzzer,
+            candidate.job,
+            extra_info={'base_os_version': candidate.base_os_version})
         for candidate in choices
     ]
     return fuzz_tasks
diff --git a/src/clusterfuzz/_internal/tests/appengine/handlers/cron/schedule_fuzz_test.py b/src/clusterfuzz/_internal/tests/appengine/handlers/cron/schedule_fuzz_test.py
@@ -78,6 +78,163 @@ def test_get_fuzz_tasks(self):
     expected_results = [('fuzz', 'libFuzzer', 'myjob')] * 5
     self.assertListEqual(comparable_results, expected_results)
 
+  def test_os_version_precedence_project_over_job(self):
+    """Tests that project version is prioritized over job version."""
+    job_name = 'myjob'
+    project_name = 'myproject'
+    data_types.Job(
+        name='dead_job',
+        environment_string=f'PROJECT_NAME = {project_name}',
+        platform='LINUX',
+    ).put()
+    data_types.Job(
+        name=job_name,
+        environment_string=f'PROJECT_NAME = {project_name}',
+        platform='LINUX',
+        base_os_version='job-version',
+    ).put()
+    data_types.Job(
+        name='dead_project_job',
+        environment_string='PROJECT_NAME = dead_project',
+        platform='LINUX',
+    ).put()
+
+    data_types.FuzzerJob(
+        job='dead_job', weight=0.0, platform='LINUX', fuzzer='libFuzzer').put()
+    data_types.FuzzerJob(
+        job=job_name, platform='LINUX', fuzzer='libFuzzer').put()
+    data_types.FuzzerJob(
+        job='dead_project_job', platform='LINUX', fuzzer='libFuzzer').put()
+
+    data_types.OssFuzzProject(
+        name=project_name, base_os_version='project-version').put()
+    data_types.OssFuzzProject(name='dead_project', cpu_weight=0.0).put()
+
+    scheduler = schedule_fuzz.OssfuzzFuzzTaskScheduler(num_cpus=2)
+    tasks = scheduler.get_fuzz_tasks()
+    self.assertEqual(len(tasks), 1)
+    task = tasks[0]
+
+    self.assertEqual(task.job, job_name)
+    self.assertEqual(task.extra_info.get('base_os_version'), 'project-version')
+
+  def test_os_version_fallback_to_job(self):
+    """Tests that job version is used as a fallback."""
+    job_name = 'myjob'
+    project_name = 'myproject'
+    data_types.Job(
+        name='dead_job',
+        environment_string=f'PROJECT_NAME = {project_name}',
+        platform='LINUX',
+    ).put()
+    data_types.Job(
+        name=job_name,
+        environment_string=f'PROJECT_NAME = {project_name}',
+        platform='LINUX',
+        base_os_version='job-version',
+    ).put()
+    data_types.Job(
+        name='dead_project_job',
+        environment_string='PROJECT_NAME = dead_project',
+        platform='LINUX',
+    ).put()
+
+    data_types.FuzzerJob(
+        job='dead_job', weight=0.0, platform='LINUX', fuzzer='libFuzzer').put()
+    data_types.FuzzerJob(
+        job=job_name, platform='LINUX', fuzzer='libFuzzer').put()
+    data_types.FuzzerJob(
+        job='dead_project_job', platform='LINUX', fuzzer='libFuzzer').put()
+
+    data_types.OssFuzzProject(name=project_name).put()
+    data_types.OssFuzzProject(name='dead_project', cpu_weight=0.0).put()
+
+    scheduler = schedule_fuzz.OssfuzzFuzzTaskScheduler(num_cpus=2)
+    tasks = scheduler.get_fuzz_tasks()
+    self.assertEqual(len(tasks), 1)
+    task = tasks[0]
+
+    self.assertEqual(task.job, job_name)
+    self.assertEqual(task.extra_info.get('base_os_version'), 'job-version')
+
+  def test_os_version_no_version(self):
+    """Tests that no os version is set when neither project nor job has one."""
+    job_name = 'myjob'
+    project_name = 'myproject'
+    data_types.Job(
+        name='dead_job',
+        environment_string=f'PROJECT_NAME = {project_name}',
+        platform='LINUX',
+    ).put()
+    data_types.Job(
+        name=job_name,
+        environment_string=f'PROJECT_NAME = {project_name}',
+        platform='LINUX',
+        base_os_version=None,
+    ).put()
+    data_types.Job(
+        name='dead_project_job',
+        environment_string='PROJECT_NAME = dead_project',
+        platform='LINUX',
+    ).put()
+
+    data_types.FuzzerJob(
+        job='dead_job', weight=0.0, platform='LINUX', fuzzer='libFuzzer').put()
+    data_types.FuzzerJob(
+        job=job_name, platform='LINUX', fuzzer='libFuzzer').put()
+    data_types.FuzzerJob(
+        job='dead_project_job', platform='LINUX', fuzzer='libFuzzer').put()
+
+    data_types.OssFuzzProject(name=project_name).put()
+    data_types.OssFuzzProject(name='dead_project', cpu_weight=0.0).put()
+
+    scheduler = schedule_fuzz.OssfuzzFuzzTaskScheduler(num_cpus=2)
+    tasks = scheduler.get_fuzz_tasks()
+    self.assertEqual(len(tasks), 1)
+    task = tasks[0]
+
+    self.assertEqual(task.job, job_name)
+    self.assertIsNone(task.extra_info.get('base_os_version'))
+
+
+@test_utils.with_cloud_emulators('datastore')
+class ChromeFuzzTaskSchedulerTest(unittest.TestCase):
+  """Tests for ChromeFuzzTaskScheduler."""
+
+  def setUp(self):
+    self.maxDiff = None
+    self.job_name = 'myjob'
+
+  def _setup_chrome_entities(self, job_os_version=None):
+    """Set up entities for Chrome tests."""
+    data_types.Job(
+        name=self.job_name,
+        project='chrome',
+        platform='LINUX',
+        base_os_version=job_os_version).put()
+    data_types.FuzzerJob(
+        job=self.job_name, platform='LINUX', fuzzer='libFuzzer',
+        weight=1.0).put()
+
+  def _run_and_get_task(self):
+    """Runs the scheduler and returns the single task created."""
+    scheduler = schedule_fuzz.ChromeFuzzTaskScheduler(num_cpus=2)
+    tasks = scheduler.get_fuzz_tasks()
+    self.assertEqual(len(tasks), 1)
+    return tasks[0]
+
+  def test_os_version_from_job(self):
+    """Tests that the os version is correctly read from the job."""
+    self._setup_chrome_entities(job_os_version='job-version')
+    task = self._run_and_get_task()
+    self.assertEqual(task.extra_info.get('base_os_version'), 'job-version')
+
+  def test_os_version_job_without_version(self):
+    """Tests that no os version is set when the job has none."""
+    self._setup_chrome_entities()
+    task = self._run_and_get_task()
+    self.assertIsNone(task.extra_info.get('base_os_version'))
+
 
 class TestGetCpuUsage(unittest.TestCase):
   """Tests for get_cpu_limit_for_regions."""
diff --git a/src/clusterfuzz/_internal/tests/core/base/tasks/tasks_test.py b/src/clusterfuzz/_internal/tests/core/base/tasks/tasks_test.py