Skip to content

Commit 408f2ad

Browse files
hunsche and decoNR authored
Fix GitHub rate limit in project_setup by using recursive tree fetch (#5107)
## Description

This PR addresses the GitHub API rate limiting issues encountered during the `project_setup` cron job execution for OSS-Fuzz projects.

### Problem

The previous implementation of `get_oss_fuzz_projects` made multiple API calls per project (fetching directory trees individually), resulting in O(N) requests where N is the number of projects. This frequently triggered GitHub's rate limits (403 Forbidden).

### Solution

Optimized the project fetching logic to use the `recursive=1` parameter when fetching the OSS-Fuzz repository tree.

- Fetches the entire directory tree in a single API call (O(1)).
- Uses SHA from the tree to cache `project.yaml` contents, skipping unchanged files.
- Disables retries specifically for 403 errors to prevent retry storms.
- Parses the flattened tree structure to identify projects with `project.yaml` and `Dockerfile`.
- Significantly reduces the API request count, mitigating rate limit issues.

### Verification

- **Regression Testing**: Ran existing tests in `src/clusterfuzz/_internal/tests/appengine/handlers/cron/project_setup_test.py`. All passed.
- **New Tests**: Added `test_get_oss_fuzz_projects_api_error` to verify error handling when the API call fails.
- **Mock Data**: Updated `url_results.txt` to include a mock response for the recursive tree fetch, covering edge cases like:
  - Invalid/Malformed YAML.
  - Nested files (ignored).
  - Files at root (ignored).
  - Projects without YAML (ignored).

### Impact

- **Performance**: Drastic reduction in GitHub API calls.
- **Reliability**: Reduced likelihood of cron job failures due to rate limiting.
- **Scope**: Logic changes are isolated to `project_setup.py`, plus a new `project_yaml_sha` property on `OssFuzzProject` in `data_types.py` that stores the cached SHA; they only affect the OSS-Fuzz project source flow.

---------

Co-authored-by: André Nogueira Ribeiro <94117783+decoNR@users.noreply.github.com>
1 parent e84526d commit 408f2ad

File tree

4 files changed

+173
-43
lines changed

4 files changed

+173
-43
lines changed

src/clusterfuzz/_internal/cron/project_setup.py

Lines changed: 77 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
import requests
2424
import yaml
2525

26+
from clusterfuzz._internal.base import retry
2627
from clusterfuzz._internal.base import tasks
2728
from clusterfuzz._internal.base import untrusted
2829
from clusterfuzz._internal.base import utils
@@ -80,6 +81,14 @@ class ProjectSetupError(Exception):
8081
"""Exception."""
8182

8283

84+
class GitHubGenericError(Exception):
85+
"""GitHub generic error."""
86+
87+
88+
class GitHubRateLimitError(Exception):
89+
"""GitHub rate limit error."""
90+
91+
8392
class JobInfo:
8493
"""Job information."""
8594

@@ -218,6 +227,11 @@ def _to_experimental_job(job_info):
218227
return job_info
219228

220229

230+
@retry.wrap(
231+
retries=3,
232+
delay=2,
233+
function='cron.project_setup.get_github_url',
234+
exception_types=[GitHubGenericError])
221235
def get_github_url(url):
222236
"""Return contents of URL."""
223237
github_credentials = db_config.get_value('github_credentials')
@@ -227,54 +241,79 @@ def get_github_url(url):
227241
client_id, client_secret = github_credentials.strip().split(';')
228242
response = requests.get(
229243
url, auth=(client_id, client_secret), timeout=HTTP_TIMEOUT_SECONDS)
244+
245+
if response.status_code == 403:
246+
raise GitHubRateLimitError(f'GitHub rate limit exceeded for {url}.')
247+
230248
if response.status_code != 200:
231249
logs.error(
232250
f'Failed to get github url: {url}.', status_code=response.status_code)
233-
response.raise_for_status()
251+
raise GitHubGenericError(f'Failed to get github url: {url}.')
234252

235253
return json.loads(response.text)
236254

237255

238-
def find_github_item_url(github_json, name):
239-
"""Get url of a blob/tree from a github json response."""
240-
for item in github_json['tree']:
241-
if item['path'] == name:
242-
return item['url']
243-
244-
return None
245-
246-
247256
def get_oss_fuzz_projects():
248257
"""Return list of projects for oss-fuzz."""
249258
ossfuzz_tree_url = ('https://api.github.com/repos/google/oss-fuzz/'
250-
'git/trees/master')
259+
'git/trees/master?recursive=1')
251260
tree = get_github_url(ossfuzz_tree_url)
252-
projects = []
253261

254-
projects_url = find_github_item_url(tree, 'projects')
255-
if not projects_url:
256-
logs.error('No projects found.')
257-
return []
262+
projects = []
263+
project_map = {}
258264

259-
tree = get_github_url(projects_url)
260265
for item in tree['tree']:
261-
if item['type'] != 'tree':
266+
path = item['path']
267+
if not path.startswith('projects/'):
262268
continue
263269

264-
item_json = get_github_url(item['url'])
265-
project_yaml_url = find_github_item_url(item_json, 'project.yaml')
266-
if not project_yaml_url:
270+
parts = path.split('/')
271+
if len(parts) != 3:
267272
continue
268273

269-
projects_yaml = get_github_url(project_yaml_url)
270-
info = yaml.safe_load(base64.b64decode(projects_yaml['content']))
274+
project_name = parts[1]
275+
filename = parts[2]
276+
277+
if project_name not in project_map:
278+
project_map[project_name] = {
279+
'yaml_url': None,
280+
'has_dockerfile': False,
281+
'yaml_sha': None
282+
}
271283

272-
has_dockerfile = (
273-
find_github_item_url(item_json, 'Dockerfile') or 'dockerfile' in info)
284+
if filename == 'project.yaml':
285+
project_map[project_name]['yaml_url'] = item['url']
286+
project_map[project_name]['yaml_sha'] = item['sha']
287+
elif filename == 'Dockerfile':
288+
project_map[project_name]['has_dockerfile'] = True
289+
290+
# Get all existing projects to check for cache hits.
291+
existing_projects = {p.name: p for p in data_types.OssFuzzProject.query()}
292+
293+
for project_name, details in project_map.items():
294+
if not details['yaml_url']:
295+
continue
296+
297+
# Check if we have a cached version of project.yaml.
298+
existing_project = existing_projects.get(project_name)
299+
if (existing_project and existing_project.project_yaml_sha and
300+
existing_project.project_yaml_sha == details['yaml_sha']):
301+
# Cache hit.
302+
continue
303+
304+
try:
305+
projects_yaml = get_github_url(details['yaml_url'])
306+
content = base64.b64decode(projects_yaml['content'])
307+
info = yaml.safe_load(content)
308+
except Exception as e:
309+
logs.error(f'Failed to parse project.yaml for {project_name}: {e}')
310+
continue
311+
312+
has_dockerfile = (details['has_dockerfile'] or 'dockerfile' in info)
274313
if not has_dockerfile:
275314
continue
276315

277-
projects.append((item['path'], info))
316+
projects.append((project_name, info, details['yaml_sha']))
278317

279318
return projects
280319

@@ -286,7 +325,7 @@ def get_projects_from_gcs(gcs_url):
286325
except json.decoder.JSONDecodeError as e:
287326
raise ProjectSetupError(f'Error loading json file from {gcs_url}: {e}')
288327

289-
return [(project['name'], project) for project in data['projects']]
328+
return [(project['name'], project, None) for project in data['projects']]
290329

291330

292331
def _process_sanitizers_field(sanitizers):
@@ -533,7 +572,7 @@ def cleanup_old_projects_settings(project_names):
533572
ndb_utils.delete_multi(to_delete)
534573

535574

536-
def create_project_settings(project, info, service_account):
575+
def create_project_settings(project, info, project_yaml_sha, service_account):
537576
"""Setup settings for ClusterFuzz (such as CPU distribution)."""
538577
key = ndb.Key(data_types.OssFuzzProject, project)
539578
oss_fuzz_project = key.get()
@@ -561,6 +600,10 @@ def create_project_settings(project, info, service_account):
561600
if oss_fuzz_project.base_os_version != base_os_version:
562601
oss_fuzz_project.base_os_version = base_os_version
563602
oss_fuzz_project.put()
603+
604+
if oss_fuzz_project.project_yaml_sha != project_yaml_sha:
605+
oss_fuzz_project.project_yaml_sha = project_yaml_sha
606+
oss_fuzz_project.put()
564607
else:
565608
if language in MEMORY_SAFE_LANGUAGES:
566609
cpu_weight = OSS_FUZZ_MEMORY_SAFE_LANGUAGE_PROJECT_WEIGHT
@@ -574,7 +617,8 @@ def create_project_settings(project, info, service_account):
574617
cpu_weight=cpu_weight,
575618
service_account=service_account['email'],
576619
ccs=ccs,
577-
base_os_version=base_os_version).put()
620+
base_os_version=base_os_version,
621+
project_yaml_sha=project_yaml_sha).put()
578622

579623

580624
def _create_pubsub_topic(name, client):
@@ -1010,7 +1054,7 @@ def set_up(self, projects):
10101054
"""Do project setup. Return a list of all the project names that were set
10111055
up."""
10121056
job_names = []
1013-
for project, info in projects:
1057+
for project, info, project_yaml_sha in projects:
10141058
logs.info(f'Syncing configs for {project}.')
10151059

10161060
backup_bucket_name = None
@@ -1038,11 +1082,12 @@ def set_up(self, projects):
10381082

10391083
# Set up projects settings (such as CPU distribution settings).
10401084
if not info.get('disabled', False):
1041-
create_project_settings(project, info, service_account)
1085+
create_project_settings(project, info, project_yaml_sha,
1086+
service_account)
10421087

10431088
# Delete old/disabled project settings.
10441089
enabled_projects = [
1045-
project for project, info in projects
1090+
project for project, info, _ in projects
10461091
if not info.get('disabled', False)
10471092
]
10481093
return SetupResult(enabled_projects, job_names)

src/clusterfuzz/_internal/datastore/data_types.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1476,6 +1476,9 @@ class OssFuzzProject(Model):
14761476
# Base OS version for the project.
14771477
base_os_version = ndb.StringProperty()
14781478

1479+
# SHA of the project.yaml file.
1480+
project_yaml_sha = ndb.StringProperty()
1481+
14791482

14801483
class OssFuzzProjectInfo(Model):
14811484
"""Set up information for a project (cpu allocation, instance groups, service

src/clusterfuzz/_internal/tests/appengine/handlers/cron/project_setup_data/url_results.txt

Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -240,5 +240,76 @@
240240
"content": "aG9tZXBhZ2U6ICJodHRwczovL3d3dy5mcmVldHlwZS5vcmcvIgo=\\n",
241241
"encoding": "base64"
242242
}
243+
""",
244+
'https://api.github.com/repos/google/oss-fuzz/git/trees/master?recursive=1': """
245+
{
246+
"sha": "recursive_sha",
247+
"url": "https://api.github.com/repos/google/oss-fuzz/git/trees/master?recursive=1",
248+
"tree": [
249+
{
250+
"path": "projects/boringssl/project.yaml",
251+
"mode": "100644",
252+
"type": "blob",
253+
"sha": "e57f1846ff0fdbb0fe08e98ca38b6235008b41be",
254+
"url": "https://api.github.com/repos/google/oss-fuzz/git/blobs/e57f1846ff0fdbb0fe08e98ca38b6235008b41be"
255+
},
256+
{
257+
"path": "projects/boringssl/Dockerfile",
258+
"mode": "100644",
259+
"type": "blob",
260+
"sha": "0368f8166f92043678183473f6cc225a5b316e75",
261+
"url": "https://api.github.com/repos/google/oss-fuzz/git/blobs/0368f8166f92043678183473f6cc225a5b316e75"
262+
},
263+
{
264+
"path": "projects/curl/project.yaml",
265+
"mode": "100644",
266+
"type": "blob",
267+
"sha": "30580bab5896f92de90e904cda76ecdb41c6397a",
268+
"url": "https://api.github.com/repos/google/oss-fuzz/git/blobs/30580bab5896f92de90e904cda76ecdb41c6397a"
269+
},
270+
{
271+
"path": "projects/freetype2/project.yaml",
272+
"mode": "100644",
273+
"type": "blob",
274+
"sha": "46400ddfc8f2db662f70d6f09190dadc31244a58",
275+
"url": "https://api.github.com/repos/google/oss-fuzz/git/blobs/46400ddfc8f2db662f70d6f09190dadc31244a58"
276+
},
277+
{
278+
"path": "projects/bad_yaml/project.yaml",
279+
"mode": "100644",
280+
"type": "blob",
281+
"sha": "bad_yaml_sha",
282+
"url": "https://api.github.com/repos/google/oss-fuzz/git/blobs/bad_yaml_sha"
283+
},
284+
{
285+
"path": "projects/nested/subdir/file",
286+
"mode": "100644",
287+
"type": "blob",
288+
"url": "https://api.github.com/repos/google/oss-fuzz/git/blobs/nested_sha"
289+
},
290+
{
291+
"path": "projects/root_file",
292+
"mode": "100644",
293+
"type": "blob",
294+
"url": "https://api.github.com/repos/google/oss-fuzz/git/blobs/root_sha"
295+
},
296+
{
297+
"path": "projects/only_docker/Dockerfile",
298+
"mode": "100644",
299+
"type": "blob",
300+
"url": "https://api.github.com/repos/google/oss-fuzz/git/blobs/docker_sha"
301+
}
302+
],
303+
"truncated": false
304+
}
305+
""",
306+
'https://api.github.com/repos/google/oss-fuzz/git/blobs/bad_yaml_sha': """
307+
{
308+
"sha": "bad_yaml_sha",
309+
"size": 10,
310+
"url": "https://api.github.com/repos/google/oss-fuzz/git/blobs/bad_yaml_sha",
311+
"content": "aW52YWxpZCB5YW1sOiA6OiB3aGF0Cg==\\n",
312+
"encoding": "base64"
313+
}
243314
"""
244315
}

0 commit comments

Comments
 (0)