Skip to content

Commit 530b990

Browse files
author
Paolo Tranquilli
committed
MaD generator: some final minor tweaks
1 parent ee7eb86 commit 530b990

File tree

1 file changed

+24
-14
lines changed

1 file changed

+24
-14
lines changed

misc/scripts/models-as-data/bulk_generate_mad.py

Lines changed: 24 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -326,7 +326,10 @@ def pretty_name_from_artifact_name(artifact_name: str) -> str:
326326

327327

328328
def download_dca_databases(
329-
experiment_name: str, pat: str, projects: List[Project]
329+
language: str,
330+
experiment_name: str,
331+
pat: str,
332+
projects: List[Project],
330333
) -> List[tuple[Project, str | None]]:
331334
"""
332335
Download databases from a DCA experiment.
@@ -337,7 +340,6 @@ def download_dca_databases(
337340
Returns:
338341
List of (project, database_dir) pairs, where database_dir is None if the download failed.
339342
"""
340-
database_results = {}
341343
print("\n=== Finding projects ===")
342344
response = get_json_from_github(
343345
f"https://raw.githubusercontent.com/github/codeql-dca-main/data/{experiment_name}/reports/downloads.json",
@@ -363,7 +365,7 @@ def download_dca_databases(
363365

364366
artifact_map[pretty_name] = analyzed_database
365367

366-
def download(item: tuple[str, dict]) -> str:
368+
def download_and_extract(item: tuple[str, dict]) -> str:
367369
pretty_name, analyzed_database = item
368370
artifact_name = analyzed_database["artifact_name"]
369371
repository = analyzed_database["repository"]
@@ -391,16 +393,19 @@ def download(item: tuple[str, dict]) -> str:
391393
# And then we extract it to build_dir/artifact_name
392394
zip_ref.extractall(artifact_unzipped_location)
393395
# And then we iterate over the contents of the extracted directory
394-
# and extract the tar.gz files inside it
395-
for entry in os.listdir(artifact_unzipped_location):
396-
artifact_tar_location = os.path.join(artifact_unzipped_location, entry)
397-
with tarfile.open(artifact_tar_location, "r:gz") as tar_ref:
398-
# And we just untar it to the same directory as the zip file
399-
tar_ref.extractall(artifact_unzipped_location)
400-
return os.path.join(artifact_unzipped_location, remove_extension(entry))
396+
# and extract the language tar.gz file inside it
397+
artifact_tar_location = os.path.join(
398+
artifact_unzipped_location, f"{language}.tar.gz"
399+
)
400+
with tarfile.open(artifact_tar_location, "r:gz") as tar_ref:
401+
# And we just untar it to the same directory as the zip file
402+
tar_ref.extractall(artifact_unzipped_location)
403+
ret = os.path.join(artifact_unzipped_location, language)
404+
print(f"Extraction complete: {ret}")
405+
return ret
401406

402407
results = run_in_parallel(
403-
download,
408+
download_and_extract,
404409
list(artifact_map.items()),
405410
on_error=lambda item, exc: print(
406411
f"ERROR: Failed to download database for {item[0]}: {exc}"
@@ -410,7 +415,7 @@ def download(item: tuple[str, dict]) -> str:
410415
),
411416
)
412417

413-
print(f"\n=== Extracted {len(database_results)} databases ===")
418+
print(f"\n=== Extracted {len(results)} databases ===")
414419

415420
return [(project_map[n], r) for n, r in zip(artifact_map, results)]
416421

@@ -463,7 +468,9 @@ def main(config, args) -> None:
463468
case "repo":
464469
extractor_options = config.get("extractor_options", [])
465470
database_results = build_databases_from_projects(
466-
language, extractor_options, projects
471+
language,
472+
extractor_options,
473+
projects,
467474
)
468475
case "dca":
469476
experiment_name = args.dca
@@ -480,7 +487,10 @@ def main(config, args) -> None:
480487
with open(args.pat, "r") as f:
481488
pat = f.read().strip()
482489
database_results = download_dca_databases(
483-
experiment_name, pat, projects
490+
language,
491+
experiment_name,
492+
pat,
493+
projects,
484494
)
485495

486496
# Generate models for all projects

0 commit comments

Comments
 (0)