Skip to content

Commit b3568ac

Browse files
authored
Merge pull request #870 from juanjemdIos/master
Solve empty repos and problematic repo when dividing the text for classification. Fixes #859, Fixes #891, Fixes #862
2 parents 923fb7a + fa749fe commit b3568ac

File tree

3 files changed

+46
-5
lines changed

3 files changed

+46
-5
lines changed

src/somef/process_repository.py

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -774,10 +774,17 @@ def download_github_files(directory, owner, repo_name, repo_ref, authorization):
774774
with open(repo_zip_file, "wb") as f:
775775
f.write(repo_zip)
776776

777-
with zipfile.ZipFile(repo_zip_file, "r") as zip_ref:
778-
zip_ref.extractall(repo_extract_dir)
779-
777+
try:
778+
with zipfile.ZipFile(repo_zip_file, "r") as zip_ref:
779+
zip_ref.extractall(repo_extract_dir)
780+
except zipfile.BadZipFile:
781+
logging.error("Downloaded archive is not a valid zip (repo may be empty)")
782+
return None
783+
780784
repo_folders = os.listdir(repo_extract_dir)
785+
if not repo_folders:
786+
logging.warning("Repository archive is empty")
787+
return None
781788

782789
repo_dir = os.path.join(repo_extract_dir, repo_folders[0])
783790
return repo_dir

src/somef/test/test_JSON_export.py

Lines changed: 33 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -427,7 +427,7 @@ def test_issue_830(self):
427427
# data = text_file.read()
428428
# text_file.close()
429429
# json_content = json.loads(data)
430-
430+
431431
# assert "description" in json_content, "Missing 'description' property"
432432

433433
# assert len(json_content["description"]) > 0, "Description list is empty"
@@ -438,5 +438,37 @@ def test_issue_830(self):
438438

439439
# os.remove(test_data_path + "test_issue_862.json")
440440

441+
def test_issue_859(self):
442+
"""Check that a repository without content is processed correctly: the output must contain only results obtained from the GitHub API."""
443+
444+
somef_cli.run_cli(threshold=0.8,
445+
ignore_classifiers=False,
446+
repo_url="https://github.com/shiningZZ/GU-CAFF",
447+
local_repo=None,
448+
doc_src=None,
449+
in_file=None,
450+
output=test_data_path + "test-859.json",
451+
graph_out=None,
452+
graph_format="turtle",
453+
codemeta_out= None,
454+
pretty=True,
455+
missing=False,
456+
readme_only=False)
457+
458+
with open(test_data_path + "test-859.json", "r") as text_file:
459+
json_content = json.load(text_file)
460+
461+
assert "code_repository" in json_content
462+
assert len(json_content["code_repository"]) > 0
463+
464+
for key, entries in json_content.items():
465+
if isinstance(entries, list):
466+
for entry in entries:
467+
assert entry.get("technique") == "GitHub_API", \
468+
f"Unexpected technique {entry.get('technique')} in key {key}"
469+
470+
os.remove(test_data_path + "test-859.json")
471+
472+
441473
if __name__ == '__main__':
442474
unittest.main()

src/somef/utils/constants.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,9 @@
2424
REGEXP_PYPI_2 = "[![Latest PyPI version]"
2525
REGEXP_COLAB = "https://colab.research.google.com/drive"
2626
# needed to clean up BibTeX files.
27-
REGEXP_BIBTEX = r'\@[a-zA-Z]+\{[.\n\S\s]+?[author|title][.\n\S\s]+?[author|title][.\n\S\s]+?\n\}'
27+
# REGEXP_BIBTEX = r'\@[a-zA-Z]+\{[.\n\S\s]+?[author|title][.\n\S\s]+?[author|title][.\n\S\s]+?\n\}'
28+
# REGEXP_BIBTEX = r'@[a-zA-Z]+\{[\s\S]*?(?:author|title)[\s\S]*?(?:author|title)[\s\S]*?\}'
29+
REGEXP_BIBTEX = r'@[a-zA-Z]+\{(?=[\s\S]*\bauthor\b)(?=[\s\S]*\btitle\b)[\s\S]*?\}'
2830
REGEXP_DOI = r'\[\!\[DOI\]([^\]]+)\]\(([^)]+)\)'
2931
REGEXP_LINKS = r"\[(.*?)?\]\(([^)]+)\)"
3032
REGEXP_IMAGES = r"!\[(.*?)?\]\((.*?)?\)"

0 commit comments

Comments
 (0)