diff --git a/.github/workflows/action-test-before-PR.yml b/.github/workflows/action-test-before-PR.yml index d2ca2786..acfc8680 100644 --- a/.github/workflows/action-test-before-PR.yml +++ b/.github/workflows/action-test-before-PR.yml @@ -31,4 +31,4 @@ jobs: run: poetry run somef configure -a - name: Run pytest - run: poetry run pytest -v src/somef/test + run: poetry run pytest -v -s src/somef/test diff --git a/src/somef/header_analysis.py b/src/somef/header_analysis.py index b97cf28b..a759e507 100644 --- a/src/somef/header_analysis.py +++ b/src/somef/header_analysis.py @@ -138,6 +138,11 @@ def extract_header_content(text: str) -> Tuple[pd.DataFrame, str | None]: content, none_header_content = mardown_parser.extract_content_per_header(text, headers) parents = mardown_parser.extract_headers_parents(text) + + min_len = min(len(header_list), len(content)) + header_list = header_list[:min_len] + content = content[:min_len] + df = pd.DataFrame({ 'Header': header_list, 'Content': content, diff --git a/src/somef/somef_cli.py b/src/somef/somef_cli.py index c6bc02e1..4b3d54a4 100644 --- a/src/somef/somef_cli.py +++ b/src/somef/somef_cli.py @@ -156,6 +156,7 @@ def cli_get_data(threshold, ignore_classifiers, repo_url=None, doc_src=None, loc repository_metadata) logging.info("--> create excerpts") excerpts = create_excerpts.create_excerpts(string_list) + logging.info("--> extract text excerpts headers") excerpts_headers = mardown_parser.extract_text_excerpts_header(readme_unfiltered_text) header_parents = mardown_parser.extract_headers_parents(readme_unfiltered_text) score_dict = supervised_classification.run_classifiers(excerpts, file_paths) diff --git a/src/somef/test/test_JSON_export.py b/src/somef/test/test_JSON_export.py index 7cc72e2d..edd9b7ae 100644 --- a/src/somef/test/test_JSON_export.py +++ b/src/somef/test/test_JSON_export.py @@ -407,36 +407,36 @@ def test_issue_830(self): # except Exception as e: # print(f"Failed to delete {cls.json_file}: {e}") - # def test_issue_862(self): - # """Checks if this repository does not gets stuck when labeling headers""" - # somef_cli.run_cli(threshold=0.8, - # ignore_classifiers=False, - # repo_url=None, - # local_repo=test_data_repositories + "componentInstaller", - # doc_src=None, - # in_file=None, - # output=test_data_path + "test_issue_862.json", - # graph_out=None, - # graph_format="turtle", - # codemeta_out=None, - # pretty=True, - # missing=False, - # readme_only=False) + def test_issue_862(self): + """Checks if this repository does not gets stuck when labeling headers""" + somef_cli.run_cli(threshold=0.8, + ignore_classifiers=False, + repo_url=None, + local_repo=test_data_repositories + "componentInstaller", + doc_src=None, + in_file=None, + output=test_data_path + "test_issue_862.json", + graph_out=None, + graph_format="turtle", + codemeta_out=None, + pretty=True, + missing=False, + readme_only=False) - # text_file = open(test_data_path + "test_issue_862.json", "r") - # data = text_file.read() - # text_file.close() - # json_content = json.loads(data) + text_file = open(test_data_path + "test_issue_862.json", "r") + data = text_file.read() + text_file.close() + json_content = json.loads(data) - # assert "description" in json_content, "Missing 'description' property" + assert "description" in json_content, "Missing 'description' property" - # assert len(json_content["description"]) > 0, "Description list is empty" + assert len(json_content["description"]) > 0, "Description list is empty" - # first_desc = json_content["description"][0]["result"] - # assert "value" in first_desc, "Missing 'value' in description result" - # assert first_desc["value"], "Description 'value' is empty" + first_desc = json_content["description"][0]["result"] + assert "value" in first_desc, "Missing 'value' in description result" + assert first_desc["value"], "Description 'value' is empty" - # os.remove(test_data_path + "test_issue_862.json") + os.remove(test_data_path + "test_issue_862.json") def test_issue_859(self): """Checks whether a repository without content works fine. Must have just some results from the API.""" diff --git a/src/somef/utils/markdown_utils.py b/src/somef/utils/markdown_utils.py index e898d7d5..65449251 100644 --- a/src/somef/utils/markdown_utils.py +++ b/src/somef/utils/markdown_utils.py @@ -63,6 +63,10 @@ def remove_comments(html_text): ------- Markdown with no HTML comments """ - comment_pattern = r'' - html_without_comments = re.sub(comment_pattern, '', html_text, flags=re.DOTALL) + # comment_pattern = r'' + # comment_pattern = r'