Skip to content
This repository was archived by the owner on Mar 10, 2026. It is now read-only.

Commit 3cd7ed8

Browse files
authored
Merge pull request #91 from MDverse/feat/update-clean_text-function
Feat/update clean text function
2 parents cd3d07b + abf54f7 commit 3cd7ed8

File tree

3 files changed

+52
-21
lines changed

3 files changed

+52
-21
lines changed

src/mdverse_scrapers/core/toolbox.py

Lines changed: 45 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -175,7 +175,7 @@ def read_query_file(query_file_path: Path, logger: "loguru.Logger" = loguru.logg
175175
exclusion_path_patterns : list[str]
176176
Patterns for path exclusion.
177177
"""
178-
with open(query_file_path) as param_file:
178+
with open(query_file_path, encoding="utf-8") as param_file:
179179
logger.info(f"Reading parameters from: {query_file_path}")
180180
data_loaded = yaml.safe_load(param_file)
181181
keywords = data_loaded["keywords"]
@@ -209,28 +209,57 @@ def remove_duplicates_in_list_of_dicts(input_list: list[dict]) -> list[dict]:
209209
return output_list
210210

211211

212-
def clean_text(string):
213-
"""Decode html and remove breaks.
212+
def strip_html(input_text: str) -> str:
    """Remove html tags.

    Arguments
    ---------
    input_text: str
        input text

    Returns
    -------
    str
        clean text
    """
    # Let BeautifulSoup parse the markup, then keep only the text nodes.
    parsed_markup = BeautifulSoup(input_text, features="lxml")
    return parsed_markup.text
226+
227+
228+
def strip_whitespace(input_text: str) -> str:
    """Remove whitespace characters.

    Arguments
    ---------
    input_text: str
        input text

    Returns
    -------
    str
        clean text
    """
    # First turn tabs, carriage returns and newlines into plain spaces,
    # then collapse any run of two or more spaces into a single space.
    return re.sub(r" {2,}", " ", re.sub(r"[\n\r\t]", " ", input_text))
246+
247+
248+
def clean_text(input_text: str) -> str:
    """Remove html tags and whitespace characters.

    Arguments
    ---------
    input_text: str
        input text

    Returns
    -------
    str
        clean text
    """
    # Strip markup first so that whitespace introduced by tag removal
    # is normalized by the second pass.
    # NOTE: the intermediate is deliberately NOT named `clean_text` —
    # the original shadowed the function's own name, which is confusing
    # and flagged by linters.
    text_without_html = strip_html(input_text)
    return strip_whitespace(text_without_html)
234263

235264

236265
def remove_excluded_files(
@@ -303,7 +332,7 @@ def find_false_positive_datasets(
303332
) -> list[str]:
304333
"""Find false positive datasets.
305334
306-
False positive datasets are datasets that propably do not
335+
False positive datasets are datasets that probably do not
307336
contain any molecular dynamics data.
308337
309338
Parameters

src/mdverse_scrapers/scrapers/figshare.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
print_statistics,
2222
read_query_file,
2323
remove_excluded_files,
24+
strip_html,
2425
)
2526
from ..models.enums import DatasetSourceName
2627
from ..models.scraper import ScraperContext
@@ -242,12 +243,12 @@ def extract_metadata_from_single_dataset_record(
242243
"dataset_url_in_repository": record_json.get("url_public_html"),
243244
"date_created": record_json.get("created_date"),
244245
"date_last_updated": record_json.get("modified_date"),
245-
"title": clean_text(record_json.get("title")),
246+
"title": clean_text(record_json.get("title", "")),
246247
"author_names": [
247248
clean_text(author.get("full_name"))
248249
for author in record_json.get("authors", [])
249250
],
250-
"description": clean_text(record_json.get("description")),
251+
"description": strip_html(record_json.get("description", "")),
251252
"license": record_json.get("license", {}).get("name"),
252253
"doi": record_json.get("doi"),
253254
"download_number": dataset_stats["download_number"],
@@ -330,7 +331,7 @@ def search_all_datasets(
330331
found_datasets_per_keyword = []
331332
# Search endpoint: /articles/search
332333
# https://docs.figshare.com/#articles_search
333-
# Iterate seach on pages.
334+
# Iterate search on pages.
334335
while True:
335336
data_query = {
336337
"order": "published_date",

src/mdverse_scrapers/scrapers/zenodo.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
read_query_file,
2121
remove_duplicates_in_list_of_dicts,
2222
remove_excluded_files,
23+
strip_html,
2324
)
2425
from ..models.enums import DatasetSourceName
2526
from ..models.file import FileMetadata
@@ -162,7 +163,7 @@ def extract_data_from_zip_file(url, logger: "loguru.Logger" = loguru.logger):
162163
Returns
163164
-------
164165
list
165-
List of dictionnaries with data extracted from zip preview.
166+
List of dictionaries with data extracted from zip preview.
166167
"""
167168
file_lst = []
168169
response = make_http_get_request_with_retries(
@@ -330,7 +331,7 @@ def extract_metadata_from_json(
330331
for author in hit.get("metadata", {}).get("creators", [])
331332
if author.get("name", None)
332333
],
333-
"description": clean_text(hit.get("metadata", {}).get("description", "")),
334+
"description": strip_html(hit.get("metadata", {}).get("description", "")),
334335
"keywords": [
335336
str(keyword) for keyword in hit.get("metadata", {}).get("keywords", [])
336337
],

0 commit comments

Comments
 (0)