Skip to content
This repository was archived by the owner on Mar 10, 2026. It is now read-only.

Commit 3cd7ed8

Browse files
authored
Merge pull request #91 from MDverse/feat/update-clean_text-function
Feat/update clean text function
2 parents cd3d07b + abf54f7 commit 3cd7ed8

File tree

3 files changed

+52
-21
lines changed

3 files changed

+52
-21
lines changed

src/mdverse_scrapers/core/toolbox.py

Lines changed: 45 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -175,7 +175,7 @@ def read_query_file(query_file_path: Path, logger: "loguru.Logger" = loguru.logg
175175
exclusion_path_patterns : list[str]
176176
Patterns for path exclusion.
177177
"""
178-
with open(query_file_path) as param_file:
178+
with open(query_file_path, encoding="utf-8") as param_file:
179179
logger.info(f"Reading parameters from: {query_file_path}")
180180
data_loaded = yaml.safe_load(param_file)
181181
keywords = data_loaded["keywords"]
@@ -209,28 +209,57 @@ def remove_duplicates_in_list_of_dicts(input_list: list[dict]) -> list[dict]:
209209
return output_list
210210

211211

212-
def clean_text(string):
213-
"""Decode html and remove breaks.
212+
def strip_html(input_text: str) -> str:
    """Remove html tags.

    Arguments
    ---------
    input_text: str
        input text

    Returns
    -------
    str
        clean text
    """
    # Let BeautifulSoup parse the markup, then keep only the text nodes.
    parsed_markup = BeautifulSoup(input_text, features="lxml")
    return parsed_markup.text
226+
227+
228+
def strip_whitespace(input_text: str) -> str:
    """Remove whitespace characters.

    Arguments
    ---------
    input_text: str
        input text

    Returns
    -------
    str
        clean text
    """
    # First turn tabs, carriage returns and newlines into plain spaces,
    # then collapse any run of two or more spaces into a single space.
    return re.sub(r" {2,}", " ", re.sub(r"[\n\r\t]", " ", input_text))
246+
247+
248+
def clean_text(input_text: str) -> str:
    """Remove html tags and whitespace characters.

    Arguments
    ---------
    input_text: str
        input text

    Returns
    -------
    str
        clean text
    """
    # Strip markup first so that whitespace introduced by tag removal
    # is normalized by the second pass.
    # NOTE: the intermediate is deliberately NOT named `clean_text` —
    # the original shadowed the function's own name, which is confusing
    # and flagged by linters.
    text_without_html = strip_html(input_text)
    return strip_whitespace(text_without_html)
234263

235264

236265
def remove_excluded_files(
@@ -303,7 +332,7 @@ def find_false_positive_datasets(
303332
) -> list[str]:
304333
"""Find false positive datasets.
305334
306-
False positive datasets are datasets that propably do not
335+
False positive datasets are datasets that probably do not
307336
contain any molecular dynamics data.
308337
309338
Parameters

src/mdverse_scrapers/scrapers/figshare.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
print_statistics,
2222
read_query_file,
2323
remove_excluded_files,
24+
strip_html,
2425
)
2526
from ..models.enums import DatasetSourceName
2627
from ..models.scraper import ScraperContext
@@ -242,12 +243,12 @@ def extract_metadata_from_single_dataset_record(
242243
"dataset_url_in_repository": record_json.get("url_public_html"),
243244
"date_created": record_json.get("created_date"),
244245
"date_last_updated": record_json.get("modified_date"),
245-
"title": clean_text(record_json.get("title")),
246+
"title": clean_text(record_json.get("title", "")),
246247
"author_names": [
247248
clean_text(author.get("full_name"))
248249
for author in record_json.get("authors", [])
249250
],
250-
"description": clean_text(record_json.get("description")),
251+
"description": strip_html(record_json.get("description", "")),
251252
"license": record_json.get("license", {}).get("name"),
252253
"doi": record_json.get("doi"),
253254
"download_number": dataset_stats["download_number"],
@@ -330,7 +331,7 @@ def search_all_datasets(
330331
found_datasets_per_keyword = []
331332
# Search endpoint: /articles/search
332333
# https://docs.figshare.com/#articles_search
333-
# Iterate seach on pages.
334+
# Iterate search on pages.
334335
while True:
335336
data_query = {
336337
"order": "published_date",

src/mdverse_scrapers/scrapers/zenodo.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
read_query_file,
2121
remove_duplicates_in_list_of_dicts,
2222
remove_excluded_files,
23+
strip_html,
2324
)
2425
from ..models.enums import DatasetSourceName
2526
from ..models.file import FileMetadata
@@ -162,7 +163,7 @@ def extract_data_from_zip_file(url, logger: "loguru.Logger" = loguru.logger):
162163
Returns
163164
-------
164165
list
165-
List of dictionnaries with data extracted from zip preview.
166+
List of dictionaries with data extracted from zip preview.
166167
"""
167168
file_lst = []
168169
response = make_http_get_request_with_retries(
@@ -330,7 +331,7 @@ def extract_metadata_from_json(
330331
for author in hit.get("metadata", {}).get("creators", [])
331332
if author.get("name", None)
332333
],
333-
"description": clean_text(hit.get("metadata", {}).get("description", "")),
334+
"description": strip_html(hit.get("metadata", {}).get("description", "")),
334335
"keywords": [
335336
str(keyword) for keyword in hit.get("metadata", {}).get("keywords", [])
336337
],

0 commit comments

Comments
 (0)