Skip to content

Commit 9d8ff41

Browse files
author
Gerit Wagner
committed
presenter information
1 parent b093168 commit 9d8ff41

File tree

2 files changed

+30
-0
lines changed

2 files changed

+30
-0
lines changed

bib_dedupe/prep_title.py

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,31 @@ def remove_authors_personal_copy(title: str) -> str:
4949
return re.sub(pattern, "", title).strip()
5050

5151

52+
# Patterns are compiled once at import time because the function below is
# called once per title.
_WHITESPACE_RUN = re.compile(r"\s+")
_PRESENTER_INFO_REPEATED = re.compile(
    r"\bpresenter information\b(?:\s+\bpresenter information\b)+",
    flags=re.IGNORECASE,
)
_PRESENTER_INFO_ONLY = re.compile(r"presenter information", flags=re.IGNORECASE)
# One or more trailing tags, each optionally preceded by a single separator
# character. The leading \b keeps words that merely END in "presenter"
# (e.g. "Copresenter information") from being truncated.
_PRESENTER_INFO_TRAILING = re.compile(
    r"(?:\s*[-–—:;,.]?\s*\bpresenter information\b)+\s*$",
    flags=re.IGNORECASE,
)


def remove_presenter_information_noise(title: str) -> str:
    """Remove "presenter information" noise from a title.

    Matching is case-insensitive. The steps are:

    1. Whitespace runs are collapsed to a single space and the title is
       stripped.
    2. Whitespace-separated repetitions of the phrase anywhere in the title
       are collapsed to a single occurrence.
    3. A title consisting only of the phrase is dropped entirely
       (``""`` is returned).
    4. Trailing occurrences of the phrase (optionally preceded by one of
       ``- – — : ; , .``) are stripped as tag-like noise. The phrase must
       start at a word boundary, so a title ending in e.g.
       "Copresenter information" is left untouched.

    Occurrences at the start or in the middle of an otherwise meaningful
    title are kept.

    Args:
        title: The raw title string.

    Returns:
        The title with the noise removed; possibly the empty string.
    """
    cleaned = _WHITESPACE_RUN.sub(" ", title).strip()

    # Collapse repetitions anywhere in the title.
    cleaned = _PRESENTER_INFO_REPEATED.sub("presenter information", cleaned)

    # If the whole title is just that phrase, drop it.
    if _PRESENTER_INFO_ONLY.fullmatch(cleaned):
        return ""

    # Strip trailing occurrences as tag-like noise.
    return _PRESENTER_INFO_TRAILING.sub("", cleaned).strip()
75+
76+
5277
# flake8: noqa: E501
5378
# pylint: disable=line-too-long
5479
def prep_title(title_array: np.array) -> np.array:
@@ -187,6 +212,10 @@ def prep_title(title_array: np.array) -> np.array:
187212
[remove_authors_personal_copy(title) for title in title_array]
188213
)
189214

215+
title_array = np.array(
216+
[remove_presenter_information_noise(title) for title in title_array]
217+
)
218+
190219
# Replace multiple spaces with a single space
191220
title_array = np.array(
192221
[re.sub(r"\s+", " ", title).rstrip().lstrip() for title in title_array]

tests/prep_test.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -374,6 +374,7 @@ def test_prep_container_title(input_container_title: str, expected_output: str)
374374
"Author's personal copy Detection of anomalous bids in procurement auctions",
375375
"detection anomalous bids procurement auctions",
376376
),
377+
("Presenter Information Presenter Information", "")
377378
# (
378379
# "Behavioral effects of acute sublethal exposure to dimethoate on wood mice Apodemus sylvaticus: II--Field studies on radio-tagged mice in a cereal ecosystem",
379380
# "behavioral effects acute sublethal exposure dimethoate wood mice apodemus sylvaticus 2 field studies radio tagged mice cereal ecosystem",

0 commit comments

Comments
 (0)