Skip to content

Commit 1120346

Browse files
author
Gerit Wagner
committed
format
1 parent 75f5993 commit 1120346

File tree

2 files changed

+13
-14
lines changed

2 files changed

+13
-14
lines changed

bib_dedupe/match_conditions.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -88,10 +88,8 @@ def both_entrytypes(entrytype: str) -> str:
8888
f"({au09_ti09_ctXX} & ({match(NUMBER)} & {non_contradicting(PAGES)} | {non_contradicting(NUMBER)} & {match(PAGES)}) & {non_contradicting(VOLUME, YEAR, DOI, ABSTRACT)})",
8989
f"({au09_ti09_ctXX} & {match(VOLUME, PAGES)})",
9090
f"({au09_ti09_ctXX} & {match(PAGES, YEAR)} & {non_contradicting(VOLUME, NUMBER, DOI)})",
91-
9291
# DOI-exact match; when container-titles are non-contradicting (may be missing)
9392
f"(({match(DOI)} & ~(doi_1 == '' | doi_2 == '')) & ({TITLE} > 0.95) & ({AUTHOR} > 0.9) & ({YEAR} > 0.9)) & {non_contradicting(CONTAINER_TITLE)} ",
94-
9593
# no TITLE
9694
f"({au10_tiXX_ct10} & {match(VOLUME, NUMBER, PAGES, YEAR)} & {non_contradicting(DOI)} & ({ABSTRACT} > 0.95 | {non_contradicting(ABSTRACT)}))", # typically for number-mismatches in title
9795
]

tests/case_test.py

Lines changed: 13 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,15 @@
1-
from typing import Dict, Any, Iterable, Set
1+
from typing import Any
2+
from typing import Dict
3+
from typing import Iterable
4+
from typing import Set
25

36
import pandas as pd
47
import pytest
58

69
import bib_dedupe.cluster
7-
from bib_dedupe.bib_dedupe import prep, block, match
10+
from bib_dedupe.bib_dedupe import block
11+
from bib_dedupe.bib_dedupe import match
12+
from bib_dedupe.bib_dedupe import prep
813

914

1015
def _make_records_df(rec1: Dict[str, Any], rec2: Dict[str, Any]) -> pd.DataFrame:
@@ -14,7 +19,9 @@ def _make_records_df(rec1: Dict[str, Any], rec2: Dict[str, Any]) -> pd.DataFrame
1419
return pd.DataFrame([rec1_full, rec2_full])
1520

1621

17-
def _in_same_cluster(duplicate_id_sets: Iterable[Iterable[str]], a: str, b: str) -> bool:
22+
def _in_same_cluster(
23+
duplicate_id_sets: Iterable[Iterable[str]], a: str, b: str
24+
) -> bool:
1825
"""Return True if ids `a` and `b` appear together in at least one duplicate cluster."""
1926
target: Set[str] = {a, b}
2027
for group in duplicate_id_sets:
@@ -95,7 +102,6 @@ def _in_same_cluster(duplicate_id_sets: Iterable[Iterable[str]], a: str, b: str)
95102
},
96103
True,
97104
),
98-
99105
# Li et al. 2019 (exact same DOI; abstract formatting differs)
100106
(
101107
{
@@ -126,7 +132,6 @@ def _in_same_cluster(duplicate_id_sets: Iterable[Iterable[str]], a: str, b: str)
126132
},
127133
True,
128134
),
129-
130135
# Adeli & Lewis 2008 (same DOI; multiple IDs/“search_set” variants)
131136
(
132137
{
@@ -186,7 +191,6 @@ def _in_same_cluster(duplicate_id_sets: Iterable[Iterable[str]], a: str, b: str)
186191
},
187192
True,
188193
),
189-
190194
# Sauer & Seuring 2023 (misc vs article representation; same DOI)
191195
(
192196
{
@@ -211,7 +215,6 @@ def _in_same_cluster(duplicate_id_sets: Iterable[Iterable[str]], a: str, b: str)
211215
},
212216
True,
213217
),
214-
215218
# Clark et al. 2025 (misc vs article; same DOI)
216219
(
217220
{
@@ -236,9 +239,6 @@ def _in_same_cluster(duplicate_id_sets: Iterable[Iterable[str]], a: str, b: str)
236239
},
237240
True,
238241
),
239-
240-
241-
242242
# Add further (bib_record_1, bib_record_2, expected_match) tuples here
243243
],
244244
)
@@ -255,7 +255,8 @@ def test_individual_cases_match(bib_record_1, bib_record_2, expected_match) -> N
255255
duplicate_id_sets = bib_dedupe.cluster.get_connected_components(matched_df)
256256
print(duplicate_id_sets)
257257

258-
actual_match = _in_same_cluster(duplicate_id_sets, bib_record_1["ID"], bib_record_2["ID"])
259-
258+
actual_match = _in_same_cluster(
259+
duplicate_id_sets, bib_record_1["ID"], bib_record_2["ID"]
260+
)
260261

261262
assert actual_match == expected_match

0 commit comments

Comments
 (0)