1- from typing import Dict , Any , Iterable , Set
1+ from typing import Any
2+ from typing import Dict
3+ from typing import Iterable
4+ from typing import Set
25
36import pandas as pd
47import pytest
58
69import bib_dedupe .cluster
7- from bib_dedupe .bib_dedupe import prep , block , match
10+ from bib_dedupe .bib_dedupe import block
11+ from bib_dedupe .bib_dedupe import match
12+ from bib_dedupe .bib_dedupe import prep
813
914
1015def _make_records_df (rec1 : Dict [str , Any ], rec2 : Dict [str , Any ]) -> pd .DataFrame :
@@ -14,7 +19,9 @@ def _make_records_df(rec1: Dict[str, Any], rec2: Dict[str, Any]) -> pd.DataFrame
1419 return pd .DataFrame ([rec1_full , rec2_full ])
1520
1621
17- def _in_same_cluster (duplicate_id_sets : Iterable [Iterable [str ]], a : str , b : str ) -> bool :
22+ def _in_same_cluster (
23+ duplicate_id_sets : Iterable [Iterable [str ]], a : str , b : str
24+ ) -> bool :
1825 """Return True if ids `a` and `b` appear together in at least one duplicate cluster."""
1926 target : Set [str ] = {a , b }
2027 for group in duplicate_id_sets :
@@ -95,7 +102,6 @@ def _in_same_cluster(duplicate_id_sets: Iterable[Iterable[str]], a: str, b: str)
95102 },
96103 True ,
97104 ),
98-
99105 # Li et al. 2019 (exact same DOI; abstract formatting differs)
100106 (
101107 {
@@ -126,7 +132,6 @@ def _in_same_cluster(duplicate_id_sets: Iterable[Iterable[str]], a: str, b: str)
126132 },
127133 True ,
128134 ),
129-
130135 # Adeli & Lewis 2008 (same DOI; multiple IDs/“search_set” variants)
131136 (
132137 {
@@ -186,7 +191,6 @@ def _in_same_cluster(duplicate_id_sets: Iterable[Iterable[str]], a: str, b: str)
186191 },
187192 True ,
188193 ),
189-
190194 # Sauer & Seuring 2023 (misc vs article representation; same DOI)
191195 (
192196 {
@@ -211,7 +215,6 @@ def _in_same_cluster(duplicate_id_sets: Iterable[Iterable[str]], a: str, b: str)
211215 },
212216 True ,
213217 ),
214-
215218 # Clark et al. 2025 (misc vs article; same DOI)
216219 (
217220 {
@@ -236,9 +239,6 @@ def _in_same_cluster(duplicate_id_sets: Iterable[Iterable[str]], a: str, b: str)
236239 },
237240 True ,
238241 ),
239-
240-
241-
242242 # Add further (bib_record_1, bib_record_2, expected_match) tuples here
243243 ],
244244)
@@ -255,7 +255,8 @@ def test_individual_cases_match(bib_record_1, bib_record_2, expected_match) -> N
255255 duplicate_id_sets = bib_dedupe .cluster .get_connected_components (matched_df )
256256 print (duplicate_id_sets )
257257
258- actual_match = _in_same_cluster (duplicate_id_sets , bib_record_1 ["ID" ], bib_record_2 ["ID" ])
259-
258+ actual_match = _in_same_cluster (
259+ duplicate_id_sets , bib_record_1 ["ID" ], bib_record_2 ["ID" ]
260+ )
260261
261262 assert actual_match == expected_match
0 commit comments