1+ import json
12from pathlib import Path
23from typing import Any
34from typing import Dict
@@ -33,223 +34,25 @@ def _in_same_cluster(
3334 return False
3435
3536
36- @pytest .mark .parametrize (
37- "bib_record_1, bib_record_2, expected_match" ,
38- [
39- (
40- {
41- "ENTRYTYPE" : "article" ,
42- "ID" : "1" ,
43- "doi" : "10.1073/PNAS.1604234114" ,
44- "author" : "Abrahao, Bruno and Parigi, Paolo and Gupta, Alok and Cook, Karen S." ,
45- "title" : "Reputation offsets trust judgments based on social biases among Airbnb users" ,
46- "journal" : "Proceedings of the National Academy of Sciences" ,
47- "number" : "37" ,
48- "pages" : "9848--9853" ,
49- "volume" : "114" ,
50- "year" : "2017" ,
51- },
52- {
53- "ENTRYTYPE" : "article" ,
54- "ID" : "2" ,
55- "author" : "B. Abrahao; P. Parigi; A. Gupta; K. S. Cook" ,
56- "year" : "2017" ,
57- "title" : "Reputation offsets trust judgments based on social biases among Airbnb users" ,
58- },
59- True ,
60- ),
61- (
62- {
63- "ENTRYTYPE" : "article" ,
64- "ID" : "1" ,
65- "author" : "Smith, John" ,
66- "title" : "Learning-based scheduling for digital work platforms" ,
67- "year" : "2020" ,
68- "journal" : "Journal of Digital Studies" ,
69- },
70- {
71- "ENTRYTYPE" : "article" ,
72- "ID" : "2" ,
73- "author" : "Smith, John" ,
74- "title" : "Learning-based scheduling for digital work platforms" ,
75- "year" : "2020" ,
76- "journal" : "Workshop of Digital Studies" ,
77- },
78- False ,
79- ),
80- # Tan et al. 2004 vs 2005 (same review, different issue/year/doi suffix)
81- (
82- {
83- "ENTRYTYPE" : "article" ,
84- "ID" : "id_0022176" ,
85- "author" : "Tan A.Schulze A.O'Donnell C. P.Davis P. G." ,
86- "year" : "2004" ,
87- "title" : "AIR VERSUS OXYGEN FOR RESUSCITATION OF INFANTS AT BIRTH" ,
88- "journal" : "Cochrane Database Syst Rev" ,
89- "number" : "3" ,
90- "pages" : "Cd002273" ,
91- "doi" : "10.1002/14651858.CD002273.pub2" ,
92- "isbn" : "1361-6137" ,
93- },
94- {
95- "ENTRYTYPE" : "article" ,
96- "ID" : "id_0021834" ,
97- "author" : "Tan A.Schulze A.O'Donnell C. P.Davis P. G." ,
98- "year" : "2005" ,
99- "title" : "AIR VERSUS OXYGEN FOR RESUSCITATION OF INFANTS AT BIRTH" ,
100- "journal" : "Cochrane Database Syst Rev" ,
101- "number" : "2" ,
102- "pages" : "Cd002273" ,
103- "doi" : "10.1002/14651858.CD002273.pub3" ,
104- "isbn" : "1361-6137" ,
105- },
106- True ,
107- ),
108- # Li et al. 2019 (exact same DOI; abstract formatting differs)
109- (
110- {
111- "ENTRYTYPE" : "article" ,
112- "ID" : "id_0001432" ,
113- "author" : "Li Z. M.Kong C. Y.Zhang S. L.Han B.Zhang Z. Y.Wang L. S." ,
114- "year" : "2019" ,
115- "title" : "ALCOHOL AND HBV SYNERGISTICALLY PROMOTE HEPATIC STEATOSIS" ,
116- "journal" : "Ann Hepatol" ,
117- "volume" : "18" ,
118- "number" : "6" ,
119- "pages" : "913-917" ,
120- "doi" : "10.1016/j.aohep.2019.04.013" ,
121- "isbn" : "1665-2681 (Print)\n 1665-2681" ,
122- },
123- {
124- "ENTRYTYPE" : "article" ,
125- "ID" : "id_0025776" ,
126- "author" : "Li Z. M.Kong C. Y.Zhang S. L.Han B.Zhang Z. Y.Wang L. S." ,
127- "year" : "2019" ,
128- "title" : "ALCOHOL AND HBV SYNERGISTICALLY PROMOTE HEPATIC STEATOSIS" ,
129- "journal" : "Ann Hepatol" ,
130- "volume" : "18" ,
131- "number" : "6" ,
132- "pages" : "913-917" ,
133- "doi" : "10.1016/j.aohep.2019.04.013" ,
134- "isbn" : "1665-2681" ,
135- },
136- True ,
137- ),
138- # Adeli & Lewis 2008 (same DOI; multiple IDs/“search_set” variants)
139- (
140- {
141- "ENTRYTYPE" : "article" ,
142- "ID" : "id_0000728" ,
143- "author" : "Adeli K.Lewis G. F." ,
144- "year" : "2008" ,
145- "title" : "Intestinal lipoprotein overproduction in insulin-resistant states" ,
146- "journal" : "Curr Opin Lipidol" ,
147- "volume" : "19" ,
148- "number" : "3" ,
149- "pages" : "221-8" ,
150- "doi" : "10.1097/MOL.0b013e3282ffaf82" ,
151- "isbn" : "0957-9672 (Print)\n 0957-9672" ,
152- },
153- {
154- "ENTRYTYPE" : "article" ,
155- "ID" : "id_0000728B" ,
156- "author" : "Adeli K.Lewis G. F." ,
157- "year" : "2008" ,
158- "title" : "Intestinal lipoprotein overproduction in insulin-resistant states" ,
159- "journal" : "Curr Opin Lipidol" ,
160- "volume" : "19" ,
161- "number" : "3" ,
162- "pages" : "221-8" ,
163- "doi" : "10.1097/MOL.0b013e3282ffaf82" ,
164- "isbn" : "0957-9672 (Print)\n 0957-9672" ,
165- },
166- True ,
167- ),
168- (
169- {
170- "ENTRYTYPE" : "article" ,
171- "ID" : "id_0000728B" ,
172- "author" : "Adeli K.Lewis G. F." ,
173- "year" : "2008" ,
174- "title" : "Intestinal lipoprotein overproduction in insulin-resistant states" ,
175- "journal" : "Curr Opin Lipidol" ,
176- "volume" : "19" ,
177- "number" : "3" ,
178- "pages" : "221-8" ,
179- "doi" : "10.1097/MOL.0b013e3282ffaf82" ,
180- "isbn" : "0957-9672 (Print)\n 0957-9672" ,
181- },
182- {
183- "ENTRYTYPE" : "article" ,
184- "ID" : "id_0000728NEW" ,
185- "author" : "Adeli K.Lewis G. F." ,
186- "year" : "2008" ,
187- "title" : "Intestinal lipoprotein overproduction in insulin-resistant states" ,
188- "journal" : "Curr Opin Lipidol" ,
189- "volume" : "19" ,
190- "number" : "3" ,
191- "pages" : "221-8" ,
192- "doi" : "10.1097/MOL.0b013e3282ffaf82" ,
193- "isbn" : "0957-9672 (Print)\n 0957-9672" ,
194- },
195- True ,
196- ),
197- # Sauer & Seuring 2023 (misc vs article representation; same DOI)
198- (
199- {
200- "ENTRYTYPE" : "misc" ,
201- "ID" : "SauerSeuring2023" ,
202- "author" : "Sauer, Philipp C and Seuring, Stefan" ,
203- "year" : "2023" ,
204- "title" : "How to conduct systematic literature reviews in management research: a guide in 6 steps and 14 decisions" ,
205- "doi" : "10.1007/S11846-023-00668-3" ,
206- },
207- {
208- "ENTRYTYPE" : "article" ,
209- "ID" : "SauerSeuring2023B" ,
210- "author" : "Sauer, Philipp C. and Seuring, Stefan" ,
211- "year" : "2023" ,
212- "title" : "How to conduct systematic literature reviews in management research: a guide in 6 steps and 14 decisions" ,
213- "journal" : "Review of Managerial Science" ,
214- "volume" : "17" ,
215- "number" : "5" ,
216- "pages" : "1899--1933" ,
217- "doi" : "10.1007/S11846-023-00668-3" ,
218- },
219- True ,
220- ),
221- # Clark et al. 2025 (misc vs article; same DOI)
222- (
223- {
224- "ENTRYTYPE" : "misc" ,
225- "ID" : "ClarkBartonAlbarqoEtAl2025" ,
226- "author" : "Clark, Justin; Barton, Belinda; Albarqo, Loai; Byambasuren, Oyungerel; Jowsey, Tanisha; Keogh, Justin; Liang, Tian; Moro, Christian; O'neill, Hayley; Jones, Mark" ,
227- "year" : "2025" ,
228- "title" : "Generative artificial intelligence use in evidence synthesis: A systematic review" ,
229- "doi" : "10.1017/RSM.2025.16" ,
230- },
231- {
232- "ENTRYTYPE" : "article" ,
233- "ID" : "ClarkBartonAlbarqouniEtAl2025" ,
234- "author" : "Clark, Justin; Barton, Belinda; Albarqouni, Loai; Byambasuren, Oyungerel; Jowsey, Tanisha; Keogh, Justin; Liang, Tian; Moro, Christian; O’Neill, Hayley; Jones, Mark" ,
235- "year" : "2025" ,
236- "title" : "Generative artificial intelligence use in evidence synthesis: A systematic review" ,
237- "journal" : "Research Synthesis Methods" ,
238- "volume" : "16" ,
239- "number" : "4" ,
240- "pages" : "601--619" ,
241- "doi" : "10.1017/RSM.2025.16" ,
242- },
243- True ,
244- ),
245- # Add further (bib_record_1, bib_record_2, expected_match) tuples here
246- ],
247- )
248- def test_individual_cases_match (
249- bib_record_1 : dict , bib_record_2 : dict , expected_match : bool
250- ) -> None :
CASES_PATH = Path(__file__).parent / "test_cases.json"


def load_cases() -> list:
    """Build the pytest parameter sets from the JSON fixture at ``CASES_PATH``.

    The fixture is read as UTF-8 text and parsed as JSON; the entries under
    its top-level ``"cases"`` key are processed in file order.

    Returns:
        A list with one ``pytest.param`` per case, carrying the case's
        ``record_a``, ``record_b`` and ``expected_duplicate`` values as
        positional arguments. The case's ``"id"`` value is used as the
        parameter id; ``None`` is passed when that key is absent.

    Raises:
        KeyError: If ``"cases"`` or one of the three required per-case keys
            is missing.
    """
    raw_text = CASES_PATH.read_text(encoding="utf-8")
    params = []
    for case in json.loads(raw_text)["cases"]:
        # One pytest.param per entry, i.e. one test invocation each.
        params.append(
            pytest.param(
                case["record_a"],
                case["record_b"],
                case["expected_duplicate"],
                id=case.get("id"),
            )
        )
    return params
50+
51+
52+ @pytest .mark .parametrize ("record_a,record_b,expected_duplicate" , load_cases ())
53+ def test_dedupe (record_a : dict , record_b : dict , expected_duplicate : bool ) -> None :
25154 """Check if two BibTeX-like records are deduplicated as expected."""
252- records_df = _make_records_df (bib_record_1 , bib_record_2 )
55+ records_df = _make_records_df (record_a , record_b )
25356
25457 prep_df = prep (records_df )
25558 pairs_df = block (records_df = prep_df )
@@ -260,10 +63,8 @@ def test_individual_cases_match(
26063 duplicate_id_sets = bib_dedupe .cluster .get_connected_components (matched_df )
26164 print (duplicate_id_sets )
26265
263- actual_match = _in_same_cluster (
264- duplicate_id_sets , bib_record_1 ["ID" ], bib_record_2 ["ID" ]
265- )
266- if actual_match == expected_match :
66+ actual_match = _in_same_cluster (duplicate_id_sets , record_a ["ID" ], record_b ["ID" ])
67+ if actual_match == expected_duplicate :
26768 Path ("EXPORT.csv" ).unlink ()
26869
269- assert actual_match == expected_match
70+ assert actual_match == expected_duplicate
0 commit comments