Skip to content

Commit ac7bee5

Browse files
author
Gerit Wagner
committed
export test cases to json (import special-cases)
1 parent 337890d commit ac7bee5

File tree

2 files changed

+466
-221
lines changed

2 files changed

+466
-221
lines changed

tests/case_test.py

Lines changed: 22 additions & 221 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import json
12
from pathlib import Path
23
from typing import Any
34
from typing import Dict
@@ -33,223 +34,25 @@ def _in_same_cluster(
3334
return False
3435

3536

36-
@pytest.mark.parametrize(
37-
"bib_record_1, bib_record_2, expected_match",
38-
[
39-
(
40-
{
41-
"ENTRYTYPE": "article",
42-
"ID": "1",
43-
"doi": "10.1073/PNAS.1604234114",
44-
"author": "Abrahao, Bruno and Parigi, Paolo and Gupta, Alok and Cook, Karen S.",
45-
"title": "Reputation offsets trust judgments based on social biases among Airbnb users",
46-
"journal": "Proceedings of the National Academy of Sciences",
47-
"number": "37",
48-
"pages": "9848--9853",
49-
"volume": "114",
50-
"year": "2017",
51-
},
52-
{
53-
"ENTRYTYPE": "article",
54-
"ID": "2",
55-
"author": "B. Abrahao; P. Parigi; A. Gupta; K. S. Cook",
56-
"year": "2017",
57-
"title": "Reputation offsets trust judgments based on social biases among Airbnb users",
58-
},
59-
True,
60-
),
61-
(
62-
{
63-
"ENTRYTYPE": "article",
64-
"ID": "1",
65-
"author": "Smith, John",
66-
"title": "Learning-based scheduling for digital work platforms",
67-
"year": "2020",
68-
"journal": "Journal of Digital Studies",
69-
},
70-
{
71-
"ENTRYTYPE": "article",
72-
"ID": "2",
73-
"author": "Smith, John",
74-
"title": "Learning-based scheduling for digital work platforms",
75-
"year": "2020",
76-
"journal": "Workshop of Digital Studies",
77-
},
78-
False,
79-
),
80-
# Tan et al. 2004 vs 2005 (same review, different issue/year/doi suffix)
81-
(
82-
{
83-
"ENTRYTYPE": "article",
84-
"ID": "id_0022176",
85-
"author": "Tan A.Schulze A.O'Donnell C. P.Davis P. G.",
86-
"year": "2004",
87-
"title": "AIR VERSUS OXYGEN FOR RESUSCITATION OF INFANTS AT BIRTH",
88-
"journal": "Cochrane Database Syst Rev",
89-
"number": "3",
90-
"pages": "Cd002273",
91-
"doi": "10.1002/14651858.CD002273.pub2",
92-
"isbn": "1361-6137",
93-
},
94-
{
95-
"ENTRYTYPE": "article",
96-
"ID": "id_0021834",
97-
"author": "Tan A.Schulze A.O'Donnell C. P.Davis P. G.",
98-
"year": "2005",
99-
"title": "AIR VERSUS OXYGEN FOR RESUSCITATION OF INFANTS AT BIRTH",
100-
"journal": "Cochrane Database Syst Rev",
101-
"number": "2",
102-
"pages": "Cd002273",
103-
"doi": "10.1002/14651858.CD002273.pub3",
104-
"isbn": "1361-6137",
105-
},
106-
True,
107-
),
108-
# Li et al. 2019 (exact same DOI; abstract formatting differs)
109-
(
110-
{
111-
"ENTRYTYPE": "article",
112-
"ID": "id_0001432",
113-
"author": "Li Z. M.Kong C. Y.Zhang S. L.Han B.Zhang Z. Y.Wang L. S.",
114-
"year": "2019",
115-
"title": "ALCOHOL AND HBV SYNERGISTICALLY PROMOTE HEPATIC STEATOSIS",
116-
"journal": "Ann Hepatol",
117-
"volume": "18",
118-
"number": "6",
119-
"pages": "913-917",
120-
"doi": "10.1016/j.aohep.2019.04.013",
121-
"isbn": "1665-2681 (Print)\n1665-2681",
122-
},
123-
{
124-
"ENTRYTYPE": "article",
125-
"ID": "id_0025776",
126-
"author": "Li Z. M.Kong C. Y.Zhang S. L.Han B.Zhang Z. Y.Wang L. S.",
127-
"year": "2019",
128-
"title": "ALCOHOL AND HBV SYNERGISTICALLY PROMOTE HEPATIC STEATOSIS",
129-
"journal": "Ann Hepatol",
130-
"volume": "18",
131-
"number": "6",
132-
"pages": "913-917",
133-
"doi": "10.1016/j.aohep.2019.04.013",
134-
"isbn": "1665-2681",
135-
},
136-
True,
137-
),
138-
# Adeli & Lewis 2008 (same DOI; multiple IDs/“search_set” variants)
139-
(
140-
{
141-
"ENTRYTYPE": "article",
142-
"ID": "id_0000728",
143-
"author": "Adeli K.Lewis G. F.",
144-
"year": "2008",
145-
"title": "Intestinal lipoprotein overproduction in insulin-resistant states",
146-
"journal": "Curr Opin Lipidol",
147-
"volume": "19",
148-
"number": "3",
149-
"pages": "221-8",
150-
"doi": "10.1097/MOL.0b013e3282ffaf82",
151-
"isbn": "0957-9672 (Print)\n0957-9672",
152-
},
153-
{
154-
"ENTRYTYPE": "article",
155-
"ID": "id_0000728B",
156-
"author": "Adeli K.Lewis G. F.",
157-
"year": "2008",
158-
"title": "Intestinal lipoprotein overproduction in insulin-resistant states",
159-
"journal": "Curr Opin Lipidol",
160-
"volume": "19",
161-
"number": "3",
162-
"pages": "221-8",
163-
"doi": "10.1097/MOL.0b013e3282ffaf82",
164-
"isbn": "0957-9672 (Print)\n0957-9672",
165-
},
166-
True,
167-
),
168-
(
169-
{
170-
"ENTRYTYPE": "article",
171-
"ID": "id_0000728B",
172-
"author": "Adeli K.Lewis G. F.",
173-
"year": "2008",
174-
"title": "Intestinal lipoprotein overproduction in insulin-resistant states",
175-
"journal": "Curr Opin Lipidol",
176-
"volume": "19",
177-
"number": "3",
178-
"pages": "221-8",
179-
"doi": "10.1097/MOL.0b013e3282ffaf82",
180-
"isbn": "0957-9672 (Print)\n0957-9672",
181-
},
182-
{
183-
"ENTRYTYPE": "article",
184-
"ID": "id_0000728NEW",
185-
"author": "Adeli K.Lewis G. F.",
186-
"year": "2008",
187-
"title": "Intestinal lipoprotein overproduction in insulin-resistant states",
188-
"journal": "Curr Opin Lipidol",
189-
"volume": "19",
190-
"number": "3",
191-
"pages": "221-8",
192-
"doi": "10.1097/MOL.0b013e3282ffaf82",
193-
"isbn": "0957-9672 (Print)\n0957-9672",
194-
},
195-
True,
196-
),
197-
# Sauer & Seuring 2023 (misc vs article representation; same DOI)
198-
(
199-
{
200-
"ENTRYTYPE": "misc",
201-
"ID": "SauerSeuring2023",
202-
"author": "Sauer, Philipp C and Seuring, Stefan",
203-
"year": "2023",
204-
"title": "How to conduct systematic literature reviews in management research: a guide in 6 steps and 14 decisions",
205-
"doi": "10.1007/S11846-023-00668-3",
206-
},
207-
{
208-
"ENTRYTYPE": "article",
209-
"ID": "SauerSeuring2023B",
210-
"author": "Sauer, Philipp C. and Seuring, Stefan",
211-
"year": "2023",
212-
"title": "How to conduct systematic literature reviews in management research: a guide in 6 steps and 14 decisions",
213-
"journal": "Review of Managerial Science",
214-
"volume": "17",
215-
"number": "5",
216-
"pages": "1899--1933",
217-
"doi": "10.1007/S11846-023-00668-3",
218-
},
219-
True,
220-
),
221-
# Clark et al. 2025 (misc vs article; same DOI)
222-
(
223-
{
224-
"ENTRYTYPE": "misc",
225-
"ID": "ClarkBartonAlbarqoEtAl2025",
226-
"author": "Clark, Justin; Barton, Belinda; Albarqo, Loai; Byambasuren, Oyungerel; Jowsey, Tanisha; Keogh, Justin; Liang, Tian; Moro, Christian; O'neill, Hayley; Jones, Mark",
227-
"year": "2025",
228-
"title": "Generative artificial intelligence use in evidence synthesis: A systematic review",
229-
"doi": "10.1017/RSM.2025.16",
230-
},
231-
{
232-
"ENTRYTYPE": "article",
233-
"ID": "ClarkBartonAlbarqouniEtAl2025",
234-
"author": "Clark, Justin; Barton, Belinda; Albarqouni, Loai; Byambasuren, Oyungerel; Jowsey, Tanisha; Keogh, Justin; Liang, Tian; Moro, Christian; O’Neill, Hayley; Jones, Mark",
235-
"year": "2025",
236-
"title": "Generative artificial intelligence use in evidence synthesis: A systematic review",
237-
"journal": "Research Synthesis Methods",
238-
"volume": "16",
239-
"number": "4",
240-
"pages": "601--619",
241-
"doi": "10.1017/RSM.2025.16",
242-
},
243-
True,
244-
),
245-
# Add further (bib_record_1, bib_record_2, expected_match) tuples here
246-
],
247-
)
248-
def test_individual_cases_match(
249-
bib_record_1: dict, bib_record_2: dict, expected_match: bool
250-
) -> None:
37+
# JSON file with the dedupe test cases, located in the same directory as this test module.
CASES_PATH = Path(__file__).parent / "test_cases.json"
38+
39+
40+
def load_cases() -> list:
    """Build the pytest parameter sets from the JSON case file.

    Reads ``CASES_PATH`` as UTF-8 encoded JSON and converts every entry
    found under the top-level ``"cases"`` key into a ``pytest.param``
    holding ``record_a``, ``record_b`` and ``expected_duplicate`` (in that
    order). The entry's optional ``"id"`` value is passed on as the pytest
    test id; when it is absent, ``None`` is passed.

    Returns:
        A list with one ``pytest.param`` per case entry.

    Raises:
        KeyError: if the ``"cases"`` key or one of the required per-case
            keys is missing.
    """
    with CASES_PATH.open(encoding="utf-8") as handle:
        payload = json.load(handle)

    # One pytest.param per JSON entry, i.e. one test invocation each.
    params = []
    for case in payload["cases"]:
        params.append(
            pytest.param(
                case["record_a"],
                case["record_b"],
                case["expected_duplicate"],
                id=case.get("id"),
            )
        )
    return params
50+
51+
52+
@pytest.mark.parametrize("record_a,record_b,expected_duplicate", load_cases())
53+
def test_dedupe(record_a: dict, record_b: dict, expected_duplicate: bool) -> None:
25154
"""Check if two BibTeX-like records are deduplicated as expected."""
252-
records_df = _make_records_df(bib_record_1, bib_record_2)
55+
records_df = _make_records_df(record_a, record_b)
25356

25457
prep_df = prep(records_df)
25558
pairs_df = block(records_df=prep_df)
@@ -260,10 +63,8 @@ def test_individual_cases_match(
26063
duplicate_id_sets = bib_dedupe.cluster.get_connected_components(matched_df)
26164
print(duplicate_id_sets)
26265

263-
actual_match = _in_same_cluster(
264-
duplicate_id_sets, bib_record_1["ID"], bib_record_2["ID"]
265-
)
266-
if actual_match == expected_match:
66+
actual_match = _in_same_cluster(duplicate_id_sets, record_a["ID"], record_b["ID"])
67+
if actual_match == expected_duplicate:
26768
Path("EXPORT.csv").unlink()
26869

269-
assert actual_match == expected_match
70+
assert actual_match == expected_duplicate

0 commit comments

Comments
 (0)