Skip to content

Commit 2a5f8ce

Browse files
authored
cases and match-condition (#49)
1 parent ac7bee5 commit 2a5f8ce

File tree

2 files changed

+90
-0
lines changed

2 files changed

+90
-0
lines changed

bib_dedupe/match_conditions.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -110,4 +110,6 @@ def both_entrytypes(entrytype: str) -> str:
110110
f' ~({CONTAINER_TITLE}_2.str.contains("conf") | {CONTAINER_TITLE}_2.str.contains("work") | {CONTAINER_TITLE}_2.str.contains("proc")) ) & '
111111
f' ( ({CONTAINER_TITLE}_1.str.contains("conf") | {CONTAINER_TITLE}_1.str.contains("work") | {CONTAINER_TITLE}_1.str.contains("proc")) & '
112112
f' ~{CONTAINER_TITLE}_1.str.contains("j") ))',
113+
# Inproceedings: a year mismatch alone rules out a match (recurring venues reuse titles/authors across years)
114+
f'({both_entrytypes("inproceedings")} & {mismatch(YEAR)})',
113115
]

tests/test_cases.json

Lines changed: 88 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -439,6 +439,94 @@
439439
"booktitle": "Americas Conference on Information Systems"
440440
},
441441
"expected_duplicate": false
442+
},
443+
{
444+
"id": "deng_taylor_moussawi_hicss_2021_vs_2022_minitrack_intro",
445+
"note": "Annual HICSS minitrack introductions share title/authors but are different years and different DOIs -> should NOT match.",
446+
"record_a": {
447+
"ENTRYTYPE": "inproceedings",
448+
"ID": "id_0000003",
449+
"colrev_status": "md_processed",
450+
"author": "Deng, Xuefei and Taylor, Joseph and Moussawi, Sara",
451+
"title": "Introduction to the Minitrack on Crowdsourcing and Digital Workforce in the Gig Economy",
452+
"year": "2021",
453+
"booktitle": "Proceedings of the Annual Hawaii International Conference on System Sciences",
454+
"doi": "10.24251/HICSS.2021.510"
455+
},
456+
"record_b": {
457+
"ENTRYTYPE": "inproceedings",
458+
"ID": "id_0000004",
459+
"colrev_status": "md_processed",
460+
"author": "Deng, Xuefei and Taylor, Joseph and Moussawi, Sara",
461+
"title": "Introduction to the Minitrack on Crowdsourcing and Digital Workforce in the Gig Economy",
462+
"year": "2022",
463+
"booktitle": "Proceedings of the Annual Hawaii International Conference on System Sciences",
464+
"doi": "10.24251/HICSS.2022.544"
465+
},
466+
"expected_duplicate": false
467+
},
468+
{
469+
"id": "erroneous_conference_merge_hall_dianne_2020_unknown_vs_hall_gillian_1995",
470+
"note": "Same venue, but different authors/years (2020 vs 1995); record_a has UNKNOWN title -> must NOT match.",
471+
"record_a": {
472+
"ENTRYTYPE": "inproceedings",
473+
"ID": "0000006472",
474+
"author": "Hall, Dianne and Lee, Kangbok and Han, Sumin and In, Joonhwan",
475+
"year": "2020",
476+
"title": "UNKNOWN",
477+
"booktitle": "Americas Conference on Information Systems"
478+
},
479+
"record_b": {
480+
"ENTRYTYPE": "inproceedings",
481+
"ID": "0000005291",
482+
"author": "Hall, Gillian",
483+
"year": "1995",
484+
"title": "Negotiation in Database Schema Integration",
485+
"booktitle": "Americas Conference on Information Systems"
486+
},
487+
"expected_duplicate": false
488+
},
489+
{
490+
"id": "reynoso_2010_unknown_title_vs_reynoso_2013_spanish_title",
491+
"note": "Similar author strings but different years and (one) unknown title vs specific title -> must NOT match.",
492+
"record_a": {
493+
"ENTRYTYPE": "inproceedings",
494+
"ID": "0000005436",
495+
"author": "Reynoso, Gómez and Manuel, Juan and Lizbeth, Estela and Andrade, Muñoz and Macías Díaz, Jorge Eduardo",
496+
"year": "2010",
497+
"title": "UNKNOWN",
498+
"booktitle": "Americas Conference on Information Systems"
499+
},
500+
"record_b": {
501+
"ENTRYTYPE": "inproceedings",
502+
"ID": "0000009531",
503+
"author": "Reynoso, Gómez and Manuel, Juan and Calzada De Luna, Américo C and Lizbeth, Estela and Andrade, Muñoz",
504+
"year": "2013",
505+
"title": "Medición Experimental del Desempeño de las Aplicaciones Bajo Ambientes de Red",
506+
"booktitle": "Americas Conference on Information Systems"
507+
},
508+
"expected_duplicate": false
509+
},
510+
{
511+
"id": "erroneous_conference_merge_patel_1999_vs_2025",
512+
"note": "Clearly different authors/years (1999 vs 2025) but both AMCIS inproceedings -> must NOT match.",
513+
"record_a": {
514+
"ENTRYTYPE": "inproceedings",
515+
"ID": "0000008147",
516+
"author": "Patel, Nandish V",
517+
"year": "1999",
518+
"title": "Developing Tailorable Information Systems through Deferred System's Design",
519+
"booktitle": "Americas Conference on Information Systems"
520+
},
521+
"record_b": {
522+
"ENTRYTYPE": "inproceedings",
523+
"ID": "011979",
524+
"author": "Patel, Nadya Shaznay and Avnit, Karin and Koh, Jeffrey T K V and Lim, Jawn and Tay, Peter and Supian, Hedirman and Kwan, Jeffrey Tzu and Koh, Valino and Chai, Kay",
525+
"year": "2025",
526+
"title": "UNKNOWN",
527+
"booktitle": "Americas Conference on Information Systems"
528+
},
529+
"expected_duplicate": false
442530
}
443531
]
444532
}

0 commit comments

Comments
 (0)