Merge branch 'main' into ext_full_tests

Gerit Wagner · Gerit Wagner · commit 1f9504655f9b · 2025-12-16T12:59:47.000+01:00
diff --git a/README.md b/README.md
@@ -19,13 +19,13 @@ Unlike traditional deduplication methods, BibDedupe focuses on entity resolution
 
 ## Features
 
-- **Automated Duplicate Linking with Zero False Positives**: BibDedupe automates the duplicate linking process with a focus on eliminating false positives.
-- **Preprocessing Approach**: BibDedupe uses a preprocessing approach that reflects the unique error generation process in academic databases, such as author re-formatting, journal abbreviation or translations.
-- **Entity Resolution**: BibDedupe does not simply delete duplicates, but it links duplicates to resolve the entitity and integrates the data. This allows for validation, and undo operations.
-- **Programmatic Access**: BibDedupe is designed for seamless integration into existing research workflows, providing programmatic access for easy incorporation into scripts and applications.
-- **Transparent and Reproducible Rules**: BibDedupe's blocking and matching rules are transparent and easily reproducible to promote reproducibility in deduplication processes.
-- **Continuous Benchmarking**: Continuous integration tests running on GitHub Actions ensure ongoing benchmarking, maintaining the library's reliability and performance across datasets.
-- **Efficient and Parallel Computation**: BibDedupe implements computations efficiently and in parallel, using appropriate data structures and functions for optimal performance.
+- **Automated duplicate linking with zero false positives**: BibDedupe automates the duplicate linking process with a focus on eliminating false positives.
+- **Preprocessing approach**: BibDedupe uses a preprocessing approach that reflects the unique error generation process in academic databases, such as author re-formatting, journal abbreviation or translations.
+- **Entity resolution**: BibDedupe does not simply delete duplicates, but it links duplicates to resolve the entity and integrates the data. This allows for validation and undo operations.
+- **Programmatic access**: BibDedupe is designed for seamless integration into existing research workflows, providing programmatic access for easy incorporation into scripts and applications.
+- **Transparent and reproducible rules**: BibDedupe's blocking and matching rules are transparent and easily reproducible to promote reproducibility in deduplication processes.
+- **Continuous benchmarking**: Continuous integration tests running on GitHub Actions ensure ongoing benchmarking, maintaining the library's reliability and performance across datasets.
+- **Efficient and parallel computation**: BibDedupe implements computations efficiently and in parallel, using appropriate data structures and functions for optimal performance.
 
 ## Documentation
 
diff --git a/bib_dedupe/match_conditions.py b/bib_dedupe/match_conditions.py
@@ -89,8 +89,8 @@ def both_entrytypes(entrytype: str) -> str:
     f"({au09_ti09_ctXX} & {match(VOLUME, PAGES)})",
     f"({au09_ti09_ctXX} & {match(PAGES, YEAR)} & {non_contradicting(VOLUME, NUMBER, DOI)})",
 
-    # DOI-exact is very strong; don't require container_title (often missing for misc/preprints)
-    f"(({match(DOI)} & ~(doi_1 == '' | doi_2 == '')) & ({TITLE} > 0.95) & ({AUTHOR} > 0.9) & ({YEAR} > 0.9))",
+    # Exact DOI match (both DOIs non-empty): accepted only when the container titles do not contradict each other (they may be missing)
+    f"(({match(DOI)} & ~(doi_1 == '' | doi_2 == '')) & ({TITLE} > 0.95) & ({AUTHOR} > 0.9) & ({YEAR} > 0.9)) & {non_contradicting(CONTAINER_TITLE)} ",
 
     # no TITLE
     f"({au10_tiXX_ct10} & {match(VOLUME, NUMBER, PAGES, YEAR)} & {non_contradicting(DOI)} & ({ABSTRACT} > 0.95 | {non_contradicting(ABSTRACT)}))",  # typically for number-mismatches in title