draft

Gerit Wagner · Gerit Wagner · commit e33c13626768 · 2026-01-31T14:07:29.000+01:00
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
@@ -7,6 +7,47 @@ You can contribute in many ways:
 
 ## Types of Contributions
 
+### Report duplicate errors (false positives or false negatives)
+
+Provide the case in the following format so that we can add it to `tests/test_cases.json`:
+
+```json
+{
+    "id": "abrahao_parigi_gupta_cook_2017_pnas_short_vs_full",
+    "note": "Same paper; record_b uses abbreviated author formatting and omits venue fields; record_a includes DOI.",
+
+    "record_a": {
+        "ENTRYTYPE": "article",
+        "ID": "1",
+        "doi": "10.1073/PNAS.1604234114",
+        "author": "Abrahao, Bruno and Parigi, Paolo and Gupta, Alok and Cook, Karen S.",
+        "title": "Reputation offsets trust judgments based on social biases among Airbnb users",
+        "journal": "Proceedings of the National Academy of Sciences",
+        "number": "37",
+        "pages": "9848--9853",
+        "volume": "114",
+        "year": "2017"
+    },
+    "record_b": {
+        "ENTRYTYPE": "article",
+        "ID": "2",
+        "author": "B. Abrahao; P. Parigi; A. Gupta; K. S. Cook",
+        "year": "2017",
+        "title": "Reputation offsets trust judgments based on social biases among Airbnb users"
+    },
+
+    "expected_duplicate": true
+}
+```
+
+### Fixing duplicate errors
+
+All changes to deduplication logic (`prep`, `sim`, `match`) should be accompanied by a test case in the pull request.
+
+TODO:
+- Before merging, the ldd-full tests should be run to determine how the changes affect overall performance. (TBD: Run locally? How will it be triggered? How do we ensure that the right version/branch is tested? How are results added to the pull request? Do we want to consider performance implications?)
+- Consider the possibility of schema inconsistency.
+
 ### Report Bugs
 
 Report bugs at https://github.com/CoLRev-Environment/bib-dedupe/issues.
@@ -51,13 +92,13 @@ Ready to contribute? Here's how to set up BibDedupe for local development.
 1. Fork the `bib-dedupe` repo on GitHub.
 2. Clone your fork locally:
 
-    ```
+    ```sh
     git clone git@github.com:your_name_here/bib-dedupe.git
     ```
 
 3. Install your local copy into a virtualenv. Assuming you have virtualenvwrapper installed, this is how you set up your fork for local development:
 
-    ```
+    ```sh
     mkvirtualenv bib-dedupe
     cd bib-dedupe/
     pip3 install poetry
@@ -66,7 +107,7 @@ Ready to contribute? Here's how to set up BibDedupe for local development.
 
 4. Create a branch for local development:
 
-    ```
+    ```sh
     git checkout -b name-of-your-bugfix-or-feature
     ```
 
@@ -75,14 +116,14 @@ Ready to contribute? Here's how to set up BibDedupe for local development.
 5. When you're done making changes, check that your changes pass the
    tests and pre-commit hooks:
 
-    ```
+    ```sh
     pytest
     pre-commit run -a
     ```
 
 6. Commit your changes and push your branch to GitHub:
 
-    ```
+    ```sh
     git add .
     git commit -m "Your detailed description of your changes."
     git push origin name-of-your-bugfix-or-feature
@@ -98,9 +139,8 @@ Before you submit a pull request, check that it meets these guidelines:
 2. If the pull request adds functionality, the docs should be updated. Put
    your new functionality into a function with a docstring, and add the
    feature to the list in README.rst.
-3. The pull request should work for Python 3.5, 3.6, 3.7 and 3.8, and for PyPy. Check
-   https://travis-ci.com/CoLRev-Ecosystem/bib-dedupe/pull_requests
-   and make sure that the tests pass for all supported Python versions.
+3. The pull request should work for the Python versions specified in the `pyproject.toml`.
+   Make sure that the tests pass for all supported Python versions.
 
 ## Coding standards
 
diff --git a/docs/architecture.rst b/docs/architecture.rst
@@ -0,0 +1,40 @@
+Architecture
+====================================
+
+.. mermaid::
+
+  flowchart LR
+
+    A0["Input: records_df"]
+      --> A1["prep(records_df)<br>• prep_schema<br>• prep_author<br>• prep_title<br>• prep_abstract<br>• prep_container_title<br>• prep_doi<br>• prep_volume<br>• prep_number<br>• prep_pages<br>• prep_year"]
+
+    O2["(manual review outside code)"]
+
+    subgraph API["Public API (call sequence)"]
+      A1 --> A2["block(prep_df)<br>Uses block.py to create candidate record pairs"]
+
+      A2 --> A3["match(pairs_df)<br>Uses match.py to compute similarities and classify pairs"]
+
+      A3 --> A4["cluster(matched_df)<br>Uses cluster.py to build connected components (duplicate groups)"]
+
+      A4 --> A5["merge(records_df, duplicate_id_sets)<br>Uses merge.py to combine records within each group"]
+
+      subgraph Manual["Optional manual review (human-in-the-loop)"]
+          O1["export_maybe(records_df, matched_df)<br>Write uncertain pairs via maybe_cases.export"]
+          O3["import_maybe(matched_df)<br>Read decisions via maybe_cases.import"]
+      end
+    end
+
+    O1 --> O2 --> O3
+
+    A5 --> A6["Output: merged records_df"]
+
+    %% optional linkage back into the main flow
+    A3 -. "optional manual review" .-> O1
+    O3 -. "returns updated matched_df" .-> A4
+
+
+To measure the runtime of the individual steps, run::
+
+    cd docs
+    python benchmark_runtime_detailed.py
diff --git a/docs/benchmark.rst b/docs/benchmark.rst
@@ -0,0 +1,4 @@
+Benchmark
+====================================
+
+TODO
diff --git a/docs/benchmark_runtime_detailed.py b/docs/benchmark_runtime_detailed.py
@@ -0,0 +1,53 @@
+import time
+import statistics as stats
+import pandas as pd
+
+import bib_dedupe.bib_dedupe as bd
+from pathlib import Path
+
+BENCHMARK_DIR = Path("../tests/ldd-full-benchmark")
+
+def timed(label, fn, *args, **kwargs):  # call fn(*args, **kwargs) and return (result, elapsed seconds)
+    t0 = time.perf_counter()
+    out = fn(*args, **kwargs)
+    dt = time.perf_counter() - t0  # elapsed seconds as measured by time.perf_counter()
+    return out, dt  # NOTE(review): `label` is accepted but never used in this function
+
+
+def benchmark_pipeline(records_df, *, cpu=-1, repeats=5, warmup=1):  # time bd.prep, bd.block and bd.match over `repeats` runs; returns per-stage summary dicts
+    # warmup (important for caches, process pools, etc.); these passes are not timed
+    for _ in range(warmup):
+        prepped = bd.prep(records_df, verbosity_level=0, cpu=cpu)  # `cpu` is forwarded unchanged to every stage
+        pairs = bd.block(prepped, verbosity_level=0, cpu=cpu)
+        _ = bd.match(pairs, verbosity_level=0, cpu=cpu)  # result discarded on purpose
+
+    prep_times, block_times, match_times = [], [], []  # one elapsed-seconds entry per repeat and stage
+    for _ in range(repeats):
+        prepped, t_prep = timed("prep", bd.prep, records_df, verbosity_level=0, cpu=cpu)  # each stage consumes the previous stage's output
+        pairs, t_block = timed("block", bd.block, prepped, verbosity_level=0, cpu=cpu)
+        matched, t_match = timed("match", bd.match, pairs, verbosity_level=0, cpu=cpu)  # `matched` is not used afterwards; only the timing is kept
+
+        prep_times.append(t_prep)
+        block_times.append(t_block)
+        match_times.append(t_match)
+
+    def summ(xs):  # summarize a list of timings (seconds): count, mean, median, min, max
+        return {
+            "n": len(xs),
+            "mean_s": stats.mean(xs),  # NOTE: statistics.mean raises StatisticsError on empty data, i.e. when repeats == 0
+            "median_s": stats.median(xs),
+            "min_s": min(xs),
+            "max_s": max(xs),
+        }
+
+    return {  # one summary dict per stage
+        "prep": summ(prep_times),
+        "block": summ(block_times),
+        "match_total": summ(match_times),
+    }
+
+
+dataset = "cardiac"  # name of the dataset subdirectory under BENCHMARK_DIR
+df = pd.read_csv(BENCHMARK_DIR / dataset / "records_pre_merged.csv")  # BENCHMARK_DIR is a relative path, so this resolves against the current working directory
+
+print(benchmark_pipeline(df, cpu=-1, repeats=10, warmup=2))  # 2 untimed warmup passes, then 10 timed repeats; prints the summary dict
diff --git a/docs/conf.py b/docs/conf.py
@@ -23,7 +23,7 @@
 templates_path = ["_templates"]
 exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"]
 
-extensions = ["sphinx.ext.autodoc", "sphinx_copybutton"]
+extensions = ["sphinx.ext.autodoc", "sphinx_copybutton", "sphinxcontrib.mermaid"]
 
 # -- Options for HTML output -------------------------------------------------
 # https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output
diff --git a/docs/index.rst b/docs/index.rst
@@ -95,3 +95,5 @@ For advanced use cases, it is also possible to complete and customize each step
    installation
    usage
    api
+   architecture
+   benchmark

-Original file line number
+Diff line change
@@ @@ -0,0 +1,4 @@ @@
 +Benchmark
 +====================================
++
 +TODO