Commit 613bd12: Merge branch 'main' into cli

Author: Gerit Wagner
Parents: 9e43cbb + 97d71d4

File tree: 119 files changed (+879, -244582 lines)


.github/workflows/evaluate.yml (0 additions, 86 deletions)

This file was deleted.

.github/workflows/tests.yml (44 additions, 5 deletions)

```diff
@@ -1,18 +1,57 @@
 name: Run Tests

 on:
-  - push
-  - pull_request
+  push:
+  pull_request:

 jobs:
-  test-full-deps:
+  test:
+    name: Quick tests (${{ matrix.platform }}, py${{ matrix.python-version }})
     strategy:
       matrix:
         platform: [ubuntu-latest, windows-latest]
         python-version: ['3.10', '3.11', '3.12', '3.13']
     runs-on: ${{ matrix.platform }}
     steps:
       - uses: actions/checkout@v4
+        with:
+          submodules: recursive
+          fetch-depth: 0
+
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v4
+        with:
+          python-version: ${{ matrix.python-version }}
+
+      - name: Install uv and dependencies
+        run: |
+          pip install uv
+          uv venv
+          uv pip install -e .[dev] || echo "No dev extra"
+          echo "Dependencies installed successfully"
+
+      - name: Setup git
+        run: |
+          git config --global user.name "CoLRev update"
+          git config --global user.email "actions@users.noreply.github.com"
+          git config --global url.https://github.com/.insteadOf git://github.com/
+
+      - name: Run tests (excluding full_test.py)
+        run: uv run pytest --ignore=tests/full_test.py
+
+  full-test:
+    name: Full test (${{ matrix.platform }}, py${{ matrix.python-version }})
+    needs: test
+    strategy:
+      matrix:
+        platform: [ubuntu-latest, windows-latest]
+        python-version: ['3.10', '3.11', '3.12', '3.13']
+    runs-on: ${{ matrix.platform }}
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          submodules: recursive
+          fetch-depth: 0

       - name: Set up Python ${{ matrix.python-version }}
         uses: actions/setup-python@v4
@@ -32,5 +71,5 @@ jobs:
           git config --global user.email "actions@users.noreply.github.com"
           git config --global url.https://github.com/.insteadOf git://github.com/

-      - name: Run tests
-        run: uv run pytest
+      - name: Run full_test.py
+        run: uv run pytest -q tests/full_test.py
```

.gitmodules (3 additions, 0 deletions)

```diff
@@ -0,0 +1,3 @@
+[submodule "tests/ldd-full-benchmark"]
+	path = tests/ldd-full-benchmark
+	url = git@github.com:CoLRev-Environment/ldd-full-benchmark.git
```

CHANGELOG.md (9 additions, 0 deletions)

```diff
@@ -17,6 +17,15 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0).
 ### Fixed
 -->

+## 0.11.0 - 2025-11-21
+
+- Extract evaluation to separate repository (to be published soon)
+- Blocking: cleanup to ensure consistent use of ID_1 and ID_2
+- Refactoring to prevent FutureWarnings by pandas
+- Extend match conditions for records with missing fields (e.g., based on GROBID extraction)
+- Drop records with empty titles in block instead of prep to prevent subtle errors
+- Prevents same-source merges in connected components
+
 ## 0.10.0 - 2024-11-05

 - Fix and silence pandas Future warnings
```

README.md (12 additions, 8 deletions)

```diff
@@ -8,7 +8,9 @@
 [![pre-commit](https://img.shields.io/badge/pre--commit-enabled-brightgreen?logo=pre-commit&logoColor=white)](https://github.com/pre-commit/pre-commit)
 [![GitHub Actions Workflow Status](https://img.shields.io/github/actions/workflow/status/CoLRev-Environment/bib-dedupe/.github%2Fworkflows%2Ftests.yml?label=tests)](https://github.com/CoLRev-Environment/bib-dedupe/actions/workflows/tests.yml)
 [![GitHub Actions Workflow Status](https://img.shields.io/github/actions/workflow/status/CoLRev-Environment/bib-dedupe/.github%2Fworkflows%2Fdocs.yml?label=docs)](https://github.com/CoLRev-Environment/bib-dedupe/actions/workflows/docs.yml)
-[![GitHub Actions Workflow Status](https://img.shields.io/github/actions/workflow/status/CoLRev-Environment/bib-dedupe/.github%2Fworkflows%2Fevaluate.yml?label=continuous%20evaluation)](https://github.com/CoLRev-Environment/bib-dedupe/actions/workflows/evaluate.yml)
+<!--
+[![GitHub Actions Workflow Status](https://img.shields.io/github/actions/workflow/status/CoLRev-Environment/literature-deduplication-benchmarks/actions/workflows/ldd-full.yml?label=continuous%20evaluation)](https://github.com/CoLRev-Environment/literature-deduplication-benchmarks/actions/workflows/ldd-full.yml)
+-->

 </div>

@@ -19,13 +21,15 @@ Unlike traditional deduplication methods, BibDedupe focuses on entity resolution

 ## Features

-- **Automated Duplicate Linking with Zero False Positives**: BibDedupe automates the duplicate linking process with a focus on eliminating false positives.
-- **Preprocessing Approach**: BibDedupe uses a preprocessing approach that reflects the unique error generation process in academic databases, such as author re-formatting, journal abbreviation or translations.
-- **Entity Resolution**: BibDedupe does not simply delete duplicates, but it links duplicates to resolve the entitity and integrates the data. This allows for validation, and undo operations.
-- **Programmatic Access**: BibDedupe is designed for seamless integration into existing research workflows, providing programmatic access for easy incorporation into scripts and applications.
-- **Transparent and Reproducible Rules**: BibDedupe's blocking and matching rules are transparent and easily reproducible to promote reproducibility in deduplication processes.
-- **Continuous Benchmarking**: Continuous integration tests running on GitHub Actions ensure ongoing benchmarking, maintaining the library's reliability and performance across datasets.
-- **Efficient and Parallel Computation**: BibDedupe implements computations efficiently and in parallel, using appropriate data structures and functions for optimal performance.
+- **Automated duplicate linking with zero false positives**: BibDedupe automates the duplicate linking process with a focus on eliminating false positives.
+- **Preprocessing approach**: BibDedupe uses a preprocessing approach that reflects the unique error generation process in academic databases, such as author re-formatting, journal abbreviation or translations.
+- **Entity resolution**: BibDedupe does not simply delete duplicates, but it links duplicates to resolve the entitity and integrates the data. This allows for validation, and undo operations.
+- **Programmatic access**: BibDedupe is designed for seamless integration into existing research workflows, providing programmatic access for easy incorporation into scripts and applications.
+- **Transparent and reproducible rules**: BibDedupe's blocking and matching rules are transparent and easily reproducible to promote reproducibility in deduplication processes.
+- **Continuous benchmarking**: Continuous integration tests running on GitHub Actions ensure ongoing benchmarking, maintaining the library's reliability and performance across datasets.
+- **Efficient and parallel computation**: BibDedupe implements computations efficiently and in parallel, using appropriate data structures and functions for optimal performance.
+
+Regular benchmarks are available [here](https://github.com/CoLRev-Environment/literature-deduplication-benchmarks).

 ## Documentation
```

bib_dedupe/bib_dedupe.py (1 addition, 1 deletion)

```diff
@@ -166,7 +166,7 @@ def merge(
     if verbosity_level is not None:
         verbose_print.verbosity_level = verbosity_level

-    if matched_df:
+    if matched_df is not None:
        duplicate_id_sets = bib_dedupe.cluster.get_connected_components(matched_df)

        if not duplicate_id_sets:
```
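The `matched_df is not None` fix above guards against a pandas pitfall: truth-testing a DataFrame raises rather than reporting emptiness, so `if matched_df:` would crash whenever a frame is actually passed. A minimal sketch of the failure mode, on invented data (not from the commit):

```python
import pandas as pd

matched_df = pd.DataFrame({"ID_1": ["r1"], "ID_2": ["r2"]})

# `if matched_df:` is ambiguous for pandas objects and raises ValueError.
try:
    if matched_df:
        pass
except ValueError:
    print("truth value is ambiguous")

# The corrected check distinguishes "no frame supplied" from "empty frame":
if matched_df is not None:
    print(f"{len(matched_df)} matched pair(s)")
```

An explicit `None` check also keeps empty-but-present results flowing into the subsequent clustering step, instead of silently skipping them.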

bib_dedupe/block.py (38 additions, 7 deletions)

```diff
@@ -15,6 +15,7 @@
 from bib_dedupe.constants.fields import NUMBER
 from bib_dedupe.constants.fields import PAGES
 from bib_dedupe.constants.fields import SEARCH_SET
+from bib_dedupe.constants.fields import TITLE
 from bib_dedupe.constants.fields import TITLE_SHORT
 from bib_dedupe.constants.fields import VOLUME
 from bib_dedupe.constants.fields import YEAR
@@ -85,7 +86,9 @@ def create_pairs_for_block_fields(

     pairs = (
         non_empty_df.groupby("block_hash", group_keys=True)["ID"]
-        .apply(lambda x: pd.DataFrame(list(combinations(x, 2)), columns=["ID1", "ID2"]))
+        .apply(
+            lambda x: pd.DataFrame(list(combinations(x, 2)), columns=["ID_1", "ID_2"])
+        )
         .reset_index(drop=True)
     )
     pairs["block_rule"] = "-".join(block_fields)
```
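The change in `create_pairs_for_block_fields` renames the pair columns to `ID_1`/`ID_2`. The underlying groupby-plus-combinations pair generation can be sketched in isolation on toy data (the `block_hash` values here are hypothetical placeholders, not the library's real hashes):

```python
from itertools import combinations

import pandas as pd

# Toy records: r1 and r2 share a block hash, so they form one candidate pair;
# r3 sits alone in its block and yields no pair.
df = pd.DataFrame(
    {
        "ID": ["r1", "r2", "r3"],
        "block_hash": ["h1", "h1", "h2"],
    }
)

pairs = (
    df.groupby("block_hash", group_keys=True)["ID"]
    # Within each block, emit all unordered ID pairs.
    .apply(
        lambda x: pd.DataFrame(list(combinations(x, 2)), columns=["ID_1", "ID_2"])
    )
    .reset_index(drop=True)
)
print(pairs)  # single row: ID_1 == "r1", ID_2 == "r2"
```

Because `combinations` only pairs records inside the same block, the candidate set stays far smaller than the full quadratic cross-product over all records.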
```diff
@@ -208,6 +211,23 @@ def block(records_df: pd.DataFrame, cpu: int = -1) -> pd.DataFrame:

     Returns:
         pd.DataFrame: The dataframe after blocking operation.
+
+
+    Output table structure (columns, in order):
+        block_rule,
+        ID_1, ENTRYTYPE_1, author_1, year_1, title_1, volume_1, number_1,
+        pages_1, abstract_1, doi_1, series_1, search_set_1, container_title_1,
+        author_full_1, author_first_1, title_short_1, container_title_short_1,
+        ID_2, ENTRYTYPE_2, author_2, year_2, title_2, volume_2, number_2,
+        pages_2, abstract_2, doi_2, series_2, search_set_2, container_title_2,
+        author_full_2, author_first_2, title_short_2, container_title_short_2
+
+    Column meanings:
+        - ID_1 / ID_2: The two record identifiers that form the pair.
+        - block_rule: Name/description of the blocking rule that surfaced this pair;
+          use an empty string if not applicable.
+        - *_1 / *_2: Field values of the left/right record in the pair, respectively.
+          These mirror the original record schema (e.g., author, year, title, etc.)
     """
     INSTRUCTION = "(please run prep(records_df) and pass the prepared df)"
     assert (
@@ -222,8 +242,16 @@ def block(records_df: pd.DataFrame, cpu: int = -1) -> pd.DataFrame:
     )
     start_time = time.time()

-    pairs_df = pd.DataFrame(columns=["ID1", "ID2", "require_title_overlap"])
-    pairs_df = pairs_df.astype({"ID1": str, "ID2": str, "require_title_overlap": bool})
+    if records_df[TITLE].isnull().any():
+        verbose_print.print(
+            "Warning: Some records have empty title field. These records will not be considered."
+        )
+        records_df = records_df.dropna(subset=[TITLE])
+
+    pairs_df = pd.DataFrame(columns=["ID_1", "ID_2", "require_title_overlap"])
+    pairs_df = pairs_df.astype(
+        {"ID_1": str, "ID_2": str, "require_title_overlap": bool}
+    )
     if cpu == 1:
         for field in block_fields_list:
             pairs_df = pd.concat(
@@ -242,15 +270,15 @@ def block(records_df: pd.DataFrame, cpu: int = -1) -> pd.DataFrame:
         pairs_df = pd.concat(results, ignore_index=True)

     # title overlap is only required when there is no blocked pair that requires it.
-    pairs_df["require_title_overlap"] = pairs_df.groupby(["ID1", "ID2"])[
+    pairs_df["require_title_overlap"] = pairs_df.groupby(["ID_1", "ID_2"])[
         "require_title_overlap"
     ].transform("all")
-    pairs_df = pairs_df.drop_duplicates(subset=["ID1", "ID2"])
+    pairs_df = pairs_df.drop_duplicates(subset=["ID_1", "ID_2"])

     pairs_df = pd.merge(
         pairs_df,
         records_df.add_suffix("_1"),
-        left_on="ID1",
+        left_on="ID_1",
         right_on="ID_1",
         how="left",
         suffixes=("", "_1"),
@@ -259,7 +287,7 @@ def block(records_df: pd.DataFrame, cpu: int = -1) -> pd.DataFrame:
     pairs_df = pd.merge(
         pairs_df,
         records_df.add_suffix("_2"),
-        left_on="ID2",
+        left_on="ID_2",
         right_on="ID_2",
         how="left",
         suffixes=("", "_2"),
@@ -273,4 +301,7 @@ def block(records_df: pd.DataFrame, cpu: int = -1) -> pd.DataFrame:
     verbose_print.print(f"Blocked pairs reduced to {pairs_df.shape[0]:,} pairs")
     end_time = time.time()
     verbose_print.print(f"Block completed after: {end_time - start_time:.2f} seconds")
+
+    pairs_df.drop(columns=["require_title_overlap"], inplace=True)
+
     return pairs_df
```
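The `require_title_overlap` aggregation and the suffixed merges in `block()` can be sketched end-to-end on invented records (the field set is reduced to `title` for brevity; the real output carries the full column list from the docstring):

```python
import pandas as pd

# Candidate pairs, possibly produced by several blocking rules.
# The same pair (a, b) was surfaced by two rules with different requirements.
pairs_df = pd.DataFrame(
    {
        "ID_1": ["a", "a"],
        "ID_2": ["b", "b"],
        "block_rule": ["year-volume", "doi"],
        "require_title_overlap": [True, False],
    }
)

# A pair needs title overlap only if *every* rule that produced it requires it:
# groupby + transform("all") broadcasts the per-pair AND back onto each row.
pairs_df["require_title_overlap"] = pairs_df.groupby(["ID_1", "ID_2"])[
    "require_title_overlap"
].transform("all")
pairs_df = pairs_df.drop_duplicates(subset=["ID_1", "ID_2"])

records_df = pd.DataFrame({"ID": ["a", "b"], "title": ["T1", "T2"]})

# Attach left/right record fields via suffixed merges against the record table.
pairs_df = pd.merge(
    pairs_df, records_df.add_suffix("_1"), left_on="ID_1", right_on="ID_1", how="left"
)
pairs_df = pd.merge(
    pairs_df, records_df.add_suffix("_2"), left_on="ID_2", right_on="ID_2", how="left"
)
print(pairs_df[["ID_1", "ID_2", "require_title_overlap", "title_1", "title_2"]])
```

Since one of the two rules did not require title overlap, the surviving row for (a, b) carries `require_title_overlap == False`, and the merges populate `title_1`/`title_2` from the record table.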
