explosion · svlandeg · Apr 3, 2025 · Apr 3, 2025 · Apr 3, 2025 · svlandeg
diff --git a/spacy_layout/util.py b/spacy_layout/util.py
@@ -33,10 +33,26 @@ def decode_obj(obj: Any, chain: Callable | None = None) -> Any:
 def encode_df(obj: Any, chain: Callable | None = None) -> Any:
     """Convert pandas.DataFrame for serialization."""
     if isinstance(obj, DataFrame):
-        return {"data": obj.to_dict(), TYPE_ATTR: "DataFrame"}
+        # to_dict() is keyed by column name, so columns sharing a name would
+        # collapse into one entry and lose data; make the names unique first.
+        df = _ensure_unique_columns(obj)
+        return {"data": df.to_dict(), TYPE_ATTR: "DataFrame"}
     return obj if chain is None else chain(obj)
 
 
+def _ensure_unique_columns(df: DataFrame) -> DataFrame:
+    """Return a copy of ``df`` whose column labels are all unique.
+
+    The first occurrence of a label is kept unchanged. Each later duplicate
+    is renamed to ``"<label> (<n>)"``, where ``n`` counts the occurrences of
+    that label starting at 2. If a generated name is already taken (e.g. the
+    columns ``Value``, ``Value (2)``, ``Value``), ``n`` keeps increasing
+    until an unused name is found.
+
+    The input frame is not modified, so serializing a table does not rename
+    the columns of the caller's DataFrame as a side effect.
+    """
+    counts = {}  # label -> highest suffix number handed out so far
+    taken = set()  # every label already emitted into new_cols
+    new_cols = []
+    for col_name in df.columns:
+        new_name = col_name
+        # Keep bumping the suffix: a generated name may clash with a label
+        # that already exists verbatim in the frame.
+        while new_name in taken:
+            counts[col_name] = counts.get(col_name, 1) + 1
+            new_name = f"{col_name} ({counts[col_name]})"
+        taken.add(new_name)
+        new_cols.append(new_name)
+    # set_axis returns a new DataFrame instead of relabelling df in place.
+    return df.set_axis(new_cols, axis=1)
+
+
 def decode_df(obj: Any, chain: Callable | None = None) -> Any:
     """Load pandas.DataFrame from serialized data."""
     if isinstance(obj, dict) and obj.get(TYPE_ATTR) == "DataFrame":

diff --git a/tests/data/duplicate_columns.pdf b/tests/data/duplicate_columns.pdf
diff --git a/tests/test_general.py b/tests/test_general.py
@@ -20,6 +20,7 @@
 PDF_SIMPLE_BYTES = PDF_SIMPLE.open("rb").read()
 PDF_TABLE = Path(__file__).parent / "data" / "table.pdf"
 PDF_INDEX = Path(__file__).parent / "data" / "table_document_index.pdf"
+# Fixture whose first table is expected to have repeated column headers
+# (see test_duplicate_columns).
+PDF_DUP_COL = Path(__file__).parent / "data" / "duplicate_columns.pdf"
 
 
 @pytest.fixture
@@ -213,3 +214,15 @@ def test_serialize_roundtrip(path, nlp):
         table_before = before._.get(layout.attrs.span_data)
         table_after = after._.get(layout.attrs.span_data)
         assert_frame_equal(table_before, table_after)
+
+
+@pytest.mark.parametrize("path", [PDF_DUP_COL])
+def test_duplicate_columns(path, nlp):
+    """Check that repeated table headers get numbered by a DocBin round trip."""
+    parsed_cols = ['Index', 'Value', 'Value', 'Index', 'Value', 'Value']
+    restored_cols = [
+        'Index', 'Value', 'Value (2)', 'Index (2)', 'Value (3)', 'Value (4)'
+    ]
+
+    # The freshly parsed table still carries the repeated headers.
+    source_doc = spaCyLayout(nlp)(path)
+    source_table = source_doc._.tables[0]._.data
+    assert list(source_table.columns) == parsed_cols
+
+    # Round-trip the doc through a DocBin that stores user data.
+    packed = DocBin(docs=[source_doc], store_user_data=True)
+    restored_docs = list(packed.get_docs(nlp.vocab))
+    restored_table = restored_docs[0]._.tables[0]._.data
+    assert list(restored_table.columns) == restored_cols