NOTE(review): reconstructed from a whitespace-collapsed paste. Indentation of context
lines and blank context lines are best guesses, and the blob hashes on the "index"
lines are those of the original patch - verify before applying.
diff --git a/spacy_layout/util.py b/spacy_layout/util.py
index 6b59ddb..65b7459 100644
--- a/spacy_layout/util.py
+++ b/spacy_layout/util.py
@@ -33,10 +33,41 @@ def decode_obj(obj: Any, chain: Callable | None = None) -> Any:
 def encode_df(obj: Any, chain: Callable | None = None) -> Any:
     """Convert pandas.DataFrame for serialization."""
     if isinstance(obj, DataFrame):
-        return {"data": obj.to_dict(), TYPE_ATTR: "DataFrame"}
+        # ensure unique column names, as data will be lost otherwise
+        df = _ensure_unique_columns(obj)
+        return {"data": df.to_dict(), TYPE_ATTR: "DataFrame"}
     return obj if chain is None else chain(obj)
 
 
+def _ensure_unique_columns(df: DataFrame) -> DataFrame:
+    """Return ``df`` with duplicate column labels made unique.
+
+    Repeated labels get a " (n)" suffix counting occurrences ("Value",
+    "Value (2)", ...). A suffixed label that is already in use is skipped,
+    and the input frame is never modified.
+    """
+    if df.columns.is_unique:
+        return df
+    taken = set(df.columns)
+    seen_cols = {}
+    new_cols = []
+    for col_name in df.columns:
+        if col_name not in seen_cols:
+            seen_cols[col_name] = 1
+            new_cols.append(col_name)
+            continue
+        # bump the counter until the suffixed label clashes with nothing
+        while True:
+            seen_cols[col_name] += 1
+            candidate = f"{col_name} ({seen_cols[col_name]})"
+            if candidate not in taken:
+                break
+        taken.add(candidate)
+        new_cols.append(candidate)
+    # set_axis returns a new frame, leaving the caller's columns untouched
+    return df.set_axis(new_cols, axis=1)
+
+
 def decode_df(obj: Any, chain: Callable | None = None) -> Any:
     """Load pandas.DataFrame from serialized data."""
     if isinstance(obj, dict) and obj.get(TYPE_ATTR) == "DataFrame":
diff --git a/tests/data/duplicate_columns.pdf b/tests/data/duplicate_columns.pdf
new file mode 100644
index 0000000..50255a4
Binary files /dev/null and b/tests/data/duplicate_columns.pdf differ
diff --git a/tests/test_general.py b/tests/test_general.py
index 132c827..27dc80b 100644
--- a/tests/test_general.py
+++ b/tests/test_general.py
@@ -20,6 +20,7 @@ PDF_SIMPLE_BYTES = PDF_SIMPLE.open("rb").read()
 
 PDF_TABLE = Path(__file__).parent / "data" / "table.pdf"
 PDF_INDEX = Path(__file__).parent / "data" / "table_document_index.pdf"
+PDF_DUP_COL = Path(__file__).parent / "data" / "duplicate_columns.pdf"
 
 
 @pytest.fixture
@@ -213,3 +214,15 @@ def test_serialize_roundtrip(path, nlp):
     table_before = before._.get(layout.attrs.span_data)
     table_after = after._.get(layout.attrs.span_data)
     assert_frame_equal(table_before, table_after)
+
+
+@pytest.mark.parametrize("path", [PDF_DUP_COL])
+def test_duplicate_columns(path, nlp):
+    # Parsing keeps the duplicate headers; the DocBin round trip renames them.
+    parsed = spaCyLayout(nlp)(path)
+    parsed_cols = list(parsed._.tables[0]._.data.columns)
+    assert parsed_cols == ['Index', 'Value', 'Value', 'Index', 'Value', 'Value']
+    packed = DocBin(docs=[parsed], store_user_data=True)
+    restored = list(packed.get_docs(nlp.vocab))[0]
+    restored_cols = list(restored._.tables[0]._.data.columns)
+    assert restored_cols == ['Index', 'Value', 'Value (2)', 'Index (2)', 'Value (3)', 'Value (4)']