Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 17 additions & 1 deletion spacy_layout/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,10 +33,26 @@ def decode_obj(obj: Any, chain: Callable | None = None) -> Any:
def encode_df(obj: Any, chain: Callable | None = None) -> Any:
"""Convert pandas.DataFrame for serialization."""
if isinstance(obj, DataFrame):
return {"data": obj.to_dict(), TYPE_ATTR: "DataFrame"}
# ensure unique column names, as data will be lost otherwise
df = _ensure_unique_columns(obj)
return {"data": df.to_dict(), TYPE_ATTR: "DataFrame"}
return obj if chain is None else chain(obj)


def _ensure_unique_columns(df: DataFrame) -> DataFrame:
    """Return a frame whose column labels are all distinct.

    The first occurrence of a label is kept as-is. Every later occurrence
    is renamed to "<label> (<n>)", counting up from 2. Numbers that would
    clash with a label already present in the frame are skipped, so the
    result is guaranteed to be duplicate-free.

    df: The frame to check.
    RETURNS: df itself when its columns are already unique, otherwise a
        relabelled copy. The input frame is never modified.
    """
    if df.columns.is_unique:
        return df
    occurrences = {}
    # Reserve every existing label up front so a generated name such as
    # "Value (2)" can never shadow a real column with that exact name.
    taken = set(df.columns)
    new_cols = []
    for col_name in df.columns:
        if col_name not in occurrences:
            occurrences[col_name] = 1
            new_cols.append(col_name)
            continue
        while True:
            occurrences[col_name] += 1
            candidate = f"{col_name} ({occurrences[col_name]})"
            if candidate not in taken:
                break
        taken.add(candidate)
        new_cols.append(candidate)
    # Relabel a copy: assigning to df.columns would rename the columns of
    # the caller's table as a side effect of serializing it.
    result = df.copy()
    result.columns = new_cols
    return result


def decode_df(obj: Any, chain: Callable | None = None) -> Any:
"""Load pandas.DataFrame from serialized data."""
if isinstance(obj, dict) and obj.get(TYPE_ATTR) == "DataFrame":
Expand Down
Binary file added tests/data/duplicate_columns.pdf
Binary file not shown.
13 changes: 13 additions & 0 deletions tests/test_general.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
# Raw bytes of the simple PDF fixture. read_bytes() opens, reads and closes
# the file in one call, so no file handle is left open at import time.
PDF_SIMPLE_BYTES = PDF_SIMPLE.read_bytes()
# Further PDF fixtures, stored in the "data" directory next to this module.
PDF_TABLE = Path(__file__).parent / "data" / "table.pdf"
PDF_INDEX = Path(__file__).parent / "data" / "table_document_index.pdf"
PDF_DUP_COL = Path(__file__).parent / "data" / "duplicate_columns.pdf"


@pytest.fixture
Expand Down Expand Up @@ -213,3 +214,15 @@ def test_serialize_roundtrip(path, nlp):
table_before = before._.get(layout.attrs.span_data)
table_after = after._.get(layout.attrs.span_data)
assert_frame_equal(table_before, table_after)


@pytest.mark.parametrize("path", [PDF_DUP_COL])
def test_duplicate_columns(path, nlp):
    """Repeated table headers come back numbered after a DocBin round trip."""
    parsed_columns = ['Index', 'Value', 'Value', 'Index', 'Value', 'Value']
    restored_columns = ['Index', 'Value', 'Value (2)', 'Index (2)', 'Value (3)', 'Value (4)']

    pipeline = spaCyLayout(nlp)
    original = pipeline(path)
    # The freshly parsed table still carries the repeated header names.
    assert list(original._.tables[0]._.data.columns) == parsed_columns

    # Pack the doc with its user data stored, then load the first doc back.
    packed = DocBin(docs=[original], store_user_data=True)
    restored_docs = list(packed.get_docs(nlp.vocab))
    restored = restored_docs[0]
    assert list(restored._.tables[0]._.data.columns) == restored_columns
Copy link
Contributor Author

@svlandeg svlandeg Apr 3, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

On `main`, the result would be:

['Index', 'Value']

i.e., four of the six columns would have been silently dropped from the serialized output.