Skip to content

Commit 610aa4d

Browse files
Parquet formatter coercion issue (#494)
* Normalize column types * Update unit test * Update changelog * Change return type for _normalize_column_types()
1 parent 469ed2e commit 610aa4d

File tree

3 files changed

+115
-0
lines changed

3 files changed

+115
-0
lines changed

CHANGELOG.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,10 @@
1414

1515
- `SchemaExtractionTemplate` prompt updated to explicitly instruct the LLM not to use `__` as a prefix or suffix in node labels or relationship types.
1616

17+
### Fixed
18+
19+
- Fixed `ValueError` in `Neo4jGraphParquetFormatter` when nodes of the same label have mixed property types (e.g. `str` and `int` for the same property), which caused `pa.Table.from_pylist()` to fail. Mixed-type columns are now coerced to a consistent type before Parquet table creation.
20+
1721
## 1.14.0
1822

1923
### Added

src/neo4j_graphrag/experimental/components/parquet_formatter.py

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -419,6 +419,46 @@ def _relationships_to_rows(
419419

420420
return type_to_rows
421421

422+
@staticmethod
423+
def _normalize_column_types(rows: list[dict[str, Any]]) -> None:
424+
"""Coerce mixed-type columns in *rows* in-place so PyArrow can build the table.
425+
426+
PyArrow infers the column type from the first row; if subsequent rows have a
427+
different Python type for the same column the table creation fails. This method
428+
detects those mismatches and coerces:
429+
- {int, float} -> float (lossless numeric promotion)
430+
- anything else mixed -> str (universal safe fallback)
431+
"""
432+
if len(rows) <= 1:
433+
return
434+
435+
col_types: dict[str, set[type]] = defaultdict(set)
436+
for row in rows:
437+
for key, value in row.items():
438+
if value is not None:
439+
col_types[key].add(type(value))
440+
441+
cols_to_coerce: dict[str, type] = {}
442+
for col, types in col_types.items():
443+
if len(types) <= 1:
444+
continue
445+
target: type = float if types <= {int, float} else str
446+
cols_to_coerce[col] = target
447+
logger.warning(
448+
"Mixed types for property '%s': %s — coercing to %s",
449+
col,
450+
{t.__name__ for t in types},
451+
target.__name__,
452+
)
453+
454+
for row in rows:
455+
for col, target_type in cols_to_coerce.items():
456+
if col in row and row[col] is not None:
457+
try:
458+
row[col] = target_type(row[col])
459+
except (ValueError, TypeError):
460+
row[col] = str(row[col])
461+
422462
def format_parquet(
423463
self,
424464
rows: list[dict[str, Any]],
@@ -440,6 +480,7 @@ def format_parquet(
440480
import pyarrow as pa
441481
import pyarrow.parquet as pq
442482

483+
self._normalize_column_types(rows)
443484
table = pa.Table.from_pylist(rows)
444485
# Write to BytesIO buffer
445486
buffer = BytesIO()

tests/unit/experimental/components/test_kg_writer.py

Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
FilenameCollisionHandler,
2626
)
2727
from neo4j_graphrag.experimental.components.parquet_formatter import (
28+
Neo4jGraphParquetFormatter,
2829
sanitize_parquet_filestem,
2930
)
3031
from neo4j_graphrag.experimental.components.kg_writer import (
@@ -697,3 +698,72 @@ async def test_parquet_writer_run_empty_graph() -> None:
697698
assert stats["nodes_per_label"] == {}
698699
assert stats["rel_per_type"] == {}
699700
assert result.metadata["files"] == []
701+
702+
703+
# ---------------------------------------------------------------------------
704+
# Neo4jGraphParquetFormatter._normalize_column_types
705+
# ---------------------------------------------------------------------------
706+
707+
708+
def test_normalize_column_types_single_row() -> None:
    """A lone row can hold no per-column conflict, so it must pass through untouched."""
    data = [{"age": 30, "name": "Alice"}]
    Neo4jGraphParquetFormatter._normalize_column_types(data)
    expected = [{"age": 30, "name": "Alice"}]
    assert data == expected
712+
713+
714+
def test_normalize_column_types_homogeneous() -> None:
    """Columns whose values already share one type are left as-is."""
    data = [{"age": 30}, {"age": 25}]
    Neo4jGraphParquetFormatter._normalize_column_types(data)
    assert data == [{"age": 30}, {"age": 25}]
718+
719+
720+
def test_normalize_column_types_mixed_str_int() -> None:
    """A str/int mixture falls back to str for every value in the column."""
    data: list[dict[str, Any]] = [{"age": "45"}, {"age": 30}]
    Neo4jGraphParquetFormatter._normalize_column_types(data)
    expected = [{"age": "45"}, {"age": "30"}]
    assert data == expected
724+
725+
726+
def test_normalize_column_types_mixed_int_float() -> None:
    """An int/float mixture is promoted to float rather than str."""
    data: list[dict[str, Any]] = [{"score": 3}, {"score": 3.5}]
    Neo4jGraphParquetFormatter._normalize_column_types(data)
    expected = [{"score": 3.0}, {"score": 3.5}]
    assert data == expected
730+
731+
732+
def test_normalize_column_types_none_ignored() -> None:
    """None values should not influence type detection."""
    data: list[dict[str, Any]] = [{"age": None}, {"age": 30}]
    Neo4jGraphParquetFormatter._normalize_column_types(data)
    expected = [{"age": None}, {"age": 30}]
    assert data == expected
737+
738+
739+
@pytest.mark.asyncio
async def test_parquet_writer_mixed_property_types() -> None:
    """ParquetWriter succeeds when nodes of the same label have mixed property types."""
    pytest.importorskip("pyarrow")
    import pyarrow.parquet as pq

    with tempfile.TemporaryDirectory() as tmpdir:
        output_dir = Path(tmpdir)
        destination = _LocalParquetDestination(output_dir)
        parquet_writer = ParquetWriter(
            nodes_dest=destination,
            relationships_dest=destination,
            collision_handler=FilenameCollisionHandler(),
        )

        # Same label, but `age` is a str on one node and an int on the other.
        patients = [
            Neo4jNode(
                id="p1", label="Patient", properties={"name": "John", "age": "45"}
            ),
            Neo4jNode(
                id="p2", label="Patient", properties={"name": "Jane", "age": 30}
            ),
        ]
        outcome = await parquet_writer.run(
            graph=Neo4jGraph(nodes=patients, relationships=[])
        )

        assert outcome.status == "SUCCESS"
        written = pq.read_table(output_dir / "Patient.parquet")
        assert written.num_rows == 2
        # Both ages should have been coerced to str
        age_values = {v.as_py() for v in written.column("age")}
        assert age_values == {"45", "30"}

0 commit comments

Comments
 (0)