Skip to content

Commit 1b4d1d3

Browse files
committed
Fix memory issue in excel_parser
1 parent e80c173 commit 1b4d1d3

File tree

1 file changed

+24
-3
lines changed

1 file changed

+24
-3
lines changed

airbyte_cdk/sources/file_based/file_types/excel_parser.py

Lines changed: 24 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,11 @@
55
import logging
66
from io import IOBase
77
from pathlib import Path
8+
import sys
9+
import tempfile
10+
import io
811
from typing import Any, Dict, Iterable, Mapping, Optional, Tuple, Union
12+
from collections import deque
913

1014
import orjson
1115
import pandas as pd
@@ -31,6 +35,23 @@
3135
from airbyte_cdk.sources.file_based.schema_helpers import SchemaType
3236

3337

38+
39+
def iter_records_via_tempfile(df: pd.DataFrame):
    """
    Yield the rows of ``df`` one at a time as parsed JSON values.

    The frame is serialized with ``DataFrame.to_json`` as newline-delimited
    JSON (``orient="records"``, ISO date format, microsecond date unit) into
    a temporary file opened in UTF-8 text mode. The file is then rewound and
    read back line by line; blank lines are skipped and every other line is
    parsed with ``orjson.loads``.

    Because this is a generator, the temporary file stays open until
    iteration finishes or the generator is closed. It is removed when it is
    closed (``delete=True``).
    """
    json_options = {
        "orient": "records",
        "lines": True,
        "date_format": "iso",
        "date_unit": "us",
    }
    spool = tempfile.NamedTemporaryFile(mode="w+", encoding="utf-8", delete=True)
    with spool:
        # pandas writes one JSON document per line straight into the file handle.
        df.to_json(spool, **json_options)
        # Rewind so the read pass starts at the first record.
        spool.seek(0)
        for raw_line in spool:  # raw_line is str
            if not raw_line.strip():
                continue
            yield orjson.loads(raw_line)
53+
54+
3455
class ExcelParser(FileTypeParser):
3556
ENCODING = None
3657

@@ -118,9 +139,9 @@ def parse_records(
118139
# DataFrame.to_dict() returns datetime values as pandas.Timestamp objects, which are not serializable by orjson
119140
# DataFrame.to_json() returns string with datetime values serialized to iso8601 with microseconds to align with pydantic behavior
120141
# see PR description: https://github.com/airbytehq/airbyte/pull/44444/
121-
yield from orjson.loads(
122-
df.to_json(orient="records", date_format="iso", date_unit="us")
123-
)
142+
for index, row in df.iterrows():
143+
# Serialize each row (a Series) to a JSON string, then parse it back with orjson
144+
yield orjson.loads(row.to_json(date_format="iso", date_unit="us"))
124145

125146
except Exception as exc:
126147
# Raise a RecordParseError if any exception occurs during parsing

0 commit comments

Comments
 (0)