Skip to content

Commit fdd86e3

Browse files
witlat and eyurtsev authored
fix: Change JSON loader to be able to handle UTF-8-BOM files (#138)
The current parser fails to ingest files that were encoded with the BOM bytes at the start. This is common for files saved on Windows and has been an issue for datasets where I can't guarantee plain, BOM-less UTF-8 encoding (the Unix default). As far as I'm aware, decoding with utf-8-sig has no downsides when used on plain UTF-8 beyond a small per-file processing overhead to check for the 3 BOM bytes at the start, but it enables the code to correctly open files that have the BOM prefix. --------- Co-authored-by: Eugene Yurtsev <[email protected]>
1 parent 7999e3d commit fdd86e3

File tree

2 files changed

+35
-2
lines changed

2 files changed

+35
-2
lines changed

libs/community/langchain_community/document_loaders/json_loader.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -136,15 +136,17 @@ def lazy_load(self) -> Iterator[Document]:
136136
"""Load and return documents from the JSON file."""
137137
index = 0
138138
if self._json_lines:
139-
with self.file_path.open(encoding="utf-8") as f:
139+
with self.file_path.open(encoding="utf-8-sig") as f:
140140
for line in f:
141141
line = line.strip()
142142
if line:
143143
for doc in self._parse(line, index):
144144
yield doc
145145
index += 1
146146
else:
147-
for doc in self._parse(self.file_path.read_text(encoding="utf-8"), index):
147+
for doc in self._parse(
148+
self.file_path.read_text(encoding="utf-8-sig"), index
149+
):
148150
yield doc
149151
index += 1
150152

libs/community/tests/unit_tests/document_loaders/test_json_loader.py

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -440,3 +440,34 @@ def _metadata_func(record: dict, metadata: dict) -> dict:
440440
result = loader.load()
441441

442442
assert result == expected_docs
443+
444+
445+
def test_load_json_with_utf8_bom() -> None:
446+
import tempfile
447+
448+
with tempfile.NamedTemporaryFile(
449+
mode="w", suffix=".json", delete=False, encoding="utf-8-sig"
450+
) as temp_file:
451+
temp_file.write('[{"text": "value1"}, {"text": "value2"}]')
452+
temp_file_path = temp_file.name
453+
454+
try:
455+
expected_docs = [
456+
Document(
457+
page_content="value1",
458+
metadata={"source": temp_file_path, "seq_num": 1},
459+
),
460+
Document(
461+
page_content="value2",
462+
metadata={"source": temp_file_path, "seq_num": 2},
463+
),
464+
]
465+
466+
loader = JSONLoader(
467+
file_path=temp_file_path, jq_schema=".[].text", text_content=True
468+
)
469+
result = loader.load()
470+
471+
assert result == expected_docs
472+
finally:
473+
Path(temp_file_path).unlink()

0 commit comments

Comments
 (0)