fix: Change JSON loader to be able to handle UTF-8-BOM files (#138)

witlat · eyurtsev · web-flow · commit fdd86e3b8c49 · 2025-07-01T19:35:07.000Z
The current parser fails to ingest files that start with the UTF-8 byte
order mark (BOM). This is common for files saved on Windows and has been
an issue for datasets where I can't ensure the BOM-less UTF-8 encoding
that is the default on Unix.

As far as I'm aware, decoding with utf-8-sig has no downsides when used
on plain UTF-8 beyond a small per-file processing overhead to check for
the 3 BOM bytes at the start, but it enables the code to correctly open
files that have the BOM prefix.

---------

Co-authored-by: Eugene Yurtsev <eyurtsev@gmail.com>
diff --git a/libs/community/langchain_community/document_loaders/json_loader.py b/libs/community/langchain_community/document_loaders/json_loader.py
@@ -136,15 +136,17 @@ def lazy_load(self) -> Iterator[Document]:
         """Load and return documents from the JSON file."""
         index = 0
         if self._json_lines:
-            with self.file_path.open(encoding="utf-8") as f:
+            with self.file_path.open(encoding="utf-8-sig") as f:
                 for line in f:
                     line = line.strip()
                     if line:
                         for doc in self._parse(line, index):
                             yield doc
                             index += 1
         else:
-            for doc in self._parse(self.file_path.read_text(encoding="utf-8"), index):
+            for doc in self._parse(
+                self.file_path.read_text(encoding="utf-8-sig"), index
+            ):
                 yield doc
                 index += 1
 
diff --git a/libs/community/tests/unit_tests/document_loaders/test_json_loader.py b/libs/community/tests/unit_tests/document_loaders/test_json_loader.py
@@ -440,3 +440,34 @@ def _metadata_func(record: dict, metadata: dict) -> dict:
     result = loader.load()
 
     assert result == expected_docs
+
+
+def test_load_json_with_utf8_bom() -> None:
+    import tempfile
+
+    with tempfile.NamedTemporaryFile(
+        mode="w", suffix=".json", delete=False, encoding="utf-8-sig"
+    ) as temp_file:
+        temp_file.write('[{"text": "value1"}, {"text": "value2"}]')
+        temp_file_path = temp_file.name
+
+    try:
+        expected_docs = [
+            Document(
+                page_content="value1",
+                metadata={"source": temp_file_path, "seq_num": 1},
+            ),
+            Document(
+                page_content="value2",
+                metadata={"source": temp_file_path, "seq_num": 2},
+            ),
+        ]
+
+        loader = JSONLoader(
+            file_path=temp_file_path, jq_schema=".[].text", text_content=True
+        )
+        result = loader.load()
+
+        assert result == expected_docs
+    finally:
+        Path(temp_file_path).unlink()