feat: json document processor (#1661)

cristofima · Cristopher Coronado Moreira · Roopan-Microsoft · web-flow · commit dc763c2f5466 · 2025-04-16T09:16:57.000+05:30
Co-authored-by: Cristopher Coronado Moreira <crcorona@pichincha.com>
Co-authored-by: Roopan-Microsoft <168007406+Roopan-Microsoft@users.noreply.github.com>
diff --git a/code/backend/batch/utilities/document_chunking/chunking_strategy.py b/code/backend/batch/utilities/document_chunking/chunking_strategy.py
@@ -6,6 +6,7 @@ class ChunkingStrategy(Enum):
     PAGE = "page"
     FIXED_SIZE_OVERLAP = "fixed_size_overlap"
     PARAGRAPH = "paragraph"
+    JSON = "json"
 
 
 class ChunkingSettings:
diff --git a/code/backend/batch/utilities/document_chunking/json.py b/code/backend/batch/utilities/document_chunking/json.py
@@ -0,0 +1,37 @@
+import json
+from typing import List
+from .document_chunking_base import DocumentChunkingBase
+from langchain.text_splitter import RecursiveJsonSplitter
+from .chunking_strategy import ChunkingSettings
+from ..common.source_document import SourceDocument
+
+
+class JSONDocumentChunking(DocumentChunkingBase):
+    def __init__(self) -> None:
+        pass
+
+    def chunk(
+        self, documents: List[SourceDocument], chunking: ChunkingSettings
+    ) -> List[SourceDocument]:
+        full_document_content = "".join(
+            list(map(lambda document: str(document.content), documents))
+        )
+        document_url = documents[0].source
+        json_data = json.loads(full_document_content)
+        splitter = RecursiveJsonSplitter(max_chunk_size=chunking.chunk_size)
+        chunked_content_list = splitter.split_json(json_data)
+        # Create document for each chunk
+        documents = []
+        chunk_offset = 0
+        for idx, chunked_content in enumerate(chunked_content_list):
+            documents.append(
+                SourceDocument.from_metadata(
+                    content=str(chunked_content),
+                    document_url=document_url,
+                    metadata={"offset": chunk_offset},
+                    idx=idx,
+                )
+            )
+
+            chunk_offset += len(chunked_content)
+        return documents
diff --git a/code/backend/batch/utilities/document_chunking/strategies.py b/code/backend/batch/utilities/document_chunking/strategies.py
@@ -3,6 +3,7 @@
 from .page import PageDocumentChunking
 from .fixed_size_overlap import FixedSizeOverlapDocumentChunking
 from .paragraph import ParagraphDocumentChunking
+from .json import JSONDocumentChunking
 
 
 def get_document_chunker(chunking_strategy: str):
@@ -14,5 +15,7 @@ def get_document_chunker(chunking_strategy: str):
         return FixedSizeOverlapDocumentChunking()
     elif chunking_strategy == ChunkingStrategy.PARAGRAPH.value:
         return ParagraphDocumentChunking()
+    elif chunking_strategy == ChunkingStrategy.JSON.value:
+        return JSONDocumentChunking()
     else:
         raise Exception(f"Unknown chunking strategy: {chunking_strategy}")
diff --git a/code/backend/batch/utilities/helpers/config/config_helper.py b/code/backend/batch/utilities/helpers/config/config_helper.py
@@ -68,6 +68,7 @@ def get_available_document_types(self) -> list[str]:
             "jpg",
             "png",
             "docx",
+            "json"
         }
         if self.env_helper.USE_ADVANCED_IMAGE_PROCESSING:
             document_types.update(ADVANCED_IMAGE_PROCESSING_FILE_TYPES)
diff --git a/code/backend/batch/utilities/helpers/config/default.json b/code/backend/batch/utilities/helpers/config/default.json
@@ -97,6 +97,17 @@
         "strategy": "docx"
       }
     },
+    {
+      "document_type": "json",
+      "chunking": {
+        "strategy": "json",
+        "size": 500,
+        "overlap": 100
+      },
+      "loading": {
+        "strategy": "web"
+      }
+    },
     {
       "document_type": "jpg",
       "chunking": {
diff --git a/code/tests/utilities/helpers/test_config_helper.py b/code/tests/utilities/helpers/test_config_helper.py
@@ -223,6 +223,11 @@ def test_default_config_when_use_advanced_image_processing(env_helper_mock):
             "chunking": expected_chunking,
             "loading": {"strategy": "docx"},
         },
+        {
+            "document_type": "json",
+            "chunking": {"strategy": "json", "size": 500, "overlap": 100},
+            "loading": {"strategy": "web"},
+        },
         {"document_type": "jpeg", "use_advanced_image_processing": True},
         {"document_type": "jpg", "use_advanced_image_processing": True},
         {"document_type": "png", "use_advanced_image_processing": True},
@@ -420,7 +425,7 @@ def test_get_available_document_types(config: Config):
 
     # then
     assert sorted(document_types) == sorted(
-        ["txt", "pdf", "url", "html", "htm", "md", "jpeg", "jpg", "png", "docx"]
+        ["txt", "pdf", "url", "html", "htm", "md", "jpeg", "jpg", "png", "docx", "json"]
     )
 
 
@@ -448,6 +453,7 @@ def test_get_available_document_types_when_advanced_image_processing_enabled(
             "docx",
             "tiff",
             "bmp",
+            "json"
         ]
     )
 
@@ -471,6 +477,7 @@ def test_get_available_chunking_strategies(config: Config):
             "page",
             "fixed_size_overlap",
             "paragraph",
+            "json"
         ]
     )
 
diff --git a/code/tests/utilities/helpers/test_document_chunking_helper.py b/code/tests/utilities/helpers/test_document_chunking_helper.py
@@ -109,3 +109,37 @@ def test_document_chunking_fixed_size_overlap():
         chunked_documents[6].content
         == " shows how the different chunking strategies work now!"
     )
+
+
+def test_document_chunking_json():
+    # Test json chunking strategy
+    chunking = ChunkingSettings({"strategy": ChunkingStrategy.JSON, "size": 175, "overlap": 0})
+
+    json_documents = [
+        SourceDocument(
+            content="""
+            {
+                "window":{
+                    "title":"Sample Widget",
+                    "name":"main_window",
+                    "width":500,
+                    "height":500
+                },
+                "image":{
+                    "src":"Images/Sun.png",
+                    "name":"sun1",
+                    "hOffset":250,
+                    "vOffset":250,
+                    "alignment":"center"
+                }
+            }
+            """,
+            source="https://example.com/sample_document.json",
+        ),
+    ]
+
+    document_chunking = DocumentChunking()
+    chunked_documents = document_chunking.chunk(json_documents, chunking)
+    assert len(chunked_documents) == 2
+    assert chunked_documents[0].content == "{'window': {'title': 'Sample Widget', 'name': 'main_window', 'width': 500, 'height': 500}}"
+    assert chunked_documents[1].content == "{'image': {'src': 'Images/Sun.png', 'name': 'sun1', 'hOffset': 250, 'vOffset': 250, 'alignment': 'center'}}"
diff --git a/docs/supported_file_types.md b/docs/supported_file_types.md
@@ -12,3 +12,4 @@ Out-of-the-box, you can upload the following file types:
 * HTML
 * MD (Markdown)
 * DOCX
+* JSON

Original file line number	Diff line number	Diff line change
`@@ -68,6 +68,7 @@ def get_available_document_types(self) -> list[str]:`
`68`	`68`	`"jpg",`
`69`	`69`	`"png",`
`70`	`70`	`"docx",`
	`71`	`+ "json"`
`71`	`72`	`}`
`72`	`73`	`if self.env_helper.USE_ADVANCED_IMAGE_PROCESSING:`
`73`	`74`	`document_types.update(ADVANCED_IMAGE_PROCESSING_FILE_TYPES)`
Original file line number	Diff line number	Diff line change
`@@ -223,6 +223,11 @@ def test_default_config_when_use_advanced_image_processing(env_helper_mock):`
`223`	`223`	`"chunking": expected_chunking,`
`224`	`224`	`"loading": {"strategy": "docx"},`
`225`	`225`	`},`
	`226`	`+ {`
	`227`	`+ "document_type": "json",`
	`228`	`+ "chunking": {"strategy": "json", "size": 500, "overlap": 100},`
	`229`	`+ "loading": {"strategy": "web"},`
	`230`	`+ },`
`226`	`231`	`{"document_type": "jpeg", "use_advanced_image_processing": True},`
`227`	`232`	`{"document_type": "jpg", "use_advanced_image_processing": True},`
`228`	`233`	`{"document_type": "png", "use_advanced_image_processing": True},`
`@@ -420,7 +425,7 @@ def test_get_available_document_types(config: Config):`
`420`	`425`
`421`	`426`	`# then`
`422`	`427`	`assert sorted(document_types) == sorted(`
`423`		`- ["txt", "pdf", "url", "html", "htm", "md", "jpeg", "jpg", "png", "docx"]`
	`428`	`+ ["txt", "pdf", "url", "html", "htm", "md", "jpeg", "jpg", "png", "docx", "json"]`
`424`	`429`	`)`
`425`	`430`
`426`	`431`
`@@ -448,6 +453,7 @@ def test_get_available_document_types_when_advanced_image_processing_enabled(`
`448`	`453`	`"docx",`
`449`	`454`	`"tiff",`
`450`	`455`	`"bmp",`
	`456`	`+ "json"`
`451`	`457`	`]`
`452`	`458`	`)`
`453`	`459`
`@@ -471,6 +477,7 @@ def test_get_available_chunking_strategies(config: Config):`
`471`	`477`	`"page",`
`472`	`478`	`"fixed_size_overlap",`
`473`	`479`	`"paragraph",`
	`480`	`+ "json"`
`474`	`481`	`]`
`475`	`482`	`)`
`476`	`483`
-Original file line number
+Diff line change
 * HTML
 * MD (Markdown)
 * DOCX
 +* JSON