Skip to content

Commit dc763c2

Browse files
cristofimaCristopher Coronado MoreiraRoopan-Microsoft
authored
feat: json document processor (#1661)
Co-authored-by: Cristopher Coronado Moreira <[email protected]> Co-authored-by: Roopan-Microsoft <[email protected]>
1 parent f7a2ce9 commit dc763c2

File tree

8 files changed

+96
-1
lines changed

8 files changed

+96
-1
lines changed

code/backend/batch/utilities/document_chunking/chunking_strategy.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ class ChunkingStrategy(Enum):
66
PAGE = "page"
77
FIXED_SIZE_OVERLAP = "fixed_size_overlap"
88
PARAGRAPH = "paragraph"
9+
JSON = "json"
910

1011

1112
class ChunkingSettings:
Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
import json
2+
from typing import List
3+
from .document_chunking_base import DocumentChunkingBase
4+
from langchain.text_splitter import RecursiveJsonSplitter
5+
from .chunking_strategy import ChunkingSettings
6+
from ..common.source_document import SourceDocument
7+
8+
9+
class JSONDocumentChunking(DocumentChunkingBase):
    """Chunking strategy that splits a JSON document into smaller JSON fragments."""

    def __init__(self) -> None:
        pass

    def chunk(
        self, documents: List[SourceDocument], chunking: ChunkingSettings
    ) -> List[SourceDocument]:
        """Split the combined JSON content of ``documents`` into chunks.

        The ``content`` of every input document is concatenated, parsed as a
        single JSON value and split with ``RecursiveJsonSplitter`` using
        ``chunking.chunk_size`` as the maximum chunk size.

        Args:
            documents: Source documents whose concatenated ``content`` forms
                one JSON document. The ``source`` of the first document is
                used as the URL of every produced chunk.
            chunking: Chunking settings; only ``chunk_size`` is read here.

        Returns:
            One ``SourceDocument`` per chunk, in splitter order. Each chunk's
            ``offset`` metadata is the total length of the chunk texts that
            precede it. An empty input list yields an empty list.

        Raises:
            json.JSONDecodeError: If the concatenated content is not valid JSON.
        """
        # Nothing to chunk; also avoids an IndexError on documents[0] below.
        if not documents:
            return []

        full_document_content = "".join(
            str(document.content) for document in documents
        )
        document_url = documents[0].source
        json_data = json.loads(full_document_content)
        splitter = RecursiveJsonSplitter(max_chunk_size=chunking.chunk_size)

        # Create a document for each chunk.
        chunked_documents = []
        chunk_offset = 0
        for idx, chunked_content in enumerate(splitter.split_json(json_data)):
            # split_json yields dicts; str() stores the Python repr of the dict
            # (single-quoted keys), which is the format callers currently get.
            chunk_text = str(chunked_content)
            chunked_documents.append(
                SourceDocument.from_metadata(
                    content=chunk_text,
                    document_url=document_url,
                    metadata={"offset": chunk_offset},
                    idx=idx,
                )
            )
            # Advance by the length of the stored text. len() of the dict
            # itself would only count its top-level keys.
            chunk_offset += len(chunk_text)
        return chunked_documents

code/backend/batch/utilities/document_chunking/strategies.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
from .page import PageDocumentChunking
44
from .fixed_size_overlap import FixedSizeOverlapDocumentChunking
55
from .paragraph import ParagraphDocumentChunking
6+
from .json import JSONDocumentChunking
67

78

89
def get_document_chunker(chunking_strategy: str):
@@ -14,5 +15,7 @@ def get_document_chunker(chunking_strategy: str):
1415
return FixedSizeOverlapDocumentChunking()
1516
elif chunking_strategy == ChunkingStrategy.PARAGRAPH.value:
1617
return ParagraphDocumentChunking()
18+
elif chunking_strategy == ChunkingStrategy.JSON.value:
19+
return JSONDocumentChunking()
1720
else:
1821
raise Exception(f"Unknown chunking strategy: {chunking_strategy}")

code/backend/batch/utilities/helpers/config/config_helper.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,7 @@ def get_available_document_types(self) -> list[str]:
6868
"jpg",
6969
"png",
7070
"docx",
71+
"json"
7172
}
7273
if self.env_helper.USE_ADVANCED_IMAGE_PROCESSING:
7374
document_types.update(ADVANCED_IMAGE_PROCESSING_FILE_TYPES)

code/backend/batch/utilities/helpers/config/default.json

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -97,6 +97,17 @@
9797
"strategy": "docx"
9898
}
9999
},
100+
{
101+
"document_type": "json",
102+
"chunking": {
103+
"strategy": "json",
104+
"size": 500,
105+
"overlap": 100
106+
},
107+
"loading": {
108+
"strategy": "web"
109+
}
110+
},
100111
{
101112
"document_type": "jpg",
102113
"chunking": {

code/tests/utilities/helpers/test_config_helper.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -223,6 +223,11 @@ def test_default_config_when_use_advanced_image_processing(env_helper_mock):
223223
"chunking": expected_chunking,
224224
"loading": {"strategy": "docx"},
225225
},
226+
{
227+
"document_type": "json",
228+
"chunking": {"strategy": "json", "size": 500, "overlap": 100},
229+
"loading": {"strategy": "web"},
230+
},
226231
{"document_type": "jpeg", "use_advanced_image_processing": True},
227232
{"document_type": "jpg", "use_advanced_image_processing": True},
228233
{"document_type": "png", "use_advanced_image_processing": True},
@@ -420,7 +425,7 @@ def test_get_available_document_types(config: Config):
420425

421426
# then
422427
assert sorted(document_types) == sorted(
423-
["txt", "pdf", "url", "html", "htm", "md", "jpeg", "jpg", "png", "docx"]
428+
["txt", "pdf", "url", "html", "htm", "md", "jpeg", "jpg", "png", "docx", "json"]
424429
)
425430

426431

@@ -448,6 +453,7 @@ def test_get_available_document_types_when_advanced_image_processing_enabled(
448453
"docx",
449454
"tiff",
450455
"bmp",
456+
"json"
451457
]
452458
)
453459

@@ -471,6 +477,7 @@ def test_get_available_chunking_strategies(config: Config):
471477
"page",
472478
"fixed_size_overlap",
473479
"paragraph",
480+
"json"
474481
]
475482
)
476483

code/tests/utilities/helpers/test_document_chunking_helper.py

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -109,3 +109,37 @@ def test_document_chunking_fixed_size_overlap():
109109
chunked_documents[6].content
110110
== " shows how the different chunking strategies work now!"
111111
)
112+
113+
114+
def test_document_chunking_json():
    # Verify the JSON chunking strategy: with a 175-character limit the two
    # top-level objects of the sample document land in separate chunks.
    settings = ChunkingSettings(
        {"strategy": ChunkingStrategy.JSON, "size": 175, "overlap": 0}
    )

    # Whitespace inside the literal is irrelevant to the JSON parser.
    sample_json = """
{
"window":{
"title":"Sample Widget",
"name":"main_window",
"width":500,
"height":500
},
"image":{
"src":"Images/Sun.png",
"name":"sun1",
"hOffset":250,
"vOffset":250,
"alignment":"center"
}
}
"""
    source_documents = [
        SourceDocument(
            content=sample_json,
            source="https://example.com/sample_document.json",
        ),
    ]

    # Chunk content is the str() of each split dict, hence the single quotes.
    expected_contents = [
        "{'window': {'title': 'Sample Widget', 'name': 'main_window', 'width': 500, 'height': 500}}",
        "{'image': {'src': 'Images/Sun.png', 'name': 'sun1', 'hOffset': 250, 'vOffset': 250, 'alignment': 'center'}}",
    ]

    chunker = DocumentChunking()
    result = chunker.chunk(source_documents, settings)

    assert len(result) == 2
    for produced, expected in zip(result, expected_contents):
        assert produced.content == expected

docs/supported_file_types.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,3 +12,4 @@ Out-of-the-box, you can upload the following file types:
1212
* HTML
1313
* MD (Markdown)
1414
* DOCX
15+
* JSON

0 commit comments

Comments
 (0)