Commit a6d1009

feat: add docling document parser (#509)
1 parent 4fe3025 commit a6d1009

File tree

8 files changed: +1106 −52 lines changed


.libraries-whitelist.txt

Lines changed: 2 additions & 1 deletion

```diff
@@ -8,4 +8,5 @@ rerankers
 py_rust_stemmers
 mirakuru
 psycopg
-pytest-postgresql
+pytest-postgresql
+python-bidi
```

docs/api_reference/document_search/ingest/parsers.md

Lines changed: 2 additions & 0 deletions

```diff
@@ -8,4 +8,6 @@
 
 ::: ragbits.document_search.ingestion.parsers.base.ImageDocumentParser
 
+::: ragbits.document_search.ingestion.parsers.docling.DoclingDocumentParser
+
 ::: ragbits.document_search.ingestion.parsers.unstructured.UnstructuredDocumentParser
```

packages/ragbits-document-search/CHANGELOG.md

Lines changed: 1 addition & 0 deletions

```diff
@@ -2,6 +2,7 @@
 
 ## Unreleased
 
+- add docling document parser (#509)
 - move sources from ragbits-document-search to ragbits-core (#496)
 - fix union types validation in element enricher (#499)
```

packages/ragbits-document-search/pyproject.toml

Lines changed: 9 additions & 1 deletion

```diff
@@ -31,7 +31,12 @@ classifiers = [
     "Topic :: Scientific/Engineering :: Artificial Intelligence",
     "Topic :: Software Development :: Libraries :: Python Modules",
 ]
-dependencies = ["unstructured>=0.16.9", "unstructured-client>=0.26.0", "rerankers>=0.6.1", "ragbits-core==0.13.0"]
+dependencies = [
+    "unstructured>=0.16.9",
+    "unstructured-client>=0.26.0",
+    "rerankers>=0.6.1",
+    "ragbits-core==0.13.0",
+]
 
 [project.urls]
 "Homepage" = "https://github.com/deepsense-ai/ragbits"
@@ -40,6 +45,9 @@ dependencies = ["unstructured>=0.16.9", "unstructured-client>=0.26.0", "rerankers>=0.6.1", "ragbits-core==0.13.0"]
 "Source" = "https://github.com/deepsense-ai/ragbits"
 
 [project.optional-dependencies]
+docling = [
+    "docling>=2.15.1,<3.0.0",
+]
 ray = [
     "ray[data]>=2.43.0,<3.0.0",
 ]
```
Lines changed: 156 additions & 0 deletions (new file)

```python
from docling.chunking import HierarchicalChunker
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import AcceleratorOptions, EasyOcrOptions, PdfPipelineOptions, PipelineOptions
from docling.document_converter import (
    DocumentConverter,
    ExcelFormatOption,
    HTMLFormatOption,
    MarkdownFormatOption,
    PdfFormatOption,
    PowerpointFormatOption,
    WordFormatOption,
)
from docling_core.types.doc import DocItem, DoclingDocument

from ragbits.document_search.documents.document import Document, DocumentType
from ragbits.document_search.documents.element import Element, ElementLocation, ImageElement, TextElement
from ragbits.document_search.ingestion.parsers import DocumentParser


class DoclingDocumentParser(DocumentParser):
    """
    Parser that uses Docling to process documents.
    """

    supported_document_types = {
        DocumentType.DOCX,
        DocumentType.PPTX,
        DocumentType.XLSX,
        DocumentType.MD,
        DocumentType.PNG,
        DocumentType.JPG,
        DocumentType.HTML,
        DocumentType.TXT,
        DocumentType.PDF,
    }

    def __init__(self, ignore_images: bool = False, num_threads: int = 1) -> None:
        """
        Initialize the DoclingDocumentParser instance.

        Args:
            ignore_images: If True, images will be skipped.
            num_threads: The number of threads for parsing parallelism on CPU.
        """
        self.ignore_images = ignore_images
        self.num_threads = num_threads

    async def parse(self, document: Document) -> list[Element]:
        """
        Parse the document using the Docling API.

        Args:
            document: The document to parse.

        Returns:
            The list of elements extracted from the document.
        """
        self.validate_document_type(document.metadata.document_type)
        partitioned_document = await self._partition(document)
        return self._chunk(partitioned_document, document)

    async def _partition(self, document: Document) -> DoclingDocument:
        """
        Partition the document.

        Args:
            document: The document to parse.

        Returns:
            The Docling document.

        Raises:
            ConversionError: If converting the document to the Docling format fails.
        """
        accelerator_options = AcceleratorOptions(num_threads=self.num_threads)
        pipeline_options = PipelineOptions(accelerator_options=accelerator_options)
        pdf_pipeline_options = PdfPipelineOptions(
            images_scale=2,
            generate_page_images=True,
            ocr_options=EasyOcrOptions(),
            accelerator_options=accelerator_options,
        )
        converter = DocumentConverter(
            format_options={
                InputFormat.XLSX: ExcelFormatOption(pipeline_options=pipeline_options),
                InputFormat.DOCX: WordFormatOption(pipeline_options=pipeline_options),
                InputFormat.PPTX: PowerpointFormatOption(pipeline_options=pipeline_options),
                InputFormat.HTML: HTMLFormatOption(pipeline_options=pipeline_options),
                InputFormat.MD: MarkdownFormatOption(pipeline_options=pipeline_options),
                InputFormat.IMAGE: PdfFormatOption(pipeline_options=pdf_pipeline_options),
                InputFormat.PDF: PdfFormatOption(pipeline_options=pdf_pipeline_options),
            },
        )
        # For txt files, temporarily rename to the .md extension, as Docling doesn't support text files natively.
        if document.metadata.document_type == DocumentType.TXT:
            original_suffix = document.local_path.suffix
            document.local_path = document.local_path.rename(document.local_path.with_suffix(".md"))

        partitioned_document = converter.convert(document.local_path).document

        # Rename back to the original extension.
        if document.metadata.document_type == DocumentType.TXT:
            document.local_path = document.local_path.rename(document.local_path.with_suffix(original_suffix))

        return partitioned_document

    def _chunk(self, partitioned_document: DoclingDocument, document: Document) -> list[Element]:
        """
        Chunk the partitioned document.

        Args:
            partitioned_document: The document partitioned by Docling.
            document: The document to parse.

        Returns:
            The list of chunked elements.
        """
        chunker = HierarchicalChunker()
        text_elements: list[Element] = [
            TextElement(
                document_meta=document.metadata,
                location=self._extract_element_location(chunk.meta.doc_items[0]),  # type: ignore
                content=chunk.text,
            )
            for chunk in chunker.chunk(partitioned_document)
        ]

        if self.ignore_images:
            return text_elements

        return text_elements + [
            ImageElement(
                document_meta=document.metadata,
                location=self._extract_element_location(element),
                image_bytes=image_bytes,
                ocr_extracted_text=element.caption_text(partitioned_document),
            )
            for element in partitioned_document.pictures
            if (image := element.get_image(partitioned_document)) and (image_bytes := image._repr_jpeg_())
        ]

    @staticmethod
    def _extract_element_location(element: DocItem) -> ElementLocation:
        """
        Convert a Docling element to an element location.

        Args:
            element: The element from Docling.

        Returns:
            The element location.
        """
        metadata = element.prov[0].model_dump() if element.prov else {}
        return ElementLocation(
            page_number=metadata.get("page_no"),
        )
```
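For orientation, here is a minimal usage sketch of the new parser, outside the commit itself. It mirrors the flow exercised by the test file below (`DocumentMeta` → `fetch()` → `parse()`); the `report.pdf` path, the `asyncio.run` wrapper, and the printed fields are illustrative assumptions, and the parser needs the `docling` extra added in the pyproject.toml change above.

```python
import asyncio
from pathlib import Path

from ragbits.document_search.documents.document import DocumentMeta
from ragbits.document_search.ingestion.parsers.docling import DoclingDocumentParser


async def main() -> None:
    # Hypothetical local file; any supported type (PDF, DOCX, MD, PNG, ...) works.
    document_meta = DocumentMeta.from_local_path(Path("report.pdf"))
    document = await document_meta.fetch()

    # num_threads controls CPU parallelism; ignore_images=True would skip ImageElement extraction.
    parser = DoclingDocumentParser(ignore_images=False, num_threads=4)
    elements = await parser.parse(document)

    for element in elements:
        print(type(element).__name__, element.location)


if __name__ == "__main__":
    asyncio.run(main())
```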
Lines changed: 44 additions & 0 deletions (new file)

```python
from pathlib import Path

import pytest

from ragbits.document_search.documents.document import DocumentMeta
from ragbits.document_search.ingestion.parsers.docling import DoclingDocumentParser


@pytest.mark.parametrize(
    ("document_metadata", "expected_num_elements"),
    [
        pytest.param(
            DocumentMeta.create_text_document_from_literal("Name of Peppa's brother is George."),
            1,
            id="TextDocument",
        ),
        pytest.param(
            DocumentMeta.from_local_path(Path(__file__).parent.parent / "assets" / "md" / "test_file.md"),
            1,
            id="MarkdownDocument",
        ),
        pytest.param(
            DocumentMeta.from_local_path(
                Path(__file__).parent.parent / "assets" / "img" / "transformers_paper_page.png"
            ),
            6,
            id="ImageDocument",
        ),
        pytest.param(
            DocumentMeta.from_local_path(
                Path(__file__).parent.parent / "assets" / "pdf" / "transformers_paper_page.pdf"
            ),
            7,
            id="PDFDocument",
        ),
    ],
)
async def test_docling_parser(document_metadata: DocumentMeta, expected_num_elements: int) -> None:
    document = await document_metadata.fetch()
    parser = DoclingDocumentParser()

    elements = await parser.parse(document)

    assert len(elements) == expected_num_elements
```

pyproject.toml

Lines changed: 7 additions & 1 deletion

```diff
@@ -7,7 +7,7 @@ requires-python = ">=3.10"
 dependencies = [
     "ragbits-cli",
     "ragbits-core[chroma,lab,fastembed,local,otel,qdrant,pgvector,azure,gcs,hf,s3]",
-    "ragbits-document-search[ray]",
+    "ragbits-document-search[docling,ray]",
     "ragbits-evaluate[relari]",
     "ragbits-guardrails[openai]",
     "ragbits-conversations",
@@ -86,6 +86,12 @@ addopts = "--import-mode=importlib"
 asyncio_mode = "auto"
 asyncio_default_fixture_loop_scope = "function"
 
+[tool.coverage.run]
+omit = [
+    "config.py",
+    "config-3.py",
+]
+
 [tool.mypy]
 warn_unused_configs = true
 ignore_missing_imports = true
```