Skip to content

Commit 9f4bc5b

Browse files
authored
feat: [Beta] Extraction with schema (#2138)
* Add DocumentConverter.extract and full extraction pipeline Signed-off-by: Christoph Auer <[email protected]>
* Add DocumentConverter.extract template arg Signed-off-by: Christoph Auer <[email protected]>
* Add NuExtract model Signed-off-by: Christoph Auer <[email protected]>
* Add Extraction pipeline Signed-off-by: Christoph Auer <[email protected]>
* Add proper test, support pydantic class types Signed-off-by: Christoph Auer <[email protected]>
* Add qr bill example Signed-off-by: Christoph Auer <[email protected]>
* Add base_extraction_pipeline Signed-off-by: Christoph Auer <[email protected]>
* Add types Signed-off-by: Christoph Auer <[email protected]>
* Update typing of ExtractionResult and inner fields Signed-off-by: Christoph Auer <[email protected]>
* Factor out extract to DocumentExtractor Signed-off-by: Christoph Auer <[email protected]>
* Address mypy issues Signed-off-by: Christoph Auer <[email protected]>
* Add DocumentExtractor Signed-off-by: Christoph Auer <[email protected]>
* Resolve circular import issue Signed-off-by: Christoph Auer <[email protected]>
* Clean up imports, remove Optional for template arg Signed-off-by: Christoph Auer <[email protected]>
* Move new type definitions into datamodel Signed-off-by: Christoph Auer <[email protected]>
* Update comments Signed-off-by: Christoph Auer <[email protected]>
* Respect page-range, disable test_extraction for CI Signed-off-by: Christoph Auer <[email protected]>
--------- Signed-off-by: Christoph Auer <[email protected]>
1 parent a283ccf commit 9f4bc5b

File tree

14 files changed

+1171
-14
lines changed

14 files changed

+1171
-14
lines changed

docling/datamodel/base_models.py

Lines changed: 13 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,7 +1,7 @@
11
import math
22
from collections import defaultdict
33
from enum import Enum
4-
from typing import TYPE_CHECKING, Dict, List, Optional, Union
4+
from typing import TYPE_CHECKING, Dict, List, Optional, Type, Union
55

66
import numpy as np
77
from docling_core.types.doc import (
@@ -32,6 +32,18 @@
3232
if TYPE_CHECKING:
3333
from docling.backend.pdf_backend import PdfPageBackend
3434

35+
from docling.backend.abstract_backend import AbstractDocumentBackend
36+
from docling.datamodel.pipeline_options import PipelineOptions
37+
38+
39+
class BaseFormatOption(BaseModel):
40+
"""Base class for format options used by _DocumentConversionInput."""
41+
42+
pipeline_options: Optional[PipelineOptions] = None
43+
backend: Type[AbstractDocumentBackend]
44+
45+
model_config = ConfigDict(arbitrary_types_allowed=True)
46+
3547

3648
class ConversionStatus(str, Enum):
3749
PENDING = "pending"

docling/datamodel/document.py

Lines changed: 5 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -2,12 +2,13 @@
22
import logging
33
import re
44
import tarfile
5-
from collections.abc import Iterable
5+
from collections.abc import Iterable, Mapping
66
from enum import Enum
77
from io import BytesIO
88
from pathlib import Path, PurePath
99
from typing import (
1010
TYPE_CHECKING,
11+
Any,
1112
Dict,
1213
List,
1314
Literal,
@@ -72,7 +73,7 @@
7273
from docling.utils.utils import create_file_hash
7374

7475
if TYPE_CHECKING:
75-
from docling.document_converter import FormatOption
76+
from docling.datamodel.base_models import BaseFormatOption
7677

7778
_log = logging.getLogger(__name__)
7879

@@ -238,7 +239,8 @@ class _DocumentConversionInput(BaseModel):
238239
limits: Optional[DocumentLimits] = DocumentLimits()
239240

240241
def docs(
241-
self, format_options: Dict[InputFormat, "FormatOption"]
242+
self,
243+
format_options: Mapping[InputFormat, "BaseFormatOption"],
242244
) -> Iterable[InputDocument]:
243245
for item in self.path_or_stream_iterator:
244246
obj = (

docling/datamodel/extraction.py

Lines changed: 39 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,39 @@
1+
"""Data models for document extraction functionality."""
2+
3+
from typing import Any, Dict, List, Optional, Type, Union
4+
5+
from pydantic import BaseModel, Field
6+
7+
from docling.datamodel.base_models import ConversionStatus, ErrorItem
8+
from docling.datamodel.document import InputDocument
9+
10+
11+
class ExtractedPageData(BaseModel):
12+
"""Data model for extracted content from a single page."""
13+
14+
page_no: int = Field(..., description="1-indexed page number")
15+
extracted_data: Optional[Dict[str, Any]] = Field(
16+
None, description="Extracted structured data from the page"
17+
)
18+
raw_text: Optional[str] = Field(None, description="Raw extracted text")
19+
errors: List[str] = Field(
20+
default_factory=list,
21+
description="Any errors encountered during extraction for this page",
22+
)
23+
24+
25+
class ExtractionResult(BaseModel):
26+
"""Result of document extraction."""
27+
28+
input: InputDocument
29+
status: ConversionStatus = ConversionStatus.PENDING
30+
errors: List[ErrorItem] = []
31+
32+
# Pages field - always a list for consistency
33+
pages: List[ExtractedPageData] = Field(
34+
default_factory=list, description="Extracted data from each page"
35+
)
36+
37+
38+
# Type alias for template parameters: a string, a dict, a BaseModel instance, or a BaseModel subclass
39+
ExtractionTemplateType = Union[str, Dict[str, Any], BaseModel, Type[BaseModel]]

docling/datamodel/pipeline_options.py

Lines changed: 9 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -37,6 +37,7 @@
3737
from docling.datamodel.vlm_model_specs import (
3838
GRANITE_VISION_OLLAMA as granite_vision_vlm_ollama_conversion_options,
3939
GRANITE_VISION_TRANSFORMERS as granite_vision_vlm_conversion_options,
40+
NU_EXTRACT_2B_TRANSFORMERS,
4041
SMOLDOCLING_MLX as smoldocling_vlm_mlx_conversion_options,
4142
SMOLDOCLING_TRANSFORMERS as smoldocling_vlm_conversion_options,
4243
VlmModelType,
@@ -247,12 +248,9 @@ class OcrEngine(str, Enum):
247248
RAPIDOCR = "rapidocr"
248249

249250

250-
class PipelineOptions(BaseModel):
251+
class PipelineOptions(BaseOptions):
251252
"""Base pipeline options."""
252253

253-
create_legacy_output: bool = (
254-
True # This default will be set to False on a future version of docling
255-
)
256254
document_timeout: Optional[float] = None
257255
accelerator_options: AcceleratorOptions = AcceleratorOptions()
258256
enable_remote_services: bool = False
@@ -296,6 +294,13 @@ class AsrPipelineOptions(PipelineOptions):
296294
artifacts_path: Optional[Union[Path, str]] = None
297295

298296

297+
class VlmExtractionPipelineOptions(PipelineOptions):
298+
"""Options for extraction pipeline."""
299+
300+
artifacts_path: Optional[Union[Path, str]] = None
301+
vlm_options: Union[InlineVlmOptions] = NU_EXTRACT_2B_TRANSFORMERS
302+
303+
299304
class PdfPipelineOptions(PaginatedPipelineOptions):
300305
"""Options for the PDF pipeline."""
301306

docling/datamodel/vlm_model_specs.py

Lines changed: 17 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -247,6 +247,23 @@
247247
temperature=0.0,
248248
)
249249

250+
# NuExtract
251+
NU_EXTRACT_2B_TRANSFORMERS = InlineVlmOptions(
252+
repo_id="numind/NuExtract-2.0-2B",
253+
prompt="", # This won't be used, template is passed separately
254+
torch_dtype="bfloat16",
255+
inference_framework=InferenceFramework.TRANSFORMERS,
256+
transformers_model_type=TransformersModelType.AUTOMODEL_IMAGETEXTTOTEXT,
257+
response_format=ResponseFormat.PLAINTEXT,
258+
supported_devices=[
259+
AcceleratorDevice.CPU,
260+
AcceleratorDevice.CUDA,
261+
AcceleratorDevice.MPS,
262+
],
263+
scale=2.0,
264+
temperature=0.0,
265+
)
266+
250267

251268
class VlmModelType(str, Enum):
252269
SMOLDOCLING = "smoldocling"

docling/document_converter.py

Lines changed: 3 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -28,6 +28,7 @@
2828
from docling.backend.xml.jats_backend import JatsDocumentBackend
2929
from docling.backend.xml.uspto_backend import PatentUsptoDocumentBackend
3030
from docling.datamodel.base_models import (
31+
BaseFormatOption,
3132
ConversionStatus,
3233
DoclingComponentType,
3334
DocumentStream,
@@ -57,12 +58,8 @@
5758
_PIPELINE_CACHE_LOCK = threading.Lock()
5859

5960

60-
class FormatOption(BaseModel):
61+
class FormatOption(BaseFormatOption):
6162
pipeline_cls: Type[BasePipeline]
62-
pipeline_options: Optional[PipelineOptions] = None
63-
backend: Type[AbstractDocumentBackend]
64-
65-
model_config = ConfigDict(arbitrary_types_allowed=True)
6663

6764
@model_validator(mode="after")
6865
def set_optional_field_default(self) -> "FormatOption":
@@ -191,7 +188,7 @@ def __init__(
191188
self.allowed_formats = (
192189
allowed_formats if allowed_formats is not None else list(InputFormat)
193190
)
194-
self.format_to_options = {
191+
self.format_to_options: Dict[InputFormat, FormatOption] = {
195192
format: (
196193
_get_default_option(format=format)
197194
if (custom_option := (format_options or {}).get(format)) is None

0 commit comments

Comments
 (0)