|
| 1 | +"""Data models for document extraction functionality.""" |
| 2 | + |
| 3 | +from typing import Any, Dict, List, Optional, Type, Union |
| 4 | + |
| 5 | +from pydantic import BaseModel, Field |
| 6 | + |
| 7 | +from docling.datamodel.base_models import ConversionStatus, ErrorItem |
| 8 | +from docling.datamodel.document import InputDocument |
| 9 | + |
| 10 | + |
class ExtractedPageData(BaseModel):
    """Extraction output for one page.

    Only ``page_no`` is required. ``extracted_data`` and ``raw_text`` default
    to ``None``; ``errors`` defaults to a new empty list.
    """

    # Required field: the Ellipsis marks it as having no default.
    page_no: int = Field(..., description="1-indexed page number")
    extracted_data: Optional[Dict[str, Any]] = Field(
        default=None,
        description="Extracted structured data from the page",
    )
    raw_text: Optional[str] = Field(
        default=None,
        description="Raw extracted text",
    )
    errors: List[str] = Field(
        description="Any errors encountered during extraction for this page",
        default_factory=list,
    )
| 23 | + |
| 24 | + |
class ExtractionResult(BaseModel):
    """Result of document extraction.

    Bundles the input document with a status, a list of error items, and the
    per-page extraction data. Only ``input`` is required.
    """

    input: InputDocument
    # A newly created result is PENDING unless a status is passed explicitly.
    status: ConversionStatus = ConversionStatus.PENDING
    # Uses default_factory like the other list fields in this module.
    # Pydantic already copies mutable defaults per instance, so this is a
    # consistency change rather than a behavioural fix.
    errors: List[ErrorItem] = Field(default_factory=list)

    # Pages field - always a list for consistency
    pages: List[ExtractedPageData] = Field(
        default_factory=list, description="Extracted data from each page"
    )
| 36 | + |
| 37 | + |
# Accepted forms of an extraction template: a string, a dict with string
# keys, a BaseModel instance, or a BaseModel subclass.
ExtractionTemplateType = Union[
    str,
    Dict[str, Any],
    BaseModel,
    Type[BaseModel],
]
0 commit comments