99 DoclingDocument ,
1010 ImageRefMode ,
1111 NodeItem ,
12- PageItem ,
1312 TextItem ,
1413)
1514from docling_core .types .doc .document import (
5756POST_PROCESSED_JSON_DOC = "scratch/test_doc_ocr.json"
5857
5958
60- class OcrEnrichmentElement (BaseModel ):
59+ class PostOcrEnrichmentElement (BaseModel ):
6160 model_config = ConfigDict (arbitrary_types_allowed = True )
6261
6362 item : Union [DocItem , TableCell , RichTableCell , GraphCell ]
6463 image : list [
6564 Image .Image
66- ] # TODO maybe needs to be an array of images for multi-provenance things.
65+ ] # Needs to be a list of images for multi-provenance elements
6766
6867
69- class OcrEnrichmentPipelineOptions (ConvertPipelineOptions ):
68+ class PostOcrEnrichmentPipelineOptions (ConvertPipelineOptions ):
7069 api_options : PictureDescriptionApiOptions
7170
7271
73- class OcrEnrichmentPipeline (SimplePipeline ):
74- def __init__ (self , pipeline_options : OcrEnrichmentPipelineOptions ):
72+ class PostOcrEnrichmentPipeline (SimplePipeline ):
73+ def __init__ (self , pipeline_options : PostOcrEnrichmentPipelineOptions ):
7574 super ().__init__ (pipeline_options )
76- self .pipeline_options : OcrEnrichmentPipelineOptions
75+ self .pipeline_options : PostOcrEnrichmentPipelineOptions
7776
7877 self .enrichment_pipe = [
7978 OcrApiEnrichmentModel (
@@ -86,8 +85,8 @@ def __init__(self, pipeline_options: OcrEnrichmentPipelineOptions):
8685 ]
8786
8887 @classmethod
89- def get_default_options (cls ) -> OcrEnrichmentPipelineOptions :
90- return OcrEnrichmentPipelineOptions ()
88+ def get_default_options (cls ) -> PostOcrEnrichmentPipelineOptions :
89+ return PostOcrEnrichmentPipelineOptions ()
9190
9291 def _enrich_document (self , conv_res : ConversionResult ) -> ConversionResult :
9392 def _prepare_elements (
@@ -122,13 +121,13 @@ def _prepare_elements(
122121
123122
124123class OcrApiEnrichmentModel (
125- GenericEnrichmentModel [OcrEnrichmentElement ], BaseModelWithOptions
124+ GenericEnrichmentModel [PostOcrEnrichmentElement ], BaseModelWithOptions
126125):
127126 expansion_factor : float = 0.001
128127
129128 def prepare_element (
130129 self , conv_res : ConversionResult , element : NodeItem
131- ) -> Optional [list [OcrEnrichmentElement ]]:
130+ ) -> Optional [list [PostOcrEnrichmentElement ]]:
132131 if not self .is_processable (doc = conv_res .document , element = element ):
133132 return None
134133
@@ -167,7 +166,9 @@ def prepare_element(
167166 page_ix
168167 ].image .pil_image .crop (expanded_bbox .as_tuple ())
169168 # cropped_image.show()
170- result .append (OcrEnrichmentElement (item = c , image = [cropped_image ]))
169+ result .append (
170+ PostOcrEnrichmentElement (item = c , image = [cropped_image ])
171+ )
171172 return result
172173 elif isinstance (element , TableItem ):
173174 element_prov = element .prov [0 ]
@@ -204,7 +205,7 @@ def prepare_element(
204205 ].image .pil_image .crop (expanded_bbox .as_tuple ())
205206 # cropped_image.show()
206207 result .append (
207- OcrEnrichmentElement (
208+ PostOcrEnrichmentElement (
208209 item = cell , image = [cropped_image ]
209210 )
210211 )
@@ -213,7 +214,7 @@ def prepare_element(
213214 multiple_crops = []
214215 # Crop the image from the page
215216 for element_prov in element .prov :
216- # element_prov = element.prov[0] # TODO: Not all items have prov
217+ # Iterate over provenances
217218 bbox = element_prov .bbox
218219
219220 page_ix = element_prov .page_no
@@ -243,7 +244,7 @@ def prepare_element(
243244 # Return the proper cropped image
244245 multiple_crops
245246 if len (multiple_crops ) > 0 :
246- return [OcrEnrichmentElement (item = element , image = multiple_crops )]
247+ return [PostOcrEnrichmentElement (item = element , image = multiple_crops )]
247248 else :
248249 return []
249250
@@ -375,8 +376,6 @@ def clean_html_tags(text):
375376
376377
377378def main () -> None :
378- # TODO: Properly process cases for the items which have more than one provenance
379-
380379 # Let's prepare a Docling document json with embedded page images
381380 pipeline_options = PdfPipelineOptions ()
382381 pipeline_options .generate_page_images = True
@@ -406,7 +405,7 @@ def main() -> None:
406405
407406 print ("Post-process all bounding boxes with OCR" )
408407 # Post-Process OCR on top of existing Docling document:
409- pipeline_options = OcrEnrichmentPipelineOptions (
408+ pipeline_options = PostOcrEnrichmentPipelineOptions (
410409 api_options = PictureDescriptionApiOptions (
411410 url = LM_STUDIO_URL ,
412411 prompt = DEFAULT_PROMPT ,
@@ -421,7 +420,7 @@ def main() -> None:
421420 doc_converter = DocumentConverter (
422421 format_options = {
423422 InputFormat .JSON_DOCLING : FormatOption (
424- pipeline_cls = OcrEnrichmentPipeline ,
423+ pipeline_cls = PostOcrEnrichmentPipeline ,
425424 pipeline_options = pipeline_options ,
426425 backend = DoclingJSONBackend ,
427426 )
0 commit comments