99 DoclingDocument ,
1010 ImageRefMode ,
1111 NodeItem ,
12- PageItem ,
1312 TextItem ,
1413)
1514from docling_core .types .doc .document import (
5756POST_PROCESSED_JSON_DOC = "scratch/test_doc_ocr.json"
5857
5958
60- class OcrEnrichmentElement (BaseModel ):
59+ class PostOcrEnrichmentElement (BaseModel ):
6160 model_config = ConfigDict (arbitrary_types_allowed = True )
6261
6362 item : Union [DocItem , TableCell , RichTableCell , GraphCell ]
6463 image : list [
6564 Image .Image
66- ] # TODO maybe needs to be an array of images for multi-provenance things.
65+ ] # Needs to be a list of images for multi-provenance elements
6766
6867
69- class OcrEnrichmentPipelineOptions (ConvertPipelineOptions ):
68+ class PostOcrEnrichmentPipelineOptions (ConvertPipelineOptions ):
7069 api_options : PictureDescriptionApiOptions
7170
7271
73- class OcrEnrichmentPipeline (SimplePipeline ):
74- def __init__ (self , pipeline_options : OcrEnrichmentPipelineOptions ):
72+ class PostOcrEnrichmentPipeline (SimplePipeline ):
73+ def __init__ (self , pipeline_options : PostOcrEnrichmentPipelineOptions ):
7574 super ().__init__ (pipeline_options )
76- self .pipeline_options : OcrEnrichmentPipelineOptions
75+ self .pipeline_options : PostOcrEnrichmentPipelineOptions
7776
7877 self .enrichment_pipe = [
7978 OcrApiEnrichmentModel (
@@ -86,8 +85,8 @@ def __init__(self, pipeline_options: OcrEnrichmentPipelineOptions):
8685 ]
8786
8887 @classmethod
89- def get_default_options (cls ) -> OcrEnrichmentPipelineOptions :
90- return OcrEnrichmentPipelineOptions ()
88+ def get_default_options (cls ) -> PostOcrEnrichmentPipelineOptions :
89+ return PostOcrEnrichmentPipelineOptions ()
9190
9291 def _enrich_document (self , conv_res : ConversionResult ) -> ConversionResult :
9392 def _prepare_elements (
@@ -122,13 +121,13 @@ def _prepare_elements(
122121
123122
124123class OcrApiEnrichmentModel (
125- GenericEnrichmentModel [OcrEnrichmentElement ], BaseModelWithOptions
124+ GenericEnrichmentModel [PostOcrEnrichmentElement ], BaseModelWithOptions
126125):
127126 expansion_factor : float = 0.001
128127
129128 def prepare_element (
130129 self , conv_res : ConversionResult , element : NodeItem
131- ) -> Optional [list [OcrEnrichmentElement ]]:
130+ ) -> Optional [list [PostOcrEnrichmentElement ]]:
132131 if not self .is_processable (doc = conv_res .document , element = element ):
133132 return None
134133
@@ -167,7 +166,7 @@ def prepare_element(
167166 page_ix
168167 ].image .pil_image .crop (expanded_bbox .as_tuple ())
169168 # cropped_image.show()
170- result .append (OcrEnrichmentElement (item = c , image = [cropped_image ]))
169+ result .append (PostOcrEnrichmentElement (item = c , image = [cropped_image ]))
171170 return result
172171 elif isinstance (element , TableItem ):
173172 element_prov = element .prov [0 ]
@@ -204,7 +203,7 @@ def prepare_element(
204203 ].image .pil_image .crop (expanded_bbox .as_tuple ())
205204 # cropped_image.show()
206205 result .append (
207- OcrEnrichmentElement (
206+ PostOcrEnrichmentElement (
208207 item = cell , image = [cropped_image ]
209208 )
210209 )
@@ -213,7 +212,7 @@ def prepare_element(
213212 multiple_crops = []
214213 # Crop the image from the page
215214 for element_prov in element .prov :
216- # element_prov = element.prov[0] # TODO: Not all items have prov
215+ # Iterate over provenances
217216 bbox = element_prov .bbox
218217
219218 page_ix = element_prov .page_no
@@ -243,7 +242,7 @@ def prepare_element(
243242 # Return the proper cropped image
244243 multiple_crops
245244 if len (multiple_crops ) > 0 :
246- return [OcrEnrichmentElement (item = element , image = multiple_crops )]
245+ return [PostOcrEnrichmentElement (item = element , image = multiple_crops )]
247246 else :
248247 return []
249248
@@ -375,8 +374,6 @@ def clean_html_tags(text):
375374
376375
377376def main () -> None :
378- # TODO: Properly process cases for the items which have more than one provenance
379-
380377 # Let's prepare a Docling document json with embedded page images
381378 pipeline_options = PdfPipelineOptions ()
382379 pipeline_options .generate_page_images = True
@@ -406,7 +403,7 @@ def main() -> None:
406403
407404 print ("Post-process all bounding boxes with OCR" )
408405 # Post-Process OCR on top of existing Docling document:
409- pipeline_options = OcrEnrichmentPipelineOptions (
406+ pipeline_options = PostOcrEnrichmentPipelineOptions (
410407 api_options = PictureDescriptionApiOptions (
411408 url = LM_STUDIO_URL ,
412409 prompt = DEFAULT_PROMPT ,
@@ -421,7 +418,7 @@ def main() -> None:
421418 doc_converter = DocumentConverter (
422419 format_options = {
423420 InputFormat .JSON_DOCLING : FormatOption (
424- pipeline_cls = OcrEnrichmentPipeline ,
421+ pipeline_cls = PostOcrEnrichmentPipeline ,
425422 pipeline_options = pipeline_options ,
426423 backend = DoclingJSONBackend ,
427424 )
0 commit comments