Skip to content

Commit 40ed3b8

Browse files
author
Maksym Lysak
committed
cleaning up
Signed-off-by: Maksym Lysak <[email protected]>
1 parent f9d67fe commit 40ed3b8

File tree

1 file changed

+16
-19
lines changed

1 file changed

+16
-19
lines changed

docs/examples/post_process_ocr_with_vlm.py

Lines changed: 16 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,6 @@
99
DoclingDocument,
1010
ImageRefMode,
1111
NodeItem,
12-
PageItem,
1312
TextItem,
1413
)
1514
from docling_core.types.doc.document import (
@@ -57,23 +56,23 @@
5756
POST_PROCESSED_JSON_DOC = "scratch/test_doc_ocr.json"
5857

5958

60-
class OcrEnrichmentElement(BaseModel):
59+
class PostOcrEnrichmentElement(BaseModel):
6160
model_config = ConfigDict(arbitrary_types_allowed=True)
6261

6362
item: Union[DocItem, TableCell, RichTableCell, GraphCell]
6463
image: list[
6564
Image.Image
66-
] # TODO maybe needs to be an array of images for multi-provenance things.
65+
] # Needs to be a list of images for multi-provenance elements
6766

6867

69-
class OcrEnrichmentPipelineOptions(ConvertPipelineOptions):
68+
class PostOcrEnrichmentPipelineOptions(ConvertPipelineOptions):
7069
api_options: PictureDescriptionApiOptions
7170

7271

73-
class OcrEnrichmentPipeline(SimplePipeline):
74-
def __init__(self, pipeline_options: OcrEnrichmentPipelineOptions):
72+
class PostOcrEnrichmentPipeline(SimplePipeline):
73+
def __init__(self, pipeline_options: PostOcrEnrichmentPipelineOptions):
7574
super().__init__(pipeline_options)
76-
self.pipeline_options: OcrEnrichmentPipelineOptions
75+
self.pipeline_options: PostOcrEnrichmentPipelineOptions
7776

7877
self.enrichment_pipe = [
7978
OcrApiEnrichmentModel(
@@ -86,8 +85,8 @@ def __init__(self, pipeline_options: OcrEnrichmentPipelineOptions):
8685
]
8786

8887
@classmethod
89-
def get_default_options(cls) -> OcrEnrichmentPipelineOptions:
90-
return OcrEnrichmentPipelineOptions()
88+
def get_default_options(cls) -> PostOcrEnrichmentPipelineOptions:
89+
return PostOcrEnrichmentPipelineOptions()
9190

9291
def _enrich_document(self, conv_res: ConversionResult) -> ConversionResult:
9392
def _prepare_elements(
@@ -122,13 +121,13 @@ def _prepare_elements(
122121

123122

124123
class OcrApiEnrichmentModel(
125-
GenericEnrichmentModel[OcrEnrichmentElement], BaseModelWithOptions
124+
GenericEnrichmentModel[PostOcrEnrichmentElement], BaseModelWithOptions
126125
):
127126
expansion_factor: float = 0.001
128127

129128
def prepare_element(
130129
self, conv_res: ConversionResult, element: NodeItem
131-
) -> Optional[list[OcrEnrichmentElement]]:
130+
) -> Optional[list[PostOcrEnrichmentElement]]:
132131
if not self.is_processable(doc=conv_res.document, element=element):
133132
return None
134133

@@ -167,7 +166,7 @@ def prepare_element(
167166
page_ix
168167
].image.pil_image.crop(expanded_bbox.as_tuple())
169168
# cropped_image.show()
170-
result.append(OcrEnrichmentElement(item=c, image=[cropped_image]))
169+
result.append(PostOcrEnrichmentElement(item=c, image=[cropped_image]))
171170
return result
172171
elif isinstance(element, TableItem):
173172
element_prov = element.prov[0]
@@ -204,7 +203,7 @@ def prepare_element(
204203
].image.pil_image.crop(expanded_bbox.as_tuple())
205204
# cropped_image.show()
206205
result.append(
207-
OcrEnrichmentElement(
206+
PostOcrEnrichmentElement(
208207
item=cell, image=[cropped_image]
209208
)
210209
)
@@ -213,7 +212,7 @@ def prepare_element(
213212
multiple_crops = []
214213
# Crop the image form the page
215214
for element_prov in element.prov:
216-
# element_prov = element.prov[0] # TODO: Not all items have prov
215+
# Iterate over provenances
217216
bbox = element_prov.bbox
218217

219218
page_ix = element_prov.page_no
@@ -243,7 +242,7 @@ def prepare_element(
243242
# Return the proper cropped image
244243
multiple_crops
245244
if len(multiple_crops) > 0:
246-
return [OcrEnrichmentElement(item=element, image=multiple_crops)]
245+
return [PostOcrEnrichmentElement(item=element, image=multiple_crops)]
247246
else:
248247
return []
249248

@@ -375,8 +374,6 @@ def clean_html_tags(text):
375374

376375

377376
def main() -> None:
378-
# TODO: Properly process cases for the items which have more than one provenance
379-
380377
# Let's prepare a Docling document json with embedded page images
381378
pipeline_options = PdfPipelineOptions()
382379
pipeline_options.generate_page_images = True
@@ -406,7 +403,7 @@ def main() -> None:
406403

407404
print("Post-process all bounding boxes with OCR")
408405
# Post-Process OCR on top of existing Docling document:
409-
pipeline_options = OcrEnrichmentPipelineOptions(
406+
pipeline_options = PostOcrEnrichmentPipelineOptions(
410407
api_options=PictureDescriptionApiOptions(
411408
url=LM_STUDIO_URL,
412409
prompt=DEFAULT_PROMPT,
@@ -421,7 +418,7 @@ def main() -> None:
421418
doc_converter = DocumentConverter(
422419
format_options={
423420
InputFormat.JSON_DOCLING: FormatOption(
424-
pipeline_cls=OcrEnrichmentPipeline,
421+
pipeline_cls=PostOcrEnrichmentPipeline,
425422
pipeline_options=pipeline_options,
426423
backend=DoclingJSONBackend,
427424
)

0 commit comments

Comments
 (0)