Skip to content

Commit 44f81c9

Browse files
author
Maksym Lysak
committed
cleaning up
Signed-off-by: Maksym Lysak <[email protected]>
1 parent f9d67fe commit 44f81c9

File tree

1 file changed

+18
-19
lines changed

1 file changed

+18
-19
lines changed

docs/examples/post_process_ocr_with_vlm.py

Lines changed: 18 additions & 19 deletions
Original file line number | Diff line number | Diff line change
@@ -9,7 +9,6 @@
99
DoclingDocument,
1010
ImageRefMode,
1111
NodeItem,
12-
PageItem,
1312
TextItem,
1413
)
1514
from docling_core.types.doc.document import (
@@ -57,23 +56,23 @@
5756
POST_PROCESSED_JSON_DOC = "scratch/test_doc_ocr.json"
5857

5958

60-
class OcrEnrichmentElement(BaseModel):
59+
class PostOcrEnrichmentElement(BaseModel):
6160
model_config = ConfigDict(arbitrary_types_allowed=True)
6261

6362
item: Union[DocItem, TableCell, RichTableCell, GraphCell]
6463
image: list[
6564
Image.Image
66-
] # TODO maybe needs to be an array of images for multi-provenance things.
65+
] # Needs to be a list of images for multi-provenance elements
6766

6867

69-
class OcrEnrichmentPipelineOptions(ConvertPipelineOptions):
68+
class PostOcrEnrichmentPipelineOptions(ConvertPipelineOptions):
7069
api_options: PictureDescriptionApiOptions
7170

7271

73-
class OcrEnrichmentPipeline(SimplePipeline):
74-
def __init__(self, pipeline_options: OcrEnrichmentPipelineOptions):
72+
class PostOcrEnrichmentPipeline(SimplePipeline):
73+
def __init__(self, pipeline_options: PostOcrEnrichmentPipelineOptions):
7574
super().__init__(pipeline_options)
76-
self.pipeline_options: OcrEnrichmentPipelineOptions
75+
self.pipeline_options: PostOcrEnrichmentPipelineOptions
7776

7877
self.enrichment_pipe = [
7978
OcrApiEnrichmentModel(
@@ -86,8 +85,8 @@ def __init__(self, pipeline_options: OcrEnrichmentPipelineOptions):
8685
]
8786

8887
@classmethod
89-
def get_default_options(cls) -> OcrEnrichmentPipelineOptions:
90-
return OcrEnrichmentPipelineOptions()
88+
def get_default_options(cls) -> PostOcrEnrichmentPipelineOptions:
89+
return PostOcrEnrichmentPipelineOptions()
9190

9291
def _enrich_document(self, conv_res: ConversionResult) -> ConversionResult:
9392
def _prepare_elements(
@@ -122,13 +121,13 @@ def _prepare_elements(
122121

123122

124123
class OcrApiEnrichmentModel(
125-
GenericEnrichmentModel[OcrEnrichmentElement], BaseModelWithOptions
124+
GenericEnrichmentModel[PostOcrEnrichmentElement], BaseModelWithOptions
126125
):
127126
expansion_factor: float = 0.001
128127

129128
def prepare_element(
130129
self, conv_res: ConversionResult, element: NodeItem
131-
) -> Optional[list[OcrEnrichmentElement]]:
130+
) -> Optional[list[PostOcrEnrichmentElement]]:
132131
if not self.is_processable(doc=conv_res.document, element=element):
133132
return None
134133

@@ -167,7 +166,9 @@ def prepare_element(
167166
page_ix
168167
].image.pil_image.crop(expanded_bbox.as_tuple())
169168
# cropped_image.show()
170-
result.append(OcrEnrichmentElement(item=c, image=[cropped_image]))
169+
result.append(
170+
PostOcrEnrichmentElement(item=c, image=[cropped_image])
171+
)
171172
return result
172173
elif isinstance(element, TableItem):
173174
element_prov = element.prov[0]
@@ -204,7 +205,7 @@ def prepare_element(
204205
].image.pil_image.crop(expanded_bbox.as_tuple())
205206
# cropped_image.show()
206207
result.append(
207-
OcrEnrichmentElement(
208+
PostOcrEnrichmentElement(
208209
item=cell, image=[cropped_image]
209210
)
210211
)
@@ -213,7 +214,7 @@ def prepare_element(
213214
multiple_crops = []
214215
# Crop the image from the page
215216
for element_prov in element.prov:
216-
# element_prov = element.prov[0] # TODO: Not all items have prov
217+
# Iterate over provenances
217218
bbox = element_prov.bbox
218219

219220
page_ix = element_prov.page_no
@@ -243,7 +244,7 @@ def prepare_element(
243244
# Return the proper cropped image
244245
multiple_crops
245246
if len(multiple_crops) > 0:
246-
return [OcrEnrichmentElement(item=element, image=multiple_crops)]
247+
return [PostOcrEnrichmentElement(item=element, image=multiple_crops)]
247248
else:
248249
return []
249250

@@ -375,8 +376,6 @@ def clean_html_tags(text):
375376

376377

377378
def main() -> None:
378-
# TODO: Properly process cases for the items which have more than one provenance
379-
380379
# Let's prepare a Docling document json with embedded page images
381380
pipeline_options = PdfPipelineOptions()
382381
pipeline_options.generate_page_images = True
@@ -406,7 +405,7 @@ def main() -> None:
406405

407406
print("Post-process all bounding boxes with OCR")
408407
# Post-Process OCR on top of existing Docling document:
409-
pipeline_options = OcrEnrichmentPipelineOptions(
408+
pipeline_options = PostOcrEnrichmentPipelineOptions(
410409
api_options=PictureDescriptionApiOptions(
411410
url=LM_STUDIO_URL,
412411
prompt=DEFAULT_PROMPT,
@@ -421,7 +420,7 @@ def main() -> None:
421420
doc_converter = DocumentConverter(
422421
format_options={
423422
InputFormat.JSON_DOCLING: FormatOption(
424-
pipeline_cls=OcrEnrichmentPipeline,
423+
pipeline_cls=PostOcrEnrichmentPipeline,
425424
pipeline_options=pipeline_options,
426425
backend=DoclingJSONBackend,
427426
)

0 commit comments

Comments
 (0)