Skip to content

Commit 53c810a

Browse files
Merge pull request #18 from 2000krysztof/bug/fixMd5NonSecurityContext
Replace the default OCR engine with RapidOCR
2 parents (70fd493 + 293bd1b), commit 53c810a

File tree

4 files changed

+115
-99
lines changed

4 files changed

+115
-99
lines changed

demos/kfp/docling/pdf-conversion/docling_convert_pipeline.py

Lines changed: 8 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -121,16 +121,17 @@ def create_pdf_splits(
121121
@dsl.component(
122122
base_image=PYTORCH_CUDA_IMAGE,
123123
packages_to_install=[
124-
"docling",
124+
"docling>=2.43.0",
125125
"transformers",
126126
"sentence-transformers",
127127
"llama-stack",
128128
"llama-stack-client",
129129
"pymilvus",
130130
"fire",
131+
"rapidocr-onnxruntime",
131132
],
132133
)
133-
def docling_convert_and_ingest(
134+
def docling_convert(
134135
input_path: dsl.InputPath("input-pdfs"),
135136
pdf_split: List[str],
136137
output_path: dsl.OutputPath("output-md"),
@@ -142,14 +143,15 @@ def docling_convert_and_ingest(
142143
import pathlib
143144

144145
from docling.datamodel.base_models import InputFormat, ConversionStatus
145-
from docling.datamodel.pipeline_options import PdfPipelineOptions
146+
from docling.datamodel.pipeline_options import PdfPipelineOptions, RapidOcrOptions
146147
from docling.document_converter import DocumentConverter, PdfFormatOption
147148
from transformers import AutoTokenizer
148149
from sentence_transformers import SentenceTransformer
149150
from docling.chunking import HybridChunker
150151
import logging
151152
from llama_stack_client import LlamaStackClient
152153
import uuid
154+
153155
import json
154156

155157
_log = logging.getLogger(__name__)
@@ -239,6 +241,7 @@ def process_and_insert_embeddings(conv_results):
239241
pipeline_options = PdfPipelineOptions()
240242
pipeline_options.do_ocr = True
241243
pipeline_options.generate_page_images = True
244+
pipeline_options.ocr_options = RapidOcrOptions()
242245

243246
doc_converter = DocumentConverter(
244247
format_options={
@@ -304,7 +307,7 @@ def docling_convert_pipeline(
304307

305308
with dsl.ParallelFor(pdf_splits.output) as pdf_split:
306309
with dsl.If(use_gpu == True):
307-
convert_task = docling_convert_and_ingest(
310+
convert_task = docling_convert(
308311
input_path=import_task.output,
309312
pdf_split=pdf_split,
310313
embed_model_id=embed_model_id,
@@ -331,7 +334,7 @@ def docling_convert_pipeline(
331334
)
332335
add_node_selector_json(convert_task, {})
333336
with dsl.Else():
334-
convert_task = docling_convert_and_ingest(
337+
convert_task = docling_convert(
335338
input_path=import_task.output,
336339
pdf_split=pdf_split,
337340
embed_model_id=embed_model_id,

0 commit comments

Comments
 (0)