@@ -121,16 +121,17 @@ def create_pdf_splits(
121
121
@dsl .component (
122
122
base_image = PYTORCH_CUDA_IMAGE ,
123
123
packages_to_install = [
124
- "docling" ,
124
+ "docling>=2.43.0 " ,
125
125
"transformers" ,
126
126
"sentence-transformers" ,
127
127
"llama-stack" ,
128
128
"llama-stack-client" ,
129
129
"pymilvus" ,
130
130
"fire" ,
131
+ "rapidocr-onnxruntime" ,
131
132
],
132
133
)
133
- def docling_convert_and_ingest (
134
+ def docling_convert (
134
135
input_path : dsl .InputPath ("input-pdfs" ),
135
136
pdf_split : List [str ],
136
137
output_path : dsl .OutputPath ("output-md" ),
@@ -142,14 +143,15 @@ def docling_convert_and_ingest(
142
143
import pathlib
143
144
144
145
from docling .datamodel .base_models import InputFormat , ConversionStatus
145
- from docling .datamodel .pipeline_options import PdfPipelineOptions
146
+ from docling .datamodel .pipeline_options import PdfPipelineOptions , RapidOcrOptions
146
147
from docling .document_converter import DocumentConverter , PdfFormatOption
147
148
from transformers import AutoTokenizer
148
149
from sentence_transformers import SentenceTransformer
149
150
from docling .chunking import HybridChunker
150
151
import logging
151
152
from llama_stack_client import LlamaStackClient
152
153
import uuid
154
+
153
155
import json
154
156
155
157
_log = logging .getLogger (__name__ )
@@ -239,6 +241,7 @@ def process_and_insert_embeddings(conv_results):
239
241
pipeline_options = PdfPipelineOptions ()
240
242
pipeline_options .do_ocr = True
241
243
pipeline_options .generate_page_images = True
244
+ pipeline_options .ocr_options = RapidOcrOptions ()
242
245
243
246
doc_converter = DocumentConverter (
244
247
format_options = {
@@ -304,7 +307,7 @@ def docling_convert_pipeline(
304
307
305
308
with dsl .ParallelFor (pdf_splits .output ) as pdf_split :
306
309
with dsl .If (use_gpu == True ):
307
- convert_task = docling_convert_and_ingest (
310
+ convert_task = docling_convert (
308
311
input_path = import_task .output ,
309
312
pdf_split = pdf_split ,
310
313
embed_model_id = embed_model_id ,
@@ -331,7 +334,7 @@ def docling_convert_pipeline(
331
334
)
332
335
add_node_selector_json (convert_task , {})
333
336
with dsl .Else ():
334
- convert_task = docling_convert_and_ingest (
337
+ convert_task = docling_convert (
335
338
input_path = import_task .output ,
336
339
pdf_split = pdf_split ,
337
340
embed_model_id = embed_model_id ,
0 commit comments