@@ -95,12 +95,12 @@ def create_pdf_splits(
    splits = [batch for batch in (all_pdfs[i::num_splits] for i in range(num_splits)) if batch]
    return splits or [[]]

-
+# This component converts PDFs to Markdown and ingests the embeddings into LlamaStack's vector store
@dsl.component(
    base_image=PYTORCH_CUDA_IMAGE,
    packages_to_install=["docling", "transformers", "sentence-transformers", "llama-stack", "llama-stack-client", "pymilvus", "fire"],
)
-def docling_convert(
+def docling_convert_and_ingest(
    input_path: dsl.InputPath("input-pdfs"),
    pdf_split: List[str],
    output_path: dsl.OutputPath("output-md"),
@@ -264,7 +264,7 @@ def docling_convert_pipeline(

    with dsl.ParallelFor(pdf_splits.output) as pdf_split:
        with dsl.If(use_gpu == True):
-            convert_task = docling_convert(
+            convert_task = docling_convert_and_ingest(
                input_path=import_task.output,
                pdf_split=pdf_split,
                embed_model_id=embed_model_id,
@@ -282,7 +282,7 @@ def docling_convert_pipeline(
            add_toleration_json(convert_task, [{"effect": "NoSchedule", "key": "nvidia.com/gpu", "operator": "Exists"}])
            add_node_selector_json(convert_task, {})
        with dsl.Else():
-            convert_task = docling_convert(
+            convert_task = docling_convert_and_ingest(
                input_path=import_task.output,
                pdf_split=pdf_split,
                embed_model_id=embed_model_id,
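For orientation, here is a minimal sketch of what the renamed `docling_convert_and_ingest` component does for each PDF split: convert with docling, embed with sentence-transformers, then hand the results to the vector store. The helper name `convert_and_embed` and the record layout are illustrative assumptions, not code from this PR; only the docling and sentence-transformers calls are real library APIs.

```python
# Illustrative sketch only (not the component body in this PR), assuming the
# docling and sentence-transformers packages listed in packages_to_install.
from pathlib import Path
from typing import Dict, List

from docling.document_converter import DocumentConverter
from sentence_transformers import SentenceTransformer


def convert_and_embed(pdf_paths: List[str], embed_model_id: str) -> List[Dict]:
    """Convert each PDF to Markdown and compute an embedding for ingestion."""
    converter = DocumentConverter()
    embedder = SentenceTransformer(embed_model_id)
    records = []
    for pdf in pdf_paths:
        result = converter.convert(pdf)                   # docling conversion
        markdown = result.document.export_to_markdown()   # Markdown export
        vector = embedder.encode(markdown)                # dense embedding
        records.append(
            {"source": Path(pdf).name, "text": markdown, "vector": vector.tolist()}
        )
    # The real component also writes the Markdown to output_path and inserts
    # the embeddings into LlamaStack's Milvus-backed vector store rather than
    # returning them.
    return records
```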