
Commit 4fc07c1

Removed the OCR option from the local_run.py file and regenerated yaml files
1 parent 7aa16b7 commit 4fc07c1

File tree

3 files changed: +246 -1 lines changed

kubeflow-pipelines/docling-standard/local_run.py

Lines changed: 0 additions & 1 deletion
@@ -44,7 +44,6 @@ def convert_pipeline_local():
         input_path=importer.outputs["output_path"],
         artifacts_path=artifacts.outputs["output_path"],
         pdf_filenames=first_split.output,
-        ocr=False,
     )

     docling_chunk(

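Per the commit message, the two compiled pipeline YAMLs below were regenerated after this change. A minimal sketch of how that regeneration is typically done with the KFP v2 compiler; the source module and pipeline function names here are assumptions for illustration, and only the output filename is taken from this commit:

# Hypothetical recompilation step; kfp.compiler.Compiler().compile() is the
# standard KFP v2 API for turning a @dsl.pipeline function into a YAML spec.
from kfp import compiler

from standard_convert_pipeline import convert_pipeline  # assumed module / function name

compiler.Compiler().compile(
    pipeline_func=convert_pipeline,
    package_path="standard_convert_pipeline_compiled.yaml",
)
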
kubeflow-pipelines/docling-standard/standard_convert_pipeline_compiled.yaml

Lines changed: 123 additions & 0 deletions
@@ -384,6 +384,129 @@ deploymentSpec:
 \ for i in range(num_splits)]\n filled_splits = list(filter(None, all_splits))\n\
 \ return filled_splits\n\n"
 image: quay.io/aipcc/docling/cuda-ubi9
+exec-docling-chunk:
+container:
+args:
+- --executor_input
+- '{{$}}'
+- --function_to_execute
+- docling_chunk
+command:
+- sh
+- -c
+- "\nif ! [ -x \"$(command -v pip)\" ]; then\n python3 -m ensurepip ||\
+\ python3 -m ensurepip --user || apt-get install python3-pip\nfi\n\nPIP_DISABLE_PIP_VERSION_CHECK=1\
+\ python3 -m pip install --quiet --no-warn-script-location 'kfp==2.14.6'\
+\ '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<\"3.9\"' && \"\
+$0\" \"$@\"\n"
+- sh
+- -ec
+- 'program_path=$(mktemp -d)
+
+
+printf "%s" "$0" > "$program_path/ephemeral_component.py"
+
+_KFP_RUNTIME=true python3 -m kfp.dsl.executor_main --component_module_path "$program_path/ephemeral_component.py" "$@"
+
+'
+- "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\
+\ *\n\ndef docling_chunk(\n input_path: dsl.Input[dsl.Artifact],\n \
+\ output_path: dsl.Output[dsl.Artifact],\n max_tokens: int = 512,\n\
+\ merge_peers: bool = True,\n):\n \"\"\"\n Chunk Docling documents\
+\ using HybridChunker. Takes converted docling JSON files as input\n \
+\ and produces chunked JSONL files with semantic chunks suitable for RAG.\n\
+\n Output format is JSONL (one JSON object per line) for easy inspection\
+\ and streaming.\n\n Args:\n input_path: Path to the input directory\
+\ containing Docling JSON files\n output_path: Path to the output\
+\ directory for the chunked JSONL files\n max_tokens: Maximum number\
+\ of tokens per chunk\n merge_peers: Whether to merge smaller chunks\
+\ at the same level\n \"\"\"\n import json # pylint: disable=import-outside-toplevel\n\
+\ from datetime import datetime, timezone # pylint: disable=import-outside-toplevel\n\
+\ from pathlib import Path # pylint: disable=import-outside-toplevel\n\
+\n # HybridChunker = Docling's smart chunking class that combines:\n\
+\ # 1. Document structure awareness\n # 2. Token-based splitting\n\
+\ from docling.chunking import HybridChunker # pylint: disable=import-outside-toplevel\n\
+\ from docling_core.transforms.chunker.tokenizer.huggingface import (\n\
+\ HuggingFaceTokenizer,\n ) # pylint: disable=import-outside-toplevel\n\
+\ from docling_core.types import DoclingDocument # pylint: disable=import-outside-toplevel\n\
+\ from transformers import AutoTokenizer # pylint: disable=import-outside-toplevel\n\
+\n # Convert KFP artifact paths to Path objects\n input_path_p = Path(input_path.path)\n\
+\ output_path_p = Path(output_path.path)\n output_path_p.mkdir(parents=True,\
+\ exist_ok=True)\n\n # Initialize tokenizer for HybridChunker (new API)\n\
+\ # Using a lightweight sentence-transformer model for tokenization\n\
+\ EMBED_MODEL_ID = \"sentence-transformers/all-MiniLM-L6-v2\"\n try:\n\
+\ hf_tokenizer = AutoTokenizer.from_pretrained(\n EMBED_MODEL_ID,\n\
+\ resume_download=True,\n timeout=60,\n )\n\
+\ print(f\"docling-chunk: loaded tokenizer from {EMBED_MODEL_ID}\"\
+, flush=True)\n except Exception as e:\n print(f\"docling-chunk:\
+\ ERROR loading tokenizer: {e}\", flush=True)\n raise RuntimeError(\n\
+\ f\"Failed to load tokenizer model {EMBED_MODEL_ID}. \"\n \
+\ \"Ensure network access to HuggingFace Hub or pre-download the\
+\ model.\"\n ) from e\n\n tokenizer = HuggingFaceTokenizer(\n\
+\ tokenizer=hf_tokenizer,\n max_tokens=max_tokens,\n )\n\
+\n # Initialize Hybrid chunker with user-specified parameters\n #\
+\ tokenizer: The tokenizer wrapper to use for counting tokens (includes\
+\ max_tokens)\n # merge_peers: if true, smaller adjacent chunks will\
+\ be merged together\n chunker = HybridChunker(\n tokenizer=tokenizer,\n\
+\ merge_peers=merge_peers,\n )\n\n # Find all JSON files in\
+\ the input directory\n json_files = list(input_path_p.glob(\"*.json\"\
+))\n if not json_files:\n print(f\"docling-chunk: No JSON files\
+\ found in {input_path_p}\", flush=True)\n return\n\n print(\n\
+\ f\"docling-chunk: processing {len(json_files)} files with max_tokens={max_tokens}\
+\ and merge_peers={merge_peers}\",\n flush=True,\n )\n\n #\
+\ Track processing results\n processed_count = 0\n skipped_files =\
+\ []\n\n # Process each file\n for json_file in json_files:\n \
+\ print(f\"docling-chunk: processing {json_file}\", flush=True)\n\n \
+\ # Load and validate the JSON file\n try:\n with\
+\ open(json_file, \"r\", encoding=\"utf-8\") as f:\n doc_data\
+\ = json.load(f)\n except json.JSONDecodeError as e:\n \
+\ print(\n f\"docling-chunk: skipping {json_file.name} -\
+\ invalid JSON: {e}\",\n flush=True,\n )\n \
+\ skipped_files.append((json_file.name, f\"invalid JSON: {e}\"))\n\
+\ continue\n\n # Parse the JSON data into a DoclingDocument\
+\ object\n # This validates that the JSON conforms to the DoclingDocument\
+\ schema\n try:\n doc = DoclingDocument.model_validate(doc_data)\n\
+\ except Exception as e:\n # Catches pydantic.ValidationError\
+\ and any other validation issues\n print(\n f\"\
+docling-chunk: skipping {json_file.name} - not a valid DoclingDocument:\
+\ {e}\",\n flush=True,\n )\n skipped_files.append((json_file.name,\
+\ f\"validation failed: {e}\"))\n continue\n\n # Chunk\
+\ the document using HybridChunker\n chunks = list(chunker.chunk(dl_doc=doc))\n\
+\n # Generate output filename: original_name_chunks.jsonl\n \
+\ output_filename = f\"{json_file.stem}_chunks.jsonl\"\n output_file\
+\ = output_path_p / output_filename\n\n # Get current timestamp in\
+\ ISO format\n timestamp = datetime.now(timezone.utc).isoformat()\n\
+\n # Chunking config (for reproducibility)\n chunking_config\
+\ = {\n \"max_tokens\": max_tokens,\n \"merge_peers\"\
+: merge_peers,\n \"tokenizer_model\": EMBED_MODEL_ID,\n \
+\ }\n\n # Write chunks as JSONL (one JSON object per line)\n \
+\ with open(output_file, \"w\", encoding=\"utf-8\") as f:\n \
+\ for idx, chunk in enumerate(chunks):\n # Get contextualized\
+\ text for this chunk\n chunk_text = chunker.contextualize(chunk=chunk)\n\
+\n # Build the chunk object\n chunk_obj =\
+\ {\n \"timestamp\": timestamp,\n \
+\ \"source_document\": json_file.name,\n \"chunk_index\"\
+: idx,\n \"chunking_config\": chunking_config,\n \
+\ \"text\": chunk_text,\n }\n\n \
+\ # Write as a single line of JSON\n f.write(json.dumps(chunk_obj,\
+\ ensure_ascii=False) + \"\\n\")\n\n print(\n f\"docling-chunk:\
+\ saved {len(chunks)} chunks to {output_filename}\",\n flush=True,\n\
+\ )\n processed_count += 1\n\n # Report summary\n print(\n\
+\ f\"docling-chunk: done - processed {processed_count}/{len(json_files)}\
+\ files\",\n flush=True,\n )\n if skipped_files:\n print(\n\
+\ f\"docling-chunk: skipped {len(skipped_files)} invalid files:\"\
+,\n flush=True,\n )\n for filename, reason in skipped_files:\n\
+\ print(f\" - {filename}: {reason}\", flush=True)\n\n"
+image: quay.io/aipcc/docling/cuda-ubi9
+resources:
+cpuLimit: 2.0
+cpuRequest: 0.25
+memoryLimit: 2.0
+memoryRequest: 0.512
+resourceCpuLimit: '2'
+resourceCpuRequest: 250m
+resourceMemoryLimit: 2G
+resourceMemoryRequest: 512M
 exec-docling-convert-standard:
 container:
 args:

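Note: the long escaped string under exec-docling-chunk above is the Python source of the new chunking component as serialized by the KFP compiler. For readability, a condensed sketch of that code follows. The @dsl.component decorator and its base_image are assumptions about how the component is declared in the repo source (the compiled spec records only the plain function and the container image), and the logging and per-file error handling of the original are omitted here.

from kfp import dsl


@dsl.component(base_image="quay.io/aipcc/docling/cuda-ubi9")  # image taken from the compiled spec
def docling_chunk(
    input_path: dsl.Input[dsl.Artifact],
    output_path: dsl.Output[dsl.Artifact],
    max_tokens: int = 512,
    merge_peers: bool = True,
):
    """Chunk converted Docling JSON documents into JSONL files of RAG-ready chunks."""
    import json
    from datetime import datetime, timezone
    from pathlib import Path

    from docling.chunking import HybridChunker
    from docling_core.transforms.chunker.tokenizer.huggingface import HuggingFaceTokenizer
    from docling_core.types import DoclingDocument
    from transformers import AutoTokenizer

    in_dir = Path(input_path.path)
    out_dir = Path(output_path.path)
    out_dir.mkdir(parents=True, exist_ok=True)

    # HybridChunker combines document-structure awareness with token-based splitting;
    # the tokenizer wrapper enforces the max_tokens budget per chunk.
    EMBED_MODEL_ID = "sentence-transformers/all-MiniLM-L6-v2"
    tokenizer = HuggingFaceTokenizer(
        tokenizer=AutoTokenizer.from_pretrained(EMBED_MODEL_ID),
        max_tokens=max_tokens,
    )
    chunker = HybridChunker(tokenizer=tokenizer, merge_peers=merge_peers)

    for json_file in in_dir.glob("*.json"):
        # Validate that the JSON conforms to the DoclingDocument schema
        doc = DoclingDocument.model_validate(
            json.loads(json_file.read_text(encoding="utf-8"))
        )
        chunks = list(chunker.chunk(dl_doc=doc))

        # One JSON object per line: timestamp, source, index, config, contextualized text
        timestamp = datetime.now(timezone.utc).isoformat()
        chunking_config = {
            "max_tokens": max_tokens,
            "merge_peers": merge_peers,
            "tokenizer_model": EMBED_MODEL_ID,
        }
        with open(out_dir / f"{json_file.stem}_chunks.jsonl", "w", encoding="utf-8") as f:
            for idx, chunk in enumerate(chunks):
                record = {
                    "timestamp": timestamp,
                    "source_document": json_file.name,
                    "chunk_index": idx,
                    "chunking_config": chunking_config,
                    "text": chunker.contextualize(chunk=chunk),
                }
                f.write(json.dumps(record, ensure_ascii=False) + "\n")
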
kubeflow-pipelines/docling-vlm/vlm_convert_pipeline_compiled.yaml

Lines changed: 123 additions & 0 deletions
@@ -302,6 +302,129 @@ deploymentSpec:
 \ for i in range(num_splits)]\n filled_splits = list(filter(None, all_splits))\n\
 \ return filled_splits\n\n"
 image: quay.io/aipcc/docling/cuda-ubi9
+exec-docling-chunk:
+container:
+args:
+- --executor_input
+- '{{$}}'
+- --function_to_execute
+- docling_chunk
+command:
+- sh
+- -c
+- "\nif ! [ -x \"$(command -v pip)\" ]; then\n python3 -m ensurepip ||\
+\ python3 -m ensurepip --user || apt-get install python3-pip\nfi\n\nPIP_DISABLE_PIP_VERSION_CHECK=1\
+\ python3 -m pip install --quiet --no-warn-script-location 'kfp==2.14.6'\
+\ '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<\"3.9\"' && \"\
+$0\" \"$@\"\n"
+- sh
+- -ec
+- 'program_path=$(mktemp -d)
+
+
+printf "%s" "$0" > "$program_path/ephemeral_component.py"
+
+_KFP_RUNTIME=true python3 -m kfp.dsl.executor_main --component_module_path "$program_path/ephemeral_component.py" "$@"
+
+'
+- "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\
+\ *\n\ndef docling_chunk(\n input_path: dsl.Input[dsl.Artifact],\n \
+\ output_path: dsl.Output[dsl.Artifact],\n max_tokens: int = 512,\n\
+\ merge_peers: bool = True,\n):\n \"\"\"\n Chunk Docling documents\
+\ using HybridChunker. Takes converted docling JSON files as input\n \
+\ and produces chunked JSONL files with semantic chunks suitable for RAG.\n\
+\n Output format is JSONL (one JSON object per line) for easy inspection\
+\ and streaming.\n\n Args:\n input_path: Path to the input directory\
+\ containing Docling JSON files\n output_path: Path to the output\
+\ directory for the chunked JSONL files\n max_tokens: Maximum number\
+\ of tokens per chunk\n merge_peers: Whether to merge smaller chunks\
+\ at the same level\n \"\"\"\n import json # pylint: disable=import-outside-toplevel\n\
+\ from datetime import datetime, timezone # pylint: disable=import-outside-toplevel\n\
+\ from pathlib import Path # pylint: disable=import-outside-toplevel\n\
+\n # HybridChunker = Docling's smart chunking class that combines:\n\
+\ # 1. Document structure awareness\n # 2. Token-based splitting\n\
+\ from docling.chunking import HybridChunker # pylint: disable=import-outside-toplevel\n\
+\ from docling_core.transforms.chunker.tokenizer.huggingface import (\n\
+\ HuggingFaceTokenizer,\n ) # pylint: disable=import-outside-toplevel\n\
+\ from docling_core.types import DoclingDocument # pylint: disable=import-outside-toplevel\n\
+\ from transformers import AutoTokenizer # pylint: disable=import-outside-toplevel\n\
+\n # Convert KFP artifact paths to Path objects\n input_path_p = Path(input_path.path)\n\
+\ output_path_p = Path(output_path.path)\n output_path_p.mkdir(parents=True,\
+\ exist_ok=True)\n\n # Initialize tokenizer for HybridChunker (new API)\n\
+\ # Using a lightweight sentence-transformer model for tokenization\n\
+\ EMBED_MODEL_ID = \"sentence-transformers/all-MiniLM-L6-v2\"\n try:\n\
+\ hf_tokenizer = AutoTokenizer.from_pretrained(\n EMBED_MODEL_ID,\n\
+\ resume_download=True,\n timeout=60,\n )\n\
+\ print(f\"docling-chunk: loaded tokenizer from {EMBED_MODEL_ID}\"\
+, flush=True)\n except Exception as e:\n print(f\"docling-chunk:\
+\ ERROR loading tokenizer: {e}\", flush=True)\n raise RuntimeError(\n\
+\ f\"Failed to load tokenizer model {EMBED_MODEL_ID}. \"\n \
+\ \"Ensure network access to HuggingFace Hub or pre-download the\
+\ model.\"\n ) from e\n\n tokenizer = HuggingFaceTokenizer(\n\
+\ tokenizer=hf_tokenizer,\n max_tokens=max_tokens,\n )\n\
+\n # Initialize Hybrid chunker with user-specified parameters\n #\
+\ tokenizer: The tokenizer wrapper to use for counting tokens (includes\
+\ max_tokens)\n # merge_peers: if true, smaller adjacent chunks will\
+\ be merged together\n chunker = HybridChunker(\n tokenizer=tokenizer,\n\
+\ merge_peers=merge_peers,\n )\n\n # Find all JSON files in\
+\ the input directory\n json_files = list(input_path_p.glob(\"*.json\"\
+))\n if not json_files:\n print(f\"docling-chunk: No JSON files\
+\ found in {input_path_p}\", flush=True)\n return\n\n print(\n\
+\ f\"docling-chunk: processing {len(json_files)} files with max_tokens={max_tokens}\
+\ and merge_peers={merge_peers}\",\n flush=True,\n )\n\n #\
+\ Track processing results\n processed_count = 0\n skipped_files =\
+\ []\n\n # Process each file\n for json_file in json_files:\n \
+\ print(f\"docling-chunk: processing {json_file}\", flush=True)\n\n \
+\ # Load and validate the JSON file\n try:\n with\
+\ open(json_file, \"r\", encoding=\"utf-8\") as f:\n doc_data\
+\ = json.load(f)\n except json.JSONDecodeError as e:\n \
+\ print(\n f\"docling-chunk: skipping {json_file.name} -\
+\ invalid JSON: {e}\",\n flush=True,\n )\n \
+\ skipped_files.append((json_file.name, f\"invalid JSON: {e}\"))\n\
+\ continue\n\n # Parse the JSON data into a DoclingDocument\
+\ object\n # This validates that the JSON conforms to the DoclingDocument\
+\ schema\n try:\n doc = DoclingDocument.model_validate(doc_data)\n\
+\ except Exception as e:\n # Catches pydantic.ValidationError\
+\ and any other validation issues\n print(\n f\"\
+docling-chunk: skipping {json_file.name} - not a valid DoclingDocument:\
+\ {e}\",\n flush=True,\n )\n skipped_files.append((json_file.name,\
+\ f\"validation failed: {e}\"))\n continue\n\n # Chunk\
+\ the document using HybridChunker\n chunks = list(chunker.chunk(dl_doc=doc))\n\
+\n # Generate output filename: original_name_chunks.jsonl\n \
+\ output_filename = f\"{json_file.stem}_chunks.jsonl\"\n output_file\
+\ = output_path_p / output_filename\n\n # Get current timestamp in\
+\ ISO format\n timestamp = datetime.now(timezone.utc).isoformat()\n\
+\n # Chunking config (for reproducibility)\n chunking_config\
+\ = {\n \"max_tokens\": max_tokens,\n \"merge_peers\"\
+: merge_peers,\n \"tokenizer_model\": EMBED_MODEL_ID,\n \
+\ }\n\n # Write chunks as JSONL (one JSON object per line)\n \
+\ with open(output_file, \"w\", encoding=\"utf-8\") as f:\n \
+\ for idx, chunk in enumerate(chunks):\n # Get contextualized\
+\ text for this chunk\n chunk_text = chunker.contextualize(chunk=chunk)\n\
+\n # Build the chunk object\n chunk_obj =\
+\ {\n \"timestamp\": timestamp,\n \
+\ \"source_document\": json_file.name,\n \"chunk_index\"\
+: idx,\n \"chunking_config\": chunking_config,\n \
+\ \"text\": chunk_text,\n }\n\n \
+\ # Write as a single line of JSON\n f.write(json.dumps(chunk_obj,\
+\ ensure_ascii=False) + \"\\n\")\n\n print(\n f\"docling-chunk:\
+\ saved {len(chunks)} chunks to {output_filename}\",\n flush=True,\n\
+\ )\n processed_count += 1\n\n # Report summary\n print(\n\
+\ f\"docling-chunk: done - processed {processed_count}/{len(json_files)}\
+\ files\",\n flush=True,\n )\n if skipped_files:\n print(\n\
+\ f\"docling-chunk: skipped {len(skipped_files)} invalid files:\"\
+,\n flush=True,\n )\n for filename, reason in skipped_files:\n\
+\ print(f\" - {filename}: {reason}\", flush=True)\n\n"
+image: quay.io/aipcc/docling/cuda-ubi9
+resources:
+cpuLimit: 2.0
+cpuRequest: 0.25
+memoryLimit: 2.0
+memoryRequest: 0.512
+resourceCpuLimit: '2'
+resourceCpuRequest: 250m
+resourceMemoryLimit: 2G
+resourceMemoryRequest: 512M
 exec-docling-convert-vlm:
 container:
 args:

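The component's docstring describes the output as JSONL so it can be inspected and streamed easily. A minimal sketch of reading one of the generated chunk files; the filename is illustrative, and the record fields match the chunk_obj written by the component above:

import json

# The component names its outputs "<original stem>_chunks.jsonl"; this path is illustrative.
with open("mydoc_chunks.jsonl", encoding="utf-8") as f:
    for line in f:
        record = json.loads(line)
        # Each record carries: timestamp, source_document, chunk_index, chunking_config, text
        print(record["chunk_index"], record["source_document"], record["text"][:80])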