
Commit 4fc07c1

Removed the OCR option from the local_run.py file and regenerated yaml files
1 parent 7aa16b7 commit 4fc07c1

File tree

3 files changed: +246 -1 lines changed

kubeflow-pipelines/docling-standard/local_run.py

Lines changed: 0 additions & 1 deletion
@@ -44,7 +44,6 @@ def convert_pipeline_local():
         input_path=importer.outputs["output_path"],
         artifacts_path=artifacts.outputs["output_path"],
         pdf_filenames=first_split.output,
-        ocr=False,
     )

     docling_chunk(

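Per the commit message, the two compiled pipeline YAMLs below were regenerated after this change. A minimal sketch of how that regeneration is typically done with the KFP v2 compiler; the source module and pipeline function names here are assumptions for illustration, and only the output filename is taken from this commit:

# Hypothetical recompilation step; kfp.compiler.Compiler().compile() is the
# standard KFP v2 API for turning a @dsl.pipeline function into a YAML spec.
from kfp import compiler

from standard_convert_pipeline import convert_pipeline  # assumed module / function name

compiler.Compiler().compile(
    pipeline_func=convert_pipeline,
    package_path="standard_convert_pipeline_compiled.yaml",
)
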
kubeflow-pipelines/docling-standard/standard_convert_pipeline_compiled.yaml

Lines changed: 123 additions & 0 deletions
@@ -384,6 +384,129 @@ deploymentSpec:
 \ for i in range(num_splits)]\n filled_splits = list(filter(None, all_splits))\n\
 \ return filled_splits\n\n"
 image: quay.io/aipcc/docling/cuda-ubi9
+exec-docling-chunk:
+container:
+args:
+- --executor_input
+- '{{$}}'
+- --function_to_execute
+- docling_chunk
+command:
+- sh
+- -c
+- "\nif ! [ -x \"$(command -v pip)\" ]; then\n python3 -m ensurepip ||\
+\ python3 -m ensurepip --user || apt-get install python3-pip\nfi\n\nPIP_DISABLE_PIP_VERSION_CHECK=1\
+\ python3 -m pip install --quiet --no-warn-script-location 'kfp==2.14.6'\
+\ '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<\"3.9\"' && \"\
+$0\" \"$@\"\n"
+- sh
+- -ec
+- 'program_path=$(mktemp -d)
+
+
+printf "%s" "$0" > "$program_path/ephemeral_component.py"
+
+_KFP_RUNTIME=true python3 -m kfp.dsl.executor_main --component_module_path "$program_path/ephemeral_component.py" "$@"
+
+'
+- "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\
+\ *\n\ndef docling_chunk(\n input_path: dsl.Input[dsl.Artifact],\n \
+\ output_path: dsl.Output[dsl.Artifact],\n max_tokens: int = 512,\n\
+\ merge_peers: bool = True,\n):\n \"\"\"\n Chunk Docling documents\
+\ using HybridChunker. Takes converted docling JSON files as input\n \
+\ and produces chunked JSONL files with semantic chunks suitable for RAG.\n\
+\n Output format is JSONL (one JSON object per line) for easy inspection\
+\ and streaming.\n\n Args:\n input_path: Path to the input directory\
+\ containing Docling JSON files\n output_path: Path to the output\
+\ directory for the chunked JSONL files\n max_tokens: Maximum number\
+\ of tokens per chunk\n merge_peers: Whether to merge smaller chunks\
+\ at the same level\n \"\"\"\n import json # pylint: disable=import-outside-toplevel\n\
+\ from datetime import datetime, timezone # pylint: disable=import-outside-toplevel\n\
+\ from pathlib import Path # pylint: disable=import-outside-toplevel\n\
+\n # HybridChunker = Docling's smart chunking class that combines:\n\
+\ # 1. Document structure awareness\n # 2. Token-based splitting\n\
+\ from docling.chunking import HybridChunker # pylint: disable=import-outside-toplevel\n\
+\ from docling_core.transforms.chunker.tokenizer.huggingface import (\n\
+\ HuggingFaceTokenizer,\n ) # pylint: disable=import-outside-toplevel\n\
+\ from docling_core.types import DoclingDocument # pylint: disable=import-outside-toplevel\n\
+\ from transformers import AutoTokenizer # pylint: disable=import-outside-toplevel\n\
+\n # Convert KFP artifact paths to Path objects\n input_path_p = Path(input_path.path)\n\
+\ output_path_p = Path(output_path.path)\n output_path_p.mkdir(parents=True,\
+\ exist_ok=True)\n\n # Initialize tokenizer for HybridChunker (new API)\n\
+\ # Using a lightweight sentence-transformer model for tokenization\n\
+\ EMBED_MODEL_ID = \"sentence-transformers/all-MiniLM-L6-v2\"\n try:\n\
+\ hf_tokenizer = AutoTokenizer.from_pretrained(\n EMBED_MODEL_ID,\n\
+\ resume_download=True,\n timeout=60,\n )\n\
+\ print(f\"docling-chunk: loaded tokenizer from {EMBED_MODEL_ID}\"\
+, flush=True)\n except Exception as e:\n print(f\"docling-chunk:\
+\ ERROR loading tokenizer: {e}\", flush=True)\n raise RuntimeError(\n\
+\ f\"Failed to load tokenizer model {EMBED_MODEL_ID}. \"\n \
+\ \"Ensure network access to HuggingFace Hub or pre-download the\
+\ model.\"\n ) from e\n\n tokenizer = HuggingFaceTokenizer(\n\
+\ tokenizer=hf_tokenizer,\n max_tokens=max_tokens,\n )\n\
+\n # Initialize Hybrid chunker with user-specified parameters\n #\
+\ tokenizer: The tokenizer wrapper to use for counting tokens (includes\
+\ max_tokens)\n # merge_peers: if true, smaller adjacent chunks will\
+\ be merged together\n chunker = HybridChunker(\n tokenizer=tokenizer,\n\
+\ merge_peers=merge_peers,\n )\n\n # Find all JSON files in\
+\ the input directory\n json_files = list(input_path_p.glob(\"*.json\"\
+))\n if not json_files:\n print(f\"docling-chunk: No JSON files\
+\ found in {input_path_p}\", flush=True)\n return\n\n print(\n\
+\ f\"docling-chunk: processing {len(json_files)} files with max_tokens={max_tokens}\
+\ and merge_peers={merge_peers}\",\n flush=True,\n )\n\n #\
+\ Track processing results\n processed_count = 0\n skipped_files =\
+\ []\n\n # Process each file\n for json_file in json_files:\n \
+\ print(f\"docling-chunk: processing {json_file}\", flush=True)\n\n \
+\ # Load and validate the JSON file\n try:\n with\
+\ open(json_file, \"r\", encoding=\"utf-8\") as f:\n doc_data\
+\ = json.load(f)\n except json.JSONDecodeError as e:\n \
+\ print(\n f\"docling-chunk: skipping {json_file.name} -\
+\ invalid JSON: {e}\",\n flush=True,\n )\n \
+\ skipped_files.append((json_file.name, f\"invalid JSON: {e}\"))\n\
+\ continue\n\n # Parse the JSON data into a DoclingDocument\
+\ object\n # This validates that the JSON conforms to the DoclingDocument\
+\ schema\n try:\n doc = DoclingDocument.model_validate(doc_data)\n\
+\ except Exception as e:\n # Catches pydantic.ValidationError\
+\ and any other validation issues\n print(\n f\"\
+docling-chunk: skipping {json_file.name} - not a valid DoclingDocument:\
+\ {e}\",\n flush=True,\n )\n skipped_files.append((json_file.name,\
+\ f\"validation failed: {e}\"))\n continue\n\n # Chunk\
+\ the document using HybridChunker\n chunks = list(chunker.chunk(dl_doc=doc))\n\
+\n # Generate output filename: original_name_chunks.jsonl\n \
+\ output_filename = f\"{json_file.stem}_chunks.jsonl\"\n output_file\
+\ = output_path_p / output_filename\n\n # Get current timestamp in\
+\ ISO format\n timestamp = datetime.now(timezone.utc).isoformat()\n\
+\n # Chunking config (for reproducibility)\n chunking_config\
+\ = {\n \"max_tokens\": max_tokens,\n \"merge_peers\"\
+: merge_peers,\n \"tokenizer_model\": EMBED_MODEL_ID,\n \
+\ }\n\n # Write chunks as JSONL (one JSON object per line)\n \
+\ with open(output_file, \"w\", encoding=\"utf-8\") as f:\n \
+\ for idx, chunk in enumerate(chunks):\n # Get contextualized\
+\ text for this chunk\n chunk_text = chunker.contextualize(chunk=chunk)\n\
+\n # Build the chunk object\n chunk_obj =\
+\ {\n \"timestamp\": timestamp,\n \
+\ \"source_document\": json_file.name,\n \"chunk_index\"\
+: idx,\n \"chunking_config\": chunking_config,\n \
+\ \"text\": chunk_text,\n }\n\n \
+\ # Write as a single line of JSON\n f.write(json.dumps(chunk_obj,\
+\ ensure_ascii=False) + \"\\n\")\n\n print(\n f\"docling-chunk:\
+\ saved {len(chunks)} chunks to {output_filename}\",\n flush=True,\n\
+\ )\n processed_count += 1\n\n # Report summary\n print(\n\
+\ f\"docling-chunk: done - processed {processed_count}/{len(json_files)}\
+\ files\",\n flush=True,\n )\n if skipped_files:\n print(\n\
+\ f\"docling-chunk: skipped {len(skipped_files)} invalid files:\"\
+,\n flush=True,\n )\n for filename, reason in skipped_files:\n\
+\ print(f\" - {filename}: {reason}\", flush=True)\n\n"
+image: quay.io/aipcc/docling/cuda-ubi9
+resources:
+cpuLimit: 2.0
+cpuRequest: 0.25
+memoryLimit: 2.0
+memoryRequest: 0.512
+resourceCpuLimit: '2'
+resourceCpuRequest: 250m
+resourceMemoryLimit: 2G
+resourceMemoryRequest: 512M
 exec-docling-convert-standard:
 container:
 args:

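Note: the long escaped string under exec-docling-chunk above is the Python source of the new chunking component as serialized by the KFP compiler. For readability, a condensed sketch of that code follows. The @dsl.component decorator and its base_image are assumptions about how the component is declared in the repo source (the compiled spec records only the plain function and the container image), and the logging and per-file error handling of the original are omitted here.

from kfp import dsl


@dsl.component(base_image="quay.io/aipcc/docling/cuda-ubi9")  # image taken from the compiled spec
def docling_chunk(
    input_path: dsl.Input[dsl.Artifact],
    output_path: dsl.Output[dsl.Artifact],
    max_tokens: int = 512,
    merge_peers: bool = True,
):
    """Chunk converted Docling JSON documents into JSONL files of RAG-ready chunks."""
    import json
    from datetime import datetime, timezone
    from pathlib import Path

    from docling.chunking import HybridChunker
    from docling_core.transforms.chunker.tokenizer.huggingface import HuggingFaceTokenizer
    from docling_core.types import DoclingDocument
    from transformers import AutoTokenizer

    in_dir = Path(input_path.path)
    out_dir = Path(output_path.path)
    out_dir.mkdir(parents=True, exist_ok=True)

    # HybridChunker combines document-structure awareness with token-based splitting;
    # the tokenizer wrapper enforces the max_tokens budget per chunk.
    EMBED_MODEL_ID = "sentence-transformers/all-MiniLM-L6-v2"
    tokenizer = HuggingFaceTokenizer(
        tokenizer=AutoTokenizer.from_pretrained(EMBED_MODEL_ID),
        max_tokens=max_tokens,
    )
    chunker = HybridChunker(tokenizer=tokenizer, merge_peers=merge_peers)

    for json_file in in_dir.glob("*.json"):
        # Validate that the JSON conforms to the DoclingDocument schema
        doc = DoclingDocument.model_validate(
            json.loads(json_file.read_text(encoding="utf-8"))
        )
        chunks = list(chunker.chunk(dl_doc=doc))

        # One JSON object per line: timestamp, source, index, config, contextualized text
        timestamp = datetime.now(timezone.utc).isoformat()
        chunking_config = {
            "max_tokens": max_tokens,
            "merge_peers": merge_peers,
            "tokenizer_model": EMBED_MODEL_ID,
        }
        with open(out_dir / f"{json_file.stem}_chunks.jsonl", "w", encoding="utf-8") as f:
            for idx, chunk in enumerate(chunks):
                record = {
                    "timestamp": timestamp,
                    "source_document": json_file.name,
                    "chunk_index": idx,
                    "chunking_config": chunking_config,
                    "text": chunker.contextualize(chunk=chunk),
                }
                f.write(json.dumps(record, ensure_ascii=False) + "\n")
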
kubeflow-pipelines/docling-vlm/vlm_convert_pipeline_compiled.yaml

Lines changed: 123 additions & 0 deletions
@@ -302,6 +302,129 @@ deploymentSpec:
 \ for i in range(num_splits)]\n filled_splits = list(filter(None, all_splits))\n\
 \ return filled_splits\n\n"
 image: quay.io/aipcc/docling/cuda-ubi9
+exec-docling-chunk:
+container:
+args:
+- --executor_input
+- '{{$}}'
+- --function_to_execute
+- docling_chunk
+command:
+- sh
+- -c
+- "\nif ! [ -x \"$(command -v pip)\" ]; then\n python3 -m ensurepip ||\
+\ python3 -m ensurepip --user || apt-get install python3-pip\nfi\n\nPIP_DISABLE_PIP_VERSION_CHECK=1\
+\ python3 -m pip install --quiet --no-warn-script-location 'kfp==2.14.6'\
+\ '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<\"3.9\"' && \"\
+$0\" \"$@\"\n"
+- sh
+- -ec
+- 'program_path=$(mktemp -d)
+
+
+printf "%s" "$0" > "$program_path/ephemeral_component.py"
+
+_KFP_RUNTIME=true python3 -m kfp.dsl.executor_main --component_module_path "$program_path/ephemeral_component.py" "$@"
+
+'
+- "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\
+\ *\n\ndef docling_chunk(\n input_path: dsl.Input[dsl.Artifact],\n \
+\ output_path: dsl.Output[dsl.Artifact],\n max_tokens: int = 512,\n\
+\ merge_peers: bool = True,\n):\n \"\"\"\n Chunk Docling documents\
+\ using HybridChunker. Takes converted docling JSON files as input\n \
+\ and produces chunked JSONL files with semantic chunks suitable for RAG.\n\
+\n Output format is JSONL (one JSON object per line) for easy inspection\
+\ and streaming.\n\n Args:\n input_path: Path to the input directory\
+\ containing Docling JSON files\n output_path: Path to the output\
+\ directory for the chunked JSONL files\n max_tokens: Maximum number\
+\ of tokens per chunk\n merge_peers: Whether to merge smaller chunks\
+\ at the same level\n \"\"\"\n import json # pylint: disable=import-outside-toplevel\n\
+\ from datetime import datetime, timezone # pylint: disable=import-outside-toplevel\n\
+\ from pathlib import Path # pylint: disable=import-outside-toplevel\n\
+\n # HybridChunker = Docling's smart chunking class that combines:\n\
+\ # 1. Document structure awareness\n # 2. Token-based splitting\n\
+\ from docling.chunking import HybridChunker # pylint: disable=import-outside-toplevel\n\
+\ from docling_core.transforms.chunker.tokenizer.huggingface import (\n\
+\ HuggingFaceTokenizer,\n ) # pylint: disable=import-outside-toplevel\n\
+\ from docling_core.types import DoclingDocument # pylint: disable=import-outside-toplevel\n\
+\ from transformers import AutoTokenizer # pylint: disable=import-outside-toplevel\n\
+\n # Convert KFP artifact paths to Path objects\n input_path_p = Path(input_path.path)\n\
+\ output_path_p = Path(output_path.path)\n output_path_p.mkdir(parents=True,\
+\ exist_ok=True)\n\n # Initialize tokenizer for HybridChunker (new API)\n\
+\ # Using a lightweight sentence-transformer model for tokenization\n\
+\ EMBED_MODEL_ID = \"sentence-transformers/all-MiniLM-L6-v2\"\n try:\n\
+\ hf_tokenizer = AutoTokenizer.from_pretrained(\n EMBED_MODEL_ID,\n\
+\ resume_download=True,\n timeout=60,\n )\n\
+\ print(f\"docling-chunk: loaded tokenizer from {EMBED_MODEL_ID}\"\
+, flush=True)\n except Exception as e:\n print(f\"docling-chunk:\
+\ ERROR loading tokenizer: {e}\", flush=True)\n raise RuntimeError(\n\
+\ f\"Failed to load tokenizer model {EMBED_MODEL_ID}. \"\n \
+\ \"Ensure network access to HuggingFace Hub or pre-download the\
+\ model.\"\n ) from e\n\n tokenizer = HuggingFaceTokenizer(\n\
+\ tokenizer=hf_tokenizer,\n max_tokens=max_tokens,\n )\n\
+\n # Initialize Hybrid chunker with user-specified parameters\n #\
+\ tokenizer: The tokenizer wrapper to use for counting tokens (includes\
+\ max_tokens)\n # merge_peers: if true, smaller adjacent chunks will\
+\ be merged together\n chunker = HybridChunker(\n tokenizer=tokenizer,\n\
+\ merge_peers=merge_peers,\n )\n\n # Find all JSON files in\
+\ the input directory\n json_files = list(input_path_p.glob(\"*.json\"\
+))\n if not json_files:\n print(f\"docling-chunk: No JSON files\
+\ found in {input_path_p}\", flush=True)\n return\n\n print(\n\
+\ f\"docling-chunk: processing {len(json_files)} files with max_tokens={max_tokens}\
+\ and merge_peers={merge_peers}\",\n flush=True,\n )\n\n #\
+\ Track processing results\n processed_count = 0\n skipped_files =\
+\ []\n\n # Process each file\n for json_file in json_files:\n \
+\ print(f\"docling-chunk: processing {json_file}\", flush=True)\n\n \
+\ # Load and validate the JSON file\n try:\n with\
+\ open(json_file, \"r\", encoding=\"utf-8\") as f:\n doc_data\
+\ = json.load(f)\n except json.JSONDecodeError as e:\n \
+\ print(\n f\"docling-chunk: skipping {json_file.name} -\
+\ invalid JSON: {e}\",\n flush=True,\n )\n \
+\ skipped_files.append((json_file.name, f\"invalid JSON: {e}\"))\n\
+\ continue\n\n # Parse the JSON data into a DoclingDocument\
+\ object\n # This validates that the JSON conforms to the DoclingDocument\
+\ schema\n try:\n doc = DoclingDocument.model_validate(doc_data)\n\
+\ except Exception as e:\n # Catches pydantic.ValidationError\
+\ and any other validation issues\n print(\n f\"\
+docling-chunk: skipping {json_file.name} - not a valid DoclingDocument:\
+\ {e}\",\n flush=True,\n )\n skipped_files.append((json_file.name,\
+\ f\"validation failed: {e}\"))\n continue\n\n # Chunk\
+\ the document using HybridChunker\n chunks = list(chunker.chunk(dl_doc=doc))\n\
+\n # Generate output filename: original_name_chunks.jsonl\n \
+\ output_filename = f\"{json_file.stem}_chunks.jsonl\"\n output_file\
+\ = output_path_p / output_filename\n\n # Get current timestamp in\
+\ ISO format\n timestamp = datetime.now(timezone.utc).isoformat()\n\
+\n # Chunking config (for reproducibility)\n chunking_config\
+\ = {\n \"max_tokens\": max_tokens,\n \"merge_peers\"\
+: merge_peers,\n \"tokenizer_model\": EMBED_MODEL_ID,\n \
+\ }\n\n # Write chunks as JSONL (one JSON object per line)\n \
+\ with open(output_file, \"w\", encoding=\"utf-8\") as f:\n \
+\ for idx, chunk in enumerate(chunks):\n # Get contextualized\
+\ text for this chunk\n chunk_text = chunker.contextualize(chunk=chunk)\n\
+\n # Build the chunk object\n chunk_obj =\
+\ {\n \"timestamp\": timestamp,\n \
+\ \"source_document\": json_file.name,\n \"chunk_index\"\
+: idx,\n \"chunking_config\": chunking_config,\n \
+\ \"text\": chunk_text,\n }\n\n \
+\ # Write as a single line of JSON\n f.write(json.dumps(chunk_obj,\
+\ ensure_ascii=False) + \"\\n\")\n\n print(\n f\"docling-chunk:\
+\ saved {len(chunks)} chunks to {output_filename}\",\n flush=True,\n\
+\ )\n processed_count += 1\n\n # Report summary\n print(\n\
+\ f\"docling-chunk: done - processed {processed_count}/{len(json_files)}\
+\ files\",\n flush=True,\n )\n if skipped_files:\n print(\n\
+\ f\"docling-chunk: skipped {len(skipped_files)} invalid files:\"\
+,\n flush=True,\n )\n for filename, reason in skipped_files:\n\
+\ print(f\" - {filename}: {reason}\", flush=True)\n\n"
+image: quay.io/aipcc/docling/cuda-ubi9
+resources:
+cpuLimit: 2.0
+cpuRequest: 0.25
+memoryLimit: 2.0
+memoryRequest: 0.512
+resourceCpuLimit: '2'
+resourceCpuRequest: 250m
+resourceMemoryLimit: 2G
+resourceMemoryRequest: 512M
 exec-docling-convert-vlm:
 container:
 args:

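The component's docstring describes the output as JSONL so it can be inspected and streamed easily. A minimal sketch of reading one of the generated chunk files; the filename is illustrative, and the record fields match the chunk_obj written by the component above:

import json

# The component names its outputs "<original stem>_chunks.jsonl"; this path is illustrative.
with open("mydoc_chunks.jsonl", encoding="utf-8") as f:
    for line in f:
        record = json.loads(line)
        # Each record carries: timestamp, source_document, chunk_index, chunking_config, text
        print(record["chunk_index"], record["source_document"], record["text"][:80])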