
Commit 37b97a0

Setup execute-kfp-localrunner on EC2 instance and add an option to provide the BASE_IMAGE URLs for qualification (#76)

1 parent 8537f16 commit 37b97a0

File tree

5 files changed
+172 -28 lines changed

.github/workflows/compile-kfp.yml

Lines changed: 1 addition & 2 deletions

@@ -15,7 +15,6 @@ on:
       - "kubeflow-pipelines/docling-vlm/**"
       - "kubeflow-pipelines/common/**"
       - ".github/workflows/compile-kfp.yml"
-
 permissions:
   contents: read
 
@@ -48,7 +47,7 @@ jobs:
         working-directory: ${{ matrix.dir }}
         run: |
           python -m pip install --upgrade pip
-          pip install -r requirements.txt
+          pip install -r requirements.txt
 
       - name: Compile and compare (${{ matrix.name }})
         run: |

.github/workflows/execute-kfp-localrunners.yml

Lines changed: 153 additions & 14 deletions

@@ -19,35 +19,174 @@ on:
         description: "Pull request number or branch name"
         required: true
         default: "main"
+      python_base_image:
+        description: "Override PYTHON_BASE_IMAGE (leave empty to use default)"
+        required: false
+        default: ""
+      docling_base_image:
+        description: "Override DOCLING_BASE_IMAGE (leave empty to use default)"
+        required: false
+        default: ""
+env:
+  QUAY_REGISTRY: quay.io
+  INSTANCE_TYPE: "g6e.xlarge"
 
 jobs:
-  test-local-pipelines:
+  # This job always runs and provides clear feedback to contributors
+  pr-check:
+    runs-on: ubuntu-latest
+    steps:
+      - name: PR Check
+        run: |
+          echo "✅ PR received!"
+          echo ""
+          if [ "${{ github.repository }}" != "opendatahub-io/data-processing" ]; then
+            echo "ℹ️ Note: Full CI tests (EC2 runners, pipeline tests) only run on PRs to opendatahub-io/data-processing."
+            echo "   Your PR will be tested automatically once submitted to the upstream repository's branch."
+          else
+            echo "🚀 Running full CI tests on upstream repository..."
+          fi
+
+  launch-ec2-runner:
+    if: github.repository == 'opendatahub-io/data-processing'
     runs-on: ubuntu-latest
+    permissions:
+      id-token: write # This is required for OIDC (AWS auth)
+      contents: read
+    outputs:
+      label: ${{ steps.start-ec2-runner.outputs.label }}
+      ec2-instance-id: ${{ steps.start-ec2-runner.outputs.ec2-instance-id }}
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v6
+
+      - name: Configure AWS Credentials
+        uses: aws-actions/configure-aws-credentials@61815dcd50bd041e203e49132bacad1fd04d2708 # v5.1.1
+        with:
+          role-to-assume: "arn:aws:iam::${{ secrets.AWS_ACCOUNT_ID }}:role/${{ vars.DATA_PROCESSING_IAM_ROLE }}"
+          aws-region: us-east-2
+          role-session-name: odh-data-processing # For tracking in CloudTrail
+
+      - name: Start Data Processing EC2 runner
+        id: start-ec2-runner
+        uses: machulav/ec2-github-runner@a6dbcefcf8a31a861f5e078bb153ed332130c512 # v2.4.3
+        with:
+          mode: start
+          github-token: "${{ secrets.DATA_PROCESSING_GH_PERSONAL_ACCESS_TOKEN }}"
+          ec2-instance-type: "${{ env.INSTANCE_TYPE }}"
+          availability-zones-config: >
+            [
+              {"imageId": "${{ vars.US_EAST_2_AMI_ID }}", "subnetId": "${{ vars.US_EAST_2A_SUBNET_ID }}", "securityGroupId": "${{ vars.US_EAST_2_SG_ID }}"},
+              {"imageId": "${{ vars.US_EAST_2_AMI_ID }}", "subnetId": "${{ vars.US_EAST_2B_SUBNET_ID }}", "securityGroupId": "${{ vars.US_EAST_2_SG_ID }}"},
+              {"imageId": "${{ vars.US_EAST_2_AMI_ID }}", "subnetId": "${{ vars.US_EAST_2C_SUBNET_ID }}", "securityGroupId": "${{ vars.US_EAST_2_SG_ID }}"}
+            ]
+          iam-role-name: "${{ vars.DATA_PROCESSING_IAM_ROLE }}"
+          aws-resource-tags: >
+            [
+              {"Key": "Name", "Value": "data-processing-gh-runner"},
+              {"Key": "GitHubRepository", "Value": "${{ github.repository }}"},
+              {"Key": "GitHubRef", "Value": "${{ github.ref }}"},
+              {"Key": "GitHubPR", "Value": "${{ github.event.number }}"}
+            ]
+  test-local-pipelines:
+    if: github.repository == 'opendatahub-io/data-processing'
+    needs:
+      - launch-ec2-runner
+    runs-on: ${{ needs.launch-ec2-runner.outputs.label }}
     strategy:
       fail-fast: false
       matrix:
         pipeline:
           - docling-standard
-          #- docling-vlm
+          - docling-vlm
 
     steps:
-      - name: Checkout
-        uses: actions/checkout@v4
-
-      - name: Set up Python 3.12
-        uses: actions/setup-python@v5
-        with:
-          python-version: "3.12"
-
+      - name: Setup Environment
+        run: echo "Running on EC2 ${{ needs.launch-ec2-runner.outputs.ec2-instance-id }}"
+
+      - uses: actions/checkout@v4
+      - name: Setup System Dependencies (Python + Docker)
+        run: |
+          # Install Python
+          sudo dnf install -y python3.11 python3-pip
+
+          # Install Docker CE from official repository
+          sudo dnf config-manager --add-repo https://download.docker.com/linux/centos/docker-ce.repo
+          sudo dnf install -y docker-ce docker-ce-cli containerd.io
+
+          # Start Docker daemon
+          sudo systemctl start docker
+          sudo systemctl enable docker
+
+          # Add current user to docker group
+          sudo usermod -aG docker $(whoami)
+
+          # Apply group membership without logout (temporary for this session)
+          sudo chmod 666 /var/run/docker.sock
       - name: Install minimal requirements
-        run: pip install docker kfp
-
+        run: |
+          # Setup Pip
+          /usr/bin/python3.11 -m ensurepip --upgrade >/dev/null 2>&1
+          /usr/bin/python3.11 -m pip install --upgrade pip
+
+          # 1. Install Requirements (Generic)
+          # We do this first so we can overwrite any bad CPU-versions it pulls in
+          /usr/bin/python3.11 -m pip install docker kfp==2.14.6
       - name: Create output directory
         working-directory: kubeflow-pipelines/${{ matrix.pipeline }}
         run: |
           mkdir -p local_outputs
-          chmod 755 local_outputs
+          chmod 777 local_outputs
+      - name: Log in to Quay Container Registry
+        uses: docker/login-action@v3
+        with:
+          registry: ${{ env.QUAY_REGISTRY }}
+          username: ${{ secrets.QUAY_USERNAME }}
+          password: ${{ secrets.QUAY_PASSWORD }}
 
       - name: Run local pipeline
+        timeout-minutes: 15
         working-directory: kubeflow-pipelines/${{ matrix.pipeline }}
-        run: python local_run.py
+        run: |
+          # Only set env vars if inputs are provided (non-empty)
+          if [ -n "${{ github.event.inputs.python_base_image }}" ]; then
+            export PYTHON_BASE_IMAGE="${{ github.event.inputs.python_base_image }}"
+          fi
+          if [ -n "${{ github.event.inputs.docling_base_image }}" ]; then
+            export DOCLING_BASE_IMAGE="${{ github.event.inputs.docling_base_image }}"
+          fi
+          /usr/bin/python3.11 local_run.py
+      - name: Upload logs on failure
+        if: failure()
+        uses: actions/upload-artifact@v4
+        with:
+          name: ${{ matrix.pipeline }}-logs
+          path: kubeflow-pipelines/${{ matrix.pipeline }}/local_outputs/
+          retention-days: 7
+  stop-ec2-runner:
+    if: github.repository == 'opendatahub-io/data-processing' && always()
+    permissions:
+      id-token: write # This is required for OIDC (AWS auth)
+      contents: read
+    needs:
+      - launch-ec2-runner
+      - test-local-pipelines
+
+    runs-on: ubuntu-latest
+    steps:
+      - name: Configure AWS credentials
+        uses: aws-actions/configure-aws-credentials@61815dcd50bd041e203e49132bacad1fd04d2708 # v5.1.1
+        with:
+          role-to-assume: "arn:aws:iam::${{ secrets.AWS_ACCOUNT_ID }}:role/${{ vars.DATA_PROCESSING_IAM_ROLE }}"
+          aws-region: us-east-2
+          role-session-name: odh-data-processing # For tracking in CloudTrail
+
+      - name: Stop EC2 runner
+        uses: machulav/ec2-github-runner@a6dbcefcf8a31a861f5e078bb153ed332130c512 # v2.4.3
+        with:
+          mode: stop
+          github-token: "${{ secrets.DATA_PROCESSING_GH_PERSONAL_ACCESS_TOKEN }}"
+          label: ${{ needs.launch-ec2-runner.outputs.label }}
+          ec2-instance-id: ${{ needs.launch-ec2-runner.outputs.ec2-instance-id }}
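With the new workflow_dispatch inputs in place, a qualification run against candidate base images can be kicked off from the command line. A minimal sketch using the GitHub CLI, assuming gh is authenticated and the workflow file is on the default branch; the quay.io/example/... tags are placeholders, not published images:

    # Dispatch the local-runner workflow with both base images overridden.
    # Omitting the -f pairs leaves the inputs empty, so the defaults from
    # the common images module apply instead.
    gh workflow run execute-kfp-localrunners.yml \
      --repo opendatahub-io/data-processing \
      -f python_base_image=quay.io/example/python-base:candidate \
      -f docling_base_image=quay.io/example/docling-base:candidate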
Lines changed: 8 additions & 2 deletions

@@ -1,3 +1,9 @@
+import os
+
 # Base container images used across all Docling Kubeflow Pipelines
-PYTHON_BASE_IMAGE = "registry.access.redhat.com/ubi9/python-311:9.6-1755074620"
-DOCLING_BASE_IMAGE = "quay.io/fabianofranz/docling-ubi9:2.54.0"
+PYTHON_BASE_IMAGE = os.getenv(
+    "PYTHON_BASE_IMAGE", "quay.io/amaredia/aipcc-docling-image"
+)
+DOCLING_BASE_IMAGE = os.getenv(
+    "DOCLING_BASE_IMAGE", "quay.io/amaredia/aipcc-docling-image"
+)
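The override mechanism here is plain os.getenv precedence: an exported environment variable wins, otherwise the baked-in default is returned. A minimal sketch of that contract (the image names below are placeholders):

    import os

    # Without an override, the second argument (the default) is returned.
    os.environ.pop("DOCLING_BASE_IMAGE", None)
    assert os.getenv("DOCLING_BASE_IMAGE", "quay.io/example/default") == "quay.io/example/default"

    # With an override exported (as the workflow does for non-empty inputs),
    # the environment value takes precedence over the default.
    os.environ["DOCLING_BASE_IMAGE"] = "quay.io/example/candidate"
    assert os.getenv("DOCLING_BASE_IMAGE", "quay.io/example/default") == "quay.io/example/candidate"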

kubeflow-pipelines/docling-standard/standard_convert_pipeline_compiled.yaml

Lines changed: 5 additions & 5 deletions

@@ -383,7 +383,7 @@ deploymentSpec:
           \ for path in input_path_p.glob(\"*.pdf\")]\n all_splits = [all_pdfs[i::num_splits]\
           \ for i in range(num_splits)]\n filled_splits = list(filter(None, all_splits))\n\
           \ return filled_splits\n\n"
-        image: registry.access.redhat.com/ubi9/python-311:9.6-1755074620
+        image: quay.io/amaredia/aipcc-docling-image
     exec-docling-chunk:
       container:
         args:
@@ -497,7 +497,7 @@ deploymentSpec:
           \ f\"docling-chunk: skipped {len(skipped_files)} invalid files:\"\
           ,\n flush=True,\n )\n for filename, reason in skipped_files:\n\
           \ print(f\" - {filename}: {reason}\", flush=True)\n\n"
-        image: quay.io/fabianofranz/docling-ubi9:2.54.0
+        image: quay.io/amaredia/aipcc-docling-image
         resources:
           cpuLimit: 2.0
           cpuRequest: 0.25
@@ -634,7 +634,7 @@ deploymentSpec:
           \ flush=True)\n result.document.save_as_markdown(\n output_md_path,\
           \ image_mode=ImageRefMode(image_export_mode)\n )\n\n print(\"\
           docling-standard-convert: done\", flush=True)\n\n"
-        image: quay.io/fabianofranz/docling-ubi9:2.54.0
+        image: quay.io/amaredia/aipcc-docling-image
         resources:
           cpuLimit: 4.0
           cpuRequest: 0.5
@@ -711,7 +711,7 @@ deploymentSpec:
           \ )\n else:\n raise ValueError(\n f\"Invalid\
           \ pipeline_type: {pipeline_type}. Must be 'standard' or 'vlm'\"\n \
           \ )\n\n"
-        image: quay.io/fabianofranz/docling-ubi9:2.54.0
+        image: quay.io/amaredia/aipcc-docling-image
     exec-import-pdfs:
       container:
         args:
@@ -805,7 +805,7 @@ deploymentSpec:
           \ in resp.iter_content(chunk_size=8192):\n if chunk:\n\
           \ f.write(chunk)\n\n print(\"import-test-pdfs:\
           \ done\", flush=True)\n\n"
-        image: registry.access.redhat.com/ubi9/python-311:9.6-1755074620
+        image: quay.io/amaredia/aipcc-docling-image
 pipelineInfo:
   description: Docling standard convert pipeline by the Data Processing Team
   name: data-processing-docling-standard-pipeline

kubeflow-pipelines/docling-vlm/vlm_convert_pipeline_compiled.yaml

Lines changed: 5 additions & 5 deletions

@@ -301,7 +301,7 @@ deploymentSpec:
           \ for path in input_path_p.glob(\"*.pdf\")]\n all_splits = [all_pdfs[i::num_splits]\
           \ for i in range(num_splits)]\n filled_splits = list(filter(None, all_splits))\n\
           \ return filled_splits\n\n"
-        image: registry.access.redhat.com/ubi9/python-311:9.6-1755074620
+        image: quay.io/amaredia/aipcc-docling-image
     exec-docling-chunk:
       container:
         args:
@@ -415,7 +415,7 @@ deploymentSpec:
           \ f\"docling-chunk: skipped {len(skipped_files)} invalid files:\"\
           ,\n flush=True,\n )\n for filename, reason in skipped_files:\n\
           \ print(f\" - {filename}: {reason}\", flush=True)\n\n"
-        image: quay.io/fabianofranz/docling-ubi9:2.54.0
+        image: quay.io/amaredia/aipcc-docling-image
         resources:
           cpuLimit: 2.0
           cpuRequest: 0.25
@@ -547,7 +547,7 @@ deploymentSpec:
           \ result.document.save_as_markdown(\n output_md_path,\
           \ image_mode=ImageRefMode(image_export_mode)\n )\n\n print(\"\
           docling-vlm-convert: done\", flush=True)\n\n"
-        image: quay.io/fabianofranz/docling-ubi9:2.54.0
+        image: quay.io/amaredia/aipcc-docling-image
         resources:
           cpuLimit: 4.0
           cpuRequest: 0.5
@@ -624,7 +624,7 @@ deploymentSpec:
           \ )\n else:\n raise ValueError(\n f\"Invalid\
           \ pipeline_type: {pipeline_type}. Must be 'standard' or 'vlm'\"\n \
           \ )\n\n"
-        image: quay.io/fabianofranz/docling-ubi9:2.54.0
+        image: quay.io/amaredia/aipcc-docling-image
     exec-import-pdfs:
      container:
         args:
@@ -718,7 +718,7 @@ deploymentSpec:
           \ in resp.iter_content(chunk_size=8192):\n if chunk:\n\
           \ f.write(chunk)\n\n print(\"import-test-pdfs:\
           \ done\", flush=True)\n\n"
-        image: registry.access.redhat.com/ubi9/python-311:9.6-1755074620
+        image: quay.io/amaredia/aipcc-docling-image
 pipelineInfo:
   description: Docling VLM convert pipeline by the Data Processing Team
   name: data-processing-docling-vlm-pipeline
