[llm-d] Keep working (#911)

kpouget · web-flow · commit c64063bd71f8 · 2026-03-25T12:47:19.000+01:00
<!-- This is an auto-generated comment: release notes by coderabbit.ai
-->
## Summary by CodeRabbit

* **Bug Fixes**
  * Improved benchmark cleanup so job and storage are reliably removed
    even if artifact-copy or wait steps fail.
  * When skipping preparation, the test now downloads the configured model
    to ensure required assets are present.

* **Refactor**
  * Simplified pod selection for artifact and log capture so diagnostics
    include the intended service pods (broader match).

* **Chores**
  * Centralized test preset for a large model and removed redundant
    per-preset model overrides.

* **New Features**
  * Downloader pod can run as root or non-root based on configuration,
    avoiding unnecessary root operations.
<!-- end of auto-generated comment: release notes by coderabbit.ai -->
diff --git a/projects/llm-d/testing/config.yaml b/projects/llm-d/testing/config.yaml
@@ -35,7 +35,6 @@ ci_presets:
     prepare.cluster.skip: true
     prepare.rhoai.skip: true
 
-    tests.llmd.inference_service.model: llama3-1-8b
     tests.capture_prom: false
     tests.capture_prom_uwm: false
     tests.llmd.skip_prepare: false
@@ -44,11 +43,9 @@ ci_presets:
     extends: [azure, opt-125m]
     prepare.pvc.storage_class: managed-csi
 
-
   cks:
-    extends: [pvc_rwx]
+    extends: [pvc_rwx, llama-70b]
 
-    tests.llmd.inference_service.model: llama3-3-70b
     tests.capture_prom: false
     tests.capture_prom_uwm: false
     tests.llmd.skip_prepare: true
@@ -92,6 +89,9 @@ ci_presets:
   gpt-oss:
     tests.llmd.inference_service.model: gpt-oss-120
 
+  llama-70b:
+    tests.llmd.inference_service.model: llama3-3-70b
+
 clusters:
   cleanup_on_exit: false
 
diff --git a/projects/llm-d/testing/test_llmd.py b/projects/llm-d/testing/test_llmd.py
@@ -126,6 +126,8 @@ def test():
         prepare_for_test()
     else:
         logging.info("Skipping test preparation (tests.llmd.skip_prepare=True)")
+        model_ref = config.project.get_config("tests.llmd.inference_service.model")
+        prepare_llmd.download_single_model(model_ref)
 
     logging.info(f"Running tests for flavors: {flavors}")
 
diff --git a/projects/llm-d/toolbox/llmd_capture_isvc_state/tasks/main.yml b/projects/llm-d/toolbox/llmd_capture_isvc_state/tasks/main.yml
@@ -35,7 +35,7 @@
 - name: Capture all pods related to the LLMInferenceService
   shell:
     oc get pods \
-       -l "app.kubernetes.io/component=llminferenceservice-workload,app.kubernetes.io/name={{ llmd_capture_isvc_state_llmisvc_name }}" \
+       -l "app.kubernetes.io/name={{ llmd_capture_isvc_state_llmisvc_name }}" \
        -n "{{ target_namespace }}" \
        -oyaml > "{{ artifact_extra_logs_dir }}/artifacts/llminferenceservice.pods.yaml"
   ignore_errors: true
@@ -96,7 +96,7 @@
 - name: Capture logs from LLMInferenceService pods
   shell: |
     for pod in $(oc get pods \
-       -l "app.kubernetes.io/component=llminferenceservice-workload,app.kubernetes.io/name={{ llmd_capture_isvc_state_llmisvc_name }}" \
+       -l "app.kubernetes.io/name={{ llmd_capture_isvc_state_llmisvc_name }}" \
        -n "{{ target_namespace }}" \
        -o jsonpath='{.items[*].metadata.name}'); do
       echo "=== Logs for pod: $pod ===" >> "{{ artifact_extra_logs_dir }}/artifacts/llminferenceservice.pods.logs"
@@ -108,7 +108,7 @@
 - name: Capture previous logs from LLMInferenceService pods if available
   shell: |
     for pod in $(oc get pods \
-       -l "app.kubernetes.io/component=llminferenceservice-workload,app.kubernetes.io/name={{ llmd_capture_isvc_state_llmisvc_name }}" \
+       -l "app.kubernetes.io/name={{ llmd_capture_isvc_state_llmisvc_name }}" \
        -n "{{ target_namespace }}" \
        -o jsonpath='{.items[*].metadata.name}'); do
       echo "=== Previous logs for pod: $pod ===" >> "{{ artifact_extra_logs_dir }}/artifacts/llminferenceservice.pods.previous.logs"
@@ -127,7 +127,7 @@
 - name: Capture describe output for related pods
   shell: |
     for pod in $(oc get pods \
-       -l "app.kubernetes.io/component=llminferenceservice-workload,app.kubernetes.io/name={{ llmd_capture_isvc_state_llmisvc_name }}" \
+       -l "app.kubernetes.io/name={{ llmd_capture_isvc_state_llmisvc_name }}" \
        -n "{{ target_namespace }}" \
        -o jsonpath='{.items[*].metadata.name}'); do
       echo "=== Describe for pod: $pod ===" >> "{{ artifact_extra_logs_dir }}/artifacts/llminferenceservice.pods.describe.txt"
diff --git a/projects/llm-d/toolbox/llmd_deploy_llm_inference_service/tasks/main.yml b/projects/llm-d/toolbox/llmd_deploy_llm_inference_service/tasks/main.yml
@@ -79,7 +79,7 @@
     shell: |
       set -o pipefail;
       oc get pods \
-         -l "app.kubernetes.io/component=llminferenceservice-workload,app.kubernetes.io/name={{ llmd_deploy_llm_inference_service_name }}" \
+         -l "app.kubernetes.io/name={{ llmd_deploy_llm_inference_service_name }}" \
          -n "{{ llmd_deploy_llm_inference_service_namespace }}" \
          -o json | jq -r '.items[] | select(.status.phase == "Pending") | .metadata.name' | wc -l
     register: llm_pods_pending
@@ -92,7 +92,7 @@
     shell: |
       set -o pipefail;
       oc get pods \
-         -l "app.kubernetes.io/component=llminferenceservice-workload,app.kubernetes.io/name={{ llmd_deploy_llm_inference_service_name }}" \
+         -l "app.kubernetes.io/name={{ llmd_deploy_llm_inference_service_name }}" \
          -n "{{ llmd_deploy_llm_inference_service_namespace }}" \
          -o json | jq -r '.items[] | select(.status.phase != "Running" and .status.phase != "Succeeded") | .metadata.name' | wc -l
     register: llm_pods_not_running
@@ -104,20 +104,11 @@
   - name: Debug - show pod status during startup wait
     command:
       oc get pods \
-         -l "app.kubernetes.io/component=llminferenceservice-workload,app.kubernetes.io/name={{ llmd_deploy_llm_inference_service_name }}" \
+         -l "app.kubernetes.io/name={{ llmd_deploy_llm_inference_service_name }}" \
          -n "{{ llmd_deploy_llm_inference_service_namespace }}" \
          -o wide
     when: llm_pods_not_running.stdout | int > 0
 
-  - name: Debug - show pod events if pods are still not running
-    shell:
-      set -o pipefail;
-      oc get events --field-selector involvedObject.kind=Pod \
-         -n "{{ llmd_deploy_llm_inference_service_namespace }}" \
-         --sort-by='.lastTimestamp' | tail -10
-    when: llm_pods_not_running.stdout | int > 0
-    failed_when: false
-
   - name: Wait for the LLM inference service to be ready
     command:
       oc get llminferenceservice "{{ llmd_deploy_llm_inference_service_name }}" \
@@ -141,7 +132,7 @@
   - name: Capture pod status with oc get pods
     shell:
       oc get pods \
-         -l "app.kubernetes.io/component=llminferenceservice-workload,app.kubernetes.io/name={{ llmd_deploy_llm_inference_service_name }}" \
+         -l "app.kubernetes.io/name={{ llmd_deploy_llm_inference_service_name }}" \
          -n "{{ llmd_deploy_llm_inference_service_namespace }}" \
          -o wide \
          > "{{ artifact_extra_logs_dir }}/artifacts/llm_inference_service.pods.status"
@@ -150,7 +141,7 @@
   - name: Capture deployment YAML
     shell:
       oc get deployments \
-         -l "app.kubernetes.io/component=llminferenceservice-workload,app.kubernetes.io/name={{ llmd_deploy_llm_inference_service_name }}" \
+         -l "app.kubernetes.io/name={{ llmd_deploy_llm_inference_service_name }}" \
          -n "{{ llmd_deploy_llm_inference_service_namespace }}" \
          -oyaml \
          > "{{ artifact_extra_logs_dir }}/artifacts/llm_inference_service.deployments.yaml"
@@ -159,7 +150,7 @@
   - name: Capture replicaset YAML
     shell:
       oc get replicasets \
-         -l "app.kubernetes.io/component=llminferenceservice-workload,app.kubernetes.io/name={{ llmd_deploy_llm_inference_service_name }}" \
+         -l "app.kubernetes.io/name={{ llmd_deploy_llm_inference_service_name }}" \
          -n "{{ llmd_deploy_llm_inference_service_namespace }}" \
          -oyaml \
          > "{{ artifact_extra_logs_dir }}/artifacts/llm_inference_service.replicasets.yaml"
@@ -168,7 +159,7 @@
   - name: Capture the LLM inference service pods YAML
     shell:
       oc get pods \
-         -l "app.kubernetes.io/component=llminferenceservice-workload,app.kubernetes.io/name={{ llmd_deploy_llm_inference_service_name }}" \
+         -l "app.kubernetes.io/name={{ llmd_deploy_llm_inference_service_name }}" \
          -n "{{ llmd_deploy_llm_inference_service_namespace }}" \
          -oyaml \
          > "{{ artifact_extra_logs_dir }}/artifacts/llm_inference_service.pods.yaml"
diff --git a/projects/llm-d/toolbox/llmd_run_guidellm_benchmark/tasks/main.yml b/projects/llm-d/toolbox/llmd_run_guidellm_benchmark/tasks/main.yml
@@ -79,65 +79,68 @@
     when: job_status_result.stdout != 'Complete'
 
   always:
-  - name: Capture the final guidellm benchmark job YAML
-    shell:
-      oc get job "{{ llmd_run_guidellm_benchmark_name }}" -n "{{ target_namespace }}" -oyaml \
-         > "{{ artifact_extra_logs_dir }}/artifacts/guidellm_benchmark_job.final.yaml"
-
-  - name: Capture the guidellm benchmark job pods YAML
-    shell:
-      oc get pods -l "job-name={{ llmd_run_guidellm_benchmark_name }}" -n "{{ target_namespace }}" -oyaml \
-         > "{{ artifact_extra_logs_dir }}/artifacts/guidellm_benchmark_job.pods.yaml"
-
-  - name: Capture the guidellm benchmark job logs
-    shell:
-      oc logs job/{{ llmd_run_guidellm_benchmark_name }} -n "{{ target_namespace }}" \
-         > "{{ artifact_extra_logs_dir }}/artifacts/guidellm_benchmark_job.logs"
-
-  - name: Create the results subdirectory
-    file:
-      path: "{{ artifact_extra_logs_dir }}/artifacts/results/"
-      state: directory
-      mode: '0755'
-
-  - name: Get the node where the GuideLLM job pod ran
-    shell: |
-      set -o pipefail;
-      oc get pods -l "job-name={{ llmd_run_guidellm_benchmark_name }}" -n "{{ target_namespace }}" --no-headers -o custom-columns=":spec.nodeName" | head -1
-    register: guidellm_node_result
-    failed_when: false
-
-  - name: Set the target node for helper pod
-    set_fact:
-      helper_pod_node: "{{ guidellm_node_result.stdout | trim }}"
-
-  - name: Create the copy helper pod YAML
-    template:
-      src: "{{ copy_helper_pod_template }}"
-      dest: "{{ artifact_extra_logs_dir }}/src/copy_helper_pod.yaml"
-      mode: '0700'
-
-  - name: Create the copy helper pod
-    command:
-      oc create -f "{{ artifact_extra_logs_dir }}/src/copy_helper_pod.yaml"
-
-  - name: Wait for copy helper pod to be ready
-    shell: |
-      oc wait --for=condition=Ready pod/{{ llmd_run_guidellm_benchmark_name }}-copy -n "{{ target_namespace }}" --timeout=60s
-
-  - name: Copy benchmarks.json from PVC to artifacts directory
-    shell: |
-      oc exec {{ llmd_run_guidellm_benchmark_name }}-copy -n "{{ target_namespace }}" -- cat /results/benchmarks.json > "{{ artifact_extra_logs_dir }}/artifacts/results/benchmarks.json" 2>/dev/null || echo "Warning: benchmarks.json not found in PVC"
-    failed_when: false
-
-  - name: Delete the copy helper pod
-    command:
-      oc delete pod "{{ llmd_run_guidellm_benchmark_name }}-copy" -n "{{ target_namespace }}" --ignore-not-found
-
-  - name: Delete the guidellm benchmark job
-    command:
-      oc delete job "{{ llmd_run_guidellm_benchmark_name }}" -n "{{ target_namespace }}" --ignore-not-found
-
-  - name: Delete the guidellm benchmark PVC
-    command:
-      oc delete pvc "{{ llmd_run_guidellm_benchmark_name }}" -n "{{ target_namespace }}" --ignore-not-found
+  - name: Capture artifacts
+    block:
+    - name: Capture the final guidellm benchmark job YAML
+      shell:
+        oc get job "{{ llmd_run_guidellm_benchmark_name }}" -n "{{ target_namespace }}" -oyaml \
+           > "{{ artifact_extra_logs_dir }}/artifacts/guidellm_benchmark_job.final.yaml"
+
+    - name: Capture the guidellm benchmark job pods YAML
+      shell:
+        oc get pods -l "job-name={{ llmd_run_guidellm_benchmark_name }}" -n "{{ target_namespace }}" -oyaml \
+           > "{{ artifact_extra_logs_dir }}/artifacts/guidellm_benchmark_job.pods.yaml"
+
+    - name: Capture the guidellm benchmark job logs
+      shell:
+        oc logs job/{{ llmd_run_guidellm_benchmark_name }} -n "{{ target_namespace }}" \
+           > "{{ artifact_extra_logs_dir }}/artifacts/guidellm_benchmark_job.logs"
+    always:
+    - name: Create the results subdirectory
+      file:
+        path: "{{ artifact_extra_logs_dir }}/artifacts/results/"
+        state: directory
+        mode: '0755'
+
+    - name: Get the node where the GuideLLM job pod ran
+      shell: |
+        set -o pipefail;
+        oc get pods -l "job-name={{ llmd_run_guidellm_benchmark_name }}" -n "{{ target_namespace }}" --no-headers -o custom-columns=":spec.nodeName" | head -1
+      register: guidellm_node_result
+      failed_when: false
+
+    - name: Set the target node for helper pod
+      set_fact:
+        helper_pod_node: "{{ guidellm_node_result.stdout | trim }}"
+
+    - name: Create the copy helper pod YAML
+      template:
+        src: "{{ copy_helper_pod_template }}"
+        dest: "{{ artifact_extra_logs_dir }}/src/copy_helper_pod.yaml"
+        mode: '0700'
+
+    - name: Create the copy helper pod
+      command:
+        oc create -f "{{ artifact_extra_logs_dir }}/src/copy_helper_pod.yaml"
+
+    - name: Retrieve the test results locally
+      block:
+      - name: Wait for copy helper pod to be ready
+        shell: |
+          oc wait --for=condition=Ready pod/{{ llmd_run_guidellm_benchmark_name }}-copy -n "{{ target_namespace }}" --timeout=60s
+
+      - name: Copy benchmarks.json from PVC to artifacts directory
+        shell: |
+          oc exec {{ llmd_run_guidellm_benchmark_name }}-copy -n "{{ target_namespace }}" -- cat /results/benchmarks.json > "{{ artifact_extra_logs_dir }}/artifacts/results/benchmarks.json" 2>/dev/null || echo "Warning: benchmarks.json not found in PVC"
+      always:
+      - name: Delete the copy helper pod
+        command:
+          oc delete pod "{{ llmd_run_guidellm_benchmark_name }}-copy" -n "{{ target_namespace }}" --ignore-not-found
+
+      - name: Delete the guidellm benchmark job
+        command:
+          oc delete job "{{ llmd_run_guidellm_benchmark_name }}" -n "{{ target_namespace }}" --ignore-not-found
+
+      - name: Delete the guidellm benchmark PVC
+        command:
+          oc delete pvc "{{ llmd_run_guidellm_benchmark_name }}" -n "{{ target_namespace }}" --ignore-not-found
diff --git a/projects/storage/toolbox/storage_download_to_pvc/templates/pod.yml.j2 b/projects/storage/toolbox/storage_download_to_pvc/templates/pod.yml.j2
@@ -10,11 +10,13 @@ spec:
   containers:
   - name: downloader
     image: {{ storage_download_to_pvc_image }}
-    command: [bash, -c, "/mnt/entrypoint/entrypoint.sh && chmod -R a+rX /storage"]
 {% if storage_download_to_pvc_run_as_root %}
+    command: [bash, -c, "/mnt/entrypoint/entrypoint.sh && chmod -R a+rX /storage"]
     securityContext:
       runAsUser: 0
       allowPrivilegeEscalation: true
+{% else %}
+    command: [bash, /mnt/entrypoint/entrypoint.sh]
 {% endif %}
     env:
     - name: DOWNLOAD_SOURCE