Skip to content

Commit c64063b

Browse files
authored
[llm-d] Keep working (#911)
<!-- This is an auto-generated comment: release notes by coderabbit.ai -->
## Summary by CodeRabbit

* **Bug Fixes**
  * Improved benchmark cleanup so job and storage are reliably removed even if artifact-copy or wait steps fail.
  * When skipping preparation, the test now downloads the configured model to ensure required assets are present.
* **Refactor**
  * Simplified pod selection for artifact and log capture so diagnostics include the intended service pods (broader match).
* **Chores**
  * Centralized test preset for a large model and removed redundant per-preset model overrides.
* **New Features**
  * Downloader pod can run as root or non-root based on configuration, avoiding unnecessary root operations.

<!-- end of auto-generated comment: release notes by coderabbit.ai -->
2 parents 227b1fa + 89dfec2 commit c64063b

File tree

6 files changed

+85
-87
lines changed

6 files changed

+85
-87
lines changed

projects/llm-d/testing/config.yaml

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,6 @@ ci_presets:
3535
prepare.cluster.skip: true
3636
prepare.rhoai.skip: true
3737

38-
tests.llmd.inference_service.model: llama3-1-8b
3938
tests.capture_prom: false
4039
tests.capture_prom_uwm: false
4140
tests.llmd.skip_prepare: false
@@ -44,11 +43,9 @@ ci_presets:
4443
extends: [azure, opt-125m]
4544
prepare.pvc.storage_class: managed-csi
4645

47-
4846
cks:
49-
extends: [pvc_rwx]
47+
extends: [pvc_rwx, llama-70b]
5048

51-
tests.llmd.inference_service.model: llama3-3-70b
5249
tests.capture_prom: false
5350
tests.capture_prom_uwm: false
5451
tests.llmd.skip_prepare: true
@@ -92,6 +89,9 @@ ci_presets:
9289
gpt-oss:
9390
tests.llmd.inference_service.model: gpt-oss-120
9491

92+
llama-70b:
93+
tests.llmd.inference_service.model: llama3-3-70b
94+
9595
clusters:
9696
cleanup_on_exit: false
9797

projects/llm-d/testing/test_llmd.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -126,6 +126,8 @@ def test():
126126
prepare_for_test()
127127
else:
128128
logging.info("Skipping test preparation (tests.llmd.skip_prepare=True)")
129+
model_ref = config.project.get_config("tests.llmd.inference_service.model")
130+
prepare_llmd.download_single_model(model_ref)
129131

130132
logging.info(f"Running tests for flavors: {flavors}")
131133

projects/llm-d/toolbox/llmd_capture_isvc_state/tasks/main.yml

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@
3535
- name: Capture all pods related to the LLMInferenceService
3636
shell:
3737
oc get pods \
38-
-l "app.kubernetes.io/component=llminferenceservice-workload,app.kubernetes.io/name={{ llmd_capture_isvc_state_llmisvc_name }}" \
38+
-l "app.kubernetes.io/name={{ llmd_capture_isvc_state_llmisvc_name }}" \
3939
-n "{{ target_namespace }}" \
4040
-oyaml > "{{ artifact_extra_logs_dir }}/artifacts/llminferenceservice.pods.yaml"
4141
ignore_errors: true
@@ -96,7 +96,7 @@
9696
- name: Capture logs from LLMInferenceService pods
9797
shell: |
9898
for pod in $(oc get pods \
99-
-l "app.kubernetes.io/component=llminferenceservice-workload,app.kubernetes.io/name={{ llmd_capture_isvc_state_llmisvc_name }}" \
99+
-l "app.kubernetes.io/name={{ llmd_capture_isvc_state_llmisvc_name }}" \
100100
-n "{{ target_namespace }}" \
101101
-o jsonpath='{.items[*].metadata.name}'); do
102102
echo "=== Logs for pod: $pod ===" >> "{{ artifact_extra_logs_dir }}/artifacts/llminferenceservice.pods.logs"
@@ -108,7 +108,7 @@
108108
- name: Capture previous logs from LLMInferenceService pods if available
109109
shell: |
110110
for pod in $(oc get pods \
111-
-l "app.kubernetes.io/component=llminferenceservice-workload,app.kubernetes.io/name={{ llmd_capture_isvc_state_llmisvc_name }}" \
111+
-l "app.kubernetes.io/name={{ llmd_capture_isvc_state_llmisvc_name }}" \
112112
-n "{{ target_namespace }}" \
113113
-o jsonpath='{.items[*].metadata.name}'); do
114114
echo "=== Previous logs for pod: $pod ===" >> "{{ artifact_extra_logs_dir }}/artifacts/llminferenceservice.pods.previous.logs"
@@ -127,7 +127,7 @@
127127
- name: Capture describe output for related pods
128128
shell: |
129129
for pod in $(oc get pods \
130-
-l "app.kubernetes.io/component=llminferenceservice-workload,app.kubernetes.io/name={{ llmd_capture_isvc_state_llmisvc_name }}" \
130+
-l "app.kubernetes.io/name={{ llmd_capture_isvc_state_llmisvc_name }}" \
131131
-n "{{ target_namespace }}" \
132132
-o jsonpath='{.items[*].metadata.name}'); do
133133
echo "=== Describe for pod: $pod ===" >> "{{ artifact_extra_logs_dir }}/artifacts/llminferenceservice.pods.describe.txt"

projects/llm-d/toolbox/llmd_deploy_llm_inference_service/tasks/main.yml

Lines changed: 7 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -79,7 +79,7 @@
7979
shell: |
8080
set -o pipefail;
8181
oc get pods \
82-
-l "app.kubernetes.io/component=llminferenceservice-workload,app.kubernetes.io/name={{ llmd_deploy_llm_inference_service_name }}" \
82+
-l "app.kubernetes.io/name={{ llmd_deploy_llm_inference_service_name }}" \
8383
-n "{{ llmd_deploy_llm_inference_service_namespace }}" \
8484
-o json | jq -r '.items[] | select(.status.phase == "Pending") | .metadata.name' | wc -l
8585
register: llm_pods_pending
@@ -92,7 +92,7 @@
9292
shell: |
9393
set -o pipefail;
9494
oc get pods \
95-
-l "app.kubernetes.io/component=llminferenceservice-workload,app.kubernetes.io/name={{ llmd_deploy_llm_inference_service_name }}" \
95+
-l "app.kubernetes.io/name={{ llmd_deploy_llm_inference_service_name }}" \
9696
-n "{{ llmd_deploy_llm_inference_service_namespace }}" \
9797
-o json | jq -r '.items[] | select(.status.phase != "Running" and .status.phase != "Succeeded") | .metadata.name' | wc -l
9898
register: llm_pods_not_running
@@ -104,20 +104,11 @@
104104
- name: Debug - show pod status during startup wait
105105
command:
106106
oc get pods \
107-
-l "app.kubernetes.io/component=llminferenceservice-workload,app.kubernetes.io/name={{ llmd_deploy_llm_inference_service_name }}" \
107+
-l "app.kubernetes.io/name={{ llmd_deploy_llm_inference_service_name }}" \
108108
-n "{{ llmd_deploy_llm_inference_service_namespace }}" \
109109
-o wide
110110
when: llm_pods_not_running.stdout | int > 0
111111

112-
- name: Debug - show pod events if pods are still not running
113-
shell:
114-
set -o pipefail;
115-
oc get events --field-selector involvedObject.kind=Pod \
116-
-n "{{ llmd_deploy_llm_inference_service_namespace }}" \
117-
--sort-by='.lastTimestamp' | tail -10
118-
when: llm_pods_not_running.stdout | int > 0
119-
failed_when: false
120-
121112
- name: Wait for the LLM inference service to be ready
122113
command:
123114
oc get llminferenceservice "{{ llmd_deploy_llm_inference_service_name }}" \
@@ -141,7 +132,7 @@
141132
- name: Capture pod status with oc get pods
142133
shell:
143134
oc get pods \
144-
-l "app.kubernetes.io/component=llminferenceservice-workload,app.kubernetes.io/name={{ llmd_deploy_llm_inference_service_name }}" \
135+
-l "app.kubernetes.io/name={{ llmd_deploy_llm_inference_service_name }}" \
145136
-n "{{ llmd_deploy_llm_inference_service_namespace }}" \
146137
-o wide \
147138
> "{{ artifact_extra_logs_dir }}/artifacts/llm_inference_service.pods.status"
@@ -150,7 +141,7 @@
150141
- name: Capture deployment YAML
151142
shell:
152143
oc get deployments \
153-
-l "app.kubernetes.io/component=llminferenceservice-workload,app.kubernetes.io/name={{ llmd_deploy_llm_inference_service_name }}" \
144+
-l "app.kubernetes.io/name={{ llmd_deploy_llm_inference_service_name }}" \
154145
-n "{{ llmd_deploy_llm_inference_service_namespace }}" \
155146
-oyaml \
156147
> "{{ artifact_extra_logs_dir }}/artifacts/llm_inference_service.deployments.yaml"
@@ -159,7 +150,7 @@
159150
- name: Capture replicaset YAML
160151
shell:
161152
oc get replicasets \
162-
-l "app.kubernetes.io/component=llminferenceservice-workload,app.kubernetes.io/name={{ llmd_deploy_llm_inference_service_name }}" \
153+
-l "app.kubernetes.io/name={{ llmd_deploy_llm_inference_service_name }}" \
163154
-n "{{ llmd_deploy_llm_inference_service_namespace }}" \
164155
-oyaml \
165156
> "{{ artifact_extra_logs_dir }}/artifacts/llm_inference_service.replicasets.yaml"
@@ -168,7 +159,7 @@
168159
- name: Capture the LLM inference service pods YAML
169160
shell:
170161
oc get pods \
171-
-l "app.kubernetes.io/component=llminferenceservice-workload,app.kubernetes.io/name={{ llmd_deploy_llm_inference_service_name }}" \
162+
-l "app.kubernetes.io/name={{ llmd_deploy_llm_inference_service_name }}" \
172163
-n "{{ llmd_deploy_llm_inference_service_namespace }}" \
173164
-oyaml \
174165
> "{{ artifact_extra_logs_dir }}/artifacts/llm_inference_service.pods.yaml"

projects/llm-d/toolbox/llmd_run_guidellm_benchmark/tasks/main.yml

Lines changed: 65 additions & 62 deletions
Original file line numberDiff line numberDiff line change
@@ -79,65 +79,68 @@
7979
when: job_status_result.stdout != 'Complete'
8080

8181
always:
82-
- name: Capture the final guidellm benchmark job YAML
83-
shell:
84-
oc get job "{{ llmd_run_guidellm_benchmark_name }}" -n "{{ target_namespace }}" -oyaml \
85-
> "{{ artifact_extra_logs_dir }}/artifacts/guidellm_benchmark_job.final.yaml"
86-
87-
- name: Capture the guidellm benchmark job pods YAML
88-
shell:
89-
oc get pods -l "job-name={{ llmd_run_guidellm_benchmark_name }}" -n "{{ target_namespace }}" -oyaml \
90-
> "{{ artifact_extra_logs_dir }}/artifacts/guidellm_benchmark_job.pods.yaml"
91-
92-
- name: Capture the guidellm benchmark job logs
93-
shell:
94-
oc logs job/{{ llmd_run_guidellm_benchmark_name }} -n "{{ target_namespace }}" \
95-
> "{{ artifact_extra_logs_dir }}/artifacts/guidellm_benchmark_job.logs"
96-
97-
- name: Create the results subdirectory
98-
file:
99-
path: "{{ artifact_extra_logs_dir }}/artifacts/results/"
100-
state: directory
101-
mode: '0755'
102-
103-
- name: Get the node where the GuideLLM job pod ran
104-
shell: |
105-
set -o pipefail;
106-
oc get pods -l "job-name={{ llmd_run_guidellm_benchmark_name }}" -n "{{ target_namespace }}" --no-headers -o custom-columns=":spec.nodeName" | head -1
107-
register: guidellm_node_result
108-
failed_when: false
109-
110-
- name: Set the target node for helper pod
111-
set_fact:
112-
helper_pod_node: "{{ guidellm_node_result.stdout | trim }}"
113-
114-
- name: Create the copy helper pod YAML
115-
template:
116-
src: "{{ copy_helper_pod_template }}"
117-
dest: "{{ artifact_extra_logs_dir }}/src/copy_helper_pod.yaml"
118-
mode: '0700'
119-
120-
- name: Create the copy helper pod
121-
command:
122-
oc create -f "{{ artifact_extra_logs_dir }}/src/copy_helper_pod.yaml"
123-
124-
- name: Wait for copy helper pod to be ready
125-
shell: |
126-
oc wait --for=condition=Ready pod/{{ llmd_run_guidellm_benchmark_name }}-copy -n "{{ target_namespace }}" --timeout=60s
127-
128-
- name: Copy benchmarks.json from PVC to artifacts directory
129-
shell: |
130-
oc exec {{ llmd_run_guidellm_benchmark_name }}-copy -n "{{ target_namespace }}" -- cat /results/benchmarks.json > "{{ artifact_extra_logs_dir }}/artifacts/results/benchmarks.json" 2>/dev/null || echo "Warning: benchmarks.json not found in PVC"
131-
failed_when: false
132-
133-
- name: Delete the copy helper pod
134-
command:
135-
oc delete pod "{{ llmd_run_guidellm_benchmark_name }}-copy" -n "{{ target_namespace }}" --ignore-not-found
136-
137-
- name: Delete the guidellm benchmark job
138-
command:
139-
oc delete job "{{ llmd_run_guidellm_benchmark_name }}" -n "{{ target_namespace }}" --ignore-not-found
140-
141-
- name: Delete the guidellm benchmark PVC
142-
command:
143-
oc delete pvc "{{ llmd_run_guidellm_benchmark_name }}" -n "{{ target_namespace }}" --ignore-not-found
82+
- name: Capture artifacts
83+
block:
84+
- name: Capture the final guidellm benchmark job YAML
85+
shell:
86+
oc get job "{{ llmd_run_guidellm_benchmark_name }}" -n "{{ target_namespace }}" -oyaml \
87+
> "{{ artifact_extra_logs_dir }}/artifacts/guidellm_benchmark_job.final.yaml"
88+
89+
- name: Capture the guidellm benchmark job pods YAML
90+
shell:
91+
oc get pods -l "job-name={{ llmd_run_guidellm_benchmark_name }}" -n "{{ target_namespace }}" -oyaml \
92+
> "{{ artifact_extra_logs_dir }}/artifacts/guidellm_benchmark_job.pods.yaml"
93+
94+
- name: Capture the guidellm benchmark job logs
95+
shell:
96+
oc logs job/{{ llmd_run_guidellm_benchmark_name }} -n "{{ target_namespace }}" \
97+
> "{{ artifact_extra_logs_dir }}/artifacts/guidellm_benchmark_job.logs"
98+
always:
99+
- name: Create the results subdirectory
100+
file:
101+
path: "{{ artifact_extra_logs_dir }}/artifacts/results/"
102+
state: directory
103+
mode: '0755'
104+
105+
- name: Get the node where the GuideLLM job pod ran
106+
shell: |
107+
set -o pipefail;
108+
oc get pods -l "job-name={{ llmd_run_guidellm_benchmark_name }}" -n "{{ target_namespace }}" --no-headers -o custom-columns=":spec.nodeName" | head -1
109+
register: guidellm_node_result
110+
failed_when: false
111+
112+
- name: Set the target node for helper pod
113+
set_fact:
114+
helper_pod_node: "{{ guidellm_node_result.stdout | trim }}"
115+
116+
- name: Create the copy helper pod YAML
117+
template:
118+
src: "{{ copy_helper_pod_template }}"
119+
dest: "{{ artifact_extra_logs_dir }}/src/copy_helper_pod.yaml"
120+
mode: '0700'
121+
122+
- name: Create the copy helper pod
123+
command:
124+
oc create -f "{{ artifact_extra_logs_dir }}/src/copy_helper_pod.yaml"
125+
126+
- name: Retrieve the test results locally
127+
block:
128+
- name: Wait for copy helper pod to be ready
129+
shell: |
130+
oc wait --for=condition=Ready pod/{{ llmd_run_guidellm_benchmark_name }}-copy -n "{{ target_namespace }}" --timeout=60s
131+
132+
- name: Copy benchmarks.json from PVC to artifacts directory
133+
shell: |
134+
oc exec {{ llmd_run_guidellm_benchmark_name }}-copy -n "{{ target_namespace }}" -- cat /results/benchmarks.json > "{{ artifact_extra_logs_dir }}/artifacts/results/benchmarks.json" 2>/dev/null || echo "Warning: benchmarks.json not found in PVC"
135+
always:
136+
- name: Delete the copy helper pod
137+
command:
138+
oc delete pod "{{ llmd_run_guidellm_benchmark_name }}-copy" -n "{{ target_namespace }}" --ignore-not-found
139+
140+
- name: Delete the guidellm benchmark job
141+
command:
142+
oc delete job "{{ llmd_run_guidellm_benchmark_name }}" -n "{{ target_namespace }}" --ignore-not-found
143+
144+
- name: Delete the guidellm benchmark PVC
145+
command:
146+
oc delete pvc "{{ llmd_run_guidellm_benchmark_name }}" -n "{{ target_namespace }}" --ignore-not-found

projects/storage/toolbox/storage_download_to_pvc/templates/pod.yml.j2

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,11 +10,13 @@ spec:
1010
containers:
1111
- name: downloader
1212
image: {{ storage_download_to_pvc_image }}
13-
command: [bash, -c, "/mnt/entrypoint/entrypoint.sh && chmod -R a+rX /storage"]
1413
{% if storage_download_to_pvc_run_as_root %}
14+
command: [bash, -c, "/mnt/entrypoint/entrypoint.sh && chmod -R a+rX /storage"]
1515
securityContext:
1616
runAsUser: 0
1717
allowPrivilegeEscalation: true
18+
{% else %}
19+
command: [bash, /mnt/entrypoint/entrypoint.sh]
1820
{% endif %}
1921
env:
2022
- name: DOWNLOAD_SOURCE

0 commit comments

Comments
 (0)