Skip to content

Commit 227b1fa

Browse files
authored
[llm-d] Keep working (#909)
<!-- This is an auto-generated comment: release notes by coderabbit.ai --> ## Summary by CodeRabbit * **New Features** * Persistent PVC for benchmark results and a helper pod to copy results. * JSON-based GuideLLM benchmark parsing and load-shape subtitle in throughput plots. * **Improvements** * Configurable PVC size (default 1Gi), and new flags to control running containers as root/user. * Configurable gateway name, conditional GPU readiness/preload, and reduced memory request for an inference service. * **Documentation** * Added docs for storage and security-related parameters. <!-- end of auto-generated comment: release notes by coderabbit.ai -->
2 parents 51c57bd + 83fccba commit 227b1fa

File tree

23 files changed

+647
-74
lines changed

23 files changed

+647
-74
lines changed

docs/toolbox.generated/Cluster.preload_image.rst

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,3 +54,10 @@ Parameters
5454

5555
* Pod toleration to apply to the DaemonSet.
5656

57+
58+
``run_as_user``
59+
60+
* User ID to run the preloader container as (defaults to 10001 if not specified).
61+
62+
* default value: ``10001``
63+

docs/toolbox.generated/Llmd.run_guidellm_benchmark.rst

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,19 @@ Parameters
5656
* default value: ``900``
5757

5858

59+
``pvc_size``
60+
61+
* Size of the PersistentVolumeClaim for storing results
62+
63+
* default value: ``1Gi``
64+
65+
5966
``guidellm_args``
6067

6168
* List of additional guidellm arguments (e.g., ["--rate=10", "--max-seconds=30"])
6269

70+
71+
``run_as_root``
72+
73+
* Run the GuideLLM container as root user
74+

docs/toolbox.generated/Storage.download_to_pvc.rst

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -80,3 +80,8 @@ Parameters
8080

8181
* default value: ``registry.access.redhat.com/ubi9/ubi``
8282

83+
84+
``run_as_root``
85+
86+
* Run the download container as root user
87+

projects/cluster/toolbox/cluster.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -316,7 +316,8 @@ def destroy_osd(self, cluster_name):
316316
def preload_image(self,
317317
name, image, namespace="default",
318318
node_selector_key="", node_selector_value="",
319-
pod_toleration_key="", pod_toleration_effect=""):
319+
pod_toleration_key="", pod_toleration_effect="",
320+
run_as_user="10001"):
320321
"""
321322
Preload a container image on all the nodes of a cluster.
322323
@@ -328,6 +329,7 @@ def preload_image(self,
328329
node_selector_value: NodeSelector value to apply to the DaemonSet.
329330
pod_toleration_key: Pod toleration to apply to the DaemonSet.
330331
pod_toleration_effect: Pod toleration to apply to the DaemonSet.
332+
run_as_user: User ID to run the preloader container as (defaults to 10001 if not specified).
331333
"""
332334

333335
toolbox_name_suffix = os.environ.get("ARTIFACT_TOOLBOX_NAME_SUFFIX", "")

projects/cluster/toolbox/cluster_preload_image/defaults/main/config.yml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,9 @@ cluster_preload_image_pod_toleration_key:
2626
# Pod toleration to apply to the DaemonSet.
2727
cluster_preload_image_pod_toleration_effect:
2828

29+
# User ID to run the preloader container as (defaults to 10001 if not specified).
30+
cluster_preload_image_run_as_user: '10001'
31+
2932
# Default Ansible variables
3033
# Default value for ansible_os_family to ensure role remains standalone
3134
ansible_os_family: Linux

projects/cluster/toolbox/cluster_preload_image/tasks/main.yml

Lines changed: 6 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -11,16 +11,13 @@
1111
state: directory
1212
mode: '0755'
1313

14-
- name: Lookup the namespace user ID range
15-
shell:
16-
set -o pipefail;
17-
18-
oc get ns {{ cluster_preload_image_namespace }} -ojsonpath={.metadata.annotations} | jq -r '.["openshift.io/sa.scc.uid-range"]' | cut -d/ -f1
19-
register: namespace_uid_range_cmd
20-
21-
- name: Save the namespace uid as run_as_user
14+
- name: Set run_as_user from configuration
2215
set_fact:
23-
run_as_user: "{{ namespace_uid_range_cmd.stdout }}"
16+
run_as_user: "{{ cluster_preload_image_run_as_user }}"
17+
18+
- name: Log run_as_user value
19+
debug:
20+
msg: "Using run_as_user: {{ run_as_user }}"
2421

2522
- name: Apply the DaemonSet template
2623
template:

projects/llm-d/testing/config.yaml

Lines changed: 73 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -12,8 +12,43 @@ ci_presets:
1212

1313
dev:
1414
matbench.enabled: false
15+
tests.llmd.inference_service.model: facebook-opt-125m
16+
17+
opt-125m:
18+
tests.llmd.inference_service.model: facebook-opt-125m
19+
tests.llmd.inference_service.vllm_args[5]: "--max-model-len=2048"
20+
21+
psap_h200:
22+
tests.capture_prom: false
23+
tests.capture_prom_uwm: false
24+
tests.llmd.skip_prepare: true
25+
prepare.namespace.name: kpouget-dev
26+
27+
pvc_rwx:
28+
prepare.pvc.name: storage-rwx
29+
prepare.pvc.access_mode: ReadWriteMany
30+
31+
azure:
32+
security.run_as_root: true
33+
prepare.preload.skip: true
34+
prepare.operators.skip: true
35+
prepare.cluster.skip: true
36+
prepare.rhoai.skip: true
37+
38+
tests.llmd.inference_service.model: llama3-1-8b
39+
tests.capture_prom: false
40+
tests.capture_prom_uwm: false
41+
tests.llmd.skip_prepare: false
42+
43+
azure_light:
44+
extends: [azure, opt-125m]
45+
prepare.pvc.storage_class: managed-csi
46+
1547

1648
cks:
49+
extends: [pvc_rwx]
50+
51+
tests.llmd.inference_service.model: llama3-3-70b
1752
tests.capture_prom: false
1853
tests.capture_prom_uwm: false
1954
tests.llmd.skip_prepare: true
@@ -60,6 +95,10 @@ ci_presets:
6095
clusters:
6196
cleanup_on_exit: false
6297

98+
security:
99+
# Run containers as root user (affects both GuideLLM benchmarks and storage download operations)
100+
run_as_root: false
101+
63102
secrets:
64103
dir:
65104
name: psap-ods-secret
@@ -149,15 +188,21 @@ prepare:
149188
namespaces:
150189
- "@prepare.namespace.name"
151190

191+
gpu:
192+
wait_for_readiness: false
193+
152194
preload:
195+
skip: false
153196
extra_images: {}
154197
node_selector_key: nvidia.com/gpu.present
155198
node_selector_value: "true"
156199

157200
pvc:
201+
enabled: true
158202
size: 2000Gi
159-
name: storage-rwx
160-
access_mode: ReadWriteMany
203+
name: storage
204+
access_mode: ReadWriteOnce
205+
storage_class: null
161206

162207
model_downloader:
163208
image: ghcr.io/opendatahub-io/rhaii-on-xks/kserve-storage-initializer:e6b5db0@sha256:b305264fe2211be2c6063500c4c11da79e8357af4b34dd8567b0d8e8dea7e1d4
@@ -166,21 +211,38 @@ prepare:
166211
skip: false
167212

168213
models:
169-
llama3.1-8b:
170-
name: RedHatAI/podllama-3-1-8b-instruct-fp8
214+
facebook-opt-125m:
215+
name: facebook/opt-125m
216+
source: hf://facebook/opt-125m
217+
resources:
218+
cpu: 2
219+
memory: 8Gi
220+
221+
llama3-1-8b:
222+
name: RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8-dynamic
171223
uri: oci://registry.redhat.io/rhelai1/modelcar-llama-3-1-8b-instruct-fp8-dynamic:1.5
224+
# source: hf://RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8-dynamic
225+
resources: {}
172226

173-
llama3.3-70b:
227+
llama3-3-70b:
174228
name: RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic
175229
source: hf://RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic
230+
resources:
231+
cpu: 4
232+
memory: 64Gi
176233

177234
gpt-oss-120:
178235
name: openai/gpt-oss-120b
179236
source: hf://openai/gpt-oss-120b
237+
resources:
238+
cpu: 4
239+
memory: 64Gi
180240

181241
granite4-tiny:
182242
name: RedHatAI/granite-4.0-h-tiny-FP8-dynamic
183243
source: hf://RedHatAI/granite-4.0-h-tiny-FP8-dynamic
244+
resources: {}
245+
184246
tests:
185247
llmd:
186248
skip: false
@@ -194,8 +256,9 @@ tests:
194256
yaml_file: llama-3-1-8b-instruct-fp8.yaml
195257
timeout: 900
196258
do_simple_test: true
197-
198-
model: llama3.3-70b
259+
gateway:
260+
name: gateway-external
261+
model: llama3-1-8b
199262

200263
# vLLM arguments (always applied)
201264
vllm_args:
@@ -204,7 +267,7 @@ tests:
204267
- "--uvicorn-log-level=debug"
205268
- "--trust-remote-code"
206269
- "--disable-log-requests"
207-
- "--max-model-len=40960"
270+
- "--max-model-len=40960" # keep in 5th position or update the presets
208271
- "--gpu-memory-utilization=0.92"
209272

210273
kueue:
@@ -225,11 +288,12 @@ tests:
225288
name: guidellm-benchmark
226289
backend_type: openai_http
227290
rate_type: concurrent
228-
max_seconds: null
291+
max_seconds: 120
229292
max_requests: null
230293
timeout: 900
231294
data: prompt_tokens=256,output_tokens=128
232295
rate: 1
296+
sample_requests: 20
233297

234298
capture_prom: true
235299
capture_prom_uwm: true

projects/llm-d/testing/llmisvcs/llama-3-1-8b-instruct-fp8.yaml

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,8 @@ spec:
7171
emptyDir: {}
7272
- name: cachi2-cache
7373
emptyDir: {}
74+
nodeSelector:
75+
nvidia.com/gpu.present: "true"
7476
route: {}
7577
gateway: {}
7678
template:
@@ -85,7 +87,7 @@ spec:
8587
nvidia.com/gpu: "1"
8688
requests:
8789
cpu: '4'
88-
memory: 64Gi
90+
memory: 8Gi
8991
nvidia.com/gpu: "1"
9092
livenessProbe:
9193
httpGet:

projects/llm-d/testing/prepare_llmd.py

Lines changed: 44 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -195,10 +195,15 @@ def prepare():
195195
prepare_gateway()
196196
scale_up()
197197

198+
model_ref = config.project.get_config("tests.llmd.inference_service.model")
198199
with run.Parallel("prepare_node") as parallel:
199-
parallel.delayed(download_models_to_pvc)
200-
parallel.delayed(wait_for_gpu_readiness)
201-
parallel.delayed(preload_llm_model_image)
200+
parallel.delayed(download_single_model, model_ref)
201+
202+
if config.project.get_config("prepare.gpu.wait_for_readiness"):
203+
parallel.delayed(wait_for_gpu_readiness)
204+
205+
if not config.project.get_config("prepare.preload.skip"):
206+
parallel.delayed(preload_llm_model_image)
202207

203208

204209
def prepare_operators():
@@ -493,14 +498,21 @@ def download_models_to_pvc():
493498
"""
494499
logging.info("Starting model download process")
495500

501+
# Check if PVC prefetch is enabled
502+
pvc_enabled = config.project.get_config("prepare.pvc.enabled", True) # Default to True for backward compatibility
503+
504+
if not pvc_enabled:
505+
logging.info("PVC prefetch disabled (prepare.pvc.enabled: false) - skipping model downloads")
506+
return
507+
496508
try:
497509
# Get models configuration
498510
models_config = config.project.get_config("models")
499511
if not models_config:
500512
logging.info("No models configured for download - skipping")
501513
return
502514

503-
logging.info(f"Downloading {len(models_config)} model(s) to PVC...")
515+
logging.info(f"PVC prefetch enabled - downloading {len(models_config)} model(s) to PVC...")
504516

505517
# Download models in parallel for efficiency
506518
with run.Parallel("download_models") as parallel:
@@ -522,6 +534,13 @@ def download_single_model(model_key):
522534
model_key: The key identifying the model in the models configuration
523535
"""
524536
try:
537+
# Check if PVC prefetch is enabled
538+
pvc_enabled = config.project.get_config("prepare.pvc.enabled", True) # Default to True for backward compatibility
539+
540+
if not pvc_enabled:
541+
logging.info(f"PVC prefetch disabled - skipping download for model '{model_key}'")
542+
return
543+
525544
logging.info(f"Starting download for model '{model_key}'")
526545

527546
# Get model configuration
@@ -540,6 +559,7 @@ def download_single_model(model_key):
540559
pvc_name = config.project.get_config("prepare.pvc.name")
541560
pvc_size = config.project.get_config("prepare.pvc.size")
542561
pvc_access_mode = config.project.get_config("prepare.pvc.access_mode")
562+
pvc_storage_class = config.project.get_config("prepare.pvc.storage_class", None)
543563
namespace = config.project.get_config("prepare.namespace.name")
544564
downloader_image = config.project.get_config("prepare.model_downloader.image")
545565

@@ -559,15 +579,26 @@ def download_single_model(model_key):
559579
secret_dir = config.project.get_config("secrets.dir.env_key")
560580
hf_creds_path = pathlib.Path(os.environ[secret_dir]) / hf_token_secret
561581

562-
run.run_toolbox("storage", "download_to_pvc",
563-
name=model_key,
564-
source=source,
565-
pvc_name=pvc_name,
566-
namespace=namespace,
567-
pvc_size=pvc_size,
568-
image=downloader_image,
569-
creds=str(hf_creds_path),
570-
clean_first=False) # Don't clean to allow multiple models in same PVC
582+
# Prepare download_to_pvc arguments
583+
download_args = {
584+
"name": model_key,
585+
"source": source,
586+
"pvc_name": pvc_name,
587+
"namespace": namespace,
588+
"pvc_size": pvc_size,
589+
"pvc_access_mode": pvc_access_mode,
590+
"image": downloader_image,
591+
"creds": str(hf_creds_path),
592+
"clean_first": False, # Don't clean to allow multiple models in same PVC
593+
"run_as_root": config.project.get_config("security.run_as_root")
594+
}
595+
596+
# Add storage class if configured
597+
if pvc_storage_class:
598+
download_args["pvc_storage_class_name"] = pvc_storage_class
599+
logging.info(f"Using PVC storage class: {pvc_storage_class}")
600+
601+
run.run_toolbox("storage", "download_to_pvc", **download_args)
571602

572603
logging.info(f"Successfully downloaded model '{model_key}'")
573604

0 commit comments

Comments
 (0)