diff --git a/docs/toolbox.generated/Cluster.capture_servicemonitor_metrics.rst b/docs/toolbox.generated/Cluster.capture_servicemonitor_metrics.rst new file mode 100644 index 0000000000..e856bc8e54 --- /dev/null +++ b/docs/toolbox.generated/Cluster.capture_servicemonitor_metrics.rst @@ -0,0 +1,48 @@ +:orphan: + +.. + _Auto-generated file, do not edit manually ... + _Toolbox generate command: repo generate_toolbox_rst_documentation + _ Source component: Cluster.capture_servicemonitor_metrics + + +cluster capture_servicemonitor_metrics +====================================== + +Captures ServiceMonitor or PodMonitor YAML and status for a given service + +Captures the ServiceMonitor/PodMonitor configuration and status information for +a specific service in a namespace, including related service/pod and +endpoints information for troubleshooting monitoring setup. + + +Parameters +---------- + + +``service_name`` + +* Name of the service to capture ServiceMonitor/PodMonitor metrics for + + +``namespace`` + +* Namespace where the service and ServiceMonitor/PodMonitor are located (empty string auto-detects current namespace) + + +``capture_frequency`` + +* How often to capture metrics in seconds (default: 60) + +* default value: ``60`` + + +``is_podmonitor`` + +* Whether to use PodMonitor instead of ServiceMonitor (default: False) + + +``finalize`` + +* Whether to finalize (capture logs and delete) an existing pod instead of creating new one (default: False) + diff --git a/docs/toolbox.generated/Llmd.run_guidellm_benchmark.rst b/docs/toolbox.generated/Llmd.run_guidellm_benchmark.rst index 6b6e08e843..70eaf7bc94 100644 --- a/docs/toolbox.generated/Llmd.run_guidellm_benchmark.rst +++ b/docs/toolbox.generated/Llmd.run_guidellm_benchmark.rst @@ -39,14 +39,14 @@ Parameters * Container image for the benchmark -* default value: ``ghcr.io/vllm-project/guidellm`` +* default value: ``ghcr.io/albertoperdomo2/guidellm`` ``version`` * Version tag for the benchmark image -* default 
value: ``pr-590`` +* default value: ``nightly`` ``timeout`` diff --git a/docs/toolbox.generated/index.rst b/docs/toolbox.generated/index.rst index 0668c6950c..acd27da3f7 100644 --- a/docs/toolbox.generated/index.rst +++ b/docs/toolbox.generated/index.rst @@ -14,6 +14,7 @@ Toolbox Documentation * :doc:`build_push_image ` Build and publish an image to quay using either a Dockerfile or git repo. * :doc:`capture_environment ` Captures the cluster environment +* :doc:`capture_servicemonitor_metrics ` Captures ServiceMonitor or PodMonitor YAML and status for a given service * :doc:`create_htpasswd_adminuser ` Create an htpasswd admin user. * :doc:`create_osd ` Create an OpenShift Dedicated cluster. * :doc:`deploy_operator ` Deploy an operator from OperatorHub catalog entry. diff --git a/projects/cluster/toolbox/cluster.py b/projects/cluster/toolbox/cluster.py index ee8c777dce..170123e2f8 100644 --- a/projects/cluster/toolbox/cluster.py +++ b/projects/cluster/toolbox/cluster.py @@ -547,3 +547,23 @@ def enable_userworkload_monitoring(self, namespaces: list = []): """ return RunAnsibleRole(locals()) + + @AnsibleRole("cluster_capture_servicemonitor_metrics") + @AnsibleMappedParams + def capture_servicemonitor_metrics(self, service_name, namespace="", capture_frequency=60, is_podmonitor=False, finalize=False): + """ + Captures ServiceMonitor or PodMonitor YAML and status for a given service + + Captures the ServiceMonitor/PodMonitor configuration and status information for + a specific service in a namespace, including related service/pod and + endpoints information for troubleshooting monitoring setup. 
+ + Args: + service_name: Name of the service to capture ServiceMonitor/PodMonitor metrics for + namespace: Namespace where the service and ServiceMonitor/PodMonitor are located (empty string auto-detects current namespace) + capture_frequency: How often to capture metrics in seconds (default: 60) + is_podmonitor: Whether to use PodMonitor instead of ServiceMonitor (default: False) + finalize: Whether to finalize (capture logs and delete) an existing pod instead of creating new one (default: False) + """ + + return RunAnsibleRole(locals()) diff --git a/projects/cluster/toolbox/cluster_capture_servicemonitor_metrics/defaults/main/config.yml b/projects/cluster/toolbox/cluster_capture_servicemonitor_metrics/defaults/main/config.yml new file mode 100644 index 0000000000..cf6de0a76f --- /dev/null +++ b/projects/cluster/toolbox/cluster_capture_servicemonitor_metrics/defaults/main/config.yml @@ -0,0 +1,24 @@ +# Auto-generated file, do not edit manually ... +# Toolbox generate command: repo generate_ansible_default_settings +# Source component: Cluster.capture_servicemonitor_metrics + +# Parameters +# Name of the service to capture ServiceMonitor/PodMonitor metrics for +# Mandatory value +cluster_capture_servicemonitor_metrics_service_name: + +# Namespace where the service and ServiceMonitor/PodMonitor are located (empty string auto-detects current namespace) +cluster_capture_servicemonitor_metrics_namespace: + +# How often to capture metrics in seconds (default: 60) +cluster_capture_servicemonitor_metrics_capture_frequency: 60 + +# Whether to use PodMonitor instead of ServiceMonitor (default: False) +cluster_capture_servicemonitor_metrics_is_podmonitor: false + +# Whether to finalize (capture logs and delete) an existing pod instead of creating new one (default: False) +cluster_capture_servicemonitor_metrics_finalize: false + +# Default Ansible variables +# Default value for ansible_os_family to ensure role remains standalone +ansible_os_family: Linux diff --git 
a/projects/cluster/toolbox/cluster_capture_servicemonitor_metrics/tasks/main.yml b/projects/cluster/toolbox/cluster_capture_servicemonitor_metrics/tasks/main.yml new file mode 100644 index 0000000000..fd9d1d0c02 --- /dev/null +++ b/projects/cluster/toolbox/cluster_capture_servicemonitor_metrics/tasks/main.yml @@ -0,0 +1,98 @@ +--- +- name: Get current namespace if not specified + command: oc project -q + register: current_namespace_result + when: cluster_capture_servicemonitor_metrics_namespace == "" + +- name: Set the target namespace + set_fact: + target_namespace: "{{ current_namespace_result.stdout if cluster_capture_servicemonitor_metrics_namespace == '' else cluster_capture_servicemonitor_metrics_namespace }}" + +- name: Create capture directory + file: + path: "{{ artifact_extra_logs_dir }}/artifacts" + state: directory + mode: '0755' + +- name: "[Finalize mode] capture logs and delete existing pod" + when: cluster_capture_servicemonitor_metrics_finalize + block: + - name: Check if pod exists for finalization + shell: | + oc get pod topsail-metrics-capture-{{ cluster_capture_servicemonitor_metrics_service_name }} -n {{ target_namespace }} --no-headers -o name + register: pod_exists_check + + - name: Capture pod logs + shell: | + oc logs topsail-metrics-capture-{{ cluster_capture_servicemonitor_metrics_service_name }} -n {{ target_namespace }} > "{{ artifact_extra_logs_dir }}/artifacts/metrics_capture_logs.txt" + +- name: Delete metrics capture pod + shell: | + oc delete pod topsail-metrics-capture-{{ cluster_capture_servicemonitor_metrics_service_name }} -n {{ target_namespace }} --grace-period=0 --ignore-not-found + +- name: "[Finalize mode] capture logs and delete existing pod" + when: cluster_capture_servicemonitor_metrics_finalize + meta: end_play + +# Normal deployment mode: discover resources and create pod +- name: Include ServiceMonitor tasks + include_tasks: servicemonitor.yml + when: not cluster_capture_servicemonitor_metrics_is_podmonitor + +- 
name: Include PodMonitor tasks + include_tasks: podmonitor.yml + when: cluster_capture_servicemonitor_metrics_is_podmonitor + +# Ensure auth_secret_name is always defined with proper structure (fallback for edge cases) +- name: Set default auth secret name if not defined + set_fact: + auth_secret_name: "{% if auth_secret_name_cmd is defined %}{{ auth_secret_name_cmd.stdout }}{% endif %}" + +# Common tasks for deployment mode +- name: Read metrics URL + shell: cat "{{ artifact_extra_logs_dir }}/artifacts/metrics_url.txt" + register: metrics_url_content + +- name: Check if auth secret exists + shell: | + oc get secret {{ auth_secret_name }} -n {{ target_namespace }} --no-headers -o name + register: auth_secret_exists + when: auth_secret_name != "" + +- name: Save authentication info + shell: | + {% if cluster_capture_servicemonitor_metrics_is_podmonitor %} + echo "PodMonitor: {{ cluster_capture_servicemonitor_metrics_service_name }}" > "{{ artifact_extra_logs_dir }}/artifacts/auth_info.txt" + {% else %} + echo "ServiceMonitor: {{ cluster_capture_servicemonitor_metrics_service_name }}" > "{{ artifact_extra_logs_dir }}/artifacts/auth_info.txt" + {% endif %} + echo "Auth secret name: {{ auth_secret_name | default('none') }}" >> "{{ artifact_extra_logs_dir }}/artifacts/auth_info.txt" + if [ -n "{{ auth_secret_name }}" ]; then + {% if auth_secret_name != '' and auth_secret_exists.rc == 0 %} + echo "Secret exists: yes" >> "{{ artifact_extra_logs_dir }}/artifacts/auth_info.txt" + echo "Secret will be mounted at: /var/run/secrets/auth/token" >> "{{ artifact_extra_logs_dir }}/artifacts/auth_info.txt" + {% else %} + echo "Secret exists: no" >> "{{ artifact_extra_logs_dir }}/artifacts/auth_info.txt" + echo "WARNING: Secret not found!" 
>> "{{ artifact_extra_logs_dir }}/artifacts/auth_info.txt" + {% endif %} + else + echo "No authentication required" >> "{{ artifact_extra_logs_dir }}/artifacts/auth_info.txt" + fi + +- name: Create metrics capture pod manifest + template: + src: metrics_capture_pod.yaml.j2 + dest: "{{ artifact_extra_logs_dir }}/artifacts/metrics_capture_pod.yaml" + mode: '0644' + vars: + metrics_url: "{{ metrics_url_content.stdout }}" + auth_secret_name: "{{ auth_secret_name | default('') }}" + capture_frequency: "{{ cluster_capture_servicemonitor_metrics_capture_frequency }}" + +- name: Create metrics capture pod + shell: | + oc create -f "{{ artifact_extra_logs_dir }}/artifacts/metrics_capture_pod.yaml" + +- name: Wait for pod to start + shell: | + oc wait --for=condition=Ready pod/topsail-metrics-capture-{{ cluster_capture_servicemonitor_metrics_service_name }} -n {{ target_namespace }} --timeout=60s diff --git a/projects/cluster/toolbox/cluster_capture_servicemonitor_metrics/tasks/podmonitor.yml b/projects/cluster/toolbox/cluster_capture_servicemonitor_metrics/tasks/podmonitor.yml new file mode 100644 index 0000000000..98d9936ea6 --- /dev/null +++ b/projects/cluster/toolbox/cluster_capture_servicemonitor_metrics/tasks/podmonitor.yml @@ -0,0 +1,75 @@ +--- +# PodMonitor-specific tasks + +- name: Capture PodMonitor YAML + shell: | + oc get podmonitor {{ cluster_capture_servicemonitor_metrics_service_name }} -n {{ target_namespace }} -oyaml > "{{ artifact_extra_logs_dir }}/artifacts/podmonitor.yaml" + +- name: Get PodMonitor status + shell: | + oc get podmonitor {{ cluster_capture_servicemonitor_metrics_service_name }} -n {{ target_namespace }} > "{{ artifact_extra_logs_dir }}/artifacts/podmonitor.status" + +- name: Capture pod targets by PodMonitor selector + shell: | + oc get pod -l "app.kubernetes.io/component in 
(llminferenceservice-workload,llminferenceservice-workload-prefill,llminferenceservice-workload-worker,llminferenceservice-workload-leader,llminferenceservice-workload-leader-prefill,llminferenceservice-workload-worker-prefill),app.kubernetes.io/part-of=llminferenceservice" -n {{ target_namespace }} > "{{ artifact_extra_logs_dir }}/artifacts/target_pods.status" + +- name: Capture pod targets by PodMonitor selector YAML + shell: | + oc get pod -l "app.kubernetes.io/component in (llminferenceservice-workload,llminferenceservice-workload-prefill,llminferenceservice-workload-worker,llminferenceservice-workload-leader,llminferenceservice-workload-leader-prefill,llminferenceservice-workload-worker-prefill),app.kubernetes.io/part-of=llminferenceservice" -n {{ target_namespace }} -oyaml > "{{ artifact_extra_logs_dir }}/artifacts/target_pods.yaml" + +- name: Get all target pod IPs and names + shell: | + oc get pod -l "app.kubernetes.io/component in (llminferenceservice-workload,llminferenceservice-workload-prefill,llminferenceservice-workload-worker,llminferenceservice-workload-leader,llminferenceservice-workload-leader-prefill,llminferenceservice-workload-worker-prefill),app.kubernetes.io/part-of=llminferenceservice" -n {{ target_namespace }} --no-headers -o custom-columns=":metadata.name,:status.podIP" + register: target_pods_info + +- name: Extract scheme from PodMonitor + shell: | + oc get podmonitor {{ cluster_capture_servicemonitor_metrics_service_name }} -n {{ target_namespace }} -o jsonpath='{.spec.podMetricsEndpoints[0].scheme}' 2>/dev/null || echo "http" + register: metrics_scheme + +- name: Extract target port from PodMonitor + shell: | + oc get podmonitor {{ cluster_capture_servicemonitor_metrics_service_name }} -n {{ target_namespace }} -o jsonpath='{.spec.podMetricsEndpoints[0].targetPort}' 2>/dev/null || echo "9090" + register: metrics_port + +- name: Build metrics URLs for all matching pods + shell: | + set -o pipefail; + + SCHEME="{{ metrics_scheme.stdout | 
trim | default('http', true) }}" + PORT="{{ metrics_port.stdout | trim | default('9090', true) }}" + + # Count total pods and initialize files + TOTAL_PODS=$(echo "{{ target_pods_info.stdout }}" | wc -l) + + # Initialize files + echo "" > "{{ artifact_extra_logs_dir }}/artifacts/metrics_url.txt" + echo "PodMonitor target pods (found: $TOTAL_PODS):" > "{{ artifact_extra_logs_dir }}/artifacts/metrics_info.txt" + echo "Scheme: $SCHEME" >> "{{ artifact_extra_logs_dir }}/artifacts/metrics_info.txt" + echo "Port: $PORT" >> "{{ artifact_extra_logs_dir }}/artifacts/metrics_info.txt" + echo "" >> "{{ artifact_extra_logs_dir }}/artifacts/metrics_info.txt" + + # Build URL list for each pod + while IFS= read -r line; do + if [ -n "$line" ]; then + POD_NAME=$(echo "$line" | awk '{print $1}') + POD_IP=$(echo "$line" | awk '{print $2}') + if [ -n "$POD_IP" ] && [ "$POD_IP" != "" ]; then + URL="$SCHEME://$POD_IP:$PORT/metrics" + echo "$URL" >> "{{ artifact_extra_logs_dir }}/artifacts/metrics_url.txt" + echo "Pod: $POD_NAME ($POD_IP) -> $URL" >> "{{ artifact_extra_logs_dir }}/artifacts/metrics_info.txt" + else + echo "Pod: $POD_NAME (no IP available)" >> "{{ artifact_extra_logs_dir }}/artifacts/metrics_info.txt" + fi + fi + done <<< "{{ target_pods_info.stdout }}" + + # Count valid URLs and add summary (grep -c already prints 0 on no match; only neutralize its exit status) + VALID_URLS=$(grep -c "^http" "{{ artifact_extra_logs_dir }}/artifacts/metrics_url.txt" || true) + echo "" >> "{{ artifact_extra_logs_dir }}/artifacts/metrics_info.txt" + echo "Total metrics URLs: $VALID_URLS" >> "{{ artifact_extra_logs_dir }}/artifacts/metrics_info.txt" + +- name: Extract authorization secret name from PodMonitor + shell: | + oc get podmonitor {{ cluster_capture_servicemonitor_metrics_service_name }} -n {{ target_namespace }} -o jsonpath='{.spec.podMetricsEndpoints[0].authorization.credentials.name}' 2>/dev/null || echo "" + register: auth_secret_name_cmd # main.yml derives auth_secret_name from auth_secret_name_cmd.stdout diff --git a/projects/cluster/toolbox/cluster_capture_servicemonitor_metrics/tasks/servicemonitor.yml 
b/projects/cluster/toolbox/cluster_capture_servicemonitor_metrics/tasks/servicemonitor.yml new file mode 100644 index 0000000000..7e4dc51620 --- /dev/null +++ b/projects/cluster/toolbox/cluster_capture_servicemonitor_metrics/tasks/servicemonitor.yml @@ -0,0 +1,76 @@ +--- +# ServiceMonitor-specific tasks + +- name: Capture ServiceMonitor YAML + shell: | + oc get servicemonitor {{ cluster_capture_servicemonitor_metrics_service_name }} -n {{ target_namespace }} -oyaml > "{{ artifact_extra_logs_dir }}/artifacts/servicemonitor.yaml" + +- name: Get ServiceMonitor status + shell: | + oc get servicemonitor {{ cluster_capture_servicemonitor_metrics_service_name }} -n {{ target_namespace }} > "{{ artifact_extra_logs_dir }}/artifacts/servicemonitor.status" + +- name: Capture service target by ServiceMonitor selector + shell: | + oc get service -l "app.kubernetes.io/component=llminferenceservice-router-scheduler,app.kubernetes.io/part-of=llminferenceservice" -n {{ target_namespace }} > "{{ artifact_extra_logs_dir }}/artifacts/target_service.status" + +- name: Capture service target by ServiceMonitor selector YAML + shell: | + oc get service -l "app.kubernetes.io/component=llminferenceservice-router-scheduler,app.kubernetes.io/part-of=llminferenceservice" -n {{ target_namespace }} -oyaml > "{{ artifact_extra_logs_dir }}/artifacts/target_service.yaml" + +- name: Get target service name + shell: | + set -o pipefail; + oc get service -l "app.kubernetes.io/component=llminferenceservice-router-scheduler,app.kubernetes.io/part-of=llminferenceservice" -n {{ target_namespace }} --no-headers -o custom-columns=":metadata.name" | head -1 + register: target_service_name + +- name: Extract port name from ServiceMonitor + shell: | + oc get servicemonitor {{ cluster_capture_servicemonitor_metrics_service_name }} -n {{ target_namespace }} -o jsonpath='{.spec.endpoints[0].port}' 2>/dev/null || echo "metrics" + register: metrics_port_name + +- name: Get port number from Service by name + shell: 
| + SERVICE_NAME='{{ target_service_name.stdout | trim }}' + PORT_NAME='{{ metrics_port_name.stdout | trim | default("metrics") }}' + oc get service "$SERVICE_NAME" -n {{ target_namespace }} -o jsonpath="{.spec.ports[?(@.name=='$PORT_NAME')].port}" 2>/dev/null || echo "" + register: named_port_result + +- name: Get first port as fallback + shell: | + SERVICE_NAME='{{ target_service_name.stdout | trim }}' + oc get service "$SERVICE_NAME" -n {{ target_namespace }} -o jsonpath='{.spec.ports[0].port}' 2>/dev/null || echo "9090" + register: first_port_result + when: named_port_result.stdout == "" + +- name: Set final port number + set_fact: + final_port: "{{ named_port_result.stdout if named_port_result.stdout != '' else first_port_result.stdout | default('9090') }}" + +- name: Determine scheme from port + set_fact: + final_scheme: >- + {{ + 'https' if ( + final_port in ['443', '8443', '6443'] or + (metrics_port_name.stdout | trim | default('metrics')) is match('.*(https|secure|tls).*') + ) else 'http' + }} + +- name: Build metrics URL for ServiceMonitor + shell: | + SERVICE_NAME='{{ target_service_name.stdout | trim }}' + PORT_NAME='{{ metrics_port_name.stdout | trim | default("metrics") }}' + PORT_NUMBER='{{ final_port }}' + SCHEME='{{ final_scheme }}' + + echo "$SCHEME://$SERVICE_NAME.{{ target_namespace }}.svc:$PORT_NUMBER/metrics" > "{{ artifact_extra_logs_dir }}/artifacts/metrics_url.txt" + echo "Service: $SERVICE_NAME" > "{{ artifact_extra_logs_dir }}/artifacts/metrics_info.txt" + echo "Port name: $PORT_NAME" >> "{{ artifact_extra_logs_dir }}/artifacts/metrics_info.txt" + echo "Port number: $PORT_NUMBER" >> "{{ artifact_extra_logs_dir }}/artifacts/metrics_info.txt" + echo "Scheme: $SCHEME" >> "{{ artifact_extra_logs_dir }}/artifacts/metrics_info.txt" + echo "URL: $SCHEME://$SERVICE_NAME.{{ target_namespace }}.svc:$PORT_NUMBER/metrics" >> "{{ artifact_extra_logs_dir }}/artifacts/metrics_info.txt" + +- name: Extract authorization secret name from ServiceMonitor + 
shell: | + oc get servicemonitor {{ cluster_capture_servicemonitor_metrics_service_name }} -n {{ target_namespace }} -o jsonpath='{.spec.endpoints[0].authorization.credentials.name}' 2>/dev/null || echo "" + register: auth_secret_name_cmd diff --git a/projects/cluster/toolbox/cluster_capture_servicemonitor_metrics/templates/metrics_capture_pod.yaml.j2 b/projects/cluster/toolbox/cluster_capture_servicemonitor_metrics/templates/metrics_capture_pod.yaml.j2 new file mode 100644 index 0000000000..d6c9b7044f --- /dev/null +++ b/projects/cluster/toolbox/cluster_capture_servicemonitor_metrics/templates/metrics_capture_pod.yaml.j2 @@ -0,0 +1,94 @@ +apiVersion: v1 +kind: Pod +metadata: + name: topsail-metrics-capture-{{ cluster_capture_servicemonitor_metrics_service_name }} + namespace: {{ target_namespace }} +spec: + restartPolicy: Never + containers: + - name: metrics-capture + image: registry.access.redhat.com/ubi9/ubi:latest + command: ["/bin/bash"] + args: + - -c + - | + {% if auth_secret_name -%} + echo "Using authorization token from mounted secret: {{ auth_secret_name }}" + {% endif %} + + # Function to fetch metrics with proper options + fetch_metrics() { + local url="$1" + local token="$2" + + # Build curl options + local curl_opts="-sSf" + if [[ "$url" =~ ^https:// ]]; then + curl_opts="${curl_opts}k" + fi + + # Execute curl with proper auth handling and filter output + local curl_cmd="curl $curl_opts" + if [ -n "$token" ]; then + if $curl_cmd -H "Authorization: Bearer $token" "$url" | grep -v "^# HELP\|^# TYPE\|^go_\|^process_\|^controller_runtime_\|^rest_client_\|^http_request_duration_\|^python_gc_"; then + return 0 + else + echo "Failed to fetch metrics from $url" + return 1 + fi + else + if $curl_cmd "$url" | grep -v "^# HELP\|^# TYPE\|^go_\|^process_\|^controller_runtime_\|^rest_client_\|^http_request_duration_\|^python_gc_"; then + return 0 + else + echo "Failed to fetch metrics from $url" + return 1 + fi + fi + } + + # Get list of URLs to monitor from 
environment variable + echo "Monitoring the following URLs:" + echo "$METRICS_URLS" + echo "" + + while true; do + echo "=== $(date) ===" + + echo "$METRICS_URLS" | while IFS= read -r url; do + if [ -n "$url" ] && [ "$url" != " " ]; then + echo "Fetching from: $url at $(date +%s)" + + {% if auth_secret_name %} + if [ -f /var/run/secrets/auth/token ]; then + TOKEN=$(cat /var/run/secrets/auth/token) + fetch_metrics "$url" "$TOKEN" + else + echo "Auth token file not found" + fi + {% else %} + fetch_metrics "$url" "" + {% endif %} + + echo "--- End of $url at $(date +%s) ---" + fi + done + + echo "" + sleep {{ capture_frequency }} + done + env: + - name: METRICS_URLS + value: | +{{ metrics_url | indent(8, true) }} +{% if auth_secret_name %} + volumeMounts: + - name: auth-token + mountPath: /var/run/secrets/auth + readOnly: true +{% endif %} +{% if auth_secret_name %} + volumes: + - name: auth-token + secret: + secretName: {{ auth_secret_name }} +{% endif %} diff --git a/projects/llm-d/testing/config.yaml b/projects/llm-d/testing/config.yaml index a62489de5c..52690df29f 100644 --- a/projects/llm-d/testing/config.yaml +++ b/projects/llm-d/testing/config.yaml @@ -48,6 +48,9 @@ ci_presets: tests.capture_prom: false tests.capture_prom_uwm: false + tests.llmd.inference_service.metrics.manual_capture: false + tests.llmd.inference_service.gateway.name: gateway-internal + tests.llmd.skip_prepare: true prepare.namespace.name: kpouget-dev prepare.preload.node_selector_key: gpu.nvidia.com/class @@ -259,6 +262,10 @@ tests: gateway: name: gateway-external model: llama3-1-8b + metrics: + manual_capture: false + scheduler_servicemonitor_name: kserve-llm-isvc-scheduler + vllm_podmonitor_name: kserve-llm-isvc-vllm-engine # vLLM arguments (always applied) vllm_args: @@ -286,6 +293,11 @@ tests: guidellm: enabled: true name: guidellm-benchmark + + image: + name: ghcr.io/vllm-project/guidellm + version: pr-590 + backend_type: openai_http rate_type: concurrent max_seconds: 120 @@ -294,6 
+306,7 @@ tests: data: prompt_tokens=256,output_tokens=128 rate: 1 sample_requests: 20 + request_type: text_completions capture_prom: true capture_prom_uwm: true diff --git a/projects/llm-d/testing/test_llmd.py b/projects/llm-d/testing/test_llmd.py index 33ca3aff96..1f9a06d1f5 100644 --- a/projects/llm-d/testing/test_llmd.py +++ b/projects/llm-d/testing/test_llmd.py @@ -55,6 +55,9 @@ def test_single_flavor(flavor, flavor_index, total_flavors, namespace): # Deploy LLM inference service _, _, llmisvc_path = deploy_llm_inference_service(flavor, llmisvc_name, namespace, model_ref) + # Start metrics capture after deployment + start_metrics_capture(flavor) + models = config.project.get_config(f"models") model_name = models[model_ref]["name"] logging.info(f"Using model: {model_name} (from config reference: {model_ref})") @@ -75,6 +78,12 @@ def test_single_flavor(flavor, flavor_index, total_flavors, namespace): flavor_failed = e finally: + # Stop metrics capture after testing (success or failure) + try: + stop_metrics_capture(flavor) + except Exception as metrics_e: + logging.exception(f"Failed to stop metrics capture: {metrics_e}") + # Always capture LLM inference service state (success or failure) logging.info("Capturing LLM inference service state for debugging") try: @@ -892,6 +901,8 @@ def run_guidellm_benchmark(endpoint_url, llmisvc_name, namespace): data = config.project.get_config("tests.llmd.benchmarks.guidellm.data") sample_requests = config.project.get_config("tests.llmd.benchmarks.guidellm.sample_requests") + request_type = config.project.get_config("tests.llmd.benchmarks.guidellm.request_type") + failed = False # Handle rate as list/tuple - iterate over each rate value @@ -949,6 +960,9 @@ def evaluate_expression(match): if rate_type: guidellm_args.append(f"--rate-type={rate_type}") + if request_type: + guidellm_args.append(f"--request-type={request_type}") + # Add rate parameter guidellm_args.append(f"--rate={rate_value}") @@ -1007,6 +1021,84 @@ def 
capture_llm_inference_service_state(llmisvc_name, namespace): logging.error(f"Failed to capture LLM inference service state: {e}") +def start_metrics_capture(flavor): + """ + Starts metrics capture for both ServiceMonitor and PodMonitor if enabled + """ + if not config.project.get_config("tests.llmd.inference_service.metrics.manual_capture"): + return + + logging.info("Starting metrics capture") + + namespace = config.project.get_config("tests.llmd.namespace") + scheduler_name = config.project.get_config("tests.llmd.inference_service.metrics.scheduler_servicemonitor_name") + vllm_name = config.project.get_config("tests.llmd.inference_service.metrics.vllm_podmonitor_name") + + # Parse flavor to check if it's simple + components = parse_flavor_components(flavor) + is_simple_flavor = components['base'] == "simple" + + # Start vLLM PodMonitor capture (always) + logging.info(f"Starting PodMonitor metrics capture for {vllm_name}") + run.run_toolbox("cluster", "capture_servicemonitor_metrics", + service_name=vllm_name, + namespace=namespace, + is_podmonitor=True, + mute_stdout=True) + + # Start scheduler ServiceMonitor capture (only for non-simple flavors) + if not is_simple_flavor: + logging.info(f"Starting ServiceMonitor metrics capture for {scheduler_name}") + run.run_toolbox("cluster", "capture_servicemonitor_metrics", + service_name=scheduler_name, + namespace=namespace, + mute_stdout=True) + else: + logging.info("Skipping scheduler metrics capture for simple flavor") + + logging.info("Metrics capture started successfully") + + +def stop_metrics_capture(flavor): + """ + Stops metrics capture for both ServiceMonitor and PodMonitor if enabled + """ + if not config.project.get_config("tests.llmd.inference_service.metrics.manual_capture"): + return + + logging.info("Stopping metrics capture") + + namespace = config.project.get_config("tests.llmd.namespace") + scheduler_name = config.project.get_config("tests.llmd.inference_service.metrics.scheduler_servicemonitor_name") 
+ vllm_name = config.project.get_config("tests.llmd.inference_service.metrics.vllm_podmonitor_name") + + # Parse flavor to check if it's simple + components = parse_flavor_components(flavor) + is_simple_flavor = components['base'] == "simple" + + # Stop vLLM PodMonitor capture (always) + logging.info(f"Stopping PodMonitor metrics capture for {vllm_name}") + run.run_toolbox("cluster", "capture_servicemonitor_metrics", + service_name=vllm_name, + namespace=namespace, + is_podmonitor=True, + finalize=True, + mute_stdout=True, + artifact_dir_suffix="_finalize",) + + # Stop scheduler ServiceMonitor capture (only for non-simple flavors) + if not is_simple_flavor: + logging.info(f"Stopping ServiceMonitor metrics capture for {scheduler_name}") + run.run_toolbox("cluster", "capture_servicemonitor_metrics", + service_name=scheduler_name, + namespace=namespace, + finalize=True, + mute_stdout=True, + artifact_dir_suffix="_finalize",) + + logging.info("Metrics capture stopped successfully") + + def cleanup_llm_inference_resources(): """ Clean up all llminferenceservice resources in the namespace before testing. 
diff --git a/projects/llm-d/toolbox/llmd.py b/projects/llm-d/toolbox/llmd.py index 1def5b6dd1..e55c42a3eb 100644 --- a/projects/llm-d/toolbox/llmd.py +++ b/projects/llm-d/toolbox/llmd.py @@ -52,7 +52,7 @@ def run_guidellm_benchmark( self, endpoint_url, name="guidellm-benchmark", namespace="", - image="ghcr.io/vllm-project/guidellm", version="pr-590", + image="ghcr.io/albertoperdomo2/guidellm", version="nightly", timeout=900, pvc_size="1Gi", guidellm_args=[], diff --git a/projects/llm-d/toolbox/llmd_run_guidellm_benchmark/defaults/main/config.yml b/projects/llm-d/toolbox/llmd_run_guidellm_benchmark/defaults/main/config.yml index c8fca0e8ab..0594b19207 100644 --- a/projects/llm-d/toolbox/llmd_run_guidellm_benchmark/defaults/main/config.yml +++ b/projects/llm-d/toolbox/llmd_run_guidellm_benchmark/defaults/main/config.yml @@ -14,10 +14,10 @@ llmd_run_guidellm_benchmark_name: guidellm-benchmark llmd_run_guidellm_benchmark_namespace: # Container image for the benchmark -llmd_run_guidellm_benchmark_image: ghcr.io/vllm-project/guidellm +llmd_run_guidellm_benchmark_image: ghcr.io/albertoperdomo2/guidellm # Version tag for the benchmark image -llmd_run_guidellm_benchmark_version: pr-590 +llmd_run_guidellm_benchmark_version: nightly # Timeout in seconds to wait for job completion llmd_run_guidellm_benchmark_timeout: 900 diff --git a/projects/llm-d/visualizations/llmd_inference/store/parsers.py b/projects/llm-d/visualizations/llmd_inference/store/parsers.py index 644ddf529e..c61a8cd0ee 100644 --- a/projects/llm-d/visualizations/llmd_inference/store/parsers.py +++ b/projects/llm-d/visualizations/llmd_inference/store/parsers.py @@ -31,6 +31,8 @@ # to support multiple benchmark directories (multi-rate scenarios) f"{artifact_dirnames.PROMETHEUS_DUMP_DIR}/prometheus.t*", f"{artifact_dirnames.PROMETHEUS_UWM_DUMP_DIR}/prometheus.t*", + f"{artifact_dirnames.GUIDELLM_BENCHMARK_DIR}/artifacts/results/benchmarks.json", + 
f"{artifact_dirnames.GUIDELLM_BENCHMARK_DIR}/artifacts/guidellm_benchmark_job.logs", ]