Skip to content

Commit 929bbdb

Browse files
authored
misc model-validation ROCm fixes/improvements (#103)
- rocm: add mamba-ssm workaround, cleanup existing workaround - fix run-commands group title - neuralmagic performance and benchmark: allow config override based on accelerator type - accuracy/model-validation: add run-name - download-install-assets-nm-vllm-ent: install rocm dependencies in addition to constraints - env-test: rocm: disable pytorch tunableop - requirements-rocm: add xformers for mistral models - env-test: expand ROCm env vars - model-validation: improve run name display - accuracy: fix no config found warning - scripts: add invoke-model-validation - download-install-assets-nm-vllm-ent: make linter happy
1 parent dc07ef5 commit 929bbdb

File tree

13 files changed

+266
-85
lines changed

13 files changed

+266
-85
lines changed

.github/actions/download-install-assets-nm-vllm-ent/action.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -33,11 +33,11 @@ runs:
3333
fi
3434
3535
if [[ "${TARGET_DEVICE}" == "rocm" ]]; then
36-
uv pip install ${ASSETS} --constraints neuralmagic/requirements/rocm.txt
36+
uv pip install ${ASSETS} --constraints neuralmagic/requirements/rocm.txt -r neuralmagic/requirements/rocm.txt
3737
elif [[ "${TARGET_DEVICE}" == "cuda" ]]; then
3838
uv pip install ${ASSETS}
3939
else
40-
echo ::warning title=download and install assets::Unknown target device $TARGET_DEVICE, proceeding with install
40+
echo ::warning title=download and install assets::Unknown target device "$TARGET_DEVICE", proceeding with install
4141
uv pip install ${ASSETS}
4242
fi
4343

.github/actions/env-test-nm-vllm-ent/action.yml

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,14 @@ runs:
5151
echo "RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES=1" | tee -a "$GITHUB_ENV" # ray >= 2.45
5252
echo "HIP_FORCE_DEV_KERNARG=1" | tee -a "$GITHUB_ENV"
5353
echo "VLLM_USE_TRITON_FLASH_ATTN=0" | tee -a "$GITHUB_ENV"
54-
# tp>1 fails with ray
54+
# https://rocm.docs.amd.com/en/latest/how-to/rocm-for-ai/inference-optimization/workload.html#vllm-performance-optimization
55+
echo "NCCL_MIN_NCHANNELS=112 "| tee -a "$GITHUB_ENV"
56+
echo "TORCH_BLAS_PREFER_HIPBLASLT=1" | tee -a "$GITHUB_ENV"
57+
## https://rocm.docs.amd.com/en/latest/how-to/rocm-for-ai/inference-optimization/workload.html#pytorch-tunableop
58+
# echo "PYTORCH_TUNABLEOP_TUNING=1" | tee -a "$GITHUB_ENV"
59+
# echo "PYTORCH_TUNABLEOP_ENABLED=1" | tee -a "$GITHUB_ENV"
60+
61+
# tp>1 fails with ray as of 2025-05-01
5562
echo "VLLM_DISTRIBUTED_EXECUTOR_BACKEND=mp" >> "$GITHUB_ENV"
5663
shell: bash
5764

.github/actions/run-commands/action.yml

Lines changed: 12 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -32,15 +32,22 @@ runs:
3232
cd ${{ inputs.repo }}
3333
mv vllm vllm-ignore
3434
mv csrc csrc-ignore
35-
echo "::group::Installing requirements via pip"
35+
36+
echo "::group::Installing requirements via uv pip"
3637
if [[ "${TARGET_DEVICE}" == "cuda" ]]; then
37-
# temporarily installing mamba-ssm built from source due to missing numpy package
38+
# install mamba-ssm built from source due to missing prebuilt wheel for torch 2.7, see https://github.com/state-spaces/mamba/pull/720
3839
gsutil cp gs://nm-vllm-certs/caches/whls/mamba_ssm-2.2.4-cp39-abi3-linux_x86_64.whl .
39-
uv pip install mamba_ssm-2.2.4-cp39-abi3-linux_x86_64.whl
40-
uv pip install -r requirements/test.txt
40+
41+
uv pip install -r requirements/test.txt --overrides <(echo $PWD/mamba_ssm-2.2.4-cp39-abi3-linux_x86_64.whl)
4142
elif [[ "${TARGET_DEVICE}" == "rocm" ]]; then
4243
sed -i '/torch/d' requirements/test.in
43-
uv pip compile requirements/test.in -c ../neuralmagic/requirements/rocm.txt -o requirements/test.txt
44+
# workaround for mamba-ssm==2.2.4 build failure on torch==2.7.0, see https://github.com/state-spaces/mamba/issues/720
45+
gcloud storage cp gs://nm-vllm-certs/state-spaces/mamba/assets/15133810912/mamba_ssm-2.2.4-cp312-cp312-linux_x86_64.whl ./
46+
47+
uv pip compile requirements/test.in \
48+
--constraints ../neuralmagic/requirements/rocm.txt \
49+
--overrides <(echo mamba_ssm-2.2.4-cp312-cp312-linux_x86_64.whl) \
50+
--output requirements/test.txt
4451
uv pip install -r requirements/test.txt
4552
else
4653
echo ::error title=run commands::Invalid target_device=${TARGET_DEVICE}

.github/workflows/accuracy.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
name: accuracy
2+
run-name: accuracy ${{inputs.model}} on ${{inputs.label}} (${{github.actor}})
23

34
on:
45
workflow_call:

.github/workflows/model-validation.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
name: model validation
2+
run-name: Model Validation ${{inputs.model}} on ${{inputs.label}} (${{github.actor}})
23

34
on:
45
workflow_call:

neuralmagic/accuracy/run_llm_eval_test.py

Lines changed: 27 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
11
import argparse
22
import os
3-
import pathlib
43
import shlex
54
import subprocess
65
from pathlib import Path
@@ -10,22 +9,20 @@
109

1110
from neuralmagic.tools.logger import make_logger
1211

13-
from ..performance.vllm_server import Server
12+
from ..utils import Accelerator, get_config_file
13+
from ..vllm_server import Server
1414

1515

16-
def get_server_config(model_name: str) -> Path:
16+
def get_server_config(model_name: str, accelerator: Accelerator) -> Path:
1717
# vllm server options need to be updated to remove "model"
1818
# if it's in there
19-
server_config_file = pathlib.Path(
20-
f"model-validation-configs/{model_name}/accuracy/server.yml"
19+
server_config_file = get_config_file(
20+
"model-validation-configs",
21+
model_name,
22+
config_type="server",
23+
accelerator=accelerator,
24+
workflow_kind="accuracy",
2125
)
22-
if not server_config_file.exists():
23-
print(
24-
f"No server config found for {model_name}, using common/accuracy/server.yml"
25-
)
26-
server_config_file = pathlib.Path(
27-
"model-validation-configs/common/accuracy/server.yml"
28-
)
2926
server_yml = server_config_file.read_text(encoding="utf-8")
3027
server_config = yaml.safe_load(server_yml)
3128
server_config.pop("model", None)
@@ -35,7 +32,7 @@ def get_server_config(model_name: str) -> Path:
3532
return server_config_file.resolve()
3633

3734

38-
if __name__ == "__main__":
35+
def main() -> None:
3936
"""
4037
Starts a vllm server then runs the llm-eval-test tool against that server.
4138
@@ -54,22 +51,19 @@ def get_server_config(model_name: str) -> Path:
5451
model_name: str | None = os.getenv("MODEL_NAME")
5552
if model_name is None:
5653
raise ValueError("MODEL_NAME env var must be defined")
57-
config_file: Path = get_server_config(model_name)
54+
55+
accelerator = Accelerator.from_env()
56+
config_file: Path = get_server_config(model_name, accelerator)
5857
datasets_dir: str = str(Path().cwd() / "datasets")
5958
os.makedirs(datasets_dir, exist_ok=True)
6059

61-
# get client configuration
62-
client_config_file = Path(
63-
f"model-validation-configs/{model_name}/accuracy/client.yml"
60+
client_config_file = get_config_file(
61+
"model-validation-configs",
62+
model_name,
63+
config_type="client",
64+
accelerator=accelerator,
65+
workflow_kind="accuracy",
6466
)
65-
if not client_config_file.exists():
66-
print(
67-
"No client config found for {model_name}, using common/accuracy/client.yml"
68-
)
69-
client_config_file = pathlib.Path(
70-
"model-validation-configs/common/accuracy/client.yml"
71-
)
72-
7367
client_yml = client_config_file.read_text(encoding="utf-8")
7468
client_config = yaml.safe_load(client_yml)
7569

@@ -78,6 +72,10 @@ def get_server_config(model_name: str) -> Path:
7872
f"model-validation-configs/{model_name}/accuracy/tasks.yml"
7973
)
8074
tasks_list: str = "openllm,leaderboard"
75+
if tasks_config_file.exists():
76+
print(
77+
f"::warning title= run_llm_eval.py:: {tasks_config_file=} is currently overridden with {tasks_list=}"
78+
)
8179

8280
logger.info("launching server...")
8381
with Server(model_name, config_file=config_file) as server:
@@ -147,3 +145,7 @@ def get_server_config(model_name: str) -> Path:
147145
raise RuntimeError(
148146
f"llm-eval-test run failed to generate the expected result file {args.output}"
149147
)
148+
149+
150+
if __name__ == "__main__":
151+
main()

neuralmagic/performance/__init__.py

Whitespace-only changes.

neuralmagic/performance/benchmark_model.py

Lines changed: 12 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -4,10 +4,9 @@
44
from pathlib import Path
55
from textwrap import indent
66

7-
from neuralmagic.tools.logger import make_logger
8-
9-
from .utils import client_config_to_cli_args, get_config_file
10-
from .vllm_server import Server
7+
from ..tools.logger import make_logger
8+
from ..utils import Accelerator, client_config_to_cli_args, get_config_file
9+
from ..vllm_server import Server
1110

1211

1312
def parse_args() -> argparse.Namespace:
@@ -24,17 +23,21 @@ def parse_args() -> argparse.Namespace:
2423
return parser.parse_args()
2524

2625

27-
if __name__ == "__main__":
26+
def main() -> None:
2827
logger = make_logger("benchmark_model")
28+
2929
args = parse_args()
30+
accelerator = Accelerator.from_env()
3031
server_config_file = get_config_file(
3132
base_config_dir=args.base_config_dir,
3233
model_name=args.model,
34+
accelerator=accelerator,
3335
config_type="server",
3436
)
3537
client_config_file = get_config_file(
3638
base_config_dir=args.base_config_dir,
3739
model_name=args.model,
40+
accelerator=accelerator,
3841
config_type="client",
3942
)
4043
logger.info("launching server...")
@@ -60,3 +63,7 @@ def parse_args() -> argparse.Namespace:
6063
if result.returncode != 0:
6164
message = f"guidellm command failed with returncode={result.returncode}"
6265
raise RuntimeError(message)
66+
67+
68+
if __name__ == "__main__":
69+
main()

neuralmagic/performance/utils.py

Lines changed: 0 additions & 47 deletions
This file was deleted.

neuralmagic/requirements/rocm.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,3 +8,4 @@ amdsmi@file:///opt/rocm/share/amd_smi
88

99
sympy>=1.13.3 # temporarily pinned due to incompatibility with the pip-compiled version in requirements/test.txt
1010
ray==2.43.0 # newer ray versions are fubar
11+
xformers==0.0.30 # required for mistralai/Mistral-Small-3.1-24B-Instruct-2503

0 commit comments

Comments (0)