Commit 639c939

[TRTC-1943][feat] Env vars override support in LLM API (#9104)
Signed-off-by: Venky Ganesh <[email protected]>
1 parent f61067c commit 639c939

File tree: 14 files changed, +187 -36 lines changed

tensorrt_llm/_torch/custom_ops/flashinfer_custom_ops.py
Lines changed: 13 additions & 6 deletions

@@ -1,6 +1,6 @@
 import torch

-from ..flashinfer_utils import ENABLE_PDL, IS_FLASHINFER_AVAILABLE
+from ..flashinfer_utils import IS_FLASHINFER_AVAILABLE, get_env_enable_pdl

 if IS_FLASHINFER_AVAILABLE:
     from flashinfer.activation import silu_and_mul
@@ -11,7 +11,7 @@
 # Warp this into custom op since flashinfer didn't warp it properly and we want to avoid graph break between mlp layer for user buffer optimization
 @torch.library.custom_op("trtllm::flashinfer_silu_and_mul", mutates_args=())
 def flashinfer_silu_and_mul(x: torch.Tensor) -> torch.Tensor:
-    return silu_and_mul(x, enable_pdl=ENABLE_PDL)
+    return silu_and_mul(x, enable_pdl=get_env_enable_pdl())

 @flashinfer_silu_and_mul.register_fake
 def _(x: torch.Tensor) -> torch.Tensor:
@@ -21,7 +21,7 @@ def _(x: torch.Tensor) -> torch.Tensor:
 @torch.library.custom_op("trtllm::flashinfer_rmsnorm", mutates_args=())
 def flashinfer_rmsnorm(input: torch.Tensor, weight: torch.Tensor,
                        eps: float) -> torch.Tensor:
-    return rmsnorm(input, weight, eps, enable_pdl=ENABLE_PDL)
+    return rmsnorm(input, weight, eps, enable_pdl=get_env_enable_pdl())

 @flashinfer_rmsnorm.register_fake
 def _(input: torch.Tensor, weight: torch.Tensor,
@@ -32,7 +32,10 @@ def _(input: torch.Tensor, weight: torch.Tensor,
                          mutates_args=())
 def flashinfer_gemma_rmsnorm(input: torch.Tensor, weight: torch.Tensor,
                              eps: float) -> torch.Tensor:
-    return gemma_rmsnorm(input, weight, eps, enable_pdl=ENABLE_PDL)
+    return gemma_rmsnorm(input,
+                         weight,
+                         eps,
+                         enable_pdl=get_env_enable_pdl())

 @flashinfer_gemma_rmsnorm.register_fake
 def _(input: torch.Tensor, weight: torch.Tensor,
@@ -44,7 +47,11 @@ def _(input: torch.Tensor, weight: torch.Tensor,
 def flashinfer_fused_add_rmsnorm(input: torch.Tensor,
                                  residual: torch.Tensor,
                                  weight: torch.Tensor, eps: float) -> None:
-    fused_add_rmsnorm(input, residual, weight, eps, enable_pdl=ENABLE_PDL)
+    fused_add_rmsnorm(input,
+                      residual,
+                      weight,
+                      eps,
+                      enable_pdl=get_env_enable_pdl())

 @torch.library.custom_op("trtllm::flashinfer_gemma_fused_add_rmsnorm",
                          mutates_args=("input", "residual"))
@@ -56,7 +63,7 @@ def flashinfer_gemma_fused_add_rmsnorm(input: torch.Tensor,
                                        residual,
                                        weight,
                                        eps,
-                                       enable_pdl=ENABLE_PDL)
+                                       enable_pdl=get_env_enable_pdl())

 @torch.library.custom_op(
     "trtllm::flashinfer_apply_rope_with_cos_sin_cache_inplace",

tensorrt_llm/_torch/flashinfer_utils.py
Lines changed: 5 additions & 5 deletions

@@ -8,13 +8,13 @@


 def get_env_enable_pdl():
-    return os.environ.get("TRTLLM_ENABLE_PDL", "0") == "1"
+    enabled = os.environ.get("TRTLLM_ENABLE_PDL", "0") == "1"
+    if enabled and not getattr(get_env_enable_pdl, "_printed", False):
+        logger.info("PDL enabled")
+        setattr(get_env_enable_pdl, "_printed", True)
+    return enabled


-ENABLE_PDL = get_env_enable_pdl()
-if ENABLE_PDL:
-    logger.info("PDL is enabled")
-
 if platform.system() != "Windows":
     try:
         import flashinfer
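
The behavioral shift in this file: the old module-level ENABLE_PDL froze the flag at import time, while get_env_enable_pdl() re-reads os.environ on every call, so overrides applied after import (for example via env_overrides below) still take effect, and the one-shot _printed attribute keeps the log line from repeating on hot paths. A standalone sketch of the same read-on-demand, log-once pattern (toy names, not the repo's):

import logging
import os

logger = logging.getLogger(__name__)


def get_env_flag() -> bool:
    # Re-read the variable on every call instead of caching it at import
    # time, so late changes to os.environ are still observed.
    enabled = os.environ.get("MY_FEATURE_FLAG", "0") == "1"
    if enabled and not getattr(get_env_flag, "_printed", False):
        logger.info("feature flag enabled")  # logged at most once
        get_env_flag._printed = True
    return enabled


os.environ["MY_FEATURE_FLAG"] = "1"  # set after import; still picked up
assert get_env_flag() is True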

tensorrt_llm/_torch/pyexecutor/sampling_utils_flashinfer.py
Lines changed: 2 additions & 2 deletions

@@ -30,7 +30,7 @@
 else:
     from typing_extensions import override

-from ..flashinfer_utils import ENABLE_PDL
+from ..flashinfer_utils import get_env_enable_pdl
 from .sampling_utils import (
     GREEDY,
     GroupedStrategySampler,
@@ -113,7 +113,7 @@ def _prepare_probs_with_temperature(
     probs = flashinfer.sampling.softmax(
         logits,
         temperature,
-        enable_pdl=ENABLE_PDL,
+        enable_pdl=get_env_enable_pdl(),
     )
     return probs
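
The call above computes a temperature-scaled softmax with a flashinfer kernel. As a point of reference, the math is equivalent to scaling logits by 1/temperature before a regular softmax; a plain-torch sketch (illustrative only, without the fused kernel or PDL):

import torch


def probs_with_temperature(logits: torch.Tensor,
                           temperature: torch.Tensor) -> torch.Tensor:
    # Divide logits by temperature, then normalize over the vocab dimension.
    return torch.softmax(logits / temperature, dim=-1)


probs = probs_with_temperature(torch.randn(2, 8), torch.tensor([[1.0], [0.7]]))
assert torch.allclose(probs.sum(dim=-1), torch.ones(2))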

tensorrt_llm/bench/benchmark/low_latency.py
Lines changed: 16 additions & 11 deletions

@@ -1,7 +1,6 @@
 from __future__ import annotations

 import asyncio
-import os
 from functools import partial
 from pathlib import Path

@@ -46,12 +45,14 @@
     help="Path to a serialized TRT-LLM engine.",
 )
 @optgroup.option(
+    "--config",
     "--extra_llm_api_options",
+    "extra_llm_api_options",
     type=str,
     default=None,
     help=
-    "Path to a YAML file that overwrites the parameters specified by trtllm-bench."
-)
+    "Path to a YAML file that overwrites the parameters specified by trtllm-bench. "
+    "Can be specified as either --config or --extra_llm_api_options.")
 @optgroup.option(
     "--backend",
     type=click.Choice(ALL_SUPPORTED_BACKENDS),
@@ -192,6 +193,7 @@ def latency_command(
 ) -> None:
     """Run a latency test on a TRT-LLM engine."""
     logger.info("Preparing to run latency benchmark...")
+
     # Parameters from CLI
     # Model, experiment, and engine params
     options = get_general_cli_options(params, bench_env)
@@ -263,14 +265,6 @@ def latency_command(
         exec_settings["settings_config"][
             "scheduler_policy"] = CapacitySchedulerPolicy.GUARANTEED_NO_EVICT

-    # Set environment variables for setting runtime options.
-    # TODO: Once passing of variables is fixed, these should work
-    # when using MPI in C++ runtime.
-    os.environ["TRTLLM_ENABLE_MMHA_MULTI_BLOCK_DEBUG"] = "1"
-    os.environ["TRTLLM_MMHA_KERNEL_BLOCK_SIZE"] = "256"
-    os.environ["FORCE_MULTI_BLOCK_MODE"] = "1"
-    os.environ["TRTLLM_ENABLE_PDL"] = "1"
-
     # Performance options
     exec_settings["performance_options"]["cuda_graphs"] = True
     exec_settings["performance_options"]["multi_block_mode"] = True
@@ -290,6 +284,17 @@ def latency_command(
     kwargs = kwargs | runtime_config.get_llm_args()
     kwargs['backend'] = options.backend

+    # Set environment variables for setting runtime options.
+    default_env_overrides = {
+        "TRTLLM_ENABLE_MMHA_MULTI_BLOCK_DEBUG": "1",
+        "TRTLLM_MMHA_KERNEL_BLOCK_SIZE": "256",
+        "FORCE_MULTI_BLOCK_MODE": "1",
+        "TRTLLM_ENABLE_PDL": "1",
+    }
+    # Update defaults with existing overrides (user preference takes priority)
+    default_env_overrides.update(kwargs.get("env_overrides", {}))
+    kwargs["env_overrides"] = default_env_overrides
+
     try:
         logger.info("Setting up latency benchmark.")
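
The merge order here matters: the benchmark builds its defaults first and then update()s them with any env_overrides already present in kwargs, so user-supplied values win. A small sketch of the semantics (values illustrative):

default_env_overrides = {
    "TRTLLM_ENABLE_PDL": "1",
    "TRTLLM_MMHA_KERNEL_BLOCK_SIZE": "256",
}
user_overrides = {"TRTLLM_ENABLE_PDL": "0"}  # e.g. from the user's YAML config

default_env_overrides.update(user_overrides)
assert default_env_overrides["TRTLLM_ENABLE_PDL"] == "0"  # user wins
assert default_env_overrides["TRTLLM_MMHA_KERNEL_BLOCK_SIZE"] == "256"  # default kept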

tensorrt_llm/bench/benchmark/throughput.py
Lines changed: 5 additions & 2 deletions

@@ -61,12 +61,14 @@
     help="Paths to custom module directories to import.",
 )
 @optgroup.option(
+    "--config",
     "--extra_llm_api_options",
+    "extra_llm_api_options",
     type=str,
     default=None,
     help=
-    "Path to a YAML file that overwrites the parameters specified by trtllm-bench."
-)
+    "Path to a YAML file that overwrites the parameters specified by trtllm-bench. "
+    "Can be specified as either --config or --extra_llm_api_options.")
 @optgroup.option("--sampler_options",
                  type=click.Path(exists=True,
                                  readable=True,
@@ -293,6 +295,7 @@ def throughput_command(
 ) -> None:
     """Run a throughput test on a TRT-LLM engine."""
     logger.info("Preparing to run throughput benchmark...")
+
     # Parameters from CLI
     image_data_format: str = params.get("image_data_format", "pt")
     data_device: str = params.get("data_device", "cpu")

tensorrt_llm/commands/eval.py
Lines changed: 5 additions & 2 deletions

@@ -97,10 +97,13 @@
               default=None,
               help="The revision to use for the HuggingFace model "
               "(branch name, tag name, or commit id).")
-@click.option("--extra_llm_api_options",
+@click.option("--config",
+              "--extra_llm_api_options",
+              "extra_llm_api_options",
               type=str,
               default=None,
-              help="Path to a YAML file that overwrites the parameters")
+              help="Path to a YAML file that overwrites the parameters. "
+              "Can be specified as either --config or --extra_llm_api_options.")
 @click.option("--disable_kv_cache_reuse",
               is_flag=True,
               default=False,

tensorrt_llm/commands/serve.py
Lines changed: 4 additions & 2 deletions

@@ -342,12 +342,14 @@ def convert(self, value: Any, param: Optional["click.Parameter"],
     help="The revision to use for the HuggingFace model "
     "(branch name, tag name, or commit id).")
 @click.option(
+    "--config",
     "--extra_llm_api_options",
+    "extra_llm_api_options",
     type=str,
     default=None,
     help=
-    "Path to a YAML file that overwrites the parameters specified by trtllm-serve."
-)
+    "Path to a YAML file that overwrites the parameters specified by trtllm-serve. "
+    "Can be specified as either --config or --extra_llm_api_options.")
 @click.option(
     "--reasoning_parser",
     type=click.Choice(ReasoningParserFactory.parsers.keys()),
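
The --config alias added across these CLIs relies on click's multiple parameter declarations: listing two flags plus a bare name makes both flags feed the same Python argument. A minimal standalone sketch (toy command, not the repo's):

import click


@click.command()
@click.option(
    "--config",
    "--extra_llm_api_options",  # second flag is an alias for the first
    "extra_llm_api_options",  # bare name becomes the Python parameter
    type=str,
    default=None,
    help="Path to a YAML file that overwrites the parameters.")
def main(extra_llm_api_options):
    click.echo(f"config file: {extra_llm_api_options}")


if __name__ == "__main__":
    main()  # `prog --config x.yaml` == `prog --extra_llm_api_options x.yaml`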

tensorrt_llm/executor/worker.py
Lines changed: 9 additions & 0 deletions

@@ -241,8 +241,17 @@ def worker_main(
     tokenizer: Optional[TokenizerBase] = None,
     llm_args: Optional[BaseLlmArgs] = None,
 ) -> None:
+
     mpi_comm().barrier()

+    if llm_args is not None and llm_args.env_overrides:
+        # this is needed because MPI_Init seems to cache the env at import time.
+        # The cached env snapshot is used to spawn workers.
+        # Any env overrides to the main process after tensorrt_llm import
+        # may not get reflected in the spawned worker process, no matter how early,
+        # unless we update it explicitly here.
+        os.environ.update(llm_args.env_overrides)
+
     if llm_args is not None and llm_args.trust_remote_code:
         _init_hf_modules()
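
The hunk above re-applies the overrides inside the worker because, per the comment, MPI appears to snapshot the environment at import time, so parent-side changes may never reach the spawned process. A sketch of the same defensive pattern using plain multiprocessing (illustrative analogy, not the repo's MPI path):

import multiprocessing as mp
import os


def worker_entry(env_overrides: dict) -> None:
    # Mirror worker_main: apply the overrides explicitly inside the worker
    # instead of relying on the parent's environment being inherited intact.
    os.environ.update(env_overrides)
    print("worker sees TRTLLM_ENABLE_PDL =", os.environ["TRTLLM_ENABLE_PDL"])


if __name__ == "__main__":
    ctx = mp.get_context("spawn")
    p = ctx.Process(target=worker_entry, args=({"TRTLLM_ENABLE_PDL": "1"},))
    p.start()
    p.join()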

tensorrt_llm/llmapi/llm.py
Lines changed: 22 additions & 0 deletions

@@ -135,6 +135,9 @@ def __init__(self,
             logger.set_level("info")  # force display the backend

         try:
+            env_overrides = kwargs.get("env_overrides", None)
+            self._process_env_overrides(env_overrides)
+
             backend = kwargs.get('backend', None)
             if backend == "pytorch":
                 logger.info("Using LLM with PyTorch backend")
@@ -587,6 +590,25 @@ def get_kv_cache_events_async(self,
         '''
         return self._executor.aget_kv_events(timeout=timeout)

+    def _process_env_overrides(self,
+                               env_overrides: Optional[dict[str, str]]) -> None:
+        if env_overrides is None:
+            return
+        logger.info("Processing LLM API environment variable overrides")
+        # TODO: If an env var is cached at import-time in code, overriding os.environ will
+        # unfortunately not update wherever the var is used.
+        # This is a known issue and only way to fix it is at every such usage to access it
+        # from os.environ on-demand.
+        for key, value in env_overrides.items():
+            str_value = str(value)
+            if key in os.environ:
+                old_value = os.environ[key]
+                os.environ[key] = str_value
+                logger.info(f"Overriding {key}: '{old_value}' -> '{str_value}'")
+            else:
+                os.environ[key] = str_value
+                logger.info(f"Setting {key}='{str_value}'")
+
     def _prepare_sampling_params(
         self,
         sampling_params: Optional[SamplingParams] = None) -> SamplingParams:
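
Taken together with the BaseLlmArgs field below, env_overrides can be passed straight to the LLM constructor, which routes it through _process_env_overrides before the backend is set up. A hedged usage sketch (the model path is illustrative):

from tensorrt_llm import LLM

llm = LLM(
    model="/path/to/model",  # illustrative path
    env_overrides={"TRTLLM_ENABLE_PDL": "1"},  # applied to os.environ at init
)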

tensorrt_llm/llmapi/llm_args.py
Lines changed: 6 additions & 0 deletions

@@ -1931,6 +1931,12 @@ class BaseLlmArgs(StrictBaseModel):
         status="prototype",
     )

+    env_overrides: Optional[Dict[str, str]] = Field(
+        default=None,
+        description=
+        "[EXPERIMENTAL] Environment variable overrides. NOTE: import-time-cached env vars in the code won’t update unless the code fetches them from os.environ on demand.",
+        status="prototype")
+
     _parallel_config: Optional[_ParallelConfig] = PrivateAttr(default=None)
     _model_format: Optional[_ModelFormatKind] = PrivateAttr(default=None)
     _speculative_model: Optional[str] = PrivateAttr(default=None)