Skip to content

Commit ab65f9b

Browse files
author
Kacper Pietkun
authored
Add t.compile config (#62)
Signed-off-by: Kacper Pietkun <kpietkun@habana.ai>
1 parent 39bce5d commit ab65f9b

File tree

5 files changed

+112
-34
lines changed

5 files changed

+112
-34
lines changed

tests/unit_tests/worker/test_hpu_model_runner.py

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,8 @@
44
import pytest
55
import torch
66
import habana_frameworks.torch # noqa: F401
7+
from habana_frameworks.torch.utils.internal import is_lazy
8+
from vllm.model_executor.model_loader import get_model
79

810
from vllm.attention import Attention
911
from vllm.config import (CacheConfig, ModelConfig, ParallelConfig,
@@ -667,3 +669,38 @@ def test_init_kv_cache_with_kv_sharing_valid():
667669
assert len(kv_cache_config.kv_cache_groups[0].layer_names) == 2
668670
assert kv_cache_config.kv_cache_groups[0].layer_names[0] == layer_0
669671
assert kv_cache_config.kv_cache_groups[0].layer_names[1] == layer_1
672+
673+
674+
@pytest.mark.skipif(is_lazy(),
                    reason="Test skipped because lazy mode is enabled.")
def test_model_torch_regional_compilation(dist_init, model_runner):
    """Verify that regional compilation wraps the expected submodules."""
    from vllm_gaudi.utils import HPUCompileConfig
    from vllm.model_executor.models.opt import OPTDecoderLayer
    from vllm.model_executor.layers.vocab_parallel_embedding import VocabParallelEmbedding  # noqa
    from torch.nn.modules.normalization import LayerNorm
    from torch._dynamo.eval_frame import OptimizedModule

    def assert_compilation(model, layer_name, module):
        # A compiled region is an OptimizedModule whose _orig_mod is the
        # original (uncompiled) layer instance.
        wrapped = model.get_submodule(layer_name)
        assert isinstance(wrapped, OptimizedModule), (
            f"Layer: '{module.__name__}' was not wrapped with OptimizedModule"  # noqa
        )
        assert isinstance(wrapped._orig_mod, module), (
            f"_orig_mod is different from the original module: '{module.__name__}'"  # noqa
        )

    vllm_config = get_vllm_config()
    model = get_model(vllm_config=vllm_config)
    model_runner.compile_config = HPUCompileConfig()
    # OPT uses LayerNorm rather than RMSNorm, so override the runner's
    # default regional-compilation layer list for this model.
    model_runner.regional_compilation_layers_list = [
        LayerNorm, VocabParallelEmbedding
    ]

    model_runner._regional_compilation(model)

    decoder_layers = model.get_submodule("model.decoder.layers")
    for idx, _ in enumerate(decoder_layers):
        assert_compilation(model, f"model.decoder.layers.{idx}",
                           OPTDecoderLayer)
    assert_compilation(model, "lm_head", VocabParallelEmbedding)
    assert_compilation(model, "model.decoder.final_layer_norm", LayerNorm)
    assert_compilation(model, "model.decoder.embed_tokens",
                       VocabParallelEmbedding)

vllm_gaudi/extension/features.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -71,5 +71,8 @@ def get_features():
7171
Value('exponential_bucketing', True, env_var='VLLM_EXPONENTIAL_BUCKETING'),
7272
Value('linear_bucketing', True),
7373
Value('bucketing_strategy', FirstEnabled(*bucketing_strategies), env_var_type=choice(*bucketing_strategies)),
74+
Value('regional_compilation', True, env_var='VLLM_T_COMPILE_REGIONAL_COMPILATION', env_var_type=boolean),
75+
Value('dynamic_shapes_compilation', False, env_var='VLLM_T_COMPILE_DYNAMIC_SHAPES', env_var_type=boolean),
76+
Value('fullgraph_compilation', False, env_var='VLLM_T_COMPILE_FULLGRAPH', env_var_type=boolean),
7477
]
7578
return split_values_and_flags(features)

vllm_gaudi/platform.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
from typing import TYPE_CHECKING, Any, Optional
55

66
import torch
7+
import habana_frameworks.torch as htorch
78

89
from vllm import envs
910

@@ -144,7 +145,7 @@ def set_torch_compile(cls) -> None:
144145
# Eager backend (PT_HPU_LAZY_MODE = 0) must be selected for
145146
# torch.compile support
146147
os.environ['PT_HPU_WEIGHT_SHARING'] = '0'
147-
is_lazy = os.environ.get('PT_HPU_LAZY_MODE', '0') == '1'
148+
is_lazy = htorch.utils.internal.is_lazy()
148149
if is_lazy:
149150
torch._dynamo.config.disable = True
150151
# NOTE multi-HPU inference with HPUGraphs (lazy-only)

vllm_gaudi/utils.py

Lines changed: 44 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,8 @@
11
from functools import cache
22
import os
33
from vllm.utils import make_tensor_with_pad, TORCH_DTYPE_TO_NUMPY_DTYPE
4-
from typing import (Optional, TypeVar, Union)
4+
from vllm_gaudi.extension.runtime import get_config
5+
from typing import (Any, Optional, TypeVar, Union)
56
import torch
67
import numpy as np
78
import numpy.typing as npt
@@ -108,3 +109,45 @@ def make_tensor_with_pad_align(
108109
tensor = tensor.pin_memory()
109110

110111
return tensor
112+
113+
114+
class HPUCompileConfig:
    """
    Configuration class, which holds arguments that will be
    passed to torch compile with HPU backend.
    """

    def __init__(self,
                 fullgraph: Optional[bool] = None,
                 dynamic: Optional[bool] = None):
        """
        Allow to override the environment variables for corner case scenarios
        when single functions are compiled with torch.compile decorator.
        Env variables should not be overwritten when it comes to compilation
        of the whole model.
        """
        # Explicit arguments win; otherwise fall back to the env-derived
        # feature flags exposed through get_config().
        self.fullgraph = (get_config().fullgraph_compilation
                          if fullgraph is None else fullgraph)
        self.dynamic = (get_config().dynamic_shapes_compilation
                        if dynamic is None else dynamic)
        self.regional_compilation = get_config().regional_compilation

    def get_compile_args(self) -> dict[str, Any]:
        """
        Returns a dictionary of compile arguments that can be used
        with torch.compile method or decorator
        """
        compile_args: dict[str, Any] = {
            'backend': 'hpu_backend',
            'fullgraph': self.fullgraph,
        }
        if self.dynamic:
            # Dynamic-shape mode still forces static compilation on HPU.
            compile_args['options'] = {"force_static_compile": True}
        else:
            compile_args['dynamic'] = False
        return compile_args

vllm_gaudi/v1/worker/hpu_model_runner.py

Lines changed: 26 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@
3737
from vllm.transformers_utils.tokenizer_group import init_tokenizer_from_configs
3838
from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, LayerBlockType, cdiv,
3939
is_pin_memory_available, LazyLoader)
40-
from vllm_gaudi.utils import is_fake_hpu
40+
from vllm_gaudi.utils import HPUCompileConfig, is_fake_hpu
4141
from vllm_gaudi.v1.attention.backends.hpu_attn import HPUAttentionMetadataV1
4242
from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig,
4343
KVCacheSpec)
@@ -1974,28 +1974,40 @@ def load_model(self) -> None:
19741974
self.model_memory_usage / float(2**30))
19751975

19761976
def _maybe_compile(self, *args, **kwargs):
    """Entrypoint for a torch.compilation of the model"""
    # Compilation only applies on real HPU, in eager (non-lazy) PT mode,
    # and when the user has not forced eager execution.
    if (is_fake_hpu() or htorch.utils.internal.is_lazy()
            or self.vllm_config.model_config.enforce_eager):
        return
    self.compile_config = HPUCompileConfig()
    if self.compile_config.regional_compilation:
        # Compile standalone helper methods first, then compile the
        # model region-by-region.
        self._compile_methods()
        self.regional_compilation_layers_list = [
            RMSNorm, VocabParallelEmbedding
        ]
        self._regional_compilation(self.model)
    else:
        self.model = self._compile(self.model)

1990+
def _compile_methods(self):
1991+
"""
1992+
Compile methods which are not part of the compiled model i.e. those
1993+
which will not be compiled during model's compilation.
1994+
"""
1995+
compiled_methods = ['_update_metadata', '_rotary_prepare_cos_sin']
1996+
for method_name in compiled_methods:
1997+
method = getattr(self.model, method_name)
1998+
if method is not None:
1999+
self._compile_region(self.model, method_name, method)
2000+
19952001
def _regional_compilation(self,
19962002
module,
19972003
parent_module=None,
19982004
module_name=None):
2005+
"""
2006+
Recursively traverses a PyTorch module and compiles its regions, which
2007+
can be one of two:
2008+
1. Children of the nn.ModuleList
2009+
2. Member of regional_compilation_layers_list
2010+
"""
19992011
if isinstance(module, torch.nn.ModuleList):
20002012
for children_name, children_module in module.named_children():
20012013
self._compile_region(module, children_name, children_module)
@@ -2017,24 +2029,7 @@ def _compile_region(self, model, name, module):
20172029
setattr(model, name, module)
20182030

20192031
def _compile(self, module):
2020-
if not hasattr(self, '_compile_config'):
2021-
fullgraph = os.getenv('VLLM_T_COMPILE_FULLGRAPH',
2022-
'false').strip().lower() in ("1", "true")
2023-
dynamic = os.getenv('VLLM_T_COMPILE_DYNAMIC_SHAPES',
2024-
'false').strip().lower() in ("1", "true")
2025-
self._compile_config = {'fullgraph': fullgraph, 'dynamic': dynamic}
2026-
fullgraph = self._compile_config['fullgraph']
2027-
dynamic = self._compile_config['dynamic']
2028-
if dynamic:
2029-
return torch.compile(module,
2030-
backend='hpu_backend',
2031-
fullgraph=fullgraph,
2032-
options={"force_static_compile": True})
2033-
else:
2034-
return torch.compile(module,
2035-
backend='hpu_backend',
2036-
fullgraph=fullgraph,
2037-
dynamic=False)
2032+
return torch.compile(module, **self.compile_config.get_compile_args())
20382033

20392034
def _use_graphs(self):
20402035
return not self.model_config.enforce_eager
@@ -2352,8 +2347,7 @@ def warmup_model(self) -> None:
23522347

23532348
if not htorch.utils.internal.is_lazy(
23542349
) and not self.model_config.enforce_eager:
2355-
multiplier = 3 if os.getenv('VLLM_REGIONAL_COMPILATION',
2356-
'true').lower() in ('1', 'true') else 1
2350+
multiplier = 5 if self.compile_config.regional_compilation else 1
23572351
cache_size_limit = 1 + multiplier * (
23582352
len(self.bucketing_manager.prompt_buckets) +
23592353
len(self.bucketing_manager.decode_buckets))

0 commit comments

Comments
 (0)