                     cast, overload)

 import cloudpickle
+import os
+import torch
 import torch.nn as nn
 from tqdm.auto import tqdm
 from typing_extensions import TypeVar, deprecated
@@ -40,6 +42,7 @@
 from vllm.outputs import (ClassificationRequestOutput, EmbeddingRequestOutput,
                           PoolingRequestOutput, RequestOutput,
                           ScoringRequestOutput)
+from vllm.platforms import current_platform
 from vllm.pooling_params import PoolingParams
 from vllm.prompt_adapter.request import PromptAdapterRequest
 from vllm.sampling_params import (BeamSearchParams, GuidedDecodingParams,
@@ -247,6 +250,7 @@ def __init__(

         self.request_counter = Counter()
         self.default_sampling_params: Union[dict[str, Any], None] = None
+        self.profiler = self._setup_profiler()

     def get_tokenizer(
         self,
@@ -255,6 +259,35 @@ def get_tokenizer(
         return self.llm_engine.get_tokenizer_group().get_lora_tokenizer(
             lora_request)

+    def _setup_profiler(self):
+        enable_profile = os.getenv("VLLM_ENGINE_PROFILER_ENABLED",
+                                   "false").lower() in ["true", "1"]
+        if not enable_profile:
+            return None
+        warmup = int(os.getenv("VLLM_ENGINE_PROFILER_WARMUP_STEPS", "0"))
+        steps = int(os.getenv("VLLM_ENGINE_PROFILER_STEPS", "1"))
+        repeat = int(os.getenv("VLLM_ENGINE_PROFILER_REPEAT", "1"))
+        schedule = torch.profiler.schedule(wait=0,
+                                           warmup=warmup,
+                                           active=steps,
+                                           repeat=repeat)
+        activities = [torch.profiler.ProfilerActivity.CPU]
+        if current_platform.is_cuda():
+            activities.append(torch.profiler.ProfilerActivity.CUDA)
+        elif current_platform.is_hpu():
+            activities.append(torch.profiler.ProfilerActivity.HPU)
+
+        profiler = torch.profiler.profile(
+            schedule=schedule,
+            activities=activities,
+            on_trace_ready=torch.profiler.tensorboard_trace_handler(
+                '.', use_gzip=True),
+            record_shapes=False,
+            with_modules=False,
+            profile_memory=False,
+            with_stack=True)
+        return profiler
+
     def set_tokenizer(self, tokenizer: AnyTokenizer) -> None:
         tokenizer_group = self.llm_engine.get_tokenizer_group()

@@ -1493,6 +1526,8 @@ def _run_engine(
         outputs: list[Union[RequestOutput, PoolingRequestOutput]] = []
         total_in_toks = 0
         total_out_toks = 0
+        if self.profiler:
+            self.profiler.start()
         while self.llm_engine.has_unfinished_requests():
             step_outputs = self.llm_engine.step()
             for output in step_outputs:
@@ -1515,10 +1550,17 @@ def _run_engine(
                             pbar.update(n)
                         else:
                             pbar.update(1)
+            if self.profiler:
+                self.profiler.step()

         if use_tqdm:
             pbar.close()

+        if self.profiler:
+            if current_platform.is_hpu():  # torch.hpu exists only on Gaudi
+                torch.hpu.synchronize()
+            self.profiler.stop()
+
         # Make sure that all workers are finished
         # NOTE(kzawora): this crashes on v1, why?
         # this doesn't seem like hpu-specific issue
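The profiler wired in above is driven entirely by environment variables read once in _setup_profiler() during LLM construction. A minimal, hypothetical usage sketch follows; only the VLLM_ENGINE_PROFILER_* variable names come from the patch, while the model name and step counts are illustrative.

import os

# Values are examples; the variables must be set before LLM() is constructed,
# since _setup_profiler() runs inside __init__.
os.environ["VLLM_ENGINE_PROFILER_ENABLED"] = "true"
os.environ["VLLM_ENGINE_PROFILER_WARMUP_STEPS"] = "2"  # engine steps before recording
os.environ["VLLM_ENGINE_PROFILER_STEPS"] = "5"         # engine steps to record
os.environ["VLLM_ENGINE_PROFILER_REPEAT"] = "1"

from vllm import LLM, SamplingParams

llm = LLM(model="facebook/opt-125m")
llm.generate(["Hello, my name is"], SamplingParams(max_tokens=16))
# _run_engine() starts, steps, and stops the profiler; a gzipped trace is
# written to the current directory by tensorboard_trace_handler('.', use_gzip=True).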