
Commit 6693645

Wei-Lin-Intel authored and czhu15 committed
Add profiler for HPU (vllm-project#1753)
## Essential Elements of an Effective PR Description Checklist

- [ ] The purpose of the PR, such as "Fix some issue (link existing issues this PR will resolve)".
- [ ] The test plan, such as providing test command.
- [ ] The test results, such as pasting the results comparison before and after, or e2e results.

## Purpose

## Test Plan

## Test Result
1 parent 26d4308 commit 6693645

1 file changed: +41 −0 lines changed

vllm/entrypoints/llm.py

Lines changed: 41 additions & 0 deletions
```diff
@@ -9,6 +9,8 @@
                     cast, overload)
 
 import cloudpickle
+import os
+import torch
 import torch.nn as nn
 from tqdm.auto import tqdm
 from typing_extensions import TypeVar, deprecated
```
```diff
@@ -40,6 +42,7 @@
 from vllm.outputs import (ClassificationRequestOutput, EmbeddingRequestOutput,
                           PoolingRequestOutput, RequestOutput,
                           ScoringRequestOutput)
+from vllm.platforms import current_platform
 from vllm.pooling_params import PoolingParams
 from vllm.prompt_adapter.request import PromptAdapterRequest
 from vllm.sampling_params import (BeamSearchParams, GuidedDecodingParams,
```
```diff
@@ -247,6 +250,7 @@ def __init__(
 
         self.request_counter = Counter()
         self.default_sampling_params: Union[dict[str, Any], None] = None
+        self.profiler = self._setup_profiler()
 
     def get_tokenizer(
         self,
```
```diff
@@ -255,6 +259,35 @@ def get_tokenizer(
         return self.llm_engine.get_tokenizer_group().get_lora_tokenizer(
             lora_request)
 
+    def _setup_profiler(self):
+        enable_profile = os.getenv("VLLM_ENGINE_PROFILER_ENABLED",
+                                   "false").lower() in ["true", "1"]
+        if not enable_profile:
+            return None
+        warmup = int(os.getenv("VLLM_ENGINE_PROFILER_WARMUP_STEPS", "0"))
+        steps = int(os.getenv("VLLM_ENGINE_PROFILER_STEPS", "1"))
+        repeat = int(os.getenv("VLLM_ENGINE_PROFILER_REPEAT", "1"))
+        schedule = torch.profiler.schedule(wait=0,
+                                           warmup=warmup,
+                                           active=steps,
+                                           repeat=repeat)
+        activities = [torch.profiler.ProfilerActivity.CPU]
+        if current_platform.is_cuda():
+            activities.append(torch.profiler.ProfilerActivity.CUDA)
+        elif current_platform.is_hpu():
+            activities.append(torch.profiler.ProfilerActivity.HPU)
+
+        profiler = torch.profiler.profile(
+            schedule=schedule,
+            activities=activities,
+            on_trace_ready=torch.profiler.tensorboard_trace_handler(
+                '.', use_gzip=True),
+            record_shapes=False,
+            with_modules=False,
+            profile_memory=False,
+            with_stack=True)
+        return profiler
+
     def set_tokenizer(self, tokenizer: AnyTokenizer) -> None:
         tokenizer_group = self.llm_engine.get_tokenizer_group()
 
```
```diff
@@ -1493,6 +1526,8 @@ def _run_engine(
         outputs: list[Union[RequestOutput, PoolingRequestOutput]] = []
         total_in_toks = 0
         total_out_toks = 0
+        if self.profiler:
+            self.profiler.start()
         while self.llm_engine.has_unfinished_requests():
             step_outputs = self.llm_engine.step()
             for output in step_outputs:
```
```diff
@@ -1515,10 +1550,16 @@ def _run_engine(
                             pbar.update(n)
                         else:
                             pbar.update(1)
+            if self.profiler:
+                self.profiler.step()
 
         if use_tqdm:
             pbar.close()
 
+        if self.profiler:
+            torch.hpu.synchronize()
+            self.profiler.stop()
+
         # Make sure that all workers are finished
         # NOTE(kzawora): this crashes on v1, why?
         # this doesn't seem like hpu-specific issue
```
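Putting it together, a hedged usage sketch for this feature: set the env vars the diff introduces before building the `LLM`, then run a generation on a Gaudi/HPU host (the stop path calls `torch.hpu.synchronize()`, which requires Habana PyTorch support). A gzipped TensorBoard trace lands in the working directory per the `tensorboard_trace_handler('.', use_gzip=True)` call above. The model name here is a placeholder:

```python
import os

# Enable the engine profiler added by this commit (env var names from the diff).
os.environ["VLLM_ENGINE_PROFILER_ENABLED"] = "true"
os.environ["VLLM_ENGINE_PROFILER_WARMUP_STEPS"] = "2"  # steps traced but discarded
os.environ["VLLM_ENGINE_PROFILER_STEPS"] = "5"         # steps actually recorded
os.environ["VLLM_ENGINE_PROFILER_REPEAT"] = "1"        # one record cycle

from vllm import LLM, SamplingParams

llm = LLM(model="facebook/opt-125m")  # placeholder model
outputs = llm.generate(["The future of AI is"],
                       SamplingParams(max_tokens=32))
# _run_engine() starts the profiler, steps it once per engine step, and on
# completion writes a *.pt.trace.json.gz trace to the current directory.
```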
