                     cast, overload)

 import cloudpickle
+import os
+import torch
 import torch.nn as nn
 from tqdm.auto import tqdm
 from typing_extensions import TypeVar, deprecated
@@ -40,6 +42,7 @@
 from vllm.outputs import (ClassificationRequestOutput, EmbeddingRequestOutput,
                           PoolingRequestOutput, RequestOutput,
                           ScoringRequestOutput)
+from vllm.platforms import current_platform
 from vllm.pooling_params import PoolingParams
 from vllm.prompt_adapter.request import PromptAdapterRequest
 from vllm.sampling_params import (BeamSearchParams, GuidedDecodingParams,
@@ -247,6 +250,7 @@ def __init__(

         self.request_counter = Counter()
         self.default_sampling_params: Union[dict[str, Any], None] = None
+        self.profiler = self._setup_profiler()

     def get_tokenizer(
         self,
@@ -255,6 +259,35 @@ def get_tokenizer(
         return self.llm_engine.get_tokenizer_group().get_lora_tokenizer(
             lora_request)

+    def _setup_profiler(self):
+        enable_profile = os.getenv("VLLM_ENGINE_PROFILER_ENABLED",
+                                   "false").lower() in ["true", "1"]
+        if not enable_profile:
+            return None
+        warmup = int(os.getenv("VLLM_ENGINE_PROFILER_WARMUP_STEPS", "0"))
+        steps = int(os.getenv("VLLM_ENGINE_PROFILER_STEPS", "1"))
+        repeat = int(os.getenv("VLLM_ENGINE_PROFILER_REPEAT", "1"))
+        schedule = torch.profiler.schedule(wait=0,
+                                           warmup=warmup,
+                                           active=steps,
+                                           repeat=repeat)
+        activities = [torch.profiler.ProfilerActivity.CPU]
+        if current_platform.is_cuda():
+            activities.append(torch.profiler.ProfilerActivity.CUDA)
+        elif current_platform.is_hpu():
+            activities.append(torch.profiler.ProfilerActivity.HPU)
+
+        profiler = torch.profiler.profile(
+            schedule=schedule,
+            activities=activities,
+            on_trace_ready=torch.profiler.tensorboard_trace_handler(
+                '.', use_gzip=True),
+            record_shapes=False,
+            with_modules=False,
+            profile_memory=False,
+            with_stack=True)
+        return profiler
+
     def set_tokenizer(self, tokenizer: AnyTokenizer) -> None:
         tokenizer_group = self.llm_engine.get_tokenizer_group()

@@ -1493,6 +1526,8 @@ def _run_engine(
         outputs: list[Union[RequestOutput, PoolingRequestOutput]] = []
         total_in_toks = 0
         total_out_toks = 0
+        if self.profiler:
+            self.profiler.start()
         while self.llm_engine.has_unfinished_requests():
             step_outputs = self.llm_engine.step()
             for output in step_outputs:
@@ -1515,10 +1550,17 @@ def _run_engine(
                             pbar.update(n)
                         else:
                             pbar.update(1)
+            if self.profiler:
+                self.profiler.step()

         if use_tqdm:
             pbar.close()

+        if self.profiler:
+            if current_platform.is_hpu():  # torch.hpu exists only on Gaudi
+                torch.hpu.synchronize()
+            self.profiler.stop()
+
         # Make sure that all workers are finished
         # NOTE(kzawora): this crashes on v1, why?
         # this doesn't seem like hpu-specific issue
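The profiler wired in above is driven entirely by environment variables read once in _setup_profiler() during LLM construction. A minimal, hypothetical usage sketch follows; only the VLLM_ENGINE_PROFILER_* variable names come from the patch, while the model name and step counts are illustrative.

import os

# Values are examples; the variables must be set before LLM() is constructed,
# since _setup_profiler() runs inside __init__.
os.environ["VLLM_ENGINE_PROFILER_ENABLED"] = "true"
os.environ["VLLM_ENGINE_PROFILER_WARMUP_STEPS"] = "2"  # engine steps before recording
os.environ["VLLM_ENGINE_PROFILER_STEPS"] = "5"         # engine steps to record
os.environ["VLLM_ENGINE_PROFILER_REPEAT"] = "1"

from vllm import LLM, SamplingParams

llm = LLM(model="facebook/opt-125m")
llm.generate(["Hello, my name is"], SamplingParams(max_tokens=16))
# _run_engine() starts, steps, and stops the profiler; a gzipped trace is
# written to the current directory by tensorboard_trace_handler('.', use_gzip=True).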