Skip to content

Commit f4a992c

Browse files
authored
Removing RPD in favor of torch profiler for V1 (ROCm#558)
Signed-off-by: Gregory Shtrasberg <[email protected]>
1 parent 307d8bc commit f4a992c

File tree

6 files changed

+2
-189
lines changed

6 files changed

+2
-189
lines changed

docker/Dockerfile.rocm

Lines changed: 0 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
11
# default base image
22
ARG REMOTE_VLLM="0"
3-
ARG BUILD_RPD="1"
43
ARG COMMON_WORKDIR=/app
54
ARG BASE_IMAGE=rocm/vllm-dev:base
65

@@ -87,13 +86,6 @@ RUN case "$(which python3)" in \
8786
*) ;; esac
8887

8988
RUN python3 -m pip install --upgrade huggingface-hub[cli]
90-
ARG BUILD_RPD
91-
RUN if [ ${BUILD_RPD} -eq "1" ]; then \
92-
git clone -b nvtx_enabled https://github.com/ROCm/rocmProfileData.git \
93-
&& cd rocmProfileData/rpd_tracer \
94-
&& pip install -r requirements.txt && cd ../ \
95-
&& make && make install \
96-
&& cd hipMarker && python3 setup.py install ; fi
9789

9890
# Install vLLM
9991
RUN --mount=type=bind,from=export_vllm,src=/,target=/install \

docs/getting_started/installation/gpu/rocm.inc.md

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -179,8 +179,6 @@ It is important that the user kicks off the docker build using buildkit. Either
179179
It provides flexibility to customize the build of the docker image using the following arguments:
180180

181181
- `BASE_IMAGE`: specifies the base image used when running `docker build`. The default value `rocm/vllm-dev:base` is an image published and maintained by AMD. It is being built using <gh-file:docker/Dockerfile.rocm_base>
182-
- `USE_CYTHON`: An option to run cython compilation on a subset of python files upon docker build
183-
- `BUILD_RPD`: Include RocmProfileData profiling tool in the image
184182
- `ARG_PYTORCH_ROCM_ARCH`: Allows overriding the gfx architecture values from the base docker image
185183

186184
Their values can be passed in when running `docker build` with `--build-arg` options.

vllm/envs.py

Lines changed: 0 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -75,7 +75,6 @@
7575
VLLM_PLUGINS: Optional[list[str]] = None
7676
VLLM_LORA_RESOLVER_CACHE_DIR: Optional[str] = None
7777
VLLM_TORCH_PROFILER_DIR: Optional[str] = None
78-
VLLM_RPD_PROFILER_DIR: Optional[str] = None
7978
VLLM_USE_TRITON_AWQ: bool = False
8079
VLLM_ALLOW_RUNTIME_LORA_UPDATING: bool = False
8180
VLLM_SKIP_P2P_CHECK: bool = False
@@ -588,12 +587,6 @@ def get_vllm_port() -> Optional[int]:
588587
lambda: (None if os.getenv("VLLM_TORCH_PROFILER_DIR", None) is None else os
589588
.path.expanduser(os.getenv("VLLM_TORCH_PROFILER_DIR", "."))),
590589

591-
# Enables rpd profiler if set. Path to the directory where torch profiler
592-
# traces are saved. Note that it must be an absolute path.
593-
"VLLM_RPD_PROFILER_DIR":
594-
lambda: (None if os.getenv("VLLM_RPD_PROFILER_DIR", None) is None else os.
595-
path.expanduser(os.getenv("VLLM_RPD_PROFILER_DIR", "."))),
596-
597590
# If set, vLLM will use Triton implementations of AWQ.
598591
"VLLM_USE_TRITON_AWQ":
599592
lambda: bool(int(os.getenv("VLLM_USE_TRITON_AWQ", "0"))),

vllm/model_executor/layers/sampler.py

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,6 @@
2121
CompletionSequenceGroupOutput, Logprob,
2222
PromptLogprobs, SampleLogprobs, SequenceOutput)
2323
from vllm.spec_decode.metrics import SpecDecodeWorkerMetrics
24-
from vllm.utils import rpd_mark
2524

2625
if envs.VLLM_USE_FLASHINFER_SAMPLER and find_spec("flashinfer"):
2726
# yapf: disable
@@ -223,7 +222,6 @@ def _init_sampling_tensors(
223222
self._do_top_p_top_k = do_top_p_top_k
224223
self._do_min_p = do_min_p
225224

226-
@rpd_mark(name="Sampler Forward")
227225
def forward(
228226
self,
229227
logits: torch.Tensor,
@@ -793,7 +791,6 @@ def _sample_with_torch(
793791
)
794792

795793

796-
@rpd_mark()
797794
def _sample(
798795
probs: torch.Tensor,
799796
logprobs: torch.Tensor,

vllm/utils.py

Lines changed: 0 additions & 139 deletions
Original file line numberDiff line numberDiff line change
@@ -205,145 +205,6 @@ class _Sentinel:
205205
ALL_PINNED_SENTINEL = _Sentinel()
206206

207207

208-
class rpd_trace:
209-
210-
def __init__(self,
211-
filename=None,
212-
name=None,
213-
nvtx=False,
214-
args=None,
215-
skip=False):
216-
self.skip = skip
217-
if not self.skip:
218-
self.name = name
219-
self.args = args if args else ""
220-
self.rpd = self.initialize_rpd_tracer(filename, nvtx)
221-
222-
def _recreate_cm(self):
223-
return self
224-
225-
def __call__(self, func):
226-
if not self.skip:
227-
if self.name:
228-
self.name += f"{func.__name__}"
229-
else:
230-
self.name = f"{func.__qualname__}"
231-
232-
@wraps(func)
233-
def inner(*args, **kwds):
234-
with self._recreate_cm():
235-
return func(*args, **kwds)
236-
237-
return inner
238-
return func
239-
240-
def __enter__(self):
241-
if not self.skip:
242-
self.rpd.__enter__()
243-
self.rpd.rangePush("python", f"{self.name}", f"{self.args}")
244-
return self
245-
246-
def __exit__(self, *exc):
247-
if not self.skip:
248-
self.rpd.rangePop()
249-
self.rpd.__exit__(None, None, None)
250-
return False
251-
252-
@staticmethod
253-
def setup_environment_variables(filename):
254-
os.environ['RPDT_AUTOSTART'] = '0'
255-
os.environ['RPDT_FILENAME'] = filename
256-
257-
def initialize_rpd_tracer(self, filename, nvtx):
258-
try:
259-
from rpdTracerControl import rpdTracerControl
260-
rpd_trace.setup_environment_variables(filename)
261-
rpdTracerControl.setFilename(name=filename, append=True)
262-
return rpdTracerControl(nvtx=nvtx)
263-
except Exception as e:
264-
print(f"Error initializing rpdTracerControl: {e}")
265-
raise
266-
267-
@staticmethod
268-
def create_file(filename):
269-
import sqlite3
270-
271-
from rocpd.schema import RocpdSchema
272-
try:
273-
print("Creating empty rpd schema file ...")
274-
filename = str(filename)
275-
with sqlite3.connect(filename) as connection:
276-
schema = RocpdSchema()
277-
schema.writeSchema(connection)
278-
connection.commit()
279-
except sqlite3.OperationalError as e:
280-
print(f"SQLite operational error: {e}")
281-
except Exception as e:
282-
print(f"An error occurred while creating the filename: {e}")
283-
284-
285-
@cache
286-
def is_hipScopedMarker_available():
287-
try:
288-
from hipScopedMarker import hipScopedMarker
289-
except ImportError:
290-
hipScopedMarker = None
291-
return hipScopedMarker is not None
292-
293-
294-
class rpd_mark:
295-
296-
def __init__(self, name=None):
297-
self.name = name
298-
299-
def __call__(self, func):
300-
301-
if is_hipScopedMarker_available():
302-
from hipScopedMarker import hipScopedMarker
303-
304-
@wraps(func)
305-
def inner(*args, **kwds):
306-
marker_name = self.name if self.name else f"{func.__name__}"
307-
with hipScopedMarker(f"{marker_name}"):
308-
return func(*args, **kwds)
309-
310-
return inner
311-
312-
else:
313-
return func
314-
315-
316-
class rpd_user_marker:
317-
318-
def __init__(self, name=None):
319-
self.name = name
320-
self.marker = None
321-
322-
def __enter__(self):
323-
if is_hipScopedMarker_available():
324-
from hipScopedMarker import hipScopedMarker
325-
marker_name = self.name if self.name else "UserMarker Undefined"
326-
self.marker = hipScopedMarker(f"{marker_name}")
327-
self.marker.__enter__()
328-
return self
329-
330-
def __exit__(self, exc_type, exc_val, exc_tb):
331-
if is_hipScopedMarker_available() and self.marker:
332-
self.marker.__exit__(exc_type, exc_val, exc_tb)
333-
334-
def start(self):
335-
if is_hipScopedMarker_available():
336-
from hipScopedMarker import hipScopedMarker
337-
marker_name = self.name if self.name else "UserMarker Undefined"
338-
self.marker = hipScopedMarker(f"{marker_name}")
339-
self.marker.__enter__()
340-
return self
341-
342-
def end(self, exc_type=0, exc_val=0, exc_tb=0):
343-
if is_hipScopedMarker_available() and self.marker:
344-
self.marker.__exit__(exc_type, exc_val, exc_tb)
345-
346-
347208
class Device(enum.Enum):
348209
GPU = enum.auto()
349210
CPU = enum.auto()

vllm/worker/worker.py

Lines changed: 2 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,6 @@
22
"""A GPU worker class."""
33
import gc
44
import os
5-
from pathlib import Path
65
from typing import Dict, List, Optional, Set, Tuple, Type, Union
76

87
import torch
@@ -117,45 +116,18 @@ def __init__(
117116
with_stack=True,
118117
on_trace_ready=torch.profiler.tensorboard_trace_handler(
119118
torch_profiler_trace_dir, use_gzip=True))
120-
elif envs.VLLM_RPD_PROFILER_DIR:
121-
rpd_profiler_trace_dir = Path(envs.VLLM_RPD_PROFILER_DIR)
122-
123-
if rpd_profiler_trace_dir.suffix != ".rpd":
124-
rpd_profiler_trace_dir = rpd_profiler_trace_dir / "trace.rpd"
125-
126-
rpd_profiler_trace_dir.parent.mkdir(parents=True, exist_ok=True)
127-
128-
logger.info("Profiling enabled. Traces will be saved to: %s",
129-
rpd_profiler_trace_dir)
130-
131-
from vllm.utils import rpd_trace
132-
133-
if self.rank == 0:
134-
rpd_trace.create_file(filename=str(rpd_profiler_trace_dir))
135-
136-
self.profiler = rpd_trace(filename=str(rpd_profiler_trace_dir),
137-
name='Worker RPD Enabled',
138-
nvtx=True)
139119
else:
140120
self.profiler = None
141121

142122
def start_profile(self):
143123
if self.profiler is None:
144124
raise RuntimeError("Profiler is not enabled.")
145-
146-
if envs.VLLM_RPD_PROFILER_DIR:
147-
self.profiler.__enter__()
148-
else:
149-
self.profiler.start()
125+
self.profiler.start()
150126

151127
def stop_profile(self):
152128
if self.profiler is None:
153129
raise RuntimeError("Profiler is not enabled.")
154-
155-
if envs.VLLM_RPD_PROFILER_DIR:
156-
self.profiler.__exit__()
157-
else:
158-
self.profiler.stop()
130+
self.profiler.stop()
159131

160132
def sleep(self, level: int = 1) -> None:
161133
free_bytes_before_sleep = torch.cuda.mem_get_info()[0]

0 commit comments

Comments
 (0)