
Commit a6bb017

Merge branch 'main' into kunshang/t_compile_support
2 parents e3ebfdb + 78863f8 commit a6bb017

File tree

12 files changed (+730, -21 lines)

setup.py

Lines changed: 2 additions & 1 deletion
```diff
@@ -60,7 +60,8 @@ def load_module_from_path(module_name, path):
 
 
 def is_sccache_available() -> bool:
-    return which("sccache") is not None
+    return which("sccache") is not None and \
+        not bool(int(os.getenv("VLLM_DISABLE_SCCACHE", "0")))
 
 
 def is_ccache_available() -> bool:
```
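The new `VLLM_DISABLE_SCCACHE` switch is read at build time. As a hedged illustration (the exact build command depends on your environment), it could be set like this to force a build without sccache even when the binary is installed:

```bash
# Build vLLM from source while skipping sccache
VLLM_DISABLE_SCCACHE=1 pip install -e .
```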

tests/detokenizer/test_min_tokens.py

Lines changed: 50 additions & 0 deletions
```python
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import pytest
from transformers import AutoTokenizer

from vllm import SamplingParams
from vllm.v1.engine import EngineCoreRequest
from vllm.v1.engine.detokenizer import FastIncrementalDetokenizer

PROMPT = "Hello, my name is Lee, and I'm a student in the " + \
    "college of engineering"


@pytest.mark.parametrize("min_tokens,stop,truth", [
    (0, None, " is Lee, and I'm a student in the college of engineering"),
    (0, "e", " is L"),
    (5, "e", " is Lee, and I'm a stud"),
])
def test_min_tokens_with_stop(min_tokens: int, stop: str, truth: str):
    """Test for a specific min_tokens and stop.

    See https://github.com/vllm-project/vllm/pull/22014
    """
    tokenizer = AutoTokenizer.from_pretrained("facebook/opt-125m")
    all_prompt_ids = tokenizer(PROMPT, add_special_tokens=False).input_ids

    # The prompt is "Hello, my name is"
    prompt_token_ids = all_prompt_ids[:4]
    params = SamplingParams(
        stop=stop,
        min_tokens=min_tokens,
    )
    request = EngineCoreRequest("",
                                prompt_token_ids,
                                None,
                                None,
                                None,
                                params,
                                None,
                                None,
                                0.0,
                                None,
                                cache_salt=None,
                                data_parallel_rank=None)

    detokenizer = FastIncrementalDetokenizer(tokenizer, request)

    detokenizer.update(all_prompt_ids[4:], False)
    assert detokenizer.output_text == truth
```
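A hedged note on exercising the new test locally, assuming a development checkout of vLLM with pytest and transformers installed:

```bash
pytest tests/detokenizer/test_min_tokens.py -v
```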

tests/entrypoints/openai/test_prompt_validation.py

Lines changed: 49 additions & 0 deletions
```diff
@@ -1,10 +1,16 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
+import io
+
 # imports for guided decoding tests
 import openai
+import pybase64
 import pytest
 import regex as re
+import torch
+
+from vllm.entrypoints.openai.serving_engine import OpenAIServing
 
 from ...utils import RemoteOpenAIServer
 
@@ -42,3 +48,46 @@ async def test_out_of_vocab_token_ids():
             prompt=[999999],
             max_tokens=5,
             temperature=0.0)
+
+
+@pytest.mark.parametrize("dtype",
+                         [torch.float32, torch.bfloat16, torch.float16])
+@pytest.mark.parametrize(
+    "layout",
+    [torch.strided, torch.sparse_coo, torch.sparse_csc, torch.sparse_csr])
+@pytest.mark.parametrize("seq_len", [2, 10])
+@pytest.mark.parametrize("hidden_size", [2, 10])
+def test_load_prompt_embeds(dtype: torch.dtype, layout: torch.layout,
+                            seq_len: int, hidden_size: int):
+    # Construct arbitrary tensors of various dtypes, layouts, and sizes.
+    # We check against different layouts to make sure that, if a user sends
+    # sparse tensors to reduce the transmission size of prompt embeddings,
+    # they are cast to dense/strided before being passed into the engine.
+    # We don't use non-CPU tensors in this test to avoid preemptively
+    # initializing CUDA and breaking other tests in the suite that fork
+    # processes, and we only use devices that are actually available in the
+    # test environment. For simplicity, we just test against CPU.
+    tensor = torch.randn((seq_len, hidden_size), dtype=dtype)
+    if layout == torch.strided:
+        tensor = tensor.contiguous()
+    elif layout == torch.sparse_coo:
+        tensor = tensor.to_sparse_coo()
+    elif layout == torch.sparse_csc:
+        tensor = tensor.to_sparse_csc()
+    elif layout == torch.sparse_csr:
+        tensor = tensor.to_sparse_csr()
+
+    buffer = io.BytesIO()
+    torch.save(tensor, buffer)
+    buffer.seek(0)
+    encoded_tensor = pybase64.b64encode(buffer.getvalue())
+
+    loaded_prompt_embeds = OpenAIServing._load_prompt_embeds(encoded_tensor)
+    assert len(loaded_prompt_embeds) == 1
+    loaded_tensor = loaded_prompt_embeds[0]["prompt_embeds"]
+    assert loaded_tensor.device.type == "cpu"
+    assert loaded_tensor.layout == torch.strided
+    torch.testing.assert_close(loaded_tensor,
+                               tensor.to("cpu").to_dense(),
+                               equal_nan=True)
```
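For orientation, a hedged client-side sketch of the encoding this test exercises: the tensor is serialized with `torch.save` and base64-encoded, mirroring what `OpenAIServing._load_prompt_embeds` decodes on the server side. How the encoded string is attached to an actual API request is not shown in this diff and would need to be checked against the vLLM docs.

```python
import io

import pybase64
import torch

# A small prompt-embeddings tensor (seq_len x hidden_size); values are arbitrary.
prompt_embeds = torch.randn((8, 4096), dtype=torch.float16)

# Serialize and base64-encode, the inverse of what _load_prompt_embeds does.
buffer = io.BytesIO()
torch.save(prompt_embeds, buffer)
encoded = pybase64.b64encode(buffer.getvalue()).decode("utf-8")

print(f"encoded payload length: {len(encoded)} characters")
```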
Lines changed: 175 additions & 0 deletions
# gputrc2graph.py

This script processes NVIDIA Nsight Systems (`nsys`) GPU trace files
(`.nsys-rep`) collected with `-t cuda` tracing enabled, and generates
kernel-level summaries and visualizations of GPU and non-GPU time. It is
useful for profiling and analyzing `nsys profile` output.

## Usage

### Command-line Arguments

- `--in_file`
  **(required)**
  List of input files and their metadata. Each entry has the format
  `<nsys-rep>,<engine>,<model>,<elapsed_nonprofiled_sec>`:
  - `nsys-rep`: Path to the `.nsys-rep` file.
  - `engine`: Engine name (e.g., `vllm`).
  - `model`: Model name (e.g., `llama`, `gpt-oss`, `ds`).
  - `elapsed_nonprofiled_sec`: Wall-clock runtime (in seconds) without
    profiling. Specify `0` to use the elapsed time from the nsys-rep file
    (this may inflate non-GPU time if the actual runtime without profiling
    is shorter).

  Multiple entries can be provided, separated by spaces; a combined
  invocation is sketched after this list.

- `--out_dir`
  Output directory for the generated CSV and HTML files.
  If not specified, results are saved in the current directory.

- `--title`
  Title for the HTML chart/visualization.

- `--nsys_cmd`
  Path to the `nsys` command.
  Default: `nsys` (assumes it is in your PATH).
  Use this if `nsys` is not in your system PATH.
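A hedged example that combines the arguments above into one invocation; the file name, runtime value, and nsys path are placeholders:

```bash
python3 gputrc2graph.py \
    --in_file run1.nsys-rep,vllm,llama,120 \
    --out_dir results \
    --title "vLLM llama profile" \
    --nsys_cmd /opt/nvidia/nsight-systems/bin/nsys
```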
## Notes

- Make sure you have pandas installed.
- Make sure `nsys` is installed, and specify the path to the `nsys` command
  with `--nsys_cmd` if it is not in your PATH.
- For more details on available engines and models, see the help string in
  the script or run:

  ```bash
  python3 gputrc2graph.py --help
  ```
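A quick, hedged way to check these prerequisites before running the script (assuming a pip-based environment):

```bash
pip install pandas     # required by gputrc2graph.py
nsys --version         # confirms nsys is reachable; otherwise pass --nsys_cmd
```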
## Example 1: analyze a single profile

To analyze the GPU cycles of, say, the gpt-oss model running on the vLLM
engine:

1. Run the following command to collect an nsys profile for the vllm serve
   configuration:

    ```bash
    nsys profile -t cuda -o run1 -f true --trace-fork-before-exec=true \
      --cuda-graph-trace=node --delay <DELAY> --duration <DURATION> \
      vllm serve openai/gpt-oss-120b ...
    ```

    where:

    - DELAY: how many seconds nsys waits before collecting the profile,
      needed so that profiling does not start until the vLLM server has come
      up and load generation has started.
    - DURATION: how many seconds the nsys profile runs before the profile is
      generated. This should be greater than the duration of the run.

2. Run again, this time without collecting the profile, and record the total
   run time in seconds. The script uses this value to calculate the non-GPU
   (CPU) seconds for the analysis.

3. Say the elapsed time from step 2 is 306 seconds. Run the script to
   analyze:

    ```bash
    python3 gputrc2graph.py \
      --in_file run1.nsys-rep,vllm,gpt-oss,306 \
      --title "vLLM-gpt-oss profile"
    ```

The command produces 2 files for analysis:

- result.html: categorizes kernel names into different categories in a
  stacked bar chart.
- result.csv: shows how the kernel names are mapped to the different
  categories.

### HTML visualization with result.html

The HTML file shows the number of elapsed seconds attributed to each GPU
Substage (category). In this example, moe_gemm (Mixture of Experts GEMM)
kernels form the biggest category at 148 seconds, followed by "attn"
(attention) kernels. This lets the user prioritize which kernels to focus on
for performance optimizations.

![Example GPU Trace Visualization](images/html.png)

A data table is also appended underneath the bar chart for copying out to
other post-processing tools.

![Example GPU Trace Table](images/html_tbl.png)

### Kernel to category mapping with result.csv

Suppose the user would like to focus on improving triton kernels. At 9.74
seconds they are not the biggest consumer of cycles, but perhaps they have
not been optimized yet. The next step is to use result.csv to dig into which
kernels make up the triton-kernel GPU cycles. The following image shows that
the triton_poi_fused__to_copy_add_addmm_cat_.. kernel is the biggest
contributor to GPU cycles.

![Example GPU Trace csv](images/csv1.png)
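As a convenience, a hedged pandas sketch of this kind of drill-down. The column names used here (`Substage`, `Name`, `Elapsed_sec`) are hypothetical and should be adjusted to match the header of the generated result.csv:

```python
import pandas as pd

# Load the kernel-to-category mapping produced by gputrc2graph.py.
df = pd.read_csv("result.csv")

# Keep one category of interest and rank its kernels by elapsed time.
# Column names are assumptions; check them against the actual CSV header.
triton = df[df["Substage"] == "triton"]
top = triton.sort_values("Elapsed_sec", ascending=False).head(10)
print(top[["Name", "Elapsed_sec"]])
```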
## Example 2: analyze multiple profiles

Suppose the user has multiple nsys trace files, captured for different models
(say llama and gpt-oss), and wishes to compare their GPU/non-GPU time. A
command like the following can be used:

```bash
python3 gputrc2graph.py \
    --in_file run1.nsys-rep,vllm,llama,100 run2.nsys-rep,vllm,gpt-oss,102 \
    --out_dir results \
    --title "Comparison of vLLM Models"
```

The analysis process is similar to example 1, but now there are multiple
stacked bar charts that can be compared. The categories for the different
kernels remain the same, so it is easy to compare the GPU cycles for the same
categories.

Once a category is shown to have more cycles in one configuration than
another, the next step is to use the csv file to see which kernels are mapped
into that category, and which of them take the most time and therefore drive
the difference in the overall category.

## Example 3: add new classification for a new model

Suppose there is a new model ABC available for engine DEF, and say there are
4 kernels to be classified into "gemm" and "attn", where the gemm kernel
names contain "H" or "I", and the attn kernel names contain "J" or "K". Add a
new entry like so:

```python
engine_model = {
    'DEF': {
        'ABC': {
            'layer_anno': {
                'Stage': {
                    '.*': 'layer',
                },
                'Substage': {
                    'H|I': 'gemm',
                    'J|K': 'attn',
                    'CUDA mem': 'non-gpu-H_D_memops',
                    '.*': 'misc'
                }
            }
        },
    },
    'vllm': {...},
}
```
Basically, Substage is a dictionary of key/value pairs, where the keys are
regexes of the kernel names to be classified and the values are the
classification bins one wishes to compare across engines/models.

The last 2 entries are common to all engines/models: CUDA memory operations,
and a 'misc' bin for anything left over that cannot be classified.
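To illustrate the idea, a minimal sketch of how such a Substage map could classify a kernel name; the real matching logic lives inside gputrc2graph.py and may differ in details such as match order:

```python
import re

# Hypothetical Substage map from the example above.
substage = {
    'H|I': 'gemm',
    'J|K': 'attn',
    'CUDA mem': 'non-gpu-H_D_memops',
    '.*': 'misc',
}

def classify(kernel_name: str) -> str:
    # The first pattern that matches wins; the trailing '.*' catches the rest.
    for pattern, category in substage.items():
        if re.search(pattern, kernel_name):
            return category
    return 'misc'

print(classify("sm90_H_gemm_kernel"))    # -> gemm
print(classify("flash_J_attention"))     # -> attn
print(classify("unclassified_kernel"))   # -> misc
```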
When invoking gputrc2graph.py, specify a trace file with this new
engine/model like the following:

```bash
--in_file new.nsys-rep,DEF,ABC,<runtime>
```
