
Commit bc84d51

Sync NM/0.8.4 into RHOAI 2.20
2 parents: 44d7cd2 + 0bc55b1

File tree: 342 files changed (+11691, -5441 lines)

.buildkite/lm-eval-harness/configs/Qwen1.5-MoE-W4A16-compressed-tensors.yaml (new file)

Lines changed: 11 additions & 0 deletions

@@ -0,0 +1,11 @@
+# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Qwen1.5-MoE-A2.7B-Chat-quantized.w4a16 -b auto -l 1319 -f 5 -t 1
+model_name: "nm-testing/Qwen1.5-MoE-A2.7B-Chat-quantized.w4a16"
+tasks:
+- name: "gsm8k"
+  metrics:
+  - name: "exact_match,strict-match"
+    value: 0.31
+  - name: "exact_match,flexible-extract"
+    value: 0.47
+limit: 1319
+num_fewshot: 5
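For orientation, configs like this are consumed by the lm-eval-harness correctness check, which compares measured GSM8K scores against the values recorded here. Below is a minimal sketch of that comparison, assuming the harness hands back a dict of measured metric values and that the config file sits in the working directory; the helper name and tolerance are illustrative, not the repository's actual test code.

```python
import yaml

# Illustrative tolerance; the real check in .buildkite/lm-eval-harness may differ.
RTOL = 0.05

def check_metrics(config_path: str, measured: dict) -> None:
    """Compare measured lm-eval metric values against the expected values in a config."""
    with open(config_path) as f:
        cfg = yaml.safe_load(f)
    for task in cfg["tasks"]:
        for metric in task["metrics"]:
            name, expected = metric["name"], metric["value"]
            got = measured[name]
            assert abs(got - expected) <= RTOL, (
                f"{task['name']}/{name}: expected ~{expected}, got {got}")

# Values close to the recorded ones would pass:
check_metrics("Qwen1.5-MoE-W4A16-compressed-tensors.yaml",
              {"exact_match,strict-match": 0.32,
               "exact_match,flexible-extract": 0.46})
```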

.buildkite/lm-eval-harness/configs/models-small.txt

Lines changed: 1 addition & 1 deletion

@@ -4,7 +4,7 @@ Meta-Llama-3.2-1B-Instruct-INT8-compressed-tensors.yaml
 Meta-Llama-3-8B-Instruct-INT8-compressed-tensors-asym.yaml
 Meta-Llama-3-8B-Instruct-nonuniform-compressed-tensors.yaml
 Meta-Llama-3-8B-Instruct-Channelwise-compressed-tensors.yaml
-Minitron-4B-Base-FP8.yaml
+Qwen1.5-MoE-W4A16-compressed-tensors.yaml
 Qwen2-1.5B-Instruct-INT8-compressed-tensors.yaml
 Qwen2-1.5B-Instruct-FP8W8.yaml
 Meta-Llama-3-8B-QQQ.yaml

.buildkite/scripts/run-benchmarks.sh

Lines changed: 2 additions & 2 deletions

@@ -5,8 +5,8 @@
 set -ex
 set -o pipefail
 
-# cd into parent directory of this file
-cd "$(dirname "${BASH_SOURCE[0]}")/.."
+# cd 2 levels into the working directory
+cd "$(dirname "${BASH_SOURCE[0]}")/../.."
 
 (which wget && which curl) || (apt-get update && apt-get install -y wget curl)

.buildkite/test-pipeline.yaml

Lines changed: 14 additions & 12 deletions

@@ -163,11 +163,6 @@ steps:
   - tests/tracing
   commands:
   - pytest -v -s metrics
-  - "pip install \
-    'opentelemetry-sdk>=1.26.0,<1.27.0' \
-    'opentelemetry-api>=1.26.0,<1.27.0' \
-    'opentelemetry-exporter-otlp>=1.26.0,<1.27.0' \
-    'opentelemetry-semantic-conventions-ai>=0.4.1,<0.5.0'"
   - pytest -v -s tracing
 
 ##### fast check tests  #####
@@ -292,6 +287,14 @@ steps:
   command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py
   parallelism: 4
 
+- label: PyTorch Compilation Unit Tests
+  source_file_dependencies:
+  - vllm/
+  - tests/compile
+  commands:
+  - pytest -v -s compile/test_pass_manager.py
+  - pytest -v -s compile/test_fusion.py
+
 - label: PyTorch Fullgraph Smoke Test # 9min
   source_file_dependencies:
   - vllm/
@@ -301,7 +304,6 @@ steps:
   # these tests need to be separated, cannot combine
   - pytest -v -s compile/piecewise/test_simple.py
   - pytest -v -s compile/piecewise/test_toy_llama.py
-  - pytest -v -s compile/test_pass_manager.py
 
 - label: PyTorch Fullgraph Test # 18min
   source_file_dependencies:
@@ -376,8 +378,10 @@ steps:
   source_file_dependencies:
   - vllm/
   - tests/tool_use
+  - tests/mistral_tool_use
   commands:
   - pytest -v -s tool_use
+  - pytest -v -s mistral_tool_use
 
 ##### models test #####
 
@@ -389,7 +393,8 @@ steps:
   - pytest -v -s models/test_transformers.py
   - pytest -v -s models/test_registry.py
   # V1 Test: https://github.com/vllm-project/vllm/issues/14531
-  - VLLM_USE_V1=0 pytest -v -s models/test_initialization.py
+  - VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4'
+  - VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'llama4'
 
 - label: Language Models Test (Standard) # 32min
   #mirror_hardwares: [amd]
@@ -426,7 +431,7 @@ steps:
   - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
   - pytest -v -s models/multimodal
   - pytest -v -s models/decoder_only/audio_language -m 'core_model or quant_model'
-  - pytest -v -s --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m 'core_model or quant_model'
+  - pytest -v -s models/decoder_only/vision_language -m 'core_model or quant_model'
   - pytest -v -s models/embedding/vision_language -m core_model
   - pytest -v -s models/encoder_decoder/audio_language -m core_model
   - pytest -v -s models/encoder_decoder/language -m core_model
@@ -445,10 +450,7 @@ steps:
   - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
   - pytest -v -s models/decoder_only/audio_language -m 'not core_model and not quant_model'
   - pytest -v -s models/decoder_only/vision_language/test_models.py -m 'split(group=0) and not core_model and not quant_model'
-  # HACK - run phi3v tests separately to sidestep this transformers bug
-  # https://github.com/huggingface/transformers/issues/34307
-  - pytest -v -s models/decoder_only/vision_language/test_phi3v.py
-  - pytest -v -s --ignore models/decoder_only/vision_language/test_models.py --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m 'not core_model and not quant_model'
+  - pytest -v -s --ignore models/decoder_only/vision_language/test_models.py models/decoder_only/vision_language -m 'not core_model and not quant_model'
   - pytest -v -s models/embedding/vision_language -m 'not core_model'
   - pytest -v -s models/encoder_decoder/language -m 'not core_model'
   - pytest -v -s models/encoder_decoder/vision_language -m 'not core_model'

.pre-commit-config.yaml

Lines changed: 6 additions & 0 deletions

@@ -122,6 +122,12 @@ repos:
     language: system
     always_run: true
     pass_filenames: false
+  - id: update-dockerfile-graph
+    name: Update Dockerfile dependency graph
+    entry: tools/update-dockerfile-graph.sh
+    language: script
+    files: ^docker/Dockerfile$
+    pass_filenames: false
   # Keep `suggestion` last
   - id: suggestion
     name: Suggestion

CMakeLists.txt

Lines changed: 1 addition & 0 deletions

@@ -230,6 +230,7 @@ set(VLLM_EXT_SRC
   "csrc/cache_kernels.cu"
   "csrc/attention/paged_attention_v1.cu"
   "csrc/attention/paged_attention_v2.cu"
+  "csrc/attention/merge_attn_states.cu"
   "csrc/pos_encoding_kernels.cu"
   "csrc/activation_kernels.cu"
   "csrc/layernorm_kernels.cu"

README.md

Lines changed: 2 additions & 5 deletions

@@ -10,16 +10,13 @@ Easy, fast, and cheap LLM serving for everyone
 </h3>
 
 <p align="center">
-| <a href="https://docs.vllm.ai"><b>Documentation</b></a> | <a href="https://vllm.ai"><b>Blog</b></a> | <a href="https://arxiv.org/abs/2309.06180"><b>Paper</b></a> | <a href="https://x.com/vllm_project"><b>Twitter/X</b></a> | <a href="https://discuss.vllm.ai"><b>User Forum</b></a> | <a href="https://slack.vllm.ai"><b>Developer Slack</b></a> |
+| <a href="https://docs.vllm.ai"><b>Documentation</b></a> | <a href="https://blog.vllm.ai/"><b>Blog</b></a> | <a href="https://arxiv.org/abs/2309.06180"><b>Paper</b></a> | <a href="https://x.com/vllm_project"><b>Twitter/X</b></a> | <a href="https://discuss.vllm.ai"><b>User Forum</b></a> | <a href="https://slack.vllm.ai"><b>Developer Slack</b></a> |
 </p>
 
 ---
 
-[2025/04] We're hosting our first-ever *vLLM Asia Developer Day* in Singapore on *April 3rd*! This is a full-day event (9 AM - 9 PM SGT) in partnership with SGInnovate, AMD, and Embedded LLM. Meet the vLLM team and learn about LLM inference for RL, MI300X, and more! [Register Now](https://www.sginnovate.com/event/limited-availability-morning-evening-slots-remaining-inaugural-vllm-asia-developer-day)
-
----
-
 *Latest News* 🔥
+- [2025/04] We hosted [Asia Developer Day](https://www.sginnovate.com/event/limited-availability-morning-evening-slots-remaining-inaugural-vllm-asia-developer-day)! Please find the meetup slides from the vLLM team [here](https://docs.google.com/presentation/d/19cp6Qu8u48ihB91A064XfaXruNYiBOUKrBxAmDOllOo/edit?usp=sharing).
 - [2025/03] We hosted [vLLM x Ollama Inference Night](https://lu.ma/vllm-ollama)! Please find the meetup slides from the vLLM team [here](https://docs.google.com/presentation/d/16T2PDD1YwRnZ4Tu8Q5r6n53c5Lr5c73UV9Vd2_eBo4U/edit?usp=sharing).
 - [2025/03] We hosted [the first vLLM China Meetup](https://mp.weixin.qq.com/s/n77GibL2corAtQHtVEAzfg)! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1REHvfQMKGnvz6p3Fd23HhSO4c8j5WPGZV0bKYLwnHyQ/edit?usp=sharing).
 - [2025/03] We hosted [the East Coast vLLM Meetup](https://lu.ma/7mu4k4xx)! Please find the meetup slides [here](https://docs.google.com/presentation/d/1NHiv8EUFF1NLd3fEYODm56nDmL26lEeXCaDgyDlTsRs/edit#slide=id.g31441846c39_0_0).

benchmarks/README.md

Lines changed: 18 additions & 0 deletions

@@ -204,6 +204,24 @@ python3 vllm/benchmarks/benchmark_serving.py \
     --seed 42
 ```
 
+### Running With Sampling Parameters
+
+When using OpenAI-compatible backends such as `vllm`, optional sampling
+parameters can be specified. Example client command:
+
+```bash
+python3 vllm/benchmarks/benchmark_serving.py \
+    --backend vllm \
+    --model NousResearch/Hermes-3-Llama-3.1-8B \
+    --endpoint /v1/completions \
+    --dataset-name sharegpt \
+    --dataset-path <your data path>/ShareGPT_V3_unfiltered_cleaned_split.json \
+    --top-k 10 \
+    --top-p 0.9 \
+    --temperature 0.5 \
+    --num-prompts 10
+```
+
 ---
 ## Example - Offline Throughput Benchmark

benchmarks/backend_request_func.py

Lines changed: 6 additions & 0 deletions

@@ -497,3 +497,9 @@ def get_tokenizer(
     "scalellm": async_request_openai_completions,
     "sglang": async_request_openai_completions,
 }
+
+OPENAI_COMPATIBLE_BACKENDS = [
+    k for k, v in ASYNC_REQUEST_FUNCS.items()
+    if v in (async_request_openai_completions,
+             async_request_openai_chat_completions)
+]
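The new OPENAI_COMPATIBLE_BACKENDS list pairs with the sampling-parameter flags documented in benchmarks/README.md above: the client should only attach top_k / top_p / temperature when the selected backend speaks the OpenAI API. Below is a minimal sketch of that gating, assuming a plain dict request body; build_payload is an illustrative helper, not part of this diff.

```python
from typing import Optional

def build_payload(backend: str,
                  openai_compatible_backends: list,
                  prompt: str,
                  max_tokens: int,
                  top_k: Optional[int] = None,
                  top_p: Optional[float] = None,
                  temperature: Optional[float] = None) -> dict:
    """Build a /v1/completions-style body, adding sampling params only for
    OpenAI-compatible backends (sketch, not the benchmark's real code)."""
    payload = {"prompt": prompt, "max_tokens": max_tokens}
    if backend in openai_compatible_backends:
        extras = {"top_k": top_k, "top_p": top_p, "temperature": temperature}
        # Only send parameters the user actually set.
        payload.update({k: v for k, v in extras.items() if v is not None})
    return payload

# e.g. build_payload("vllm", OPENAI_COMPATIBLE_BACKENDS, "Hello", 128, top_p=0.9)
```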

benchmarks/benchmark_dataset.py

Lines changed: 28 additions & 14 deletions

@@ -288,7 +288,7 @@ def process_image(image: Any) -> Mapping[str, Any]:
 class RandomDataset(BenchmarkDataset):
     # Default values copied from benchmark_serving.py for the random dataset.
     DEFAULT_PREFIX_LEN = 0
-    DEFAULT_RANGE_RATIO = 1.0
+    DEFAULT_RANGE_RATIO = 0.0
     DEFAULT_INPUT_LEN = 1024
     DEFAULT_OUTPUT_LEN = 128
 
@@ -308,19 +308,32 @@ def sample(
         output_len: int = DEFAULT_OUTPUT_LEN,
         **kwargs,
     ) -> list[SampleRequest]:
+        # Enforce range_ratio < 1
+        assert range_ratio < 1.0, (
+            "random_range_ratio must be < 1.0 to ensure a valid sampling range"
+        )
+
         vocab_size = tokenizer.vocab_size
 
         prefix_token_ids = (np.random.randint(
             0, vocab_size, size=prefix_len).tolist() if prefix_len > 0 else [])
 
-        input_low = int(input_len * range_ratio)
-        output_low = int(output_len * range_ratio)
+        # New sampling logic: [X * (1 - b), X * (1 + b)]
+        input_low = int(input_len * (1 - range_ratio))
+        input_high = int(input_len * (1 + range_ratio))
+        output_low = int(output_len * (1 - range_ratio))
+        output_high = int(output_len * (1 + range_ratio))
+
+        # Add logging for debugging
+        logger.info("Sampling input_len from [%s, %s]", input_low, input_high)
+        logger.info("Sampling output_len from [%s, %s]", output_low,
+                    output_high)
 
         input_lens = np.random.randint(input_low,
-                                       input_len + 1,
+                                       input_high + 1,
                                        size=num_requests)
         output_lens = np.random.randint(output_low,
-                                        output_len + 1,
+                                        output_high + 1,
                                         size=num_requests)
         offsets = np.random.randint(0, vocab_size, size=num_requests)
 
@@ -472,25 +485,26 @@ def sample(
 
         # Determine how many poem lines to use.
         num_input_lines = round((input_len - base_offset) / avg_len)
-        num_prefix_lines = round((prefix_len - base_offset) / avg_len)
+        num_prefix_lines = max(round((prefix_len - base_offset) / avg_len), 0)
        prefix_lines = self.data[:num_prefix_lines]
 
         samples = []
-        for _ in range(num_requests):
+        while len(samples) < num_requests:
             extra_lines = random.choices(self.data,
                                          k=num_input_lines - num_prefix_lines)
             prompt = f"{base_prompt}{''.join(prefix_lines + extra_lines)}"
             msg = [{"role": "user", "content": prompt}]
             prompt_formatted = tokenizer.apply_chat_template(
                 msg, add_generation_prompt=True, tokenize=False)
             prompt_len = len(tokenizer(prompt_formatted).input_ids)
-            samples.append(
-                SampleRequest(
-                    prompt=prompt_formatted
-                    if return_prompt_formatted else prompt,
-                    prompt_len=prompt_len,
-                    expected_output_len=output_len,
-                ))
+            if prompt_len <= input_len:
+                samples.append(
+                    SampleRequest(
+                        prompt=prompt_formatted
+                        if return_prompt_formatted else prompt,
+                        prompt_len=prompt_len,
+                        expected_output_len=output_len,
+                    ))
         return samples
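Net effect of the RandomDataset change: range_ratio now acts as a symmetric jitter b around the target length, drawing from [X * (1 - b), X * (1 + b)] rather than [X * b, X], and the new default of 0.0 makes lengths exact. A standalone sketch of the new bounds follows (illustrative, mirroring the diff rather than importing it).

```python
import numpy as np

def sample_lengths(target_len: int, range_ratio: float, num_requests: int,
                   seed=None) -> np.ndarray:
    """Sample lengths uniformly from [target*(1-b), target*(1+b)], per the new logic."""
    assert range_ratio < 1.0, "range_ratio must be < 1.0 for a valid range"
    rng = np.random.default_rng(seed)
    low = int(target_len * (1 - range_ratio))
    high = int(target_len * (1 + range_ratio))
    return rng.integers(low, high + 1, size=num_requests)

# With range_ratio=0.0 (the new default) every request gets exactly target_len tokens.
print(sample_lengths(1024, 0.0, 4))           # -> [1024 1024 1024 1024]
print(sample_lengths(1024, 0.25, 4, seed=0))  # values in [768, 1280]
```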
