diff --git a/docs/contributing/benchmarks.md b/docs/contributing/benchmarks.md
index d53a680c9182..6b1eabf3d67f 100644
--- a/docs/contributing/benchmarks.md
+++ b/docs/contributing/benchmarks.md
@@ -67,13 +67,13 @@ Legend:
Show more
-First start serving your model
+First start serving your model:
```bash
vllm serve NousResearch/Hermes-3-Llama-3.1-8B
```
-Then run the benchmarking script
+Then run the benchmarking script:
```bash
# download dataset
@@ -87,7 +87,7 @@ vllm bench serve \
--num-prompts 10
```
-If successful, you will see the following output
+If successful, you will see the following output:
```text
============ Serving Benchmark Result ============
@@ -125,7 +125,7 @@ If the dataset you want to benchmark is not supported yet in vLLM, even then you
```bash
# start server
-VLLM_USE_V1=1 vllm serve meta-llama/Llama-3.1-8B-Instruct
+vllm serve meta-llama/Llama-3.1-8B-Instruct
```
```bash
@@ -167,7 +167,7 @@ vllm bench serve \
##### InstructCoder Benchmark with Speculative Decoding
``` bash
-VLLM_USE_V1=1 vllm serve meta-llama/Meta-Llama-3-8B-Instruct \
+vllm serve meta-llama/Meta-Llama-3-8B-Instruct \
--speculative-config $'{"method": "ngram",
"num_speculative_tokens": 5, "prompt_lookup_max": 5,
"prompt_lookup_min": 2}'
@@ -184,7 +184,7 @@ vllm bench serve \
##### Spec Bench Benchmark with Speculative Decoding
``` bash
-VLLM_USE_V1=1 vllm serve meta-llama/Meta-Llama-3-8B-Instruct \
+vllm serve meta-llama/Meta-Llama-3-8B-Instruct \
--speculative-config $'{"method": "ngram",
"num_speculative_tokens": 5, "prompt_lookup_max": 5,
"prompt_lookup_min": 2}'
@@ -366,7 +366,6 @@ Total num output tokens: 1280
``` bash
VLLM_WORKER_MULTIPROC_METHOD=spawn \
-VLLM_USE_V1=1 \
vllm bench throughput \
--dataset-name=hf \
--dataset-path=likaixin/InstructCoder \
@@ -781,6 +780,104 @@ This should be seen as an edge case, and if this behavior can be avoided by sett
+#### Embedding Benchmark
+
+Benchmark the performance of embedding requests in vLLM.
+
+
+Show more
+
+##### Text Embeddings
+
+Unlike generative models, which are benchmarked via the Completions or Chat Completions API,
+embedding models use the Embeddings API, so set `--backend openai-embeddings` and `--endpoint /v1/embeddings`.
+
+You can use any text dataset to benchmark the model, such as ShareGPT.
+
+Start the server:
+
+```bash
+vllm serve jinaai/jina-embeddings-v3 --trust-remote-code
+```
+
+Run the benchmark:
+
+```bash
+# download dataset
+# wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
+vllm bench serve \
+ --model jinaai/jina-embeddings-v3 \
+ --backend openai-embeddings \
+ --endpoint /v1/embeddings \
+ --dataset-name sharegpt \
+ --dataset-path /ShareGPT_V3_unfiltered_cleaned_split.json
+```
+
+##### Multi-modal Embeddings
+
+Unlike generative models, which are benchmarked via the Completions or Chat Completions API,
+multi-modal embedding models use the Embeddings API, so set `--endpoint /v1/embeddings`. The backend to use depends on the model:
+
+- CLIP: `--backend openai-embeddings-clip`
+- VLM2Vec: `--backend openai-embeddings-vlm2vec`
+
+For other models, please add your own request function inside `vllm/benchmarks/lib/endpoint_request_func.py` to match the expected instruction format.
+
+You can use any text or multi-modal dataset to benchmark the model, as long as the model supports it.
+For example, you can use ShareGPT and VisionArena to benchmark vision-language embeddings.
+
+Serve and benchmark CLIP:
+
+```bash
+# Run this in another process
+vllm serve openai/clip-vit-base-patch32
+
+# Run these one by one after the server is up
+# download dataset
+# wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
+vllm bench serve \
+ --model openai/clip-vit-base-patch32 \
+ --backend openai-embeddings-clip \
+ --endpoint /v1/embeddings \
+ --dataset-name sharegpt \
+ --dataset-path /ShareGPT_V3_unfiltered_cleaned_split.json
+
+vllm bench serve \
+ --model openai/clip-vit-base-patch32 \
+ --backend openai-embeddings-clip \
+ --endpoint /v1/embeddings \
+ --dataset-name hf \
+ --dataset-path lmarena-ai/VisionArena-Chat
+```
+
+Serve and benchmark VLM2Vec:
+
+```bash
+# Run this in another process
+vllm serve TIGER-Lab/VLM2Vec-Full --runner pooling \
+ --trust-remote-code \
+ --chat-template examples/template_vlm2vec_phi3v.jinja
+
+# Run these one by one after the server is up
+# download dataset
+# wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
+vllm bench serve \
+ --model TIGER-Lab/VLM2Vec-Full \
+ --backend openai-embeddings-vlm2vec \
+ --endpoint /v1/embeddings \
+ --dataset-name sharegpt \
+ --dataset-path /ShareGPT_V3_unfiltered_cleaned_split.json
+
+vllm bench serve \
+ --model TIGER-Lab/VLM2Vec-Full \
+ --backend openai-embeddings-vlm2vec \
+ --endpoint /v1/embeddings \
+ --dataset-name hf \
+ --dataset-path lmarena-ai/VisionArena-Chat
+```
+
[](){ #performance-benchmarks }
## Performance Benchmarks
diff --git a/vllm/benchmarks/datasets.py b/vllm/benchmarks/datasets.py
index e955b15e87fe..7ffc21905924 100644
--- a/vllm/benchmarks/datasets.py
+++ b/vllm/benchmarks/datasets.py
@@ -1582,10 +1582,10 @@ def get_samples(args, tokenizer) -> list[SampleRequest]:
"like to add support for additional dataset formats."
)
- if dataset_class.IS_MULTIMODAL and args.backend not in [
- "openai-chat",
- "openai-audio",
- ]:
+ if dataset_class.IS_MULTIMODAL and not (
+ args.backend in ("openai-chat", "openai-audio")
+ or "openai-embeddings-" in args.backend
+ ):
# multi-modal benchmark is only available on OpenAI Chat
# endpoint-type.
raise ValueError(
diff --git a/vllm/benchmarks/lib/endpoint_request_func.py b/vllm/benchmarks/lib/endpoint_request_func.py
index 425a171c3c06..34dce5edb0c7 100644
--- a/vllm/benchmarks/lib/endpoint_request_func.py
+++ b/vllm/benchmarks/lib/endpoint_request_func.py
@@ -10,9 +10,10 @@
import traceback
from collections.abc import Awaitable
from dataclasses import dataclass, field
-from typing import Optional, Protocol, Union
+from typing import Any, Literal, Optional, Protocol, Union
import aiohttp
+import regex as re
from tqdm.asyncio import tqdm
AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=6 * 60 * 60)
@@ -103,6 +104,40 @@ def __call__(
) -> Awaitable[RequestFuncOutput]: ...
+def _validate_api_url(
+ api_url: str,
+ api_name: str,
+ expected_suffixes: Union[str, set[str]],
+) -> None:
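+    """Check that the URL ends with an expected suffix (or "profile")."""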
+ if isinstance(expected_suffixes, str):
+ expected_suffixes = {expected_suffixes}
+
+ expected_suffixes = {*expected_suffixes, "profile"}
+
+ if not api_url.endswith(tuple(expected_suffixes)):
+ raise ValueError(f"{api_name} URL must end with one of: {expected_suffixes}.")
+
+
+def _update_payload_common(
+ payload: dict[str, Any],
+ request_func_input: RequestFuncInput,
+) -> None:
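+    """Apply the shared ignore_eos and extra_body payload fields."""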
+ if request_func_input.ignore_eos:
+ payload["ignore_eos"] = request_func_input.ignore_eos
+ if request_func_input.extra_body:
+ payload.update(request_func_input.extra_body)
+
+
+def _update_headers_common(
+ headers: dict[str, Any],
+ request_func_input: RequestFuncInput,
+) -> None:
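+    """Merge extra headers and the optional x-request-id header."""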
+ if request_func_input.extra_headers:
+ headers |= request_func_input.extra_headers
+ if request_func_input.request_id:
+ headers["x-request-id"] = request_func_input.request_id
+
+
async def async_request_openai_completions(
request_func_input: RequestFuncInput,
session: aiohttp.ClientSession,
@@ -118,9 +153,7 @@ async def async_request_openai_completions(
The output of the request function.
"""
api_url = request_func_input.api_url
- assert api_url.endswith(("completions", "profile")), (
- "OpenAI Completions API URL must end with 'completions' or 'profile'."
- )
+ _validate_api_url(api_url, "OpenAI Completions API", "completions")
payload = {
"model": request_func_input.model_name
@@ -136,15 +169,12 @@ async def async_request_openai_completions(
"include_usage": True,
},
}
- if request_func_input.ignore_eos:
- payload["ignore_eos"] = request_func_input.ignore_eos
- if request_func_input.extra_body:
- payload.update(request_func_input.extra_body)
- headers = {"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"}
- if request_func_input.extra_headers:
- headers |= request_func_input.extra_headers
- if request_func_input.request_id:
- headers["x-request-id"] = request_func_input.request_id
+ _update_payload_common(payload, request_func_input)
+
+ headers = {
+ "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}",
+ }
+ _update_headers_common(headers, request_func_input)
output = RequestFuncOutput()
output.prompt_len = request_func_input.prompt_len
@@ -222,27 +252,41 @@ async def async_request_openai_completions(
return output
-async def async_request_openai_chat_completions(
+def _get_chat_content(
request_func_input: RequestFuncInput,
- session: aiohttp.ClientSession,
- pbar: Optional[tqdm] = None,
-) -> RequestFuncOutput:
- api_url = request_func_input.api_url
- assert api_url.endswith(("chat/completions", "profile")), (
- "OpenAI Chat Completions API URL must end with 'chat/completions'."
- )
+ mm_position: Literal["first", "last"] = "last",
+) -> list[dict[str, Any]]:
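+    """Build chat content, placing multi-modal items first or last."""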
+ text_contents = [{"type": "text", "text": request_func_input.prompt}]
- content = [{"type": "text", "text": request_func_input.prompt}]
+ mm_contents = []
if request_func_input.multi_modal_content:
mm_content = request_func_input.multi_modal_content
if isinstance(mm_content, list):
- content.extend(mm_content)
+ mm_contents.extend(request_func_input.multi_modal_content)
elif isinstance(mm_content, dict):
- content.append(mm_content)
+ mm_contents.append(request_func_input.multi_modal_content)
else:
raise TypeError(
"multi_modal_content must be a dict or list[dict] for openai-chat"
)
+
+ if mm_position == "first":
+ return mm_contents + text_contents
+
+ return text_contents + mm_contents
+
+
+async def async_request_openai_chat_completions(
+ request_func_input: RequestFuncInput,
+ session: aiohttp.ClientSession,
+ pbar: Optional[tqdm] = None,
+ mm_position: Literal["first", "last"] = "last",
+) -> RequestFuncOutput:
+ api_url = request_func_input.api_url
+ _validate_api_url(api_url, "OpenAI Chat Completions API", "chat/completions")
+
+ content = _get_chat_content(request_func_input, mm_position=mm_position)
+
payload = {
"model": request_func_input.model_name
if request_func_input.model_name
@@ -257,18 +301,13 @@ async def async_request_openai_chat_completions(
"include_usage": True,
},
}
- if request_func_input.ignore_eos:
- payload["ignore_eos"] = request_func_input.ignore_eos
- if request_func_input.extra_body:
- payload.update(request_func_input.extra_body)
+ _update_payload_common(payload, request_func_input)
+
headers = {
"Content-Type": "application/json",
"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}",
}
- if request_func_input.extra_headers:
- headers |= request_func_input.extra_headers
- if request_func_input.request_id:
- headers["x-request-id"] = request_func_input.request_id
+ _update_headers_common(headers, request_func_input)
output = RequestFuncOutput()
output.prompt_len = request_func_input.prompt_len
@@ -343,10 +382,7 @@ async def async_request_openai_audio(
import soundfile
api_url = request_func_input.api_url
- assert api_url.endswith(("transcriptions", "translations")), (
- "OpenAI Chat Completions API URL must end with 'transcriptions' "
- )
- "or `translations`."
+ _validate_api_url(api_url, "OpenAI Audio API", {"transcriptions", "translations"})
content = [{"type": "text", "text": request_func_input.prompt}]
payload = {
@@ -361,15 +397,12 @@ async def async_request_openai_audio(
"stream_include_usage": True,
"stream_continuous_usage_stats": True,
}
- if request_func_input.extra_body:
- payload.update(request_func_input.extra_body)
+ _update_payload_common(payload, request_func_input)
+
headers = {
"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}",
}
- if request_func_input.extra_headers:
- headers |= request_func_input.extra_headers
- if request_func_input.request_id:
- headers["x-request-id"] = request_func_input.request_id
+ _update_headers_common(headers, request_func_input)
# Send audio file
def to_bytes(y, sr):
@@ -451,26 +484,13 @@ def to_bytes(y, sr):
return output
-async def async_request_openai_embeddings(
- request_func_input: RequestFuncInput,
+async def _run_openai_embeddings(
session: aiohttp.ClientSession,
+ api_url: str,
+ payload: dict[str, Any],
+ headers: dict[str, Any],
pbar: Optional[tqdm] = None,
-):
- api_url = request_func_input.api_url
- assert api_url.endswith("embeddings"), (
- "OpenAI Embeddings API URL must end with 'embeddings'."
- )
-
- headers = {
- "Content-Type": "application/json",
- "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}",
- }
-
- payload = {
- "model": request_func_input.model,
- "input": request_func_input.prompt,
- }
-
+) -> RequestFuncOutput:
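+    """POST the prepared embeddings payload and time the response."""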
output = RequestFuncOutput()
st = time.perf_counter()
output.start_time = st
@@ -494,6 +514,137 @@ async def async_request_openai_embeddings(
return output
+async def async_request_openai_embeddings(
+ request_func_input: RequestFuncInput,
+ session: aiohttp.ClientSession,
+ pbar: Optional[tqdm] = None,
+) -> RequestFuncOutput:
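+    """Benchmark the Embeddings API with plain text input."""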
+ api_url = request_func_input.api_url
+ _validate_api_url(api_url, "OpenAI Embeddings API", "embeddings")
+
+ payload = {
+ "model": request_func_input.model_name
+ if request_func_input.model_name
+ else request_func_input.model,
+ "input": request_func_input.prompt,
+ }
+ _update_payload_common(payload, request_func_input)
+
+ headers = {
+ "Content-Type": "application/json",
+ "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}",
+ }
+ _update_headers_common(headers, request_func_input)
+
+ return await _run_openai_embeddings(
+ session,
+ api_url,
+ payload=payload,
+ headers=headers,
+ pbar=pbar,
+ )
+
+
+async def async_request_openai_embeddings_chat(
+ request_func_input: RequestFuncInput,
+ session: aiohttp.ClientSession,
+ pbar: Optional[tqdm] = None,
+ mm_position: Literal["first", "last"] = "last",
+) -> RequestFuncOutput:
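+    """Benchmark the Embeddings API with chat-style message content."""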
+ api_url = request_func_input.api_url
+ _validate_api_url(api_url, "OpenAI Embeddings API", "embeddings")
+
+ content = _get_chat_content(request_func_input, mm_position=mm_position)
+
+ payload = {
+ "model": request_func_input.model_name
+ if request_func_input.model_name
+ else request_func_input.model,
+ "messages": [
+ {"role": "user", "content": content},
+ ],
+ }
+ _update_payload_common(payload, request_func_input)
+
+ headers = {
+ "Content-Type": "application/json",
+ "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}",
+ }
+ _update_headers_common(headers, request_func_input)
+
+ return await _run_openai_embeddings(
+ session,
+ api_url,
+ payload=payload,
+ headers=headers,
+ pbar=pbar,
+ )
+
+
+async def async_request_openai_embeddings_clip(
+ request_func_input: RequestFuncInput,
+ session: aiohttp.ClientSession,
+ pbar: Optional[tqdm] = None,
+) -> RequestFuncOutput:
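+    """CLIP: image-only prompt for image inputs; truncate text by default."""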
+ if request_func_input.multi_modal_content:
+ # Image input
+ request_func_input.prompt = ""
+
+ # max_model_len=77 is too short for most datasets,
+ # so by default we truncate the prompt to max_model_len
+ if request_func_input.extra_body is None:
+ request_func_input.extra_body = {}
+ if "truncate_prompt_tokens" not in request_func_input.extra_body:
+ request_func_input.extra_body["truncate_prompt_tokens"] = -1
+
+ return await async_request_openai_embeddings_chat(
+ request_func_input,
+ session,
+ pbar=pbar,
+ )
+
+
+def _try_extract_request_idx(request_func_input: RequestFuncInput):
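+    """Parse a trailing integer index from the request id, if any."""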
+ if request_func_input.request_id:
+ match = re.search(r"(\d+)$", request_func_input.request_id)
+ if match:
+ try:
+ return int(match.group(1))
+ except ValueError:
+ pass
+
+ return None
+
+
+async def async_request_openai_embeddings_vlm2vec(
+ request_func_input: RequestFuncInput,
+ session: aiohttp.ClientSession,
+ pbar: Optional[tqdm] = None,
+) -> RequestFuncOutput:
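+    """VLM2Vec: alternate image-only and text+image instruction prompts."""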
+ if request_func_input.multi_modal_content:
+ request_idx = _try_extract_request_idx(request_func_input)
+
+ # Adjust the ratio manually if needed.
+ use_image_only_prompt = request_idx is None or request_idx % 2 == 0
+
+ if use_image_only_prompt:
+ # Image input
+ request_func_input.prompt = "Represent the given image."
+ else:
+ # Text+Image input
+ request_func_input.prompt = (
+ f"Represent the given image with the following question: "
+ f"{request_func_input.prompt}"
+ )
+
+ return await async_request_openai_embeddings_chat(
+ request_func_input,
+ session,
+ pbar=pbar,
+ mm_position="first",
+ )
+
+
# TODO: Add more request functions for different API protocols.
ASYNC_REQUEST_FUNCS: dict[str, RequestFunc] = {
"vllm": async_request_openai_completions,
@@ -501,6 +652,9 @@ async def async_request_openai_embeddings(
"openai-chat": async_request_openai_chat_completions,
"openai-audio": async_request_openai_audio,
"openai-embeddings": async_request_openai_embeddings,
+ "openai-embeddings-chat": async_request_openai_embeddings_chat,
+ "openai-embeddings-clip": async_request_openai_embeddings_clip,
+ "openai-embeddings-vlm2vec": async_request_openai_embeddings_vlm2vec,
}
OPENAI_COMPATIBLE_BACKENDS = [
diff --git a/vllm/benchmarks/serve.py b/vllm/benchmarks/serve.py
index cad1d2eb2c6a..f061c1479968 100644
--- a/vllm/benchmarks/serve.py
+++ b/vllm/benchmarks/serve.py
@@ -465,6 +465,7 @@ def calculate_metrics(
async def benchmark(
+ task_type: TaskType,
endpoint_type: str,
api_url: str,
base_url: str,
@@ -490,18 +491,10 @@ async def benchmark(
ramp_up_end_rps: Optional[int] = None,
ready_check_timeout_sec: int = 600,
):
- task_type = (
- TaskType.EMBEDDING
- if api_url.endswith("/v1/embeddings")
- else TaskType.GENERATION
- )
- if endpoint_type in ASYNC_REQUEST_FUNCS:
- if task_type == TaskType.EMBEDDING:
- request_func = ASYNC_REQUEST_FUNCS["openai-embeddings"]
- else:
- request_func = ASYNC_REQUEST_FUNCS[endpoint_type]
- else:
- raise ValueError(f"Unknown backend: {endpoint_type}")
+ try:
+ request_func = ASYNC_REQUEST_FUNCS[endpoint_type]
+ except KeyError:
+ raise ValueError(f"Unknown backend: {endpoint_type}") from None
# Reuses connections across requests to reduce TLS handshake overhead.
connector = aiohttp.TCPConnector(
@@ -1310,36 +1303,43 @@ async def main_async(args: argparse.Namespace) -> dict[str, Any]:
input_requests = get_samples(args, tokenizer)
goodput_config_dict = check_goodput_args(args)
+ backend = args.backend
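+    # The backend name determines whether this is an embedding benchmark.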
+ task_type = TaskType.EMBEDDING if "embeddings" in backend else TaskType.GENERATION
+
# Collect the sampling parameters.
- sampling_params = {
- k: v
- for k, v in {
- "top_p": args.top_p,
- "top_k": args.top_k,
- "min_p": args.min_p,
- "temperature": args.temperature,
- "frequency_penalty": args.frequency_penalty,
- "presence_penalty": args.presence_penalty,
- "repetition_penalty": args.repetition_penalty,
- }.items()
- if v is not None
- }
-
- # Sampling parameters are only supported by openai-compatible backend.
- if sampling_params and args.backend not in OPENAI_COMPATIBLE_BACKENDS:
- raise ValueError(
- "Sampling parameters are only supported by openai-compatible backends."
- )
+ if task_type == TaskType.GENERATION:
+ sampling_params = {
+ k: v
+ for k, v in {
+ "top_p": args.top_p,
+ "top_k": args.top_k,
+ "min_p": args.min_p,
+ "temperature": args.temperature,
+ "frequency_penalty": args.frequency_penalty,
+ "presence_penalty": args.presence_penalty,
+ "repetition_penalty": args.repetition_penalty,
+ }.items()
+ if v is not None
+ }
+
+ # Sampling parameters are only supported by openai-compatible backend.
+ if sampling_params and args.backend not in OPENAI_COMPATIBLE_BACKENDS:
+ raise ValueError(
+ "Sampling parameters are only supported by openai-compatible backends."
+ )
- if "temperature" not in sampling_params:
- sampling_params["temperature"] = 0.0 # Default to greedy decoding.
+ if "temperature" not in sampling_params:
+ sampling_params["temperature"] = 0.0 # Default to greedy decoding.
+ else:
+ sampling_params = {}
# Avoid GC processing "static" data - reduce pause times.
gc.collect()
gc.freeze()
benchmark_result = await benchmark(
- endpoint_type=args.backend,
+ task_type=task_type,
+ endpoint_type=backend,
api_url=api_url,
base_url=base_url,
model_id=model_id,
diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py
index 24eac17950fe..930b3bc69c3d 100644
--- a/vllm/entrypoints/chat_utils.py
+++ b/vllm/entrypoints/chat_utils.py
@@ -498,14 +498,14 @@ def resolve_hf_chat_template(
tokenizer_name_or_path=model_config.tokenizer,
)
if path is not None:
- logger.info(
+ logger.info_once(
"Loading chat template fallback for %s as there isn't one "
"defined on HF Hub.",
tokenizer.name_or_path,
)
chat_template = load_chat_template(path)
else:
- logger.debug(
+ logger.debug_once(
"There is no chat template fallback for %s", tokenizer.name_or_path
)