
Commit 3ecbb14

[Benchmarks] add benchmark for embedding models (#23000)
Signed-off-by: zjy0516 <[email protected]>
1 parent 7d67a9d commit 3ecbb14

3 files changed: 274 additions, 107 deletions


vllm/benchmarks/datasets.py

Lines changed: 45 additions & 22 deletions
@@ -73,7 +73,7 @@ class SampleRequest:
     Represents a single inference request for benchmarking.
     """

-    prompt: Union[str, Any]
+    prompt: Union[str, list[str]]
     prompt_len: int
     expected_output_len: int
     multi_modal_data: Optional[
@@ -409,6 +409,7 @@ def sample(
         range_ratio: float = DEFAULT_RANGE_RATIO,
         input_len: int = DEFAULT_INPUT_LEN,
         output_len: int = DEFAULT_OUTPUT_LEN,
+        batchsize: int = 1,
         **kwargs,
     ) -> list[SampleRequest]:

@@ -439,6 +440,21 @@ def sample(
                     request_id=request_id_prefix + str(i),
                 )
             )
+        # only used for embeddings benchmark.
+        if batchsize > 1:
+            batch_requests = []
+            # Create batched requests
+            for i in range(0, num_requests, batchsize):
+                batch = requests[i : i + batchsize]
+                batch_requests.append(
+                    SampleRequest(
+                        prompt=[req.prompt for req in batch],
+                        prompt_len=sum(req.prompt_len for req in batch),
+                        expected_output_len=0,
+                        request_id=request_id_prefix + str(i // batchsize),
+                    )
+                )
+            requests = batch_requests
         return requests

     def get_prefix(
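
As a toy illustration of the batching branch above (not vLLM code): with 10 sampled prompts and batchsize=4, the slicing loop produces three batched requests holding 4, 4, and 2 prompts, with request ids 0, 1, 2 and expected_output_len fixed to 0.

prompts = [f"prompt-{i}" for i in range(10)]
batchsize = 4

# Same grouping arithmetic as the new branch in RandomDataset.sample
batches = [prompts[i : i + batchsize] for i in range(0, len(prompts), batchsize)]
for batch_id, batch in enumerate(batches):
    # batch_id corresponds to i // batchsize in the diff
    print(batch_id, len(batch), batch)
# -> 0 4 [...]  /  1 4 [...]  /  2 2 [...]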
@@ -475,8 +491,8 @@ def get_sampling_params(
         input_high = math.ceil(real_input_len * (1 + range_ratio))
         output_low = math.floor(output_len * (1 - range_ratio))
         output_high = math.ceil(output_len * (1 + range_ratio))
-        # Ensure the lower bound for output length is at least 1 to
-        # prevent sampling 0 tokens.
+        # Ensure the lower bound for output length is at least 1 to
+        # prevent sampling 0 tokens.
         output_low = max(output_low, 1)

         if input_low > input_high:
@@ -506,7 +522,6 @@ def get_sampling_params(
                                         size=num_requests)
         return input_lens, output_lens, offsets

-
     def generate_token_sequence(
         self,
         *,
@@ -1105,6 +1120,13 @@ def add_dataset_parser(parser: FlexibleArgumentParser):
              "context length sampled from [input_len * (1 - range_ratio), "
              "input_len * (1 + range_ratio)]."),
    )
+    random_group.add_argument(
+        "--random-batch-size",
+        type=int,
+        default=1,
+        help=("Batch size for random sampling. "
+              "Only used for embeddings benchmark."),
+    )

     # random multimodal dataset options
     random_mm_group = parser.add_argument_group(
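
For context, a minimal, self-contained sketch of how the new flag behaves when parsed (the real definition lives in add_dataset_parser; the command-line value here is made up):

import argparse

parser = argparse.ArgumentParser()
parser.add_argument(
    "--random-batch-size",
    type=int,
    default=1,
    help=("Batch size for random sampling. "
          "Only used for embeddings benchmark."),
)

# argparse exposes the dashed flag as args.random_batch_size
args = parser.parse_args(["--random-batch-size", "8"])
print(args.random_batch_size)  # 8; any value > 1 enables the batching branch in sample()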
@@ -1196,8 +1218,6 @@ def normalize(d: dict) -> dict[tuple[int, int, int], float]:
         ),
     )

-
-
     hf_group = parser.add_argument_group("hf dataset options")
     hf_group.add_argument("--hf-subset",
                           type=str,
@@ -1348,29 +1368,32 @@ def get_samples(args, tokenizer) -> list[SampleRequest]:
     else:
         # For datasets that follow a similar structure, use a mapping.
         dataset_mapping = {
-            "sharegpt":
-            lambda: ShareGPTDataset(random_seed=args.seed,
-                                    dataset_path=args.dataset_path).sample(
-                                        tokenizer=tokenizer,
-                                        num_requests=args.num_prompts,
-                                        output_len=args.sharegpt_output_len,
-                                        request_id_prefix=args.request_id_prefix,
-                                    ),
-            "burstgpt":
-            lambda: BurstGPTDataset(random_seed=args.seed,
-                                    dataset_path=args.dataset_path).
-            sample(tokenizer=tokenizer, num_requests=args.num_prompts,
-                   request_id_prefix=args.request_id_prefix,),
-            "random":
-            lambda: RandomDataset(random_seed=args.seed,
-                                  dataset_path=args.dataset_path).sample(
+            "sharegpt": lambda: ShareGPTDataset(
+                random_seed=args.seed, dataset_path=args.dataset_path
+            ).sample(
+                tokenizer=tokenizer,
+                num_requests=args.num_prompts,
+                output_len=args.sharegpt_output_len,
+                request_id_prefix=args.request_id_prefix,
+            ),
+            "burstgpt": lambda: BurstGPTDataset(
+                random_seed=args.seed, dataset_path=args.dataset_path
+            ).sample(
+                tokenizer=tokenizer,
+                num_requests=args.num_prompts,
+                request_id_prefix=args.request_id_prefix,
+            ),
+            "random": lambda: RandomDataset(
+                random_seed=args.seed, dataset_path=args.dataset_path
+            ).sample(
                 tokenizer=tokenizer,
                 num_requests=args.num_prompts,
                 prefix_len=args.random_prefix_len,
                 input_len=args.random_input_len,
                 output_len=args.random_output_len,
                 range_ratio=args.random_range_ratio,
                 request_id_prefix=args.request_id_prefix,
+                batchsize=args.random_batch_size,
             ),
             "random-mm":
             lambda: RandomMultiModalDataset(
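
Putting the pieces together, a direct driver of the new parameter might look roughly like the sketch below; the tokenizer/model name is a placeholder and the sample() arguments omitted here are assumed to have defaults, so treat this as an illustration of the wiring rather than a verified snippet.

from transformers import AutoTokenizer

from vllm.benchmarks.datasets import RandomDataset

# Placeholder tokenizer; any HF tokenizer works for the sketch.
tokenizer = AutoTokenizer.from_pretrained("BAAI/bge-base-en-v1.5")

requests = RandomDataset(random_seed=0, dataset_path=None).sample(
    tokenizer=tokenizer,
    num_requests=16,
    input_len=128,
    range_ratio=0.0,
    batchsize=4,  # mirrors --random-batch-size
)

# 16 single-prompt requests are folded into 4 SampleRequests, each carrying
# a list of 4 prompts destined for a single /v1/embeddings call.
print(len(requests), requests[0].prompt_len)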

vllm/benchmarks/lib/endpoint_request_func.py

Lines changed: 53 additions & 4 deletions
@@ -69,8 +69,8 @@ async def async_request_openai_completions(
     ), "OpenAI Completions API URL must end with 'completions' or 'profile'."

     payload = {
-        "model": request_func_input.model_name \
-        if request_func_input.model_name else request_func_input.model,
+        "model": request_func_input.model_name
+        if request_func_input.model_name else request_func_input.model,
         "prompt": request_func_input.prompt,
         "temperature": 0.0,
         "repetition_penalty": 1.0,
@@ -135,7 +135,7 @@ async def async_request_openai_completions(
                         # Decoding phase
                         else:
                             output.itl.append(timestamp -
-                                              most_recent_timestamp)
+                                               most_recent_timestamp)

                         most_recent_timestamp = timestamp
                         generated_text += text or ""
@@ -254,7 +254,7 @@ async def async_request_openai_chat_completions(
                         # Decoding phase
                         else:
                             output.itl.append(timestamp -
-                                              most_recent_timestamp)
+                                               most_recent_timestamp)

                         generated_text += content or ""
                 elif usage := data.get("usage"):
@@ -394,12 +394,61 @@ def to_bytes(y, sr):
     return output


+async def async_request_openai_embeddings(
+    request_func_input: RequestFuncInput,
+    session: aiohttp.ClientSession,
+    pbar: Optional[tqdm] = None,
+):
+    api_url = request_func_input.api_url
+    assert api_url.endswith(
+        "embeddings"
+    ), "OpenAI Embeddings API URL must end with 'embeddings'."
+
+    headers = {
+        "Content-Type": "application/json",
+        "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}",
+    }
+
+    payload = {
+        "model": request_func_input.model,
+        "input": request_func_input.prompt,
+    }
+
+    output = RequestFuncOutput()
+    st = time.perf_counter()
+    try:
+        async with session.post(
+            url=api_url,
+            headers=headers,
+            json=payload
+        ) as response:
+            if response.status == 200:
+                output.latency = time.perf_counter() - st
+                data = await response.json()
+                output.success = True
+                output.generated_text = ""
+                output.prompt_len = data.get(
+                    "usage", {}).get(
+                        "prompt_tokens", 0)
+            else:
+                output.success = False
+                output.error = response.reason or ""
+    except Exception as e:
+        output.success = False
+        output.error = str(e)
+
+    if pbar:
+        pbar.update(1)
+    return output
+
+
 # TODO: Add more request functions for different API protocols.
 ASYNC_REQUEST_FUNCS = {
     "vllm": async_request_openai_completions,
     "openai": async_request_openai_completions,
     "openai-chat": async_request_openai_chat_completions,
     "openai-audio": async_request_openai_audio,
+    "openai-embeddings": async_request_openai_embeddings,
 }

 OPENAI_COMPATIBLE_BACKENDS = [
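
For reference, the request shape issued by async_request_openai_embeddings can be reproduced outside the benchmark with a few lines of aiohttp. This standalone sketch assumes an OpenAI-compatible server (e.g. a local vLLM instance) listening on localhost:8000 with an embedding model loaded; the URL, model name, and inputs are placeholders.

import asyncio
import time

import aiohttp


async def main():
    api_url = "http://localhost:8000/v1/embeddings"  # must end with "embeddings"
    payload = {
        "model": "BAAI/bge-base-en-v1.5",             # placeholder model name
        "input": ["first prompt", "second prompt"],   # a batched SampleRequest.prompt
    }
    async with aiohttp.ClientSession() as session:
        st = time.perf_counter()
        async with session.post(api_url, json=payload) as resp:
            data = await resp.json()
            latency = time.perf_counter() - st
            # The benchmark records latency and the server-reported prompt token count.
            print(resp.status, round(latency, 4),
                  data.get("usage", {}).get("prompt_tokens", 0))


if __name__ == "__main__":
    asyncio.run(main())

With the registry entry above, the serving benchmark reaches this code path when the openai-embeddings backend is selected.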
