
Commit c6ec84e

Add audio endpoints to benchmarking script (#3804) (#3828)
### 🛠 Summary CVS-174708
1 parent b88523c · commit c6ec84e

6 files changed: +205 −21 lines


demos/audio/README.md

Lines changed: 39 additions & 0 deletions
````diff
@@ -102,6 +102,25 @@ print("Generation finished")
 
 Play the speech.wav file to check the generated speech.
 
+## Benchmarking speech generation
+An asynchronous benchmarking client can be used to assess the model server performance under various load conditions. Below are execution examples captured on Intel(R) Core(TM) Ultra 7 258V.
+
+```console
+git clone https://github.com/openvinotoolkit/model_server
+cd model_server/demos/benchmark/v3/
+pip install -r requirements.txt
+python benchmark.py --api_url http://localhost:8122/v3/audio/speech --model microsoft/speecht5_tts --batch_size 1 --limit 100 --request_rate inf --backend text2speech --dataset edinburghcstr/ami --hf-subset 'ihm' --tokenizer openai/whisper-large-v3-turbo --trust-remote-code True
+Number of documents: 100
+100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [01:58<00:00,  1.19s/it]
+Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
+Tokens: 1802
+Success rate: 100.0%. (100/100)
+Throughput - Tokens per second: 15.2
+Mean latency: 63653.98 ms
+Median latency: 66736.83 ms
+Average document length: 18.02 tokens
+```
+
 ## Transcription
 ### Model preparation
 Many variants of Whisper models can be deployed in a single command by using pre-configured models from [OpenVINO HuggingFace organization](https://huggingface.co/collections/OpenVINO/speech-to-text) and used for both the translations and transcriptions endpoints.
````
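As a quick sanity check before running the benchmark above, a single request can be sent to the same endpoint. A minimal sketch using the `requests` library (not part of this commit), assuming the server is listening on port 8122 as in the example and returns raw WAV audio bytes:

```python
import requests

# One text-to-speech request; the JSON payload mirrors what benchmark.py
# sends per request: a model name and a single input string.
resp = requests.post(
    "http://localhost:8122/v3/audio/speech",
    json={"model": "microsoft/speecht5_tts", "input": "Benchmark warm-up sentence."},
    timeout=300,
)
resp.raise_for_status()

# Save the returned audio for playback (WAV output is assumed here).
with open("sample.wav", "wb") as f:
    f.write(resp.content)
```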
````diff
@@ -208,6 +227,26 @@ print(transcript.text)
 The quick brown fox jumped over the lazy dog.
 ```
 :::
+
+## Benchmarking transcription
+An asynchronous benchmarking client can be used to assess the model server performance under various load conditions. Below are execution examples captured on Intel(R) Core(TM) Ultra 7 258V.
+
+```console
+git clone https://github.com/openvinotoolkit/model_server
+cd model_server/demos/benchmark/v3/
+pip install -r requirements.txt
+python benchmark.py --api_url http://localhost:8000/v3/audio/transcriptions --model openai/whisper-large-v3-turbo --batch_size 1 --limit 1000 --request_rate inf --dataset edinburghcstr/ami --hf-subset ihm --backend speech2text --trust-remote-code True
+Number of documents: 1000
+100%|██████████████████████████████████████████████████████████████████████████████| 1000/1000 [04:44<00:00,  3.51it/s]
+Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
+Tokens: 10948
+Success rate: 100.0%. (1000/1000)
+Throughput - Tokens per second: 38.5
+Mean latency: 26670.64 ms
+Median latency: 20772.09 ms
+Average document length: 10.948 tokens
+```
+
 ## Translation
 To test the translations endpoint we first need to prepare an audio file with speech in a language other than English, e.g. Spanish. To generate such a sample we will use a fine-tuned version of the microsoft/speecht5_tts model.
 
````
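The transcription benchmark posts each audio sample as a multipart form. A single request can be sketched the same way (again with `requests`, not part of this commit), assuming the speech.wav file generated earlier in this demo and a JSON response with a `text` field, which is what benchmark.py parses:

```python
import requests

# One transcription request; the form fields mirror benchmark.py:
# 'file' carries the WAV bytes, 'model' carries the model name.
with open("speech.wav", "rb") as f:
    resp = requests.post(
        "http://localhost:8000/v3/audio/transcriptions",
        files={"file": ("speech.wav", f, "audio/wav")},
        data={"model": "openai/whisper-large-v3-turbo"},
        timeout=300,
    )
resp.raise_for_status()
print(resp.json()["text"])
```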
demos/benchmark/embeddings/requirements.txt

Lines changed: 0 additions & 5 deletions
This file was deleted.

demos/benchmark/embeddings/benchmark_embeddings.py renamed to demos/benchmark/v3/benchmark.py

Lines changed: 150 additions & 8 deletions
````diff
@@ -31,14 +31,19 @@
 from transformers import AutoTokenizer
 import argparse
 import aiohttp
+import io
+import soundfile
 
 AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=6 * 60 * 60)
 
 default_url_description = "Default value depends on the backend: \
 ovms-embeddings: http://localhost:8000/v3/embeddings ;\
 ovms_rerank: http://localhost:8000/v3/rerank ;\
 tei_embed: http://localhost:8080/embed ;\
-infinity-embeddings: http://localhost:7997/embeddings"
+infinity-embeddings: http://localhost:7997/embeddings ;\
+text2speech: http://localhost:8000/v3/audio/speech ;\
+speech2text: http://localhost:8000/v3/audio/transcriptions ;\
+translations: http://localhost:8000/v3/audio/translations"
 
 parser = argparse.ArgumentParser(description='Run benchmark for embeddings endpoints', formatter_class=argparse.ArgumentDefaultsHelpFormatter)
 parser.add_argument('--dataset', required=False, default='Cohere/wikipedia-22-12-simple-embeddings', help='Dataset for load generation from HF or a keyword "synthetic"', dest='dataset')
@@ -47,8 +52,12 @@
 parser.add_argument('--model', required=False, default='Alibaba-NLP/gte-large-en-v1.5', help='HF model name', dest='model')
 parser.add_argument('--request_rate', required=False, default='inf', help='Average amount of requests per seconds in random distribution', dest='request_rate')
 parser.add_argument('--batch_size', required=False, type=int, default=16, help='Number of strings in every requests', dest='batch_size')
-parser.add_argument('--backend', required=False, default='ovms-embeddings', choices=['ovms-embeddings','tei-embed','infinity-embeddings','ovms_rerank'], help='Backend serving API type', dest='backend')
+parser.add_argument('--backend', required=False, default='ovms-embeddings', choices=['ovms-embeddings','tei-embed','infinity-embeddings','ovms_rerank','text2speech','speech2text', 'translations'], help='Backend serving API type', dest='backend')
 parser.add_argument('--limit', required=False, type=int, default=1000, help='Number of documents to use in testing', dest='limit')
+parser.add_argument('--split', required=False, default='train', help='Dataset split', dest='split')
+parser.add_argument('--hf-subset', required=False, help='Hf dataset subset', dest='subset')
+parser.add_argument('--trust-remote-code', required=False, type=bool, default=False, help='Trust remote code from huggingface', dest='trust_remote_code')
+parser.add_argument('--tokenizer', required=False, help='HF tokenizer, provide if different than model', dest='tokenizer')
 
 args = vars(parser.parse_args())
 
````
````diff
@@ -61,15 +70,22 @@
     for i in range(args["limit"]):
         docs = docs.add_item({"text":dummy_text})
 else:
-    filter = f"train[:{args['limit']}]"
-    docs = load_dataset(args["dataset"], split=filter)
+    filter = f"{args['split']}[:{args['limit']}]"
+    if args["subset"] == None:
+        docs = load_dataset(args["dataset"], trust_remote_code=args['trust_remote_code'], split=filter)
+    else:
+        docs = load_dataset(args["dataset"], args["subset"], trust_remote_code=args['trust_remote_code'], split=filter)
 
 print("Number of documents:",len(docs))
 
 batch_size = args['batch_size']
 
 def count_tokens(docs, model):
-    tokenizer = AutoTokenizer.from_pretrained(model)
+    if args["tokenizer"] == None:
+        hf_tokenizer = model
+    else:
+        hf_tokenizer = args["tokenizer"]
+    tokenizer = AutoTokenizer.from_pretrained(hf_tokenizer)
     documents = docs.iter(batch_size=1)
     num_tokens = 0
     for request in documents:
````
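For illustration, the new dataset flags map onto the datasets API as follows; a standalone sketch reproducing the AMI configuration from the README examples, where `--hf-subset` becomes the second positional argument of `load_dataset` and `--limit` is applied through split slicing:

```python
from datasets import load_dataset

# Equivalent of:
#   --dataset edinburghcstr/ami --hf-subset ihm --split train
#   --limit 100 --trust-remote-code True
docs = load_dataset(
    "edinburghcstr/ami",
    "ihm",
    split="train[:100]",
    trust_remote_code=True,
)
print("Number of documents:", len(docs))
```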
````diff
@@ -89,12 +105,104 @@ class RequestFuncOutput:
     latency: float = 0.0
     tokens_len: int = 0
     error: str = ""
+    text: str = ""
 
 application_json_headers = {
     "Content-Type": "application/json",
     "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}",
 }
 
+application_multipart_headers = {
+    "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}",
+}
+
+
+async def async_request_text2speech(
+    request_func_input: RequestFuncInput,
+    pbar: Optional[tqdm] = None,
+) -> RequestFuncOutput:
+    api_url = request_func_input.api_url
+
+    async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT, read_bufsize=100000) as session:
+        payload = {
+            "model": request_func_input.model,
+            "input": request_func_input.documents[0],
+        }
+        headers = application_json_headers
+
+        output = RequestFuncOutput()
+        st = time.perf_counter()
+        try:
+            async with session.post(url=api_url, json=payload,
+                                    headers=headers) as response:
+                if response.status == 200:
+                    async for chunk_bytes in response.content:
+                        if not chunk_bytes:
+                            continue
+                        # uncomment for response debugging
+                        # chunk_bytes = chunk_bytes.decode("utf-8")
+                        # data = json.loads(chunk_bytes)
+                        # TBD: saving response to file
+                        timestamp = time.perf_counter()
+                        output.success = True
+                        output.latency = timestamp - st
+                else:
+                    output.error = response.reason or ""
+                    output.success = False
+                    print("ERROR", response.reason)
+
+        except Exception:
+            output.success = False
+            exc_info = sys.exc_info()
+            output.error = "".join(traceback.format_exception(*exc_info))
+
+    if pbar:
+        pbar.update(1)
+    return output
+
+async def async_request_speech2text(
+    request_func_input: RequestFuncInput,
+    pbar: Optional[tqdm] = None,
+) -> RequestFuncOutput:
+    api_url = request_func_input.api_url
+
+    async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT, read_bufsize=100000) as session:
+        headers = application_multipart_headers
+
+        y, sr = request_func_input.documents[0]["array"], request_func_input.documents[0]["sampling_rate"]
+        buffer = io.BytesIO()
+        soundfile.write(buffer, y, sr, format="WAV")
+        buffer.seek(0)
+
+        form = aiohttp.FormData()
+        form.add_field('file', buffer, content_type='audio/wav')
+        form.add_field('model', request_func_input.model)
+        output = RequestFuncOutput()
+        st = time.perf_counter()
+        try:
+            async with session.post(url=api_url, data=form,
+                                    headers=headers) as response:
+                if response.status == 200:
+                    async for chunk_bytes in response.content:
+                        if not chunk_bytes:
+                            continue
+                        timestamp = time.perf_counter()
+                        output.success = True
+                        output.latency = timestamp - st
+                        output.text = chunk_bytes.decode("utf-8")
+                else:
+                    output.error = response.reason or ""
+                    output.success = False
+
+        except Exception:
+            output.success = False
+            exc_info = sys.exc_info()
+            output.error = "".join(traceback.format_exception(*exc_info))
+
+    if pbar:
+        pbar.update(1)
+    return output
+
 async def async_request_embeddings(
     request_func_input: RequestFuncInput,
     pbar: Optional[tqdm] = None,
````
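In the script these coroutines are scheduled by the benchmark loop further down. In isolation, one could be driven roughly as below; treat this as a hypothetical sketch, since `RequestFuncInput` is defined elsewhere in benchmark.py and its field names are only inferred from the attribute accesses above:

```python
import asyncio

async def main():
    # Hypothetical construction; api_url, model and documents are the
    # attributes the request functions actually read.
    request_input = RequestFuncInput(
        api_url="http://localhost:8000/v3/audio/speech",
        model="microsoft/speecht5_tts",
        documents=["Hello from the benchmark."],  # batch_size=1 is enforced later
    )
    output = await async_request_text2speech(request_input)
    print("success:", output.success, "latency [s]:", output.latency)

asyncio.run(main())
```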
````diff
@@ -231,7 +339,10 @@ async def get_request(
 ) -> AsyncGenerator[List[str], None]:
     documents = documents_all.iter(batch_size=batch_size)
     for request in documents:
-        yield request["text"]
+        if args["backend"] == "speech2text" or args["backend"] == "translations":
+            yield request["audio"]
+        else:
+            yield request["text"]
         if request_rate == float("inf"):
             # If the request rate is infinity, then we don't need to wait.
             continue
````
````diff
@@ -260,11 +371,22 @@ async def limited_request_func(request_func_input, pbar):
     outputs: List[RequestFuncOutput] = await asyncio.gather(*tasks)
     benchmark_duration = time.perf_counter() - benchmark_start_time
     pbar.close()
+    if args["backend"] == "speech2text" or args["backend"] == "translations":
+        if args["tokenizer"] == None:
+            hf_tokenizer = model
+        else:
+            hf_tokenizer = args["tokenizer"]
+        tokenizer = AutoTokenizer.from_pretrained(hf_tokenizer)
+        for output in outputs:
+            data = json.loads(output.text)
+            output.tokens_len = len(tokenizer(data['text'],add_special_tokens=False, truncation=True)["input_ids"])
+
     result = {
         "duration": benchmark_duration,
         "errors": [output.error for output in outputs],
         "latencies": [output.latency for output in outputs],
         "successes": [output.success for output in outputs],
+        "token_count": [output.tokens_len for output in outputs],
     }
     return result
 
````
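The `token_count` entries produced here feed the throughput figure. The counting logic can be reproduced standalone; a small sketch assuming the Whisper tokenizer from the README examples:

```python
from transformers import AutoTokenizer

# Same counting scheme as above: no special tokens, truncation enabled.
tokenizer = AutoTokenizer.from_pretrained("openai/whisper-large-v3-turbo")
transcript = "The quick brown fox jumped over the lazy dog."
num_tokens = len(tokenizer(transcript, add_special_tokens=False, truncation=True)["input_ids"])
print(num_tokens)
```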
````diff
@@ -280,6 +402,24 @@ async def limited_request_func(request_func_input, pbar):
 elif args["backend"] == "infinity-embeddings":
     backend_function = async_request_embeddings
     default_api_url = "http://localhost:7997/embeddings"
+elif args["backend"] == "text2speech":
+    if(batch_size != 1):
+        print("ERROR: Only batch_size=1 supported in audio/speech endpoint")
+        exit()
+    backend_function = async_request_text2speech
+    default_api_url = "http://localhost:8000/v3/audio/speech"
+elif args["backend"] == "speech2text":
+    if(batch_size != 1):
+        print("ERROR: Only batch_size=1 supported in audio/transcriptions endpoint")
+        exit()
+    backend_function = async_request_speech2text
+    default_api_url = "http://localhost:8000/v3/audio/transcriptions"
+elif args["backend"] == "translations":
+    if(batch_size != 1):
+        print("ERROR: Only batch_size=1 supported in audio/translations endpoint")
+        exit()
+    backend_function = async_request_speech2text
+    default_api_url = "http://localhost:8000/v3/audio/translations"
 else:
     print("invalid backend")
     exit()
````
````diff
@@ -288,8 +428,10 @@ async def limited_request_func(request_func_input, pbar):
     args["api_url"] = default_api_url
 
 benchmark_results = asyncio.run(benchmark(docs=docs, model=args["model"], api_url=args["api_url"], request_rate=float(args["request_rate"]), backend_function=backend_function))
-
-num_tokens = count_tokens(docs=docs,model=args["model"])
+if args["backend"] == "speech2text" or args["backend"] == "translations":
+    num_tokens = sum(benchmark_results['token_count'])
+else:
+    num_tokens = count_tokens(docs=docs,model=args["model"])
 #print(benchmark_results)
 print("Tokens:",num_tokens)
 print(f"Success rate: {sum(benchmark_results['successes'])/len(benchmark_results['successes'])*100}%. ({sum(benchmark_results['successes'])}/{len(benchmark_results['successes'])})")
````
demos/benchmark/v3/requirements.txt

Lines changed: 8 additions & 0 deletions
````diff
@@ -0,0 +1,8 @@
+datasets==3.6.0
+dataclasses==0.6
+transformers==4.57.3
+numpy==2.3.5
+tqdm==4.67.1
+sentencepiece==0.2.1
+soundfile==0.13.1
+librosa==0.11.0
````

demos/embeddings/README.md

Lines changed: 4 additions & 4 deletions
````diff
@@ -371,9 +371,9 @@ An asynchronous benchmarking client can be used to access the model server perfo
 ```console
 git clone https://github.com/openvinotoolkit/model_server
 pushd .
-cd model_server/demos/benchmark/embeddings/
+cd model_server/demos/benchmark/v3/
 pip install -r requirements.txt
-python benchmark_embeddings.py --api_url http://localhost:8000/v3/embeddings --dataset synthetic --synthetic_length 5 --request_rate 10 --batch_size 1 --model BAAI/bge-large-en-v1.5
+python benchmark.py --api_url http://localhost:8000/v3/embeddings --dataset synthetic --synthetic_length 5 --request_rate 10 --batch_size 1 --model BAAI/bge-large-en-v1.5
 Number of documents: 1000
 100%|████████████████████████████████████████████████████████████████| 1000/1000 [01:44<00:00,  9.56it/s]
 Tokens: 5000
@@ -384,7 +384,7 @@ Median latency: 13.97 ms
 Average document length: 5.0 tokens
 
 
-python benchmark_embeddings.py --api_url http://localhost:8000/v3/embeddings --request_rate inf --batch_size 32 --dataset synthetic --synthetic_length 510 --model BAAI/bge-large-en-v1.5
+python benchmark.py --api_url http://localhost:8000/v3/embeddings --request_rate inf --batch_size 32 --dataset synthetic --synthetic_length 510 --model BAAI/bge-large-en-v1.5
 Number of documents: 1000
 100%|████████████████████████████████████████████████████████████████| 32/32 [00:17<00:00,  1.82it/s]
 Tokens: 510000
@@ -395,7 +395,7 @@ Median latency: 9905.79 ms
 Average document length: 510.0 tokens
 
 
-python benchmark_embeddings.py --api_url http://localhost:8000/v3/embeddings --request_rate inf --batch_size 1 --dataset Cohere/wikipedia-22-12-simple-embeddings --model BAAI/bge-large-en-v1.5
+python benchmark.py --api_url http://localhost:8000/v3/embeddings --request_rate inf --batch_size 1 --dataset Cohere/wikipedia-22-12-simple-embeddings --model BAAI/bge-large-en-v1.5
 Number of documents: 1000
 100%|████████████████████████████████████████████████████████████████| 1000/1000 [00:15<00:00, 64.02it/s]
 Tokens: 83208
````

demos/rerank/README.md

Lines changed: 4 additions & 4 deletions
````diff
@@ -212,9 +212,9 @@ OVMS reranking: [0.9968273 0.0913821]
 
 An asynchronous benchmarking client can be used to assess the model server performance under various load conditions. Below are execution examples captured on dual Intel(R) Xeon(R) CPU Max 9480.
 ```bash
-cd model_server/demos/benchmark/embeddings/
+cd model_server/demos/benchmark/v3/
 pip install -r requirements.txt
-python benchmark_embeddings.py --api_url http://127.0.0.1:8000/v3/rerank --backend ovms_rerank --dataset synthetic --synthetic_length 500 --request_rate inf --batch_size 20 --model BAAI/bge-reranker-large
+python benchmark.py --api_url http://127.0.0.1:8000/v3/rerank --backend ovms_rerank --dataset synthetic --synthetic_length 500 --request_rate inf --batch_size 20 --model BAAI/bge-reranker-large
 Number of documents: 1000
 100%|██████████████████████████████████████| 50/50 [00:19<00:00,  2.53it/s]
 Tokens: 501000
@@ -224,7 +224,7 @@ Mean latency: 10268 ms
 Median latency: 10249 ms
 Average document length: 501.0 tokens
 
-python benchmark_embeddings.py --api_url http://127.0.0.1:8000/v3/rerank --backend ovms_rerank --dataset synthetic --synthetic_length 500 --request_rate inf --batch_size 20 --model BAAI/bge-reranker-large
+python benchmark.py --api_url http://127.0.0.1:8000/v3/rerank --backend ovms_rerank --dataset synthetic --synthetic_length 500 --request_rate inf --batch_size 20 --model BAAI/bge-reranker-large
 Number of documents: 1000
 100%|██████████████████████████████████████| 50/50 [00:19<00:00,  2.53it/s]
 Tokens: 501000
@@ -234,7 +234,7 @@ Mean latency: 10268 ms
 Median latency: 10249 ms
 Average document length: 501.0 tokens
 
-python benchmark_embeddings.py --api_url http://127.0.0.1:8000/v3/rerank --backend ovms_rerank --dataset Cohere/wikipedia-22-12-simple-embeddings --request_rate inf --batch_size 20 --model BAAI/bge-reranker-large
+python benchmark.py --api_url http://127.0.0.1:8000/v3/rerank --backend ovms_rerank --dataset Cohere/wikipedia-22-12-simple-embeddings --request_rate inf --batch_size 20 --model BAAI/bge-reranker-large
 Number of documents: 1000
 100%|██████████████████████████████████████| 50/50 [00:09<00:00,  5.55it/s]
 Tokens: 92248
````
