2 changes: 1 addition & 1 deletion ci/lib_search.py
@@ -107,7 +107,7 @@ def check_dir(start_dir):
'net_http.patch',
'partial.patch',
'ovms_drogon_trantor.patch',
'gorila.patch',
'gorilla.patch',
'opencv_cmake_flags.txt',
'ovms-c/dist',
'requirements.txt',
17 changes: 11 additions & 6 deletions demos/code_local_assistant/README.md
@@ -9,8 +9,7 @@ With the rise of AI PC capabilities, hosting own Visual Studio code assistant is
- Intel Meteor Lake, Lunar Lake, Arrow Lake or newer Intel CPU.

## Prepare Code Chat/Edit Model
We need to use medium size model in order to keep 50ms/word for human to feel the chat responsive.
This will work in streaming mode, meaning we will see the chat response/code diff generation slowly roll out in real-time.
We need to use a medium-size model to get reliable responses while still fitting into the available memory on the host or the discrete GPU.

Download the export script, install its dependencies and create a directory for the models:
```console
@@ -22,10 +21,10 @@ mkdir models

Export `codellama/CodeLlama-7b-Instruct-hf`:
```console
python export_model.py text_generation --source_model codellama/CodeLlama-7b-Instruct-hf --weight-format int4 --config_file_path models/config_all.json --model_repository_path models --target_device NPU --overwrite_models
python export_model.py text_generation --source_model codellama/CodeLlama-7b-Instruct-hf --weight-format int4 --config_file_path models/config_all.json --model_repository_path models --target_device GPU --overwrite_models
```

> **Note:** Use `--target_device GPU` for Intel GPU or omit this parameter to run on Intel CPU
> **Note:** Use `--target_device NPU` for Intel NPU or omit this parameter to run on Intel CPU

## Prepare Code Completion Model
For this task we need a smaller, lighter model that can produce code faster than the chat task.
@@ -104,10 +103,16 @@ Please refer to OpenVINO Model Server installation first: [link](../../docs/depl
ovms --rest_port 8000 --config_path ./models/config_all.json
```

### Linux: via Docker
### Linux: via Docker with GPU
```bash
docker run -d --rm --device /dev/dri --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) -u $(id -u):$(id -g) \
-p 8000:8000 -v $(pwd)/:/workspace/ openvino/model_server:latest-gpu --rest_port 8000 --config_path /workspace/models/config_all.json
```

### Linux: via Docker with NPU
```bash
docker run -d --rm --device /dev/accel --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) -u $(id -u):$(id -g) \
-p 8000:8000 -v $(pwd)/:/workspace/ openvino/model_server:2025.2 --rest_port 8000 --config_path /workspace/models/config_all.json
-p 8000:8000 -v $(pwd)/:/workspace/ openvino/model_server:latest-gpu --rest_port 8000 --config_path /workspace/models/config_all.json
# Reviewer comment (Collaborator): demos/image_generation/README.md also uses 2025.2 ovms version. maybe update there too?
```

## Set Up Visual Studio Code
18 changes: 15 additions & 3 deletions demos/common/export_models/README.md
@@ -111,19 +111,31 @@ Text generation for NPU target device. Command below sets max allowed prompt siz
```console
python export_model.py text_generation --source_model meta-llama/Llama-3.2-3B-Instruct --config_file_path models/config_all.json --model_repository_path models --target_device NPU --max_prompt_len 2048 --ov_cache_dir ./models/.ov_cache
```
> **Note:** Some models, like `mistralai/Mistral-7B-Instruct-v0.3`, might fail to export because the task can't be determined automatically. In such a situation it can be set explicitly in `--extra_quantization_params`. For example:
```console
python export_model.py text_generation --source_model mistralai/Mistral-7B-Instruct-v0.3 --model_repository_path models --extra_quantization_params "--task text-generation-with-past"
```
> **Note:** The model `microsoft/Phi-3.5-vision-instruct` requires one manual adjustment after export in the file `generation_config.json`, as shown in the [PR](https://huggingface.co/microsoft/Phi-3.5-vision-instruct/discussions/40/files).
It ensures that generation stops after the eos token.
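For orientation only, the authoritative change is the linked PR; the adjustment amounts to extending `eos_token_id` in the exported `generation_config.json`. A minimal sketch of how it could be scripted, assuming the model was exported under `models/microsoft/Phi-3.5-vision-instruct` (the token id below is a placeholder, not a verified value):
```python
import json

# Hypothetical path and token id for illustration only -- take the real values
# from the linked Hugging Face PR after exporting the model.
config_path = "models/microsoft/Phi-3.5-vision-instruct/generation_config.json"
END_OF_TURN_TOKEN_ID = 32007  # placeholder, not a verified value

with open(config_path) as f:
    cfg = json.load(f)

# Normalize eos_token_id to a list and append the end-of-turn token if missing.
eos = cfg.get("eos_token_id", [])
eos = eos if isinstance(eos, list) else [eos]
if END_OF_TURN_TOKEN_ID not in eos:
    eos.append(END_OF_TURN_TOKEN_ID)
cfg["eos_token_id"] = eos

with open(config_path, "w") as f:
    json.dump(cfg, f, indent=2)
```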

### Embedding Models

#### Embeddings with deployment on a single CPU host:
```console
python export_model.py embeddings --source_model Alibaba-NLP/gte-large-en-v1.5 --weight-format int8 --config_file_path models/config_all.json
python export_model.py embeddings_ov --source_model Alibaba-NLP/gte-large-en-v1.5 --weight-format int8 --config_file_path models/config_all.json
```

#### Embeddings with deployment on a dual CPU host:
```console
python export_model.py embeddings --source_model Alibaba-NLP/gte-large-en-v1.5 --weight-format int8 --config_file_path models/config_all.json --num_streams 2
python export_model.py embeddings_ov --source_model Alibaba-NLP/gte-large-en-v1.5 --weight-format int8 --config_file_path models/config_all.json --num_streams 2
```

#### Embeddings with pooling parameter
```console
python export_model.py embeddings_ov --source_model Qwen/Qwen3-Embedding-0.6B --weight-format fp16 --config_file_path models/config_all.json
```
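Once the server is running with this configuration (the demos in this repository expose the REST API on port 8000), the deployed embeddings model can be queried through the OpenAI-compatible endpoint. A minimal sketch, assuming the model keeps the source model name and the server listens on `localhost:8000`:
```python
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v3", api_key="unused")

response = client.embeddings.create(
    model="Alibaba-NLP/gte-large-en-v1.5",
    input=["OpenVINO Model Server exposes an OpenAI-compatible embeddings endpoint."],
)
print(len(response.data[0].embedding))  # dimensionality of the returned embedding vector
```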


#### With Input Truncation
By default, the embeddings endpoint returns an error when the input exceeds the maximum model context length.
It is possible to change this behavior so that prompts are automatically truncated to fit the model. Add the `--truncate` option to the export command.
@@ -138,7 +150,7 @@ python export_model.py embeddings \

### Reranking Models
```console
python export_model.py rerank \
python export_model.py rerank_ov \
--source_model BAAI/bge-reranker-large \
--weight-format int8 \
--config_file_path models/config_all.json \
10 changes: 8 additions & 2 deletions demos/common/export_models/export_model.py
@@ -231,7 +231,8 @@ def add_common_arguments(parser):
reasoning_parser: "{{reasoning_parser}}",{% endif %}
{%- if tool_parser %}
tool_parser: "{{tool_parser}}",{% endif %}
enable_tool_guided_generation: {% if not enable_tool_guided_generation %}false{% else %} true{% endif%},
{%- if enable_tool_guided_generation %}
enable_tool_guided_generation: {% if not enable_tool_guided_generation %}false{% else %} true{% endif%},{% endif %}
}
}
input_stream_handler {
@@ -401,7 +402,12 @@ def export_text_generation_model(model_repository_path, source_model, model_name
task_parameters['extra_quantization_params'] = "--sym --ratio 1.0 --group-size -1"
optimum_command = "optimum-cli export openvino --model {} --weight-format {} {} --trust-remote-code {}".format(source_model, precision, task_parameters['extra_quantization_params'], llm_model_path)
if os.system(optimum_command):
raise ValueError("Failed to export llm model", source_model)
raise ValueError("Failed to export llm model", source_model)
if not (os.path.isfile(os.path.join(llm_model_path, 'openvino_detokenizer.xml'))):
print("Tokenizer and detokenizer not found in the exported model. Exporting tokenizer and detokenizer from HF model")
convert_tokenizer_command = "convert_tokenizer --with-detokenizer -o {} {}".format(llm_model_path, source_model)
if os.system(convert_tokenizer_command):
raise ValueError("Failed to export tokenizer and detokenizer", source_model)
### Export draft model for speculative decoding
draft_source_model = task_parameters.get("draft_source_model", None)
draft_model_dir_name = None
2 changes: 1 addition & 1 deletion demos/continuous_batching/accuracy/README.md
@@ -113,7 +113,7 @@ Use [Berkeley function call leaderboard ](https://github.com/ShishirPatil/gorill
git clone https://github.com/ShishirPatil/gorilla
cd gorilla/berkeley-function-call-leaderboard
git checkout ac37049f00022af54cc44b6aa0cad4402c22d1a0
curl -s https://raw.githubusercontent.com/openvinotoolkit/model_server/refs/heads/agent-accuracy/demos/continuous_batching/accuracy/gorila.patch | git apply -v
curl -s https://raw.githubusercontent.com/openvinotoolkit/model_server/refs/heads/main/demos/continuous_batching/accuracy/gorilla.patch | git apply -v
pip install -e .
```
The commands below assume the model is deployed with the name `openvino-qwen3-8b-int8`. It must match the name set in `bfcl_eval/constants/model_config.py`.
18 changes: 15 additions & 3 deletions demos/continuous_batching/agentic_ai/README.md
@@ -1,5 +1,7 @@
# Agentic AI with OpenVINO Model Server {#ovms_demos_continuous_batching_agent}

This demo requires OVMS version 2025.3. Until that release is published, build it from [source](../../../docs/build_from_source.md).

OpenVINO Model Server can be used to serve language models for AI Agents. It supports the usage of tools in the context of content generation.
It can be integrated with MCP servers and AI agent frameworks.
You can learn more about [tools calling based on OpenAI API](https://platform.openai.com/docs/guides/function-calling?api-mode=responses)
@@ -10,10 +12,14 @@ Here are presented required steps to deploy language models trained for tools su
The application, built with the OpenAI Agents SDK, uses MCP servers. It is equipped with a set of tools that provide context for the content generation.
The tools can also be used for automation purposes based on text input.
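For reference, tool calling at the API level does not require the agent framework; below is a minimal sketch of a direct chat-completions request with a tool definition. The `get_weather` tool and its schema are illustrative only (they are not part of this demo's MCP servers), and the model name assumes the `Qwen/Qwen3-8B` export described below with the server listening on port 8000:
```python
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v3", api_key="unused")

# Illustrative tool schema; replace with the tools actually exposed to the agent.
tools = [{
    "type": "function",
    "function": {
        "name": "get_weather",
        "description": "Return the current weather for a city",
        "parameters": {
            "type": "object",
            "properties": {"city": {"type": "string"}},
            "required": ["city"],
        },
    },
}]

response = client.chat.completions.create(
    model="Qwen/Qwen3-8B",
    messages=[{"role": "user", "content": "What is the weather in Gdansk?"}],
    tools=tools,
    tool_choice="auto",
)
print(response.choices[0].message.tool_calls)
```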



## Export LLM model
Currently supported models:
- Qwen/Qwen3-8B
- Qwen/Qwen3-4B
- meta-llama/Llama-3.1-8B-Instruct
- meta-llama/Llama-3.2-3B-Instruct
- NousResearch/Hermes-3-Llama-3.1-8B
- microsoft/Phi-4-mini-instruct

@@ -23,7 +29,7 @@ The model response with a tool call follows a specific syntax which is processed by a
Download the export script, install its dependencies and create a directory for the models:
```console
curl https://raw.githubusercontent.com/openvinotoolkit/model_server/refs/heads/main/demos/common/export_models/export_model.py -o export_model.py
pip3 install -r https://raw.githubusercontent.com/openvinotoolkit/model_server/refs/heads/main/demos/common/export_models/requirements.txt
pip3 install -r https://raw.githubusercontent.com/openvinotoolkit/model_server/refs/heads/releases/2025/2/demos/common/export_models/requirements.txt
# Reviewer comment (Collaborator): if demo requires 2025.3 ovms version shouldn't we use export_models from main?
mkdir models
```
Run `export_model.py` script to download and quantize the model:
@@ -47,7 +53,13 @@ python export_model.py text_generation --source_model Qwen/Qwen3-8B --weight-for
::::

You can use similar commands for different models. Change the `source_model` and the `tools_model_type` (note that as of today the following types are available: `[phi4, llama3, qwen3, hermes3]`).
> **Note:** The tuned chat template will be copied to the model folder as template.jinja and the response parser will be set in the graph.pbtxt
> **Note:** Some models give more reliable responses with a tuned chat template. Copy a custom template to the model folder as shown below:
```console
curl -L -o models/meta-llama/Llama-3.1-8B-Instruct/template.jinja https://raw.githubusercontent.com/vllm-project/vllm/refs/tags/v0.9.0/examples/tool_chat_template_llama3.1_json.jinja
curl -L -o models/meta-llama/Llama-3.2-3B-Instruct/template.jinja https://raw.githubusercontent.com/vllm-project/vllm/refs/tags/v0.9.0/examples/tool_chat_template_llama3.2_json.jinja
curl -L -o models/NousResearch/Hermes-3-Llama-3.1-8B/template.jinja https://raw.githubusercontent.com/vllm-project/vllm/refs/tags/v0.9.0/examples/tool_chat_template_hermes.jinja
curl -L -o models/microsoft/Phi-4-mini-instruct/template.jinja https://raw.githubusercontent.com/vllm-project/vllm/refs/tags/v0.9.0/examples/tool_chat_template_phi4_mini.jinja
```


## Start OVMS
@@ -74,7 +86,7 @@ In case you want to use GPU device to run the generation, add extra docker param
to `docker run` command, use the image with GPU support. Export the models with precision matching the GPU capacity and adjust pipeline configuration.
It can be applied using the commands below:
```bash
docker run -d --rm -p 8000:8000 --device /dev/dri --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) -v $(pwd)/models:/models:ro openvino/model_server:2025.2-gpu \
docker run -d --rm -p 8000:8000 --device /dev/dri --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) -v $(pwd)/models:/models:ro openvino/model_server:latest-gpu \
--rest_port 8000 --model_path /models/Qwen/Qwen3-8B --model_name Qwen/Qwen3-8B
```
:::
2 changes: 1 addition & 1 deletion demos/continuous_batching/agentic_ai/openai_agent.py
@@ -117,7 +117,7 @@ def get_model(self, _) -> Model:
agent = Agent(
name="Assistant",
mcp_servers=[fs_server, weather_server],
model_settings=ModelSettings(tool_choice="auto", temperature=0.0),
model_settings=ModelSettings(tool_choice="auto", temperature=0.0, max_tokens=1000, extra_body={"chat_template_kwargs": {"enable_thinking": False}}),
)
loop = asyncio.new_event_loop()
loop.run_until_complete(run(args.query, agent, OVMS_MODEL_PROVIDER, args.stream))
13 changes: 7 additions & 6 deletions demos/continuous_batching/scaling/README.md
@@ -137,10 +137,10 @@ python export_model.py text_generation --source_model meta-llama/Meta-Llama-3-8B
```
Start the Model Server instances:
```bash
docker run --device /dev/dri/renderD128 -d --rm -p 8003:8003 -u 0 -v $(pwd)/models/Meta-Llama-3-8B-Instruct_INT4:/model:ro openvino/model_server:latest --rest_port 8003 --model_name meta-llama/Meta-Llama-3-8B-Instruct --model_path /model
docker run --device /dev/dri/renderD129 -d --rm -p 8004:8004 -u 0 -v $(pwd)/models/Meta-Llama-3-8B-Instruct_INT4:/model:ro openvino/model_server:latest --rest_port 8004 --model_name meta-llama/Meta-Llama-3-8B-Instruct --model_path /model
docker run --device /dev/dri/renderD130 -d --rm -p 8005:8005 -u 0 -v $(pwd)/models/Meta-Llama-3-8B-Instruct_INT4:/model:ro openvino/model_server:latest --rest_port 8005 --model_name meta-llama/Meta-Llama-3-8B-Instruct --model_path /model
docker run --device /dev/dri/renderD131 -d --rm -p 8006:8006 -u 0 -v $(pwd)/models/Meta-Llama-3-8B-Instruct_INT4:/model:ro openvino/model_server:latest --rest_port 8006 --model_name meta-llama/Meta-Llama-3-8B-Instruct --model_path /model
docker run --device /dev/dri/renderD128 -d --rm -p 8003:8003 -u 0 -v $(pwd)/models/Meta-Llama-3-8B-Instruct_INT4:/model:ro openvino/model_server:latest-gpu --rest_port 8003 --model_name meta-llama/Meta-Llama-3-8B-Instruct --model_path /model
docker run --device /dev/dri/renderD129 -d --rm -p 8004:8004 -u 0 -v $(pwd)/models/Meta-Llama-3-8B-Instruct_INT4:/model:ro openvino/model_server:latest-gpu --rest_port 8004 --model_name meta-llama/Meta-Llama-3-8B-Instruct --model_path /model
docker run --device /dev/dri/renderD130 -d --rm -p 8005:8005 -u 0 -v $(pwd)/models/Meta-Llama-3-8B-Instruct_INT4:/model:ro openvino/model_server:latest-gpu --rest_port 8005 --model_name meta-llama/Meta-Llama-3-8B-Instruct --model_path /model
docker run --device /dev/dri/renderD131 -d --rm -p 8006:8006 -u 0 -v $(pwd)/models/Meta-Llama-3-8B-Instruct_INT4:/model:ro openvino/model_server:latest-gpu --rest_port 8006 --model_name meta-llama/Meta-Llama-3-8B-Instruct --model_path /model
```
Check the logs to confirm that the containers loaded the models successfully.

@@ -211,11 +211,12 @@ Continuous batching with Multi GPU configuration will be added soon.

Export the model:
```bash
python export_model.py text_generation --source_model deepseek-ai/DeepSeek-R1-Distill-Qwen-32B --model_name DeepSeek-R1-Distill-Qwen-32B_INT4 --weight-format int4 --model_repository_path models --target_device HETERO:GPU.0,GPU.1 --pipeline_type LM
python export_model.py text_generation --source_model deepseek-ai/DeepSeek-R1-Distill-Qwen-32B --model_name DeepSeek-R1-Distill-Qwen-32B_INT4 --weight-format int4 --model_repository_path models --target_device HETERO:GPU.0,GPU.1 --pipeline_type LM_CB
```
> **Note**: Using the pipeline type `LM_CB`, which includes continuous batching, requires OVMS version 2025.3. Until that release is published, build it from source.

```bash
docker run --device /dev/dri -d --rm -p 8000:8000 -u 0 -v $(pwd)/models/DeepSeek-R1-Distill-Qwen-32B_INT4:/model:ro openvino/model_server:latest --rest_port 8000 --model_name deepseek-ai/DeepSeek-R1-Distill-Qwen-32B --model_path /model
docker run --device /dev/dri -d --rm -p 8000:8000 -u 0 -v $(pwd)/models/DeepSeek-R1-Distill-Qwen-32B_INT4:/model:ro openvino/model_server:latest-gpu --rest_port 8000 --model_name deepseek-ai/DeepSeek-R1-Distill-Qwen-32B --model_path /model
```

### Testing the scalability
63 changes: 33 additions & 30 deletions demos/continuous_batching/speculative_decoding/README.md
@@ -141,40 +141,43 @@ Models used in this demo - `meta-llama/CodeLlama-7b-hf` and `AMD-Llama-135m` are

Below you can see an example request (switch the `stream` parameter to toggle a streamed response). Compared to calls to a regular continuous batching model, this request has an additional parameter, `num_assistant_tokens`, which specifies how many tokens the draft model should generate before the main model validates them.


```console
curl http://localhost:8000/v3/completions \
-H "Content-Type: application/json" \
-d '{
"model": "meta-llama/CodeLlama-7b-hf",
"temperature": 0,
"max_tokens":100,
"stream":false,
"prompt": "<s>def quicksort(numbers):",
"num_assistant_tokens": 5
}'| jq .
pip3 install openai
```
```python
from openai import OpenAI

client = OpenAI(
    base_url="http://localhost:8000/v3",
    api_key="unused"
)

stream = client.completions.create(
    model="meta-llama/CodeLlama-7b-hf",
    prompt="<s>def quicksort(numbers):",
    temperature=0,
    max_tokens=100,
    extra_body={"num_assistant_tokens": 5},
    stream=True,
)
for chunk in stream:
    if chunk.choices[0].text is not None:
        print(chunk.choices[0].text, end="", flush=True)
```
```json
{
"choices": [
{
"finish_reason": "length",
"index": 0,
"logprobs": null,
"text": "\n if len(numbers) <= 1:\n return numbers\n else:\n pivot = numbers[0]\n lesser = [x for x in numbers[1:] if x <= pivot]\n greater = [x for x in numbers[1:] if x > pivot]\n return quicksort(lesser) + [pivot] + quicksort(greater)\n\n\ndef quicksort_recursive(numbers):\n if"
}
],
"created": 1737547359,
"model": "meta-llama/CodeLlama-7b-hf-sd",
"object": "text_completion",
"usage": {
"prompt_tokens": 9,
"completion_tokens": 100,
"total_tokens": 109
}
}

```

```
if len(numbers) <= 1:
return numbers
else:
pivot = numbers[0]
lesser = [x for x in numbers[1:] if x <= pivot]
greater = [x for x in numbers[1:] if x > pivot]
return quicksort(lesser) + [pivot] + quicksort(greater)

def quicksort_recursive(numbers):
if
```


A high value of `num_assistant_tokens` pays off when the tokens generated by the draft model mostly match the main model's output. If they don't, the tokens are dropped and both models do extra work. For low values this risk is lower, but the potential performance boost is also limited. A value of `5` is usually a good compromise.
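A quick way to check where that compromise lands for a given model pair is to time the same prompt with a few candidate values; a simple sketch reusing the client setup from the example above:
```python
import time
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v3", api_key="unused")

# Time identical requests with different draft lengths to see which pays off.
for num_assistant_tokens in (1, 5, 10):
    start = time.perf_counter()
    client.completions.create(
        model="meta-llama/CodeLlama-7b-hf",
        prompt="<s>def quicksort(numbers):",
        temperature=0,
        max_tokens=100,
        extra_body={"num_assistant_tokens": num_assistant_tokens},
    )
    elapsed = time.perf_counter() - start
    print(f"num_assistant_tokens={num_assistant_tokens}: {elapsed:.2f}s for 100 tokens")
```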
