2 changes: 1 addition & 1 deletion ci/lib_search.py
@@ -107,7 +107,7 @@ def check_dir(start_dir):
'net_http.patch',
'partial.patch',
'ovms_drogon_trantor.patch',
'gorila.patch',
'gorilla.patch',
'opencv_cmake_flags.txt',
'ovms-c/dist',
'requirements.txt',
17 changes: 11 additions & 6 deletions demos/code_local_assistant/README.md
@@ -9,8 +9,7 @@ With the rise of AI PC capabilities, hosting own Visual Studio code assistant is
- Intel Meteor Lake, Lunar Lake, Arrow Lake or newer Intel CPU.

## Prepare Code Chat/Edit Model
We need to use medium size model in order to keep 50ms/word for human to feel the chat responsive.
This will work in streaming mode, meaning we will see the chat response/code diff generation slowly roll out in real-time.
We need to use a medium-sized model to get reliable responses while still fitting into the available memory on the host or the discrete GPU.

Download the export script, install its dependencies and create a directory for the models:
```console
@@ -22,10 +21,10 @@ mkdir models

Export `codellama/CodeLlama-7b-Instruct-hf`:
```console
python export_model.py text_generation --source_model codellama/CodeLlama-7b-Instruct-hf --weight-format int4 --config_file_path models/config_all.json --model_repository_path models --target_device NPU --overwrite_models
python export_model.py text_generation --source_model codellama/CodeLlama-7b-Instruct-hf --weight-format int4 --config_file_path models/config_all.json --model_repository_path models --target_device GPU --overwrite_models
```

> **Note:** Use `--target_device GPU` for Intel GPU or omit this parameter to run on Intel CPU
> **Note:** Use `--target_device NPU` for Intel NPU or omit this parameter to run on Intel CPU

## Prepare Code Completion Model
For this task we need a smaller, lighter model that will produce code faster than the chat model does.
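
A lightweight model with fill-in-the-middle support could be exported, for example, like this (an illustrative command only — the model id and settings are assumptions, not necessarily the ones used later in this demo):
```console
python export_model.py text_generation --source_model Qwen/Qwen2.5-Coder-1.5B --weight-format int4 --config_file_path models/config_all.json --model_repository_path models --overwrite_models
```
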
@@ -104,10 +103,16 @@ Please refer to OpenVINO Model Server installation first: [link](../../docs/depl
ovms --rest_port 8000 --config_path ./models/config_all.json
```

### Linux: via Docker
### Linux: via Docker with GPU
```bash
docker run -d --rm --device /dev/dri --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) -u $(id -u):$(id -g) \
-p 8000:8000 -v $(pwd)/:/workspace/ openvino/model_server:latest-gpu --rest_port 8000 --config_path /workspace/models/config_all.json
```

### Linux: via Docker with NPU
```bash
docker run -d --rm --device /dev/accel --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) -u $(id -u):$(id -g) \
-p 8000:8000 -v $(pwd)/:/workspace/ openvino/model_server:2025.2 --rest_port 8000 --config_path /workspace/models/config_all.json
-p 8000:8000 -v $(pwd)/:/workspace/ openvino/model_server:latest-gpu --rest_port 8000 --config_path /workspace/models/config_all.json
```

> **Reviewer comment:** demos/image_generation/README.md also uses 2025.2 ovms version. maybe update there too?
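
Whichever deployment option is used, a quick way to confirm that the models from `config_all.json` were loaded is to query the server configuration endpoint (assuming the default REST port 8000 used above):
```console
curl http://localhost:8000/v1/config
```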

## Set Up Visual Studio Code
42 changes: 31 additions & 11 deletions demos/common/export_models/README.md
@@ -37,10 +37,11 @@ python export_model.py text_generation --help
```
Expected Output:
```console
usage: export_model.py text_generation [-h] [--model_repository_path MODEL_REPOSITORY_PATH] --source_model SOURCE_MODEL [--model_name MODEL_NAME] [--weight-format PRECISION] [--config_file_path CONFIG_FILE_PATH] [--overwrite_models] [--target_device TARGET_DEVICE]
[--ov_cache_dir OV_CACHE_DIR] [--pipeline_type {LM,LM_CB,VLM,VLM_CB,AUTO}] [--kv_cache_precision {u8}] [--extra_quantization_params EXTRA_QUANTIZATION_PARAMS] [--enable_prefix_caching] [--disable_dynamic_split_fuse]
[--max_num_batched_tokens MAX_NUM_BATCHED_TOKENS] [--max_num_seqs MAX_NUM_SEQS] [--cache_size CACHE_SIZE] [--draft_source_model DRAFT_SOURCE_MODEL] [--draft_model_name DRAFT_MODEL_NAME] [--max_prompt_len MAX_PROMPT_LEN] [--prompt_lookup_decoding]
[--tools_model_type {llama3,phi4,hermes3,qwen3}]
usage: export_model.py text_generation [-h] [--model_repository_path MODEL_REPOSITORY_PATH] --source_model SOURCE_MODEL [--model_name MODEL_NAME] [--weight-format PRECISION] [--config_file_path CONFIG_FILE_PATH]
[--overwrite_models] [--target_device TARGET_DEVICE] [--ov_cache_dir OV_CACHE_DIR] [--extra_quantization_params EXTRA_QUANTIZATION_PARAMS] [--pipeline_type {LM,LM_CB,VLM,VLM_CB,AUTO}]
[--kv_cache_precision {u8}] [--enable_prefix_caching] [--disable_dynamic_split_fuse] [--max_num_batched_tokens MAX_NUM_BATCHED_TOKENS] [--max_num_seqs MAX_NUM_SEQS]
[--cache_size CACHE_SIZE] [--draft_source_model DRAFT_SOURCE_MODEL] [--draft_model_name DRAFT_MODEL_NAME] [--max_prompt_len MAX_PROMPT_LEN] [--prompt_lookup_decoding]
[--reasoning_parser {qwen3}] [--tool_parser {llama3,phi4,hermes3,qwen3}] [--enable_tool_guided_generation]

options:
-h, --help show this help message and exit
@@ -59,12 +60,12 @@ options:
CPU, GPU, NPU or HETERO, default is CPU
--ov_cache_dir OV_CACHE_DIR
Folder path for compilation cache to speedup initialization time
--extra_quantization_params EXTRA_QUANTIZATION_PARAMS
Add advanced quantization parameters. Check optimum-intel documentation. Example: "--sym --group-size -1 --ratio 1.0 --awq --scale-estimation --dataset wikitext2"
--pipeline_type {LM,LM_CB,VLM,VLM_CB,AUTO}
Type of the pipeline to be used. AUTO is used by default
--kv_cache_precision {u8}
u8 or empty (model default). Reduced kv cache precision to u8 lowers the cache size consumption.
--extra_quantization_params EXTRA_QUANTIZATION_PARAMS
Add advanced quantization parameters. Check optimum-intel documentation. Example: "--sym --group-size -1 --ratio 1.0 --awq --scale-estimation --dataset wikitext2"
--enable_prefix_caching
This algorithm is used to cache the prompt tokens.
--disable_dynamic_split_fuse
@@ -83,8 +84,12 @@ options:
Sets NPU specific property for maximum number of tokens in the prompt. Not effective if target device is not NPU
--prompt_lookup_decoding
Set pipeline to use prompt lookup decoding
--tools_model_type {llama3,phi4,hermes3,qwen3}
Set the type of model chat template and output parser
--reasoning_parser {qwen3}
Set the type of the reasoning parser for reasoning content extraction
--tool_parser {llama3,phi4,hermes3,qwen3}
Set the type of the tool parser for tool calls extraction
--enable_tool_guided_generation
Enables enforcing tool schema during generation. Requires setting tool_parser
```
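
For illustration, the newly added parser options can be combined with the existing flags like this (a sketch only, not a command taken from the demos — adjust the model and paths to your setup):
```console
python export_model.py text_generation --source_model Qwen/Qwen3-8B --weight-format int4 \
    --config_file_path models/config_all.json --model_repository_path models \
    --reasoning_parser qwen3 --tool_parser qwen3 --enable_tool_guided_generation
```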

## Model Export Examples
@@ -111,19 +116,34 @@ Text generation for NPU target device. Command below sets max allowed prompt siz
```console
python export_model.py text_generation --source_model meta-llama/Llama-3.2-3B-Instruct --config_file_path models/config_all.json --model_repository_path models --target_device NPU --max_prompt_len 2048 --ov_cache_dir ./models/.ov_cache
```
> **Note:** Some models like `mistralai/Mistral-7B-Instruct-v0.3` might fail to export because the task can't be determined automatically. In such a situation it can be set via `--extra_quantization_params`. For example:
```console
python export_model.py text_generation --source_model mistralai/Mistral-7B-Instruct-v0.3 --model_repository_path models --extra_quantization_params "--task text-generation-with-past"
```
> **Note:** The model `microsoft/Phi-3.5-vision-instruct` requires one manual adjustment after export in the file `generation_config.json`, as shown in this [PR](https://huggingface.co/microsoft/Phi-3.5-vision-instruct/discussions/40/files).
It ensures that generation stops after the eos token.

> **Note:** In order to export GPTQ models, you also need to install the `auto_gptq` package via `BUILD_CUDA_EXT=0 pip install auto_gptq` on Linux or `set BUILD_CUDA_EXT=0 && pip install auto_gptq` on Windows.
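
For instance, a GPTQ checkpoint from Hugging Face could then be exported like any other model (an illustrative command — the model id below is only an example):
```console
BUILD_CUDA_EXT=0 pip install auto_gptq
python export_model.py text_generation --source_model TheBloke/Llama-2-7B-Chat-GPTQ --config_file_path models/config_all.json --model_repository_path models
```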


### Embedding Models

#### Embeddings with deployment on a single CPU host:
```console
python export_model.py embeddings --source_model Alibaba-NLP/gte-large-en-v1.5 --weight-format int8 --config_file_path models/config_all.json
python export_model.py embeddings_ov --source_model Alibaba-NLP/gte-large-en-v1.5 --weight-format int8 --config_file_path models/config_all.json
```

#### Embeddings with deployment on a dual CPU host:
```console
python export_model.py embeddings --source_model Alibaba-NLP/gte-large-en-v1.5 --weight-format int8 --config_file_path models/config_all.json --num_streams 2
python export_model.py embeddings_ov --source_model Alibaba-NLP/gte-large-en-v1.5 --weight-format int8 --config_file_path models/config_all.json --num_streams 2
```

#### Embeddings with pooling parameter
```console
python export_model.py embeddings_ov --source_model Qwen/Qwen3-Embedding-0.6B --weight-format fp16 --config_file_path models/config_all.json
```
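
Once the server is started with the generated `config_all.json`, an exported embedding model can be sanity-checked with an OpenAI-style request (a sketch — port 8000 is assumed and the model name must match the one used at export time):
```console
curl http://localhost:8000/v3/embeddings -H "Content-Type: application/json" -d '{"model": "Alibaba-NLP/gte-large-en-v1.5", "input": "hello world"}'
```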


#### With Input Truncation
By default, the embeddings endpoint returns an error when the input exceeds the maximum model context length.
It is possible to change this behavior so that prompts are automatically truncated to fit the model. Add the `--truncate` option to the export command.
@@ -138,7 +158,7 @@ python export_model.py embeddings \

### Reranking Models
```console
python export_model.py rerank \
python export_model.py rerank_ov \
--source_model BAAI/bge-reranker-large \
--weight-format int8 \
--config_file_path models/config_all.json \
17 changes: 12 additions & 5 deletions demos/common/export_models/export_model.py
@@ -52,7 +52,7 @@ def add_common_arguments(parser):
'Not effective if target device is not NPU', dest='max_prompt_len')
parser_text.add_argument('--prompt_lookup_decoding', action='store_true', help='Set pipeline to use prompt lookup decoding', dest='prompt_lookup_decoding')
parser_text.add_argument('--reasoning_parser', choices=["qwen3"], help='Set the type of the reasoning parser for reasoning content extraction', dest='reasoning_parser')
parser_text.add_argument('--tool_parser', choices=["llama3","phi4","hermes3"], help='Set the type of the tool parser for tool calls extraction', dest='tool_parser')
parser_text.add_argument('--tool_parser', choices=["llama3","phi4","hermes3", "qwen3"], help='Set the type of the tool parser for tool calls extraction', dest='tool_parser')
parser_text.add_argument('--enable_tool_guided_generation', action='store_true', help='Enables enforcing tool schema during generation. Requires setting tool_parser', dest='enable_tool_guided_generation')

parser_embeddings = subparsers.add_parser('embeddings', help='[deprecated] export model for embeddings endpoint with models split into separate, versioned directories')
@@ -231,7 +232,8 @@ def add_common_arguments(parser):
reasoning_parser: "{{reasoning_parser}}",{% endif %}
{%- if tool_parser %}
tool_parser: "{{tool_parser}}",{% endif %}
enable_tool_guided_generation: {% if not enable_tool_guided_generation %}false{% else %} true{% endif%},
{%- if enable_tool_guided_generation %}
enable_tool_guided_generation: {% if not enable_tool_guided_generation %}false{% else %} true{% endif%},{% endif %}
}
}
input_stream_handler {
@@ -401,7 +402,12 @@ def export_text_generation_model(model_repository_path, source_model, model_name
task_parameters['extra_quantization_params'] = "--sym --ratio 1.0 --group-size -1"
optimum_command = "optimum-cli export openvino --model {} --weight-format {} {} --trust-remote-code {}".format(source_model, precision, task_parameters['extra_quantization_params'], llm_model_path)
if os.system(optimum_command):
raise ValueError("Failed to export llm model", source_model)
raise ValueError("Failed to export llm model", source_model)
if not (os.path.isfile(os.path.join(llm_model_path, 'openvino_detokenizer.xml'))):
print("Tokenizer and detokenizer not found in the exported model. Exporting tokenizer and detokenizer from HF model")
convert_tokenizer_command = "convert_tokenizer --with-detokenizer -o {} {}".format(llm_model_path, source_model)
if os.system(convert_tokenizer_command):
raise ValueError("Failed to export tokenizer and detokenizer", source_model)
### Export draft model for speculative decoding
draft_source_model = task_parameters.get("draft_source_model", None)
draft_model_dir_name = None
@@ -666,11 +672,12 @@ def export_image_generation_model(model_repository_path, source_model, model_nam
args['draft_model_name'] = args['draft_source_model']
###

if args['extra_quantization_params'] is None:
args['extra_quantization_params'] = ""

template_parameters = {k: v for k, v in args.items() if k not in ['model_repository_path', 'source_model', 'model_name', 'precision', 'version', 'config_file_path', 'overwrite_models']}
print("template params:", template_parameters)

if template_parameters['extra_quantization_params'] is None:
template_parameters['extra_quantization_params'] = ""
if args['task'] == 'text_generation':
export_text_generation_model(args['model_repository_path'], args['source_model'], args['model_name'], args['precision'], template_parameters, args['config_file_path'])

2 changes: 1 addition & 1 deletion demos/common/export_models/requirements.txt
@@ -9,7 +9,7 @@ nncf>=2.11.0
sentence_transformers
sentencepiece==0.2.0
openai
transformers<4.52
transformers<4.53
einops
torchvision
timm==1.0.15
2 changes: 1 addition & 1 deletion demos/continuous_batching/accuracy/README.md
@@ -113,7 +113,7 @@ Use [Berkeley function call leaderboard ](https://github.com/ShishirPatil/gorill
git clone https://github.com/ShishirPatil/gorilla
cd gorilla/berkeley-function-call-leaderboard
git checkout ac37049f00022af54cc44b6aa0cad4402c22d1a0
curl -s https://raw.githubusercontent.com/openvinotoolkit/model_server/refs/heads/agent-accuracy/demos/continuous_batching/accuracy/gorila.patch | git apply -v
curl -s https://raw.githubusercontent.com/openvinotoolkit/model_server/refs/heads/main/demos/continuous_batching/accuracy/gorilla.patch | git apply -v
pip install -e .
```
The commands below assume the model is deployed with the name `openvino-qwen3-8b-int8`. It must match the name set in `bfcl_eval/constants/model_config.py`.
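
With the model served under that name, a run would then be launched through the BFCL CLI roughly as follows (a sketch under the assumption that the `generate`/`evaluate` subcommands and flags of the pinned `bfcl_eval` revision match; check `bfcl --help` before use):
```console
bfcl generate --model openvino-qwen3-8b-int8 --test-category simple
bfcl evaluate --model openvino-qwen3-8b-int8 --test-category simple
```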
18 changes: 15 additions & 3 deletions demos/continuous_batching/agentic_ai/README.md
@@ -1,5 +1,7 @@
# Agentic AI with OpenVINO Model Server {#ovms_demos_continuous_batching_agent}

This demo version requires OVMS version 2025.3. Until that release is published, build it from [source](../../../docs/build_from_source.md).

OpenVINO Model Server can be used to serve language models for AI agents. It supports the use of tools in the context of content generation.
It can be integrated with MCP servers and AI agent frameworks.
You can learn more about [tool calling based on the OpenAI API](https://platform.openai.com/docs/guides/function-calling?api-mode=responses)
@@ -10,10 +12,14 @@ Here are presented required steps to deploy language models trained for tools su
The application employing the OpenAI Agents SDK uses an MCP server. It is equipped with a set of tools providing context for the content generation.
The tools can also be used for automation purposes based on input in text format.



## Export LLM model
Currently supported models:
- Qwen/Qwen3-8B
- Qwen/Qwen3-4B
- meta-llama/Llama-3.1-8B-Instruct
- meta-llama/Llama-3.2-3B-Instruct
- NousResearch/Hermes-3-Llama-3.1-8B
- microsoft/Phi-4-mini-instruct

@@ -23,7 +29,7 @@ The model response with tool call follow a specific syntax which is process by a
Download the export script, install its dependencies and create a directory for the models:
```console
curl https://raw.githubusercontent.com/openvinotoolkit/model_server/refs/heads/main/demos/common/export_models/export_model.py -o export_model.py
pip3 install -r https://raw.githubusercontent.com/openvinotoolkit/model_server/refs/heads/main/demos/common/export_models/requirements.txt
pip3 install -r https://raw.githubusercontent.com/openvinotoolkit/model_server/refs/heads/releases/2025/2/demos/common/export_models/requirements.txt
mkdir models
```

> **Reviewer comment:** if demo requires 2025.3 ovms version shouldn't we use export_models from main?
Run the `export_model.py` script to download and quantize the model:
@@ -47,7 +53,13 @@ python export_model.py text_generation --source_model Qwen/Qwen3-8B --weight-for
::::

You can use similar commands for different models. Change the source_model and the tools_model_type (note that as of today the following types are available: `[phi4, llama3, qwen3, hermes3]`).
> **Note:** The tuned chat template will be copied to the model folder as template.jinja and the response parser will be set in the graph.pbtxt
> **Note:** Some models give more reliable responses with a tuned chat template. Copy a custom template to the model folder as shown below:
```
curl -L -o models/meta-llama/Llama-3.1-8B-Instruct/template.jinja https://raw.githubusercontent.com/vllm-project/vllm/refs/tags/v0.9.0/examples/tool_chat_template_llama3.1_json.jinja
curl -L -o models/meta-llama/Llama-3.2-3B-Instruct/template.jinja https://raw.githubusercontent.com/vllm-project/vllm/refs/tags/v0.9.0/examples/tool_chat_template_llama3.2_json.jinja
curl -L -o models/NousResearch/Hermes-3-Llama-3.1-8B/template.jinja https://raw.githubusercontent.com/vllm-project/vllm/refs/tags/v0.9.0/examples/tool_chat_template_hermes.jinja
curl -L -o models/microsoft/Phi-4-mini-instruct/template.jinja https://raw.githubusercontent.com/vllm-project/vllm/refs/tags/v0.9.0/examples/tool_chat_template_phi4_mini.jinja
```


## Start OVMS
@@ -74,7 +86,7 @@ In case you want to use GPU device to run the generation, add extra docker param
to the `docker run` command and use the image with GPU support. Export the models with a precision matching the GPU capacity and adjust the pipeline configuration.
It can be applied using the commands below:
```bash
docker run -d --rm -p 8000:8000 --device /dev/dri --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) -v $(pwd)/models:/models:ro openvino/model_server:2025.2-gpu \
docker run -d --rm -p 8000:8000 --device /dev/dri --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) -v $(pwd)/models:/models:ro openvino/model_server:latest-gpu \
--rest_port 8000 --model_path /models/Qwen/Qwen3-8B --model_name Qwen/Qwen3-8B
```
:::
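
Once the server is up, tool calling can be exercised directly over the OpenAI-compatible REST API before wiring in the agent framework (a sketch — the tool definition is made up for illustration; port 8000 and the model name from the command above are assumed):
```console
curl http://localhost:8000/v3/chat/completions -H "Content-Type: application/json" -d '{
  "model": "Qwen/Qwen3-8B",
  "messages": [{"role": "user", "content": "What is the weather in Tokyo?"}],
  "tools": [{"type": "function", "function": {"name": "get_weather", "description": "Get the current weather for a city", "parameters": {"type": "object", "properties": {"city": {"type": "string"}}, "required": ["city"]}}]
}'
```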
2 changes: 1 addition & 1 deletion demos/continuous_batching/agentic_ai/openai_agent.py
@@ -117,7 +117,7 @@ def get_model(self, _) -> Model:
agent = Agent(
name="Assistant",
mcp_servers=[fs_server, weather_server],
model_settings=ModelSettings(tool_choice="auto", temperature=0.0),
model_settings=ModelSettings(tool_choice="auto", temperature=0.0, max_tokens=1000, extra_body={"chat_template_kwargs": {"enable_thinking": False}}),
)
loop = asyncio.new_event_loop()
loop.run_until_complete(run(args.query, agent, OVMS_MODEL_PROVIDER, args.stream))
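
With these settings in place, the agent script can be run against the served model, for example (assuming the `--query` and `--stream` flags correspond to the `args.query` and `args.stream` fields used above):
```console
python openai_agent.py --query "What is the current weather in Tokyo?" --stream
```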