Skip to content

Commit 45b2eac

Browse files
dtrawins and ngrozae authored
add procedure for testing accuracy for agentic use case (#3434)
--------- Co-authored-by: ngrozae <[email protected]>
1 parent 146712d commit 45b2eac

File tree

3 files changed

+141
-10
lines changed

3 files changed

+141
-10
lines changed

ci/lib_search.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -107,6 +107,7 @@ def check_dir(start_dir):
107107
'net_http.patch',
108108
'partial.patch',
109109
'ovms_drogon_trantor.patch',
110+
'gorila.patch',
110111
'opencv_cmake_flags.txt',
111112
'ovms-c/dist',
112113
'requirements.txt',

demos/continuous_batching/accuracy/README.md

Lines changed: 35 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,8 @@ pip3 install -U -r demos/common/export_models/requirements.txt
2020
mkdir models
2121
python demos/common/export_models/export_model.py text_generation --source_model meta-llama/Meta-Llama-3-8B-Instruct --weight-format fp16 --kv_cache_precision u8 --config_file_path models/config.json --model_repository_path models
2222
python demos/common/export_models/export_model.py text_generation --source_model meta-llama/Meta-Llama-3-8B --weight-format fp16 --kv_cache_precision u8 --config_file_path models/config.json --model_repository_path models
23-
python demos/common/export_models/export_model.py text_generation --source_model OpenGVLab/InternVL2_5-8B --weight-format fp16 --config_file_path models/config.json --model_repository_path models
23+
python demos/common/export_models/export_model.py text_generation --source_model OpenGVLab/InternVL2_5-8B --weight-format fp16 --config_file_path models/config.json --model_repository_path models
24+
python demos/common/export_models/export_model.py text_generation --source_model Qwen/Qwen3-8B --model_name openvino-qwen3-8b-int8 --weight-format int8 --config_file_path models/config.json --model_repository_path models --tools_model_type qwen3 --overwrite_models --enable_prefix_caching
2425
```
2526

2627
## Starting the model server
@@ -71,16 +72,14 @@ lm-eval --model local-completions --tasks wikitext --model_args model=meta-llama
7172

7273
## Running the tests for VLM models
7374

74-
7575
Use [lmms-eval project](https://github.com/EvolvingLMMs-Lab/lmms-eval) - mme and mmmu_val tasks.
7676

77-
7877
```bash
79-
export OPENAI_COMPATIBLE_API_URL=http://localhost:8000/v3
80-
export OPENAI_COMPATIBLE_API_KEY="unused"
78+
export OPENAI_BASE_URL=http://localhost:8000/v3
79+
export OPENAI_API_KEY="unused"
8180
git clone https://github.com/EvolvingLMMs-Lab/lmms-eval
8281
cd lmms-eval
83-
git checkout 4471ad311e620ed6cf3a0419d8ba6f18f8fb1cb3 # https://github.com/EvolvingLMMs-Lab/lmms-eval/issues/625
82+
git checkout f64dfa5fd063e989a0a665d2fd0615df23888c83
8483
pip install -e . --extra-index-url "https://download.pytorch.org/whl/cpu"
8584
python -m lmms_eval \
8685
--model openai_compatible \
@@ -92,11 +91,9 @@ python -m lmms_eval \
9291
--output_path ./logs
9392
```
9493

94+
**Results example:**
9595

96-
### 5. Results
97-
98-
Results:
99-
```
96+
```text
10097
openai_compatible (model_version=OpenGVLab/InternVL2_5-8B,max_retries=1), gen_kwargs: (), limit: None, num_fewshot: None, batch_size: 1
10198
| Tasks |Version|Filter|n-shot| Metric | | Value | |Stderr|
10299
|--------|-------|------|-----:|--------------------|---|--------:|---|------|
@@ -107,7 +104,35 @@ openai_compatible (model_version=OpenGVLab/InternVL2_5-8B,max_retries=1), gen_kw
107104
```
108105

109106

107+
## Running the tests for agentic models with function calls
108+
109+
Use the [Berkeley function call leaderboard](https://github.com/ShishirPatil/gorilla/tree/main/berkeley-function-call-leaderboard)
110+
111+
112+
```bash
113+
git clone https://github.com/ShishirPatil/gorilla
114+
cd gorilla/berkeley-function-call-leaderboard
115+
git checkout ac37049f00022af54cc44b6aa0cad4402c22d1a0
116+
curl -s https://raw.githubusercontent.com/openvinotoolkit/model_server/refs/heads/agent-accuracy/demos/continuous_batching/accuracy/gorila.patch | git apply -v
117+
pip install -e .
118+
```
119+
The commands below assume the model is deployed with the name `openvino-qwen3-8b-int8`. It must match the name set in `bfcl_eval/constants/model_config.py`.
120+
```bash
121+
export OPENAI_BASE_URL=http://localhost:8000/v3
122+
bfcl generate --model openvino-qwen3-8b-int8-FC --test-category multiple --num-threads 100 -o
123+
bfcl evaluate --model openvino-qwen3-8b-int8-FC
124+
```
125+
126+
**Analyzing results**
127+
The output artifacts will be stored in `result` and `score`. For example:
128+
129+
```text
130+
cat score/openvino-qwen3-8b-int8-FC/BFCL_v3_simple_score.json | head -1
131+
{"accuracy": 0.95, "correct_count": 380, "total_count": 400}
132+
```
133+
Those results can be compared with the reference from the [Berkeley leaderboard](https://gorilla.cs.berkeley.edu/leaderboard.html#leaderboard).
110134

135+
---
111136

112137
> **Note:** The same procedure can be used to validate the vLLM component. The only needed change would be updating the base_url, replacing `/v3/` with `/v1/`.
113138
Lines changed: 105 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,105 @@
1+
diff --git a/berkeley-function-call-leaderboard/bfcl_eval/constants/model_config.py b/berkeley-function-call-leaderboard/bfcl_eval/constants/model_config.py
2+
index db41f84..9200637 100644
3+
--- a/berkeley-function-call-leaderboard/bfcl_eval/constants/model_config.py
4+
+++ b/berkeley-function-call-leaderboard/bfcl_eval/constants/model_config.py
5+
@@ -863,7 +863,7 @@ api_inference_model_map = {
6+
input_price=None,
7+
output_price=None,
8+
is_fc_model=True,
9+
- underscore_to_dot=True,
10+
+ underscore_to_dot=True,
11+
),
12+
"qwen3-0.6b": ModelConfig(
13+
model_name="qwen3-0.6b",
14+
@@ -1930,6 +1930,78 @@ third_party_inference_model_map = {
15+
is_fc_model=False,
16+
underscore_to_dot=False,
17+
),
18+
+ "openvino-qwen3-8b-int8-FC": ModelConfig(
19+
+ model_name="openvino-qwen3-8b-int8-FC",
20+
+ display_name="openvino-qwen3-8b-int8-FC",
21+
+ url="https://huggingface.co/Qwen/Qwen3-8B",
22+
+ org="OpenAI",
23+
+ license="apache-2.0",
24+
+ model_handler=OpenAIHandler,
25+
+ input_price=None,
26+
+ output_price=None,
27+
+ is_fc_model=True,
28+
+ underscore_to_dot=True,
29+
+ ),
30+
+ "openvino-qwen3-8b-int4-FC": ModelConfig(
31+
+ model_name="ovms-qwen3-8b-int4-FC",
32+
+ display_name="ovms-qwen3-8b-int4-FC",
33+
+ url="https://huggingface.co/Qwen/Qwen3-8B",
34+
+ org="OpenAI",
35+
+ license="apache-2.0",
36+
+ model_handler=OpenAIHandler,
37+
+ input_price=None,
38+
+ output_price=None,
39+
+ is_fc_model=True,
40+
+ underscore_to_dot=True,
41+
+ ),
42+
+ "openvino-qwen3-4b-int8-FC": ModelConfig(
43+
+ model_name="openvino-qwen3-4b-int8-FC",
44+
+ display_name="openvino-qwen3-4b-int8-FC",
45+
+ url="https://huggingface.co/Qwen/Qwen3-4B",
46+
+ org="OpenAI",
47+
+ license="apache-2.0",
48+
+ model_handler=OpenAIHandler,
49+
+ input_price=None,
50+
+ output_price=None,
51+
+ is_fc_model=True,
52+
+ underscore_to_dot=True,
53+
+ ),
54+
+ "openvino-qwen3-4b-int4-FC": ModelConfig(
55+
+ model_name="openvino-qwen3-4b-int4-FC",
56+
+ display_name="openvino-qwen3-4b-int4-FC",
57+
+ url="https://huggingface.co/Qwen/Qwen3-4B",
58+
+ org="OpenAI",
59+
+ license="apache-2.0",
60+
+ model_handler=OpenAIHandler,
61+
+ input_price=None,
62+
+ output_price=None,
63+
+ is_fc_model=True,
64+
+ underscore_to_dot=True,
65+
+ ),
66+
+ "openvino-phi-4-mini-instruct-int8-FC": ModelConfig(
67+
+ model_name="openvino-phi-4-mini-instruct-int8-FC",
68+
+ display_name="openvino-phi-4-mini-instruct-int8-FC",
69+
+ url="https://huggingface.co/microsoft/phi4-mini-instruct",
70+
+ org="OpenAI",
71+
+ license="apache-2.0",
72+
+ model_handler=OpenAIHandler,
73+
+ input_price=None,
74+
+ output_price=None,
75+
+ is_fc_model=True,
76+
+ underscore_to_dot=True,
77+
+ ),
78+
+ "openvino-phi-4-mini-instruct-int4-FC": ModelConfig(
79+
+ model_name="openvino-phi-4-mini-instruct-int4-FC",
80+
+ display_name="openvino-phi-4-mini-instruct-int4-FC",
81+
+ url="https://huggingface.co/microsoft/phi4-mini-instruct",
82+
+ org="OpenAI",
83+
+ license="apache-2.0",
84+
+ model_handler=OpenAIHandler,
85+
+ input_price=None,
86+
+ output_price=None,
87+
+ is_fc_model=True,
88+
+ underscore_to_dot=True,
89+
+ ),
90+
}
91+
92+
93+
diff --git a/berkeley-function-call-leaderboard/bfcl_eval/model_handler/api_inference/openai.py b/berkeley-function-call-leaderboard/bfcl_eval/model_handler/api_inference/openai.py
94+
index 656efc2..a1345a1 100644
95+
--- a/berkeley-function-call-leaderboard/bfcl_eval/model_handler/api_inference/openai.py
96+
+++ b/berkeley-function-call-leaderboard/bfcl_eval/model_handler/api_inference/openai.py
97+
@@ -22,7 +22,7 @@ class OpenAIHandler(BaseHandler):
98+
def __init__(self, model_name, temperature) -> None:
99+
super().__init__(model_name, temperature)
100+
self.model_style = ModelStyle.OpenAI
101+
- self.client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
102+
+ self.client = OpenAI(api_key=os.getenv("OPENAI_API_KEY","unused"), base_url=os.getenv("OPENAI_BASE_URL","http://localhost:8000"))
103+
104+
def decode_ast(self, result, language="Python"):
105+
if "FC" in self.model_name or self.is_fc_model:

0 commit comments

Comments
 (0)