Skip to content

Commit 45b2eac

Browse files
dtrawins and ngrozae authored
add procedure for testing accuracy for agentic use case (#3434)
--------- Co-authored-by: ngrozae <[email protected]>
1 parent 146712d commit 45b2eac

File tree

3 files changed

+141
-10
lines changed

3 files changed

+141
-10
lines changed

ci/lib_search.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -107,6 +107,7 @@ def check_dir(start_dir):
107107
'net_http.patch',
108108
'partial.patch',
109109
'ovms_drogon_trantor.patch',
110+
'gorila.patch',
110111
'opencv_cmake_flags.txt',
111112
'ovms-c/dist',
112113
'requirements.txt',

demos/continuous_batching/accuracy/README.md

Lines changed: 35 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,8 @@ pip3 install -U -r demos/common/export_models/requirements.txt
2020
mkdir models
2121
python demos/common/export_models/export_model.py text_generation --source_model meta-llama/Meta-Llama-3-8B-Instruct --weight-format fp16 --kv_cache_precision u8 --config_file_path models/config.json --model_repository_path models
2222
python demos/common/export_models/export_model.py text_generation --source_model meta-llama/Meta-Llama-3-8B --weight-format fp16 --kv_cache_precision u8 --config_file_path models/config.json --model_repository_path models
23-
python demos/common/export_models/export_model.py text_generation --source_model OpenGVLab/InternVL2_5-8B --weight-format fp16 --config_file_path models/config.json --model_repository_path models
23+
python demos/common/export_models/export_model.py text_generation --source_model OpenGVLab/InternVL2_5-8B --weight-format fp16 --config_file_path models/config.json --model_repository_path models
24+
python demos/common/export_models/export_model.py text_generation --source_model Qwen/Qwen3-8B --model_name openvino-qwen3-8b-int8 --weight-format int8 --config_file_path models/config.json --model_repository_path models --tools_model_type qwen3 --overwrite_models --enable_prefix_caching
2425
```
2526

2627
## Starting the model server
@@ -71,16 +72,14 @@ lm-eval --model local-completions --tasks wikitext --model_args model=meta-llama
7172

7273
## Running the tests for VLM models
7374

74-
7575
Use [lmms-eval project](https://github.com/EvolvingLMMs-Lab/lmms-eval) - mme and mmmu_val tasks.
7676

77-
7877
```bash
79-
export OPENAI_COMPATIBLE_API_URL=http://localhost:8000/v3
80-
export OPENAI_COMPATIBLE_API_KEY="unused"
78+
export OPENAI_BASE_URL=http://localhost:8000/v3
79+
export OPENAI_API_KEY="unused"
8180
git clone https://github.com/EvolvingLMMs-Lab/lmms-eval
8281
cd lmms-eval
83-
git checkout 4471ad311e620ed6cf3a0419d8ba6f18f8fb1cb3 # https://github.com/EvolvingLMMs-Lab/lmms-eval/issues/625
82+
git checkout f64dfa5fd063e989a0a665d2fd0615df23888c83
8483
pip install -e . --extra-index-url "https://download.pytorch.org/whl/cpu"
8584
python -m lmms_eval \
8685
--model openai_compatible \
@@ -92,11 +91,9 @@ python -m lmms_eval \
9291
--output_path ./logs
9392
```
9493

94+
**Results example:**
9595

96-
### 5. Results
97-
98-
Results:
99-
```
96+
```text
10097
openai_compatible (model_version=OpenGVLab/InternVL2_5-8B,max_retries=1), gen_kwargs: (), limit: None, num_fewshot: None, batch_size: 1
10198
| Tasks |Version|Filter|n-shot| Metric | | Value | |Stderr|
10299
|--------|-------|------|-----:|--------------------|---|--------:|---|------|
@@ -107,7 +104,35 @@ openai_compatible (model_version=OpenGVLab/InternVL2_5-8B,max_retries=1), gen_kw
107104
```
108105

109106

107+
## Running the tests for agentic models with function calls
108+
109+
Use the [Berkeley function call leaderboard](https://github.com/ShishirPatil/gorilla/tree/main/berkeley-function-call-leaderboard)
110+
111+
112+
```bash
113+
git clone https://github.com/ShishirPatil/gorilla
114+
cd gorilla/berkeley-function-call-leaderboard
115+
git checkout ac37049f00022af54cc44b6aa0cad4402c22d1a0
116+
curl -s https://raw.githubusercontent.com/openvinotoolkit/model_server/refs/heads/agent-accuracy/demos/continuous_batching/accuracy/gorila.patch | git apply -v
117+
pip install -e .
118+
```
119+
The commands below assume the model is deployed with the name `openvino-qwen3-8b-int8`. It must match the name set in `bfcl_eval/constants/model_config.py`.
120+
```bash
121+
export OPENAI_BASE_URL=http://localhost:8000/v3
122+
bfcl generate --model openvino-qwen3-8b-int8-FC --test-category multiple --num-threads 100 -o
123+
bfcl evaluate --model openvino-qwen3-8b-int8-FC
124+
```
125+
126+
**Analyzing results**
127+
The output artifacts will be stored in `result` and `score`. For example:
128+
129+
```text
130+
cat score/openvino-qwen3-8b-int8-FC/BFCL_v3_simple_score.json | head -1
131+
{"accuracy": 0.95, "correct_count": 380, "total_count": 400}
132+
```
133+
Those results can be compared with the reference from the [Berkeley leaderboard](https://gorilla.cs.berkeley.edu/leaderboard.html#leaderboard).
110134

135+
---
111136

112137
> **Note:** The same procedure can be used to validate the vLLM component. The only needed change would be updating the base_url, replacing `/v3/` with `/v1/`.
113138
Lines changed: 105 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,105 @@
1+
diff --git a/berkeley-function-call-leaderboard/bfcl_eval/constants/model_config.py b/berkeley-function-call-leaderboard/bfcl_eval/constants/model_config.py
2+
index db41f84..9200637 100644
3+
--- a/berkeley-function-call-leaderboard/bfcl_eval/constants/model_config.py
4+
+++ b/berkeley-function-call-leaderboard/bfcl_eval/constants/model_config.py
5+
@@ -863,7 +863,7 @@ api_inference_model_map = {
6+
input_price=None,
7+
output_price=None,
8+
is_fc_model=True,
9+
- underscore_to_dot=True,
10+
+ underscore_to_dot=True,
11+
),
12+
"qwen3-0.6b": ModelConfig(
13+
model_name="qwen3-0.6b",
14+
@@ -1930,6 +1930,78 @@ third_party_inference_model_map = {
15+
is_fc_model=False,
16+
underscore_to_dot=False,
17+
),
18+
+ "openvino-qwen3-8b-int8-FC": ModelConfig(
19+
+ model_name="openvino-qwen3-8b-int8-FC",
20+
+ display_name="openvino-qwen3-8b-int8-FC",
21+
+ url="https://huggingface.co/Qwen/Qwen3-8B",
22+
+ org="OpenAI",
23+
+ license="apache-2.0",
24+
+ model_handler=OpenAIHandler,
25+
+ input_price=None,
26+
+ output_price=None,
27+
+ is_fc_model=True,
28+
+ underscore_to_dot=True,
29+
+ ),
30+
+ "openvino-qwen3-8b-int4-FC": ModelConfig(
31+
+ model_name="ovms-qwen3-8b-int4-FC",
32+
+ display_name="ovms-qwen3-8b-int4-FC",
33+
+ url="https://huggingface.co/Qwen/Qwen3-8B",
34+
+ org="OpenAI",
35+
+ license="apache-2.0",
36+
+ model_handler=OpenAIHandler,
37+
+ input_price=None,
38+
+ output_price=None,
39+
+ is_fc_model=True,
40+
+ underscore_to_dot=True,
41+
+ ),
42+
+ "openvino-qwen3-4b-int8-FC": ModelConfig(
43+
+ model_name="openvino-qwen3-4b-int8-FC",
44+
+ display_name="openvino-qwen3-4b-int8-FC",
45+
+ url="https://huggingface.co/Qwen/Qwen3-4B",
46+
+ org="OpenAI",
47+
+ license="apache-2.0",
48+
+ model_handler=OpenAIHandler,
49+
+ input_price=None,
50+
+ output_price=None,
51+
+ is_fc_model=True,
52+
+ underscore_to_dot=True,
53+
+ ),
54+
+ "openvino-qwen3-4b-int4-FC": ModelConfig(
55+
+ model_name="openvino-qwen3-4b-int4-FC",
56+
+ display_name="openvino-qwen3-4b-int4-FC",
57+
+ url="https://huggingface.co/Qwen/Qwen3-4B",
58+
+ org="OpenAI",
59+
+ license="apache-2.0",
60+
+ model_handler=OpenAIHandler,
61+
+ input_price=None,
62+
+ output_price=None,
63+
+ is_fc_model=True,
64+
+ underscore_to_dot=True,
65+
+ ),
66+
+ "openvino-phi-4-mini-instruct-int8-FC": ModelConfig(
67+
+ model_name="openvino-phi-4-mini-instruct-int8-FC",
68+
+ display_name="openvino-phi-4-mini-instruct-int8-FC",
69+
+ url="https://huggingface.co/microsoft/phi4-mini-instruct",
70+
+ org="OpenAI",
71+
+ license="apache-2.0",
72+
+ model_handler=OpenAIHandler,
73+
+ input_price=None,
74+
+ output_price=None,
75+
+ is_fc_model=True,
76+
+ underscore_to_dot=True,
77+
+ ),
78+
+ "openvino-phi-4-mini-instruct-int4-FC": ModelConfig(
79+
+ model_name="openvino-phi-4-mini-instruct-int4-FC",
80+
+ display_name="openvino-phi-4-mini-instruct-int4-FC",
81+
+ url="https://huggingface.co/microsoft/phi4-mini-instruct",
82+
+ org="OpenAI",
83+
+ license="apache-2.0",
84+
+ model_handler=OpenAIHandler,
85+
+ input_price=None,
86+
+ output_price=None,
87+
+ is_fc_model=True,
88+
+ underscore_to_dot=True,
89+
+ ),
90+
}
91+
92+
93+
diff --git a/berkeley-function-call-leaderboard/bfcl_eval/model_handler/api_inference/openai.py b/berkeley-function-call-leaderboard/bfcl_eval/model_handler/api_inference/openai.py
94+
index 656efc2..a1345a1 100644
95+
--- a/berkeley-function-call-leaderboard/bfcl_eval/model_handler/api_inference/openai.py
96+
+++ b/berkeley-function-call-leaderboard/bfcl_eval/model_handler/api_inference/openai.py
97+
@@ -22,7 +22,7 @@ class OpenAIHandler(BaseHandler):
98+
def __init__(self, model_name, temperature) -> None:
99+
super().__init__(model_name, temperature)
100+
self.model_style = ModelStyle.OpenAI
101+
- self.client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
102+
+ self.client = OpenAI(api_key=os.getenv("OPENAI_API_KEY","unused"), base_url=os.getenv("OPENAI_BASE_URL","http://localhost:8000"))
103+
104+
def decode_ast(self, result, language="Python"):
105+
if "FC" in self.model_name or self.is_fc_model:

0 commit comments

Comments
 (0)