#!/bin/bash
# Qwen3-VL Evaluation Script with SGLang Backend
# This script demonstrates how to evaluate Qwen3-VL models using SGLang for accelerated inference
#
# Requirements:
# - sglang>=0.4.6
# - qwen-vl-utils
# - CUDA-enabled GPU(s)
#
# Installation:
# uv add "sglang[all]" qwen-vl-utils
# OR
# pip install "sglang[all]>=0.4.6" qwen-vl-utils
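#
# Optional sanity check before a long run (assumes the packages above
# installed cleanly):
#   python -c "import sglang; print(sglang.__version__)"
#   python -c "import qwen_vl_utils"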
# ============================================================================
# Configuration
# ============================================================================
# Model Configuration
# Available Qwen3-VL models:
# - Qwen/Qwen3-VL-30B-A3B-Instruct
# - Qwen/Qwen3-VL-30B-A3B-Thinking
# - Qwen/Qwen3-VL-235B-A22B-Instruct
# - Qwen/Qwen3-VL-235B-A22B-Thinking
MODEL="Qwen/Qwen3-VL-30B-A3B-Instruct"
# Parallelization Settings
# Adjust based on your GPU configuration
TENSOR_PARALLEL_SIZE=4 # Number of GPUs for tensor parallelism (tp_size in SGLang)
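# If you prefer to size this automatically, one option (assumes nvidia-smi
# is on PATH and that all visible GPUs should be used) is:
# TENSOR_PARALLEL_SIZE=$(nvidia-smi -L | wc -l)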
# Memory and Performance Settings
GPU_MEMORY_UTILIZATION=0.85 # mem_fraction_static in SGLang (0.0 - 1.0)
BATCH_SIZE=64 # Batch size for evaluation
# SGLang Specific Settings
MAX_PIXELS=1605632 # Maximum pixels for image processing (28*28*2048)
MIN_PIXELS=784 # Minimum pixels (28x28)
MAX_FRAME_NUM=32 # Maximum number of video frames
THREADS=16 # Number of threads for decoding visuals
# Task Configuration
# Common tasks: mmmu_val, mme, mathvista, ai2d, etc.
TASKS="mmmu_val,mme"
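# To list every registered task (assuming the CLI follows the usual
# lm-eval-harness convention):
# uv run python -m lmms_eval --tasks list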
# Output Configuration
OUTPUT_PATH="./logs/qwen3vl_sglang"
LOG_SAMPLES=true
LOG_SUFFIX="qwen3vl_sglang"
# Evaluation Limits (optional)
# LIMIT=100 # Uncomment to limit number of samples (for testing)
# ============================================================================
# Environment Configuration
# ============================================================================
export HF_HOME="${HF_HOME:-$HOME/.cache/huggingface}"
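# For gated or private checkpoints, also export an access token (not needed
# for the public Qwen3-VL weights):
# export HF_TOKEN="<your-token>"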
# ============================================================================
# EXAMPLE 1: Basic SGLang Usage (Without MCP Tools)
# ============================================================================
# This is the standard evaluation without tool calling support.
# The model will process image/video queries and return responses directly.
#
# Key Parameters:
# - model: The model identifier
# - tensor_parallel_size: Number of GPUs for tensor parallelism
# - gpu_memory_utilization: GPU memory fraction to use
# - max_pixels/min_pixels: Image resolution constraints
# - max_frame_num: Maximum frames for video processing
# - threads: Thread count for visual processing
echo "=========================================="
echo "Qwen3-VL Evaluation with SGLang"
echo "=========================================="
echo "Model: $MODEL"
echo "Tensor Parallel Size: $TENSOR_PARALLEL_SIZE"
echo "Tasks: $TASKS"
echo "Batch Size: $BATCH_SIZE"
echo "Max Pixels: $MAX_PIXELS"
echo "Output Path: $OUTPUT_PATH"
echo "=========================================="
# Build the command
CMD="uv run python -m lmms_eval \
--model sglang \
--model_args model=${MODEL},tensor_parallel_size=${TENSOR_PARALLEL_SIZE},gpu_memory_utilization=${GPU_MEMORY_UTILIZATION},max_pixels=${MAX_PIXELS},min_pixels=${MIN_PIXELS},max_frame_num=${MAX_FRAME_NUM},threads=${THREADS} \
--tasks ${TASKS} \
--batch_size ${BATCH_SIZE} \
--output_path ${OUTPUT_PATH}"
# Add optional arguments
if [ "$LOG_SAMPLES" = true ]; then
CMD="$CMD --log_samples --log_samples_suffix ${LOG_SUFFIX}"
fi
if [ ! -z "$LIMIT" ]; then
CMD="$CMD --limit ${LIMIT}"
fi
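# For a quick smoke test, invoke this script with LIMIT set, e.g.:
#   LIMIT=8 bash sglang_qwen3vl.sh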
# Execute
echo "Running command:"
echo "$CMD"
echo ""
eval "$CMD"
echo ""
echo "=========================================="
echo "Evaluation Complete!"
echo "Results saved to: $OUTPUT_PATH"
echo "=========================================="
# ============================================================================
# EXAMPLE 2: SGLang with MCP Client Tools (Tool-Enabled Evaluation)
# ============================================================================
# This example demonstrates how to enable the MCP (Model Context Protocol)
# client for tool-calling support with SGLang.
#
# IMPORTANT: Before running this, you need to:
# 1. Create an MCP server that exposes tools (e.g., image processing, web search)
# 2. Implement the server as a Python script that defines the tools
#    (see the sketch below)
# 3. Pass the path to the server script via the mcp_server_path parameter
#
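# Minimal MCP server sketch (illustrative only: the tool name and logic are
# hypothetical, and this assumes the official `mcp` Python SDK with its
# FastMCP helper is installed):
# ─────────────────────────────────────────────
# # mcp_server.py
# from mcp.server.fastmcp import FastMCP
#
# mcp = FastMCP("image-tools")
#
# @mcp.tool()
# def image_size(path: str) -> str:
#     """Return the width and height of an image file."""
#     from PIL import Image  # assumes Pillow is installed
#     with Image.open(path) as im:
#         return f"{im.width}x{im.height}"
#
# if __name__ == "__main__":
#     mcp.run()  # serves the tools over stdio by default
#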
# How MCP Tool Calling Works with SGLang:
# ─────────────────────────────────────────
# 1. User sends a request with a question
# 2. SGLang processes the message and generates text
# 3. The function_call_parser detects if tool calls are in the generated text
#    (finish_reason == "tool_calls")
# 4. If tool calls are detected:
#    a. Parse the tool call function name and arguments from generated text
#    b. Retrieve the tool definition from MCPClient
#    c. Execute the tool via MCPClient.run_tool(tool_name, arguments)
#    d. Convert the tool result to OpenAI-compatible format
#    e. Append the tool result to the conversation as {"role": "tool", ...}
#    f. Generate the next response with the updated context (up to max_turn times)
# 5. Continue until the model produces final text or max_turn is reached
#
# Tool Calling Loop in Code (simplified, from sglang.py):
# ──────────────────────────────────────────────
# while keep_rolling and turn_count < max_turn:
#     output = await self.client.async_generate(...)
#     if function_call_parser.has_tool_call(output["text"]):
#         tool_calls = function_call_parser.parse_non_stream(output["text"])
#         for tool_call in tool_calls:
#             result = await self.mcp_client.run_tool(tool_call.name, args)
#             # Convert result to OpenAI format
#             tool_messages.append({"role": "tool", "name": tool_call.name, "content": result})
#         messages.append(assistant_response)
#         messages.extend(tool_messages)
#         # Prepare next input for model with tool results
#     turn_count += 1
#
# Example with MCP tools enabled:
# (Uncomment the following lines to run with an MCP server)
#
# # Path to MCP server implementation
# MCP_SERVER_PATH="/path/to/mcp_server.py"
# WORK_DIR="/tmp/sglang_mcp_work"
#
# CMD="uv run python -m lmms_eval \
# --model sglang \
# --model_args model=${MODEL},tensor_parallel_size=${TENSOR_PARALLEL_SIZE},gpu_memory_utilization=${GPU_MEMORY_UTILIZATION},max_pixels=${MAX_PIXELS},min_pixels=${MIN_PIXELS},max_frame_num=${MAX_FRAME_NUM},threads=${THREADS},mcp_server_path=${MCP_SERVER_PATH},work_dir=${WORK_DIR},max_turn=5 \
# --tasks ${TASKS} \
# --batch_size 1 \
# --output_path ${OUTPUT_PATH}_with_mcp \
# --log_samples --log_samples_suffix ${LOG_SUFFIX}_mcp"
#
# eval $CMD
# ============================================================================
# Parameter Reference
# ============================================================================
# model : Model identifier (required)
# tensor_parallel_size : Number of GPUs for tensor parallelism (default: 1)
# gpu_memory_utilization : GPU memory fraction (0.0-1.0, default: 0.8)
# batch_size : Batch size for evaluation (default: 1)
# max_pixels : Max image resolution (default: 1605632)
# min_pixels : Min image resolution (default: 28*28=784)
# max_frame_num : Max frames for videos (default: 768)
# fps : Frames per second for video sampling (optional)
# nframes : Fixed number of frames for video (default: 32)
# threads : Thread count for visual processing (default: 16)
# mcp_server_path : Path to MCP server script for tool calling (optional)
# work_dir : Working directory for MCP tools (default: /tmp/...)
# max_turn : Maximum tool calling turns (default: 5)
# chat_template : Custom chat template jinja file (optional)
# json_model_override_args : JSON args to override model config (optional)
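#
# Example: tuning video sampling via model_args (illustrative values; per
# the reference above, fps and nframes are alternative frame controls):
# --model_args model=${MODEL},fps=2,max_frame_num=64,threads=${THREADS}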
#
#
# ============================================================================
# Tool Calling Best Practices
# ============================================================================
# 1. TOOL DESIGN:
#    - Keep tools focused on single tasks
#    - Provide clear, specific descriptions
#    - Define input schema with required fields
#    - Return results in structured format
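#    A tool's input schema is ordinary JSON Schema, e.g. (illustrative):
#      {"type": "object",
#       "properties": {"path": {"type": "string", "description": "Image path"}},
#       "required": ["path"]}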
#
# 2. MCP SERVER:
#    - Must be a standalone Python script
#    - Should handle errors gracefully
#    - Return results in TextContent or ImageContent format
#    - Avoid long-running operations (timeouts)
#
# 3. CONFIGURATION:
#    - Set an appropriate max_turn value (5-10 recommended)
#    - Use batch_size=1 when tools are enabled (sequential processing)
#    - Allocate sufficient work_dir space for temporary files
#    - Monitor GPU memory during tool execution
#
# 4. DEBUGGING:
#    - Use --verbosity DEBUG to see tool call details
#    - Check work_dir for saved images/videos
#    - Validate that the MCP server responds correctly:
#      `python mcp_server.py` should start without errors
#    - Test tool functions independently before evaluation
#
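# Example of exercising a tool function before an evaluation run (uses the
# hypothetical image_size tool from the sketch above; assumes the decorated
# function remains directly callable):
#   python -c "from mcp_server import image_size; print(image_size('test.jpg'))"
#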
# ============================================================================