
Commit dac44e7

Authored by zhangdw156, HuanzhiMao, and zhangdw
[BFCL] Support LoRA adapters for local evaluation using vLLM backend #1274 (#1275)
## Description

This PR addresses the limitation where the vLLM backend for local evaluation does not support loading LoRA adapters. It introduces a new `lora_modules` parameter to the `spin_up_local_server` function and passes the `--lora-modules` argument to the `vllm serve` command when provided. This allows users to directly evaluate fine-tuned LoRA models without merging weights.

## Related Issue

Fixes #1274

## Changes

- Modified `bfcl_eval/model_handler/local_inference/base_oss_handler.py`:
  - Added `lora_modules`, `enable_lora`, and `max_lora_rank` arguments to `spin_up_local_server`.
  - Implemented logic to append `--enable-lora`, `--max-lora-rank`, and `--lora-modules` to the vLLM command list when the corresponding arguments are present.
- Exposed the new `--enable-lora`, `--max-lora-rank`, and `--lora-modules` CLI flags in `bfcl_eval/__main__.py` and `bfcl_eval/_llm_response_generation.py`, and documented them in the README.

## Test Plan

- Verified locally by running evaluation with a LoRA fine-tuned checkpoint.
- Confirmed that vLLM successfully loads the adapter weights during server startup.

## Checklist

- [x] My code follows the style guidelines of this project.
- [x] I have performed a self-review of my own code.
- [x] I have commented my code, particularly in hard-to-understand areas.

---------

Co-authored-by: Huanzhi Mao <huanzhimao@gmail.com>
Co-authored-by: zhangdw <zhangdw.cs@gmail.com>
1 parent fa5fa07 commit dac44e7
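
For orientation, here is a condensed, standalone sketch of the mechanism the description refers to. The values are placeholders and this is not the repository code verbatim; the actual implementation lives in `base_oss_handler.py`, shown in the diff below.

```python
# Condensed sketch of the mechanism described above (illustrative values, not
# the repository code verbatim): LoRA-related flags are appended to the
# `vllm serve` command only when the corresponding options are provided.
enable_lora = True
max_lora_rank = 128                                    # hypothetical rank
lora_modules = ['adapter1="/path/to/lora/adapter1"']   # hypothetical name="path" spec

cmd = ["vllm", "serve", "/path/to/base/model", "--port", "8000"]  # placeholder base command
if enable_lora:
    cmd.append("--enable-lora")
if max_lora_rank is not None:
    cmd += ["--max-lora-rank", str(max_lora_rank)]
for module in lora_modules or []:
    cmd += ["--lora-modules", module]

print(cmd)  # the kind of argument list the handler hands to subprocess.Popen
```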

File tree

- berkeley-function-call-leaderboard/README.md
- berkeley-function-call-leaderboard/bfcl_eval/__main__.py
- berkeley-function-call-leaderboard/bfcl_eval/_llm_response_generation.py
- berkeley-function-call-leaderboard/bfcl_eval/model_handler/local_inference/base_oss_handler.py

4 files changed: +68, -3 lines

berkeley-function-call-leaderboard/README.md

Lines changed: 8 additions & 2 deletions
````diff
@@ -223,12 +223,18 @@ bfcl generate \
   --backend {sglang|vllm} \
   --num-gpus 1 \
   --gpu-memory-utilization 0.9 \
-  --local-model-path /path/to/local/model # ← optional
+  --local-model-path /path/to/base/model \
+  --enable-lora \
+  --max-lora-rank 128 \
+  --lora-modules module1="/path/to/lora/adapter1" module2="/path/to/lora/adapter2" # ← optional
 ```
 
 - Choose your backend using `--backend sglang` or `--backend vllm`. The default backend is `vllm`.
 - Control GPU usage by adjusting `--num-gpus` (default `1`, relevant for multi-GPU tensor parallelism) and `--gpu-memory-utilization` (default `0.9`), which can help avoid out-of-memory errors.
-- `--local-model-path` (optional): Point this flag at a directory that already contains the model's files (`config.json`, tokenizer, weights, etc.). Use it only when you've pre‑downloaded the model and the weights live somewhere other than the default `$HF_HOME` cache.
+- `--local-model-path` (optional): Point this flag at a directory that already contains the model's files (`config.json`, tokenizer, weights, etc.). Use it only when you've pre-downloaded the model and the weights live somewhere other than the default `$HF_HOME` cache.
+- `--enable-lora` (optional): Enable LoRA for the vLLM backend. This flag is required to use LoRA modules. This only works when backend is `vllm`.
+- `--max-lora-rank` (optional): Specify the maximum LoRA rank for the vLLM backend. This is an integer value. This only works when backend is `vllm` and `--enable-lora` flag is set.
+- `--lora-modules` (optional): Specify the path to the LoRA modules for the vLLM backend in `name="path"` format. This allows evaluation of fine-tuned models with LoRA adapters. You can specify multiple LoRA modules by repeating this argument. This only works when backend is `vllm` and `--enable-lora` flag is set.
 
 ##### For Pre-existing OpenAI-compatible Endpoints
````
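
To make the `name="path"` format concrete, here is a small illustrative snippet. It is not BFCL code; BFCL forwards each spec string to vLLM unchanged, and the split below only makes the two parts of the format explicit.

```python
# Illustration of the name="path" spec accepted by --lora-modules.
# The spec value is hypothetical; BFCL passes each spec string through as-is.
spec = 'module1="/path/to/lora/adapter1"'  # hypothetical adapter spec
name, path = spec.split("=", 1)
print(name)             # module1  -> adapter name
print(path.strip('"'))  # /path/to/lora/adapter1  -> adapter directory on disk
```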

berkeley-function-call-leaderboard/bfcl_eval/__main__.py

Lines changed: 18 additions & 0 deletions
````diff
@@ -148,6 +148,21 @@ def generate(
         "--run-ids",
         help="If true, also run the test entry mentioned in the test_case_ids_to_generate.json file, in addition to the --test_category argument.",
     ),
+    enable_lora: bool = typer.Option(
+        False,
+        "--enable-lora",
+        help="Enable LoRA for vLLM backend.",
+    ),
+    max_lora_rank: Optional[int] = typer.Option(
+        None,
+        "--max-lora-rank",
+        help="Specify the maximum LoRA rank for vLLM backend.",
+    ),
+    lora_modules: Optional[List[str]] = typer.Option(
+        None,
+        "--lora-modules",
+        help='Specify the path to the LoRA modules for vLLM backend in name="path" format. Can be specified multiple times.',
+    ),
 ):
     """
     Generate the LLM response for one or more models on a test-category (same as openfunctions_evaluation.py).
@@ -168,6 +183,9 @@ def generate(
         result_dir=result_dir,
         allow_overwrite=allow_overwrite,
         run_ids=run_ids,
+        enable_lora=enable_lora,
+        max_lora_rank=max_lora_rank,
+        lora_modules=lora_modules,
     )
     load_dotenv(dotenv_path=DOTENV_PATH, verbose=True, override=True)  # Load the .env file
     generation_main(args)
````
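
The new Typer options mirror the argparse flags added in `_llm_response_generation.py`. Below is a minimal standalone sketch (not BFCL code; script and adapter names are placeholders) of how a Typer `List[str]` option collects values, assuming Typer's standard handling of repeated options.

```python
# Minimal standalone sketch: with a Typer List[str] option, each occurrence
# of --lora-modules contributes one entry to the resulting list.
from typing import List, Optional

import typer

app = typer.Typer()


@app.command()
def generate(
    enable_lora: bool = typer.Option(False, "--enable-lora"),
    max_lora_rank: Optional[int] = typer.Option(None, "--max-lora-rank"),
    lora_modules: Optional[List[str]] = typer.Option(None, "--lora-modules"),
):
    print("enable_lora:", enable_lora)
    print("max_lora_rank:", max_lora_rank)
    for module in lora_modules or []:
        print("lora module:", module)


if __name__ == "__main__":
    app()

# Example invocation (hypothetical file name and paths):
#   python lora_cli_sketch.py --enable-lora --max-lora-rank 128 \
#       --lora-modules 'adapter1="/path/a"' --lora-modules 'adapter2="/path/b"'
```

Note that Typer collects one value per `--lora-modules` occurrence, while the argparse definition in the next file uses `nargs="*"` and gathers the whitespace-separated values that follow a single flag.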

berkeley-function-call-leaderboard/bfcl_eval/_llm_response_generation.py

Lines changed: 24 additions & 0 deletions
````diff
@@ -9,6 +9,7 @@
 from collections import defaultdict
 from concurrent.futures import FIRST_COMPLETED, Future, ThreadPoolExecutor, wait
 from copy import deepcopy
+from typing import Optional
 
 from bfcl_eval.constants.eval_config import (
     PROJECT_ROOT,
@@ -55,7 +56,27 @@ def get_args():
         default=None,
         help="Specify the path to a local directory containing the model's config/tokenizer/weights for fully offline inference. Use this only if the model weights are stored in a location other than the default HF_HOME directory.",
     )
+    parser.add_argument(
+        "--lora-modules",
+        type=str,
+        default=None,
+        nargs="*",
+        help="Specify the path to the LoRA modules for vLLM backend in name=\"path\" format. Can be specified multiple times.",
+    )
+    parser.add_argument(
+        "--enable-lora",
+        action="store_true",
+        default=False,
+        help="Enable LoRA for vLLM backend.",
+    )
+    parser.add_argument(
+        "--max-lora-rank",
+        type=int,
+        default=None,
+        help="Specify the maximum LoRA rank for vLLM backend.",
+    )
     args = parser.parse_args()
+    print(f"Parsed arguments: {args}")
 
     return args
 
@@ -240,6 +261,9 @@ def _writer():
             backend=args.backend,
             skip_server_setup=args.skip_server_setup,
             local_model_path=args.local_model_path,
+            lora_modules=args.lora_modules,
+            enable_lora=args.enable_lora,
+            max_lora_rank=args.max_lora_rank,
         )
 
         # ───── dependency bookkeeping ──────────────────────────────
````
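
A standalone sketch (not BFCL code) of how the argparse definitions above behave: with `nargs="*"`, all whitespace-separated values following a single `--lora-modules` flag are collected into one list of raw strings. The paths are placeholders.

```python
# Standalone check of the argparse behavior for the new flags.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--lora-modules", type=str, default=None, nargs="*")
parser.add_argument("--enable-lora", action="store_true", default=False)
parser.add_argument("--max-lora-rank", type=int, default=None)

args = parser.parse_args(
    [
        "--enable-lora",
        "--max-lora-rank", "128",
        "--lora-modules", 'module1="/path/a"', 'module2="/path/b"',
    ]
)
print(args.enable_lora)    # True
print(args.max_lora_rank)  # 128
print(args.lora_modules)   # ['module1="/path/a"', 'module2="/path/b"']
```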

berkeley-function-call-leaderboard/bfcl_eval/model_handler/local_inference/base_oss_handler.py

Lines changed: 18 additions & 1 deletion
````diff
@@ -79,6 +79,9 @@ def spin_up_local_server(
         backend: str,
         skip_server_setup: bool,
         local_model_path: Optional[str],
+        lora_modules: Optional[list[str]] = None,
+        enable_lora: bool = False,
+        max_lora_rank: Optional[int] = None,
     ):
         """
         Spin up a local server for the model.
@@ -171,7 +174,21 @@ def spin_up_local_server(
                 "--gpu-memory-utilization",
                 str(gpu_memory_utilization),
                 "--trust-remote-code",
-            ],
+            ]
+            + (["--enable-lora"] if enable_lora else [])
+            + (
+                ["--max-lora-rank", str(max_lora_rank)]
+                if max_lora_rank is not None
+                else []
+            )
+            + (
+                sum(
+                    [["--lora-modules", lora_module] for lora_module in lora_modules],
+                    [],
+                )
+                if lora_modules
+                else []
+            ),
             stdout=subprocess.PIPE,  # Capture stdout
             stderr=subprocess.PIPE,  # Capture stderr
             text=True,  # To get the output as text instead of bytes
````
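
As a quick sanity check of the list concatenation above, the snippet below rebuilds just the appended tail with hypothetical inputs. The `sum(pairs, [])` call flattens the per-adapter pairs so every adapter gets its own `--lora-modules <spec>` pair in the final command list.

```python
# Standalone reproduction of the command-list tail built by the change above,
# using hypothetical inputs (adapter names and paths are placeholders).
enable_lora = True
max_lora_rank = 128
lora_modules = ['adapter1="/path/a"', 'adapter2="/path/b"']

tail = (
    (["--enable-lora"] if enable_lora else [])
    + (["--max-lora-rank", str(max_lora_rank)] if max_lora_rank is not None else [])
    + (
        sum([["--lora-modules", m] for m in lora_modules], [])
        if lora_modules
        else []
    )
)
print(tail)
# ['--enable-lora', '--max-lora-rank', '128',
#  '--lora-modules', 'adapter1="/path/a"', '--lora-modules', 'adapter2="/path/b"']
```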
