
Commit dac44e7

Authored by zhangdw156, HuanzhiMao, and zhangdw
[BFCL] Support LoRA adapters for local evaluation using vLLM backend #1274 (#1275)
## Description

This PR addresses the limitation where the vLLM backend for local evaluation does not support loading LoRA adapters. It introduces a new `lora_modules` parameter to the `spin_up_local_server` function and passes the `--lora-modules` argument to the `vllm serve` command when provided. This allows users to directly evaluate fine-tuned LoRA models without merging weights.

## Related Issue

Fixes #1274

## Changes

- Modified `bfcl_eval/model_handler/local_inference/base_oss_handler.py`:
  - Added `lora_modules`, `enable_lora`, and `max_lora_rank` arguments to `spin_up_local_server`.
  - Implemented logic to append `--enable-lora`, `--max-lora-rank`, and `--lora-modules` to the vLLM command list when the corresponding arguments are present.
- Exposed the new `--enable-lora`, `--max-lora-rank`, and `--lora-modules` CLI flags in `bfcl_eval/__main__.py` and `bfcl_eval/_llm_response_generation.py`, and documented them in the README.

## Test Plan

- Verified locally by running evaluation with a LoRA fine-tuned checkpoint.
- Confirmed that vLLM successfully loads the adapter weights during server startup.

## Checklist

- [x] My code follows the style guidelines of this project.
- [x] I have performed a self-review of my own code.
- [x] I have commented my code, particularly in hard-to-understand areas.

---------

Co-authored-by: Huanzhi Mao <huanzhimao@gmail.com>
Co-authored-by: zhangdw <zhangdw.cs@gmail.com>
1 parent fa5fa07 commit dac44e7
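
For orientation, here is a condensed, standalone sketch of the mechanism the description refers to. The values are placeholders and this is not the repository code verbatim; the actual implementation lives in `base_oss_handler.py`, shown in the diff below.

```python
# Condensed sketch of the mechanism described above (illustrative values, not
# the repository code verbatim): LoRA-related flags are appended to the
# `vllm serve` command only when the corresponding options are provided.
enable_lora = True
max_lora_rank = 128                                    # hypothetical rank
lora_modules = ['adapter1="/path/to/lora/adapter1"']   # hypothetical name="path" spec

cmd = ["vllm", "serve", "/path/to/base/model", "--port", "8000"]  # placeholder base command
if enable_lora:
    cmd.append("--enable-lora")
if max_lora_rank is not None:
    cmd += ["--max-lora-rank", str(max_lora_rank)]
for module in lora_modules or []:
    cmd += ["--lora-modules", module]

print(cmd)  # the kind of argument list the handler hands to subprocess.Popen
```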

File tree

- berkeley-function-call-leaderboard/README.md
- berkeley-function-call-leaderboard/bfcl_eval/__main__.py
- berkeley-function-call-leaderboard/bfcl_eval/_llm_response_generation.py
- berkeley-function-call-leaderboard/bfcl_eval/model_handler/local_inference/base_oss_handler.py

4 files changed: +68, -3 lines

berkeley-function-call-leaderboard/README.md

Lines changed: 8 additions & 2 deletions
````diff
@@ -223,12 +223,18 @@ bfcl generate \
   --backend {sglang|vllm} \
   --num-gpus 1 \
   --gpu-memory-utilization 0.9 \
-  --local-model-path /path/to/local/model # ← optional
+  --local-model-path /path/to/base/model \
+  --enable-lora \
+  --max-lora-rank 128 \
+  --lora-modules module1="/path/to/lora/adapter1" module2="/path/to/lora/adapter2" # ← optional
 ```
 
 - Choose your backend using `--backend sglang` or `--backend vllm`. The default backend is `vllm`.
 - Control GPU usage by adjusting `--num-gpus` (default `1`, relevant for multi-GPU tensor parallelism) and `--gpu-memory-utilization` (default `0.9`), which can help avoid out-of-memory errors.
-- `--local-model-path` (optional): Point this flag at a directory that already contains the model's files (`config.json`, tokenizer, weights, etc.). Use it only when you've pre‑downloaded the model and the weights live somewhere other than the default `$HF_HOME` cache.
+- `--local-model-path` (optional): Point this flag at a directory that already contains the model's files (`config.json`, tokenizer, weights, etc.). Use it only when you've pre-downloaded the model and the weights live somewhere other than the default `$HF_HOME` cache.
+- `--enable-lora` (optional): Enable LoRA for the vLLM backend. This flag is required to use LoRA modules. This only works when backend is `vllm`.
+- `--max-lora-rank` (optional): Specify the maximum LoRA rank for the vLLM backend. This is an integer value. This only works when backend is `vllm` and `--enable-lora` flag is set.
+- `--lora-modules` (optional): Specify the path to the LoRA modules for the vLLM backend in `name="path"` format. This allows evaluation of fine-tuned models with LoRA adapters. You can specify multiple LoRA modules by repeating this argument. This only works when backend is `vllm` and `--enable-lora` flag is set.
 
 ##### For Pre-existing OpenAI-compatible Endpoints
````
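
To make the `name="path"` format concrete, here is a small illustrative snippet. It is not BFCL code; BFCL forwards each spec string to vLLM unchanged, and the split below only makes the two parts of the format explicit.

```python
# Illustration of the name="path" spec accepted by --lora-modules.
# The spec value is hypothetical; BFCL passes each spec string through as-is.
spec = 'module1="/path/to/lora/adapter1"'  # hypothetical adapter spec
name, path = spec.split("=", 1)
print(name)             # module1  -> adapter name
print(path.strip('"'))  # /path/to/lora/adapter1  -> adapter directory on disk
```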

berkeley-function-call-leaderboard/bfcl_eval/__main__.py

Lines changed: 18 additions & 0 deletions
````diff
@@ -148,6 +148,21 @@ def generate(
         "--run-ids",
         help="If true, also run the test entry mentioned in the test_case_ids_to_generate.json file, in addition to the --test_category argument.",
     ),
+    enable_lora: bool = typer.Option(
+        False,
+        "--enable-lora",
+        help="Enable LoRA for vLLM backend.",
+    ),
+    max_lora_rank: Optional[int] = typer.Option(
+        None,
+        "--max-lora-rank",
+        help="Specify the maximum LoRA rank for vLLM backend.",
+    ),
+    lora_modules: Optional[List[str]] = typer.Option(
+        None,
+        "--lora-modules",
+        help='Specify the path to the LoRA modules for vLLM backend in name="path" format. Can be specified multiple times.',
+    ),
 ):
     """
     Generate the LLM response for one or more models on a test-category (same as openfunctions_evaluation.py).
@@ -168,6 +183,9 @@ def generate(
         result_dir=result_dir,
         allow_overwrite=allow_overwrite,
         run_ids=run_ids,
+        enable_lora=enable_lora,
+        max_lora_rank=max_lora_rank,
+        lora_modules=lora_modules,
     )
     load_dotenv(dotenv_path=DOTENV_PATH, verbose=True, override=True)  # Load the .env file
     generation_main(args)
````
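
The new Typer options mirror the argparse flags added in `_llm_response_generation.py`. Below is a minimal standalone sketch (not BFCL code; script and adapter names are placeholders) of how a Typer `List[str]` option collects values, assuming Typer's standard handling of repeated options.

```python
# Minimal standalone sketch: with a Typer List[str] option, each occurrence
# of --lora-modules contributes one entry to the resulting list.
from typing import List, Optional

import typer

app = typer.Typer()


@app.command()
def generate(
    enable_lora: bool = typer.Option(False, "--enable-lora"),
    max_lora_rank: Optional[int] = typer.Option(None, "--max-lora-rank"),
    lora_modules: Optional[List[str]] = typer.Option(None, "--lora-modules"),
):
    print("enable_lora:", enable_lora)
    print("max_lora_rank:", max_lora_rank)
    for module in lora_modules or []:
        print("lora module:", module)


if __name__ == "__main__":
    app()

# Example invocation (hypothetical file name and paths):
#   python lora_cli_sketch.py --enable-lora --max-lora-rank 128 \
#       --lora-modules 'adapter1="/path/a"' --lora-modules 'adapter2="/path/b"'
```

Note that Typer collects one value per `--lora-modules` occurrence, while the argparse definition in the next file uses `nargs="*"` and gathers the whitespace-separated values that follow a single flag.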

berkeley-function-call-leaderboard/bfcl_eval/_llm_response_generation.py

Lines changed: 24 additions & 0 deletions
````diff
@@ -9,6 +9,7 @@
 from collections import defaultdict
 from concurrent.futures import FIRST_COMPLETED, Future, ThreadPoolExecutor, wait
 from copy import deepcopy
+from typing import Optional
 
 from bfcl_eval.constants.eval_config import (
     PROJECT_ROOT,
@@ -55,7 +56,27 @@ def get_args():
         default=None,
         help="Specify the path to a local directory containing the model's config/tokenizer/weights for fully offline inference. Use this only if the model weights are stored in a location other than the default HF_HOME directory.",
     )
+    parser.add_argument(
+        "--lora-modules",
+        type=str,
+        default=None,
+        nargs="*",
+        help="Specify the path to the LoRA modules for vLLM backend in name=\"path\" format. Can be specified multiple times.",
+    )
+    parser.add_argument(
+        "--enable-lora",
+        action="store_true",
+        default=False,
+        help="Enable LoRA for vLLM backend.",
+    )
+    parser.add_argument(
+        "--max-lora-rank",
+        type=int,
+        default=None,
+        help="Specify the maximum LoRA rank for vLLM backend.",
+    )
     args = parser.parse_args()
+    print(f"Parsed arguments: {args}")
 
     return args
 
@@ -240,6 +261,9 @@ def _writer():
             backend=args.backend,
             skip_server_setup=args.skip_server_setup,
             local_model_path=args.local_model_path,
+            lora_modules=args.lora_modules,
+            enable_lora=args.enable_lora,
+            max_lora_rank=args.max_lora_rank,
         )
 
         # ───── dependency bookkeeping ──────────────────────────────
````
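
A standalone sketch (not BFCL code) of how the argparse definitions above behave: with `nargs="*"`, all whitespace-separated values following a single `--lora-modules` flag are collected into one list of raw strings. The paths are placeholders.

```python
# Standalone check of the argparse behavior for the new flags.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--lora-modules", type=str, default=None, nargs="*")
parser.add_argument("--enable-lora", action="store_true", default=False)
parser.add_argument("--max-lora-rank", type=int, default=None)

args = parser.parse_args(
    [
        "--enable-lora",
        "--max-lora-rank", "128",
        "--lora-modules", 'module1="/path/a"', 'module2="/path/b"',
    ]
)
print(args.enable_lora)    # True
print(args.max_lora_rank)  # 128
print(args.lora_modules)   # ['module1="/path/a"', 'module2="/path/b"']
```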

berkeley-function-call-leaderboard/bfcl_eval/model_handler/local_inference/base_oss_handler.py

Lines changed: 18 additions & 1 deletion
````diff
@@ -79,6 +79,9 @@ def spin_up_local_server(
         backend: str,
         skip_server_setup: bool,
         local_model_path: Optional[str],
+        lora_modules: Optional[list[str]] = None,
+        enable_lora: bool = False,
+        max_lora_rank: Optional[int] = None,
     ):
         """
         Spin up a local server for the model.
@@ -171,7 +174,21 @@ def spin_up_local_server(
                 "--gpu-memory-utilization",
                 str(gpu_memory_utilization),
                 "--trust-remote-code",
-            ],
+            ]
+            + (["--enable-lora"] if enable_lora else [])
+            + (
+                ["--max-lora-rank", str(max_lora_rank)]
+                if max_lora_rank is not None
+                else []
+            )
+            + (
+                sum(
+                    [["--lora-modules", lora_module] for lora_module in lora_modules],
+                    [],
+                )
+                if lora_modules
+                else []
+            ),
             stdout=subprocess.PIPE,  # Capture stdout
             stderr=subprocess.PIPE,  # Capture stderr
             text=True,  # To get the output as text instead of bytes
````
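
As a quick sanity check of the list concatenation above, the snippet below rebuilds just the appended tail with hypothetical inputs. The `sum(pairs, [])` call flattens the per-adapter pairs so every adapter gets its own `--lora-modules <spec>` pair in the final command list.

```python
# Standalone reproduction of the command-list tail built by the change above,
# using hypothetical inputs (adapter names and paths are placeholders).
enable_lora = True
max_lora_rank = 128
lora_modules = ['adapter1="/path/a"', 'adapter2="/path/b"']

tail = (
    (["--enable-lora"] if enable_lora else [])
    + (["--max-lora-rank", str(max_lora_rank)] if max_lora_rank is not None else [])
    + (
        sum([["--lora-modules", m] for m in lora_modules], [])
        if lora_modules
        else []
    )
)
print(tail)
# ['--enable-lora', '--max-lora-rank', '128',
#  '--lora-modules', 'adapter1="/path/a"', '--lora-modules', 'adapter2="/path/b"']
```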
