diff --git a/tests/common/vllm_test.py b/tests/common/vllm_test.py
index adb523667e..4aeee3adb3 100644
--- a/tests/common/vllm_test.py
+++ b/tests/common/vllm_test.py
@@ -1228,6 +1228,8 @@ def setUp(self):
         self.config.explorer.rollout_model.tensor_parallel_size = 1
         self.config.explorer.rollout_model.chat_template = CHAT_TEMPLATE
         self.config.explorer.rollout_model.enable_openai_api = True
+        self.config.explorer.rollout_model.enable_lora = True
+        self.config.explorer.rollout_model.enable_runtime_lora_updating = True
         self.config.check_and_update()
         self.engines, self.auxiliary_engines = create_inference_models(self.config)
diff --git a/trinity/common/config.py b/trinity/common/config.py
index df5286cb41..83b751fece 100644
--- a/trinity/common/config.py
+++ b/trinity/common/config.py
@@ -556,6 +556,7 @@ class InferenceModelConfig:
     # ! DO NOT SET, automatically set from model.lora_configs
     enable_lora: bool = False
+    enable_runtime_lora_updating: bool = False
     lora_modules: Optional[List[Dict]] = None
     lora_kwargs: Optional[dict] = field(default_factory=dict)
diff --git a/trinity/common/models/vllm_model.py b/trinity/common/models/vllm_model.py
index f3921e91f5..0fef9c5d4e 100644
--- a/trinity/common/models/vllm_model.py
+++ b/trinity/common/models/vllm_model.py
@@ -54,6 +54,8 @@ def __init__(
         os.environ["VLLM_ENABLE_V1_MULTIPROCESSING"] = "0"
         if self.vllm_version >= parse_version("0.11.0"):
             os.environ["VLLM_ALLREDUCE_USE_SYMM_MEM"] = "0"
+        if self.config.enable_runtime_lora_updating:
+            os.environ["VLLM_ALLOW_RUNTIME_LORA_UPDATING"] = "1"
         if not config.enforce_eager:
             # To avoid torch compile conflicts when multiple model are started simultaneously.
             # remove this when the following PR is released:
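
The sketch below is a minimal standalone illustration (not the repository's actual code) of the pattern this diff introduces: a boolean flag on the inference-model config gates the VLLM_ALLOW_RUNTIME_LORA_UPDATING environment variable, which vLLM requires to be set before engine startup for runtime LoRA adapter loading. The dataclass name and helper function here are hypothetical; only the field name and the env var name come from the diff.

# Standalone sketch, assuming only what the diff shows.
import os
from dataclasses import dataclass


@dataclass
class InferenceModelConfigSketch:
    # Hypothetical stand-in for trinity's InferenceModelConfig.
    enable_lora: bool = False
    # Mirrors the new field added to InferenceModelConfig in the diff.
    enable_runtime_lora_updating: bool = False


def configure_runtime_lora(config: InferenceModelConfigSketch) -> None:
    """Set the vLLM env var before the engine is constructed, as vllm_model.py does."""
    if config.enable_runtime_lora_updating:
        # vLLM only permits loading/unloading LoRA adapters at runtime when this
        # variable is set in the engine process environment.
        os.environ["VLLM_ALLOW_RUNTIME_LORA_UPDATING"] = "1"


if __name__ == "__main__":
    cfg = InferenceModelConfigSketch(enable_lora=True, enable_runtime_lora_updating=True)
    configure_runtime_lora(cfg)
    print(os.environ.get("VLLM_ALLOW_RUNTIME_LORA_UPDATING"))  # -> "1"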