
Commit 15a6d2a

Support dynamic lora updating (#472)
1 parent 8e1adfd commit 15a6d2a

File tree (3 files changed, +5 −0):

tests/common/vllm_test.py
trinity/common/config.py
trinity/common/models/vllm_model.py

tests/common/vllm_test.py

Lines changed: 2 additions & 0 deletions
@@ -1228,6 +1228,8 @@ def setUp(self):
         self.config.explorer.rollout_model.tensor_parallel_size = 1
         self.config.explorer.rollout_model.chat_template = CHAT_TEMPLATE
         self.config.explorer.rollout_model.enable_openai_api = True
+        self.config.explorer.rollout_model.enable_lora = True
+        self.config.explorer.rollout_model.enable_runtime_lora_updating = True
 
         self.config.check_and_update()
         self.engines, self.auxiliary_engines = create_inference_models(self.config)
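For context, here is a minimal sketch of what the two new test options amount to when set programmatically. It assumes InferenceModelConfig (from trinity/common/config.py, diffed below) is a plain dataclass whose other fields all have defaults; note the config comment below says enable_lora is normally derived from model.lora_configs, so setting it by hand is for illustration only.

```python
# Hedged sketch: mirrors the flags the test enables above.
from trinity.common.config import InferenceModelConfig

model_config = InferenceModelConfig()
model_config.enable_openai_api = True             # serve an OpenAI-compatible API
model_config.enable_lora = True                   # normally auto-set from model.lora_configs
model_config.enable_runtime_lora_updating = True  # new flag introduced in this commit
```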

trinity/common/config.py

Lines changed: 1 addition & 0 deletions
@@ -556,6 +556,7 @@ class InferenceModelConfig:
 
     # ! DO NOT SET, automatically set from model.lora_configs
     enable_lora: bool = False
+    enable_runtime_lora_updating: bool = False
    lora_modules: Optional[List[Dict]] = None
    lora_kwargs: Optional[dict] = field(default_factory=dict)
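The new flag sits beside the existing LoRA fields of the config. A hedged illustration of how they might be populated together; the field names come from the diff above, but the dictionary contents are placeholders, not values taken from this commit.

```python
# Hypothetical values; the lora_modules/lora_kwargs formats are assumptions.
from trinity.common.config import InferenceModelConfig

cfg = InferenceModelConfig()
cfg.enable_runtime_lora_updating = True
cfg.lora_modules = [{"name": "demo_adapter", "path": "/ckpts/demo_adapter"}]  # placeholder adapter
cfg.lora_kwargs = {"max_lora_rank": 16}  # assumed pass-through to the engine's LoRA settings
```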

trinity/common/models/vllm_model.py

Lines changed: 2 additions & 0 deletions
@@ -54,6 +54,8 @@ def __init__(
         os.environ["VLLM_ENABLE_V1_MULTIPROCESSING"] = "0"
         if self.vllm_version >= parse_version("0.11.0"):
             os.environ["VLLM_ALLREDUCE_USE_SYMM_MEM"] = "0"
+        if self.config.enable_runtime_lora_updating:
+            os.environ["VLLM_ALLOW_RUNTIME_LORA_UPDATING"] = "1"
         if not config.enforce_eager:
             # To avoid torch compile conflicts when multiple model are started simultaneously.
             # remove this when the following PR is released:
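Setting VLLM_ALLOW_RUNTIME_LORA_UPDATING=1 is what lets vLLM's OpenAI-compatible server load and unload LoRA adapters at runtime through its /v1/load_lora_adapter and /v1/unload_lora_adapter endpoints. A minimal sketch of driving those endpoints follows; the server URL, adapter name, and path are placeholders, not values from this commit.

```python
# Hot-swap a LoRA adapter on a running vLLM OpenAI-compatible server.
import requests

BASE = "http://localhost:8000"  # placeholder: wherever the rollout model serves its API

# Load a new adapter without restarting the engine.
resp = requests.post(
    f"{BASE}/v1/load_lora_adapter",
    json={"lora_name": "demo_adapter", "lora_path": "/ckpts/demo_adapter"},
)
resp.raise_for_status()

# Unload it when no longer needed.
requests.post(f"{BASE}/v1/unload_lora_adapter", json={"lora_name": "demo_adapter"})
```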
