Commit 80433e2

[LoRA] Reduce the loading time of MoE LoRA (vllm-project#30243)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
1 parent 5c2433a commit 80433e2

File tree

1 file changed: +27, -7 lines

vllm/lora/models.py

Lines changed: 27 additions & 7 deletions

@@ -115,7 +115,7 @@ def from_lora_tensors(
         weights_mapper: WeightsMapper | None = None,
     ) -> "LoRAModel":
         """Create a LoRAModel from a dictionary of tensors."""
-        pin_memory = str(device) == "cpu" and is_pin_memory_available()
+
         loras: dict[str, LoRALayerWeights] = {}
         for tensor_name, tensor in tensors.items():
             if is_base_embeddding_weights(tensor_name):
@@ -139,14 +139,8 @@ def from_lora_tensors(
                     f" with the base model's vocabulary size({model_vocab_size})."
                 )
                 loras[module_name].lora_a = tensor.to(device=device, dtype=dtype)
-                if pin_memory:
-                    loras[module_name].lora_a = loras[module_name].lora_a.pin_memory()
             else:
                 loras[module_name].lora_b = tensor.to(device=device, dtype=dtype)
-
-                if pin_memory:
-                    loras[module_name].lora_b = loras[module_name].lora_b.pin_memory()
-
         return cls(lora_model_id, peft_helper.r, loras)
 
     @classmethod
@@ -742,6 +736,32 @@ def _create_merged_loras_inplace(self, lora_model: LoRAModel) -> None:
         for lora in lora_model.loras.values():
             lora.optimize()
 
+        first_lora: LoRALayerWeights = next(iter(lora_model.loras.values()))
+        assert first_lora.lora_a is not None
+        if isinstance(first_lora.lora_a, list):
+            lora_device = next(iter(first_lora.lora_a))
+        else:
+            lora_device = first_lora.lora_a.device
+        # Execute pin_memory after LoRA weight merging, mainly because:
+        # 1. Some MoE models have a large number of LoRA weights. If we
+        # perform pin_memory immediately after loading weights, the
+        # overhead is significant.
+        # 2. The weight packing above (e.g., pack_moe) may invalidate the
+        # pin_memory allocation, so we execute it after packing.
+
+        pin_memory = str(lora_device) == "cpu" and is_pin_memory_available()
+        if pin_memory:
+            for lora in lora_model.loras.values():
+                if isinstance(lora.lora_a, list):
+                    for index in range(len(lora.lora_a)):
+                        if lora.lora_a[index] is None:
+                            continue
+                        lora.lora_a[index] = lora.lora_a[index].pin_memory()
+                        lora.lora_b[index] = lora.lora_b[index].pin_memory()
+                else:
+                    lora.lora_a = lora.lora_a.pin_memory()
+                    lora.lora_b = lora.lora_b.pin_memory()
+
     def _get_lora_layer_weights(
         self, lora_model: LoRAModel, module_name: str
     ) -> LoRALayerWeights | None:
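
For context, the following is a minimal standalone sketch (not vLLM code) of why this commit defers pin_memory until after weight packing. The tensor shapes are arbitrary, torch.stack stands in for the pack step (e.g., pack_moe), and running it assumes a CUDA-enabled PyTorch build, since Tensor.pin_memory() needs an accelerator backend.

# Illustrative sketch only: eager vs. deferred pin_memory around a pack step.
# torch.stack stands in for MoE LoRA weight packing; shapes are arbitrary.
import torch

num_experts, rank, hidden = 64, 16, 4096
expert_lora_a = [torch.randn(rank, hidden) for _ in range(num_experts)]

# Eager pinning: pin every per-expert tensor right after loading...
pinned_each = [t.pin_memory() for t in expert_lora_a]
# ...but packing allocates a brand-new pageable output tensor, so the
# pinned copies made above are simply discarded (wasted work at load time).
packed_eager = torch.stack(pinned_each)
assert not packed_eager.is_pinned()

# Deferred pinning: pack first, then pin the final tensor once.
packed = torch.stack(expert_lora_a)
packed = packed.pin_memory()
assert packed.is_pinned()

Deferring the pin means it runs over the final merged/packed tensors rather than over every originally loaded weight: the pinned allocations are never invalidated by packing, and far fewer pin_memory calls are needed for MoE models with many expert LoRA weights, which is the loading-time cost the in-diff comment describes.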
