diffusion/lora: reuse load_weights packed mapping

dongbo910220 · dongbo910220 · commit 517eb73484ad · 2026-02-01T13:20:28.000+08:00
Signed-off-by: dongbo910220 <1275604947@qq.com>
diff --git a/tests/diffusion/lora/test_lora_manager.py b/tests/diffusion/lora/test_lora_manager.py
@@ -133,7 +133,11 @@ def _fake_replace_submodule(root: torch.nn.Module, module_name: str, submodule:
     monkeypatch.setattr(manager_mod, "replace_submodule", _fake_replace_submodule)
 
     pipeline = torch.nn.Module()
-    pipeline.packed_modules_mapping = {"to_qkv": ["to_q", "to_k", "to_v"]}
+    pipeline.stacked_params_mapping = [
+        (".to_qkv", ".to_q", "q"),
+        (".to_qkv", ".to_k", "k"),
+        (".to_qkv", ".to_v", "v"),
+    ]
     pipeline.transformer = torch.nn.Module()
     pipeline.transformer.to_qkv = _FakeLinearBase()
 
@@ -145,7 +149,7 @@ def _fake_replace_submodule(root: torch.nn.Module, module_name: str, submodule:
     )
 
     # Treat the dummy layer as a packed 3-slice projection so the manager uses
-    # `packed_modules_mapping` to decide replacement based on target_modules.
+    # `stacked_params_mapping` to decide replacement based on target_modules.
     monkeypatch.setattr(manager, "_get_packed_modules_list", lambda _module: ["q", "k", "v"])
 
     peft_helper = type("_PH", (), {"r": 1, "target_modules": ["to_q"]})()
@@ -206,7 +210,11 @@ def test_lora_manager_activates_fused_lora_on_packed_layer():
 
 def test_lora_manager_activates_packed_lora_from_sublayers():
     pipeline = torch.nn.Module()
-    pipeline.packed_modules_mapping = {"to_qkv": ["to_q", "to_k", "to_v"]}
+    pipeline.stacked_params_mapping = [
+        (".to_qkv", ".to_q", "q"),
+        (".to_qkv", ".to_k", "k"),
+        (".to_qkv", ".to_v", "v"),
+    ]
     manager = DiffusionLoRAManager(
         pipeline=pipeline,
         device=torch.device("cpu"),
diff --git a/vllm_omni/diffusion/lora/manager.py b/vllm_omni/diffusion/lora/manager.py
@@ -131,17 +131,43 @@ def _compute_supported_lora_modules(self) -> set[str]:
     def _compute_packed_modules_mapping(self) -> dict[str, list[str]]:
         """Collect packed->sublayer mappings from the diffusion model.
 
-        vLLM models declare `packed_modules_mapping` on the model class. For
-        diffusion pipelines, we attach the same mapping on the transformer
-        module(s) that implement packed (fused) projections, so LoRA loading can
-        accept checkpoints trained against the logical sub-projections.
+        Diffusion models often use packed (fused) projections like `to_qkv` or
+        `w13`, while LoRA checkpoints are typically saved against the logical
+        sub-projections (e.g. `to_q`/`to_k`/`to_v`, `w1`/`w3`). Many diffusion
+        model implementations already define these relationships in
+        `load_weights()` via `stacked_params_mapping`. To avoid duplicating the
+        mapping in multiple places, we derive packed→sublayer mappings from the
+        model's `stacked_params_mapping`.
         """
+
+        def _derive_from_stacked_params_mapping(stacked: object) -> dict[str, list[str]]:
+            if not isinstance(stacked, (list, tuple)):
+                return {}
+            derived: dict[str, list[str]] = {}
+            for item in stacked:
+                if not isinstance(item, (list, tuple)) or len(item) < 2:
+                    continue
+                packed_suffix, sub_suffix = item[0], item[1]
+                if not isinstance(packed_suffix, str) or not packed_suffix:
+                    continue
+                if not isinstance(sub_suffix, str) or not sub_suffix:
+                    continue
+                # The mapping strings are usually suffix patterns (e.g. ".to_qkv"),
+                # but some models scope them under submodules (e.g. ".attn1.to_qkv").
+                # For LoRA we only care about the leaf module names.
+                packed_name = packed_suffix.split(".")[-1]
+                sub_name = sub_suffix.split(".")[-1]
+                existing = derived.get(packed_name)
+                if existing is None:
+                    derived[packed_name] = [sub_name]
+                elif sub_name not in existing:
+                    existing.append(sub_name)
+            return derived
+
         mapping: dict[str, list[str]] = {}
         for module in self.pipeline.modules():
-            packed = getattr(module, "packed_modules_mapping", None)
-            if not isinstance(packed, dict):
-                continue
-            for packed_name, sub_names in packed.items():
+            derived = _derive_from_stacked_params_mapping(getattr(module, "stacked_params_mapping", None))
+            for packed_name, sub_names in derived.items():
                 if not isinstance(packed_name, str) or not packed_name:
                     continue
                 if not isinstance(sub_names, (list, tuple)) or not all(isinstance(s, str) for s in sub_names):
@@ -155,7 +181,7 @@ def _compute_packed_modules_mapping(self) -> dict[str, list[str]]:
                     mapping[packed_name] = sub_names_list
                 elif existing != sub_names_list:
                     logger.warning(
-                        "Conflicting packed_modules_mapping for %s: %s vs %s; using %s",
+                        "Conflicting packed module mapping for %s: %s vs %s; using %s",
                         packed_name,
                         existing,
                         sub_names_list,
@@ -170,7 +196,7 @@ def _get_packed_sublayer_suffixes(self, packed_module_suffix: str, n_slices: int
             return None
         if len(sub_suffixes) != n_slices:
             logger.warning(
-                "packed_modules_mapping[%s] has %d slices but layer expects %d; skipping sublayer lookup",
+                "Packed module mapping[%s] has %d slices but layer expects %d; skipping sublayer lookup",
                 packed_module_suffix,
                 len(sub_suffixes),
                 n_slices,
diff --git a/vllm_omni/diffusion/lora/utils.py b/vllm_omni/diffusion/lora/utils.py
@@ -39,9 +39,10 @@ def _expand_expected_modules_for_packed_layers(
     `supported_modules`, but the sublayer names are not. Expanding the set
     ensures these sublayer keys are not dropped when loading a LoRA checkpoint.
 
-    The packed→sublayer mapping is model-specific (see each diffusion model's
-    `packed_modules_mapping`) so new packed layers are added alongside the model
-    implementation rather than hard-coded in the LoRA framework.
+    The packed→sublayer mapping is model-specific and is derived from each
+    diffusion model's `stacked_params_mapping` (used by `load_weights()`), so
+    new packed layers are added alongside the model implementation rather than
+    hard-coded in the LoRA framework.
     """
     expanded = set(supported_modules)
     if not packed_modules_mapping:
diff --git a/vllm_omni/diffusion/models/flux2_klein/flux2_klein_transformer.py b/vllm_omni/diffusion/models/flux2_klein/flux2_klein_transformer.py
@@ -559,10 +559,6 @@ class Flux2Transformer2DModel(nn.Module):
     """
 
     _repeated_blocks = ["Flux2TransformerBlock", "Flux2SingleTransformerBlock"]
-    packed_modules_mapping = {
-        "to_qkv": ["to_q", "to_k", "to_v"],
-        "add_kv_proj": ["add_q_proj", "add_k_proj", "add_v_proj"],
-    }
 
     def __init__(
         self,
@@ -735,6 +731,8 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
             (".add_kv_proj", ".add_k_proj", "k"),
             (".add_kv_proj", ".add_v_proj", "v"),
         ]
+        # Expose packed shard mappings for LoRA handling of fused projections.
+        self.stacked_params_mapping = stacked_params_mapping
 
         params_dict = dict(self.named_parameters())
 
diff --git a/vllm_omni/diffusion/models/glm_image/glm_image_transformer.py b/vllm_omni/diffusion/models/glm_image/glm_image_transformer.py
@@ -551,10 +551,6 @@ class GlmImageTransformer2DModel(CachedTransformer):
             `od_config.tf_model_config`.
     """
 
-    packed_modules_mapping = {
-        "to_qkv": ["to_q", "to_k", "to_v"],
-    }
-
     def __init__(
         self,
         od_config: OmniDiffusionConfig,
@@ -724,6 +720,8 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
             (".to_qkv", ".to_k", "k"),
             (".to_qkv", ".to_v", "v"),
         ]
+        # Expose packed shard mappings for LoRA handling of fused projections.
+        self.stacked_params_mapping = stacked_params_mapping
 
         params_dict = dict(self.named_parameters())
 
diff --git a/vllm_omni/diffusion/models/longcat_image/longcat_image_transformer.py b/vllm_omni/diffusion/models/longcat_image/longcat_image_transformer.py
@@ -503,11 +503,6 @@ class LongCatImageTransformer2DModel(nn.Module):
     Supports Sequence Parallelism (Ulysses and Ring) when configured via OmniDiffusionConfig.
     """
 
-    packed_modules_mapping = {
-        "to_qkv": ["to_q", "to_k", "to_v"],
-        "add_kv_proj": ["add_q_proj", "add_k_proj", "add_v_proj"],
-    }
-
     def __init__(
         self,
         od_config: OmniDiffusionConfig,
@@ -707,6 +702,8 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
             (".add_kv_proj", ".add_k_proj", "k"),
             (".add_kv_proj", ".add_v_proj", "v"),
         ]
+        # Expose packed shard mappings for LoRA handling of fused projections.
+        self.stacked_params_mapping = stacked_params_mapping
 
         params_dict = dict(self.named_parameters())
 
diff --git a/vllm_omni/diffusion/models/ovis_image/ovis_image_transformer.py b/vllm_omni/diffusion/models/ovis_image/ovis_image_transformer.py
@@ -366,10 +366,6 @@ class OvisImageTransformer2DModel(nn.Module):
     """
 
     _repeated_blocks = ["OvisImageTransformerBlock", "OvisImageSingleTransformerBlock"]
-    packed_modules_mapping = {
-        "to_qkv": ["to_q", "to_k", "to_v"],
-        "add_kv_proj": ["add_q_proj", "add_k_proj", "add_v_proj"],
-    }
 
     def __init__(
         self,
@@ -518,6 +514,8 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
             (".add_kv_proj", ".add_k_proj", "k"),
             (".add_kv_proj", ".add_v_proj", "v"),
         ]
+        # Expose packed shard mappings for LoRA handling of fused projections.
+        self.stacked_params_mapping = stacked_params_mapping
 
         params_dict = dict(self.named_parameters())
 
diff --git a/vllm_omni/diffusion/models/qwen_image/qwen_image_transformer.py b/vllm_omni/diffusion/models/qwen_image/qwen_image_transformer.py
@@ -1053,6 +1053,8 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
             (".add_kv_proj", ".add_k_proj", "k"),
             (".add_kv_proj", ".add_v_proj", "v"),
         ]
+        # Expose packed shard mappings for LoRA handling of fused projections.
+        self.stacked_params_mapping = stacked_params_mapping
 
         params_dict = dict(self.named_parameters())
 
diff --git a/vllm_omni/diffusion/models/sd3/sd3_transformer.py b/vllm_omni/diffusion/models/sd3/sd3_transformer.py
@@ -322,10 +322,6 @@ class SD3Transformer2DModel(nn.Module):
     """
 
     _repeated_blocks = ["SD3TransformerBlock"]
-    packed_modules_mapping = {
-        "to_qkv": ["to_q", "to_k", "to_v"],
-        "add_kv_proj": ["add_q_proj", "add_k_proj", "add_v_proj"],
-    }
 
     def __init__(
         self,
@@ -454,6 +450,8 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
             (".add_kv_proj", ".add_k_proj", "k"),
             (".add_kv_proj", ".add_v_proj", "v"),
         ]
+        # Expose packed shard mappings for LoRA handling of fused projections.
+        self.stacked_params_mapping = stacked_params_mapping
 
         params_dict = dict(self.named_parameters())
 
diff --git a/vllm_omni/diffusion/models/wan2_2/wan2_2_transformer.py b/vllm_omni/diffusion/models/wan2_2/wan2_2_transformer.py
@@ -725,7 +725,7 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         Returns:
             Set of parameter names that were successfully loaded.
         """
-        # Stacked params mapping for self-attention QKV fusion
+        # Stacked params mapping for self-attention QKV fusion.
         # Format: (param_name, shard_name, shard_id)
         # Note: Only fuse attn1 (self-attention), NOT attn2 (cross-attention)
         stacked_params_mapping = [
@@ -734,6 +734,8 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
             (".attn1.to_qkv", ".attn1.to_k", "k"),
             (".attn1.to_qkv", ".attn1.to_v", "v"),
         ]
+        # Expose packed shard mappings for LoRA handling of fused projections.
+        self.stacked_params_mapping = stacked_params_mapping
 
         params_dict = dict(self.named_parameters())
         loaded_params: set[str] = set()
diff --git a/vllm_omni/diffusion/models/z_image/z_image_transformer.py b/vllm_omni/diffusion/models/z_image/z_image_transformer.py
@@ -544,10 +544,6 @@ class ZImageTransformer2DModel(CachedTransformer):
     """
 
     _repeated_blocks = ["ZImageTransformerBlock"]
-    packed_modules_mapping = {
-        "to_qkv": ["to_q", "to_k", "to_v"],
-        "w13": ["w1", "w3"],
-    }
 
     # Sequence Parallelism for Z-Image (following diffusers' _cp_plan pattern)
     # Similar to how Wan uses `rope` module's split_output to shard rotary embeddings,
@@ -920,6 +916,8 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
             (".w13", ".w1", 0),
             (".w13", ".w3", 1),
         ]
+        # Expose packed shard mappings for LoRA handling of fused projections.
+        self.stacked_params_mapping = stacked_params_mapping
 
         params_dict = dict(self.named_parameters())
 

Original file line number	Diff line number	Diff line change
`@@ -1053,6 +1053,8 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:`
`1053`	`1053`	`(".add_kv_proj", ".add_k_proj", "k"),`
`1054`	`1054`	`(".add_kv_proj", ".add_v_proj", "v"),`
`1055`	`1055`	`]`
	`1056`	`+ # Expose packed shard mappings for LoRA handling of fused projections.`
	`1057`	`+ self.stacked_params_mapping = stacked_params_mapping`
`1056`	`1058`
`1057`	`1059`	`params_dict = dict(self.named_parameters())`
`1058`	`1060`