[Diffusion] Revert 18619 (#19510)

BBuf · web-flow · commit 145ae518acdf · 2026-03-03T08:15:15.000+08:00
diff --git a/python/sglang/multimodal_gen/runtime/models/dits/qwen_image.py b/python/sglang/multimodal_gen/runtime/models/dits/qwen_image.py
@@ -30,9 +30,8 @@
     apply_qk_norm,
 )
 from sglang.multimodal_gen.runtime.layers.linear import (
-    ColumnParallelLinear,
     MergedColumnParallelLinear,
-    RowParallelLinear,
+    ReplicatedLinear,
 )
 from sglang.multimodal_gen.runtime.layers.quantization.configs.base_config import (
     QuantizationConfig,
@@ -89,109 +88,6 @@ def _get_qkv_projections(
     return img_query, img_key, img_value, txt_query, txt_key, txt_value
 
 
-class GELU(nn.Module):
-    r"""
-    GELU activation function with tanh approximation support with `approximate="tanh"`.
-
-    Parameters:
-        dim_in (`int`): The number of channels in the input.
-        dim_out (`int`): The number of channels in the output.
-        approximate (`str`, *optional*, defaults to `"none"`): If `"tanh"`, use tanh approximation.
-        bias (`bool`, defaults to True): Whether to use a bias in the linear layer.
-        quant_config: Quantization configure.
-        prefix: The name of the layer in the state dict.
-    """
-
-    def __init__(
-        self,
-        dim_in: int,
-        dim_out: int,
-        approximate: str = "none",
-        bias: bool = True,
-        quant_config=None,
-        prefix: str = "",
-    ):
-        super().__init__()
-        self.proj = ColumnParallelLinear(
-            dim_in,
-            dim_out,
-            bias=bias,
-            gather_output=False,
-            quant_config=quant_config,
-            prefix=f"{prefix}.proj" if prefix else "",
-        )
-        self.approximate = approximate
-
-    def forward(self, hidden_states):
-        hidden_states = self.proj(hidden_states)
-        return F.gelu(hidden_states[0], approximate=self.approximate)
-
-
-class FeedForward(nn.Module):
-    r"""
-    A feed-forward layer.
-
-    Parameters:
-        dim (`int`): The number of channels in the input.
-        dim_out (`int`, *optional*): The number of channels in the output. If not given, defaults to `dim`.
-        mult (`int`, *optional*, defaults to 4): The multiplier to use for the hidden dimension.
-        activation_fn (`str`, *optional*, defaults to `"geglu"`): Activation function to be used in feed-forward.
-        bias (`bool`, defaults to True): Whether to use a bias in the linear layer.
-        quant_config: Quantization configure.
-        prefix: The name of the layer in the state dict.
-    """
-
-    def __init__(
-        self,
-        dim: int,
-        dim_out: Optional[int] = None,
-        mult: int = 4,
-        activation_fn: str = "geglu",
-        inner_dim=None,
-        bias: bool = True,
-        quant_config=None,
-        prefix: str = "",
-    ):
-        super().__init__()
-        if inner_dim is None:
-            inner_dim = int(dim * mult)
-        dim_out = dim_out if dim_out is not None else dim
-
-        if activation_fn == "gelu":
-            act_fn = GELU(dim, inner_dim, bias=bias, quant_config=None, prefix=prefix)
-        if activation_fn == "gelu-approximate":
-            act_fn = GELU(
-                dim,
-                inner_dim,
-                approximate="tanh",
-                bias=bias,
-                quant_config=None,
-                prefix=prefix,
-            )
-        else:
-            raise NotImplementedError(
-                f"activation_fn '{activation_fn}' is not supported."
-            )
-
-        self.net = nn.ModuleList([])
-        self.net.append(act_fn)
-        self.net.append(nn.Identity())
-        self.net.append(
-            RowParallelLinear(
-                inner_dim,
-                dim_out,
-                bias=True,
-                input_is_parallel=True,
-                quant_config=None,
-            )
-        )
-
-    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
-        for module in self.net:
-            hidden_states = module(hidden_states)
-        return hidden_states
-
-
 class QwenTimestepProjEmbeddings(nn.Module):
     def __init__(self, embedding_dim, use_additional_t_cond=False):
         super().__init__()
@@ -624,27 +520,9 @@ def __init__(
             )
         else:
             # Use separate Q/K/V projections for non-quantized models
-            self.to_q = ColumnParallelLinear(
-                dim,
-                self.inner_dim,
-                bias=True,
-                quant_config=quant_config,
-                prefix=f"{prefix}.to_q",
-            )
-            self.to_k = ColumnParallelLinear(
-                dim,
-                self.inner_dim,
-                bias=True,
-                quant_config=quant_config,
-                prefix=f"{prefix}.to_k",
-            )
-            self.to_v = ColumnParallelLinear(
-                dim,
-                self.inner_dim,
-                bias=True,
-                quant_config=quant_config,
-                prefix=f"{prefix}.to_v",
-            )
+            self.to_q = ReplicatedLinear(dim, self.inner_dim, bias=True)
+            self.to_k = ReplicatedLinear(dim, self.inner_dim, bias=True)
+            self.to_v = ReplicatedLinear(dim, self.inner_dim, bias=True)
 
         if self.qk_norm:
             self.norm_q = RMSNorm(head_dim, eps=eps) if qk_norm else nn.Identity()
@@ -662,51 +540,25 @@ def __init__(
                 )
             else:
                 # Use separate Q/K/V projections for non-quantized models
-                self.add_q_proj = ColumnParallelLinear(
-                    added_kv_proj_dim,
-                    self.inner_dim,
-                    bias=True,
-                    quant_config=quant_config,
-                    prefix=f"{prefix}.add_q_proj",
+                self.add_q_proj = ReplicatedLinear(
+                    added_kv_proj_dim, self.inner_dim, bias=True
                 )
-                self.add_k_proj = ColumnParallelLinear(
-                    added_kv_proj_dim,
-                    self.inner_dim,
-                    bias=True,
-                    quant_config=quant_config,
-                    prefix=f"{prefix}.add_k_proj",
+                self.add_k_proj = ReplicatedLinear(
+                    added_kv_proj_dim, self.inner_dim, bias=True
                 )
-                self.add_v_proj = ColumnParallelLinear(
-                    added_kv_proj_dim,
-                    self.inner_dim,
-                    bias=True,
-                    quant_config=quant_config,
-                    prefix=f"{prefix}.add_v_proj",
+                self.add_v_proj = ReplicatedLinear(
+                    added_kv_proj_dim, self.inner_dim, bias=True
                 )
 
         if context_pre_only is not None and not context_pre_only:
-            self.to_add_out = ColumnParallelLinear(
-                self.inner_dim,
-                self.dim,
-                bias=out_bias,
-                gather_output=True,
-                quant_config=quant_config,
-                prefix=f"{prefix}.to_add_out",
-            )
+            self.to_add_out = ReplicatedLinear(self.inner_dim, self.dim, bias=out_bias)
         else:
             self.to_add_out = None
 
         if not pre_only:
             self.to_out = nn.ModuleList([])
             self.to_out.append(
-                ColumnParallelLinear(
-                    self.inner_dim,
-                    self.dim,
-                    bias=out_bias,
-                    gather_output=True,
-                    quant_config=quant_config,
-                    prefix=f"{prefix}.to_out.0",
-                )
+                ReplicatedLinear(self.inner_dim, self.dim, bias=out_bias)
             )
         else:
             self.to_out = None
@@ -848,13 +700,8 @@ def __init__(
         # Image processing modules
         self.img_mod = nn.Sequential(
             nn.SiLU(),
-            ColumnParallelLinear(
-                dim,
-                6 * dim,
-                bias=True,
-                gather_output=True,
-                quant_config=mod_quant_config,
-                prefix=f"{prefix}.img_mod",
+            nn.Linear(
+                dim, 6 * dim, bias=True
             ),  # For scale, shift, gate for norm1 and norm2
         )
         self.img_norm1 = LayerNormScaleShift(
@@ -877,13 +724,8 @@ def __init__(
         # Text processing modules
         self.txt_mod = nn.Sequential(
             nn.SiLU(),
-            ColumnParallelLinear(
-                dim,
-                6 * dim,
-                bias=True,
-                gather_output=True,
-                quant_config=mod_quant_config,
-                prefix=f"{prefix}.txt_mod",
+            nn.Linear(
+                dim, 6 * dim, bias=True
             ),  # For scale, shift, gate for norm1 and norm2
         )
         self.txt_norm1 = LayerNormScaleShift(
@@ -919,15 +761,11 @@ def __init__(
                 dim=dim,
                 dim_out=dim,
                 activation_fn="gelu-approximate",
-                quant_config=quant_config,
-                prefix=f"{prefix}.img_mlp",
             )
             self.txt_mlp = FeedForward(
                 dim=dim,
                 dim_out=dim,
                 activation_fn="gelu-approximate",
-                quant_config=quant_config,
-                prefix=f"{prefix}.txt_mlp",
             )
 
         if nunchaku_enabled:
@@ -1043,8 +881,8 @@ def forward(
         modulate_index: Optional[List[int]] = None,
     ) -> Tuple[torch.Tensor, torch.Tensor]:
         # Get modulation parameters for both streams
-        img_mod_params = self.img_mod[1](temb_img_silu)[0]  # [B, 6*dim]
-        txt_mod_params = self.txt_mod[1](temb_txt_silu)[0]  # [B, 6*dim]
+        img_mod_params = self.img_mod[1](temb_img_silu)  # [B, 6*dim]
+        txt_mod_params = self.txt_mod[1](temb_txt_silu)  # [B, 6*dim]
 
         if (
             self.quant_config is not None
@@ -1107,7 +945,7 @@ def forward(
             gate_x=img_gate1,
             residual_x=hidden_states,
         )
-        img_mlp_output = self.img_mlp(img_modulated2)[0]
+        img_mlp_output = self.img_mlp(img_modulated2)
 
         if img_mlp_output.dim() == 2:
             img_mlp_output = img_mlp_output.unsqueeze(0)
@@ -1123,7 +961,7 @@ def forward(
             scale=txt_scale2,
         )
         txt_gate2 = txt_gate2_raw.unsqueeze(1)
-        txt_mlp_output = self.txt_mlp(txt_modulated2)[0]
+        txt_mlp_output = self.txt_mlp(txt_modulated2)
 
         if txt_mlp_output.dim() == 2:
             txt_mlp_output = txt_mlp_output.unsqueeze(0)