Skip to content

Commit c586b4b

Browse files
committed
replace F.pad with built-in padding in Conv3D
1 parent c5ce24f commit c586b4b

File tree

1 file changed

+3
-2
lines changed

1 file changed

+3
-2
lines changed

src/diffusers/models/autoencoders/autoencoder_kl_cogvideox.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -103,6 +103,7 @@ def __init__(
103103
self.width_pad = width_pad
104104
self.time_pad = time_pad
105105
self.time_causal_padding = (width_pad, width_pad, height_pad, height_pad, time_pad, 0)
106+
self.const_padding_conv3d = (0, self.width_pad, self.height_pad)
106107

107108
self.temporal_dim = 2
108109
self.time_kernel_size = time_kernel_size
@@ -115,6 +116,8 @@ def __init__(
115116
kernel_size=kernel_size,
116117
stride=stride,
117118
dilation=dilation,
119+
padding = 0 if self.pad_mode == 'replicate' else self.const_padding_conv3d,
120+
padding_mode = 'zeros',
118121
)
119122

120123
def fake_context_parallel_forward(
@@ -135,9 +138,7 @@ def forward(self, inputs: torch.Tensor, conv_cache: Optional[torch.Tensor] = Non
135138
if self.pad_mode == "replicate":
136139
conv_cache = None
137140
else:
138-
padding_2d = (self.width_pad, self.width_pad, self.height_pad, self.height_pad)
139141
conv_cache = inputs[:, :, -self.time_kernel_size + 1 :].clone()
140-
inputs = F.pad(inputs, padding_2d, mode="constant", value=0)
141142

142143
output = self.conv(inputs)
143144
return output, conv_cache

0 commit comments

Comments
 (0)