couple_channels; no_grad

crutcher · crutcher · commit f195cc32ac43 · 2025-08-25T15:01:31.000-07:00
diff --git a/timm/layers/drop.py b/timm/layers/drop.py
@@ -138,6 +138,7 @@ def drop_block_2d(
     with_noise: bool = False,
     inplace: bool = False,
     batchwise: bool = False,
+    couple_channels: bool = False,
     partial_edge_blocks: bool = False,
 ):
     """DropBlock. See https://arxiv.org/pdf/1810.12890.pdf
@@ -151,8 +152,10 @@ def drop_block_2d(
         gamma_scale: adjustment scale for the drop_prob.
         with_noise: should normal noise be added to the dropped region?
         inplace: if the drop should be applied in-place on the input tensor.
-        batchwise: should the entire batch use the same drop mask?
-        partial_edge_blocks: partial-blocks at the edges, faster.
+        batchwise: when true, the entire batch shares the same drop mask; much faster.
+        couple_channels: when true, channels share the same drop mask;
+          much faster, with significant semantic impact.
+        partial_edge_blocks: partial-blocks at the edges; minor speedup, minor semantic impact.
 
     Returns:
         If inplace, the modified `x`; otherwise, the dropped copy of `x`, on the same device.
@@ -165,45 +168,47 @@ def drop_block_2d(
     kh, kw = kernel
 
     # batchwise => one mask for whole batch, quite a bit faster
-    noise_shape = (1 if batchwise else B, C, H, W)
+    noise_shape = (1 if batchwise else B, 1 if couple_channels else C, H, W)
 
     gamma = (
         float(gamma_scale * drop_prob * H * W)
         / float(kh * kw)
         / float((H - kh + 1) * (W - kw + 1))
     )
 
-    drop_filter = drop_block_2d_drop_filter_(
-        kernel=kernel,
-        partial_edge_blocks=partial_edge_blocks,
-        inplace=True,
-        selection=torch.empty(
-            noise_shape,
-            dtype=x.dtype,
-            device=x.device,
-        ).bernoulli_(gamma),
-    )
-    keep_filter = 1.0 - drop_filter
+    with torch.no_grad():
+        drop_filter = drop_block_2d_drop_filter_(
+            kernel=kernel,
+            partial_edge_blocks=partial_edge_blocks,
+            inplace=True,
+            selection=torch.empty(
+                noise_shape,
+                dtype=x.dtype,
+                device=x.device,
+            ).bernoulli_(gamma),
+        )
+        keep_filter = 1.0 - drop_filter
 
     if with_noise:
         # x += (noise * drop_filter)
-        drop_noise = torch.randn_like(drop_filter)
-        drop_noise.mul_(drop_filter)
+        with torch.no_grad():
+            drop_noise = torch.randn_like(drop_filter)
+            drop_noise.mul_(drop_filter)
 
         if inplace:
             x.mul_(keep_filter)
             x.add_(drop_noise)
-
         else:
             x = x * keep_filter + drop_noise
 
     else:
         # x *= (size(keep_filter) / (sum(keep_filter) + eps))
-        count = keep_filter.numel()
-        total = keep_filter.to(dtype=torch.float32).sum()
-        keep_scale = count / total.add(1e-7).to(x.dtype)
+        with torch.no_grad():
+            count = keep_filter.numel()
+            total = keep_filter.to(dtype=torch.float32).sum()
+            keep_scale = count / total.add(1e-7).to(x.dtype)
 
-        keep_filter.mul_(keep_scale)
+            keep_filter.mul_(keep_scale)
 
         if inplace:
             x.mul_(keep_filter)
@@ -247,8 +252,10 @@ class DropBlock2d(nn.Module):
         gamma_scale: adjustment scale for the drop_prob.
         with_noise: should normal noise be added to the dropped region?
         inplace: if the drop should be applied in-place on the input tensor.
-        batchwise: should the entire batch use the same drop mask?
-        partial_edge_blocks: partial-blocks at the edges, faster.
+        batchwise: when true, the entire batch shares the same drop mask; much faster.
+        couple_channels: when true, channels share the same drop mask;
+          much faster, with significant semantic impact.
+        partial_edge_blocks: partial-blocks at the edges; minor speedup, minor semantic impact.
     """
 
     drop_prob: float
@@ -257,6 +264,7 @@ class DropBlock2d(nn.Module):
     with_noise: bool
     inplace: bool
     batchwise: bool
+    couple_channels: bool
     partial_edge_blocks: bool
 
     def __init__(
@@ -266,8 +274,9 @@ def __init__(
         gamma_scale: float = 1.0,
         with_noise: bool = False,
         inplace: bool = False,
-        batchwise: bool = False,
-        partial_edge_blocks: bool = True,
+        batchwise: bool = True,
+        couple_channels: bool = False,
+        partial_edge_blocks: bool = False,
     ):
         super(DropBlock2d, self).__init__()
         self.drop_prob = drop_prob
diff --git a/timm/models/resnet.py b/timm/models/resnet.py
@@ -326,9 +326,10 @@ def make_blocks(
         down_kernel_size: int = 1,
         avg_down: bool = False,
         drop_block_rate: float = 0.,
-        drop_path_rate: float = 0.,
-        drop_block_batchwise: bool = False,
+        drop_block_batchwise: bool = True,
+        drop_block_couple_channels: bool = False,
         drop_block_partial_edge_blocks: bool = True,
+        drop_path_rate: float = 0.,
         **kwargs,
 ) -> Tuple[List[Tuple[str, nn.Module]], List[Dict[str, Any]]]:
     """Create ResNet stages with specified block configurations.
@@ -343,8 +344,10 @@ def make_blocks(
         down_kernel_size: Kernel size for downsample layers.
         avg_down: Use average pooling for downsample.
         drop_block_rate: DropBlock drop rate.
-        drop_block_batchwise: Batchwise block dropping, faster.
-        drop_block_partial_edge_blocks: dropping produces partial blocks on the edge, faster.
+        drop_block_batchwise: Batchwise block dropping, much faster.
+        drop_block_couple_channels: Couple channel drops.
+        drop_block_partial_edge_blocks: Permit partial drop blocks on the edge,
+          slightly faster.
         drop_path_rate: Drop path rate for stochastic depth.
         **kwargs: Additional arguments passed to block constructors.
 
@@ -364,6 +367,7 @@ def make_blocks(
             drop_blocks(
                 drop_prob=drop_block_rate,
                 batchwise=drop_block_batchwise,
+                couple_channels=drop_block_couple_channels,
                 partial_edge_blocks=drop_block_partial_edge_blocks,
             ))):
         stage_name = f'layer{stage_idx + 1}'  # never liked this name, but weight compat requires it
@@ -465,10 +469,11 @@ def __init__(
             norm_layer: LayerType = nn.BatchNorm2d,
             aa_layer: Optional[Type[nn.Module]] = None,
             drop_rate: float = 0.0,
-            drop_path_rate: float = 0.,
             drop_block_rate: float = 0.,
             drop_block_batchwise: bool = True,
+            drop_block_couple_channels: bool = False,
             drop_block_partial_edge_blocks: bool = True,
+            drop_path_rate: float = 0.,
             zero_init_last: bool = True,
             block_args: Optional[Dict[str, Any]] = None,
     ):
@@ -497,10 +502,11 @@ def __init__(
             norm_layer (str, nn.Module): normalization layer
             aa_layer (nn.Module): anti-aliasing layer
             drop_rate (float): Dropout probability before classifier, for training (default 0.)
-            drop_path_rate (float): Stochastic depth drop-path rate (default 0.)
             drop_block_rate (float): Drop block rate (default 0.)
-            drop_block_batchwise (bool): Sample blocks batchwise, faster.
+            drop_block_batchwise (bool): Sample blocks batchwise, significantly faster.
+            drop_block_couple_channels (bool): couple channels when dropping blocks.
             drop_block_partial_edge_blocks (bool): Partial block dropping at the edges, faster.
+            drop_path_rate (float): Stochastic depth drop-path rate (default 0.)
             zero_init_last (bool): zero-init the last weight in residual path (usually last BN affine weight)
             block_args (dict): Extra kwargs to pass through to block module
         """
@@ -572,6 +578,7 @@ def __init__(
             aa_layer=aa_layer,
             drop_block_rate=drop_block_rate,
             drop_block_batchwise=drop_block_batchwise,
+            drop_block_couple_channels=drop_block_couple_channels,
             drop_block_partial_edge_blocks=drop_block_partial_edge_blocks,
             drop_path_rate=drop_path_rate,
             **block_args,
@@ -1459,8 +1466,8 @@ def resnet10t(pretrained: bool = False, **kwargs) -> ResNet:
     return _create_resnet('resnet10t', pretrained, **dict(model_args, **kwargs))
 
 @register_model
-def resnet10t_dropblock_correct(pretrained: bool = False, **kwargs) -> ResNet:
-    """Constructs a ResNet-10-T model with drop_block_rate=0.05, using the most accurate DropBlock2d features.
+def resnet10t_dropblock_slow(pretrained: bool = False, **kwargs) -> ResNet:
+    """Constructs a ResNet-10-T model with drop_block_rate=0.05, using the slowest DropBlock2d features.
     """
     model_args = dict(
         block=BasicBlock,
@@ -1469,7 +1476,8 @@ def resnet10t_dropblock_correct(pretrained: bool = False, **kwargs) -> ResNet:
         stem_type='deep_tiered',
         avg_down=True,
         drop_block_rate=0.05,
-        drop_block_batchwise=True,
+        drop_block_batchwise=False,
+        drop_block_couple_channels=False,
         drop_block_partial_edge_blocks=True,
     )
     return _create_resnet('resnet10t', pretrained, **dict(model_args, **kwargs))
@@ -1485,7 +1493,8 @@ def resnet10t_dropblock_fast(pretrained: bool = False, **kwargs) -> ResNet:
         stem_type='deep_tiered',
         avg_down=True,
         drop_block_rate=0.05,
-        drop_block_batchwise=False,
+        drop_block_batchwise=True,
+        drop_block_couple_channels=True,
         drop_block_partial_edge_blocks=False,
     )
     return _create_resnet('resnet10t', pretrained, **dict(model_args, **kwargs))