import torch.nn as nn

from timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD, OPENAI_CLIP_MEAN, OPENAI_CLIP_STD
-from timm.layers import trunc_normal_, AvgPool2dSame, DropPath, calculate_drop_path_rates, Mlp, GlobalResponseNormMlp, \
-    LayerNorm2d, LayerNorm, RmsNorm2d, RmsNorm, create_conv2d, get_act_layer, get_norm_layer, make_divisible, to_ntuple
-from timm.layers import SimpleNorm2d, SimpleNorm
-from timm.layers import NormMlpClassifierHead, ClassifierHead
+from timm.layers import (
+    trunc_normal_,
+    AvgPool2dSame,
+    DropPath,
+    calculate_drop_path_rates,
+    Mlp,
+    GlobalResponseNormMlp,
+    LayerNorm2d,
+    LayerNorm,
+    RmsNorm2d,
+    RmsNorm,
+    SimpleNorm2d,
+    SimpleNorm,
+    create_conv2d,
+    get_act_layer,
+    get_norm_layer,
+    make_divisible,
+    to_ntuple,
+    NormMlpClassifierHead,
+    ClassifierHead,
+)
from ._builder import build_model_with_cfg
from ._features import feature_take_indices
from ._manipulate import named_apply, checkpoint_seq
class Downsample(nn.Module):
    """Downsample module for ConvNeXt."""

-    def __init__(self, in_chs: int, out_chs: int, stride: int = 1, dilation: int = 1) -> None:
+    def __init__(
+            self,
+            in_chs: int,
+            out_chs: int,
+            stride: int = 1,
+            dilation: int = 1,
+            device=None,
+            dtype=None,
+    ) -> None:
        """Initialize Downsample module.

        Args:
@@ -68,6 +93,7 @@ def __init__(self, in_chs: int, out_chs: int, stride: int = 1, dilation: int = 1
            stride: Stride for downsampling.
            dilation: Dilation rate.
        """
+        dd = {'device': device, 'dtype': dtype}
        super().__init__()
        avg_stride = stride if dilation == 1 else 1
        if stride > 1 or dilation > 1:
@@ -77,7 +103,7 @@ def __init__(self, in_chs: int, out_chs: int, stride: int = 1, dilation: int = 1
            self.pool = nn.Identity()

        if in_chs != out_chs:
-            self.conv = create_conv2d(in_chs, out_chs, 1, stride=1)
+            self.conv = create_conv2d(in_chs, out_chs, 1, stride=1, **dd)
        else:
            self.conv = nn.Identity()

@@ -115,6 +141,8 @@ def __init__(
            act_layer: Union[str, Callable] = 'gelu',
            norm_layer: Optional[Callable] = None,
            drop_path: float = 0.,
+            device=None,
+            dtype=None,
    ):
        """

@@ -133,6 +161,7 @@ def __init__(
            norm_layer: Normalization layer (defaults to LN if not specified).
            drop_path: Stochastic depth probability.
        """
+        dd = {'device': device, 'dtype': dtype}
        super().__init__()
        out_chs = out_chs or in_chs
        dilation = to_ntuple(2)(dilation)
@@ -149,12 +178,18 @@ def __init__(
            dilation=dilation[0],
            depthwise=True,
            bias=conv_bias,
+            **dd,
+        )
+        self.norm = norm_layer(out_chs, **dd)
+        self.mlp = mlp_layer(
+            out_chs,
+            int(mlp_ratio * out_chs),
+            act_layer=act_layer,
+            **dd,
        )
-        self.norm = norm_layer(out_chs)
-        self.mlp = mlp_layer(out_chs, int(mlp_ratio * out_chs), act_layer=act_layer)
-        self.gamma = nn.Parameter(ls_init_value * torch.ones(out_chs)) if ls_init_value is not None else None
+        self.gamma = nn.Parameter(ls_init_value * torch.ones(out_chs, **dd)) if ls_init_value is not None else None
        if in_chs != out_chs or stride != 1 or dilation[0] != dilation[1]:
-            self.shortcut = Downsample(in_chs, out_chs, stride=stride, dilation=dilation[0])
+            self.shortcut = Downsample(in_chs, out_chs, stride=stride, dilation=dilation[0], **dd)
        else:
            self.shortcut = nn.Identity()
        self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
@@ -196,7 +231,9 @@ def __init__(
            use_grn: bool = False,
            act_layer: Union[str, Callable] = 'gelu',
            norm_layer: Optional[Callable] = None,
-            norm_layer_cl: Optional[Callable] = None
+            norm_layer_cl: Optional[Callable] = None,
+            device=None,
+            dtype=None,
    ) -> None:
        """Initialize ConvNeXt stage.

@@ -216,14 +253,15 @@ def __init__(
            norm_layer: Normalization layer.
            norm_layer_cl: Normalization layer for channels last.
        """
+        dd = {'device': device, 'dtype': dtype}
        super().__init__()
        self.grad_checkpointing = False

        if in_chs != out_chs or stride > 1 or dilation[0] != dilation[1]:
            ds_ks = 2 if stride > 1 or dilation[0] != dilation[1] else 1
            pad = 'same' if dilation[1] > 1 else 0  # same padding needed if dilation used
            self.downsample = nn.Sequential(
-                norm_layer(in_chs),
+                norm_layer(in_chs, **dd),
                create_conv2d(
                    in_chs,
                    out_chs,
@@ -232,6 +270,7 @@ def __init__(
                    dilation=dilation[0],
                    padding=pad,
                    bias=conv_bias,
+                    **dd,
                ),
            )
            in_chs = out_chs
@@ -253,6 +292,7 @@ def __init__(
                use_grn=use_grn,
                act_layer=act_layer,
                norm_layer=norm_layer if conv_mlp else norm_layer_cl,
+                **dd,
            ))
            in_chs = out_chs
        self.blocks = nn.Sequential(*stage_blocks)
@@ -324,6 +364,8 @@ def __init__(
            norm_eps: Optional[float] = None,
            drop_rate: float = 0.,
            drop_path_rate: float = 0.,
+            device=None,
+            dtype=None,
    ):
        """
        Args:
@@ -349,6 +391,7 @@ def __init__(
            drop_path_rate: Stochastic depth drop rate.
        """
        super().__init__()
+        dd = {'device': device, 'dtype': dtype}
        assert output_stride in (8, 16, 32)
        kernel_sizes = to_ntuple(4)(kernel_sizes)
        norm_layer, norm_layer_cl = _get_norm_layers(norm_layer, conv_mlp, norm_eps)
@@ -362,17 +405,17 @@ def __init__(
        if stem_type == 'patch':
            # NOTE: this stem is a minimal form of ViT PatchEmbed, as used in SwinTransformer w/ patch_size = 4
            self.stem = nn.Sequential(
-                nn.Conv2d(in_chans, dims[0], kernel_size=patch_size, stride=patch_size, bias=conv_bias),
-                norm_layer(dims[0]),
+                nn.Conv2d(in_chans, dims[0], kernel_size=patch_size, stride=patch_size, bias=conv_bias, **dd),
+                norm_layer(dims[0], **dd),
            )
            stem_stride = patch_size
        else:
            mid_chs = make_divisible(dims[0] // 2) if 'tiered' in stem_type else dims[0]
            self.stem = nn.Sequential(*filter(None, [
-                nn.Conv2d(in_chans, mid_chs, kernel_size=3, stride=2, padding=1, bias=conv_bias),
+                nn.Conv2d(in_chans, mid_chs, kernel_size=3, stride=2, padding=1, bias=conv_bias, **dd),
                act_layer() if 'act' in stem_type else None,
-                nn.Conv2d(mid_chs, dims[0], kernel_size=3, stride=2, padding=1, bias=conv_bias),
-                norm_layer(dims[0]),
+                nn.Conv2d(mid_chs, dims[0], kernel_size=3, stride=2, padding=1, bias=conv_bias, **dd),
+                norm_layer(dims[0], **dd),
            ]))
            stem_stride = 4

@@ -406,6 +449,7 @@ def __init__(
                act_layer=act_layer,
                norm_layer=norm_layer,
                norm_layer_cl=norm_layer_cl,
+                **dd,
            ))
            prev_chs = out_chs
            # NOTE feature_info use currently assumes stage 0 == stride 1, rest are stride 2
@@ -417,12 +461,13 @@ def __init__(
        # otherwise pool -> norm -> fc, the default ConvNeXt ordering (pretrained FB weights)
        if head_norm_first:
            assert not head_hidden_size
-            self.norm_pre = norm_layer(self.num_features)
+            self.norm_pre = norm_layer(self.num_features, **dd)
            self.head = ClassifierHead(
                self.num_features,
                num_classes,
                pool_type=global_pool,
                drop_rate=self.drop_rate,
+                **dd,
            )
        else:
            self.norm_pre = nn.Identity()
@@ -434,6 +479,7 @@ def __init__(
                drop_rate=self.drop_rate,
                norm_layer=norm_layer,
                act_layer='gelu',
+                **dd,
            )
        self.head_hidden_size = self.head.num_features
        named_apply(partial(_init_weights, head_init_scale=head_init_scale), self)
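
Taken together, these changes thread the standard PyTorch factory kwargs (`device`, `dtype`) through every ConvNeXt submodule via a shared `dd` dict, so parameters are allocated directly with the requested placement rather than created in default float32 and converted afterwards. Below is a minimal usage sketch, assuming the new kwargs are exposed end to end on the `ConvNeXt` constructor and forwarded to the norm and head layers it builds, as the diff above suggests:

import torch
from timm.models.convnext import ConvNeXt

# Sketch only: relies on the device/dtype kwargs added in this diff being accepted
# by ConvNeXt and passed down to the stem, stages, blocks, and classifier head.
model = ConvNeXt(
    depths=(3, 3, 9, 3),        # ConvNeXt-T layout
    dims=(96, 192, 384, 768),
    num_classes=1000,
    device='cpu',               # or 'cuda' / 'meta'
    dtype=torch.bfloat16,       # parameters materialize directly in bfloat16
)
assert all(p.dtype == torch.bfloat16 for p in model.parameters())

The same plumbing is also what a `device='meta'` construction path would rely on for deferred initialization, since no submodule falls back to allocating real float32 storage.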