mlfoundations · rwightman · Feb 17, 2023
diff --git a/src/open_clip/model.py b/src/open_clip/model.py
@@ -11,12 +11,12 @@
 import torch
 import torch.nn.functional as F
 from torch import nn
-from torch.utils.checkpoint import checkpoint
 
 from .hf_model import HFTextEncoder
 from .modified_resnet import ModifiedResNet
 from .timm_model import TimmModel
-from .transformer import LayerNormFp32, LayerNorm, QuickGELU, Attention, VisionTransformer, TextTransformer
+from .transformer import LayerNormFp32, LayerNorm, QuickGELU, Attention, VisionTransformer, TextTransformer, \
+    to_autocast_dtype
 from .utils import to_2tuple
 
 
@@ -217,12 +217,14 @@ def encode_text(self, text, normalize: bool = False):
         cast_dtype = self.transformer.get_cast_dtype()
 
         x = self.token_embedding(text).to(cast_dtype)  # [batch_size, n_ctx, d_model]
-
         x = x + self.positional_embedding.to(cast_dtype)
+        x = to_autocast_dtype(x)
+
         x = x.permute(1, 0, 2)  # NLD -> LND
         x = self.transformer(x, attn_mask=self.attn_mask)
         x = x.permute(1, 0, 2)  # LND -> NLD
         x = self.ln_final(x)  # [batch_size, n_ctx, transformer.width]
+
         # take features from the eot embedding (eot_token is the highest number in each sequence)
         x = x[torch.arange(x.shape[0]), text.argmax(dim=-1)] @ self.text_projection
         return F.normalize(x, dim=-1) if normalize else x

diff --git a/src/open_clip/transformer.py b/src/open_clip/transformer.py
@@ -18,7 +18,6 @@ def forward(self, x: torch.Tensor):
         x = F.layer_norm(x.to(torch.float32), self.normalized_shape, self.weight, self.bias, self.eps)
         return x.to(orig_type)
 
-
 class LayerNorm(nn.LayerNorm):
     """Subclass torch's LayerNorm (with cast back to input dtype)."""
 
@@ -501,6 +500,28 @@ def forward(self, x: torch.Tensor):
         return pooled
 
 
+def to_autocast_dtype(x: torch.Tensor) -> torch.Tensor:
+    """Cast ``x`` to the autocast dtype that is active for its device.
+
+    Args:
+        x: Tensor to convert.
+
+    Returns:
+        ``x`` converted to the CPU autocast dtype when it lives on the CPU and
+        CPU autocast is enabled, or to the GPU autocast dtype when it lives on
+        any other device and GPU autocast is enabled; otherwise ``x`` itself.
+    """
+    if x.device.type == 'cpu':
+        # A CPU tensor only follows the CPU autocast state; casting it to the
+        # GPU autocast dtype would not match ops that still run un-autocast.
+        if torch.is_autocast_cpu_enabled():
+            return x.to(torch.get_autocast_cpu_dtype())
+    elif torch.is_autocast_enabled():
+        return x.to(torch.get_autocast_gpu_dtype())
+    # NOTE this doesn't cover possible xpu / hpu + autocast use
+    return x
+
+
 class TextTransformer(nn.Module):
     output_tokens: torch.jit.Final[bool]
 
@@ -607,6 +628,8 @@ def forward(self, text):
             attn_mask = attn_mask[None, :seq_len, :seq_len] + cls_mask[:, :seq_len, :seq_len]
 
         x = x + self.positional_embedding[:seq_len].to(cast_dtype)
+        x = to_autocast_dtype(x)
+
         x = x.permute(1, 0, 2)  # NLD -> LND
         x = self.transformer(x, attn_mask=attn_mask)
         x = x.permute(1, 0, 2)  # LND -> NLD