Commit 23e8f8b

Change CLIP dtype management in llama.py
It is probably safer to keep CLIP at its original precision (e.g., fp16) regardless of the autocast setting: some casts (e.g., fp16 to bf16) are lossy and can potentially harm the pre-trained model. The change is limited to llama.py for now, since a lot of copy-pasted code may be refactored in the future (#3).
1 parent b93d0b6 commit 23e8f8b


accessory/model/LLM/llama.py

Lines changed: 4 additions & 2 deletions
@@ -364,8 +364,10 @@ def clip_encode_image(self, x):


     def encode_image(self, image):
-        # return self.patch_embed(image)
-        image_tokens = self.clip_encode_image(image)
+        with torch.cuda.amp.autocast(enabled=False):
+            image = image.half()
+            image_tokens = self.clip_encode_image(image)
+            image = image.to(self.clip_proj.weight.dtype)
         image_tokens = self.clip_proj_norm(self.clip_proj(image_tokens))
         return image_tokens

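For reference, below is a minimal, self-contained sketch of the dtype pattern the diff applies; it is not the repository's code. Autocast is disabled around a pre-trained fp16 sub-module so its weights keep their native precision (fp16 carries 10 explicit mantissa bits versus bf16's 7, so an fp16-to-bf16 cast rounds information away), and only the sub-module's output is cast to the dtype of the next layer. The names clip_gain, proj, and encode are hypothetical stand-ins for the CLIP encoder, self.clip_proj, and encode_image.

import torch

# Hypothetical stand-ins: fp16 "CLIP" weights and a bf16 projection layer in
# the role of self.clip_proj. The fp16 part uses an element-wise op so the
# sketch also runs on CPU-only PyTorch builds.
clip_gain = torch.randn(8, dtype=torch.float16)
proj = torch.nn.Linear(8, 4, dtype=torch.bfloat16)

def encode(image: torch.Tensor) -> torch.Tensor:
    # Keep autocast from re-casting this region: the fp16 "CLIP" weights are
    # used at their original precision.
    with torch.cuda.amp.autocast(enabled=False):
        tokens = image.half() * clip_gain
    # Cast only the output to whatever dtype the next layer was built with.
    tokens = tokens.to(proj.weight.dtype)
    return proj(tokens)

print(encode(torch.randn(8)).dtype)  # torch.bfloat16

Casting the output rather than the CLIP module itself means the pre-trained fp16 weights are never converted to bf16, which is the point of the change.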