
Commit b93d0b6

Refactor tensor creation dtype / device control.
This commit makes two changes to model creation:

1. Decouple promote_trainable_params_to_fp32 from the model __init__, so that inference-only mode can skip the fp32 cast and save memory (#4).

2. Use a context manager to control the default tensor type. Previously, the default tensor type was reset to torch.FloatTensor after creating the vision model, which is technically incorrect: it should restore whatever the previous default was. We implement our own context manager because the official ones are incomplete as of PyTorch 2.0.1: no context manager is provided for the default dtype, and set_default_device has no effect on the torch.Tensor constructor calls used in fairscale.
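For reference, the resulting creation pattern in the training scripts (a condensed sketch of the diffs below; the dtype shown is just one of the supported choices) is roughly:

# Build the model under low-precision, on-device defaults to save memory...
with default_tensor_type(dtype=torch.bfloat16, device="cuda"):
    model = MetaModel(args.llama_type, args.llama_config,
                      args.tokenizer_path, with_visual=not args.no_visual)
# ...then, only when training, promote the trainable parameters back to fp32.
promote_trainable_params_to_fp32(model)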
1 parent 0043e8f commit b93d0b6

File tree

8 files changed (+103, -38 lines)


accessory/demos/multi_turn.py

Lines changed: 7 additions & 7 deletions
@@ -15,6 +15,7 @@
 import gradio as gr
 
 from util.misc import setup_for_distributed, load_pretrained
+from util.tensor_type import default_tensor_type
 from model.meta import MetaModel
 from data.conversation.lib import conv_templates, SeparatorStyle
 
@@ -50,17 +51,16 @@ def model_worker(
     # set the print behavior.
     setup_for_distributed(rank == 0)
 
-    model = MetaModel(
-        args.llama_type, args.llama_config, args.tokenizer_path,
-        with_visual=False, max_seq_len=args.model_max_seq_len,
-    )
     target_dtype = {
        "bf16": torch.bfloat16,
        "fp16": torch.float16,
     }[args.dtype]
-    for n, p in model.named_parameters():
-        p.data = p.data.to(target_dtype)
-    model.cuda().eval()
+    with default_tensor_type(dtype=target_dtype, device="cuda"):
+        model = MetaModel(
+            args.llama_type, args.llama_config, args.tokenizer_path,
+            with_visual=False, max_seq_len=args.model_max_seq_len,
+        )
+    model.eval()
     print(f"Loading pretrained weights from {args.pretrained_path}")
     load_pretrained(args.pretrained_path, args.pretrained_type, model)
     print(f"Model = {str(model)}")

accessory/main_finetune.py

Lines changed: 10 additions & 7 deletions
@@ -32,6 +32,7 @@
 
 import util.misc as misc
 from util.misc import NativeScalerWithGradNormCount as NativeScaler
+from util.tensor_type import default_tensor_type, promote_trainable_params_to_fp32
 from model.meta import MetaModel
 from engine_finetune import train_one_epoch
 from torch.utils.data import Dataset
@@ -150,8 +151,15 @@ def main(args):
     dp_group = fs_init.get_data_parallel_group()
 
     # define the model
-    model = MetaModel(args.llama_type, args.llama_config,
-                      args.tokenizer_path, with_visual=not args.no_visual)
+    mixed_precision_dtype = {
+        "fp16": torch.float16,
+        "bf16": torch.bfloat16,
+        "tf32": torch.float32,
+    }[args.precision]
+    with default_tensor_type(dtype=mixed_precision_dtype, device="cuda"):
+        model = MetaModel(args.llama_type, args.llama_config,
+                          args.tokenizer_path, with_visual=not args.no_visual)
+    promote_trainable_params_to_fp32(model)
     print(f"load pretrained from {args.pretrained_path}")
     misc.load_pretrained(args.pretrained_path, args.pretrained_type, model)
     print("Unwrapped Model = %s" % str(model))
@@ -160,11 +168,6 @@ def main(args):
     if args.resume:
         misc.resume_stage1(args, model_without_FSDP=model)
 
-    mixed_precision_dtype = {
-        "fp16": torch.float16,
-        "bf16": torch.bfloat16,
-        "tf32": torch.float32,
-    }[args.precision]
     TransformerBlock = type(model.llma.layers[0])
     # ignored_named_parameters = {name: param for name, param in model.named_parameters() if not param.requires_grad}
     # print(ignored_named_parameters.keys())

accessory/main_pretrain.py

Lines changed: 11 additions & 7 deletions
@@ -32,6 +32,7 @@
 
 import util.misc as misc
 from util.misc import NativeScalerWithGradNormCount as NativeScaler
+from util.tensor_type import default_tensor_type, promote_trainable_params_to_fp32
 from model.meta import MetaModel
 from engine_pretrain import train_one_epoch, val_one_epoch
 from torch.utils.data import Dataset
@@ -147,8 +148,15 @@ def main(args):
     dp_group = fs_init.get_data_parallel_group()
 
     # define the model
-    model = MetaModel(args.llama_type, args.llama_config,
-                      args.tokenizer_path, with_visual=False)
+    mixed_precision_dtype = {
+        "fp16": torch.float16,
+        "bf16": torch.bfloat16,
+        "tf32": torch.float32,
+    }[args.precision]
+    with default_tensor_type(dtype=mixed_precision_dtype, device="cuda"):
+        model = MetaModel(args.llama_type, args.llama_config,
+                          args.tokenizer_path, with_visual=False)
+    promote_trainable_params_to_fp32(model)
     if args.pretrained_path:
         print(f"load pretrained from {args.pretrained_path}")
         misc.load_pretrained(args.pretrained_path, args.pretrained_type, model)
@@ -158,11 +166,7 @@ def main(args):
     if args.resume:
         misc.resume_stage1(args, model_without_FSDP=model)
 
-    mixed_precision_dtype = {
-        "fp16": torch.float16,
-        "bf16": torch.bfloat16,
-        "tf32": torch.float32,
-    }[args.precision]
+
     TransformerBlock = type(model.llma.layers[0])
 
     model = FSDP(

accessory/model/LLM/llama.py

Lines changed: 3 additions & 5 deletions
@@ -20,6 +20,7 @@
 from apex.normalization import FusedRMSNorm as RMSNorm
 import open_clip
 
+from util.tensor_type import default_tensor_type
 import configs.global_configs
 if configs.global_configs.USE_FLASH_ATTENTION:
     from flash_attn import flash_attn_func
@@ -308,9 +309,8 @@ def __init__(self, params: ModelArgs, with_visual=False):
         self.cache_image_words = 0 # for inference
         if with_visual:
             print("build llama model with clip")
-            torch.set_default_tensor_type(torch.cuda.HalfTensor)
-            self.clip, _, _ = open_clip.create_model_and_transforms('ViT-L-14', pretrained='openai')
-            torch.set_default_tensor_type(torch.FloatTensor)
+            with default_tensor_type(dtype=torch.half):
+                self.clip, _, _ = open_clip.create_model_and_transforms('ViT-L-14', pretrained='openai')
             for name, param in self.clip.named_parameters():
                 param.requires_grad = False
             in_dim = self.clip.visual.proj.shape[1]
@@ -334,9 +334,7 @@ def get_trainable_params(self):
     def set_default_trainability(self):
         for key, value in self.named_parameters():
             value.requires_grad = False
-            value.data = value.data.half()
         for key, value in self.get_trainable_params().items():
-            value.data = value.data.float()
             value.requires_grad = True
 
 
accessory/model/LLM/llama_adapter.py

Lines changed: 3 additions & 5 deletions
@@ -24,6 +24,7 @@
 import configs.global_configs
 if configs.global_configs.USE_FLASH_ATTENTION:
     from flash_attn import flash_attn_func
+from util.tensor_type import default_tensor_type
 
 default_linear_init = functools.partial(nn.init.kaiming_uniform_, a=math.sqrt(5))
 
@@ -349,9 +350,8 @@ def __init__(self, params: ModelArgs, with_visual=False):
         self.image_words = 0
         if with_visual:
             print("build llama model with clip")
-            torch.set_default_tensor_type(torch.cuda.HalfTensor)
-            self.clip, _, _ = open_clip.create_model_and_transforms('ViT-L-14', pretrained='openai')
-            torch.set_default_tensor_type(torch.FloatTensor)
+            with default_tensor_type(dtype=torch.half):
+                self.clip, _, _ = open_clip.create_model_and_transforms('ViT-L-14', pretrained='openai')
             for name, param in self.clip.named_parameters():
                 param.requires_grad = False
             in_dim = self.clip.visual.proj.shape[1]
@@ -401,9 +401,7 @@ def get_trainable_params(self):
     def set_default_trainability(self):
         for key, value in self.named_parameters():
             value.requires_grad = False
-            value.data = value.data.half()
         for key, value in self.get_trainable_params().items():
-            value.data = value.data.float()
             value.requires_grad = True
 
 
accessory/model/LLM/llama_peft.py

Lines changed: 3 additions & 5 deletions
@@ -17,6 +17,7 @@
     ColumnParallelLinear
 )
 from ..peft import LoraColumnParallelLinear, LoraRowParallelLinear
+from util.tensor_type import default_tensor_type
 
 from apex.normalization import FusedRMSNorm as RMSNorm
 import open_clip
@@ -323,9 +324,8 @@ def __init__(self, params: ModelArgs, with_visual=False):
         self.cache_image_words = 0 # for inference
         if with_visual:
             print("build llama model with clip")
-            torch.set_default_tensor_type(torch.cuda.HalfTensor)
-            self.clip, _, _ = open_clip.create_model_and_transforms('ViT-L-14', pretrained='openai')
-            torch.set_default_tensor_type(torch.FloatTensor)
+            with default_tensor_type(dtype=torch.half):
+                self.clip, _, _ = open_clip.create_model_and_transforms('ViT-L-14', pretrained='openai')
             for name, param in self.clip.named_parameters():
                 param.requires_grad = False
             in_dim = self.clip.visual.proj.shape[1]
@@ -351,9 +351,7 @@ def get_trainable_params(self):
     def set_default_trainability(self):
         for key, value in self.named_parameters():
             value.requires_grad = False
-            value.data = value.data.half()
         for key, value in self.get_trainable_params().items():
-            value.data = value.data.float()
             value.requires_grad = True
 
 
accessory/model/LLM/llama_qformerv2.py

Lines changed: 0 additions & 2 deletions
@@ -337,9 +337,7 @@ def get_trainable_params(self):
     def set_default_trainability(self):
         for key, value in self.named_parameters():
             value.requires_grad = False
-            value.data = value.data.half()
         for key, value in self.get_trainable_params().items():
-            value.data = value.data.float()
             value.requires_grad = True
 
 
accessory/util/tensor_type.py

Lines changed: 66 additions & 0 deletions
@@ -0,0 +1,66 @@
+from types import TracebackType
+from typing import Any, Optional
+import torch
+import torch.nn as nn
+
+
+class default_tensor_type:
+    _tensor_type_stack = [(torch.float, "cpu")]
+
+    def __init__(
+        self,
+        dtype: Optional[torch.dtype] = None,
+        device: Optional[str] = None,
+    ) -> None:
+        # Only limited combinations are supported.
+        assert device is None or device in ["cpu", "cuda"]
+        assert dtype is None or dtype in [torch.float, torch.bfloat16, torch.half]
+        self.dtype, self.device = dtype, device
+
+    def __enter__(self) -> None:
+        dtype, device = self.dtype, self.device
+        if dtype is None:
+            dtype = default_tensor_type._tensor_type_stack[-1][0]
+        if device is None:
+            device = default_tensor_type._tensor_type_stack[-1][1]
+        default_tensor_type._tensor_type_stack.append((dtype, device))
+
+        # We use all 3 calls since the new apis (set_default_device, set_default_dtype)
+        # seems to be ineffective sometimes (e.g., set_default_device is ineffective to
+        # torch.Tensor calls).
+        torch.set_default_tensor_type(default_tensor_type.get_tensor_type(dtype, device))
+        torch.set_default_device(device)
+        torch.set_default_dtype(dtype)
+
+    def __exit__(
+        self,
+        exc_type: Optional[type[BaseException]],
+        exc_val: Optional[BaseException],
+        exc_tb: Optional[TracebackType],
+    ) -> None:
+        default_tensor_type._tensor_type_stack.pop()
+        dtype, device = default_tensor_type._tensor_type_stack[-1]
+
+        torch.set_default_tensor_type(default_tensor_type.get_tensor_type(dtype, device))
+        torch.set_default_device(device)
+        torch.set_default_dtype(dtype)
+
+    @staticmethod
+    def get_tensor_type(dtype: torch.dtype, device: str) -> Any:
+        return {
+            (torch.float, "cpu"): torch.FloatTensor,
+            (torch.bfloat16, "cpu"): torch.BFloat16Tensor,
+            (torch.half, "cpu"): torch.HalfTensor,
+            (torch.float, "cuda"): torch.cuda.FloatTensor,
+            (torch.bfloat16, "cuda"): torch.cuda.BFloat16Tensor,
+            (torch.half, "cuda"): torch.cuda.HalfTensor,
+        }[(dtype, device)]
+
+
+def promote_trainable_params_to_fp32(model: nn.Module) -> None:
+    for param in model.parameters():
+        if param.requires_grad:
+            if param.is_floating_point() and torch.finfo(param.dtype).bits < 32:
+                param.data = param.data.float()
+            if param.is_complex() and torch.finfo(param.dtype).bits < 32:
+                param.data = param.data.to(torch.complex64)
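A minimal usage sketch of the nesting behavior (illustrative values; assumes a CUDA device is available): on exit, the manager restores whatever default was active before it, rather than unconditionally resetting to torch.FloatTensor.

import torch
from util.tensor_type import default_tensor_type

with default_tensor_type(dtype=torch.bfloat16, device="cuda"):
    a = torch.empty(2)           # bf16 on cuda
    with default_tensor_type(dtype=torch.half):
        b = torch.empty(2)       # fp16; device inherited from the enclosing scope (cuda)
    c = torch.empty(2)           # bf16 on cuda again: the previous default is restored
d = torch.empty(2)               # back to the global default (fp32 on cpu)

The class-level stack is what makes this restoration possible: each __exit__ pops its own entry and reapplies the one beneath it.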
