Skip to content
This repository was archived by the owner on Sep 10, 2025. It is now read-only.

Commit 215331d

Browse files
committed
3/n llava
1 parent 728fc46 commit 215331d

File tree

2 files changed

+49
-29
lines changed

2 files changed

+49
-29
lines changed

torchchat/model.py

Lines changed: 25 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -28,10 +28,20 @@
2828
from torchtune.models.flamingo import flamingo_decoder, flamingo_vision_encoder
2929
from torchtune.models.llama3_1._component_builders import llama3_1 as llama3_1_builder
3030
from torchtune.modules.model_fusion import DeepFusionModel
31+
from torchtune.models.clip import clip_vision_encoder
3132

3233
config_path = Path(f"{str(Path(__file__).parent)}/model_params")
3334

3435

36+
class QuickGELUActivation(nn.Module):
37+
"""
38+
Applies GELU approximation that is fast but somewhat inaccurate. See: https://github.com/hendrycks/GELUs
39+
"""
40+
41+
def forward(self, input):
42+
return input * torch.sigmoid(1.702 * input)
43+
44+
3545
def identity(**kwargs):
3646
if len(kwargs) != 1:
3747
raise ValueError("Only one argument is expected")
@@ -99,24 +109,25 @@ def forward(
99109
encoder_output = None
100110

101111
decoder_input = self._get_decoder_input(
102-
tokens, encoder_input=encoder_input, post_tokens=post_tokens
112+
tokens, encoder_output=encoder_output, post_tokens=post_tokens
103113
)
104114
return self.decoder(decoder_input)
105115

106116
def _get_decoder_input(
107117
self,
108118
tokens: Tensor,
109119
*,
110-
encoder_input: Optional[Tensor],
120+
encoder_output: Optional[Tensor],
111121
post_tokens: Optional[Tensor],
112122
):
113-
assert bool(encoder_input) == bool(
123+
assert bool(encoder_output) == bool(
114124
post_tokens
115125
), "encoder_input and post_tokens must be both None or not None"
116-
if encoder_input is None:
126+
if encoder_output is None:
117127
return self.tok_embeddings(tokens)
118128
else:
119129
pre_img_embed = self.tok_embeddings(tokens)
130+
image_embeds = self.mm_projector(encoder_output)
120131
post_img_embed = self.tok_embeddings(post_tokens)
121132
return torch.cat((pre_img_embed, image_embeds, post_img_embed), dim=1)
122133

@@ -261,7 +272,7 @@ class ModelArgs:
261272

262273
def __init__(
263274
self,
264-
transformer_args: Union[TransformerArgs, Dict[str, TransformerArgs]],
275+
transformer_args: Union[TransformerArgs, Dict[str, Dict[str, Any]]],
265276
model_type: ModelType = ModelType.TextOnly,
266277
) -> None:
267278
self._sanity_check(transformer_args, model_type)
@@ -275,7 +286,7 @@ def __init__(
275286

276287
def _sanity_check(
277288
self,
278-
transformer_args: Union[TransformerArgs, Dict[str, TransformerArgs]],
289+
transformer_args: Union[TransformerArgs, Dict[str, Dict[str, Any]]],
279290
model_type: ModelType,
280291
) -> None:
281292
assert isinstance(model_type, ModelType)
@@ -393,12 +404,20 @@ def build_model(self) -> nn.Module:
393404
modules = {}
394405
for name, module_class in recipe.modules.items():
395406
if isinstance(config_args := self.config.transformer_args[name], dict):
407+
config_args = self._replace_know_params(config_args)
396408
modules[name] = module_class(**config_args)
397409
else:
398410
modules[name] = module_class(config_args)
399411

400412
return recipe.fusion_class(**modules)
401413

414+
def _replace_know_params(self, params):
415+
patterns = {"QuickGELUActivation()": QuickGELUActivation(), "False": False, "True": True}
416+
for key, value in params.items():
417+
if value in patterns:
418+
params[key] = patterns[value]
419+
return params
420+
402421
@abstractmethod
403422
def forward(self, *args, **kwargs):
404423
raise NotImplementedError("forward method is not implemented")
Lines changed: 24 additions & 23 deletions
Original file line number | Diff line number | Diff line change
@@ -1,23 +1,24 @@
1-
@dataclass
2-
class VisionArgs:
3-
tile_size: int = 336
4-
patch_size: int = 14
5-
embed_dim: int = 1024
6-
num_layers: int = 24
7-
num_heads: int = 16
8-
out_indices: List[int] = field(default_factory=list)
9-
output_cls_projection: bool = False
10-
max_num_tiles: int = 1
11-
in_channels: int = 3
12-
intermediate_act: nn.Module = QuickGELUActivation()
13-
14-
def __post_init__(self):
15-
if not self.out_indices:
16-
self.out_indices = [self.num_layers - 1]
17-
18-
19-
@dataclass
20-
class ProjectorArgs:
21-
in_channels: int = 1024
22-
out_channels: int = 4096
23-
activation: nn.Module = nn.GELU()
1+
{
2+
"model_type": "llava",
3+
"encoder": {
4+
"tile_size": 336,
5+
"patch_size": 14,
6+
"embed_dim": 1024,
7+
"num_layers": 24,
8+
"num_heads": 16,
9+
"out_indices": [
10+
23
11+
],
12+
"output_cls_projection": False,
13+
"max_num_tiles": 1,
14+
"in_channels": 3,
15+
"intermediate_act": QuickGELUActivation()
16+
},
17+
"decoder": {
18+
"n_layers": 32,
19+
"n_heads": 32,
20+
"dim": 4096,
21+
"vocab_size": 32064,
22+
"max_seq_length": 768
23+
}
24+
}

0 commit comments

Comments (0)