This repository was archived by the owner on Sep 10, 2025. It is now read-only.

Commit a190b0f

merge with unified model construction pipeline
2 parents 43dfdc7 + a356897 commit a190b0f

File tree

9 files changed: +292 additions, -62 deletions

distributed/parallelize_llama.py

Lines changed: 1 addition & 1 deletion
@@ -62,7 +62,7 @@ def apply_tp(
     # after we apply TP to the model. Because we don't want to change model code
     # when applying TP. We need to have change to ensure KVCache has the correct
     # size as k and v.
-    model.model.config.n_local_heads = model.model.config.n_local_heads // tp_mesh.size()
+    model.text_transformer_args.n_local_heads = model.text_transformer_args.n_local_heads // tp_mesh.size()
 
     # Apply tensor parallelism to every transformer block
     for transformer_block in model.layers:
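
Note: tensor parallelism shards the attention heads across the ranks of the TP mesh, so each rank's KV cache only needs its local share of heads, which is what the changed line records. A minimal sketch of the arithmetic, with hypothetical head counts (the real values come from text_transformer_args and tp_mesh.size()):

```python
# Sketch only: how the per-rank head count is derived under tensor parallelism.
# n_heads_global and tp_degree are hypothetical stand-ins.
n_heads_global = 32                     # heads in the full checkpoint
tp_degree = 4                           # number of ranks in the TP mesh

assert n_heads_global % tp_degree == 0, "heads must divide evenly across TP ranks"
n_local_heads = n_heads_global // tp_degree   # heads held (and cached) per rank
print(n_local_heads)                    # 8 -> KV cache is sized for 8 heads per rank
```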

torchchat/cli/builder.py

Lines changed: 1 addition & 1 deletion
@@ -563,7 +563,7 @@ def _initialize_model(
         model.setup_caches(
             max_batch_size=1,
             max_seq_length=max_seq_length
-            or model.model.config.max_seq_length,
+            or model.text_transformer_args.max_seq_length,
         )
 
         model.to(dtype=builder_args.precision)
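
Note: the `or` in the changed line means an explicit max_seq_length from the caller wins, and the model's own configured limit (now read from text_transformer_args) is used only as a fallback. A tiny sketch with hypothetical values:

```python
# Hypothetical values; the real fallback reads model.text_transformer_args.max_seq_length.
requested_max_seq_length = None          # caller passed nothing explicit
model_max_seq_length = 4096              # the model's configured limit

effective = requested_max_seq_length or model_max_seq_length
print(effective)                         # 4096: the model's limit is used when unset
```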

torchchat/export.py

Lines changed: 1 addition & 1 deletion
@@ -54,7 +54,7 @@ def export_for_server(
             torch.tensor([0, 1, 2, 3, 4], dtype=torch.int, device=device),
         )
 
-        seq = Dim("seq", min=1, max=model.model.config.max_seq_length)
+        seq = Dim("seq", min=1, max=model.text_transformer_args.max_seq_length)
         # Specify that the first dimension of each input is that batch size
         dynamic_shapes = {"tokens": {1: seq}, "input_pos": {0: seq}}
     else:
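
Note: torch.export's Dim is how this path marks the sequence length as dynamic up to the model's configured maximum. A minimal, self-contained sketch of the same pattern on a toy module (the module and the 128 limit are hypothetical stand-ins, not torchchat's model):

```python
import torch
from torch.export import Dim, export


class ToyModule(torch.nn.Module):       # stand-in for the exported model
    def forward(self, tokens: torch.Tensor, input_pos: torch.Tensor) -> torch.Tensor:
        return tokens.float().cumsum(dim=1) + input_pos.float()


max_seq_length = 128                    # stands in for text_transformer_args.max_seq_length
seq = Dim("seq", min=1, max=max_seq_length)

tokens = torch.zeros(1, 5, dtype=torch.int)
input_pos = torch.tensor([0, 1, 2, 3, 4], dtype=torch.int)

# Dim 1 of `tokens` and dim 0 of `input_pos` share the same dynamic "seq" size.
exported = export(
    ToyModule(),
    (tokens, input_pos),
    dynamic_shapes={"tokens": {1: seq}, "input_pos": {0: seq}},
)
print(exported.graph_signature)
```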

torchchat/generate.py

Lines changed: 10 additions & 5 deletions
@@ -364,6 +364,8 @@ def prefill(
                 x_sliced, ip_sliced = x[:, i].view(-1, 1), input_pos[i].view(-1)
                 # logging.debug(f"<sliced> x: {x_sliced}, input_pos: {ip_sliced}")
                 logits = model(x_sliced, ip_sliced)  # (x[:, i], input_pos[i])
+        elif self.model.config.model_type == ModelType.Flamingo:
+            logits = model(x)
         else:
             # input_pos: [B, S]
             logits = model(x, input_pos)
@@ -383,11 +385,14 @@ def decode_one_token(
     ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
         # input_pos: [B, 1]
         assert input_pos.shape[-1] == 1
-        if model.config.model_type == ModelType.Flamingo and batch is not None:
-            x = x.view(1, -1)
-            logits = model(x, encoder_mask=batch["encoder_mask"][:, -1:])
+        x = x.view(1, -1)
+        if model.config.model_type == ModelType.Flamingo:
+            if batch is not None:
+                logits = model(x, encoder_mask=batch["encoder_mask"][:, -1:])
+            else:
+                logits = model(x)
         else:
-            logits = model(x.view(1, -1), input_pos)
+            logits = model(x, input_pos)
         # print(f"x: {x},\n input_pos: {input_pos}\n")
         return self.sample(logits, need_probs=need_probs, **sampling_kwargs)
 
@@ -790,7 +795,7 @@ def chat(
 
         # This is a hack to get around the fact that different models have different ways to record their max_seq_length and might be wrong
        # TODO: unify the max_seq_length config representation.
-        text_transformer_args = getattr(self.model.model, "config", None)
+        text_transformer_args = self.model.text_transformer_args
        max_seq_length = (
            text_transformer_args.max_seq_length if text_transformer_args else 2048
        )
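
Note: the decode_one_token change hoists the x.view(1, -1) reshape out of the Flamingo branch and adds a text-only path for Flamingo models when no image batch is supplied. A simplified, self-contained sketch of the resulting dispatch (ModelType and the model objects here are stand-ins, not torchchat's classes):

```python
from enum import Enum

import torch


class ModelType(Enum):                  # stand-in for torchchat.model.ModelType
    TextOnly = "text_only"
    Flamingo = "flamingo"


def decode_one_token_sketch(model, x: torch.Tensor, input_pos: torch.Tensor, batch=None):
    """Simplified mirror of the new control flow; the real method also samples."""
    x = x.view(1, -1)                                   # reshape once, for every path
    if model.config.model_type == ModelType.Flamingo:
        if batch is not None:                           # multimodal step: only the last
            return model(x, encoder_mask=batch["encoder_mask"][:, -1:])  # mask column
        return model(x)                                 # Flamingo with no image batch
    return model(x, input_pos)                          # standard text-transformer path
```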

torchchat/model.py

Lines changed: 26 additions & 9 deletions
@@ -291,6 +291,18 @@ def from_params(cls, params):
 
 @dataclass
 class ModelArgs:
+    """
+    A data class to describe the structure of a model.
+    Attributes:
+        model_type (ModelType): The type of the model. This attribute is used to categorize the model into different classes.
+        transformer_args (Dict[str, Dict[str, Any]]): A dictionary containing the parameters for each transformer in the model.
+            The outer dictionary has transformer names as keys and inner dictionaries as values. Each inner dictionary contains
+            the parameter names and their corresponding values for the respective transformer.
+        use_tiktoken (bool): A flag indicating whether to use TikToken as the tokenizer for the model.
+    Note:
+        It is recommended to use factory functions to create instances of this class instead of directly using the constructor.
+    """
+
     model_type: ModelType
     transformer_args: Dict[str, Dict[str, Any]]
     use_tiktoken: bool
@@ -326,7 +338,7 @@ def from_params(cls, params_path):
             # The model params is in the transformer_args format
             # set the model_type to TextOnly and reformat the params
             model_type = ModelType.TextOnly
-            transformer_args = {"text": {"config": loaded_params}}
+            transformer_args = {"text": loaded_params}
         else:
             model_type = ModelType(model_type_name)
             transformer_args = {
@@ -420,6 +432,7 @@ def __init__(self, config: ModelArgs) -> None:
         super().__init__()
         self.config = config
         self.model = self.build_model()
+        self.text_transformer_args = None
 
     def build_model(self) -> nn.Module:
         """
@@ -433,7 +446,10 @@ def build_model(self) -> nn.Module:
         modules = {}
         for name, module_class in recipe.modules.items():
             config_args = self.config.transformer_args[name]
-            modules[name] = module_class(**config_args)
+            if module_class == Transformer:
+                modules[name] = module_class(TransformerArgs.from_params(config_args))
+            else:
+                modules[name] = module_class(**config_args)
 
         return recipe.fusion_class(**modules)
 
@@ -486,6 +502,10 @@ def from_gguf(cls, gguf_path: str, **kwargs):
 
 
 class TextOnlyModel(Model):
+    def __init__(self, config: ModelArgs) -> None:
+        super().__init__(config)
+        self.text_transformer_args = self.model.config
+
     def forward(self, tokens: Tensor, input_pos: Optional[Tensor] = None) -> Tensor:
         return self.model(tokens, input_pos)
 
@@ -548,9 +568,8 @@ def setup_caches(self, max_batch_size, max_seq_length):
 
 
 class Transformer(nn.Module):
-    def __init__(self, config: Dict[str, Any]) -> None:
+    def __init__(self, config: TransformerArgs) -> None:
         super().__init__()
-        config = TransformerArgs.from_params(config)
         self.config = config
         layers_per_stage = config.n_layers // config.n_stages
 
@@ -930,11 +949,9 @@ def __init__(self, config, path) -> None:
             super().__init__()
             self.config = config
             self.model_ = exec_lib._load_for_executorch(str(path))
-
-            # A hacky way to get the model config from the self.model, making it consistent with Model class
-            # TODO: remove the hacky way once get rid of model.model
-            self.model = type('model', (), {'config': self.config})
 
+            self.text_transformer_args = TransformerArgs.from_params(self.config.transformer_args["text"])
+
         def forward(self, x, input_pos):
             # model_.forward expects inputs to be wrapped in a tuple
             forward_inputs = (x.to(torch.long), input_pos.to(torch.long))
@@ -948,7 +965,7 @@ def forward(self, x, input_pos):
 
         def setup_caches(self, max_batch_size, max_seq_length):
             pass
-
+
 except:
     pass
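
Note: with this change, the raw parameter dict from transformer_args is converted into a typed TransformerArgs only when the recipe module is the Transformer class; other modules still receive keyword arguments directly. A runnable sketch of that construction pattern with stand-in classes (the real recipe, fusion class, and Transformer live in torchchat/model.py):

```python
from dataclasses import dataclass, fields
from typing import Any, Dict


@dataclass
class TransformerArgs:                       # stand-in for torchchat's TransformerArgs
    n_layers: int = 2
    dim: int = 16

    @classmethod
    def from_params(cls, params: Dict[str, Any]) -> "TransformerArgs":
        known = {f.name for f in fields(cls)}
        return cls(**{k: v for k, v in params.items() if k in known})


class Transformer:                           # stand-in; the real one is an nn.Module
    def __init__(self, config: TransformerArgs) -> None:
        self.config = config


def build_modules(transformer_args: Dict[str, Dict[str, Any]], recipe_modules: Dict[str, type]):
    modules = {}
    for name, module_class in recipe_modules.items():
        config_args = transformer_args[name]
        if module_class is Transformer:
            # Transformer now expects typed args built from the raw param dict.
            modules[name] = module_class(TransformerArgs.from_params(config_args))
        else:
            modules[name] = module_class(**config_args)
    return modules


text_module = build_modules({"text": {"n_layers": 4, "dim": 32}}, {"text": Transformer})["text"]
print(text_module.config)                    # TransformerArgs(n_layers=4, dim=32)
```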

torchchat/usages/browser.py

Lines changed: 106 additions & 17 deletions
@@ -1,39 +1,123 @@
+import base64
+import logging
 import time
+from pathlib import Path
+
 import streamlit as st
+
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.DEBUG)
+
 from openai import OpenAI
 
 st.title("torchchat")
 
 start_state = [
     {
         "role": "system",
-        "content": "You're an assistant. Answer questions directly, be brief, and have fun.",
+        "content": "You're a helpful assistant - have fun.",
     },
     {"role": "assistant", "content": "How can I help you?"},
 ]
 
+st.session_state.uploader_key = 0
+
+
+def reset_per_message_state():
+    # Catch all function for anything that should be reset between each message.
+    _update_uploader_key()
+
+
+def _update_uploader_key():
+    # Increment the uploader key to reset the file uploader after each message.
+    st.session_state.uploader_key = int(time.time())
+
+
 with st.sidebar:
+    # API Configuration
+    api_base_url = st.text_input(
+        label="API Base URL",
+        value="http://127.0.0.1:5000/v1",
+        help="The base URL for the OpenAI API to connect to",
+    )
+
+    st.divider()
+    temperature = st.slider(
+        "Temperature", min_value=0.0, max_value=1.0, value=1.0, step=0.01
+    )
+
     response_max_tokens = st.slider(
         "Max Response Tokens", min_value=10, max_value=1000, value=250, step=10
     )
     if st.button("Reset Chat", type="primary"):
         st.session_state["messages"] = start_state
 
+    image_prompts = st.file_uploader(
+        "Image Prompts",
+        type=["jpeg"],
+        accept_multiple_files=True,
+        key=st.session_state.uploader_key,
+    )
+
+    for image in image_prompts:
+        st.image(image)
+
+
+client = OpenAI(
+    base_url=api_base_url,
+    api_key="813",  # The OpenAI API requires an API key, but since we don't consume it, this can be any non-empty string.
+)
+
 if "messages" not in st.session_state:
     st.session_state["messages"] = start_state
 
 
 for msg in st.session_state.messages:
-    st.chat_message(msg["role"]).write(msg["content"])
+    with st.chat_message(msg["role"]):
+        if type(msg["content"]) is list:
+            for content in msg["content"]:
+                if content["type"] == "image_url":
+                    extension = (
+                        content["image_url"].split(";base64")[0].split("image/")[1]
+                    )
+                    base64_repr = content["image_url"].split("base64,")[1]
+                    st.image(base64.b64decode(base64_repr))
+                else:
+                    st.write(content["text"])
+        elif type(msg["content"]) is dict:
+            if msg["content"]["type"] == "image_url":
+                st.image(msg["content"]["image_url"])
+            else:
+                st.write(msg["content"]["text"])
+        elif type(msg["content"]) is str:
+            st.write(msg["content"])
+        else:
+            st.write(f"Unhandled content type: {type(msg['content'])}")
 
 
 if prompt := st.chat_input():
-    client = OpenAI(
-        base_url="http://127.0.0.1:5000/v1",
-        api_key="813",  # The OpenAI API requires an API key, but since we don't consume it, this can be any non-empty string.
-    )
+    user_message = {"role": "user", "content": [{"type": "text", "text": prompt}]}
+
+    if image_prompts:
+        for image_prompt in image_prompts:
+            extension = Path(image_prompt.name).suffix.strip(".")
+            image_bytes = image_prompt.getvalue()
+            base64_encoded = base64.b64encode(image_bytes).decode("utf-8")
+            user_message["content"].append(
+                {
+                    "type": "image_url",
+                    "image_url": f"data:image/{extension};base64,{base64_encoded}",
+                }
+            )
+    st.session_state.messages.append(user_message)
+
+    with st.chat_message("user"):
+        st.write(prompt)
+        for img in image_prompts:
+            st.image(img)
 
-    st.session_state.messages.append({"role": "user", "content": prompt})
-    st.chat_message("user").write(prompt)
+    image_prompts = None
+    reset_per_message_state()
 
     with st.chat_message("assistant"), st.status(
        "Generating... ", expanded=True
@@ -53,15 +137,20 @@ def get_streamed_completion(completion_generator):
             state="complete",
         )
 
-        response = st.write_stream(
-            get_streamed_completion(
-                client.chat.completions.create(
-                    model="llama3",
-                    messages=st.session_state.messages,
-                    max_tokens=response_max_tokens,
-                    stream=True,
+        try:
+            response = st.write_stream(
+                get_streamed_completion(
+                    client.chat.completions.create(
+                        model="llama3",
+                        messages=st.session_state.messages,
+                        max_tokens=response_max_tokens,
+                        temperature=temperature,
+                        stream=True,
+                    )
                 )
-            )
-        )[0]
+            )[0]
+        except Exception as e:
+            response = st.error(f"Error: {e}")
+            print(e)
 
     st.session_state.messages.append({"role": "assistant", "content": response})
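
Note: the new upload path encodes each image as a base64 data URL and appends it to the user message's content list, and the rendering branch above splits that URL back apart on ";base64". A small, self-contained sketch of that round trip (the bytes and filename are hypothetical; the app gets them from st.file_uploader):

```python
import base64
from pathlib import Path


def image_to_content_entry(image_bytes: bytes, filename: str) -> dict:
    """Build the image_url entry the chat loop appends for each upload (sketch)."""
    extension = Path(filename).suffix.strip(".")            # e.g. "jpeg"
    encoded = base64.b64encode(image_bytes).decode("utf-8")
    return {
        "type": "image_url",
        "image_url": f"data:image/{extension};base64,{encoded}",
    }


# Fake bytes stand in for st.file_uploader's getvalue(); any real JPEG works the same way.
user_message = {"role": "user", "content": [{"type": "text", "text": "Describe this image."}]}
user_message["content"].append(image_to_content_entry(b"\xff\xd8\xff\xe0fake-jpeg", "photo.jpeg"))

# The rendering loop recovers the bytes by splitting the data URL apart:
entry = user_message["content"][-1]["image_url"]
extension = entry.split(";base64")[0].split("image/")[1]    # -> "jpeg"
raw_bytes = base64.b64decode(entry.split("base64,")[1])     # -> the original bytes
```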

torchchat/usages/eval.py

Lines changed: 1 addition & 1 deletion
@@ -59,7 +59,7 @@ def setup_cache_padded_seq_input_pos_max_seq_length_for_prefill(
     T = prompt.size(0)
     T_new = T + max_new_tokens
     if max_seq_length is None:
-        max_seq_length = min(T_new, model.model.config.block_size)
+        max_seq_length = min(T_new, model.text_transformer_args.block_size)
 
     device, dtype = prompt.device, prompt.dtype
     # create an empty tensor of the expected final shape and
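
Note: the changed line caps the padded sequence at the model's block size, now read from text_transformer_args. A tiny sketch of that sizing rule with hypothetical numbers:

```python
# Hypothetical values; the real ones come from the prompt tensor and
# model.text_transformer_args.block_size.
T = 12                                   # prompt length
max_new_tokens = 50
block_size = 2048                        # model's maximum context length

T_new = T + max_new_tokens               # tokens needed if generation runs to the end
max_seq_length = min(T_new, block_size)  # never allocate past the model's block size
print(max_seq_length)                    # 62
```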

0 commit comments
