@@ -34,7 +34,7 @@
try:
    # TODO: remove this after we figure out where in torchtune an `evaluate` module
    # is being imported, which is being confused with huggingface's `evaluate``.
-    import lm_eval  # noqa
+    import lm_eval  # noqa
except Exception:
    pass

@@ -278,6 +278,11 @@ class TransformerArgs:
    # For pipeline parallel
    n_stages: int = 1
    stage_idx: int = 0
+    # Optional biases
+    attention_bias: bool = False
+    feed_forward_bias: bool = False
+    # Whether or not to tie the input word embeddings to the output
+    tie_word_embeddings: bool = False

    def __post_init__(self):
        if self.n_local_heads == -1:
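
The new fields can also be supplied through the params-dict route used by `build_model` further down (`TransformerArgs.from_params(config_args)`). A minimal sketch, assuming the dict keys match the dataclass field names and the remaining fields keep their defaults (sizes are illustrative):

```python
params = {
    "dim": 256,
    "n_heads": 8,
    "vocab_size": 1000,
    "attention_bias": True,
    "feed_forward_bias": True,
    "tie_word_embeddings": True,
}
config = TransformerArgs.from_params(params)
assert config.attention_bias and config.feed_forward_bias and config.tie_word_embeddings
```
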
@@ -394,7 +399,7 @@ def from_name(cls, name: str):
        config = [
            config
            for config in known_model_params
-            if config in str(name).upper() or config in str(name)
+            if config.upper() in str(name).upper() or config in str(name)
        ]

        # We may have two or more configs matched (e.g., "7B" and
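
The old first clause uppercased only the right-hand side, so a known config key containing lowercase characters could never match. A standalone illustration of the before/after behavior (the key names here are made up, not the actual contents of `known_model_params`):

```python
known_model_params = ["Meta-Llama-3.1-8B", "7B"]  # hypothetical keys
name = "meta-llama-3.1-8b-instruct"

old = [c for c in known_model_params if c in name.upper() or c in name]
new = [c for c in known_model_params if c.upper() in name.upper() or c in name]

print(old)  # [] -- a mixed-case key is never a substring of the uppercased name
print(new)  # ['Meta-Llama-3.1-8B']
```
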
@@ -471,7 +476,7 @@ def build_model(self) -> nn.Module:
                modules[name] = module_class(TransformerArgs.from_params(config_args))
            else:
                modules[name] = module_class(**config_args)
-
+
        # Temporary add extra params to the DeepFusionModel.
        # TODO: Remove it once we can make fusion model configurable in model_param.
        if recipe.fusion_class == DeepFusionModel:
@@ -629,12 +634,20 @@ def __init__(self, config: TransformerArgs) -> None:
        if config.stage_idx == config.n_stages - 1:
            self.norm = RMSNorm(config.dim, eps=config.norm_eps)
            self.output = nn.Linear(config.dim, config.vocab_size, bias=False)
+            if config.tie_word_embeddings:
+                self.output.weight = self.tok_embeddings.weight
        else:
            self.norm = None
            self.output = None

        self.max_batch_size = -1
        self.max_seq_length = -1
+        self._register_load_state_dict_pre_hook(self.load_hook)
+
+    def load_hook(self, state_dict, prefix, *args):
+        """Handle tied embeddings at load time"""
+        if self.config.tie_word_embeddings:
+            state_dict.setdefault("model.output.weight", state_dict["model.tok_embeddings.weight"])

    def setup_caches(self, max_batch_size, max_seq_length, cache_lanes: int = 1):
        if (
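
A self-contained sketch of the two pieces added above (the module and key names below are simplified stand-ins, not the Transformer's own): assigning the embedding's `Parameter` to the output projection makes both modules share one tensor, and `setdefault` lets a checkpoint that ships only the embedding weight still populate the output key before `load_state_dict` runs.

```python
import torch
import torch.nn as nn

emb = nn.Embedding(1000, 64)
head = nn.Linear(64, 1000, bias=False)
head.weight = emb.weight  # tied: one Parameter referenced by both modules
assert head.weight.data_ptr() == emb.weight.data_ptr()

# A checkpoint without an output-head entry gets one pointing at the embedding.
state_dict = {"tok_embeddings.weight": torch.randn(1000, 64)}
state_dict.setdefault("output.weight", state_dict["tok_embeddings.weight"])
assert state_dict["output.weight"] is state_dict["tok_embeddings.weight"]
```
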
@@ -730,16 +743,16 @@ def __init__(self, config: TransformerArgs):

        # key, query, value projections for all heads, but in a batch
        # total_head_dim = (config.n_heads + 2 * config.n_local_heads) * config.head_dim
-        # self.wqkv = nn.Linear(config.dim, total_head_dim, bias=False)
-        self.wq = nn.Linear(config.dim, config.n_heads * config.head_dim, bias=False)
+        # self.wqkv = nn.Linear(config.dim, total_head_dim, bias=config.attention_bias)
+        self.wq = nn.Linear(config.dim, config.n_heads * config.head_dim, bias=config.attention_bias)
        self.wk = nn.Linear(
-            config.dim, config.n_local_heads * config.head_dim, bias=False
+            config.dim, config.n_local_heads * config.head_dim, bias=config.attention_bias
        )
        self.wv = nn.Linear(
-            config.dim, config.n_local_heads * config.head_dim, bias=False
+            config.dim, config.n_local_heads * config.head_dim, bias=config.attention_bias
        )

-        self.wo = nn.Linear(config.dim, config.dim, bias=False)
+        self.wo = nn.Linear(config.dim, config.dim, bias=config.attention_bias)
        self.kv_cache = None

        self.n_heads = config.n_heads
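
Quick shape check for the biased projections (illustrative sizes; unspecified `TransformerArgs` fields are assumed to keep their defaults): with grouped-query attention, `wq` projects to `n_heads * head_dim` while `wk`/`wv` project to `n_local_heads * head_dim`, and each projection now carries a bias of matching output width when `attention_bias` is set.

```python
import torch

config = TransformerArgs(dim=256, n_heads=8, n_local_heads=4, attention_bias=True)
attn = Attention(config)
x = torch.randn(2, 16, config.dim)

assert attn.wq(x).shape == (2, 16, config.n_heads * config.head_dim)
assert attn.wk(x).shape == (2, 16, config.n_local_heads * config.head_dim)
assert attn.wo.bias.shape == (config.dim,)
```
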
@@ -766,14 +779,16 @@ def load_hook(self, state_dict, prefix, *args):
        # wv = state_dict.pop(prefix + "wv.weight")
        # state_dict[prefix + "wqkv.weight"] = torch.cat([wq, wk, wv])

-        if prefix + "wqkv.weight" in state_dict:
-            wqkv = state_dict.pop(prefix + "wqkv.weight")
-            q_size = self.n_heads * self.head_dim
-            kv_size = self.n_local_heads * self.head_dim
-            wq, wk, wv = torch.split(wqkv, (q_size, kv_size, kv_size), dim=0)
-            state_dict[prefix + "wq.weight"] = wq
-            state_dict[prefix + "wk.weight"] = wk
-            state_dict[prefix + "wv.weight"] = wv
+        for tensor_suffix in ["weight", "bias"]:
+            wqkv_key = f"{prefix}wqkv.{tensor_suffix}"
+            if wqkv_key in state_dict:
+                wqkv = state_dict.pop(wqkv_key)
+                q_size = self.n_heads * self.head_dim
+                kv_size = self.n_local_heads * self.head_dim
+                wq, wk, wv = torch.split(wqkv, (q_size, kv_size, kv_size), dim=0)
+                state_dict[f"{prefix}wq.{tensor_suffix}"] = wq
+                state_dict[f"{prefix}wk.{tensor_suffix}"] = wk
+                state_dict[f"{prefix}wv.{tensor_suffix}"] = wv

        return

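
A standalone illustration of the split the hook now performs for both tensors (shapes are illustrative): `torch.split` along `dim=0` treats the 2-D fused weight and the 1-D fused bias the same way, which is why a single loop over `["weight", "bias"]` also covers checkpoints saved with `attention_bias` enabled.

```python
import torch

dim, n_heads, n_local_heads, head_dim = 256, 8, 4, 32
q_size = n_heads * head_dim         # 256
kv_size = n_local_heads * head_dim  # 128

wqkv_weight = torch.randn(q_size + 2 * kv_size, dim)
wqkv_bias = torch.randn(q_size + 2 * kv_size)

wq, wk, wv = torch.split(wqkv_weight, (q_size, kv_size, kv_size), dim=0)
bq, bk, bv = torch.split(wqkv_bias, (q_size, kv_size, kv_size), dim=0)

assert wq.shape == (q_size, dim) and wk.shape == wv.shape == (kv_size, dim)
assert bq.shape == (q_size,) and bk.shape == bv.shape == (kv_size,)
```
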
@@ -852,9 +867,9 @@ def forward(
class FeedForward(nn.Module):
    def __init__(self, config: TransformerArgs) -> None:
        super().__init__()
-        self.w1 = nn.Linear(config.dim, config.hidden_dim, bias=False)
-        self.w2 = nn.Linear(config.hidden_dim, config.dim, bias=False)
-        self.w3 = nn.Linear(config.dim, config.hidden_dim, bias=False)
+        self.w1 = nn.Linear(config.dim, config.hidden_dim, bias=config.feed_forward_bias)
+        self.w2 = nn.Linear(config.hidden_dim, config.dim, bias=config.feed_forward_bias)
+        self.w3 = nn.Linear(config.dim, config.hidden_dim, bias=config.feed_forward_bias)

    def distribute(self, device_mesh: DeviceMesh):
        parallelize_module(self.w1, device_mesh, ColwiseParallel())
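
Same pattern for the MLP (illustrative sizes; remaining fields are assumed to keep their defaults, with `hidden_dim` derived in `__post_init__` when left unset): with `feed_forward_bias` on, each of the three projections gains a bias sized to its output dimension.

```python
config = TransformerArgs(dim=256, n_heads=8, feed_forward_bias=True)
ffn = FeedForward(config)

assert ffn.w1.bias.shape == (config.hidden_dim,)
assert ffn.w3.bias.shape == (config.hidden_dim,)
assert ffn.w2.bias.shape == (config.dim,)
```
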