2 changes: 1 addition & 1 deletion examples/mediatek/aot_utils/llm_utils/sanity_checks.py
@@ -204,7 +204,7 @@ def check_weights_exist(weight_dir):
f"No weight files found in {weight_dir}! Weight files should be either .bin or .safetensors file types."
)
safetensors_l = [f for f in os.listdir(weight_dir) if f.endswith(".safetensors")]
- bin_l = [f for f in os.listdir(weight_dir) if f.endswith(".bin")]
+ bin_l = [f for f in os.listdir(weight_dir) if f.endswith(".bin") and "embedding" not in f]
if len(safetensors_l) & len(bin_l):
raise RuntimeError(
"Weights should only be in either .bin or .safetensors format, not both."
5 changes: 4 additions & 1 deletion examples/mediatek/model_export_scripts/llama.py
@@ -419,6 +419,9 @@ def main():
print(f"Max Num Token: {max_num_token}")
print(f"Max Cache Size: {max_cache_size}")

+ if args.dataset is not None:
+     embedding_layer = get_embedding_layer(config, weight_dir, state_dict)
+
# Instantiate model chunks
print("Instantiating submodels")
models = []
@@ -437,7 +440,6 @@ def main():
cal_dataset = None
if args.dataset is not None:
cal_dataset = load_dataset("text", data_files=args.dataset, split="train")
- embedding_layer = get_embedding_layer(config, weight_dir, state_dict)
Contributor:
Any specific reason to remove this line?

Collaborator (Author):
Oh, I didn't remove it; I shifted it up to line 422. The reason is that during chunk.load_weights (line 437) the weights are popped from the original state_dict, so in the tie_word_embeddings=True case the embedding weights would no longer be present in the state_dict by the time we fetch the embedding layer. Hence I moved it up.
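As a toy illustration of the ordering hazard described above (the dict contents and helper below are invented; only the pop-then-lookup behaviour of chunk.load_weights is taken from this comment):

```python
# Invented stand-ins for the real state_dict and chunk loader.
state_dict = {
    "model.embed_tokens.weight": "embedding tensor",
    "model.layers.0.self_attn.q_proj.weight": "attention tensor",
}

def load_chunk_weights(sd):
    # stand-in for chunk.load_weights(), which pops weights as it loads them
    for key in list(sd):
        sd.pop(key)

# Reading the tied embedding weight first (the move to line 422) still works:
embedding_weight = state_dict["model.embed_tokens.weight"]

load_chunk_weights(state_dict)  # line 437: the state_dict is drained here

# After loading, the same lookup would raise KeyError, which is why
# get_embedding_layer is now called before the chunks load their weights.
assert "model.embed_tokens.weight" not in state_dict
```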

master_rot_emb = get_master_rot_emb(config, dtype=torch.float32)
if args.preformatter is not None:
cal_dataset = cal_dataset.map(
@@ -460,6 +462,7 @@ def main():
"eos_token_id_tensor": torch.tensor(tokenizer.eos_token_id),
"response_cap": args.response_cap,
},
+ keep_in_memory=True
Contributor:
Mind sharing what keep_in_memory is?

Collaborator (Author):
Hi @cccclai, keep_in_memory stores the dataset in RAM instead of caching it to disk. It was a temporary workaround for an OSError I encountered on my end. I have since resolved the issue and will remove this argument in the next commit.
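For reference, a minimal sketch of the flag on datasets.Dataset.map (the data file name and transform are placeholders; only keep_in_memory itself comes from the diff):

```python
from datasets import load_dataset

# Placeholder calibration file; the real script loads args.dataset.
ds = load_dataset("text", data_files="calibration.txt", split="train")

def add_length(example):
    # toy transform: record the number of characters per line
    return {"num_chars": len(example["text"])}

# keep_in_memory=True keeps the mapped dataset in RAM instead of writing an
# Arrow cache file to disk, which avoids cache-directory I/O errors.
ds_mapped = ds.map(add_length, keep_in_memory=True)
```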

)

for chunk_idx, chunk in enumerate(models):
10 changes: 5 additions & 5 deletions examples/mediatek/models/llm_models/modeling_common.py
@@ -675,8 +675,8 @@ def load_weights(self, state_dict, state_dict_start_idx):
)
else:
if self.config.tie_word_embeddings:
- lm_head_weight_key = "embed_tokens.weight"
- lm_head_bias_key = "embed_tokens.bias"
+ lm_head_weight_key = f"{prefix}embed_tokens.weight"
+ lm_head_bias_key = f"{prefix}embed_tokens.bias"
else:
lm_head_weight_key = "lm_head.weight"
lm_head_bias_key = "lm_head.bias"
@@ -757,9 +757,9 @@ def get_example_inputs(
nt = Dim("num_token", max=num_token)
cache_dims = tuple(({} for _ in range(2 * self.num_blocks)))
dynamic_shapes = (
Contributor:
I think I may have asked this in the previous PR - what does dynamic_shapes do? It would probably be good to add a comment explaining it.

Collaborator (Author):
Hi @cccclai, dynamic_shapes is passed to torch.export.export_for_training to indicate that the shapes of some inputs may differ during calibration. After calibration, the dynamic_shapes argument is no longer needed.
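A minimal sketch of the pattern, assuming a toy module rather than this repo's model chunks (only Dim, Dim.STATIC, and the export_for_training call mirror the code in this diff):

```python
import torch
from torch.export import Dim, export_for_training

class TinyBlock(torch.nn.Module):
    def forward(self, hidden):  # hidden: [batch, num_token, hidden_dim]
        return hidden * 2.0

nt = Dim("num_token", max=128)               # this axis may vary across calibration batches
example_inputs = (torch.randn(1, 16, 32),)
dynamic_shapes = ({0: Dim.STATIC, 1: nt, 2: Dim.STATIC},)  # the other axes stay fixed

# The exported program accepts any num_token up to 128 during calibration.
exported = export_for_training(TinyBlock(), example_inputs, dynamic_shapes=dynamic_shapes)
```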

- {0: None, 1: nt, 2: None},
- {0: None, 1: None, 2: nt, 3: nt + cache_size},
- {0: None, 1: None, 2: nt, 3: None},
+ {0: Dim.STATIC, 1: nt, 2: Dim.STATIC},
+ {0: Dim.STATIC, 1: Dim.STATIC, 2: nt, 3: nt + cache_size},
+ {0: Dim.STATIC, 1: Dim.STATIC, 2: nt, 3: Dim.STATIC},
cache_dims,
)
return example_inputs, dynamic_shapes
@@ -0,0 +1,23 @@
{
"architectures": [
"LlamaForCausalLM"
],
"bos_token_id": 128000,
"eos_token_id": 128001,
"head_dim": 64,
"hidden_size": 2048,
"initializer_range": 0.02,
"intermediate_size": 8192,
"max_position_embeddings": 131072,
"model_type": "llama",
"num_attention_heads": 32,
"num_hidden_layers": 16,
"num_key_value_heads": 8,
"rms_norm_eps": 1e-05,
"rope_theta": 500000.0,
"tie_word_embeddings": true,
"torch_dtype": "bfloat16",
"transformers_version": "4.45.0.dev0",
"vocab_size": 128256,
"tokenizer": "pretrained_fast"
}
@@ -0,0 +1,16 @@
{
"bos_token": {
"content": "<|begin_of_text|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false
},
"eos_token": {
"content": "<|eot_id|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false
}
}