
Commit 0f3927a

Commit message: up

1 parent: 9924587

2 files changed (+17, -16 lines)

examples/models/llama/coreml_enumerated_shape.py

Lines changed: 5 additions & 4 deletions
@@ -25,7 +25,7 @@ def get_example_inputs(max_batch_size, args, coreml=False, use_enumerated_shapes
                 [1, 1],
                 [1, max_batch_size],
             ],
-            default=[1, max_batch_size],
+            default=[1, 1],
         )
     else:
         ct_tokens_shape = ct.Shape([1, max_batch_size])
@@ -35,8 +35,6 @@ def get_example_inputs(max_batch_size, args, coreml=False, use_enumerated_shapes
         dtype=np.int64,
     )
 
-    print("TOKENS SHAPE: ", tokens.shape)
-
     if args.use_kv_cache:
         input_pos = torch.tensor([0], dtype=torch.long)
         ct_input_pos = ct.TensorType(shape=ct.Shape([1]), dtype=np.int64)
@@ -54,10 +52,13 @@ def get_example_inputs(max_batch_size, args, coreml=False, use_enumerated_shapes
     max_batch_size = args.max_seq_length
     example_inputs = get_example_inputs(max_batch_size, args)
 
-    print("Example input shapes: ", [t.shape for t in example_inputs])
 
     traced_model = torch.jit.trace(model, example_inputs)
 
+    print("Example input shapes: ", [t.shape for t in example_inputs])
+
+    input("Press enter to continue...")
+
     states = None
     if args.use_kv_cache:
         states = [

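For context on the `default=` flip above: with coremltools enumerated shapes, the `default` entry is the shape Core ML treats as the primary one when the converted model is loaded, so this commit makes the single-token decode shape primary instead of the full prefill shape. A minimal sketch of the pattern (not part of the commit; `max_batch_size` is a stand-in value):

```python
# Hedged sketch of the enumerated-shape tokens input this file builds.
import coremltools as ct
import numpy as np

max_batch_size = 128  # stand-in for the value derived from args.max_seq_length

ct_tokens_shape = ct.EnumeratedShapes(
    shapes=[
        [1, 1],               # single-token decode step
        [1, max_batch_size],  # full-length prefill step
    ],
    default=[1, 1],  # the shape this commit now makes the default
)
tokens_type = ct.TensorType(shape=ct_tokens_shape, dtype=np.int64)
# tokens_type would then be passed to ct.convert(...) via its inputs= argument.
```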
examples/models/llama/llama_transformer.py

Lines changed: 12 additions & 12 deletions
@@ -179,9 +179,9 @@ def update(
     ) -> Tuple[torch.Tensor, torch.Tensor]:
         # input_pos: [S], k_val: [B, H, S, D] or [B, S, H, D] depending on transpose_cache
         if self.enable_dynamic_shape:
-            start_pos = input_pos[0].item()
-            torch._check_is_size(start_pos)
-            torch._check(start_pos < self.max_seq_length)
+            # start_pos = input_pos[0].item()
+            # torch._check_is_size(start_pos)
+            # torch._check(start_pos < self.max_seq_length)
             dim_to_slice = 2 if self.transpose_cache else 1
             seq_length = k_val.size(dim_to_slice)
             # Replace the entry in the cache for this token
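The checks commented out here (and in the two hunks below) are `torch.export`-style guards on a data-dependent index. `torch.jit.trace`, which this Core ML path relies on, does not consume them, and the `.item()` call they guard pulls out a Python int whose trace-time value risks being baked into the traced graph as a constant. A hedged sketch of what the original pattern is for under `torch.export` (the module is illustrative, not from this repo):

```python
# What the now-commented guards do under torch.export: they bound the
# unbacked integer produced by .item() so the exporter can prove the
# data-dependent slice below stays in range.
import torch

class NarrowCache(torch.nn.Module):
    def forward(self, cache, input_pos):
        start_pos = input_pos[0].item()          # data-dependent index
        torch._check_is_size(start_pos)          # asserts start_pos >= 0
        torch._check(start_pos < cache.size(0))  # upper bound for the slice
        return cache.narrow(0, start_pos, 1)

ep = torch.export.export(NarrowCache(), (torch.zeros(8, 4), torch.tensor([2])))
```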
@@ -252,13 +252,13 @@ def forward(
 
         k, v = self.kv_cache.update(input_pos, k, v)
         if self.enable_dynamic_shape:
-            start_pos = input_pos[-1].item()
-            torch._check_is_size(start_pos)
-            torch._check(start_pos < self.max_seq_len)
+            # start_pos = input_pos[-1].item()
+            # torch._check_is_size(start_pos)
+            # torch._check(start_pos < self.max_seq_len)
             seq_length = q.size(2)
             # pyre-ignore: Incompatible parameter type [6]
             # attn_mask = mask.narrow(0, start_pos, seq_length)
-            attn_mask = mask[start_pos : (start_pos + seq_length)]
+            attn_mask = mask[input_pos : (input_pos + seq_length)]
         else:
             attn_mask = mask[None, None, input_pos]

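The `attn_mask` change above swaps a Python-int slice for a tensor slice. In eager mode the two agree, because a one-element tensor can act as a slice bound via `Tensor.__index__`; the presumed motivation is that under `torch.jit.trace` the start position then stays tied to the `input_pos` input rather than being frozen at its trace-time value. An eager-mode sketch with illustrative shapes:

```python
# Eager-mode check that the tensor-sliced mask matches the old spelling.
import torch

max_seq_len = 8
mask = torch.tril(torch.ones(max_seq_len, max_seq_len))  # causal mask

input_pos = torch.tensor([3])  # one-element position tensor, as in forward()
seq_length = 1

attn_mask = mask[input_pos : (input_pos + seq_length)]  # tensor slice bounds
print(attn_mask.shape)  # torch.Size([1, 8])

start_pos = input_pos[-1].item()  # the old, int-based spelling
assert torch.equal(attn_mask, mask[start_pos : (start_pos + seq_length)])
```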
@@ -515,16 +515,16 @@ def forward(
 
         if self.params.enable_dynamic_shape:
             # when KV cache is used, seqlen is most likely 1. We want to slice from the start_pos.
-            input_pos_item = input_pos[-1].item()
-            torch._check_is_size(input_pos_item)
-            torch._check(input_pos_item < self.params.max_seq_len)
+            # input_pos_item = input_pos[-1].item()
+            # torch._check_is_size(input_pos_item)
+            # torch._check(input_pos_item < self.params.max_seq_len)
             # pyre-ignore: Incompatible parameter type [6]: torch.narrow does expect int or Tensor
             freqs_cos = self.freqs_cos[
-                input_pos_item : (input_pos_item + seqlen)
+                input_pos : (input_pos + seqlen)
             ] # .narrow(0, input_pos_item, seqlen)
             # pyre-ignore: Incompatible parameter type [6]
             freqs_sin = self.freqs_sin[
-                input_pos_item : (input_pos_item + seqlen)
+                input_pos : (input_pos + seqlen)
             ] # .narrow(0, input_pos_item, seqlen)
         else:
             # When not using dynamic shape, use of the .item results in

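The `freqs_cos`/`freqs_sin` hunks apply the same substitution to the precomputed rotary tables, matching the `.narrow` form referenced in the trailing comments. A sketch with stand-in table shapes:

```python
# Stand-in rotary tables; shapes here are illustrative, not the model's.
import torch

max_seq_len, half_head_dim = 32, 4
freqs_cos = torch.randn(max_seq_len, half_head_dim)
freqs_sin = torch.randn(max_seq_len, half_head_dim)

input_pos = torch.tensor([7])  # one-element position tensor
seqlen = 1

cos = freqs_cos[input_pos : (input_pos + seqlen)]  # shape: (seqlen, half_head_dim)
sin = freqs_sin[input_pos : (input_pos + seqlen)]

# Equivalent to the .narrow spelling kept in the inline comments:
assert torch.equal(cos, freqs_cos.narrow(0, int(input_pos), seqlen))
```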