Commit 9924587

remove torch.narrow
1 parent: a27f9a0

File tree

2 files changed: +16, -17 lines


examples/models/llama/coreml_enumerated_shape.py

Lines changed: 4 additions & 10 deletions
@@ -8,7 +8,7 @@
 from numpy import dtype

 parser = build_args_parser()
-parser.add_argument('--use_enumerated_shapes', action='store_true')
+parser.add_argument("--use_enumerated_shapes", action="store_true")
 args = parser.parse_args()

 model_manager = _prepare_for_llama_export("llama2", args)
@@ -35,9 +35,9 @@ def get_example_inputs(max_batch_size, args, coreml=False, use_enumerated_shapes
         dtype=np.int64,
     )

+    print("TOKENS SHAPE: ", tokens.shape)
+
     if args.use_kv_cache:
-        # NOTE: torch.jit.trace does not work if tensor has size 1, but ct.convert does not work if not 512, so for KV cache with batch input, size should be 1
-        # input_pos = torch.tensor([0 for _ in range(max_batch_size)], dtype=torch.long)
         input_pos = torch.tensor([0], dtype=torch.long)
         ct_input_pos = ct.TensorType(shape=ct.Shape([1]), dtype=np.int64)

@@ -51,13 +51,7 @@ def get_example_inputs(max_batch_size, args, coreml=False, use_enumerated_shapes


 # Batch with kv cache runs into issues
-# Either we need input_pos to be size batch_size to export with jit.trace or we need it to be size 1 to export with ct.convert
-# Might try refactoring the model so that jit.trace works when it is size 1 (interested as starting position)
-if args.use_kv_cache:
-    max_batch_size = 1
-else:
-    max_batch_size = 128
-
+max_batch_size = args.max_seq_length
 example_inputs = get_example_inputs(max_batch_size, args)

 print("Example input shapes: ", [t.shape for t in example_inputs])

examples/models/llama/llama_transformer.py

Lines changed: 12 additions & 7 deletions
@@ -196,11 +196,11 @@ def update(
         # narrowed_v = self.v_cache.narrow(dim_to_slice, start_pos, seq_length)

         if self.transpose_cache:
-            narrowed_k = self.k_cache[:, :, input_pos:(input_pos+seq_length), :]
-            narrowed_v = self.v_cache[:, :, input_pos:(input_pos+seq_length), :]
+            narrowed_k = self.k_cache[:, :, input_pos : (input_pos + seq_length), :]
+            narrowed_v = self.v_cache[:, :, input_pos : (input_pos + seq_length), :]
         else:
-            narrowed_k = self.k_cache[:, input_pos:(input_pos+seq_length), :, :]
-            narrowed_v = self.v_cache[:, input_pos:(input_pos+seq_length), :, :]
+            narrowed_k = self.k_cache[:, input_pos : (input_pos + seq_length), :, :]
+            narrowed_v = self.v_cache[:, input_pos : (input_pos + seq_length), :, :]

         narrowed_k.copy_(k_val)
         narrowed_v.copy_(v_val)
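Since the whole commit is about swapping torch.narrow for plain slicing, a small standalone check (not from the commit; cache sizes invented for illustration) that the two select the same view of the cache, and that copy_ still writes through that view into the underlying buffer:

import torch

# Toy cache shaped (batch, heads, max_seq_len, head_dim); sizes are made up.
k_cache = torch.zeros(1, 8, 16, 4)
k_val = torch.randn(1, 8, 3, 4)  # new keys for 3 positions
input_pos, seq_length = 5, 3

narrowed = k_cache.narrow(2, input_pos, seq_length)
sliced = k_cache[:, :, input_pos : (input_pos + seq_length), :]
assert torch.equal(narrowed, sliced)  # both pick out the same region

# Both are views, so an in-place copy lands in k_cache itself.
sliced.copy_(k_val)
assert torch.equal(k_cache[:, :, 5:8, :], k_val)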
@@ -257,7 +257,8 @@ def forward(
             torch._check(start_pos < self.max_seq_len)
             seq_length = q.size(2)
             # pyre-ignore: Incompatible parameter type [6]
-            attn_mask = mask.narrow(0, start_pos, seq_length)
+            # attn_mask = mask.narrow(0, start_pos, seq_length)
+            attn_mask = mask[start_pos : (start_pos + seq_length)]
         else:
             attn_mask = mask[None, None, input_pos]
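The attn_mask slice keeps the same (seq_length, max_seq_len) shape that mask.narrow produced, which broadcasts as an attention mask in scaled_dot_product_attention. A toy check (the mask construction and all sizes here are illustrative, not necessarily how this file builds its mask):

import torch
import torch.nn.functional as F

bsz, n_heads, seq_length, max_seq_len, head_dim = 1, 4, 2, 8, 16
start_pos = 3

# A standard causal mask: -inf above the diagonal, 0 elsewhere.
mask = torch.triu(torch.full((max_seq_len, max_seq_len), float("-inf")), diagonal=1)
attn_mask = mask[start_pos : (start_pos + seq_length)]  # (seq_length, max_seq_len)
assert torch.equal(attn_mask, mask.narrow(0, start_pos, seq_length))

q = torch.randn(bsz, n_heads, seq_length, head_dim)
k = torch.randn(bsz, n_heads, max_seq_len, head_dim)
v = torch.randn(bsz, n_heads, max_seq_len, head_dim)
out = F.scaled_dot_product_attention(q, k, v, attn_mask=attn_mask)
assert out.shape == (bsz, n_heads, seq_length, head_dim)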

@@ -518,9 +519,13 @@ def forward(
             torch._check_is_size(input_pos_item)
             torch._check(input_pos_item < self.params.max_seq_len)
             # pyre-ignore: Incompatible parameter type [6]: torch.narrow does expect int or Tensor
-            freqs_cos = self.freqs_cos[input_pos_item:(input_pos_item + seqlen)]  #.narrow(0, input_pos_item, seqlen)
+            freqs_cos = self.freqs_cos[
+                input_pos_item : (input_pos_item + seqlen)
+            ]  # .narrow(0, input_pos_item, seqlen)
             # pyre-ignore: Incompatible parameter type [6]
-            freqs_sin = self.freqs_sin[input_pos_item:(input_pos_item + seqlen)]  #.narrow(0, input_pos_item, seqlen)
+            freqs_sin = self.freqs_sin[
+                input_pos_item : (input_pos_item + seqlen)
+            ]  # .narrow(0, input_pos_item, seqlen)
         else:
             # When not using dynamic shape, use of the .item results in
             # symints, due to querying the data from tensor.
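The freqs_cos / freqs_sin lines slice with a start index obtained from input_pos[-1].item(), and the torch._check_is_size / torch._check calls just above them are what let the exporter treat that data-dependent index as a valid, bounded size. A standalone sketch of the same pattern (the table size and values are made up):

import torch

max_seq_len, half_head_dim = 16, 4
freqs_cos = torch.randn(max_seq_len, half_head_dim)  # stand-in for the RoPE table


def slice_freqs(freqs, input_pos, seqlen):
    # Data-dependent start index, as in the model's forward.
    input_pos_item = input_pos[-1].item()
    # Under torch.export these checks bound the unbacked index from .item();
    # in eager mode they act as plain runtime assertions.
    torch._check_is_size(input_pos_item)
    torch._check(input_pos_item < freqs.size(0))
    return freqs[input_pos_item : (input_pos_item + seqlen)]


out = slice_freqs(freqs_cos, torch.tensor([5]), 3)
assert out.shape == (3, half_head_dim)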
