
Commit e7f25a3

init
1 parent a5eeef0 commit e7f25a3

3 files changed: +33 -2 lines changed

3 files changed

+33
-2
lines changed

examples/models/llama/export_llama_lib.py

Lines changed: 12 additions & 0 deletions
@@ -229,6 +229,18 @@ def build_args_parser() -> argparse.ArgumentParser:
         action="store_true",
         help="Whether or not to export a model using kv cache",
     )
+    parser.add_argument(
+        "--prefill_return_kv",
+        default=False,
+        action="store_true",
+        help="Whether or not to return kv values from the prefill model",
+    )
+    parser.add_argument(
+        "--prefill_seq_length",
+        type=int,
+        default=3,
+        help="Sequence length for the prefill model",
+    )
     parser.add_argument(
         "--quantize_kv_cache",
         default=False,
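
A minimal sketch of how the two new options parse (not part of the commit): the parser below is a standalone mirror of just the flags added in this hunk, since only build_args_parser and the flag names come from the diff.

# Standalone mirror of the two new prefill flags (illustrative only; the real
# definitions live in build_args_parser above).
import argparse

parser = argparse.ArgumentParser()
parser.add_argument(
    "--prefill_return_kv",
    default=False,
    action="store_true",
    help="Whether or not to return kv values from the prefill model",
)
parser.add_argument(
    "--prefill_seq_length",
    type=int,
    default=3,
    help="Sequence length for the prefill model",
)

args = parser.parse_args(["--prefill_return_kv", "--prefill_seq_length", "128"])
print(args.prefill_return_kv)   # True -> prefill model should also return kv values
print(args.prefill_seq_length)  # 128  -> token length used for the prefill example input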

examples/models/llama/llama_transformer.py

Lines changed: 20 additions & 1 deletion
@@ -23,6 +23,25 @@
 
 from torch import nn
 
+@torch.library.custom_op("coreml::sdpa", mutates_args=())
+def sdpa(
+    q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, attn_mask: torch.Tensor
+) -> torch.Tensor:
+    """Same as F.scaled_dot_product_attention, but with custom op to avoid lowering during dialect conversion."""
+    return torch.ops.aten.scaled_dot_product_attention.default(
+        q, k, v, attn_mask=attn_mask
+    )
+
+
+@torch.library.register_fake("coreml::sdpa")
+def _(
+    q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, attn_mask: torch.Tensor
+) -> torch.Tensor:
+    """Fake implementation with the right output shape, which is required for torch.compile/export/fx tracing."""
+    expected_shape = list(q.shape)
+    expected_shape[-1] = v.shape[-1]
+    return q.new_empty(expected_shape)
+
 
 class RMSNorm(torch.nn.Module):
     def __init__(self, dim: int, eps: float = 1e-6):
@@ -431,7 +450,7 @@ def forward(
 
         mask = self.mask[:seqlen, :seqlen]
 
-        output = F.scaled_dot_product_attention(q, k, v, attn_mask=mask, dropout_p=0.0)
+        output = torch.ops.coreml.sdpa(q, k, v, mask)
 
         output = output.transpose(1, 2).contiguous().view(bsz, seqlen, -1)
 
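
A standalone sanity-check sketch for the custom op (not part of the commit). It repeats the registration from the hunk above so the snippet runs on its own, and assumes PyTorch >= 2.4 for torch.library.custom_op / register_fake; the shapes are arbitrary.

import torch
import torch.nn.functional as F


# Same registration as in llama_transformer.py, repeated here so the sketch is
# self-contained (skip this part if that module is already imported).
@torch.library.custom_op("coreml::sdpa", mutates_args=())
def sdpa(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, attn_mask: torch.Tensor) -> torch.Tensor:
    return torch.ops.aten.scaled_dot_product_attention.default(q, k, v, attn_mask=attn_mask)


@torch.library.register_fake("coreml::sdpa")
def _(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, attn_mask: torch.Tensor) -> torch.Tensor:
    expected_shape = list(q.shape)
    expected_shape[-1] = v.shape[-1]
    return q.new_empty(expected_shape)


q, k, v = (torch.randn(1, 8, 16, 64) for _ in range(3))  # (batch, heads, seq, head_dim)
mask = torch.zeros(16, 16)                               # additive attention mask

# Eager mode: the custom op should match the stock SDPA it wraps.
torch.testing.assert_close(
    torch.ops.coreml.sdpa(q, k, v, mask),
    F.scaled_dot_product_attention(q, k, v, attn_mask=mask),
)


# Export: the fake kernel supplies output shapes, so coreml.sdpa stays in the
# traced graph as a single node instead of being decomposed.
class Attn(torch.nn.Module):
    def forward(self, q, k, v, mask):
        return torch.ops.coreml.sdpa(q, k, v, mask)


ep = torch.export.export(Attn(), (q, k, v, mask))
print(ep.graph)  # expect a call to coreml.sdpa in the printed graph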

examples/models/llama/model.py

Lines changed: 1 addition & 1 deletion
@@ -273,7 +273,7 @@ def get_example_inputs(self):
         else:
             return (
                 torch.tensor(
-                    [[1, 2, 3]], dtype=torch.long
+                    [[0 for _ in range(self.args.get("prefill_seq_length", 3))]], dtype=torch.long
                 ), # tokens, with kv cache our input token length is always just 1 token.
             )
 
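
A small sketch of what the changed example input looks like (not part of the commit); the helper name is hypothetical and a plain dict stands in for self.args.

import torch


# Hypothetical helper mirroring the changed branch of get_example_inputs.
def example_prefill_tokens(args: dict) -> torch.Tensor:
    seq_len = args.get("prefill_seq_length", 3)  # falls back to the previous 3-token input
    return torch.tensor([[0] * seq_len], dtype=torch.long)


print(example_prefill_tokens({}).shape)                           # torch.Size([1, 3])
print(example_prefill_tokens({"prefill_seq_length": 128}).shape)  # torch.Size([1, 128])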
