
Commit 6417869

init

1 parent 92ee522

File tree

4 files changed (+42, -16 lines)


examples/models/llama/llama_transformer.py

Lines changed: 19 additions & 1 deletion
@@ -23,6 +23,23 @@
 
 from torch import nn
 
+@torch.library.custom_op("coreml::sdpa", mutates_args=())
+def sdpa(
+    q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, attn_mask: torch.Tensor
+) -> torch.Tensor:
+    """Same as F.scaled_dot_product_attention, but with custom op to avoid lowering during dialect conversion."""
+    return torch.ops.aten.scaled_dot_product_attention.default(
+        q, k, v, attn_mask=attn_mask
+    )
+
+@torch.library.register_fake("coreml::sdpa")
+def _(
+    q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, attn_mask: torch.Tensor
+) -> torch.Tensor:
+    """Fake implementation with the right output shape, which is required for torch.compile/export/fx tracing."""
+    expected_shape = list(q.shape)
+    expected_shape[-1] = v.shape[-1]
+    return q.new_empty(expected_shape)
 
 class RMSNorm(torch.nn.Module):
     def __init__(self, dim: int, eps: float = 1e-6):
@@ -351,7 +368,8 @@ def forward(
 
         mask = self.mask[:seqlen, :seqlen]
 
-        output = F.scaled_dot_product_attention(q, k, v, attn_mask=mask, dropout_p=0.0)
+        # output = F.scaled_dot_product_attention(q, k, v, attn_mask=mask, dropout_p=0.0)
+        output = torch.ops.coreml.sdpa(q, k, v, mask)
 
         output = output.transpose(1, 2).contiguous().view(bsz, seqlen, -1)
 
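A quick, hedged sanity check of the new op (not part of the commit): it assumes PyTorch 2.4+ for torch.library.custom_op/register_fake, and that importing llama_transformer performs the registration above; the import path below is an assumption and may need adjusting to your checkout.

import torch
import torch.nn.functional as F

# Importing the module runs the @torch.library.custom_op registration shown above.
# Module path is assumed; adjust to match your ExecuTorch install/checkout.
from executorch.examples.models.llama import llama_transformer  # noqa: F401

q = torch.randn(1, 8, 16, 64)  # (batch, heads, seq_len, head_dim)
k = torch.randn(1, 8, 16, 64)
v = torch.randn(1, 8, 16, 64)
mask = torch.zeros(16, 16)     # additive attention mask; all zeros = no masking

# The custom op should be numerically identical to the eager SDPA it wraps.
torch.testing.assert_close(
    torch.ops.coreml.sdpa(q, k, v, mask),
    F.scaled_dot_product_attention(q, k, v, attn_mask=mask),
)

# Because a fake kernel is registered, the op also survives torch.export
# as a single coreml.sdpa node instead of being decomposed.
class Wrapper(torch.nn.Module):
    def forward(self, q, k, v, mask):
        return torch.ops.coreml.sdpa(q, k, v, mask)

ep = torch.export.export(Wrapper(), (q, k, v, mask))
print(ep.graph)  # expect a call to coreml.sdpa in the exported graph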
examples/models/llama/model.py

Lines changed: 1 addition & 3 deletions
@@ -244,9 +244,7 @@ def get_example_inputs(self):
             return self.get_example_inputs_kvcache_sdpa()
         else:
             return (
-                torch.tensor(
-                    [[1, 2, 3]], dtype=torch.long
-                ),  # tokens, with kv cache our input token length is always just 1 token.
+                torch.ones(size=(1, self.max_seq_len), dtype=torch.long),
             )
 
     # assumption is the custom op doesn't support dynamic shape right now. It might, but it's untested, so let's first get static shape working
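For context, the replacement example input is just a full-length dummy token tensor, matching the static-shape assumption stated in the comment above. A minimal sketch (max_seq_len is an illustrative stand-in for the model's configured value):

import torch

max_seq_len = 128  # illustrative; the real value comes from the model configuration

# Static-shape example input: a (1, max_seq_len) tensor of token ids,
# replacing the previous 3-token example.
example_inputs = (torch.ones(size=(1, max_seq_len), dtype=torch.long),)
print(example_inputs[0].shape)  # torch.Size([1, 128])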

extension/llm/export/builder.py

Lines changed: 10 additions & 11 deletions
@@ -156,18 +156,17 @@ def source_transform(
     def _get_dynamic_shape(self) -> Any:
         if self.dynamic_shapes:
             return self.dynamic_shapes
-
-        dim = torch.export.Dim("token_dim", max=self.max_seq_len - 1)
-
-        if not self.use_kv_cache:
-            # Only one input argument: tokens
-            self.dynamic_shapes = ({1: dim},)
-        elif self.enable_dynamic_shape:
-            # Two input arguments: tokens and input_pos but input_pos is static shape
-            self.dynamic_shapes = ({1: dim}, {0: 1})
-        else:
-            # Two input arguments: tokens and input_pos but both are of static shape
+
+        if not self.enable_dynamic_shape:
             self.dynamic_shapes = None
+        else:
+            dim = torch.export.Dim("token_dim", max=self.max_seq_len - 1)
+            if not self.use_kv_cache:
+                # Only one input argument: tokens
+                self.dynamic_shapes = ({1: dim},)
+            else:
+                # Two input arguments: tokens and input_pos but input_pos is static shape
+                self.dynamic_shapes = ({1: dim}, {0: 1})
         return self.dynamic_shapes
 
     def _get_edge_config(self) -> EdgeCompileConfig:
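To illustrate what those dynamic_shapes tuples mean to torch.export, here is a hedged, standalone sketch of the no-kv-cache branch; TokenModel and the sizes are made up for illustration. When enable_dynamic_shape is off, the method now returns None, i.e. a fully static export.

import torch

class TokenModel(torch.nn.Module):
    def forward(self, tokens: torch.Tensor) -> torch.Tensor:
        return tokens * 2

max_seq_len = 128  # illustrative
dim = torch.export.Dim("token_dim", max=max_seq_len - 1)

# One input argument (tokens) with dimension 1 (sequence length) dynamic,
# mirroring self.dynamic_shapes = ({1: dim},) above.
dynamic_shapes = ({1: dim},)

example_inputs = (torch.ones(1, 16, dtype=torch.long),)
ep = torch.export.export(TokenModel(), example_inputs, dynamic_shapes=dynamic_shapes)
print(ep)  # the token dimension should appear as a symbol bounded by max_seq_len - 1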

extension/llm/export/partitioner_lib.py

Lines changed: 12 additions & 1 deletion
@@ -128,11 +128,19 @@ def _validate_ios_version() -> None:
             "block_size": 32,
             "weight_threshold": 512,
         }
+
+    assert ios == 18
+    print("OVERRIDING CONFIG TO BE 4B PER_CHANNEL")
+    op_linear_quantizer_config = {
+        "mode": "linear_symmetric",
+        "dtype": "int4",
+        "granularity": "per_channel",
+    }
     compile_specs = CoreMLBackend.generate_compile_specs(  # pyre-fixme[16]
         minimum_deployment_target=minimum_deployment_target,
         compute_precision=ct.precision(ct.precision.FLOAT16.value),
         # using `ComputeUnit.ALL` can increase the model load time, default to `ComputeUnit.CPU_AND_GPU`
-        compute_unit=ct.ComputeUnit[ct.ComputeUnit.CPU_AND_GPU.name.upper()],
+        compute_unit=ct.ComputeUnit[ct.ComputeUnit.CPU_AND_NE.name.upper()],
         model_type=CoreMLBackend.MODEL_TYPE.MODEL,  # pyre-fixme[16]
         op_linear_quantizer_config=op_linear_quantizer_config,
     )
@@ -142,6 +150,9 @@ def _validate_ios_version() -> None:
     return CoreMLPartitioner(  # pyre-fixme[16]
         compile_specs=compile_specs,
         take_over_mutable_buffer=take_over_mutable_buffer,
+        skip_ops_for_coreml_delegation=[
+            "aten.embedding.default",
+        ],
     )
 
 
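Taken together, these hunks configure the CoreML delegate for iOS 18, FP16 compute on CPU plus the Neural Engine, 4-bit per-channel symmetric weight quantization, and an embedding op that stays out of the delegate. Below is a standalone sketch of roughly the same configuration; the import paths and the hard-coded take_over_mutable_buffer=True are assumptions for illustration.

import coremltools as ct

from executorch.backends.apple.coreml.compiler import CoreMLBackend
from executorch.backends.apple.coreml.partition import CoreMLPartitioner

# Forced override from the diff: 4-bit, per-channel, symmetric linear weights.
op_linear_quantizer_config = {
    "mode": "linear_symmetric",
    "dtype": "int4",
    "granularity": "per_channel",
}

compile_specs = CoreMLBackend.generate_compile_specs(
    minimum_deployment_target=ct.target.iOS18,
    compute_precision=ct.precision.FLOAT16,
    compute_unit=ct.ComputeUnit.CPU_AND_NE,  # CPU + Apple Neural Engine
    model_type=CoreMLBackend.MODEL_TYPE.MODEL,
    op_linear_quantizer_config=op_linear_quantizer_config,
)

partitioner = CoreMLPartitioner(
    compile_specs=compile_specs,
    take_over_mutable_buffer=True,  # illustrative; the helper passes its own flag through
    skip_ops_for_coreml_delegation=[
        "aten.embedding.default",  # keep the embedding lookup in the non-delegated graph
    ],
)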