Skip to content

Commit 4d2acf2

Browse files
authored
Add compute unit to coreml llama export (#7280)
* init
* up
1 parent 037c937 commit 4d2acf2

File tree

2 files changed: +32 additions, −5 deletions

examples/models/llama/export_llama_lib.py

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -353,8 +353,8 @@ def build_args_parser() -> argparse.ArgumentParser:
353353
parser.add_argument(
354354
"--coreml-quantize",
355355
default=None,
356-
choices=["b4w"],
357-
help="This option is only for coreml: Use coreml quantization, e.g. b4w (for blockwise 4 bit weight)",
356+
choices=["b4w", "c4w"],
357+
help="This option is only for coreml: Use coreml quantization, e.g. b4w (for blockwise 4 bit weight), c4w (for channelwise 4 bit weight)",
358358
)
359359
parser.add_argument(
360360
"--coreml-ios",
@@ -363,6 +363,13 @@ def build_args_parser() -> argparse.ArgumentParser:
363363
choices=(15, 16, 17, 18),
364364
help="This option is only for coreml: The minimum iOS version to deploy",
365365
)
366+
parser.add_argument(
367+
"--coreml-compute-units",
368+
type=str,
369+
default="cpu_only",
370+
choices=("cpu_only", "cpu_and_gpu", "cpu_and_ne", "all"),
371+
help="This option is only for coreml: the compute units to use when running the model",
372+
)
366373
parser.add_argument(
367374
"--qnn",
368375
action="store_true",
@@ -703,6 +710,7 @@ def _export_llama(args) -> LLMEdgeManager: # noqa: C901
703710
args.embedding_quantize,
704711
args.pt2e_quantize,
705712
args.coreml_quantize,
713+
args.coreml_compute_units,
706714
)
707715
partitioners.append(coreml_partitioner)
708716
modelname = f"coreml_{modelname}"

extension/llm/export/partitioner_lib.py

Lines changed: 22 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,7 @@ def get_coreml_partitioner(
6969
embedding_quantize: Optional[str] = None,
7070
pt2e_quantize: Optional[str] = None,
7171
coreml_quantize: Optional[str] = None,
72+
coreml_compute_units: Optional[str] = None,
7273
):
7374
try:
7475
import coremltools as ct
@@ -119,6 +120,19 @@ def _validate_ios_version() -> None:
119120
17: ct.target.iOS17,
120121
18: ct.target.iOS18,
121122
}[ios]
123+
124+
if coreml_compute_units is None:
125+
# using `ComputeUnit.ALL` can increase the model load time
126+
# On iPhone 15 Pro, CPU decode model is over 8x faster than GPU for stories110M,
127+
# so default to CPU_ONLY
128+
coreml_compute_units = "cpu_only"
129+
coreml_compute_units = {
130+
"cpu_only": ct.ComputeUnit.CPU_ONLY,
131+
"cpu_and_ne": ct.ComputeUnit.CPU_AND_NE,
132+
"cpu_and_gpu": ct.ComputeUnit.CPU_AND_GPU,
133+
"all": ct.ComputeUnit.ALL,
134+
}[coreml_compute_units.lower()]
135+
122136
op_linear_quantizer_config = None
123137
if coreml_quantize == "b4w":
124138
op_linear_quantizer_config = {
@@ -128,17 +142,22 @@ def _validate_ios_version() -> None:
128142
"block_size": 32,
129143
"weight_threshold": 512,
130144
}
145+
elif coreml_quantize == "c4w":
146+
op_linear_quantizer_config = {
147+
"mode": "linear_symmetric",
148+
"dtype": "int4",
149+
"granularity": "per_channel",
150+
}
151+
131152
compile_specs = CoreMLBackend.generate_compile_specs( # pyre-fixme[16]
132153
minimum_deployment_target=minimum_deployment_target,
133154
compute_precision=ct.precision(ct.precision.FLOAT16.value),
134-
# using `ComputeUnit.ALL` can increase the model load time, default to `ComputeUnit.CPU_AND_GPU`
135-
compute_unit=ct.ComputeUnit[ct.ComputeUnit.CPU_AND_GPU.name.upper()],
155+
compute_unit=coreml_compute_units,
136156
model_type=CoreMLBackend.MODEL_TYPE.MODEL, # pyre-fixme[16]
137157
op_linear_quantizer_config=op_linear_quantizer_config,
138158
)
139159

140160
take_over_mutable_buffer = minimum_deployment_target >= ct.target.iOS18
141-
142161
return CoreMLPartitioner( # pyre-fixme[16]
143162
compile_specs=compile_specs,
144163
take_over_mutable_buffer=take_over_mutable_buffer,

0 commit comments

Comments (0)