diff --git a/backends/apple/coreml/TARGETS b/backends/apple/coreml/TARGETS index 6993b699427..c5eec41d5fc 100644 --- a/backends/apple/coreml/TARGETS +++ b/backends/apple/coreml/TARGETS @@ -120,6 +120,7 @@ runtime.python_test( "test/*.py", ]), deps = [ + "fbsource//third-party/pypi/coremltools:coremltools", "fbsource//third-party/pypi/pytest:pytest", ":partitioner", ":quantizer", diff --git a/backends/apple/coreml/recipes/coreml_recipe_provider.py b/backends/apple/coreml/recipes/coreml_recipe_provider.py index 75c937027bb..5d4fee6976d 100644 --- a/backends/apple/coreml/recipes/coreml_recipe_provider.py +++ b/backends/apple/coreml/recipes/coreml_recipe_provider.py @@ -6,6 +6,7 @@ from typing import Any, Optional, Sequence import coremltools as ct +import torch from executorch.backends.apple.coreml.compiler import CoreMLBackend from executorch.backends.apple.coreml.partition.coreml_partitioner import ( @@ -18,11 +19,15 @@ from executorch.exir import EdgeCompileConfig from executorch.export import ( + AOQuantizationConfig, BackendRecipeProvider, ExportRecipe, LoweringRecipe, + QuantizationRecipe, RecipeType, ) +from torchao.quantization.granularity import PerAxis, PerGroup +from torchao.quantization.quant_api import IntxWeightOnlyConfig class CoreMLRecipeProvider(BackendRecipeProvider): @@ -50,34 +55,98 @@ def create_recipe( # Validate kwargs self._validate_recipe_kwargs(recipe_type, **kwargs) - # Parse recipe type to get precision and compute unit - precision = None if recipe_type == CoreMLRecipeType.FP32: - precision = ct.precision.FLOAT32 + return self._build_fp_recipe(recipe_type, ct.precision.FLOAT32, **kwargs) elif recipe_type == CoreMLRecipeType.FP16: - precision = ct.precision.FLOAT16 - - if precision is None: - raise ValueError(f"Unknown precision for recipe: {recipe_type.value}") + return self._build_fp_recipe(recipe_type, ct.precision.FLOAT16, **kwargs) + elif recipe_type == CoreMLRecipeType.PT2E_INT8_STATIC: + return self._build_pt2e_quantized_recipe( + recipe_type, activation_dtype=torch.quint8, **kwargs + ) + elif recipe_type == CoreMLRecipeType.PT2E_INT8_WEIGHT_ONLY: + return self._build_pt2e_quantized_recipe( + recipe_type, activation_dtype=torch.float32, **kwargs + ) + elif recipe_type == CoreMLRecipeType.INT4_WEIGHT_ONLY_PER_CHANNEL: + return self._build_torchao_quantized_recipe( + recipe_type, + weight_dtype=torch.int4, + is_per_channel=True, + **kwargs, + ) + elif recipe_type == CoreMLRecipeType.INT4_WEIGHT_ONLY_PER_GROUP: + group_size = kwargs.pop("group_size", 32) + return self._build_torchao_quantized_recipe( + recipe_type, + weight_dtype=torch.int4, + is_per_channel=False, + group_size=group_size, + **kwargs, + ) + elif recipe_type == CoreMLRecipeType.INT8_WEIGHT_ONLY_PER_CHANNEL: + return self._build_torchao_quantized_recipe( + recipe_type, weight_dtype=torch.int8, is_per_channel=True, **kwargs + ) + elif recipe_type == CoreMLRecipeType.INT8_WEIGHT_ONLY_PER_GROUP: + group_size = kwargs.pop("group_size", 32) + return self._build_torchao_quantized_recipe( + recipe_type, + weight_dtype=torch.int8, + is_per_channel=False, + group_size=group_size, + **kwargs, + ) + elif recipe_type == CoreMLRecipeType.CODEBOOK_WEIGHT_ONLY: + bits = kwargs.pop("bits", 3) + block_size = kwargs.pop("block_size", [-1, 16]) + return self._build_codebook_quantized_recipe( + recipe_type, bits=bits, block_size=block_size, **kwargs + ) - return self._build_recipe(recipe_type, precision, **kwargs) + return None def _validate_recipe_kwargs(self, recipe_type: RecipeType, **kwargs: Any) -> None: - if 
not kwargs: - return - expected_keys = {"minimum_deployment_target", "compute_unit"} + """Validate kwargs for each recipe type""" + expected_keys = self._get_expected_keys(recipe_type) + unexpected = set(kwargs.keys()) - expected_keys if unexpected: raise ValueError( - f"CoreML Recipes only accept 'minimum_deployment_target' or 'compute_unit' as parameter. " - f"Unexpected parameters: {list(unexpected)}" + f"Recipe '{recipe_type.value}' received unexpected parameters: {list(unexpected)}" ) + + self._validate_base_parameters(kwargs) + self._validate_group_size_parameter(recipe_type, kwargs) + self._validate_codebook_parameters(recipe_type, kwargs) + + def _get_expected_keys(self, recipe_type: RecipeType) -> set: + """Get expected parameter keys for a recipe type""" + common_keys = {"minimum_deployment_target", "compute_unit"} + + if recipe_type in [ + CoreMLRecipeType.INT4_WEIGHT_ONLY_PER_GROUP, + CoreMLRecipeType.INT8_WEIGHT_ONLY_PER_GROUP, + ]: + return common_keys | {"group_size", "filter_fn"} + elif recipe_type in [ + CoreMLRecipeType.INT4_WEIGHT_ONLY_PER_CHANNEL, + CoreMLRecipeType.INT8_WEIGHT_ONLY_PER_CHANNEL, + ]: + return common_keys | {"filter_fn"} + elif recipe_type == CoreMLRecipeType.CODEBOOK_WEIGHT_ONLY: + return common_keys | {"bits", "block_size", "filter_fn"} + else: + return common_keys + + def _validate_base_parameters(self, kwargs: Any) -> None: + """Validate minimum_deployment_target and compute_unit parameters""" if "minimum_deployment_target" in kwargs: minimum_deployment_target = kwargs["minimum_deployment_target"] if not isinstance(minimum_deployment_target, ct.target): raise ValueError( f"Parameter 'minimum_deployment_target' must be an enum of type ct.target, got {type(minimum_deployment_target)}" ) + if "compute_unit" in kwargs: compute_unit = kwargs["compute_unit"] if not isinstance(compute_unit, ct.ComputeUnit): @@ -85,12 +154,73 @@ def _validate_recipe_kwargs(self, recipe_type: RecipeType, **kwargs: Any) -> Non f"Parameter 'compute_unit' must be an enum of type ct.ComputeUnit, got {type(compute_unit)}" ) - def _build_recipe( + def _validate_group_size_parameter( + self, recipe_type: RecipeType, kwargs: Any + ) -> None: + """Validate group_size parameter for applicable recipe types""" + if ( + recipe_type + in [ + CoreMLRecipeType.INT4_WEIGHT_ONLY_PER_GROUP, + CoreMLRecipeType.INT8_WEIGHT_ONLY_PER_GROUP, + ] + and "group_size" in kwargs + ): + group_size = kwargs["group_size"] + if not isinstance(group_size, int): + raise ValueError( + f"Parameter 'group_size' must be an integer, got {type(group_size).__name__}: {group_size}" + ) + if group_size <= 0: + raise ValueError( + f"Parameter 'group_size' must be positive, got: {group_size}" + ) + + def _validate_codebook_parameters( + self, recipe_type: RecipeType, kwargs: Any + ) -> None: + """Validate bits and block_size parameters for codebook recipe type""" + if recipe_type != CoreMLRecipeType.CODEBOOK_WEIGHT_ONLY: + return + + if "bits" in kwargs: + bits = kwargs["bits"] + if not isinstance(bits, int): + raise ValueError( + f"Parameter 'bits' must be an integer, got {type(bits).__name__}: {bits}" + ) + if not (1 <= bits <= 8): + raise ValueError( + f"Parameter 'bits' must be between 1 and 8, got: {bits}" + ) + + if "block_size" in kwargs: + block_size = kwargs["block_size"] + if not isinstance(block_size, list): + raise ValueError( + f"Parameter 'block_size' must be a list, got {type(block_size).__name__}: {block_size}" + ) + + def _validate_and_set_deployment_target( + self, kwargs: Any, min_target: ct.target, 
quantization_type: str + ) -> None: + """Validate or set minimum deployment target for quantization recipes""" + minimum_deployment_target = kwargs.get("minimum_deployment_target", None) + if minimum_deployment_target and minimum_deployment_target < min_target: + raise ValueError( + f"minimum_deployment_target must be {str(min_target)} or higher for {quantization_type} quantization" + ) + else: + # Default to the minimum target for this quantization type + kwargs["minimum_deployment_target"] = min_target + + def _build_fp_recipe( self, recipe_type: RecipeType, precision: ct.precision, **kwargs: Any, ) -> ExportRecipe: + """Build FP32/FP16 recipe""" lowering_recipe = self._get_coreml_lowering_recipe( compute_precision=precision, **kwargs, @@ -98,18 +228,142 @@ def _build_recipe( return ExportRecipe( name=recipe_type.value, - quantization_recipe=None, # TODO - add quantization recipe + lowering_recipe=lowering_recipe, + ) + + def _build_pt2e_quantized_recipe( + self, + recipe_type: RecipeType, + activation_dtype: torch.dtype, + **kwargs: Any, + ) -> ExportRecipe: + """Build PT2E-based quantization recipe""" + from executorch.backends.apple.coreml.quantizer import CoreMLQuantizer + + self._validate_and_set_deployment_target(kwargs, ct.target.iOS17, "pt2e") + + # Validate activation_dtype + assert activation_dtype in [ + torch.quint8, + torch.float32, + ], f"activation_dtype must be torch.quint8 or torch.float32, got {activation_dtype}" + + # Create quantization config + config = ct.optimize.torch.quantization.LinearQuantizerConfig( + global_config=ct.optimize.torch.quantization.ModuleLinearQuantizerConfig( + quantization_scheme="symmetric", + activation_dtype=activation_dtype, + weight_dtype=torch.qint8, + weight_per_channel=True, + ) + ) + + quantizer = CoreMLQuantizer(config) + quantization_recipe = QuantizationRecipe(quantizers=[quantizer]) + + lowering_recipe = self._get_coreml_lowering_recipe(**kwargs) + + return ExportRecipe( + name=recipe_type.value, + quantization_recipe=quantization_recipe, + lowering_recipe=lowering_recipe, + ) + + def _build_torchao_quantized_recipe( + self, + recipe_type: RecipeType, + weight_dtype: torch.dtype, + is_per_channel: bool, + group_size: int = 32, + **kwargs: Any, + ) -> ExportRecipe: + """Build TorchAO-based quantization recipe""" + if is_per_channel: + weight_granularity = PerAxis(axis=0) + else: + weight_granularity = PerGroup(group_size=group_size) + + # Use user-provided filter_fn if provided + filter_fn = kwargs.get("filter_fn", None) + config = AOQuantizationConfig( + ao_base_config=IntxWeightOnlyConfig( + weight_dtype=weight_dtype, + granularity=weight_granularity, + ), + filter_fn=filter_fn, + ) + + quantization_recipe = QuantizationRecipe( + quantizers=None, + ao_quantization_configs=[config], + ) + + # override minimum_deployment_target to ios18 for torchao (GH issue #13122) + self._validate_and_set_deployment_target(kwargs, ct.target.iOS18, "torchao") + lowering_recipe = self._get_coreml_lowering_recipe(**kwargs) + + return ExportRecipe( + name=recipe_type.value, + quantization_recipe=quantization_recipe, + lowering_recipe=lowering_recipe, + ) + + def _build_codebook_quantized_recipe( + self, + recipe_type: RecipeType, + bits: int, + block_size: list, + **kwargs: Any, + ) -> ExportRecipe: + """Build codebook/palettization quantization recipe""" + from torchao.prototype.quantization.codebook_coreml import ( + CodebookWeightOnlyConfig, + ) + + self._validate_and_set_deployment_target(kwargs, ct.target.iOS18, "codebook") + + # Get the 
appropriate dtype (torch.uint1 through torch.uint8) + dtype = getattr(torch, f"uint{bits}") + + # Use user-provided filter_fn or default to Linear/Embedding layers + filter_fn = kwargs.get( + "filter_fn", + lambda m, fqn: ( + isinstance(m, torch.nn.Embedding) or isinstance(m, torch.nn.Linear) + ), + ) + + config = AOQuantizationConfig( + ao_base_config=CodebookWeightOnlyConfig( + dtype=dtype, + block_size=block_size, + ), + filter_fn=filter_fn, + ) + + quantization_recipe = QuantizationRecipe( + quantizers=None, + ao_quantization_configs=[config], + ) + + lowering_recipe = self._get_coreml_lowering_recipe(**kwargs) + + return ExportRecipe( + name=recipe_type.value, + quantization_recipe=quantization_recipe, lowering_recipe=lowering_recipe, ) def _get_coreml_lowering_recipe( self, - compute_precision: ct.precision, + compute_precision: ct.precision = ct.precision.FLOAT16, **kwargs: Any, ) -> LoweringRecipe: + """Get CoreML lowering recipe with optional precision""" compile_specs = CoreMLBackend.generate_compile_specs( compute_precision=compute_precision, - **kwargs, + compute_unit=kwargs.get("compute_unit", ct.ComputeUnit.ALL), + minimum_deployment_target=kwargs.get("minimum_deployment_target", None), ) minimum_deployment_target = kwargs.get("minimum_deployment_target", None) diff --git a/backends/apple/coreml/recipes/coreml_recipe_types.py b/backends/apple/coreml/recipes/coreml_recipe_types.py index 77f808bd982..c2ce102a2fd 100644 --- a/backends/apple/coreml/recipes/coreml_recipe_types.py +++ b/backends/apple/coreml/recipes/coreml_recipe_types.py @@ -12,14 +12,42 @@ class CoreMLRecipeType(RecipeType): """CoreML-specific generic recipe types""" - # FP32 generic recipe, defaults to values published by the CoreML backend and partitioner - # Precision = FP32, Default compute_unit = All (can be overriden by kwargs) + ## All the recipes accept common kwargs + # 1. minimum_deployment_target (default: None) + # 2.
compute_unit (default: ct.ComputeUnit.ALL) + + # FP32 precision recipe, defaults to values published by the CoreML backend and partitioner FP32 = "coreml_fp32" - # FP16 generic recipe, defaults to values published by the CoreML backend and partitioner - # Precision = FP32, Default compute_unit = All (can be overriden by kwargs) + # FP16 precision recipe, defaults to values published by the CoreML backend and partitioner FP16 = "coreml_fp16" + ## PT2E-based quantization recipes + # INT8 Static Quantization (weights + activations), requires calibration dataset + PT2E_INT8_STATIC = "coreml_pt2e_int8_static" + # INT8 Weight-only Quantization (activations remain FP32) + PT2E_INT8_WEIGHT_ONLY = "coreml_pt2e_int8_weight_only" + + ## TorchAO-based quantization recipes + # All TorchAO recipes accept filter_fn kwarg to control which layers are quantized + # INT4 Weight-only Quantization, per-channel (axis=0) + # Additional kwargs: filter_fn (default: None - quantizes linear layers) + INT4_WEIGHT_ONLY_PER_CHANNEL = "coreml_int4_weight_only_per_channel" + # INT4 Weight-only Quantization, per-group + # Additional kwargs: group_size (default: 32), filter_fn (default: None - quantizes linear layers) + INT4_WEIGHT_ONLY_PER_GROUP = "coreml_int4_weight_only_per_group" + # INT8 Weight-only Quantization, per-channel (axis=0) + # Additional kwargs: filter_fn (default: None - quantizes linear layers) + INT8_WEIGHT_ONLY_PER_CHANNEL = "coreml_int8_weight_only_per_channel" + # INT8 Weight-only Quantization, per-group + # Additional kwargs: group_size (default: 32), filter_fn (default: None - quantizes linear layers) + INT8_WEIGHT_ONLY_PER_GROUP = "coreml_int8_weight_only_per_group" + + ## Codebook/Palettization Quantization + # Additional kwargs: bits (1-8, default: 3), block_size (default: [-1, 16]), + # filter_fn (default: targets Linear and Embedding layers only) + CODEBOOK_WEIGHT_ONLY = "coreml_codebook_weight_only" + @classmethod def get_backend_name(cls) -> str: return COREML_BACKEND diff --git a/backends/apple/coreml/test/test_coreml_recipes.py b/backends/apple/coreml/test/test_coreml_recipes.py index ca5c6c30c9c..1d53d5bcd4e 100644 --- a/backends/apple/coreml/test/test_coreml_recipes.py +++ b/backends/apple/coreml/test/test_coreml_recipes.py @@ -4,11 +4,10 @@ import unittest -from typing import List import coremltools as ct - import torch + from executorch.backends.apple.coreml.recipes import ( CoreMLRecipeProvider, CoreMLRecipeType, @@ -17,19 +16,17 @@ from executorch.backends.apple.coreml.test.test_coreml_utils import ( IS_VALID_TEST_RUNTIME, ) -from executorch.exir.schema import DelegateCall, Program +from executorch.exir.schema import DelegateCall from executorch.export import export, ExportRecipe, recipe_registry + +from export.types import StageType from torch import nn from torch.testing._internal.common_quantization import TestHelperModules +from torchao.quantization.utils import compute_error class TestCoreMLRecipes(unittest.TestCase): - fp32_recipes: List[CoreMLRecipeType] = [ - CoreMLRecipeType.FP32, - ] - fp16_recipes: List[CoreMLRecipeType] = [ - CoreMLRecipeType.FP16, - ] + """Test suite for CoreML recipes focusing on quantization functionality""" def setUp(self): torch._dynamo.reset() @@ -41,198 +38,557 @@ def setUp(self): def tearDown(self): super().tearDown() - def check_fully_delegated(self, program: Program) -> None: + def check_fully_delegated(self, session) -> None: + """Helper to verify a program is fully delegated to CoreML""" + session.print_delegation_info() + program = 
session.get_executorch_program() instructions = program.execution_plan[0].chains[0].instructions assert instructions is not None self.assertEqual(len(instructions), 1) self.assertIsInstance(instructions[0].instr_args, DelegateCall) - def test_all_fp32_recipes_with_simple_model(self): - """Test all FP32 recipes with a simple linear model""" - for recipe_type in self.fp32_recipes: - with self.subTest(recipe=recipe_type.value): - m_eager = TestHelperModules.TwoLinearModule().eval() - example_inputs = [(torch.randn(9, 8),)] + def _compare_eager_quantized_model_outputs(self, session, example_inputs, atol): + """Utility to compare eager quantized model output with session output after coreml lowering""" + if IS_VALID_TEST_RUNTIME: + source_transform_output = session.get_stage_artifacts()[ + StageType.SOURCE_TRANSFORM + ] + eager_quantized_model = source_transform_output.data["forward"] + output = session.run_method("forward", example_inputs[0])[0] + expected = eager_quantized_model(*example_inputs[0]) + self.assertTrue(torch.allclose(output, expected, atol=atol)) + + def _compare_eager_unquantized_model_outputs( + self, session, eager_unquantized_model, example_inputs, sqnr_threshold=20 + ): + """Utility to compare eager unquantized model output with session output using SQNR""" + if IS_VALID_TEST_RUNTIME: + quantized_output = session.run_method("forward", example_inputs[0])[0] + original_output = eager_unquantized_model(*example_inputs[0]) + error = compute_error(original_output, quantized_output) + print(f"SQNR: {error} dB") + self.assertTrue(error > sqnr_threshold) + + def test_fp32_recipe(self): + """Test FP32 recipe functionality""" + model = TestHelperModules.TwoLinearModule().eval() + example_inputs = [(torch.randn(9, 8),)] + + session = export( + model=model, + example_inputs=example_inputs, + export_recipe=ExportRecipe.get_recipe(CoreMLRecipeType.FP32), + ) + self.check_fully_delegated(session) + + self._compare_eager_quantized_model_outputs(session, example_inputs, atol=1e-3) + self._compare_eager_unquantized_model_outputs(session, model, example_inputs) + + def test_fp16_recipe(self): + """Test FP16 recipe functionality""" + model = TestHelperModules.TwoLinearModule().eval() + example_inputs = [(torch.randn(9, 8),)] + session = export( + model=model, + example_inputs=example_inputs, + export_recipe=ExportRecipe.get_recipe(CoreMLRecipeType.FP16), + ) + self.check_fully_delegated(session) + + self._compare_eager_quantized_model_outputs(session, example_inputs, atol=1e-3) + self._compare_eager_unquantized_model_outputs(session, model, example_inputs) + + def test_fp_recipes_with_custom_parameters(self): + """Test FP recipes with custom deployment target and compute unit""" + test_cases = [ + (CoreMLRecipeType.FP32, {"minimum_deployment_target": ct.target.iOS16}), + (CoreMLRecipeType.FP16, {"compute_unit": ct.ComputeUnit.CPU_ONLY}), + ] + + model = TestHelperModules.TwoLinearModule().eval() + example_inputs = [(torch.randn(9, 8),)] + + for recipe_type, kwargs in test_cases: + with self.subTest(recipe=recipe_type.value, kwargs=kwargs): session = export( - model=m_eager, + model=model, example_inputs=example_inputs, - export_recipe=ExportRecipe.get_recipe(recipe_type), + export_recipe=ExportRecipe.get_recipe(recipe_type, **kwargs), ) - self.check_fully_delegated(session.get_executorch_program()) - - # Verify outputs match - if IS_VALID_TEST_RUNTIME: - self.assertTrue( - torch.allclose( - session.run_method("forward", example_inputs[0])[0], - m_eager(*example_inputs[0]), - atol=1e-3, - ) - )
+ self.check_fully_delegated(session) + + def test_int4_weight_only_per_channel(self): + """Test INT4 weight-only per-channel quantization""" + model = TestHelperModules.TwoLinearModule().eval() + example_inputs = [(torch.randn(9, 8),)] + + session = export( + model=model, + example_inputs=example_inputs, + export_recipe=ExportRecipe.get_recipe( + CoreMLRecipeType.INT4_WEIGHT_ONLY_PER_CHANNEL + ), + ) + self.check_fully_delegated(session) + self._compare_eager_quantized_model_outputs(session, example_inputs, atol=1e-02) + self._compare_eager_unquantized_model_outputs(session, model, example_inputs) - def test_all_fp16_recipes_with_simple_model(self): - """Test all FP16 recipes with a simple linear model""" + def test_int4_weight_only_per_group(self): + """Test INT4 weight-only per-group quantization with different group sizes""" - for recipe_type in self.fp16_recipes: - with self.subTest(recipe=recipe_type.value): - m_eager = TestHelperModules.TwoLinearModule().eval() - example_inputs = [(torch.randn(9, 8),)] + class CustomTwoLinearModel(nn.Module): + def __init__(self): + super().__init__() + self.layer1 = nn.Linear(32, 32) + self.layer2 = nn.Linear(32, 8) + + def forward(self, x): + x = torch.relu(self.layer1(x)) + x = self.layer2(x) + return x + model = CustomTwoLinearModel().eval() + example_inputs = [(torch.randn(1, 32),)] + # Test with different group sizes + for group_size in [8, 16, 32]: + with self.subTest(group_size=group_size): session = export( - model=m_eager, + model=model, example_inputs=example_inputs, - export_recipe=ExportRecipe.get_recipe(recipe_type), + export_recipe=ExportRecipe.get_recipe( + CoreMLRecipeType.INT4_WEIGHT_ONLY_PER_GROUP, + group_size=group_size, + ), ) + self.check_fully_delegated(session) - self.check_fully_delegated(session.get_executorch_program()) + self._compare_eager_quantized_model_outputs( + session, example_inputs, atol=1e-3 + ) + self._compare_eager_unquantized_model_outputs( + session, model, example_inputs + ) - # Verify outputs match (slightly higher tolerance for FP16) - if IS_VALID_TEST_RUNTIME: - self.assertTrue( - torch.allclose( - session.run_method("forward", example_inputs[0])[0], - m_eager(*example_inputs[0]), - atol=1e-3, - ) - ) + def test_int4_weight_only_per_group_validation(self): + """Test INT4 per-group parameter validation""" + # Test invalid group size type + with self.assertRaises(ValueError) as cm: + self.provider.create_recipe( + CoreMLRecipeType.INT4_WEIGHT_ONLY_PER_GROUP, group_size="32" + ) + self.assertIn("must be an integer", str(cm.exception)) + + # Test negative group size + with self.assertRaises(ValueError) as cm: + self.provider.create_recipe( + CoreMLRecipeType.INT4_WEIGHT_ONLY_PER_GROUP, group_size=-1 + ) + self.assertIn("must be positive", str(cm.exception)) + + # Test unexpected parameter + with self.assertRaises(ValueError) as cm: + self.provider.create_recipe( + CoreMLRecipeType.INT4_WEIGHT_ONLY_PER_CHANNEL, + group_size=32, # group_size not valid for per-channel + ) + self.assertIn("unexpected parameters", str(cm.exception)) + + def test_int8_weight_only_per_channel(self): + """Test INT8 weight-only per-channel quantization""" + model = TestHelperModules.TwoLinearModule().eval() + example_inputs = [(torch.randn(9, 8),)] + + session = export( + model=model, + example_inputs=example_inputs, + export_recipe=ExportRecipe.get_recipe( + CoreMLRecipeType.INT8_WEIGHT_ONLY_PER_CHANNEL + ), + ) + self.check_fully_delegated(session) + + self._compare_eager_quantized_model_outputs(session, example_inputs, atol=1e-2) 
+ self._compare_eager_unquantized_model_outputs(session, model, example_inputs) - def test_custom_simple_model(self): - """Test with a custom simple model""" + def test_int8_weight_only_per_group(self): + """Test INT8 weight-only per-group quantization with different group sizes""" - class CustomTestModel(nn.Module): + class SimpleLinearModel(nn.Module): def __init__(self): super().__init__() - self.linear1 = nn.Linear(10, 20) - self.relu = nn.ReLU() - self.linear2 = nn.Linear(20, 1) + self.layer = nn.Linear(64, 2) def forward(self, x): - x = self.linear1(x) - x = self.relu(x) - x = self.linear2(x) - return x + return self.layer(x) - model = CustomTestModel().eval() - example_inputs = [(torch.randn(1, 10),)] - for recipe_type in self.fp32_recipes + self.fp16_recipes: - with self.subTest(recipe=recipe_type.value): + model = SimpleLinearModel().eval() + example_inputs = [(torch.randn(1, 64),)] + + # Test with different group sizes + for group_size in [16, 32, 64]: + with self.subTest(group_size=group_size): session = export( model=model, example_inputs=example_inputs, - export_recipe=ExportRecipe.get_recipe(recipe_type), + export_recipe=ExportRecipe.get_recipe( + CoreMLRecipeType.INT8_WEIGHT_ONLY_PER_GROUP, + group_size=group_size, + ), + ) + self.check_fully_delegated(session) + + self._compare_eager_quantized_model_outputs( + session, example_inputs, atol=1e-2 + ) + self._compare_eager_unquantized_model_outputs( + session, model, example_inputs ) - session.print_delegation_info() - self.check_fully_delegated(session.get_executorch_program()) - - if IS_VALID_TEST_RUNTIME: - self.assertTrue( - torch.allclose( - session.run_method("forward", example_inputs[0])[0], - model(*example_inputs[0]), - atol=1e-3, - ) - ) - def test_unsupported_recipe_type(self): - """Test that unsupported recipe types return None""" - from executorch.export import RecipeType + def test_codebook_weight_only_default(self): + """Test codebook quantization with default parameters (3 bits)""" - class UnsupportedRecipeType(RecipeType): - UNSUPPORTED = "unsupported" + class SimpleLinearModel(nn.Module): + def __init__(self): + super().__init__() + self.layer = nn.Linear(32, 2) - @classmethod - def get_backend_name(cls) -> str: - return "dummy" + def forward(self, x): + return self.layer(x) + + model = SimpleLinearModel().eval() + example_inputs = [(torch.randn(1, 32),)] + + session = export( + model=model, + example_inputs=example_inputs, + export_recipe=ExportRecipe.get_recipe( + CoreMLRecipeType.CODEBOOK_WEIGHT_ONLY, + block_size=[-1, 8], + ), + ) + self.check_fully_delegated(session) - recipe = self.provider.create_recipe(UnsupportedRecipeType.UNSUPPORTED) - self.assertIsNone(recipe) + self._compare_eager_quantized_model_outputs(session, example_inputs, atol=1e-3) + self._compare_eager_unquantized_model_outputs(session, model, example_inputs) - def test_recipe_registry_integration(self): - """Test that recipes work with the global recipe registry""" - for recipe_type in self.fp32_recipes + self.fp16_recipes: - with self.subTest(recipe=recipe_type.value): - recipe = ExportRecipe.get_recipe(recipe_type) - self.assertIsNotNone(recipe) - self.assertEqual(recipe.name, recipe_type.value) + def test_codebook_weight_only_custom_bits(self): + """Test codebook quantization with different bit configurations""" - def test_invalid_recipe_kwargs(self): - """Test detailed error messages for invalid kwargs""" - provider = CoreMLRecipeProvider() + class SimpleLinearModel(nn.Module): + def __init__(self): + super().__init__() + self.layer 
= nn.Linear(32, 2) - # Test single invalid parameter - with self.assertRaises(ValueError) as cm: - provider.create_recipe(CoreMLRecipeType.FP16, invalid_param=123) + def forward(self, x): + return self.layer(x) + + model = SimpleLinearModel().eval() + example_inputs = [(torch.randn(1, 32),)] + session = export( + model=model, + example_inputs=example_inputs, + export_recipe=ExportRecipe.get_recipe( + CoreMLRecipeType.CODEBOOK_WEIGHT_ONLY, bits=4 + ), + ) + self.check_fully_delegated(session) + + self._compare_eager_quantized_model_outputs(session, example_inputs, atol=1e-3) + self._compare_eager_unquantized_model_outputs(session, model, example_inputs) + + def test_codebook_weight_only_custom_block_size(self): + """Test codebook quantization with custom block sizes""" - error_msg = str(cm.exception) - self.assertIn("Unexpected parameters", error_msg) + class SimpleLinearModel(nn.Module): + def __init__(self): + super().__init__() + self.layer = nn.Linear(32, 2) + + def forward(self, x): + return self.layer(x) + + model = SimpleLinearModel().eval() + example_inputs = [(torch.randn(1, 32),)] + + # Test different block sizes + test_cases = [ + {"bits": 3, "block_size": [-1, 8]}, + ] + + for kwargs in test_cases: + with self.subTest(kwargs=kwargs): + session = export( + model=model, + example_inputs=example_inputs, + export_recipe=ExportRecipe.get_recipe( + CoreMLRecipeType.CODEBOOK_WEIGHT_ONLY, **kwargs + ), + ) + self.check_fully_delegated(session) - # Test multiple invalid parameters + def test_codebook_parameter_validation(self): + """Test codebook parameter validation""" + # Test invalid bits type with self.assertRaises(ValueError) as cm: - provider.create_recipe( - CoreMLRecipeType.FP32, param1="value1", param2="value2" - ) + self.provider.create_recipe(CoreMLRecipeType.CODEBOOK_WEIGHT_ONLY, bits="3") + self.assertIn("must be an integer", str(cm.exception)) - error_msg = str(cm.exception) - self.assertIn("Unexpected parameters", error_msg) + # Test bits out of range + with self.assertRaises(ValueError) as cm: + self.provider.create_recipe(CoreMLRecipeType.CODEBOOK_WEIGHT_ONLY, bits=0) + self.assertIn("must be between 1 and 8", str(cm.exception)) - # Test mix of valid and invalid parameters with self.assertRaises(ValueError) as cm: - provider.create_recipe( - CoreMLRecipeType.FP32, - minimum_deployment_target=ct.target.iOS16, # valid - invalid_param="invalid", # invalid - ) + self.provider.create_recipe(CoreMLRecipeType.CODEBOOK_WEIGHT_ONLY, bits=9) + self.assertIn("must be between 1 and 8", str(cm.exception)) - error_msg = str(cm.exception) - self.assertIn("Unexpected parameters", error_msg) + # Test invalid block_size type + with self.assertRaises(ValueError) as cm: + self.provider.create_recipe( + CoreMLRecipeType.CODEBOOK_WEIGHT_ONLY, block_size="[-1, 16]" + ) + self.assertIn("must be a list", str(cm.exception)) - def test_valid_kwargs(self): - """Test valid kwargs""" - recipe = self.provider.create_recipe( - CoreMLRecipeType.FP32, - minimum_deployment_target=ct.target.iOS16, - compute_unit=ct.ComputeUnit.CPU_AND_GPU, - ) - self.assertIsNotNone(recipe) - self.assertEqual(recipe.name, "coreml_fp32") + def test_int8_static_quantization(self): + """Test INT8 static quantization (weights + activations)""" - # Verify partitioners are properly configured - partitioners = recipe.lowering_recipe.partitioners - self.assertEqual(len(partitioners), 1, "Expected exactly one partitioner") + class SimpleLinearModel(nn.Module): + def __init__(self): + super().__init__() + self.layer1 = nn.Linear(32, 
16) + self.layer2 = nn.Linear(16, 2) - # Verify delegation spec and compile specs - delegation_spec = partitioners[0].delegation_spec - self.assertIsNotNone(delegation_spec, "Delegation spec should not be None") + def forward(self, x): + x = torch.relu(self.layer1(x)) + x = self.layer2(x) + return x - compile_specs = delegation_spec.compile_specs - self.assertIsNotNone(compile_specs, "Compile specs should not be None") + model = SimpleLinearModel().eval() + example_inputs = [(torch.randn(1, 32),)] - spec_dict = {spec.key: spec.value for spec in compile_specs} + recipe = ExportRecipe.get_recipe( + CoreMLRecipeType.PT2E_INT8_STATIC, minimum_deployment_target=ct.target.iOS17 + ) - # Assert that all expected specs are present with correct values - self.assertIn( - "min_deployment_target", - spec_dict, - "minimum_deployment_target should be in compile specs", + session = export( + model=model, + example_inputs=example_inputs, + export_recipe=recipe, ) - min_target_value = spec_dict["min_deployment_target"] - if isinstance(min_target_value, bytes): - min_target_value = min_target_value.decode("utf-8") - self.assertEqual( - str(min_target_value), - str(ct.target.iOS16.value), - "minimum_deployment_target should match the provided value", + self.check_fully_delegated(session) + + self._compare_eager_quantized_model_outputs(session, example_inputs, atol=1e-3) + self._compare_eager_unquantized_model_outputs(session, model, example_inputs) + + def test_int8_weight_only_pt2e(self): + """Test PT2E-based INT8 weight-only quantization""" + model = TestHelperModules.TwoLinearModule().eval() + example_inputs = [(torch.randn(9, 8),)] + + session = export( + model=model, + example_inputs=example_inputs, + export_recipe=ExportRecipe.get_recipe( + CoreMLRecipeType.PT2E_INT8_WEIGHT_ONLY + ), ) + self.check_fully_delegated(session) - self.assertIn( - "compute_units", spec_dict, "compute_unit should be in compile specs" - ) - compute_unit_value = spec_dict["compute_units"] - if isinstance(compute_unit_value, bytes): - compute_unit_value = compute_unit_value.decode("utf-8") - self.assertEqual( - str(compute_unit_value), - ct.ComputeUnit.CPU_AND_GPU.name.lower(), - "compute_unit should match the provided value", + self._compare_eager_quantized_model_outputs(session, example_inputs, atol=1e-2) + self._compare_eager_unquantized_model_outputs(session, model, example_inputs) + + def test_int8_weight_only_pt2e_with_conv(self): + """Test PT2E-based INT8 weight-only quantization with convolution layers""" + + class ConvModel(nn.Module): + def __init__(self): + super().__init__() + self.conv1 = nn.Conv2d(3, 16, 3, padding=1) + self.conv2 = nn.Conv2d(16, 32, 3, padding=1) + self.pool = nn.AdaptiveAvgPool2d((1, 1)) + self.fc = nn.Linear(32, 10) + + def forward(self, x): + x = torch.relu(self.conv1(x)) + x = torch.relu(self.conv2(x)) + x = self.pool(x) + x = x.view(x.size(0), -1) + x = self.fc(x) + return x + + model = ConvModel().eval() + example_inputs = [(torch.randn(1, 3, 32, 32),)] + + session = export( + model=model, + example_inputs=example_inputs, + export_recipe=ExportRecipe.get_recipe( + CoreMLRecipeType.PT2E_INT8_WEIGHT_ONLY + ), ) + self.check_fully_delegated(session) + + self._compare_eager_quantized_model_outputs(session, example_inputs, atol=1e-2) + self._compare_eager_unquantized_model_outputs(session, model, example_inputs) + + def test_pt2e_recipes_parameter_rejection(self): + """Test that PT2E recipes reject TorchAO-specific parameters""" + # PT2E recipes should reject TorchAO-specific parameters + 
pt2e_recipes = [ + CoreMLRecipeType.PT2E_INT8_STATIC, + CoreMLRecipeType.PT2E_INT8_WEIGHT_ONLY, + ] + torchao_params = ["filter_fn", "group_size", "bits", "block_size"] + + for recipe_type in pt2e_recipes: + for param in torchao_params: + with self.subTest(recipe=recipe_type.value, param=param): + kwargs = {param: "dummy_value"} + with self.assertRaises(ValueError) as cm: + self.provider.create_recipe(recipe_type, **kwargs) + self.assertIn("unexpected parameters", str(cm.exception).lower()) + + def test_filter_fn_comprehensive(self): + """Comprehensive test for filter_fn parameter functionality""" + + def custom_filter(module, fqn): + return isinstance(module, nn.Linear) and "target" in fqn + + # Test 1: TorchAO recipes accept filter_fn and default to None + torchao_recipes = [ + CoreMLRecipeType.INT4_WEIGHT_ONLY_PER_CHANNEL, + CoreMLRecipeType.INT4_WEIGHT_ONLY_PER_GROUP, + CoreMLRecipeType.INT8_WEIGHT_ONLY_PER_CHANNEL, + CoreMLRecipeType.INT8_WEIGHT_ONLY_PER_GROUP, + ] + + for recipe_type in torchao_recipes: + with self.subTest(f"{recipe_type.value}_default"): + # Test default behavior (None) + recipe = self.provider.create_recipe(recipe_type) + config = recipe.quantization_recipe.ao_quantization_configs[0] + self.assertIsNone(config.filter_fn) + + with self.subTest(f"{recipe_type.value}_custom"): + # Test custom filter_fn + recipe = self.provider.create_recipe( + recipe_type, filter_fn=custom_filter + ) + config = recipe.quantization_recipe.ao_quantization_configs[0] + self.assertEqual(config.filter_fn, custom_filter) + + # Test 2: Codebook recipe accepts filter_fn and has sensible default + with self.subTest("codebook_default"): + recipe = self.provider.create_recipe(CoreMLRecipeType.CODEBOOK_WEIGHT_ONLY) + config = recipe.quantization_recipe.ao_quantization_configs[0] + self.assertIsNotNone(config.filter_fn) + + # Test default filter targets Linear and Embedding layers + linear_module = nn.Linear(10, 5) + embedding_module = nn.Embedding(100, 10) + conv_module = nn.Conv2d(3, 16, 3) + + self.assertTrue(config.filter_fn(linear_module, "linear")) + self.assertTrue(config.filter_fn(embedding_module, "embedding")) + self.assertFalse(config.filter_fn(conv_module, "conv")) + + with self.subTest("codebook_custom"): + recipe = self.provider.create_recipe( + CoreMLRecipeType.CODEBOOK_WEIGHT_ONLY, filter_fn=custom_filter + ) + config = recipe.quantization_recipe.ao_quantization_configs[0] + self.assertEqual(config.filter_fn, custom_filter) + + def test_quantization_recipe_structure(self): + """Test that quantization recipes have proper structure""" + quantization_recipes = [ + CoreMLRecipeType.INT4_WEIGHT_ONLY_PER_CHANNEL, + CoreMLRecipeType.INT4_WEIGHT_ONLY_PER_GROUP, + CoreMLRecipeType.INT8_WEIGHT_ONLY_PER_CHANNEL, + CoreMLRecipeType.INT8_WEIGHT_ONLY_PER_GROUP, + CoreMLRecipeType.CODEBOOK_WEIGHT_ONLY, + ] + + for recipe_type in quantization_recipes: + with self.subTest(recipe=recipe_type.value): + recipe = self.provider.create_recipe(recipe_type) + self.assertIsNotNone(recipe) + + # Should have quantization recipe with ao_quantization_configs + self.assertIsNotNone(recipe.quantization_recipe) + self.assertIsNotNone(recipe.quantization_recipe.ao_quantization_configs) + self.assertEqual( + len(recipe.quantization_recipe.ao_quantization_configs), 1 + ) + + # Should have lowering recipe + self.assertIsNotNone(recipe.lowering_recipe) + self.assertIsNotNone(recipe.lowering_recipe.partitioners) + + def test_recipe_creation_with_defaults(self): + """Test that recipes work with default parameters""" + # 
Test that all recipes can be created without explicit parameters + all_recipes = [ + CoreMLRecipeType.FP32, + CoreMLRecipeType.FP16, + CoreMLRecipeType.INT4_WEIGHT_ONLY_PER_CHANNEL, + CoreMLRecipeType.INT4_WEIGHT_ONLY_PER_GROUP, # should use default group_size=32 + CoreMLRecipeType.INT8_WEIGHT_ONLY_PER_CHANNEL, + CoreMLRecipeType.INT8_WEIGHT_ONLY_PER_GROUP, # should use default group_size=32 + CoreMLRecipeType.CODEBOOK_WEIGHT_ONLY, # should use default bits=3, block_size=[-1,16] + ] + + for recipe_type in all_recipes: + with self.subTest(recipe=recipe_type.value): + recipe = self.provider.create_recipe(recipe_type) + self.assertIsNotNone(recipe) + self.assertEqual(recipe.name, recipe_type.value) + + def test_minimum_deployment_target_validation(self): + """Test that minimum_deployment_target validation works correctly for quantization recipes""" + test_cases = [ + (CoreMLRecipeType.PT2E_INT8_STATIC, ct.target.iOS17), + (CoreMLRecipeType.PT2E_INT8_WEIGHT_ONLY, ct.target.iOS17), + (CoreMLRecipeType.INT4_WEIGHT_ONLY_PER_CHANNEL, ct.target.iOS18), + (CoreMLRecipeType.INT4_WEIGHT_ONLY_PER_GROUP, ct.target.iOS18), + (CoreMLRecipeType.INT8_WEIGHT_ONLY_PER_CHANNEL, ct.target.iOS18), + (CoreMLRecipeType.INT8_WEIGHT_ONLY_PER_GROUP, ct.target.iOS18), + (CoreMLRecipeType.CODEBOOK_WEIGHT_ONLY, ct.target.iOS18), + ] + + for recipe_type, min_target in test_cases: + with self.subTest(recipe=recipe_type.value): + + # Test 1: Providing deployment target below minimum should raise ValueError + too_low_target = ct.target.iOS15 + with self.assertRaises(ValueError) as cm: + self.provider.create_recipe( + recipe_type, minimum_deployment_target=too_low_target + ) + error_msg = str(cm.exception) + self.assertIn( + f"minimum_deployment_target must be {str(min_target)} or higher", + error_msg, + ) + + # Test 2: Providing valid deployment target should work + valid_recipe = self.provider.create_recipe( + recipe_type, minimum_deployment_target=min_target + ) + self.assertIsNotNone(valid_recipe) + + # Test 3: Not providing deployment target should default to minimum + default_recipe = self.provider.create_recipe(recipe_type) + self.assertIsNotNone(default_recipe) + + # Test 4: Providing deployment target higher than minimum should work + higher_target = ( + ct.target.iOS18 + if min_target == ct.target.iOS17 + else ct.target.iOS18 + ) + higher_recipe = self.provider.create_recipe( + recipe_type, minimum_deployment_target=higher_target + ) + self.assertIsNotNone(higher_recipe)
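
Usage notes (not part of the patch): a minimal sketch of exporting with one of the new TorchAO weight-only recipes, mirroring the tests in test_coreml_recipes.py. The module definition and shapes are illustrative; export, ExportRecipe.get_recipe, and print_delegation_info are the APIs exercised by the tests above.

import torch
from torch import nn

from executorch.backends.apple.coreml.recipes import CoreMLRecipeType
from executorch.export import export, ExportRecipe


class SmallLinearModel(nn.Module):
    # Illustrative two-layer model; any nn.Module exported as in the tests above works.
    def __init__(self):
        super().__init__()
        self.layer1 = nn.Linear(32, 32)
        self.layer2 = nn.Linear(32, 8)

    def forward(self, x):
        return self.layer2(torch.relu(self.layer1(x)))


model = SmallLinearModel().eval()
example_inputs = [(torch.randn(1, 32),)]

# INT4 weight-only, per-group quantization; group_size defaults to 32 when omitted.
session = export(
    model=model,
    example_inputs=example_inputs,
    export_recipe=ExportRecipe.get_recipe(
        CoreMLRecipeType.INT4_WEIGHT_ONLY_PER_GROUP,
        group_size=16,
    ),
)
session.print_delegation_info()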
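
The codebook recipe takes bits, block_size, and filter_fn; per the provider, the defaults are bits=3, block_size=[-1, 16], and a filter targeting Linear and Embedding layers. The predicate below is an illustrative replacement for that default, not the built-in behavior:

# Illustrative filter: palettize Linear layers only, skipping Embedding.
def linear_only(module, fqn):
    return isinstance(module, nn.Linear)

codebook_recipe = ExportRecipe.get_recipe(
    CoreMLRecipeType.CODEBOOK_WEIGHT_ONLY,
    bits=4,              # 1-8, defaults to 3
    block_size=[-1, 8],  # defaults to [-1, 16]
    filter_fn=linear_only,
)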
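
Each quantization recipe also pins a minimum deployment target (iOS17 for the PT2E recipes, iOS18 for the TorchAO and codebook recipes): omitting minimum_deployment_target falls back to that floor, while passing anything lower raises a ValueError, as exercised by test_minimum_deployment_target_validation. A small sketch of that behavior:

import coremltools as ct

from executorch.backends.apple.coreml.recipes import CoreMLRecipeProvider

provider = CoreMLRecipeProvider()
try:
    provider.create_recipe(
        CoreMLRecipeType.PT2E_INT8_STATIC,
        minimum_deployment_target=ct.target.iOS15,  # below the iOS17 floor for PT2E recipes
    )
except ValueError as err:
    print(err)  # "minimum_deployment_target must be ... or higher for pt2e quantization"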