diff --git a/backends/apple/coreml/TARGETS b/backends/apple/coreml/TARGETS index c5eec41d5fc..6993b699427 100644 --- a/backends/apple/coreml/TARGETS +++ b/backends/apple/coreml/TARGETS @@ -120,7 +120,6 @@ runtime.python_test( "test/*.py", ]), deps = [ - "fbsource//third-party/pypi/coremltools:coremltools", "fbsource//third-party/pypi/pytest:pytest", ":partitioner", ":quantizer", diff --git a/backends/apple/coreml/recipes/coreml_recipe_provider.py b/backends/apple/coreml/recipes/coreml_recipe_provider.py index 90b798f9e0c..75c937027bb 100644 --- a/backends/apple/coreml/recipes/coreml_recipe_provider.py +++ b/backends/apple/coreml/recipes/coreml_recipe_provider.py @@ -6,7 +6,6 @@ from typing import Any, Optional, Sequence import coremltools as ct -import torch from executorch.backends.apple.coreml.compiler import CoreMLBackend from executorch.backends.apple.coreml.partition.coreml_partitioner import ( @@ -19,15 +18,11 @@ from executorch.exir import EdgeCompileConfig from executorch.export import ( - AOQuantizationConfig, BackendRecipeProvider, ExportRecipe, LoweringRecipe, - QuantizationRecipe, RecipeType, ) -from torchao.quantization.granularity import PerAxis, PerGroup -from torchao.quantization.quant_api import IntxWeightOnlyConfig class CoreMLRecipeProvider(BackendRecipeProvider): @@ -55,98 +50,34 @@ def create_recipe( # Validate kwargs self._validate_recipe_kwargs(recipe_type, **kwargs) + # Parse recipe type to get the compute precision + precision = None if recipe_type == CoreMLRecipeType.FP32: - return self._build_fp_recipe(recipe_type, ct.precision.FLOAT32, **kwargs) + precision = ct.precision.FLOAT32 elif recipe_type == CoreMLRecipeType.FP16: - return self._build_fp_recipe(recipe_type, ct.precision.FLOAT16, **kwargs) - elif recipe_type == CoreMLRecipeType.PT2E_INT8_STATIC: - return self._build_pt2e_quantized_recipe( - recipe_type, activation_dtype=torch.quint8, **kwargs - ) - elif recipe_type == CoreMLRecipeType.PT2E_INT8_WEIGHT_ONLY: - return self._build_pt2e_quantized_recipe( - recipe_type, activation_dtype=torch.float32, **kwargs - ) - elif recipe_type == CoreMLRecipeType.TORCHAO_INT4_WEIGHT_ONLY_PER_CHANNEL: - return self._build_torchao_quantized_recipe( - recipe_type, - weight_dtype=torch.int4, - is_per_channel=True, - **kwargs, - ) - elif recipe_type == CoreMLRecipeType.TORCHAO_INT4_WEIGHT_ONLY_PER_GROUP: - group_size = kwargs.pop("group_size", 32) - return self._build_torchao_quantized_recipe( - recipe_type, - weight_dtype=torch.int4, - is_per_channel=False, - group_size=group_size, - **kwargs, - ) - elif recipe_type == CoreMLRecipeType.TORCHAO_INT8_WEIGHT_ONLY_PER_CHANNEL: - return self._build_torchao_quantized_recipe( - recipe_type, weight_dtype=torch.int8, is_per_channel=True, **kwargs - ) - elif recipe_type == CoreMLRecipeType.TORCHAO_INT8_WEIGHT_ONLY_PER_GROUP: - group_size = kwargs.pop("group_size", 32) - return self._build_torchao_quantized_recipe( - recipe_type, - weight_dtype=torch.int8, - is_per_channel=False, - group_size=group_size, - **kwargs, - ) - elif recipe_type == CoreMLRecipeType.CODEBOOK_WEIGHT_ONLY: - bits = kwargs.pop("bits") - block_size = kwargs.pop("block_size") - return self._build_codebook_quantized_recipe( - recipe_type, bits=bits, block_size=block_size, **kwargs - ) + precision = ct.precision.FLOAT16 - return None + if precision is None: + raise ValueError(f"Unknown precision for recipe: {recipe_type.value}") - def _validate_recipe_kwargs(self, recipe_type: RecipeType, **kwargs: Any) -> None: - """Validate kwargs for each recipe type""" - 
expected_keys = self._get_expected_keys(recipe_type) + return self._build_recipe(recipe_type, precision, **kwargs) + def _validate_recipe_kwargs(self, recipe_type: RecipeType, **kwargs: Any) -> None: + if not kwargs: + return + expected_keys = {"minimum_deployment_target", "compute_unit"} unexpected = set(kwargs.keys()) - expected_keys if unexpected: raise ValueError( - f"Recipe '{recipe_type.value}' received unexpected parameters: {list(unexpected)}" + f"CoreML recipes only accept 'minimum_deployment_target' or 'compute_unit' as parameters. " + f"Unexpected parameters: {list(unexpected)}" ) - - self._validate_base_parameters(kwargs) - self._validate_group_size_parameter(recipe_type, kwargs) - self._validate_codebook_parameters(recipe_type, kwargs) - - def _get_expected_keys(self, recipe_type: RecipeType) -> set: - """Get expected parameter keys for a recipe type""" - common_keys = {"minimum_deployment_target", "compute_unit"} - - if recipe_type in [ - CoreMLRecipeType.TORCHAO_INT4_WEIGHT_ONLY_PER_GROUP, - CoreMLRecipeType.TORCHAO_INT8_WEIGHT_ONLY_PER_GROUP, - ]: - return common_keys | {"group_size", "filter_fn"} - elif recipe_type in [ - CoreMLRecipeType.TORCHAO_INT4_WEIGHT_ONLY_PER_CHANNEL, - CoreMLRecipeType.TORCHAO_INT8_WEIGHT_ONLY_PER_CHANNEL, - ]: - return common_keys | {"filter_fn"} - elif recipe_type == CoreMLRecipeType.CODEBOOK_WEIGHT_ONLY: - return common_keys | {"bits", "block_size", "filter_fn"} - else: - return common_keys - - def _validate_base_parameters(self, kwargs: Any) -> None: - """Validate minimum_deployment_target and compute_unit parameters""" if "minimum_deployment_target" in kwargs: minimum_deployment_target = kwargs["minimum_deployment_target"] if not isinstance(minimum_deployment_target, ct.target): raise ValueError( f"Parameter 'minimum_deployment_target' must be an enum of type ct.target, got {type(minimum_deployment_target)}" ) - if "compute_unit" in kwargs: compute_unit = kwargs["compute_unit"] if not isinstance(compute_unit, ct.ComputeUnit): raise ValueError( @@ -154,79 +85,12 @@ def _validate_base_parameters(self, kwargs: Any) -> None: f"Parameter 'compute_unit' must be an enum of type ct.ComputeUnit, got {type(compute_unit)}" ) - def _validate_group_size_parameter( - self, recipe_type: RecipeType, kwargs: Any - ) -> None: - """Validate group_size parameter for applicable recipe types""" - if ( - recipe_type - in [ - CoreMLRecipeType.TORCHAO_INT4_WEIGHT_ONLY_PER_GROUP, - CoreMLRecipeType.TORCHAO_INT8_WEIGHT_ONLY_PER_GROUP, - ] - and "group_size" in kwargs - ): - group_size = kwargs["group_size"] - if not isinstance(group_size, int): - raise ValueError( - f"Parameter 'group_size' must be an integer, got {type(group_size).__name__}: {group_size}" - ) - if group_size <= 0: - raise ValueError( - f"Parameter 'group_size' must be positive, got: {group_size}" - ) - - def _validate_codebook_parameters( - self, recipe_type: RecipeType, kwargs: Any - ) -> None: - """Validate bits and block_size parameters for codebook recipe type""" - if recipe_type != CoreMLRecipeType.CODEBOOK_WEIGHT_ONLY: - return - - # Both bits and block_size must be present - if not ("bits" in kwargs and "block_size" in kwargs): - raise ValueError( - "Parameters 'bits' and 'block_size' must be present for codebook recipes" - ) - - if "bits" in kwargs: - bits = kwargs["bits"] - if not isinstance(bits, int): - raise ValueError( - f"Parameter 'bits' must be an integer, got {type(bits).__name__}: {bits}" - ) - if not (1 <= bits <= 8): - raise ValueError( - f"Parameter 'bits' must be between 1 and 8, got: {bits}" - ) - 
- if "block_size" in kwargs: - block_size = kwargs["block_size"] - if not isinstance(block_size, list): - raise ValueError( - f"Parameter 'block_size' must be a list, got {type(block_size).__name__}: {block_size}" - ) - - def _validate_and_set_deployment_target( - self, kwargs: Any, min_target: ct.target, quantization_type: str - ) -> None: - """Validate or set minimum deployment target for quantization recipes""" - minimum_deployment_target = kwargs.get("minimum_deployment_target", None) - if minimum_deployment_target and minimum_deployment_target < min_target: - raise ValueError( - f"minimum_deployment_target must be {str(min_target)} or higher for {quantization_type} quantization" - ) - else: - # Default to the minimum target for this quantization type - kwargs["minimum_deployment_target"] = min_target - - def _build_fp_recipe( + def _build_recipe( self, recipe_type: RecipeType, precision: ct.precision, **kwargs: Any, ) -> ExportRecipe: - """Build FP32/FP16 recipe""" lowering_recipe = self._get_coreml_lowering_recipe( compute_precision=precision, **kwargs, @@ -234,142 +98,18 @@ def _build_fp_recipe( return ExportRecipe( name=recipe_type.value, - lowering_recipe=lowering_recipe, - ) - - def _build_pt2e_quantized_recipe( - self, - recipe_type: RecipeType, - activation_dtype: torch.dtype, - **kwargs: Any, - ) -> ExportRecipe: - """Build PT2E-based quantization recipe""" - from executorch.backends.apple.coreml.quantizer import CoreMLQuantizer - - self._validate_and_set_deployment_target(kwargs, ct.target.iOS17, "pt2e") - - # Validate activation_dtype - assert activation_dtype in [ - torch.quint8, - torch.float32, - ], f"activation_dtype must be torch.quint8 or torch.float32, got {activation_dtype}" - - # Create quantization config - config = ct.optimize.torch.quantization.LinearQuantizerConfig( - global_config=ct.optimize.torch.quantization.ModuleLinearQuantizerConfig( - quantization_scheme="symmetric", - activation_dtype=activation_dtype, - weight_dtype=torch.qint8, - weight_per_channel=True, - ) - ) - - quantizer = CoreMLQuantizer(config) - quantization_recipe = QuantizationRecipe(quantizers=[quantizer]) - - lowering_recipe = self._get_coreml_lowering_recipe(**kwargs) - - return ExportRecipe( - name=recipe_type.value, - quantization_recipe=quantization_recipe, - lowering_recipe=lowering_recipe, - ) - - def _build_torchao_quantized_recipe( - self, - recipe_type: RecipeType, - weight_dtype: torch.dtype, - is_per_channel: bool, - group_size: int = 32, - **kwargs: Any, - ) -> ExportRecipe: - """Build TorchAO-based quantization recipe""" - if is_per_channel: - weight_granularity = PerAxis(axis=0) - else: - weight_granularity = PerGroup(group_size=group_size) - - # Use user-provided filter_fn if provided - filter_fn = kwargs.get("filter_fn", None) - config = AOQuantizationConfig( - ao_base_config=IntxWeightOnlyConfig( - weight_dtype=weight_dtype, - granularity=weight_granularity, - ), - filter_fn=filter_fn, - ) - - quantization_recipe = QuantizationRecipe( - quantizers=None, - ao_quantization_configs=[config], - ) - - # override minimum_deployment_target to ios18 for torchao (GH issue #13122) - self._validate_and_set_deployment_target(kwargs, ct.target.iOS18, "torchao") - lowering_recipe = self._get_coreml_lowering_recipe(**kwargs) - - return ExportRecipe( - name=recipe_type.value, - quantization_recipe=quantization_recipe, - lowering_recipe=lowering_recipe, - ) - - def _build_codebook_quantized_recipe( - self, - recipe_type: RecipeType, - bits: int, - block_size: list, - **kwargs: Any, - ) -> 
ExportRecipe: - """Build codebook/palettization quantization recipe""" - from torchao.prototype.quantization.codebook_coreml import ( - CodebookWeightOnlyConfig, - ) - - self._validate_and_set_deployment_target(kwargs, ct.target.iOS18, "codebook") - - # Get the appropriate dtype (torch.uint1 through torch.uint8) - dtype = getattr(torch, f"uint{bits}") - - # Use user-provided filter_fn or default to Linear/Embedding layers - filter_fn = kwargs.get( - "filter_fn", - lambda m, fqn: ( - isinstance(m, torch.nn.Embedding) or isinstance(m, torch.nn.Linear) - ), - ) - - config = AOQuantizationConfig( - ao_base_config=CodebookWeightOnlyConfig( - dtype=dtype, - block_size=block_size, - ), - filter_fn=filter_fn, - ) - - quantization_recipe = QuantizationRecipe( - quantizers=None, - ao_quantization_configs=[config], - ) - - lowering_recipe = self._get_coreml_lowering_recipe(**kwargs) - - return ExportRecipe( - name=recipe_type.value, - quantization_recipe=quantization_recipe, + quantization_recipe=None, # TODO - add quantization recipe lowering_recipe=lowering_recipe, ) def _get_coreml_lowering_recipe( self, - compute_precision: ct.precision = ct.precision.FLOAT16, + compute_precision: ct.precision, **kwargs: Any, ) -> LoweringRecipe: - """Get CoreML lowering recipe with optional precision""" compile_specs = CoreMLBackend.generate_compile_specs( compute_precision=compute_precision, - compute_unit=kwargs.get("compute_unit", ct.ComputeUnit.ALL), - minimum_deployment_target=kwargs.get("minimum_deployment_target", None), + **kwargs, ) minimum_deployment_target = kwargs.get("minimum_deployment_target", None) diff --git a/backends/apple/coreml/recipes/coreml_recipe_types.py b/backends/apple/coreml/recipes/coreml_recipe_types.py index fc7292c3c58..77f808bd982 100644 --- a/backends/apple/coreml/recipes/coreml_recipe_types.py +++ b/backends/apple/coreml/recipes/coreml_recipe_types.py @@ -12,42 +12,14 @@ class CoreMLRecipeType(RecipeType): """CoreML-specific generic recipe types""" - ## All the recipes accept common kwargs - # 1. minimum_deployment_unit (default: None) - # 2. 
compute_unit (default: ct.ComputeUnit.ALL) - - # FP32 precision recipe, defaults to values published by the CoreML backend and partitioner + # FP32 generic recipe, defaults to values published by the CoreML backend and partitioner + # Precision = FP32, Default compute_unit = All (can be overridden by kwargs) FP32 = "coreml_fp32" - # FP16 precision recipe, defaults to values published by the CoreML backend and partitioner + # FP16 generic recipe, defaults to values published by the CoreML backend and partitioner + # Precision = FP16, Default compute_unit = All (can be overridden by kwargs) FP16 = "coreml_fp16" - ## PT2E-based quantization recipes - # INT8 Static Quantization (weights + activations), requires calibration dataset - PT2E_INT8_STATIC = "coreml_pt2e_int8_static" - # INT8 Weight-only Quantization (activations remain FP32) - PT2E_INT8_WEIGHT_ONLY = "coreml_pt2e_int8_weight_only" - - ## TorchAO-based quantization recipes - # All TorchAO recipes accept filter_fn kwarg to control which layers are quantized - # INT4 Weight-only Quantization, per-channel (axis=0) - # Additional kwargs: filter_fn (default: Embedding and linear layers) - TORCHAO_INT4_WEIGHT_ONLY_PER_CHANNEL = "coreml_torchao_int4_weight_only_per_channel" - # INT4 Weight-only Quantization, per-group - # Additional kwargs: group_size (default: 32), filter_fn (default: Embedding and linear layers) - TORCHAO_INT4_WEIGHT_ONLY_PER_GROUP = "coreml_torchao_int4_weight_only_per_group" - # INT8 Weight-only Quantization, per-channel (axis=0) - # Additional kwargs: filter_fn (default: Embedding and linear layers) - TORCHAO_INT8_WEIGHT_ONLY_PER_CHANNEL = "coreml_torchao_int8_weight_only_per_channel" - # INT8 Weight-only Quantization, per-group - # Additional kwargs: group_size (default: 32), filter_fn (default: Embedding and linear layers) - TORCHAO_INT8_WEIGHT_ONLY_PER_GROUP = "coreml_torchao_int8_weight_only_per_group" - - ## Codebook/Palettization Quantization - # Additional mandatory kwargs: bits (range: 1-8), block_size (list of ints), - # filter_fn (default: targets Linear and Embedding layers) - CODEBOOK_WEIGHT_ONLY = "coreml_codebook_weight_only" - @classmethod def get_backend_name(cls) -> str: return COREML_BACKEND diff --git a/backends/apple/coreml/test/test_coreml_recipes.py b/backends/apple/coreml/test/test_coreml_recipes.py index 9b395c44428..ca5c6c30c9c 100644 --- a/backends/apple/coreml/test/test_coreml_recipes.py +++ b/backends/apple/coreml/test/test_coreml_recipes.py @@ -4,10 +4,11 @@ import unittest +from typing import List import coremltools as ct -import torch +import torch from executorch.backends.apple.coreml.recipes import ( CoreMLRecipeProvider, CoreMLRecipeType, @@ -16,17 +17,19 @@ from executorch.backends.apple.coreml.test.test_coreml_utils import ( IS_VALID_TEST_RUNTIME, ) -from executorch.exir.schema import DelegateCall +from executorch.exir.schema import DelegateCall, Program from executorch.export import export, ExportRecipe, recipe_registry - -from export.types import StageType from torch import nn from torch.testing._internal.common_quantization import TestHelperModules -from torchao.quantization.utils import compute_error class TestCoreMLRecipes(unittest.TestCase): - """Test suite for CoreML recipes focusing on quantization functionality""" + fp32_recipes: List[CoreMLRecipeType] = [ + CoreMLRecipeType.FP32, + ] + fp16_recipes: List[CoreMLRecipeType] = [ + CoreMLRecipeType.FP16, + ] def setUp(self): torch._dynamo.reset() @@ -38,538 +41,198 @@ def setUp(self): def tearDown(self): super().tearDown() - def 
check_fully_delegated(self, session) -> None: - """Helper to verify a program is fully delegated to CoreML""" - session.print_delegation_info() - program = session.get_executorch_program() + def check_fully_delegated(self, program: Program) -> None: instructions = program.execution_plan[0].chains[0].instructions assert instructions is not None self.assertEqual(len(instructions), 1) self.assertIsInstance(instructions[0].instr_args, DelegateCall) - def _compare_eager_quantized_model_outputs(self, session, example_inputs, atol): - """Utility to compare eager quantized model output with session output after coreml lowering""" - if IS_VALID_TEST_RUNTIME: - source_transform_output = session.get_stage_artifacts()[ - StageType.SOURCE_TRANSFORM - ] - eager_quantized_model = source_transform_output.data["forward"] - output = session.run_method("forward", example_inputs[0])[0] - expected = eager_quantized_model(*example_inputs[0]) - self.assertTrue(torch.allclose(output, expected, atol=atol)) - - def _compare_eager_unquantized_model_outputs( - self, session, eager_unquantized_model, example_inputs, sqnr_threshold=20 - ): - """Utility to compare eager unquantized model output with session output using SQNR""" - if IS_VALID_TEST_RUNTIME: - quantized_output = session.run_method("forward", example_inputs[0])[0] - original_output = eager_unquantized_model(*example_inputs[0]) - error = compute_error(original_output, quantized_output) - print(f"SQNR: {error} dB") - self.assertTrue(error > sqnr_threshold) - - def test_fp32_recipe(self): - """Test FP32 recipe functionality""" - model = TestHelperModules.TwoLinearModule().eval() - example_inputs = [(torch.randn(9, 8),)] - - session = export( - model=model, - example_inputs=example_inputs, - export_recipe=ExportRecipe.get_recipe(CoreMLRecipeType.FP32), - ) - self.check_fully_delegated(session) - - self._compare_eager_quantized_model_outputs(session, example_inputs, atol=1e-3) - self._compare_eager_unquantized_model_outputs(session, model, example_inputs) - - def test_fp16_recipe(self): - """Test FP16 recipe functionality""" - model = TestHelperModules.TwoLinearModule().eval() - example_inputs = [(torch.randn(9, 8),)] - - session = export( - model=model, - example_inputs=example_inputs, - export_recipe=ExportRecipe.get_recipe(CoreMLRecipeType.FP16), - ) - self.check_fully_delegated(session) - - self._compare_eager_quantized_model_outputs(session, example_inputs, atol=1e-3) - self._compare_eager_unquantized_model_outputs(session, model, example_inputs) - - def test_fp_recipes_with_custom_parameters(self): - """Test FP recipes with custom deployment target and compute unit""" - test_cases = [ - (CoreMLRecipeType.FP32, {"minimum_deployment_target": ct.target.iOS16}), - (CoreMLRecipeType.FP16, {"compute_unit": ct.ComputeUnit.CPU_ONLY}), - ] - - model = TestHelperModules.TwoLinearModule().eval() - example_inputs = [(torch.randn(9, 8),)] + def test_all_fp32_recipes_with_simple_model(self): + """Test all FP32 recipes with a simple linear model""" + for recipe_type in self.fp32_recipes: + with self.subTest(recipe=recipe_type.value): + m_eager = TestHelperModules.TwoLinearModule().eval() + example_inputs = [(torch.randn(9, 8),)] - for recipe_type, kwargs in test_cases: - with self.subTest(recipe=recipe_type.value, kwargs=kwargs): session = export( - model=model, + model=m_eager, example_inputs=example_inputs, - export_recipe=ExportRecipe.get_recipe(recipe_type, **kwargs), - ) - self.check_fully_delegated(session) - - def test_int4_weight_only_per_channel(self): - """Test 
INT4 weight-only per-channel quantization""" - model = TestHelperModules.TwoLinearModule().eval() - example_inputs = [(torch.randn(9, 8),)] - - session = export( - model=model, - example_inputs=example_inputs, - export_recipe=ExportRecipe.get_recipe( - CoreMLRecipeType.TORCHAO_INT4_WEIGHT_ONLY_PER_CHANNEL - ), - ) - self.check_fully_delegated(session) - self._compare_eager_quantized_model_outputs(session, example_inputs, atol=1e-02) - self._compare_eager_unquantized_model_outputs(session, model, example_inputs) + export_recipe=ExportRecipe.get_recipe(recipe_type), + ) + self.check_fully_delegated(session.get_executorch_program()) + + # Verify outputs match + if IS_VALID_TEST_RUNTIME: + self.assertTrue( + torch.allclose( + session.run_method("forward", example_inputs[0])[0], + m_eager(*example_inputs[0]), + atol=1e-3, + ) + ) - def test_int4_weight_only_per_group(self): - """Test INT4 weight-only per-group quantization with different group sizes""" + def test_all_fp16_recipes_with_simple_model(self): + """Test all FP16 recipes with a simple linear model""" - class CustomTwoLinearModel(nn.Module): - def __init__(self): - super().__init__() - self.layer1 = nn.Linear(32, 32) - self.layer2 = nn.Linear(32, 8) - - def forward(self, x): - x = torch.relu(self.layer1(x)) - x = self.layer2(x) - return x + for recipe_type in self.fp16_recipes: + with self.subTest(recipe=recipe_type.value): + m_eager = TestHelperModules.TwoLinearModule().eval() + example_inputs = [(torch.randn(9, 8),)] - model = CustomTwoLinearModel().eval() - example_inputs = [(torch.randn(1, 32),)] - # Test with different group sizes - for group_size in [8, 16, 32]: - with self.subTest(group_size=group_size): session = export( - model=model, + model=m_eager, example_inputs=example_inputs, - export_recipe=ExportRecipe.get_recipe( - CoreMLRecipeType.TORCHAO_INT4_WEIGHT_ONLY_PER_GROUP, - group_size=group_size, - ), - ) - self.check_fully_delegated(session) - - self._compare_eager_quantized_model_outputs( - session, example_inputs, atol=1e-3 - ) - self._compare_eager_unquantized_model_outputs( - session, model, example_inputs + export_recipe=ExportRecipe.get_recipe(recipe_type), ) - def test_int4_weight_only_per_group_validation(self): - """Test INT4 per-group parameter validation""" - # Test invalid group size type - with self.assertRaises(ValueError) as cm: - self.provider.create_recipe( - CoreMLRecipeType.TORCHAO_INT4_WEIGHT_ONLY_PER_GROUP, group_size="32" - ) - self.assertIn("must be an integer", str(cm.exception)) + self.check_fully_delegated(session.get_executorch_program()) - # Test negative group size - with self.assertRaises(ValueError) as cm: - self.provider.create_recipe( - CoreMLRecipeType.TORCHAO_INT4_WEIGHT_ONLY_PER_GROUP, group_size=-1 - ) - self.assertIn("must be positive", str(cm.exception)) - - # Test unexpected parameter - with self.assertRaises(ValueError) as cm: - self.provider.create_recipe( - CoreMLRecipeType.TORCHAO_INT4_WEIGHT_ONLY_PER_CHANNEL, - group_size=32, # group_size not valid for per-channel - ) - self.assertIn("unexpected parameters", str(cm.exception)) - - def test_int8_weight_only_per_channel(self): - """Test INT8 weight-only per-channel quantization""" - model = TestHelperModules.TwoLinearModule().eval() - example_inputs = [(torch.randn(9, 8),)] - - session = export( - model=model, - example_inputs=example_inputs, - export_recipe=ExportRecipe.get_recipe( - CoreMLRecipeType.TORCHAO_INT8_WEIGHT_ONLY_PER_CHANNEL - ), - ) - self.check_fully_delegated(session) - - 
self._compare_eager_quantized_model_outputs(session, example_inputs, atol=1e-2) - self._compare_eager_unquantized_model_outputs(session, model, example_inputs) + # Verify outputs match (slightly higher tolerance for FP16) + if IS_VALID_TEST_RUNTIME: + self.assertTrue( + torch.allclose( + session.run_method("forward", example_inputs[0])[0], + m_eager(*example_inputs[0]), + atol=1e-3, + ) + ) - def test_int8_weight_only_per_group(self): - """Test INT8 weight-only per-group quantization with different group sizes""" + def test_custom_simple_model(self): + """Test with a custom simple model""" - class SimpleLinearModel(nn.Module): + class CustomTestModel(nn.Module): def __init__(self): super().__init__() - self.layer = nn.Linear(64, 2) + self.linear1 = nn.Linear(10, 20) + self.relu = nn.ReLU() + self.linear2 = nn.Linear(20, 1) def forward(self, x): - return self.layer(x) - - model = SimpleLinearModel().eval() - example_inputs = [(torch.randn(1, 64),)] + x = self.linear1(x) + x = self.relu(x) + x = self.linear2(x) + return x - # Test with different group sizes - for group_size in [16, 32, 64]: - with self.subTest(group_size=group_size): + model = CustomTestModel().eval() + example_inputs = [(torch.randn(1, 10),)] + for recipe_type in self.fp32_recipes + self.fp16_recipes: + with self.subTest(recipe=recipe_type.value): session = export( model=model, example_inputs=example_inputs, - export_recipe=ExportRecipe.get_recipe( - CoreMLRecipeType.TORCHAO_INT8_WEIGHT_ONLY_PER_GROUP, - group_size=group_size, - ), - ) - self.check_fully_delegated(session) + export_recipe=ExportRecipe.get_recipe(recipe_type), + ) + session.print_delegation_info() + self.check_fully_delegated(session.get_executorch_program()) + + if IS_VALID_TEST_RUNTIME: + self.assertTrue( + torch.allclose( + session.run_method("forward", example_inputs[0])[0], + model(*example_inputs[0]), + atol=1e-3, + ) + ) - self._compare_eager_quantized_model_outputs( - session, example_inputs, atol=1e-2 - ) - self._compare_eager_unquantized_model_outputs( - session, model, example_inputs - ) + def test_unsupported_recipe_type(self): + """Test that unsupported recipe types return None""" + from executorch.export import RecipeType - def test_codebook_weight_only_recipe(self): - """Test codebook quantization recipe""" + class UnsupportedRecipeType(RecipeType): + UNSUPPORTED = "unsupported" - class SimpleLinearModel(nn.Module): - def __init__(self): - super().__init__() - self.layer = nn.Linear(32, 2) + @classmethod + def get_backend_name(cls) -> str: + return "dummy" - def forward(self, x): - return self.layer(x) + recipe = self.provider.create_recipe(UnsupportedRecipeType.UNSUPPORTED) + self.assertIsNone(recipe) - model = SimpleLinearModel().eval() - example_inputs = [(torch.randn(1, 32),)] + def test_recipe_registry_integration(self): + """Test that recipes work with the global recipe registry""" + for recipe_type in self.fp32_recipes + self.fp16_recipes: + with self.subTest(recipe=recipe_type.value): + recipe = ExportRecipe.get_recipe(recipe_type) + self.assertIsNotNone(recipe) + self.assertEqual(recipe.name, recipe_type.value) - # Test different block sizes - test_cases = [ - {"bits": 3, "block_size": [-1, 8]}, - ] + def test_invalid_recipe_kwargs(self): + """Test detailed error messages for invalid kwargs""" + provider = CoreMLRecipeProvider() - for kwargs in test_cases: - with self.subTest(kwargs=kwargs): - session = export( - model=model, - example_inputs=example_inputs, - export_recipe=ExportRecipe.get_recipe( - 
CoreMLRecipeType.CODEBOOK_WEIGHT_ONLY, **kwargs - ), - ) - self.check_fully_delegated(session) - - def test_codebook_parameter_validation(self): - """Test codebook parameter validation""" - # Test invalid bits type + # Test single invalid parameter with self.assertRaises(ValueError) as cm: - self.provider.create_recipe( - CoreMLRecipeType.CODEBOOK_WEIGHT_ONLY, bits="3", block_size=[-1, 8] - ) - self.assertIn("must be an integer", str(cm.exception)) + provider.create_recipe(CoreMLRecipeType.FP16, invalid_param=123) - # Test bits out of range - with self.assertRaises(ValueError) as cm: - self.provider.create_recipe( - CoreMLRecipeType.CODEBOOK_WEIGHT_ONLY, bits=0, block_size=[-1, 8] - ) - self.assertIn("must be between 1 and 8", str(cm.exception)) + error_msg = str(cm.exception) + self.assertIn("Unexpected parameters", error_msg) + # Test multiple invalid parameters with self.assertRaises(ValueError) as cm: - self.provider.create_recipe( - CoreMLRecipeType.CODEBOOK_WEIGHT_ONLY, bits=9, block_size=[-1, 8] + provider.create_recipe( + CoreMLRecipeType.FP32, param1="value1", param2="value2" ) - self.assertIn("must be between 1 and 8", str(cm.exception)) - # Test invalid block_size type + error_msg = str(cm.exception) + self.assertIn("Unexpected parameters", error_msg) + + # Test mix of valid and invalid parameters with self.assertRaises(ValueError) as cm: - self.provider.create_recipe( - CoreMLRecipeType.CODEBOOK_WEIGHT_ONLY, bits=3, block_size="[-1, 16]" + provider.create_recipe( + CoreMLRecipeType.FP32, + minimum_deployment_target=ct.target.iOS16, # valid + invalid_param="invalid", # invalid ) - self.assertIn("must be a list", str(cm.exception)) - - def test_int8_static_quantization(self): - """Test INT8 static quantization (weights + activations)""" - - class SimpleLinearModel(nn.Module): - def __init__(self): - super().__init__() - self.layer1 = nn.Linear(32, 16) - self.layer2 = nn.Linear(16, 2) - - def forward(self, x): - x = torch.relu(self.layer1(x)) - x = self.layer2(x) - return x - model = SimpleLinearModel().eval() - example_inputs = [(torch.randn(1, 32),)] + error_msg = str(cm.exception) + self.assertIn("Unexpected parameters", error_msg) - recipe = ExportRecipe.get_recipe( - CoreMLRecipeType.PT2E_INT8_STATIC, minimum_deployment_target=ct.target.iOS17 + def test_valid_kwargs(self): + """Test valid kwargs""" + recipe = self.provider.create_recipe( + CoreMLRecipeType.FP32, + minimum_deployment_target=ct.target.iOS16, + compute_unit=ct.ComputeUnit.CPU_AND_GPU, ) + self.assertIsNotNone(recipe) + self.assertEqual(recipe.name, "coreml_fp32") - session = export( - model=model, - example_inputs=example_inputs, - export_recipe=recipe, - ) - self.check_fully_delegated(session) - - self._compare_eager_quantized_model_outputs(session, example_inputs, atol=1e-3) - self._compare_eager_unquantized_model_outputs(session, model, example_inputs) - - def test_int8_weight_only_pt2e(self): - """Test PT2E-based INT8 weight-only quantization""" - model = TestHelperModules.TwoLinearModule().eval() - example_inputs = [(torch.randn(9, 8),)] - - session = export( - model=model, - example_inputs=example_inputs, - export_recipe=ExportRecipe.get_recipe( - CoreMLRecipeType.PT2E_INT8_WEIGHT_ONLY - ), - ) - self.check_fully_delegated(session) + # Verify partitioners are properly configured + partitioners = recipe.lowering_recipe.partitioners + self.assertEqual(len(partitioners), 1, "Expected exactly one partitioner") - self._compare_eager_quantized_model_outputs(session, example_inputs, atol=1e-2) - 
self._compare_eager_unquantized_model_outputs(session, model, example_inputs) + # Verify delegation spec and compile specs + delegation_spec = partitioners[0].delegation_spec + self.assertIsNotNone(delegation_spec, "Delegation spec should not be None") - def test_int8_weight_only_pt2e_with_conv(self): - """Test PT2E-based INT8 weight-only quantization with convolution layers""" + compile_specs = delegation_spec.compile_specs + self.assertIsNotNone(compile_specs, "Compile specs should not be None") - class ConvModel(nn.Module): - def __init__(self): - super().__init__() - self.conv1 = nn.Conv2d(3, 16, 3, padding=1) - self.conv2 = nn.Conv2d(16, 32, 3, padding=1) - self.pool = nn.AdaptiveAvgPool2d((1, 1)) - self.fc = nn.Linear(32, 10) + spec_dict = {spec.key: spec.value for spec in compile_specs} - def forward(self, x): - x = torch.relu(self.conv1(x)) - x = torch.relu(self.conv2(x)) - x = self.pool(x) - x = x.view(x.size(0), -1) - x = self.fc(x) - return x - - model = ConvModel().eval() - example_inputs = [(torch.randn(1, 3, 32, 32),)] - - session = export( - model=model, - example_inputs=example_inputs, - export_recipe=ExportRecipe.get_recipe( - CoreMLRecipeType.PT2E_INT8_WEIGHT_ONLY - ), + # Assert that all expected specs are present with correct values + self.assertIn( + "min_deployment_target", + spec_dict, + "minimum_deployment_target should be in compile specs", + ) + min_target_value = spec_dict["min_deployment_target"] + if isinstance(min_target_value, bytes): + min_target_value = min_target_value.decode("utf-8") + self.assertEqual( + str(min_target_value), + str(ct.target.iOS16.value), + "minimum_deployment_target should match the provided value", ) - self.check_fully_delegated(session) - - self._compare_eager_quantized_model_outputs(session, example_inputs, atol=1e-2) - self._compare_eager_unquantized_model_outputs(session, model, example_inputs) - - def test_pt2e_recipes_parameter_rejection(self): - """Test that PT2E recipes reject TorchAO-specific parameters""" - # PT2E recipes should reject TorchAO-specific parameters - pt2e_recipes = [ - CoreMLRecipeType.PT2E_INT8_STATIC, - CoreMLRecipeType.PT2E_INT8_WEIGHT_ONLY, - ] - torchao_params = ["filter_fn", "group_size", "bits", "block_size"] - - for recipe_type in pt2e_recipes: - for param in torchao_params: - with self.subTest(recipe=recipe_type.value, param=param): - kwargs = {param: "dummy_value"} - with self.assertRaises(ValueError) as cm: - self.provider.create_recipe(recipe_type, **kwargs) - self.assertIn("unexpected parameters", str(cm.exception).lower()) - - def test_filter_fn_comprehensive(self): - """Comprehensive test for filter_fn parameter functionality""" - - def custom_filter(module, fqn): - return isinstance(module, nn.Linear) and "target" in fqn - - # Test 1: TorchAO recipes accept filter_fn and default to None - torchao_recipes = [ - CoreMLRecipeType.TORCHAO_INT4_WEIGHT_ONLY_PER_CHANNEL, - CoreMLRecipeType.TORCHAO_INT4_WEIGHT_ONLY_PER_GROUP, - CoreMLRecipeType.TORCHAO_INT8_WEIGHT_ONLY_PER_CHANNEL, - CoreMLRecipeType.TORCHAO_INT8_WEIGHT_ONLY_PER_GROUP, - ] - - for recipe_type in torchao_recipes: - with self.subTest(f"{recipe_type.value}_default"): - # Test default behavior (None) - recipe = self.provider.create_recipe(recipe_type) - config = recipe.quantization_recipe.ao_quantization_configs[0] - self.assertIsNone(config.filter_fn) - - with self.subTest(f"{recipe_type.value}_custom"): - # Test custom filter_fn - recipe = self.provider.create_recipe( - recipe_type, filter_fn=custom_filter - ) - config = 
recipe.quantization_recipe.ao_quantization_configs[0] - self.assertEqual(config.filter_fn, custom_filter) - - # Test 2: Codebook recipe accepts filter_fn and has sensible default - with self.subTest("codebook_default"): - recipe = self.provider.create_recipe( - CoreMLRecipeType.CODEBOOK_WEIGHT_ONLY, bits=3, block_size=[-1, 16] - ) - config = recipe.quantization_recipe.ao_quantization_configs[0] - self.assertIsNotNone(config.filter_fn) - - # Test default filter targets Linear and Embedding layers - linear_module = nn.Linear(10, 5) - embedding_module = nn.Embedding(100, 10) - conv_module = nn.Conv2d(3, 16, 3) - - self.assertTrue(config.filter_fn(linear_module, "linear")) - self.assertTrue(config.filter_fn(embedding_module, "embedding")) - self.assertFalse(config.filter_fn(conv_module, "conv")) - - with self.subTest("codebook_custom"): - recipe = self.provider.create_recipe( - CoreMLRecipeType.CODEBOOK_WEIGHT_ONLY, - filter_fn=custom_filter, - bits=3, - block_size=[-1, 16], - ) - config = recipe.quantization_recipe.ao_quantization_configs[0] - self.assertEqual(config.filter_fn, custom_filter) - - def test_quantization_recipe_structure(self): - """Test that quantization recipes have proper structure""" - quantization_recipes = [ - CoreMLRecipeType.TORCHAO_INT4_WEIGHT_ONLY_PER_CHANNEL, - CoreMLRecipeType.TORCHAO_INT4_WEIGHT_ONLY_PER_GROUP, - CoreMLRecipeType.TORCHAO_INT8_WEIGHT_ONLY_PER_CHANNEL, - CoreMLRecipeType.TORCHAO_INT8_WEIGHT_ONLY_PER_GROUP, - CoreMLRecipeType.CODEBOOK_WEIGHT_ONLY, - ] - - for recipe_type in quantization_recipes: - with self.subTest(recipe=recipe_type.value): - kwargs = ( - {"bits": 3, "block_size": [-1, 16]} - if recipe_type == CoreMLRecipeType.CODEBOOK_WEIGHT_ONLY - else {} - ) - recipe = self.provider.create_recipe(recipe_type, **kwargs) - self.assertIsNotNone(recipe) - - # Should have quantization recipe with ao_quantization_configs - self.assertIsNotNone(recipe.quantization_recipe) - self.assertIsNotNone(recipe.quantization_recipe.ao_quantization_configs) - self.assertEqual( - len(recipe.quantization_recipe.ao_quantization_configs), 1 - ) - - # Should have lowering recipe - self.assertIsNotNone(recipe.lowering_recipe) - self.assertIsNotNone(recipe.lowering_recipe.partitioners) - - def test_recipe_creation_with_defaults(self): - """Test that recipes work with default parameters""" - # Test that all recipes can be created without explicit parameters - all_recipes = [ - CoreMLRecipeType.FP32, - CoreMLRecipeType.FP16, - CoreMLRecipeType.TORCHAO_INT4_WEIGHT_ONLY_PER_CHANNEL, - CoreMLRecipeType.TORCHAO_INT4_WEIGHT_ONLY_PER_GROUP, # should use default group_size=32 - CoreMLRecipeType.TORCHAO_INT8_WEIGHT_ONLY_PER_CHANNEL, - CoreMLRecipeType.TORCHAO_INT8_WEIGHT_ONLY_PER_GROUP, # should use default group_size=32 - CoreMLRecipeType.CODEBOOK_WEIGHT_ONLY, # should use default bits=3, block_size=[-1,16] - ] - - for recipe_type in all_recipes: - with self.subTest(recipe=recipe_type.value): - kwargs = ( - {"bits": 3, "block_size": [-1, 16]} - if recipe_type == CoreMLRecipeType.CODEBOOK_WEIGHT_ONLY - else {} - ) - recipe = self.provider.create_recipe(recipe_type, **kwargs) - self.assertIsNotNone(recipe) - self.assertEqual(recipe.name, recipe_type.value) - - def test_minimum_deployment_target_validation(self): - """Test that minimum_deployment_target validation works correctly for quantization recipes""" - test_cases = [ - (CoreMLRecipeType.PT2E_INT8_STATIC, ct.target.iOS17, {}), - (CoreMLRecipeType.PT2E_INT8_WEIGHT_ONLY, ct.target.iOS17, {}), - ( - 
CoreMLRecipeType.TORCHAO_INT4_WEIGHT_ONLY_PER_CHANNEL, - ct.target.iOS18, - {}, - ), - (CoreMLRecipeType.TORCHAO_INT4_WEIGHT_ONLY_PER_GROUP, ct.target.iOS18, {}), - ( - CoreMLRecipeType.TORCHAO_INT8_WEIGHT_ONLY_PER_CHANNEL, - ct.target.iOS18, - {}, - ), - (CoreMLRecipeType.TORCHAO_INT8_WEIGHT_ONLY_PER_GROUP, ct.target.iOS18, {}), - ( - CoreMLRecipeType.CODEBOOK_WEIGHT_ONLY, - ct.target.iOS18, - {"bits": 3, "block_size": [-1, 16]}, - ), - ] - - for recipe_type, min_target, kwargs in test_cases: - with self.subTest(recipe=recipe_type.value): - - # Test 1: Providing deployment target below minimum should raise ValueError - too_low_target = ct.target.iOS15 - with self.assertRaises(ValueError) as cm: - self.provider.create_recipe( - recipe_type, minimum_deployment_target=too_low_target, **kwargs - ) - error_msg = str(cm.exception) - self.assertIn( - f"minimum_deployment_target must be {str(min_target)} or higher", - error_msg, - ) - - # Test 2: Providing valid deployment target should work - valid_recipe = self.provider.create_recipe( - recipe_type, minimum_deployment_target=min_target, **kwargs - ) - self.assertIsNotNone(valid_recipe) - - # Test 3: Not providing deployment target should default to minimum - default_recipe = self.provider.create_recipe(recipe_type, **kwargs) - self.assertIsNotNone(default_recipe) - # Test 4: Providing deployment target higher than minimum should work - higher_target = ( - ct.target.iOS18 - if min_target == ct.target.iOS17 - else ct.target.iOS18 - ) - higher_recipe = self.provider.create_recipe( - recipe_type, minimum_deployment_target=higher_target, **kwargs - ) - self.assertIsNotNone(higher_recipe) + self.assertIn( + "compute_units", spec_dict, "compute_unit should be in compile specs" + ) + compute_unit_value = spec_dict["compute_units"] + if isinstance(compute_unit_value, bytes): + compute_unit_value = compute_unit_value.decode("utf-8") + self.assertEqual( + str(compute_unit_value), + ct.ComputeUnit.CPU_AND_GPU.name.lower(), + "compute_unit should match the provided value", + ) diff --git a/backends/xnnpack/recipes/xnnpack_recipe_provider.py b/backends/xnnpack/recipes/xnnpack_recipe_provider.py index 436eb2db158..8fba58c12c3 100644 --- a/backends/xnnpack/recipes/xnnpack_recipe_provider.py +++ b/backends/xnnpack/recipes/xnnpack_recipe_provider.py @@ -25,7 +25,6 @@ get_xnnpack_executorch_backend_config, ) from executorch.export import ( - AOQuantizationConfig, BackendRecipeProvider, ExportRecipe, LoweringRecipe, @@ -58,37 +57,31 @@ def create_recipe( if recipe_type == XNNPackRecipeType.FP32: return self._build_fp32_recipe(recipe_type) - elif recipe_type == XNNPackRecipeType.PT2E_INT8_DYNAMIC_PER_CHANNEL: + elif recipe_type == XNNPackRecipeType.INT8_DYNAMIC_PER_CHANNEL: return self._build_quantized_recipe( recipe_type, is_per_channel=True, is_dynamic=True ) - elif recipe_type == XNNPackRecipeType.PT2E_INT8_STATIC_PER_CHANNEL: + elif recipe_type == XNNPackRecipeType.INT8_STATIC_PER_CHANNEL: return self._build_quantized_recipe( recipe_type, is_per_channel=True, is_dynamic=False ) - elif recipe_type == XNNPackRecipeType.PT2E_INT8_STATIC_PER_TENSOR: + elif recipe_type == XNNPackRecipeType.INT8_STATIC_PER_TENSOR: return self._build_quantized_recipe( recipe_type, is_per_channel=False, is_dynamic=False ) - elif ( - recipe_type - == XNNPackRecipeType.TORCHAO_INT8_DYNAMIC_ACT_INT4_WEIGHT_PER_CHANNEL - ): - return self._build_torchao_quantized_recipe( + elif recipe_type == XNNPackRecipeType.INT8_DYNAMIC_ACT_INT4_WEIGHT_PER_CHANNEL: + return 
self._build_int8da_intx_weight_recipe( recipe_type=recipe_type, is_per_channel=True, weight_dtype=torch.int4, ) - elif ( - recipe_type - == XNNPackRecipeType.TORCHAO_INT8_DYNAMIC_ACT_INT4_WEIGHT_PER_TENSOR - ): + elif recipe_type == XNNPackRecipeType.INT8_DYNAMIC_ACT_INT4_WEIGHT_PER_TENSOR: group_size = kwargs.get("group_size", 32) - return self._build_torchao_quantized_recipe( + return self._build_int8da_intx_weight_recipe( recipe_type=recipe_type, is_per_channel=False, weight_dtype=torch.int4, @@ -139,7 +132,7 @@ def _build_quantized_recipe( executorch_backend_config=get_xnnpack_executorch_backend_config(), ) - def _build_torchao_quantized_recipe( + def _build_int8da_intx_weight_recipe( self, recipe_type: RecipeType, is_per_channel: bool = True, @@ -148,21 +141,17 @@ def _build_int8da_intx_weight_recipe( ) -> ExportRecipe: if is_per_channel: weight_granularity = PerAxis(axis=0) - assert weight_dtype == torch.int4 or weight_dtype == torch.int8 else: weight_granularity = PerGroup(group_size=group_size) - assert weight_dtype == torch.int4 - config = AOQuantizationConfig( - Int8DynamicActivationIntxWeightConfig( - weight_dtype=weight_dtype, - weight_granularity=weight_granularity, - ) + config = Int8DynamicActivationIntxWeightConfig( + weight_dtype=weight_dtype, + weight_granularity=weight_granularity, ) quant_recipe = QuantizationRecipe( quantizers=None, - ao_quantization_configs=[config], + ao_base_config=[config], ) return ExportRecipe( @@ -173,10 +162,7 @@ def _build_int8da_intx_weight_recipe( ) def _validate_recipe_kwargs(self, recipe_type: RecipeType, **kwargs: Any) -> None: - if ( - recipe_type - == XNNPackRecipeType.TORCHAO_INT8_DYNAMIC_ACT_INT4_WEIGHT_PER_TENSOR - ): + if recipe_type == XNNPackRecipeType.INT8_DYNAMIC_ACT_INT4_WEIGHT_PER_TENSOR: expected_keys = {"group_size"} unexpected = set(kwargs.keys()) - expected_keys if unexpected: raise ValueError( diff --git a/backends/xnnpack/recipes/xnnpack_recipe_types.py b/backends/xnnpack/recipes/xnnpack_recipe_types.py index 61117b94502..5675c3a5ffa 100644 --- a/backends/xnnpack/recipes/xnnpack_recipe_types.py +++ b/backends/xnnpack/recipes/xnnpack_recipe_types.py @@ -13,22 +13,19 @@ class XNNPackRecipeType(RecipeType): """XNNPACK-specific recipe types""" FP32 = "fp32" - - ## PT2E-based quantization recipes # INT8 Dynamic Quantization - PT2E_INT8_DYNAMIC_PER_CHANNEL = "pt2e_int8_dynamic_per_channel" - # INT8 Static Quantization, needs calibration dataset - PT2E_INT8_STATIC_PER_CHANNEL = "pt2e_int8_static_per_channel" - PT2E_INT8_STATIC_PER_TENSOR = "pt2e_int8_static_per_tensor" - - ## TorchAO-based quantization recipes + INT8_DYNAMIC_PER_CHANNEL = "int8_dynamic_per_channel" # INT8 Dynamic Activations INT4 Weight Quantization, Axis = 0 - TORCHAO_INT8_DYNAMIC_ACT_INT4_WEIGHT_PER_CHANNEL = ( - "torchao_int8da_int4w_per_channel" - ) + INT8_DYNAMIC_ACT_INT4_WEIGHT_PER_CHANNEL = "int8da_int4w_per_channel" # INT8 Dynamic Activations INT4 Weight Quantization, default group_size = 32 # can be overriden by group_size kwarg - TORCHAO_INT8_DYNAMIC_ACT_INT4_WEIGHT_PER_TENSOR = "torchao_int8da_int4w_per_tensor" + INT8_DYNAMIC_ACT_INT4_WEIGHT_PER_TENSOR = "int8da_int4w_per_tensor" + # INT8 Static Activations INT4 Weight Quantization + INT8_STATIC_ACT_INT4_WEIGHT_PER_CHANNEL = "int8a_int4w_per_channel" + INT8_STATIC_ACT_INT4_WEIGHT_PER_TENSOR = "int8a_int4w_per_tensor" + # INT8 Static Quantization, needs calibration dataset + INT8_STATIC_PER_CHANNEL = "int8_static_per_channel" + INT8_STATIC_PER_TENSOR = "int8_static_per_tensor" @classmethod def get_backend_name(cls) -> 
str: diff --git a/backends/xnnpack/test/recipes/test_xnnpack_recipes.py b/backends/xnnpack/test/recipes/test_xnnpack_recipes.py index 4ccbbc6f36d..679743e42d3 100644 --- a/backends/xnnpack/test/recipes/test_xnnpack_recipes.py +++ b/backends/xnnpack/test/recipes/test_xnnpack_recipes.py @@ -19,10 +19,8 @@ from executorch.examples.xnnpack import MODEL_NAME_TO_OPTIONS, QuantType from executorch.exir.schema import DelegateCall, Program from executorch.export import export, ExportRecipe, recipe_registry -from export.types import StageType from torch import nn from torch.testing._internal.common_quantization import TestHelperModules -from torchao.quantization.utils import compute_error class TestXnnpackRecipes(unittest.TestCase): @@ -40,29 +38,6 @@ def check_fully_delegated(self, program: Program) -> None: self.assertEqual(len(instructions), 1) self.assertIsInstance(instructions[0].instr_args, DelegateCall) - # pyre-ignore - def _compare_eager_quantized_model_outputs( - self, session, example_inputs, atol: float - ) -> None: - """Utility to compare eager quantized model output with session output after xnnpack lowering""" - torch_export_stage_output = session.get_stage_artifacts()[ - StageType.TORCH_EXPORT - ] - eager_quantized_model = torch_export_stage_output.data["forward"].module() - output = session.run_method("forward", example_inputs[0])[0] - expected = eager_quantized_model(*example_inputs[0]) - Tester._assert_outputs_equal(output, expected, atol=atol) - - def _compare_eager_unquantized_model_outputs( - self, session, eager_unquantized_model, example_inputs, sqnr_threshold=20 - ): - """Utility to compare eager unquantized model output with session output using SQNR""" - quantized_output = session.run_method("forward", example_inputs[0])[0] - original_output = eager_unquantized_model(*example_inputs[0]) - error = compute_error(original_output, quantized_output) - print(f"{self._testMethodName} - SQNR: {error} dB") - self.assertTrue(error > sqnr_threshold) - def test_basic_recipe(self) -> None: m_eager = TestHelperModules.TwoLinearModule().eval() example_inputs = [(torch.randn(9, 8),)] @@ -71,13 +46,18 @@ def test_basic_recipe(self) -> None: example_inputs=example_inputs, export_recipe=ExportRecipe.get_recipe(XNNPackRecipeType.FP32), ) - self._compare_eager_quantized_model_outputs(session, example_inputs, 1e-3) + self.assertTrue( + torch.allclose( + session.run_method("forward", example_inputs[0])[0], + m_eager(*example_inputs[0]), + atol=1e-3, + ) + ) self.check_fully_delegated(session.get_executorch_program()) - self._compare_eager_unquantized_model_outputs(session, m_eager, example_inputs) def test_int8_dynamic_quant_recipe(self) -> None: test_cases = [ - ExportRecipe.get_recipe(XNNPackRecipeType.PT2E_INT8_DYNAMIC_PER_CHANNEL), + ExportRecipe.get_recipe(XNNPackRecipeType.INT8_DYNAMIC_PER_CHANNEL), ] for export_recipe in test_cases: @@ -90,18 +70,19 @@ def test_int8_dynamic_quant_recipe(self) -> None: example_inputs=example_inputs, export_recipe=export_recipe, ) - self._compare_eager_quantized_model_outputs( - session, example_inputs, 1e-1 + self.assertTrue( + torch.allclose( + session.run_method("forward", example_inputs[0])[0], + m_eager(*example_inputs[0]), + atol=1e-1, + ) ) self.check_fully_delegated(session.get_executorch_program()) - self._compare_eager_unquantized_model_outputs( - session, m_eager, example_inputs - ) def test_int8_static_quant_recipe(self) -> None: test_cases = [ - ExportRecipe.get_recipe(XNNPackRecipeType.PT2E_INT8_STATIC_PER_CHANNEL), - 
ExportRecipe.get_recipe(XNNPackRecipeType.PT2E_INT8_STATIC_PER_TENSOR), + ExportRecipe.get_recipe(XNNPackRecipeType.INT8_STATIC_PER_CHANNEL), + ExportRecipe.get_recipe(XNNPackRecipeType.INT8_STATIC_PER_TENSOR), ] for export_recipe in test_cases: @@ -114,13 +95,14 @@ def test_int8_static_quant_recipe(self) -> None: example_inputs=example_inputs, export_recipe=export_recipe, ) - self._compare_eager_quantized_model_outputs( - session, example_inputs, 1e-2 + self.assertTrue( + torch.allclose( + session.run_method("forward", example_inputs[0])[0], + m_eager(*example_inputs[0]), + atol=1e-1, + ) ) self.check_fully_delegated(session.get_executorch_program()) - self._compare_eager_unquantized_model_outputs( - session, m_eager, example_inputs - ) def test_8a4w_recipe(self) -> None: class SimpleLinearModel(nn.Module): @@ -134,10 +116,10 @@ def forward(self, x) -> torch.Tensor: test_cases = [ ExportRecipe.get_recipe( - XNNPackRecipeType.TORCHAO_INT8_DYNAMIC_ACT_INT4_WEIGHT_PER_CHANNEL, + XNNPackRecipeType.INT8_DYNAMIC_ACT_INT4_WEIGHT_PER_CHANNEL, ), ExportRecipe.get_recipe( - XNNPackRecipeType.TORCHAO_INT8_DYNAMIC_ACT_INT4_WEIGHT_PER_TENSOR, + XNNPackRecipeType.INT8_DYNAMIC_ACT_INT4_WEIGHT_PER_TENSOR, group_size=32, ), ] @@ -151,22 +133,23 @@ def forward(self, x) -> torch.Tensor: example_inputs=example_inputs, export_recipe=export_recipe, ) - self.check_fully_delegated(session.get_executorch_program()) - self._compare_eager_quantized_model_outputs( - session, example_inputs, 1e-3 - ) - self._compare_eager_unquantized_model_outputs( - session, model, example_inputs, sqnr_threshold=15 + self.assertTrue( + torch.allclose( + session.run_method("forward", example_inputs[0])[0], + model(*example_inputs[0]), + atol=1e-2, + ) ) + self.check_fully_delegated(session.get_executorch_program()) def _get_recipe_for_quant_type(self, quant_type: QuantType) -> XNNPackRecipeType: # Map QuantType to corresponding recipe name. 
if quant_type == QuantType.STATIC_PER_CHANNEL: - return XNNPackRecipeType.PT2E_INT8_STATIC_PER_CHANNEL + return XNNPackRecipeType.INT8_STATIC_PER_CHANNEL elif quant_type == QuantType.DYNAMIC_PER_CHANNEL: - return XNNPackRecipeType.PT2E_INT8_DYNAMIC_PER_CHANNEL + return XNNPackRecipeType.INT8_DYNAMIC_PER_CHANNEL elif quant_type == QuantType.STATIC_PER_TENSOR: - return XNNPackRecipeType.PT2E_INT8_STATIC_PER_TENSOR + return XNNPackRecipeType.INT8_STATIC_PER_TENSOR elif quant_type == QuantType.NONE: return XNNPackRecipeType.FP32 else: @@ -241,13 +224,12 @@ def test_validate_recipe_kwargs_int4_tensor_with_valid_group_size( # Should not raise any exception recipe_w_default_group = provider.create_recipe( - XNNPackRecipeType.TORCHAO_INT8_DYNAMIC_ACT_INT4_WEIGHT_PER_TENSOR + XNNPackRecipeType.INT8_DYNAMIC_ACT_INT4_WEIGHT_PER_TENSOR ) self.assertIsNotNone(recipe_w_default_group) recipe = provider.create_recipe( - XNNPackRecipeType.TORCHAO_INT8_DYNAMIC_ACT_INT4_WEIGHT_PER_TENSOR, - group_size=64, + XNNPackRecipeType.INT8_DYNAMIC_ACT_INT4_WEIGHT_PER_TENSOR, group_size=64 ) self.assertIsNotNone(recipe) @@ -258,7 +240,7 @@ def test_validate_recipe_kwargs_int4_tensor_with_invalid_group_size( with self.assertRaises(ValueError) as cm: provider.create_recipe( - XNNPackRecipeType.TORCHAO_INT8_DYNAMIC_ACT_INT4_WEIGHT_PER_TENSOR, + XNNPackRecipeType.INT8_DYNAMIC_ACT_INT4_WEIGHT_PER_TENSOR, group_size="32", # String instead of int ) diff --git a/export/__init__.py b/export/__init__.py index a7b165185de..d5f3826ab90 100644 --- a/export/__init__.py +++ b/export/__init__.py @@ -15,19 +15,12 @@ """ from .export import export, ExportSession -from .recipe import ( - AOQuantizationConfig, - ExportRecipe, - LoweringRecipe, - QuantizationRecipe, - RecipeType, -) +from .recipe import ExportRecipe, LoweringRecipe, QuantizationRecipe, RecipeType from .recipe_provider import BackendRecipeProvider from .recipe_registry import recipe_registry from .types import StageType __all__ = [ - "AOQuantizationConfig", "StageType", "ExportRecipe", "LoweringRecipe", diff --git a/export/recipe.py b/export/recipe.py index 086d57f3e38..8f7251cd419 100644 --- a/export/recipe.py +++ b/export/recipe.py @@ -6,9 +6,7 @@ from abc import ABCMeta, abstractmethod from dataclasses import dataclass from enum import Enum, EnumMeta -from typing import Callable, List, Optional, Sequence - -import torch +from typing import List, Optional, Sequence from executorch.exir._warnings import experimental @@ -66,20 +64,6 @@ class Mode(str, Enum): RELEASE = "release" -@dataclass -class AOQuantizationConfig: - """ - Configuration for torchao quantization with optional filter function. 
- - Attributes: - ao_base_config: The AOBaseConfig for quantization - filter_fn: Optional filter function to selectively apply quantization - """ - - ao_base_config: AOBaseConfig - filter_fn: Optional[Callable[[torch.nn.Module, str], bool]] = None - - @dataclass class QuantizationRecipe: """ @@ -89,12 +73,11 @@ class QuantizationRecipe: Attributes: quantizers: Optional list of quantizers for model quantization - ao_quantization_configs: Optional list of AOQuantizationConfig objects that pair - AOBaseConfig with optional filter functions + ao_base_config: Optional list of AO base configurations """ quantizers: Optional[List[Quantizer]] = None - ao_quantization_configs: Optional[List[AOQuantizationConfig]] = None + ao_base_config: Optional[List[AOBaseConfig]] = None def get_quantizers(self) -> Optional[List[Quantizer]]: """ diff --git a/export/stages.py b/export/stages.py index 2b3f8a42440..f4de59a9b7a 100644 --- a/export/stages.py +++ b/export/stages.py @@ -4,7 +4,6 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. -import copy import logging from abc import ABC, abstractmethod from typing import Any, Callable, Dict, List, Optional, Sequence @@ -21,10 +20,7 @@ from torch._export.pass_base import PassType from torchao.quantization import quantize_ from torchao.quantization.pt2e.quantize_pt2e import convert_pt2e, prepare_pt2e -from torchao.quantization.pt2e.quantizer import ( - ComposableQuantizer, - Quantizer as TorchAOPT2EQuantizer, -) +from torchao.quantization.pt2e.quantizer import ComposableQuantizer from torchao.utils import unwrap_tensor_subclass @@ -293,7 +289,7 @@ def run(self, artifact: PipelineArtifact) -> None: """ if ( not self._quantization_recipe - or not self._quantization_recipe.ao_quantization_configs + or not self._quantization_recipe.ao_base_config ): logging.info( "Quantization recipe is invalid to run SourceTransform, returning original artifact" @@ -304,14 +300,15 @@ def run(self, artifact: PipelineArtifact) -> None: assert isinstance(artifact.data, dict) # Store the original models - self._transformed_models = copy.deepcopy(artifact.data) + self._transformed_models = artifact.data # Apply torchao quantize_ to each model - for _, model in artifact.data.items(): + for method_name, model in artifact.data.items(): # pyre-ignore - for ao_config in self._quantization_recipe.ao_quantization_configs: - quantize_(model, ao_config.ao_base_config, ao_config.filter_fn) + for config in self._quantization_recipe.ao_base_config: + quantize_(model, config) unwrap_tensor_subclass(model) + self._transformed_models[method_name] = model self._artifact = artifact.copy_with_new_data(self._transformed_models) @@ -336,36 +333,6 @@ def valid_predecessor_stages(self) -> List["StageType"]: def can_start_pipeline(self) -> bool: return True - def _get_quantizer_for_prepare_pt2e(self, quantizers: List[Any]): - torch_ao_quantizers = [] - torchao_pt2e_quantizers = [] - - for quantizer in quantizers: - if isinstance(quantizer, TorchAOPT2EQuantizer): - torchao_pt2e_quantizers.append(quantizer) - else: - # torch.ao quantizer support will soon be deprecated, remove this once CoreML moves to torchao quantizer - logging.warning( - f"torch.ao quantizer {quantizer} is deprecated, consider moving to torchao quantizer" - ) - torch_ao_quantizers.append(quantizer) - - if torch_ao_quantizers and torchao_pt2e_quantizers: - raise ValueError("Mixed quantizer types are not supported") - if len(torch_ao_quantizers) > 1: - raise 
ValueError( - "Multiple quantizers of torch.ao.quantization.quantizer not supported" - ) - - if torch_ao_quantizers: - # prepare_pt2e has backward compat with torch.ao quantizer - return torch_ao_quantizers[0] - elif torchao_pt2e_quantizers: - # Multiple torchao quantizers - use ComposableQuantizer - return ComposableQuantizer(torchao_pt2e_quantizers) - else: - raise ValueError("No quantizers detected") - def run(self, artifact: PipelineArtifact) -> None: if not self._quantization_recipe or not self._quantization_recipe.quantizers: logging.info( @@ -390,10 +357,11 @@ def run(self, artifact: PipelineArtifact) -> None: inputs = example_inputs[method_name][0] captured_graph = torch.export.export(model, inputs, strict=True).module() - quantizer = self._get_quantizer_for_prepare_pt2e( + composed_quantizer = ComposableQuantizer( + # pyre-ignore self._quantization_recipe.quantizers ) - prepared_model = prepare_pt2e(captured_graph, quantizer) + prepared_model = prepare_pt2e(captured_graph, composed_quantizer) for calibration_input in example_inputs[method_name]: prepared_model(*calibration_input) diff --git a/export/tests/test_export_session.py b/export/tests/test_export_session.py index fcec1b7a59a..30288941d22 100644 --- a/export/tests/test_export_session.py +++ b/export/tests/test_export_session.py @@ -12,11 +12,7 @@ import torch from executorch.export import ExportRecipe, ExportSession -from executorch.export.recipe import ( - AOQuantizationConfig, - LoweringRecipe, - QuantizationRecipe, -) +from executorch.export.recipe import LoweringRecipe, QuantizationRecipe from executorch.export.stages import PipelineArtifact from executorch.export.types import StageType @@ -24,7 +20,7 @@ class SimpleTestModel(torch.nn.Module): def __init__(self) -> None: super().__init__() - self.linear: torch.nn.Module = torch.nn.Linear(10, 5) + self.linear = torch.nn.Linear(10, 5) def forward(self, x: torch.Tensor) -> torch.Tensor: return self.linear(x) @@ -453,7 +449,7 @@ def test_pipeline_building_with_all_recipes(self) -> None: """Test pipeline building with quantization and lowering recipes.""" # Create comprehensive recipes quant_recipe = QuantizationRecipe( - ao_quantization_configs=[AOQuantizationConfig(Mock())], + ao_base_config=[Mock()], quantizers=[Mock()], ) lowering_recipe = LoweringRecipe( diff --git a/export/tests/test_export_stages.py b/export/tests/test_export_stages.py index 7f82551a48b..4820e508e18 100644 --- a/export/tests/test_export_stages.py +++ b/export/tests/test_export_stages.py @@ -11,7 +11,7 @@ import torch from executorch.exir.program import EdgeProgramManager, ExecutorchProgramManager -from executorch.export import AOQuantizationConfig, QuantizationRecipe +from executorch.export import QuantizationRecipe from executorch.export.stages import ( EdgeTransformAndLowerStage, ExecutorchStage, @@ -29,7 +29,7 @@ class SimpleTestModel(torch.nn.Module): def __init__(self) -> None: super().__init__() - self.linear: torch.nn.Module = torch.nn.Linear(10, 5) + self.linear = torch.nn.Linear(10, 5) def forward(self, x: torch.Tensor) -> torch.Tensor: return self.linear(x) @@ -163,7 +163,7 @@ def setUp(self) -> None: def test_source_transform_stage_no_quantization(self) -> None: mock_recipe = Mock(spec=QuantizationRecipe) - mock_recipe.ao_quantization_configs = None + mock_recipe.ao_base_config = None stage = SourceTransformStage(mock_recipe) artifact = PipelineArtifact(data=self.models_dict, context={}) @@ -174,19 +174,12 @@ def test_source_transform_stage_no_quantization(self) -> None: 
@patch("executorch.export.stages.quantize_") @patch("executorch.export.stages.unwrap_tensor_subclass") - def test_run_with_ao_quantization_configs( + def test_run_with_ao_base_config( self, mock_unwrap: Mock, mock_quantize: Mock ) -> None: - from torchao.core.config import AOBaseConfig - - mock_config = Mock(spec=AOBaseConfig) - mock_filter_fn = Mock() - # pyre-ignore[28]: Unexpected keyword argument error is a false positive for dataclass - mock_ao_config: AOQuantizationConfig = AOQuantizationConfig( - ao_base_config=mock_config, filter_fn=mock_filter_fn - ) + mock_config = Mock() mock_recipe = Mock(spec=QuantizationRecipe) - mock_recipe.ao_quantization_configs = [mock_ao_config] + mock_recipe.ao_base_config = [mock_config] stage = SourceTransformStage(mock_recipe) @@ -195,7 +188,7 @@ def test_run_with_ao_quantization_configs( stage.run(artifact) # Verify quantize_ was called with the model and config - mock_quantize.assert_called_once_with(self.model, mock_config, mock_filter_fn) + mock_quantize.assert_called_once_with(self.model, mock_config) # Verify unwrap_tensor_subclass was called with the model mock_unwrap.assert_called_once_with(self.model) @@ -208,24 +201,6 @@ def setUp(self) -> None: self.example_inputs = [(torch.randn(2, 10),)] self.context = {"example_inputs": {"forward": self.example_inputs}} - @staticmethod - def create_dummy_quantizer(): - from torchao.quantization.pt2e.quantizer import ( - Quantizer as TorchAOPT2EQuantizer, - ) - - class DummyQuantizer(TorchAOPT2EQuantizer): - def __init__(self): - pass - - def annotate(self, model): - return model - - def validate(self, model): - pass - - return DummyQuantizer() - def test_run_no_quantizers(self) -> None: """Test execution with no quantizers.""" mock_recipe = Mock(spec=QuantizationRecipe) @@ -249,7 +224,7 @@ def test_run_with_quantizers( mock_convert_pt2e: Mock, ) -> None: """Test execution with quantizers""" - mock_quantizer = self.create_dummy_quantizer() + mock_quantizer = Mock() mock_recipe = Mock(spec=QuantizationRecipe) mock_recipe.quantizers = [mock_quantizer] stage = QuantizeStage(mock_recipe) @@ -310,35 +285,6 @@ def test_run_empty_example_inputs(self) -> None: "Example inputs for method forward not found or empty", str(cm.exception) ) - @patch("executorch.export.stages.ComposableQuantizer") - def test_get_quantizer_for_prepare_pt2e( - self, mock_composable_quantizer: Mock - ) -> None: - """Test _get_quantizer_for_prepare_pt2e method with different quantizer scenarios.""" - mock_recipe = Mock(spec=QuantizationRecipe) - stage = QuantizeStage(mock_recipe) - - # Test empty quantizers list - should raise ValueError - with self.assertRaises(ValueError) as cm: - stage._get_quantizer_for_prepare_pt2e([]) - self.assertIn("No quantizers detected", str(cm.exception)) - - # Test ComposableQuantizer path with multiple torchao quantizers - # Create instances of dummy quantizers using the reusable method - quantizer1 = self.create_dummy_quantizer() - quantizer2 = self.create_dummy_quantizer() - - # Set up ComposableQuantizer mock - mock_composed_quantizer = Mock() - mock_composable_quantizer.return_value = mock_composed_quantizer - - # Call the method with multiple torchao quantizers - result = stage._get_quantizer_for_prepare_pt2e([quantizer1, quantizer2]) - - # Verify ComposableQuantizer was called with the quantizers - mock_composable_quantizer.assert_called_once_with([quantizer1, quantizer2]) - self.assertEqual(result, mock_composed_quantizer) - class TestToEdgeStage(unittest.TestCase): def setUp(self) -> None: