From e489d6c7df7c4ed6fdf22414a52d631be5f6eedb Mon Sep 17 00:00:00 2001 From: Cavus Mustafa Date: Fri, 20 Jun 2025 17:13:03 -0700 Subject: [PATCH 01/85] Runtime support for openvino quantized models --- backends/openvino/runtime/OpenvinoBackend.cpp | 24 +++++++++++++++++++ backends/openvino/scripts/openvino_build.sh | 1 + examples/models/llama/CMakeLists.txt | 8 +++++++ tools/cmake/executorch-config.cmake | 1 + 4 files changed, 34 insertions(+) diff --git a/backends/openvino/runtime/OpenvinoBackend.cpp b/backends/openvino/runtime/OpenvinoBackend.cpp index a3134f72b4b..39a1bf55c32 100644 --- a/backends/openvino/runtime/OpenvinoBackend.cpp +++ b/backends/openvino/runtime/OpenvinoBackend.cpp @@ -114,6 +114,26 @@ exr::Error OpenvinoBackend::execute( ov_type, input_shape, input_tensor.mutable_data_ptr()); infer_request->set_input_tensor(i, ov_input_tensor); + + if (args[i]->isInt()) { + int64_t *val = &(args[i]->payload.copyable_union.as_int); + + // Create OpenVINO tensor from integer input + ov::Tensor ov_input_tensor(ov::element::i64, ov::Shape{1}, val); + infer_request->set_input_tensor(i, ov_input_tensor); + } else { + auto input_tensor = args[i]->toTensor(); + ov::Shape input_shape( + input_tensor.sizes().begin(), input_tensor.sizes().end()); + + // Convert input tensor to OpenVINO tensor + ov::element::Type ov_type = + convert_to_openvino_type(input_tensor.scalar_type()); + ov::Tensor ov_input_tensor( + ov_type, input_shape, input_tensor.mutable_data_ptr()); + + infer_request->set_input_tensor(i, ov_input_tensor); + } } // Set outputs @@ -165,10 +185,14 @@ ov::element::Type OpenvinoBackend::convert_to_openvino_type( switch (scalar_type) { case exa::ScalarType::Float: return ov::element::f32; + case exa::ScalarType::Half: + return ov::element::f16; case exa::ScalarType::Int: return ov::element::i32; case exa::ScalarType::Char: return ov::element::i8; + case exa::ScalarType::Byte: + return ov::element::u8; case exa::ScalarType::Long: return ov::element::i64; case 
exa::ScalarType::Bool: diff --git a/backends/openvino/scripts/openvino_build.sh b/backends/openvino/scripts/openvino_build.sh index bc85d6b8410..c10a3bb4eeb 100755 --- a/backends/openvino/scripts/openvino_build.sh +++ b/backends/openvino/scripts/openvino_build.sh @@ -29,6 +29,7 @@ main() { -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ -DEXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL=ON \ + -DEXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR=ON \ -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ -DEXECUTORCH_BUILD_OPENVINO_EXECUTOR_RUNNER=ON \ -B"${build_dir}" diff --git a/examples/models/llama/CMakeLists.txt b/examples/models/llama/CMakeLists.txt index 8c27de20845..1063ebf2561 100644 --- a/examples/models/llama/CMakeLists.txt +++ b/examples/models/llama/CMakeLists.txt @@ -189,6 +189,14 @@ if(TARGET mpsdelegate) target_link_options_shared_lib(mpsdelegate) endif() +# Openvino backend +if(TARGET openvino_backend) + find_package(OpenVINO REQUIRED) + target_link_libraries(openvino_backend INTERFACE openvino::runtime executorch_core) + list(APPEND link_libraries openvino_backend) + target_link_options_shared_lib(openvino_backend) +endif() + if(TARGET coremldelegate) find_library(SQLITE_LIBRARY sqlite3) list( diff --git a/tools/cmake/executorch-config.cmake b/tools/cmake/executorch-config.cmake index aa5776163a9..adf978fb70a 100644 --- a/tools/cmake/executorch-config.cmake +++ b/tools/cmake/executorch-config.cmake @@ -94,6 +94,7 @@ set(lib_list quantized_kernels quantized_ops_lib quantized_ops_aot_lib + openvino_backend ) foreach(lib ${lib_list}) # Name of the variable which stores result of the find_library search From f0d901f3358fc9bc59b97450111ec0071b90044a Mon Sep 17 00:00:00 2001 From: Cavus Mustafa Date: Fri, 20 Jun 2025 21:41:24 -0700 Subject: [PATCH 02/85] openvino export_llama_lib support --- examples/models/llama/config/llm_config.py | 17 +++++++++++++++++ examples/models/llama/export_llama_lib.py | 17 +++++++++++++++++ 
extension/llm/export/partitioner_lib.py | 13 +++++++++++++ 3 files changed, 47 insertions(+) diff --git a/examples/models/llama/config/llm_config.py b/examples/models/llama/config/llm_config.py index 034d8af7562..2de58fe47eb 100644 --- a/examples/models/llama/config/llm_config.py +++ b/examples/models/llama/config/llm_config.py @@ -437,6 +437,16 @@ class MPSConfig: enabled: bool = False +@dataclass +class OpenvinoConfig: + """ + Configures the QNN backend. + """ + + enabled: bool = False + device: str = "CPU" + + @dataclass class BackendConfig: """ @@ -449,6 +459,7 @@ class BackendConfig: vulkan: VulkanConfig = field(default_factory=VulkanConfig) qnn: QNNConfig = field(default_factory=QNNConfig) mps: MPSConfig = field(default_factory=MPSConfig) + openvino: OpenvinoConfig = field(default_factory=OpenvinoConfig) ################################################################################ @@ -609,6 +620,12 @@ def from_args(cls, args: argparse.Namespace) -> "LlmConfig": # noqa: C901 if hasattr(args, "mps"): llm_config.backend.mps.enabled = args.mps + # Openvino + if hasattr(args, "openvino"): + llm_config.backend.openvino.enabled = args.openvino + if hasattr(args, "openvino_device"): + llm_config.backend.openvino.device = args.openvino_device + # DebugConfig if hasattr(args, "profile_memory"): llm_config.debug.profile_memory = args.profile_memory diff --git a/examples/models/llama/export_llama_lib.py b/examples/models/llama/export_llama_lib.py index 1f055d65822..8afaa8bf409 100644 --- a/examples/models/llama/export_llama_lib.py +++ b/examples/models/llama/export_llama_lib.py @@ -39,6 +39,7 @@ from executorch.extension.llm.export.partitioner_lib import ( get_coreml_partitioner, get_mps_partitioner, + get_openvino_partitioner, get_qnn_partitioner, get_vulkan_partitioner, get_xnnpack_partitioner, @@ -443,6 +444,14 @@ def build_args_parser() -> argparse.ArgumentParser: action="store_true", help="Delegate llama2 to qnn backend (Qualcomm), please use it --kv_cahce=True", 
) + parser.add_argument("--openvino", action="store_true") + parser.add_argument( + "--openvino_device", + type=str, + default=None, + choices=["CPU", "GPU"], + help="Specify the device for Openvino (CPU or GPU).", + ) parser.add_argument( "--expand_rope_table", @@ -857,6 +866,8 @@ def _to_edge_and_lower_llama( # noqa: C901 mps: bool = False, coreml: bool = False, qnn: bool = False, + openvino: bool = False, + openvino_device: str = "CPU", dtype_override: str = "fp32", enable_dynamic_shape: bool = True, use_kv_cache: bool = False, @@ -901,6 +912,10 @@ def _to_edge_and_lower_llama( # noqa: C901 partitioners.append(coreml_partitioner) modelname = f"coreml_{modelname}" + if openvino: + partitioners.append(get_openvino_partitioner(openvino_device)) + modelname = f"openvino_{modelname}" + if qnn: logging.warning( "The model definition in current repro is not performant, please refer to the instruction" @@ -1068,6 +1083,8 @@ def _export_llama(llm_config: LlmConfig) -> LLMEdgeManager: # noqa: C901 mps=llm_config.backend.mps.enabled, coreml=llm_config.backend.coreml.enabled, qnn=llm_config.backend.qnn.enabled, + openvino=llm_config.backend.openvino.enabled, + openvino_device=llm_config.backend.openvino.device, dtype_override=llm_config.model.dtype_override, enable_dynamic_shape=llm_config.model.enable_dynamic_shape, use_kv_cache=llm_config.model.use_kv_cache, diff --git a/extension/llm/export/partitioner_lib.py b/extension/llm/export/partitioner_lib.py index 20604bbf635..3c795dcdf66 100644 --- a/extension/llm/export/partitioner_lib.py +++ b/extension/llm/export/partitioner_lib.py @@ -63,6 +63,19 @@ def get_mps_partitioner(use_kv_cache: bool = False): compile_specs = [CompileSpec("use_fp16", bytes([True]))] return MPSPartitioner(compile_specs) # pyre-fixme[16] +def get_openvino_partitioner(device: str): + try: + from executorch.exir.backend.backend_details import CompileSpec + from executorch.backends.openvino.partitioner import ( + OpenvinoPartitioner, + ) + except 
ImportError: + raise ImportError( + "Please install the OpenVINO backend following https://github.com/pytorch/executorch/tree/main/backends/openvino" + ) + + compile_specs = [CompileSpec("device", device.encode())] + return OpenvinoPartitioner(compile_specs) def get_coreml_partitioner( ios: int = 15, From 24f2d930c62484ba038bd9ee9c7fb9fb73cc3fd5 Mon Sep 17 00:00:00 2001 From: Cavus Mustafa Date: Sat, 21 Jun 2025 20:43:05 -0700 Subject: [PATCH 03/85] nncf pattern checker in openvino partitioner --- backends/openvino/partitioner.py | 62 +++++++++++++++++++++++ examples/models/llama/export_llama_lib.py | 2 +- 2 files changed, 63 insertions(+), 1 deletion(-) diff --git a/backends/openvino/partitioner.py b/backends/openvino/partitioner.py index bc3fde573e2..4828a96f0dd 100644 --- a/backends/openvino/partitioner.py +++ b/backends/openvino/partitioner.py @@ -25,6 +25,11 @@ from torch.fx.passes.infra.partitioner import CapabilityBasedPartitioner from torch.fx.passes.operator_support import OperatorSupportBase +class PatternNode: + op_types = {} + + def __init__(self): + self.op_types = {} class OpenvinoOperatorsSupport(OperatorSupportBase): @@ -32,6 +37,7 @@ def __init__( self, op_types_to_skip: Optional[set] = None, op_names_to_skip: Optional[set] = None, + enabled_ops_by_name: Optional[set] = None, ) -> None: """ Initializes the OpenvinoOperatorsSupport class. 
@@ -43,9 +49,12 @@ def __init__( op_types_to_skip = set() if op_names_to_skip is None: op_names_to_skip = set() + if enabled_ops_by_name is None: + enabled_ops_by_name = set() self._op_types_to_skip = op_types_to_skip self._op_names_to_skip = op_names_to_skip + self._enabled_ops_by_name = enabled_ops_by_name def is_node_supported(self, _, node: torch.fx.Node) -> bool: """ @@ -62,6 +71,10 @@ def is_node_supported(self, _, node: torch.fx.Node) -> bool: op_type = node.target.__name__ else: op_type = str(node.target) + + if node.name in self._enabled_ops_by_name: + return True + supported_ops = OperatorSupport(options)._support_dict if op_type == "getitem": return True @@ -88,6 +101,7 @@ def __init__( compile_spec: List[CompileSpec], op_types_to_skip: Optional[set] = None, op_names_to_skip: Optional[set] = None, + enabled_ops_by_name: Optional[set] = None, ) -> None: """ Initializes the OpenvinoPartitioner class. @@ -99,6 +113,7 @@ def __init__( self.delegation_spec = DelegationSpec(OpenvinoBackend.__name__, compile_spec) self._op_types_to_skip = op_types_to_skip self._op_names_to_skip = op_names_to_skip + self._enabled_ops_by_name = enabled_ops_by_name def ops_to_not_decompose( self, @@ -120,6 +135,52 @@ def ops_to_not_decompose( ] return (ops_not_decompose, None) + def check_pattern(self, node: torch.fx.Node, pattern: PatternNode, enabled_ops: list) -> bool: + if node.op == "call_function": + if ("call_function" + ":" + str(node.target.__name__)) in pattern.op_types: + pt_input_nodes = node.all_input_nodes + pattern_input_ops = pattern.op_types["call_function" + ":" + str(node.target.__name__)] + if pattern_input_ops is None: + enabled_ops.append(node) + return True + if len(pt_input_nodes) != len(pattern_input_ops): + return False + for i in range(len(pt_input_nodes)): + if not self.check_pattern(pt_input_nodes[i], pattern_input_ops[i], enabled_ops): + return False + enabled_ops.append(node) + return True + elif node.op == "get_attr": + if "get_attr" in 
pattern.op_types: + return True + else: + return False + elif node.op == "placeholder": + if "placeholder" in pattern.op_types: + return True + else: + return False + return False + + def capture_nncf_patterns(self, graph_module: torch.fx.GraphModule): + const_node = PatternNode + const_node.op_types["get_attr"] = None + const_node.op_types["placeholder"] = None + bitwise_right_shift_node = PatternNode + bitwise_right_shift_node.op_types["call_function:aten.bitwise_right_shift.Tensor_Scalar"] = [const_node] + bitwise_and_node = PatternNode + bitwise_and_node.op_types["call_function:aten.bitwise_and.Scalar"] = [const_node] + stack_node = PatternNode + stack_node.op_types["call_function:aten.stack.default"] = [bitwise_and_node, bitwise_right_shift_node] + + for node in graph_module.graph.nodes: + if str(node.op) == "call_function" and str(node.target.__name__) == "aten.stack.default": + enabled_ops = [] + pattern_match = self.check_pattern(node, stack_node, enabled_ops) + if pattern_match: + for pattern_op in enabled_ops: + self._enabled_ops_by_name.add(pattern_op.name) + def partition(self, exported_program: ExportedProgram) -> PartitionResult: """ Partitions an exported program into supported and unsupported segments. @@ -127,6 +188,7 @@ def partition(self, exported_program: ExportedProgram) -> PartitionResult: :param exported_program: The exported program. :return: A PartitionResult containing the partitioned graph and delegation tags. 
""" + self.capture_nncf_patterns(exported_program.graph_module) partitioner = CapabilityBasedPartitioner( exported_program.graph_module, OpenvinoOperatorsSupport(self._op_types_to_skip, self._op_names_to_skip), diff --git a/examples/models/llama/export_llama_lib.py b/examples/models/llama/export_llama_lib.py index 8afaa8bf409..a01b05daa17 100644 --- a/examples/models/llama/export_llama_lib.py +++ b/examples/models/llama/export_llama_lib.py @@ -448,7 +448,7 @@ def build_args_parser() -> argparse.ArgumentParser: parser.add_argument( "--openvino_device", type=str, - default=None, + default="CPU", choices=["CPU", "GPU"], help="Specify the device for Openvino (CPU or GPU).", ) From 7dd8d0f17aec743d7796bf7b314df97f2aeb90eb Mon Sep 17 00:00:00 2001 From: anzr299 Date: Mon, 23 Jun 2025 19:11:55 +0400 Subject: [PATCH 04/85] nncf compression init --- examples/models/llama/export_llama_lib.py | 8 ++++++ extension/llm/export/builder.py | 32 +++++++++++++++++++++++ 2 files changed, 40 insertions(+) diff --git a/examples/models/llama/export_llama_lib.py b/examples/models/llama/export_llama_lib.py index a01b05daa17..087e4d1efdc 100644 --- a/examples/models/llama/export_llama_lib.py +++ b/examples/models/llama/export_llama_lib.py @@ -551,6 +551,13 @@ def build_args_parser() -> argparse.ArgumentParser: help="path to the input pruning token mapping file (token_map.json)", ) + parser.add_argument( + "--nncf_compression", + default=False, + action="store_true", + help="If true, stops right after torch.export() and saves the exported model.", + ) + parser.add_argument( "--export_only", default=False, @@ -1207,6 +1214,7 @@ def _load_llama_model(llm_config: LlmConfig) -> "LLMEdgeManager": use_legacy_export=llm_config.backend.qnn.enabled, save_exported_program=llm_config.export.export_only, verbose=llm_config.debug.verbose, + nncf_compression=llm_config.nncf_compression, metadata=_load_llama_model_metadata( WeightType.FAIRSEQ2 if llm_config.base.fairseq2 else WeightType.LLAMA, 
llm_config.model.use_kv_cache, diff --git a/extension/llm/export/builder.py b/extension/llm/export/builder.py index 4128bfd8198..f185d9b346d 100644 --- a/extension/llm/export/builder.py +++ b/extension/llm/export/builder.py @@ -16,6 +16,7 @@ from typing import Any, Callable, Dict, List, Optional, Tuple from unittest.mock import patch +import nncf import torch from executorch.backends.transforms.duplicate_dynamic_quant_chain import ( DuplicateDynamicQuantChainPass, @@ -40,6 +41,7 @@ from torchao.quantization.pt2e.quantize_pt2e import convert_pt2e, prepare_pt2e from torchao.quantization.pt2e.quantizer import ComposableQuantizer, Quantizer from torchao.utils import unwrap_tensor_subclass +from functools import partial FORMAT = "[%(levelname)s %(asctime)s %(filename)s:%(lineno)s] %(message)s" logging.basicConfig(level=logging.INFO, format=FORMAT) @@ -98,6 +100,7 @@ def __init__( dynamic_shapes: Optional[Any] = None, use_legacy_export: bool = False, save_exported_program: bool = False, + nncf_compression: bool = False ): # Store necessary constructor arguments. self.model = model @@ -119,6 +122,7 @@ def __init__( self.dynamic_shapes = dynamic_shapes self.use_legacy_export = use_legacy_export self.save_exported_program = save_exported_program + self.nncf_compression = nncf_compression # Note: treat this as the source of truth for the result of # torch.export'ing a model. 
If the overall ExportedProgram is needed, @@ -428,6 +432,34 @@ def pt2e_quantize(self, quantizers: Optional[List[Quantizer]]) -> "LLMEdgeManage DuplicateDynamicQuantChainPass()(m) self.pre_autograd_graph_module = m return self + elif (self.nncf_compression): + tokenizer = get_tokenizer(self.tokenizer_path) + + def transform_fn( + prompts: str, tokenizer + ): + tokenized_text = tokenizer.encode(prompts, bos=False, eos=False) + logging.error(tokenized_text) + + inputs = () + inputs = ( + torch.tensor(tokenized_text).unsqueeze(0), + {"input_pos": torch.tensor([0])}, + ) + + return inputs + + self.calibration_data = [self.calibration_data] if isinstance(self.calibration_data, str) else self.calibration_data + self.calibration_data = [word for prompt in self.calibration_data for word in prompt.split()] if not self.dynamic_shapes else self.calibration_data + + self.pre_autograd_graph_module = nncf.compress_weights( + self.pre_autograd_graph_module, + dataset=nncf.Dataset(self.calibration_data, transform_func=partial(transform_fn, tokenizer=tokenizer)), + mode=nncf.CompressWeightsMode.INT4_SYM, + ratio=0.8, + sensitivity_metric=nncf.SensitivityMetric.HESSIAN_INPUT_ACTIVATION, + ) + return self else: logging.info("No quantizer provided, passing...") return self From 1716834b5ff3889da366f54e2d6f2a3e3e999117 Mon Sep 17 00:00:00 2001 From: Cavus Mustafa Date: Mon, 23 Jun 2025 13:43:11 -0700 Subject: [PATCH 05/85] openvino backend llama nncf support --- backends/openvino/partitioner.py | 5 +- backends/openvino/utils.py | 66 ++++++++++++++++++++++ examples/models/llama/config/llm_config.py | 3 + examples/models/llama/export_llama_lib.py | 4 +- extension/llm/export/builder.py | 39 +++++++++---- 5 files changed, 101 insertions(+), 16 deletions(-) create mode 100644 backends/openvino/utils.py diff --git a/backends/openvino/partitioner.py b/backends/openvino/partitioner.py index 4828a96f0dd..b1e7f5d436a 100644 --- a/backends/openvino/partitioner.py +++ 
b/backends/openvino/partitioner.py @@ -101,7 +101,6 @@ def __init__( compile_spec: List[CompileSpec], op_types_to_skip: Optional[set] = None, op_names_to_skip: Optional[set] = None, - enabled_ops_by_name: Optional[set] = None, ) -> None: """ Initializes the OpenvinoPartitioner class. @@ -113,7 +112,7 @@ def __init__( self.delegation_spec = DelegationSpec(OpenvinoBackend.__name__, compile_spec) self._op_types_to_skip = op_types_to_skip self._op_names_to_skip = op_names_to_skip - self._enabled_ops_by_name = enabled_ops_by_name + self._enabled_ops_by_name = set() def ops_to_not_decompose( self, @@ -191,7 +190,7 @@ def partition(self, exported_program: ExportedProgram) -> PartitionResult: self.capture_nncf_patterns(exported_program.graph_module) partitioner = CapabilityBasedPartitioner( exported_program.graph_module, - OpenvinoOperatorsSupport(self._op_types_to_skip, self._op_names_to_skip), + OpenvinoOperatorsSupport(self._op_types_to_skip, self._op_names_to_skip, self._enabled_ops_by_name), allows_single_node_partition=True, ) partition_list = partitioner.propose_partitions() diff --git a/backends/openvino/utils.py b/backends/openvino/utils.py new file mode 100644 index 00000000000..ec4bebe0d6d --- /dev/null +++ b/backends/openvino/utils.py @@ -0,0 +1,66 @@ +# Copyright (c) Intel Corporation +# +# Licensed under the BSD License (the "License"); you may not use this file +# except in compliance with the License. See the license file found in the +# LICENSE file in the root directory of this source tree. 
+ +import logging + +from typing import Any, Dict, Optional, Tuple, Union + +import executorch.exir as exir + +import torch +from executorch.exir import EdgeProgramManager +from executorch.exir.program._program import to_edge_with_preserved_ops +from executorch.exir.tracer import Value +from torch.export import ExportedProgram +from executorch.extension.export_util.utils import _to_core_aten + +_EDGE_COMPILE_CONFIG = exir.EdgeCompileConfig( + _check_ir_validity=True, + _skip_dim_order=True, # TODO(T189114319): Reuse dim order op after solving the ios oss issue +) + +def nncf_core_aten_to_edge( + core_aten_exir_ep: ExportedProgram, + edge_constant_methods: Optional[Dict[str, Any]] = None, + edge_compile_config=None, + verbose=True, +) -> EdgeProgramManager: + if not edge_compile_config: + edge_compile_config = exir.EdgeCompileConfig( + _check_ir_validity=False, # quant ops currently break ir verification + ) + edge_manager: EdgeProgramManager = to_edge_with_preserved_ops( + core_aten_exir_ep, + constant_methods=edge_constant_methods, + compile_config=edge_compile_config, + preserve_ops=[torch.ops.aten.stack.default,], + ) + if verbose: + logging.info(f"Exported graph:\n{edge_manager.exported_program()}") + return edge_manager + +def nncf_export_to_edge( + model: Union[torch.fx.GraphModule, torch.nn.Module], + example_inputs: Tuple[Value, ...], + *, + example_kwarg_inputs: Optional[Dict] = None, + dynamic_shapes: Optional[Union[Dict[str, Any], Tuple[Any]]] = None, + edge_constant_methods: Optional[Dict[str, Any]] = None, + edge_compile_config=_EDGE_COMPILE_CONFIG, + strict=True, + verbose=True, +) -> EdgeProgramManager: + core_aten_ep = _to_core_aten( + model, + example_inputs, + example_kwarg_inputs=example_kwarg_inputs, + dynamic_shapes=dynamic_shapes, + strict=strict, + verbose=verbose, + ) + return nncf_core_aten_to_edge( + core_aten_ep, edge_constant_methods, edge_compile_config, verbose=verbose + ) diff --git a/examples/models/llama/config/llm_config.py 
b/examples/models/llama/config/llm_config.py index 2de58fe47eb..530f7335d8e 100644 --- a/examples/models/llama/config/llm_config.py +++ b/examples/models/llama/config/llm_config.py @@ -445,6 +445,7 @@ class OpenvinoConfig: enabled: bool = False device: str = "CPU" + nncf_compression = False @dataclass @@ -625,6 +626,8 @@ def from_args(cls, args: argparse.Namespace) -> "LlmConfig": # noqa: C901 llm_config.backend.openvino.enabled = args.openvino if hasattr(args, "openvino_device"): llm_config.backend.openvino.device = args.openvino_device + if hasattr(args, "nncf_compression"): + llm_config.backend.openvino.nncf_compression = args.nncf_compression # DebugConfig if hasattr(args, "profile_memory"): diff --git a/examples/models/llama/export_llama_lib.py b/examples/models/llama/export_llama_lib.py index 087e4d1efdc..1ea82e3224a 100644 --- a/examples/models/llama/export_llama_lib.py +++ b/examples/models/llama/export_llama_lib.py @@ -555,7 +555,7 @@ def build_args_parser() -> argparse.ArgumentParser: "--nncf_compression", default=False, action="store_true", - help="If true, stops right after torch.export() and saves the exported model.", + help="Enables nncf compression for openvino backend", ) parser.add_argument( @@ -1214,7 +1214,7 @@ def _load_llama_model(llm_config: LlmConfig) -> "LLMEdgeManager": use_legacy_export=llm_config.backend.qnn.enabled, save_exported_program=llm_config.export.export_only, verbose=llm_config.debug.verbose, - nncf_compression=llm_config.nncf_compression, + nncf_compression=llm_config.backend.openvino.nncf_compression, metadata=_load_llama_model_metadata( WeightType.FAIRSEQ2 if llm_config.base.fairseq2 else WeightType.LLAMA, llm_config.model.use_kv_cache, diff --git a/extension/llm/export/builder.py b/extension/llm/export/builder.py index f185d9b346d..a2bfaeae22d 100644 --- a/extension/llm/export/builder.py +++ b/extension/llm/export/builder.py @@ -16,7 +16,6 @@ from typing import Any, Callable, Dict, List, Optional, Tuple from unittest.mock 
import patch -import nncf import torch from executorch.backends.transforms.duplicate_dynamic_quant_chain import ( DuplicateDynamicQuantChainPass, @@ -41,7 +40,6 @@ from torchao.quantization.pt2e.quantize_pt2e import convert_pt2e, prepare_pt2e from torchao.quantization.pt2e.quantizer import ComposableQuantizer, Quantizer from torchao.utils import unwrap_tensor_subclass -from functools import partial FORMAT = "[%(levelname)s %(asctime)s %(filename)s:%(lineno)s] %(message)s" logging.basicConfig(level=logging.INFO, format=FORMAT) @@ -433,6 +431,13 @@ def pt2e_quantize(self, quantizers: Optional[List[Quantizer]]) -> "LLMEdgeManage self.pre_autograd_graph_module = m return self elif (self.nncf_compression): + try: + import nncf + from functools import partial + except ImportError: + raise ImportError( + "Please install nncf via backends/openvino/requirements.txt" + ) tokenizer = get_tokenizer(self.tokenizer_path) def transform_fn( @@ -487,15 +492,27 @@ def export_to_edge(self) -> "LLMEdgeManager": ) with override_export_behaviour: - self.edge_manager = export_to_edge( - self.pre_autograd_graph_module, # pyre-fixme[6] - self.example_inputs, - example_kwarg_inputs=self.example_kwarg_inputs, - dynamic_shapes=dynamic_shape, - edge_constant_methods=self.metadata, - edge_compile_config=edge_config, - verbose=self.verbose, - ) + if (self.nncf_compression): + from executorch.backends.openvino.utils import nncf_export_to_edge + self.edge_manager = nncf_export_to_edge( + self.pre_autograd_graph_module, # pyre-fixme[6] + self.example_inputs, + example_kwarg_inputs=self.example_kwarg_inputs, + dynamic_shapes=dynamic_shape, + edge_constant_methods=self.metadata, + edge_compile_config=edge_config, + verbose=self.verbose, + ) + else: + self.edge_manager = export_to_edge( + self.pre_autograd_graph_module, # pyre-fixme[6] + self.example_inputs, + example_kwarg_inputs=self.example_kwarg_inputs, + dynamic_shapes=dynamic_shape, + edge_constant_methods=self.metadata, + 
edge_compile_config=edge_config, + verbose=self.verbose, + ) return self def to_backend(self, partitioners: Optional[List[Partitioner]]) -> "LLMEdgeManager": From 198190e6a250632ed9921fa346895521e5b22dfb Mon Sep 17 00:00:00 2001 From: anzr299 Date: Mon, 7 Jul 2025 14:38:05 +0400 Subject: [PATCH 06/85] openvino quantizer init --- .../quantizer/observers/nncf_observers.py | 114 ++++++++++++ backends/openvino/quantizer/quantizer.py | 170 ++++++++++++------ 2 files changed, 228 insertions(+), 56 deletions(-) create mode 100644 backends/openvino/quantizer/observers/nncf_observers.py diff --git a/backends/openvino/quantizer/observers/nncf_observers.py b/backends/openvino/quantizer/observers/nncf_observers.py new file mode 100644 index 00000000000..54f4348e0ed --- /dev/null +++ b/backends/openvino/quantizer/observers/nncf_observers.py @@ -0,0 +1,114 @@ +# Copyright (c) Qualcomm Innovation Center, Inc. +# All rights reserved +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +from typing import Tuple + +import torch +from torch.ao.quantization.observer import MappingType, PerGroup, PerAxis, PerChannelMinMaxObserver, get_block_size +from torch.ao.quantization.pt2e._affine_quantization import ( + _get_reduction_params, + AffineQuantizedMinMaxObserver, +) +from nncf.torch.quantization.layers import INT4AsymmetricWeightsDecompressor, INT4SymmetricWeightsDecompressor, INT8AsymmetricWeightsDecompressor, INT8SymmetricWeightsDecompressor +from nncf.experimental.torch.fx.transformations import constant_update_fn, module_insertion_transformation_builder +from nncf.experimental.torch.fx.node_utils import get_tensor_constant_from_node +from nncf.torch.graph.transformations.commands import PTTargetPoint, TargetType + +from nncf.quantization.algorithms.weight_compression.weight_lowering import do_integer_quantization +from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig +from nncf.parameters import CompressWeightsMode +from nncf.tensor.tensor import Tensor + +class PTPerBlockParamObserver(AffineQuantizedMinMaxObserver): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + qmode = CompressWeightsMode.INT4_ASYM if self.mapping_type==MappingType.ASYMMETRIC else CompressWeightsMode.INT4_SYM + assert isinstance(self.granularity, PerGroup), "Only PerGroup granularity is supported" + self.wc_config = WeightCompressionConfig(mode=qmode, group_size=self.granularity.group_size) + + def calculate_qparams(self, weight): + assert hasattr(self, "min_val") and hasattr( + self, "max_val" + ), "Expecting the observer has min_val and max_val, please run the observer before calling calculate_qparams" + _, reduction_dims = _get_reduction_params( + self.block_size, weight.size() + ) + assert len(reduction_dims) == 1, "Only 1-D group size is supported" + reduction_dims = reduction_dims[0] - 1 + q_weight, scale, zp = do_integer_quantization(Tensor(weight), self.wc_config, reduction_axes=reduction_dims) + zp = 
zp.data if zp is not None else None + return q_weight.data, scale.data, zp + + def convert(self, model: torch.fx.GraphModule, observer_node: torch.fx.Node): + print("calling convert") + assert ( + self.original_dtype is not None + ), "Expecting original_dtype to be populated" + weight_node = observer_node.args[0] + original_weight = get_tensor_constant_from_node(weight_node, model) + q_weight, scale, zero_point = self.calculate_qparams(original_weight) + + with model.graph.inserting_before(observer_node): + if(zero_point is not None): + decompressor = INT4AsymmetricWeightsDecompressor(scale, zero_point, q_weight.shape, original_weight.shape, original_weight.dtype) + else: + decompressor = INT4SymmetricWeightsDecompressor(scale, q_weight.shape, original_weight.shape, original_weight.dtype) + packed_q_weight = decompressor.pack_weight(q_weight) + new_weight_node = constant_update_fn(model, observer_node, packed_q_weight, input_port_id=0) + decompressor_name = f'NNCFDecompressor_{new_weight_node.name}' + + module_insertion_transformation_builder( + decompressor, + [PTTargetPoint(TargetType.OPERATOR_POST_HOOK, target_node_name=new_weight_node.name)], + decompressor_name, + )(model) + decomp_node = observer_node.args[0] + observer_node.replace_all_uses_with(decomp_node) + model.graph.erase_node(observer_node) + + +class NNCFInt8observer(PerChannelMinMaxObserver): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + qmode = CompressWeightsMode.INT8_SYM if self.qscheme==torch.per_channel_symmetric else CompressWeightsMode.INT8_ASYM + self.wc_config = WeightCompressionConfig(mode=qmode) + + def calculate_qparams(self, weight): + assert hasattr(self, "min_val") and hasattr( + self, "max_val" + ), "Expecting the observer has min_val and max_val, please run the observer before calling calculate_qparams" + self.granularity = PerAxis(axis=self.ch_axis) + self.block_size = get_block_size(weight.shape, self.granularity) + _, reduction_dims = 
_get_reduction_params( + self.block_size, weight.size() + ) + q_weight, scale, zp = do_integer_quantization(Tensor(weight), self.wc_config, reduction_axes=reduction_dims) + zp = zp.data if zp is not None else None + return q_weight.data, scale.data, zp + + def convert(self, model: torch.fx.GraphModule, observer_node: torch.fx.Node): + print("calling convert") + weight_node = observer_node.args[0] + original_weight = get_tensor_constant_from_node(weight_node, model) + q_weight, scale, zero_point = self.calculate_qparams(original_weight) + + with model.graph.inserting_before(observer_node): + if(zero_point is not None): + decompressor = INT8AsymmetricWeightsDecompressor(scale, zero_point, original_weight.dtype) + else: + decompressor = INT8SymmetricWeightsDecompressor(scale, original_weight.dtype) + packed_q_weight = decompressor.pack_weight(q_weight) + new_weight_node = constant_update_fn(model, observer_node, packed_q_weight, input_port_id=0) + decompressor_name = f'NNCFDecompressor_{new_weight_node.name}' + + module_insertion_transformation_builder( + decompressor, + [PTTargetPoint(TargetType.OPERATOR_POST_HOOK, target_node_name=new_weight_node.name)], + decompressor_name, + )(model) + decomp_node = observer_node.args[0] + observer_node.replace_all_uses_with(decomp_node) + model.graph.erase_node(observer_node) \ No newline at end of file diff --git a/backends/openvino/quantizer/quantizer.py b/backends/openvino/quantizer/quantizer.py index d0622b24e6d..f8f08996f53 100644 --- a/backends/openvino/quantizer/quantizer.py +++ b/backends/openvino/quantizer/quantizer.py @@ -21,6 +21,8 @@ HistogramObserver, PerChannelMinMaxObserver, UniformQuantizationObserverBase, + PerGroup, + MappingType, ) from torchao.quantization.pt2e.quantizer import ( EdgeOrNode, @@ -30,6 +32,9 @@ Quantizer, SharedQuantizationSpec, ) +from nncf.quantization.quantize_model import get_weight_compression_configuration +from nncf.common.quantization.structs import QuantizerConfig, QuantizationScheme 
+from executorch.backends.openvino.quantizer.observers.nncf_observers import PTPerBlockParamObserver,NNCFInt8observer QUANT_ANNOTATION_KEY = "quantization_annotation" @@ -46,6 +51,10 @@ class QuantizationMode(Enum): INT8_SYM = "int8_sym" INT8_MIXED = "int8_mixed" INT8_TRANSFORMER = "int8_transformer" + INT8_SYM_WC = "int8_sym_wc" + INT8_ASYM_WC = "int8_asym_wc" + INT4_SYM_WC = "int4_sym" + INT4_ASYM_WC = "int4_asym" class OpenVINOQuantizer(Quantizer): @@ -66,8 +75,12 @@ def __init__( - INT8_MIXED: INT8 asymmetric quantization for activations, symmetric for weights. - INT8_TRANSFORMER: Optimized INT8 quantization for transformer-based models Default value is INT8_SYM. + - INT4_SYM: Symmetric INT4 Weights-Only Compression + - INT4_ASYM: Asymmetric INT4 Weights-Only Compression :param kwargs: Arguments to pass to the NNCF MinMaxQuantization algorithm. """ + self.mode = mode + self.wc_modes = [QuantizationMode.INT4_ASYM_WC,QuantizationMode.INT4_SYM_WC, QuantizationMode.INT8_ASYM_WC, QuantizationMode.INT8_SYM_WC] if mode == QuantizationMode.INT8_SYM: preset = quantization.structs.QuantizationPreset.PERFORMANCE model_type = None @@ -77,11 +90,24 @@ def __init__( else: preset = None model_type = nncf.parameters.ModelType.TRANSFORMER - self._min_max_algo = ( - nncf.quantization.algorithms.min_max.algorithm.MinMaxQuantization( - preset=preset, model_type=model_type, **kwargs + if(self.mode not in self.wc_modes): + self._min_max_algo = ( + nncf.quantization.algorithms.min_max.algorithm.MinMaxQuantization( + preset=preset, model_type=model_type, **kwargs + ) ) - ) + self._algo = self._min_max_algo + else: + weight_compression_configuration = get_weight_compression_configuration( + mode.value.replace("_wc", ""), # Mode value has to match NNCF CompressWeightsMode + **kwargs + ) + self._weight_compression_algo = nncf.quantization.algorithms.weight_compression.algorithm.WeightCompression( + subset_size=None, + **weight_compression_configuration + ) + self._algo = 
self._weight_compression_algo + def set_ignored_scope( self, @@ -102,7 +128,7 @@ def set_ignored_scope( :param validate: If set to True, then a RuntimeError will be raised if any ignored scope does not match in the model graph. """ - self._min_max_algo.set_ignored_scope( + self._algo.set_ignored_scope( nncf.IgnoredScope( names=names or [], patterns=patterns or [], @@ -115,63 +141,80 @@ def set_ignored_scope( def get_nncf_quantization_setup( self, model: torch.fx.GraphModule, nncf_graph: NNCFGraph ) -> quantization.quantizer_setup.SingleConfigQuantizerSetup: - self._min_max_algo._set_backend_entity(model) - return self._min_max_algo.find_quantization_setup(model, nncf_graph) + self._algo._set_backend_entity(model) + return self._algo.find_quantization_setup(model, nncf_graph) def annotate(self, model: torch.fx.GraphModule) -> torch.fx.GraphModule: nncf_graph = nncf_fx.nncf_graph_builder.GraphConverter.create_nncf_graph(model) - quantization_setup = self.get_nncf_quantization_setup(model, nncf_graph) - + graph = model.graph node_vs_torch_annotation: DefaultDict[torch.fx.Node, QuantizationAnnotation] = ( defaultdict(QuantizationAnnotation) ) + # Serperate into annotation for quantize and compress + if(self.mode in self.wc_modes): + self._algo.set_backend_entity(model) + nodes_to_compress = self._algo.get_nodes_to_compress(nncf_graph) + for node in nodes_to_compress: + quantization_insertion_point = quantization.quantizer_setup.WeightQuantizationInsertionPoint(target_node_name=node.node_name) + group_size = self._algo._group_size + num_bits = 4 if self.mode in [QuantizationMode.INT4_SYM_WC,QuantizationMode.INT4_ASYM_WC] else 8 + qmode = QuantizationScheme.SYMMETRIC if self.mode in [QuantizationMode.INT4_SYM_WC,QuantizationMode.INT8_SYM_WC] else QuantizationScheme.ASYMMETRIC + nncf_qconfig = QuantizerConfig(num_bits=num_bits, mode=qmode) + qp = quantization.quantizer_setup.SingleConfigQuantizationPoint(qip=quantization_insertion_point, qconfig=nncf_qconfig, 
directly_quantized_operator_node_names=[node]) + edge_or_node, annotation = self._get_edge_or_node_and_annotation( + graph, nncf_graph, qp, node_vs_torch_annotation + ) + qspec: QuantizationSpecBase = self._get_torch_ao_qspec_from_nncf_config(qp, group_size=group_size, weights_only=True) + self._fill_torch_ao_annotation(edge_or_node, qspec, annotation) + else: + quantization_setup = self.get_nncf_quantization_setup(model, nncf_graph) - for qp in quantization_setup.quantization_points.values(): - edge_or_node, annotation = self._get_edge_or_node_and_annotation( - graph, nncf_graph, qp, node_vs_torch_annotation - ) - qspec: QuantizationSpecBase = self._get_torch_ao_qspec_from_qp(qp) - self._fill_torch_ao_annotation(edge_or_node, qspec, annotation) + for qp in quantization_setup.quantization_points.values(): + edge_or_node, annotation = self._get_edge_or_node_and_annotation( + graph, nncf_graph, qp, node_vs_torch_annotation + ) + qspec: QuantizationSpecBase = self._get_torch_ao_qspec_from_nncf_config(qp) + self._fill_torch_ao_annotation(edge_or_node, qspec, annotation) - for quantizer_ids in quantization_setup.unified_scale_groups.values(): + for quantizer_ids in quantization_setup.unified_scale_groups.values(): - root_quantizer_id = self._get_unified_scales_root_quantizer_id( - nncf_graph, quantizer_ids, quantization_setup - ) - root_qp = quantization_setup.quantization_points[root_quantizer_id] + root_quantizer_id = self._get_unified_scales_root_quantizer_id( + nncf_graph, quantizer_ids, quantization_setup + ) + root_qp = quantization_setup.quantization_points[root_quantizer_id] - if any( - root_qp.qconfig != quantization_setup.quantization_points[q_id].qconfig - for q_id in quantizer_ids - ): - qps = [ - quantization_setup.quantization_points[q_id] + if any( + root_qp.qconfig != quantization_setup.quantization_points[q_id].qconfig for q_id in quantizer_ids - ] - msg = ( - "Different quantization configs are set to one unified scale group:" - 
f"{[(qp.insertion_point.__dict__, str(qp.qconfig)) for qp in qps]}" + ): + qps = [ + quantization_setup.quantization_points[q_id] + for q_id in quantizer_ids + ] + msg = ( + "Different quantization configs are set to one unified scale group:" + f"{[(qp.insertion_point.__dict__, str(qp.qconfig)) for qp in qps]}" + ) + raise nncf.InternalError(msg) + + root_target_node = nncf_fx.node_utils.get_graph_node_by_name( + graph, root_qp.insertion_point.target_node_name + ) + root_edge_or_node = self._get_edge_or_node( + root_target_node, root_qp, nncf_graph ) - raise nncf.InternalError(msg) - - root_target_node = nncf_fx.node_utils.get_graph_node_by_name( - graph, root_qp.insertion_point.target_node_name - ) - root_edge_or_node = self._get_edge_or_node( - root_target_node, root_qp, nncf_graph - ) - for quantizer_id in quantizer_ids: - if quantizer_id == root_quantizer_id: - continue + for quantizer_id in quantizer_ids: + if quantizer_id == root_quantizer_id: + continue - qspec = SharedQuantizationSpec(root_edge_or_node) - qp = quantization_setup.quantization_points[quantizer_id] - edge_or_node, annotation = self._get_edge_or_node_and_annotation( - graph, nncf_graph, qp, node_vs_torch_annotation - ) - self._fill_torch_ao_annotation(edge_or_node, qspec, annotation) + qspec = SharedQuantizationSpec(root_edge_or_node) + qp = quantization_setup.quantization_points[quantizer_id] + edge_or_node, annotation = self._get_edge_or_node_and_annotation( + graph, nncf_graph, qp, node_vs_torch_annotation + ) + self._fill_torch_ao_annotation(edge_or_node, qspec, annotation) for node, annotation in node_vs_torch_annotation.items(): assert QUANT_ANNOTATION_KEY not in node.meta @@ -295,8 +338,8 @@ def _fill_torch_ao_annotation( annotation_to_update.input_qspec_map[edge_or_node[0]] = qspec @staticmethod - def _get_torch_ao_qspec_from_qp( - qp: quantization.quantizer_setup.QuantizationPointBase, + def _get_torch_ao_qspec_from_nncf_config( + qp: quantization.quantizer_setup.QuantizationPointBase, 
group_size=-1, weights_only=False ) -> QuantizationSpec: """ Retrieves the quantization configuration from the given quantization point and @@ -307,11 +350,10 @@ def _get_torch_ao_qspec_from_qp( """ # Eps value is copied from nncf/torch/quantization/layers.py extra_args = {"eps": 1e-16} - qconfig = qp.qconfig is_weight = qp.is_weight_quantization_point() + qconfig = qp.qconfig observer: Type[UniformQuantizationObserverBase] - if qconfig.per_channel: torch_qscheme = ( torch.per_channel_symmetric @@ -325,11 +367,27 @@ def _get_torch_ao_qspec_from_qp( else torch.per_tensor_affine ) if is_weight: - observer = PerChannelMinMaxObserver - quant_min = -128 - quant_max = 127 - dtype = torch.int8 - channel_axis = 0 + mapping_type = MappingType.SYMMETRIC if qconfig.mode == QuantizationScheme.SYMMETRIC else MappingType.ASYMMETRIC + if qconfig.num_bits==4: + extra_args["mapping_type"] = mapping_type + extra_args["target_dtype"] = torch.int8 + extra_args["granularity"] = PerGroup(group_size=group_size) + observer = PTPerBlockParamObserver + quant_min = -8 + quant_max = 7 + dtype = torch.int8 + channel_axis = 0 + elif qconfig.num_bits==8: + observer = NNCFInt8observer if weights_only else PerChannelMinMaxObserver + quant_min = -128 + quant_max = 127 + dtype = torch.int8 + channel_axis = 0 + torch_qscheme = ( + torch.per_channel_symmetric + if qconfig.mode is quantization.structs.QuantizationScheme.SYMMETRIC + else torch.per_channel_affine + ) else: observer = ( HistogramObserver From 3d88a4ea80179ba5b4498a47b3365440c81a37bd Mon Sep 17 00:00:00 2001 From: Cavus Mustafa Date: Tue, 8 Jul 2025 12:45:43 -0700 Subject: [PATCH 07/85] Moved all openvino llama example changes into export_llama_lib --- backends/openvino/partitioner.py | 1 + examples/models/llama/export_llama_lib.py | 85 ++++++++++++++++++++--- extension/llm/export/builder.py | 67 +++--------------- 3 files changed, 86 insertions(+), 67 deletions(-) diff --git a/backends/openvino/partitioner.py 
b/backends/openvino/partitioner.py index b1e7f5d436a..b508a698cab 100644 --- a/backends/openvino/partitioner.py +++ b/backends/openvino/partitioner.py @@ -131,6 +131,7 @@ def ops_to_not_decompose( torch.ops.aten.upsample_bilinear2d.vec, torch.ops.aten.upsample_nearest2d.default, torch.ops.aten.upsample_nearest2d.vec, + torch.ops.aten.stack.default, ] return (ops_not_decompose, None) diff --git a/examples/models/llama/export_llama_lib.py b/examples/models/llama/export_llama_lib.py index 1ea82e3224a..ecf0ea72dca 100644 --- a/examples/models/llama/export_llama_lib.py +++ b/examples/models/llama/export_llama_lib.py @@ -862,6 +862,73 @@ def _to_edge_and_lower_llama_xnnpack( return builder.to_executorch(passes=additional_passes) +def _to_edge_and_lower_llama_openvino( + builder_exported, + modelname, + additional_passes, + openvino_device: str = "CPU", + nncf_compression: bool = False, + verbose: bool = False, +) -> LLMEdgeManager: # noqa: C901 + partitioners = [] + + # Add OpenVINO partitioner + partitioners.append(get_openvino_partitioner(openvino_device)) + modelname = f"openvino_{modelname}" + + + logging.info("Lowering model using following partitioner(s): ") + for partitioner in partitioners: + logging.info(f"--> {partitioner.__class__.__name__}") + + # Use NNCF compression if enabled + # TODO: Enable passing OpenVINOQuantizer as a parameter to pt2e_quantize + if nncf_compression: + try: + import nncf + from functools import partial + from pytorch_tokenizers import get_tokenizer + except ImportError: + raise ImportError( + "Please install nncf via backends/openvino/requirements.txt" + ) + tokenizer = get_tokenizer(builder_exported.tokenizer_path) + + def transform_fn( + prompts: str, tokenizer + ): + tokenized_text = tokenizer.encode(prompts, bos=False, eos=False) + logging.error(tokenized_text) + + inputs = () + inputs = ( + torch.tensor(tokenized_text).unsqueeze(0), + {"input_pos": torch.tensor([0])}, + ) + + return inputs + + builder_exported.calibration_data = 
[builder_exported.calibration_data] if isinstance(builder_exported.calibration_data, str) else builder_exported.calibration_data + builder_exported.calibration_data = [word for prompt in builder_exported.calibration_data for word in prompt.split()] if not builder_exported.dynamic_shapes else builder_exported.calibration_data + + builder_exported.pre_autograd_graph_module = nncf.compress_weights( + builder_exported.pre_autograd_graph_module, + dataset=nncf.Dataset(builder_exported.calibration_data, transform_func=partial(transform_fn, tokenizer=tokenizer)), + mode=nncf.CompressWeightsMode.INT4_SYM, + ratio=0.8, + sensitivity_metric=nncf.SensitivityMetric.HESSIAN_INPUT_ACTIVATION, + ) + + builder = builder_exported.to_edge_transform_and_lower( + partitioners + ) + + if verbose: + print_delegation_info(builder.edge_manager.exported_program().graph_module) + + return builder.to_executorch(passes=additional_passes) + + def _to_edge_and_lower_llama( # noqa: C901 builder_exported, modelname, @@ -873,8 +940,6 @@ def _to_edge_and_lower_llama( # noqa: C901 mps: bool = False, coreml: bool = False, qnn: bool = False, - openvino: bool = False, - openvino_device: str = "CPU", dtype_override: str = "fp32", enable_dynamic_shape: bool = True, use_kv_cache: bool = False, @@ -919,10 +984,6 @@ def _to_edge_and_lower_llama( # noqa: C901 partitioners.append(coreml_partitioner) modelname = f"coreml_{modelname}" - if openvino: - partitioners.append(get_openvino_partitioner(openvino_device)) - modelname = f"openvino_{modelname}" - if qnn: logging.warning( "The model definition in current repro is not performant, please refer to the instruction" @@ -1078,6 +1139,15 @@ def _export_llama(llm_config: LlmConfig) -> LLMEdgeManager: # noqa: C901 generate_etrecord=llm_config.debug.generate_etrecord, verbose=llm_config.debug.verbose, ) + elif llm_config.backend.openvino.enabled: + builder = _to_edge_and_lower_llama_openvino( + builder_exported, + modelname, + additional_passes, + 
openvino_device=llm_config.backend.openvino.device, + nncf_compression=llm_config.backend.openvino.nncf_compression, + verbose=llm_config.debug.verbose, + ) else: builder = _to_edge_and_lower_llama( builder_exported, @@ -1090,8 +1160,6 @@ def _export_llama(llm_config: LlmConfig) -> LLMEdgeManager: # noqa: C901 mps=llm_config.backend.mps.enabled, coreml=llm_config.backend.coreml.enabled, qnn=llm_config.backend.qnn.enabled, - openvino=llm_config.backend.openvino.enabled, - openvino_device=llm_config.backend.openvino.device, dtype_override=llm_config.model.dtype_override, enable_dynamic_shape=llm_config.model.enable_dynamic_shape, use_kv_cache=llm_config.model.use_kv_cache, @@ -1214,7 +1282,6 @@ def _load_llama_model(llm_config: LlmConfig) -> "LLMEdgeManager": use_legacy_export=llm_config.backend.qnn.enabled, save_exported_program=llm_config.export.export_only, verbose=llm_config.debug.verbose, - nncf_compression=llm_config.backend.openvino.nncf_compression, metadata=_load_llama_model_metadata( WeightType.FAIRSEQ2 if llm_config.base.fairseq2 else WeightType.LLAMA, llm_config.model.use_kv_cache, diff --git a/extension/llm/export/builder.py b/extension/llm/export/builder.py index a2bfaeae22d..4128bfd8198 100644 --- a/extension/llm/export/builder.py +++ b/extension/llm/export/builder.py @@ -98,7 +98,6 @@ def __init__( dynamic_shapes: Optional[Any] = None, use_legacy_export: bool = False, save_exported_program: bool = False, - nncf_compression: bool = False ): # Store necessary constructor arguments. self.model = model @@ -120,7 +119,6 @@ def __init__( self.dynamic_shapes = dynamic_shapes self.use_legacy_export = use_legacy_export self.save_exported_program = save_exported_program - self.nncf_compression = nncf_compression # Note: treat this as the source of truth for the result of # torch.export'ing a model. 
If the overall ExportedProgram is needed, @@ -430,41 +428,6 @@ def pt2e_quantize(self, quantizers: Optional[List[Quantizer]]) -> "LLMEdgeManage DuplicateDynamicQuantChainPass()(m) self.pre_autograd_graph_module = m return self - elif (self.nncf_compression): - try: - import nncf - from functools import partial - except ImportError: - raise ImportError( - "Please install nncf via backends/openvino/requirements.txt" - ) - tokenizer = get_tokenizer(self.tokenizer_path) - - def transform_fn( - prompts: str, tokenizer - ): - tokenized_text = tokenizer.encode(prompts, bos=False, eos=False) - logging.error(tokenized_text) - - inputs = () - inputs = ( - torch.tensor(tokenized_text).unsqueeze(0), - {"input_pos": torch.tensor([0])}, - ) - - return inputs - - self.calibration_data = [self.calibration_data] if isinstance(self.calibration_data, str) else self.calibration_data - self.calibration_data = [word for prompt in self.calibration_data for word in prompt.split()] if not self.dynamic_shapes else self.calibration_data - - self.pre_autograd_graph_module = nncf.compress_weights( - self.pre_autograd_graph_module, - dataset=nncf.Dataset(self.calibration_data, transform_func=partial(transform_fn, tokenizer=tokenizer)), - mode=nncf.CompressWeightsMode.INT4_SYM, - ratio=0.8, - sensitivity_metric=nncf.SensitivityMetric.HESSIAN_INPUT_ACTIVATION, - ) - return self else: logging.info("No quantizer provided, passing...") return self @@ -492,27 +455,15 @@ def export_to_edge(self) -> "LLMEdgeManager": ) with override_export_behaviour: - if (self.nncf_compression): - from executorch.backends.openvino.utils import nncf_export_to_edge - self.edge_manager = nncf_export_to_edge( - self.pre_autograd_graph_module, # pyre-fixme[6] - self.example_inputs, - example_kwarg_inputs=self.example_kwarg_inputs, - dynamic_shapes=dynamic_shape, - edge_constant_methods=self.metadata, - edge_compile_config=edge_config, - verbose=self.verbose, - ) - else: - self.edge_manager = export_to_edge( - 
self.pre_autograd_graph_module, # pyre-fixme[6] - self.example_inputs, - example_kwarg_inputs=self.example_kwarg_inputs, - dynamic_shapes=dynamic_shape, - edge_constant_methods=self.metadata, - edge_compile_config=edge_config, - verbose=self.verbose, - ) + self.edge_manager = export_to_edge( + self.pre_autograd_graph_module, # pyre-fixme[6] + self.example_inputs, + example_kwarg_inputs=self.example_kwarg_inputs, + dynamic_shapes=dynamic_shape, + edge_constant_methods=self.metadata, + edge_compile_config=edge_config, + verbose=self.verbose, + ) return self def to_backend(self, partitioners: Optional[List[Partitioner]]) -> "LLMEdgeManager": From e81f60d895fe235e00fa11567f5f85e6d6e25d08 Mon Sep 17 00:00:00 2001 From: Cavus Mustafa Date: Tue, 8 Jul 2025 12:57:22 -0700 Subject: [PATCH 08/85] Removed openvino utils.py since it is not needed anymore --- backends/openvino/utils.py | 66 -------------------------------------- 1 file changed, 66 deletions(-) delete mode 100644 backends/openvino/utils.py diff --git a/backends/openvino/utils.py b/backends/openvino/utils.py deleted file mode 100644 index ec4bebe0d6d..00000000000 --- a/backends/openvino/utils.py +++ /dev/null @@ -1,66 +0,0 @@ -# Copyright (c) Intel Corporation -# -# Licensed under the BSD License (the "License"); you may not use this file -# except in compliance with the License. See the license file found in the -# LICENSE file in the root directory of this source tree. 
- -import logging - -from typing import Any, Dict, Optional, Tuple, Union - -import executorch.exir as exir - -import torch -from executorch.exir import EdgeProgramManager -from executorch.exir.program._program import to_edge_with_preserved_ops -from executorch.exir.tracer import Value -from torch.export import ExportedProgram -from executorch.extension.export_util.utils import _to_core_aten - -_EDGE_COMPILE_CONFIG = exir.EdgeCompileConfig( - _check_ir_validity=True, - _skip_dim_order=True, # TODO(T189114319): Reuse dim order op after solving the ios oss issue -) - -def nncf_core_aten_to_edge( - core_aten_exir_ep: ExportedProgram, - edge_constant_methods: Optional[Dict[str, Any]] = None, - edge_compile_config=None, - verbose=True, -) -> EdgeProgramManager: - if not edge_compile_config: - edge_compile_config = exir.EdgeCompileConfig( - _check_ir_validity=False, # quant ops currently break ir verification - ) - edge_manager: EdgeProgramManager = to_edge_with_preserved_ops( - core_aten_exir_ep, - constant_methods=edge_constant_methods, - compile_config=edge_compile_config, - preserve_ops=[torch.ops.aten.stack.default,], - ) - if verbose: - logging.info(f"Exported graph:\n{edge_manager.exported_program()}") - return edge_manager - -def nncf_export_to_edge( - model: Union[torch.fx.GraphModule, torch.nn.Module], - example_inputs: Tuple[Value, ...], - *, - example_kwarg_inputs: Optional[Dict] = None, - dynamic_shapes: Optional[Union[Dict[str, Any], Tuple[Any]]] = None, - edge_constant_methods: Optional[Dict[str, Any]] = None, - edge_compile_config=_EDGE_COMPILE_CONFIG, - strict=True, - verbose=True, -) -> EdgeProgramManager: - core_aten_ep = _to_core_aten( - model, - example_inputs, - example_kwarg_inputs=example_kwarg_inputs, - dynamic_shapes=dynamic_shapes, - strict=strict, - verbose=verbose, - ) - return nncf_core_aten_to_edge( - core_aten_ep, edge_constant_methods, edge_compile_config, verbose=verbose - ) From 457a868cb01bc1a4be090da18b3e431cf3b506d0 Mon Sep 17 
00:00:00 2001 From: Aamir Nazir Date: Wed, 9 Jul 2025 11:53:26 +0400 Subject: [PATCH 09/85] Update nncf_observers.py --- .../quantizer/observers/nncf_observers.py | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/backends/openvino/quantizer/observers/nncf_observers.py b/backends/openvino/quantizer/observers/nncf_observers.py index 54f4348e0ed..977458801a4 100644 --- a/backends/openvino/quantizer/observers/nncf_observers.py +++ b/backends/openvino/quantizer/observers/nncf_observers.py @@ -57,12 +57,14 @@ def convert(self, model: torch.fx.GraphModule, observer_node: torch.fx.Node): else: decompressor = INT4SymmetricWeightsDecompressor(scale, q_weight.shape, original_weight.shape, original_weight.dtype) packed_q_weight = decompressor.pack_weight(q_weight) - new_weight_node = constant_update_fn(model, observer_node, packed_q_weight, input_port_id=0) - decompressor_name = f'NNCFDecompressor_{new_weight_node.name}' + constant_update_fn(model, observer_node, packed_q_weight, input_port_id=0) + compressed_weight_name = observer_node.all_input_nodes[0].name + decompressor_suffix = "_".join(compressed_weight_name.replace(".", "_").split("_")[:-2]) + decompressor_name = f"{decompressor.quantization_mode}_weights_decompressor_{decompressor_suffix}" module_insertion_transformation_builder( decompressor, - [PTTargetPoint(TargetType.OPERATOR_POST_HOOK, target_node_name=new_weight_node.name)], + [PTTargetPoint(TargetType.OPERATOR_POST_HOOK, target_node_name=compressed_weight_name)], decompressor_name, )(model) decomp_node = observer_node.args[0] @@ -101,14 +103,16 @@ def convert(self, model: torch.fx.GraphModule, observer_node: torch.fx.Node): else: decompressor = INT8SymmetricWeightsDecompressor(scale, original_weight.dtype) packed_q_weight = decompressor.pack_weight(q_weight) - new_weight_node = constant_update_fn(model, observer_node, packed_q_weight, input_port_id=0) - decompressor_name = f'NNCFDecompressor_{new_weight_node.name}' + 
constant_update_fn(model, observer_node, packed_q_weight, input_port_id=0) + compressed_weight_name = observer_node.all_input_nodes[0].name + decompressor_suffix = "_".join(compressed_weight_name.replace(".", "_").split("_")[:-2]) + decompressor_name = f"{decompressor.quantization_mode}_weights_decompressor_{decompressor_suffix}" module_insertion_transformation_builder( decompressor, - [PTTargetPoint(TargetType.OPERATOR_POST_HOOK, target_node_name=new_weight_node.name)], + [PTTargetPoint(TargetType.OPERATOR_POST_HOOK, target_node_name=compressed_weight_name)], decompressor_name, )(model) decomp_node = observer_node.args[0] observer_node.replace_all_uses_with(decomp_node) - model.graph.erase_node(observer_node) \ No newline at end of file + model.graph.erase_node(observer_node) From d1e9330b53f96068590b767ec8896a9317a1e954 Mon Sep 17 00:00:00 2001 From: Cavus Mustafa Date: Mon, 14 Jul 2025 18:55:40 -0700 Subject: [PATCH 10/85] Add export llama runner build option into openvino build script --- backends/openvino/scripts/openvino_build.sh | 28 +++++++++++++++++++-- 1 file changed, 26 insertions(+), 2 deletions(-) diff --git a/backends/openvino/scripts/openvino_build.sh b/backends/openvino/scripts/openvino_build.sh index c10a3bb4eeb..add946e15ae 100755 --- a/backends/openvino/scripts/openvino_build.sh +++ b/backends/openvino/scripts/openvino_build.sh @@ -17,7 +17,7 @@ main() { # Set build directory local build_dir="cmake-out" - # Create and enter the build directory + # Enter the Executorch root directory cd "$EXECUTORCH_ROOT" rm -rf "${build_dir}" @@ -32,6 +32,7 @@ main() { -DEXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR=ON \ -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ -DEXECUTORCH_BUILD_OPENVINO_EXECUTOR_RUNNER=ON \ + -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \ -B"${build_dir}" @@ -42,7 +43,7 @@ main() { elif [[ "$build_type" == "--enable_python" ]]; then echo "Building Python Package with Pybinding" - # Create and enter the build directory + # Enter the Executorch root 
directory cd "$EXECUTORCH_ROOT" ./install_executorch.sh --clean @@ -58,6 +59,29 @@ main() { # Install torchao pip install third-party/ao + # If the first arguments is --llama_runner, build export llama runner binary + # Note: c++ runtime with openvino backend should be built before building export llama runner + elif [[ "$build_type" == "--llama_runner" ]]; then + echo "Building Export Llama Runner" + + # Set build directory + local build_dir="cmake-out" + + # Enter the Executorch root directory + cd "$EXECUTORCH_ROOT" + + # Configure the project with CMake + # Note: Add any additional configuration options you need here + cmake -DBUILD_TESTING=OFF \ + -DCMAKE_INSTALL_PREFIX="${build_dir}" \ + -DCMAKE_BUILD_TYPE=Release \ + -DEXECUTORCH_BUILD_OPENVINO=ON \ + -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \ + -B"${build_dir}"/examples/models/llama \ + examples/models/llama + + # Build the export llama runner + cmake --build cmake-out/examples/models/llama -j$(nproc) --config Release else echo "Error: Argument is not valid: $build_type" exit 1 # Exit the script with an error code From cedab9d875e2965f4faaa90e16a1be1adc8d507d Mon Sep 17 00:00:00 2001 From: Mustafa Cavus Date: Mon, 14 Jul 2025 19:10:02 -0700 Subject: [PATCH 11/85] Update README.md --- examples/openvino/README.md | 48 +++++++++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) diff --git a/examples/openvino/README.md b/examples/openvino/README.md index 8856ccdce4e..dbce5df1b55 100644 --- a/examples/openvino/README.md +++ b/examples/openvino/README.md @@ -183,3 +183,51 @@ Run inference with a given model for 10 iterations: --model_path=model.pte \ --num_executions=10 ``` + +# Export Llama with OpenVINO Backend + +## Download the Model +Follow the [instructions](../../examples/models/llama#step-2-prepare-model) to download the required model files. Export Llama with OpenVINO backend is only verified with Llama-3.2-1B variants at this time. 
+ +## Environment Setup +Follow the [instructions](../../backends/openvino/README.md) of **Prerequisites** and **Setup** in `backends/openvino/README.md` to set up the OpenVINO backend. + +## Export the model: +Execute the commands below to export the model. Update the model file paths to match the location where your model is downloaded. + +``` +LLAMA_CHECKPOINT=/consolidated.00.pth +LLAMA_PARAMS=/params.json +LLAMA_TOKENIZER=/tokenizer.model + +python -u -m examples.models.llama.export_llama \ + --model "llama3_2" \ + --checkpoint "${LLAMA_CHECKPOINT:?}" \ + --params "${LLAMA_PARAMS:?}" \ + -kv \ + --openvino \ + -d fp32 \ + --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' \ + --output_name="llama.pte" \ + --verbose \ + --disable_dynamic_shape \ + --tokenizer_path "${LLAMA_TOKENIZER:?}" \ + --nncf_compression +``` + +## Build OpenVINO C++ Runtime with Llama Runner: +First, build the backend libraries by executing the script below in `/backends/openvino/scripts` folder: +```bash +./openvino_build.sh +``` +Then, build the llama runner by executing the script below (with `--llama_runner` argument) also in `/backends/openvino/scripts` folder: +```bash +./openvino_build.sh --llama_runner +``` +The executable is saved in `/cmake-out/examples/models/llama/llama_main` + +## Execute Inference Using Llama Runner +Update the model tokenizer file path to match the location where your model is downloaded and replace the prompt. 
+``` +./cmake-out/examples/models/llama/llama_main --model_path=llama.pte --tokenizer_path=/tokenizer.model --prompt="Your custom prompt" +``` From e54f4c7ef6207733f0907cbe1030124926f6550c Mon Sep 17 00:00:00 2001 From: suryasidd Date: Tue, 19 Aug 2025 15:56:35 -0700 Subject: [PATCH 12/85] Added CMAKE EXPORT Changes --- backends/openvino/CMakeLists.txt | 12 +++++++++--- backends/openvino/scripts/openvino_build.sh | 8 +++----- examples/models/llama/CMakeLists.txt | 3 +-- 3 files changed, 13 insertions(+), 10 deletions(-) diff --git a/backends/openvino/CMakeLists.txt b/backends/openvino/CMakeLists.txt index cb240805665..a2b982babab 100644 --- a/backends/openvino/CMakeLists.txt +++ b/backends/openvino/CMakeLists.txt @@ -38,7 +38,11 @@ add_library(openvino_backend STATIC .) target_compile_options(openvino_backend PRIVATE -frtti -fexceptions) # Include Executorch directories -target_include_directories(openvino_backend PUBLIC ${COMMON_INCLUDE_DIRS}) +target_include_directories(openvino_backend + PUBLIC + $ +) + # Link OpenVINO and ExecuteTorch core libraries target_link_libraries( @@ -77,5 +81,7 @@ if(EXECUTORCH_BUILD_OPENVINO_EXECUTOR_RUNNER) ) endif() -# Install OpenVINO backend library to the lib directory -install(TARGETS openvino_backend DESTINATION lib) +# Install OpenVINO backend library and export target +install(TARGETS openvino_backend + EXPORT ExecuTorchTargets + DESTINATION lib) diff --git a/backends/openvino/scripts/openvino_build.sh b/backends/openvino/scripts/openvino_build.sh index 7f903086163..08741840ddb 100755 --- a/backends/openvino/scripts/openvino_build.sh +++ b/backends/openvino/scripts/openvino_build.sh @@ -33,6 +33,8 @@ main() { -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ -DEXECUTORCH_BUILD_OPENVINO_EXECUTOR_RUNNER=ON \ -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \ + -DEXECUTORCH_BUILD_EXTENSION_LLM=ON \ + -DEXECUTORCH_BUILD_EXTENSION_LLM_RUNNER=ON \ -B"${build_dir}" @@ -72,14 +74,10 @@ main() { # Configure the project with CMake # Note: Add any 
additional configuration options you need here - cmake -DBUILD_TESTING=OFF \ - -DCMAKE_INSTALL_PREFIX="${build_dir}" \ + cmake -DCMAKE_INSTALL_PREFIX="${build_dir}" \ -DCMAKE_BUILD_TYPE=Release \ - -DEXECUTORCH_BUILD_OPENVINO=ON \ - -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \ -B"${build_dir}"/examples/models/llama \ examples/models/llama - # Build the export llama runner cmake --build cmake-out/examples/models/llama -j$(nproc) --config Release else diff --git a/examples/models/llama/CMakeLists.txt b/examples/models/llama/CMakeLists.txt index c469a69596c..a2a1f4efa05 100644 --- a/examples/models/llama/CMakeLists.txt +++ b/examples/models/llama/CMakeLists.txt @@ -192,9 +192,8 @@ endif() # Openvino backend if(TARGET openvino_backend) find_package(OpenVINO REQUIRED) - target_link_libraries(openvino_backend INTERFACE openvino::runtime executorch_core) list(APPEND link_libraries openvino_backend) - target_link_options_shared_lib(openvino_backend) + executorch_target_link_options_shared_lib(openvino_backend) endif() if(TARGET coremldelegate) From c12a4bafd441be0a77f909c063fcb883a8ac900b Mon Sep 17 00:00:00 2001 From: Cavus Mustafa Date: Wed, 20 Aug 2025 18:07:33 -0700 Subject: [PATCH 13/85] code formating updates --- backends/openvino/CMakeLists.txt | 14 +- backends/openvino/partitioner.py | 38 +++-- .../quantizer/observers/nncf_observers.py | 133 +++++++++++++----- backends/openvino/quantizer/quantizer.py | 95 +++++++++---- backends/openvino/runtime/OpenvinoBackend.cpp | 26 ++-- examples/models/llama/export_llama_lib.py | 45 +++--- extension/llm/export/partitioner_lib.py | 6 +- 7 files changed, 243 insertions(+), 114 deletions(-) diff --git a/backends/openvino/CMakeLists.txt b/backends/openvino/CMakeLists.txt index a2b982babab..94f47c5e929 100644 --- a/backends/openvino/CMakeLists.txt +++ b/backends/openvino/CMakeLists.txt @@ -38,12 +38,10 @@ add_library(openvino_backend STATIC .) 
target_compile_options(openvino_backend PRIVATE -frtti -fexceptions) # Include Executorch directories -target_include_directories(openvino_backend - PUBLIC - $ +target_include_directories( + openvino_backend PUBLIC $ ) - # Link OpenVINO and ExecuteTorch core libraries target_link_libraries( openvino_backend PRIVATE openvino::runtime executorch_core @@ -82,6 +80,8 @@ if(EXECUTORCH_BUILD_OPENVINO_EXECUTOR_RUNNER) endif() # Install OpenVINO backend library and export target -install(TARGETS openvino_backend - EXPORT ExecuTorchTargets - DESTINATION lib) +install( + TARGETS openvino_backend + EXPORT ExecuTorchTargets + DESTINATION lib +) diff --git a/backends/openvino/partitioner.py b/backends/openvino/partitioner.py index b508a698cab..a2920285f99 100644 --- a/backends/openvino/partitioner.py +++ b/backends/openvino/partitioner.py @@ -25,12 +25,14 @@ from torch.fx.passes.infra.partitioner import CapabilityBasedPartitioner from torch.fx.passes.operator_support import OperatorSupportBase + class PatternNode: op_types = {} def __init__(self): self.op_types = {} + class OpenvinoOperatorsSupport(OperatorSupportBase): def __init__( @@ -135,18 +137,24 @@ def ops_to_not_decompose( ] return (ops_not_decompose, None) - def check_pattern(self, node: torch.fx.Node, pattern: PatternNode, enabled_ops: list) -> bool: + def check_pattern( + self, node: torch.fx.Node, pattern: PatternNode, enabled_ops: list + ) -> bool: if node.op == "call_function": if ("call_function" + ":" + str(node.target.__name__)) in pattern.op_types: pt_input_nodes = node.all_input_nodes - pattern_input_ops = pattern.op_types["call_function" + ":" + str(node.target.__name__)] + pattern_input_ops = pattern.op_types[ + "call_function" + ":" + str(node.target.__name__) + ] if pattern_input_ops is None: enabled_ops.append(node) return True if len(pt_input_nodes) != len(pattern_input_ops): return False for i in range(len(pt_input_nodes)): - if not self.check_pattern(pt_input_nodes[i], pattern_input_ops[i], 
enabled_ops): + if not self.check_pattern( + pt_input_nodes[i], pattern_input_ops[i], enabled_ops + ): return False enabled_ops.append(node) return True @@ -167,14 +175,24 @@ def capture_nncf_patterns(self, graph_module: torch.fx.GraphModule): const_node.op_types["get_attr"] = None const_node.op_types["placeholder"] = None bitwise_right_shift_node = PatternNode - bitwise_right_shift_node.op_types["call_function:aten.bitwise_right_shift.Tensor_Scalar"] = [const_node] + bitwise_right_shift_node.op_types[ + "call_function:aten.bitwise_right_shift.Tensor_Scalar" + ] = [const_node] bitwise_and_node = PatternNode - bitwise_and_node.op_types["call_function:aten.bitwise_and.Scalar"] = [const_node] + bitwise_and_node.op_types["call_function:aten.bitwise_and.Scalar"] = [ + const_node + ] stack_node = PatternNode - stack_node.op_types["call_function:aten.stack.default"] = [bitwise_and_node, bitwise_right_shift_node] + stack_node.op_types["call_function:aten.stack.default"] = [ + bitwise_and_node, + bitwise_right_shift_node, + ] for node in graph_module.graph.nodes: - if str(node.op) == "call_function" and str(node.target.__name__) == "aten.stack.default": + if ( + str(node.op) == "call_function" + and str(node.target.__name__) == "aten.stack.default" + ): enabled_ops = [] pattern_match = self.check_pattern(node, stack_node, enabled_ops) if pattern_match: @@ -191,7 +209,11 @@ def partition(self, exported_program: ExportedProgram) -> PartitionResult: self.capture_nncf_patterns(exported_program.graph_module) partitioner = CapabilityBasedPartitioner( exported_program.graph_module, - OpenvinoOperatorsSupport(self._op_types_to_skip, self._op_names_to_skip, self._enabled_ops_by_name), + OpenvinoOperatorsSupport( + self._op_types_to_skip, + self._op_names_to_skip, + self._enabled_ops_by_name, + ), allows_single_node_partition=True, ) partition_list = partitioner.propose_partitions() diff --git a/backends/openvino/quantizer/observers/nncf_observers.py 
b/backends/openvino/quantizer/observers/nncf_observers.py index 977458801a4..aa531336d0c 100644 --- a/backends/openvino/quantizer/observers/nncf_observers.py +++ b/backends/openvino/quantizer/observers/nncf_observers.py @@ -7,38 +7,65 @@ from typing import Tuple import torch -from torch.ao.quantization.observer import MappingType, PerGroup, PerAxis, PerChannelMinMaxObserver, get_block_size +from nncf.experimental.torch.fx.node_utils import get_tensor_constant_from_node +from nncf.experimental.torch.fx.transformations import ( + constant_update_fn, + module_insertion_transformation_builder, +) +from nncf.parameters import CompressWeightsMode +from nncf.quantization.algorithms.weight_compression.config import ( + WeightCompressionConfig, +) + +from nncf.quantization.algorithms.weight_compression.weight_lowering import ( + do_integer_quantization, +) +from nncf.tensor.tensor import Tensor +from nncf.torch.graph.transformations.commands import PTTargetPoint, TargetType +from nncf.torch.quantization.layers import ( + INT4AsymmetricWeightsDecompressor, + INT4SymmetricWeightsDecompressor, + INT8AsymmetricWeightsDecompressor, + INT8SymmetricWeightsDecompressor, +) +from torch.ao.quantization.observer import ( + get_block_size, + MappingType, + PerAxis, + PerChannelMinMaxObserver, + PerGroup, +) from torch.ao.quantization.pt2e._affine_quantization import ( _get_reduction_params, AffineQuantizedMinMaxObserver, ) -from nncf.torch.quantization.layers import INT4AsymmetricWeightsDecompressor, INT4SymmetricWeightsDecompressor, INT8AsymmetricWeightsDecompressor, INT8SymmetricWeightsDecompressor -from nncf.experimental.torch.fx.transformations import constant_update_fn, module_insertion_transformation_builder -from nncf.experimental.torch.fx.node_utils import get_tensor_constant_from_node -from nncf.torch.graph.transformations.commands import PTTargetPoint, TargetType -from nncf.quantization.algorithms.weight_compression.weight_lowering import do_integer_quantization -from 
nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig -from nncf.parameters import CompressWeightsMode -from nncf.tensor.tensor import Tensor class PTPerBlockParamObserver(AffineQuantizedMinMaxObserver): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) - qmode = CompressWeightsMode.INT4_ASYM if self.mapping_type==MappingType.ASYMMETRIC else CompressWeightsMode.INT4_SYM - assert isinstance(self.granularity, PerGroup), "Only PerGroup granularity is supported" - self.wc_config = WeightCompressionConfig(mode=qmode, group_size=self.granularity.group_size) + qmode = ( + CompressWeightsMode.INT4_ASYM + if self.mapping_type == MappingType.ASYMMETRIC + else CompressWeightsMode.INT4_SYM + ) + assert isinstance( + self.granularity, PerGroup + ), "Only PerGroup granularity is supported" + self.wc_config = WeightCompressionConfig( + mode=qmode, group_size=self.granularity.group_size + ) def calculate_qparams(self, weight): assert hasattr(self, "min_val") and hasattr( self, "max_val" ), "Expecting the observer has min_val and max_val, please run the observer before calling calculate_qparams" - _, reduction_dims = _get_reduction_params( - self.block_size, weight.size() - ) + _, reduction_dims = _get_reduction_params(self.block_size, weight.size()) assert len(reduction_dims) == 1, "Only 1-D group size is supported" reduction_dims = reduction_dims[0] - 1 - q_weight, scale, zp = do_integer_quantization(Tensor(weight), self.wc_config, reduction_axes=reduction_dims) + q_weight, scale, zp = do_integer_quantization( + Tensor(weight), self.wc_config, reduction_axes=reduction_dims + ) zp = zp.data if zp is not None else None return q_weight.data, scale.data, zp @@ -50,23 +77,38 @@ def convert(self, model: torch.fx.GraphModule, observer_node: torch.fx.Node): weight_node = observer_node.args[0] original_weight = get_tensor_constant_from_node(weight_node, model) q_weight, scale, zero_point = self.calculate_qparams(original_weight) - + 
with model.graph.inserting_before(observer_node): - if(zero_point is not None): - decompressor = INT4AsymmetricWeightsDecompressor(scale, zero_point, q_weight.shape, original_weight.shape, original_weight.dtype) + if zero_point is not None: + decompressor = INT4AsymmetricWeightsDecompressor( + scale, + zero_point, + q_weight.shape, + original_weight.shape, + original_weight.dtype, + ) else: - decompressor = INT4SymmetricWeightsDecompressor(scale, q_weight.shape, original_weight.shape, original_weight.dtype) + decompressor = INT4SymmetricWeightsDecompressor( + scale, q_weight.shape, original_weight.shape, original_weight.dtype + ) packed_q_weight = decompressor.pack_weight(q_weight) constant_update_fn(model, observer_node, packed_q_weight, input_port_id=0) compressed_weight_name = observer_node.all_input_nodes[0].name - decompressor_suffix = "_".join(compressed_weight_name.replace(".", "_").split("_")[:-2]) + decompressor_suffix = "_".join( + compressed_weight_name.replace(".", "_").split("_")[:-2] + ) decompressor_name = f"{decompressor.quantization_mode}_weights_decompressor_{decompressor_suffix}" module_insertion_transformation_builder( - decompressor, - [PTTargetPoint(TargetType.OPERATOR_POST_HOOK, target_node_name=compressed_weight_name)], - decompressor_name, - )(model) + decompressor, + [ + PTTargetPoint( + TargetType.OPERATOR_POST_HOOK, + target_node_name=compressed_weight_name, + ) + ], + decompressor_name, + )(model) decomp_node = observer_node.args[0] observer_node.replace_all_uses_with(decomp_node) model.graph.erase_node(observer_node) @@ -75,7 +117,11 @@ def convert(self, model: torch.fx.GraphModule, observer_node: torch.fx.Node): class NNCFInt8observer(PerChannelMinMaxObserver): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) - qmode = CompressWeightsMode.INT8_SYM if self.qscheme==torch.per_channel_symmetric else CompressWeightsMode.INT8_ASYM + qmode = ( + CompressWeightsMode.INT8_SYM + if self.qscheme == 
torch.per_channel_symmetric + else CompressWeightsMode.INT8_ASYM + ) self.wc_config = WeightCompressionConfig(mode=qmode) def calculate_qparams(self, weight): @@ -84,10 +130,10 @@ def calculate_qparams(self, weight): ), "Expecting the observer has min_val and max_val, please run the observer before calling calculate_qparams" self.granularity = PerAxis(axis=self.ch_axis) self.block_size = get_block_size(weight.shape, self.granularity) - _, reduction_dims = _get_reduction_params( - self.block_size, weight.size() - ) - q_weight, scale, zp = do_integer_quantization(Tensor(weight), self.wc_config, reduction_axes=reduction_dims) + _, reduction_dims = _get_reduction_params(self.block_size, weight.size()) + q_weight, scale, zp = do_integer_quantization( + Tensor(weight), self.wc_config, reduction_axes=reduction_dims + ) zp = zp.data if zp is not None else None return q_weight.data, scale.data, zp @@ -98,21 +144,32 @@ def convert(self, model: torch.fx.GraphModule, observer_node: torch.fx.Node): q_weight, scale, zero_point = self.calculate_qparams(original_weight) with model.graph.inserting_before(observer_node): - if(zero_point is not None): - decompressor = INT8AsymmetricWeightsDecompressor(scale, zero_point, original_weight.dtype) + if zero_point is not None: + decompressor = INT8AsymmetricWeightsDecompressor( + scale, zero_point, original_weight.dtype + ) else: - decompressor = INT8SymmetricWeightsDecompressor(scale, original_weight.dtype) + decompressor = INT8SymmetricWeightsDecompressor( + scale, original_weight.dtype + ) packed_q_weight = decompressor.pack_weight(q_weight) constant_update_fn(model, observer_node, packed_q_weight, input_port_id=0) compressed_weight_name = observer_node.all_input_nodes[0].name - decompressor_suffix = "_".join(compressed_weight_name.replace(".", "_").split("_")[:-2]) + decompressor_suffix = "_".join( + compressed_weight_name.replace(".", "_").split("_")[:-2] + ) decompressor_name = 
f"{decompressor.quantization_mode}_weights_decompressor_{decompressor_suffix}" module_insertion_transformation_builder( - decompressor, - [PTTargetPoint(TargetType.OPERATOR_POST_HOOK, target_node_name=compressed_weight_name)], - decompressor_name, - )(model) + decompressor, + [ + PTTargetPoint( + TargetType.OPERATOR_POST_HOOK, + target_node_name=compressed_weight_name, + ) + ], + decompressor_name, + )(model) decomp_node = observer_node.args[0] observer_node.replace_all_uses_with(decomp_node) model.graph.erase_node(observer_node) diff --git a/backends/openvino/quantizer/quantizer.py b/backends/openvino/quantizer/quantizer.py index bf7fd0859d5..820d5dd49ba 100644 --- a/backends/openvino/quantizer/quantizer.py +++ b/backends/openvino/quantizer/quantizer.py @@ -15,14 +15,20 @@ import nncf.experimental.torch.fx as nncf_fx # type: ignore[import-untyped] import torch.fx +from executorch.backends.openvino.quantizer.observers.nncf_observers import ( + NNCFInt8observer, + PTPerBlockParamObserver, +) from nncf.common.graph.graph import NNCFGraph # type: ignore[import-untyped] +from nncf.common.quantization.structs import QuantizationScheme, QuantizerConfig +from nncf.quantization.quantize_model import get_weight_compression_configuration from torchao.quantization.pt2e import ( HistogramObserver, + MappingType, PerChannelMinMaxObserver, - UniformQuantizationObserverBase, PerGroup, - MappingType, + UniformQuantizationObserverBase, ) from torchao.quantization.pt2e.quantizer import ( EdgeOrNode, @@ -32,9 +38,6 @@ Quantizer, SharedQuantizationSpec, ) -from nncf.quantization.quantize_model import get_weight_compression_configuration -from nncf.common.quantization.structs import QuantizerConfig, QuantizationScheme -from executorch.backends.openvino.quantizer.observers.nncf_observers import PTPerBlockParamObserver,NNCFInt8observer QUANT_ANNOTATION_KEY = "quantization_annotation" from torchao.quantization.pt2e.quantizer.quantizer import Q_ANNOTATION_KEY @@ -81,7 +84,12 @@ def 
__init__( :param kwargs: Arguments to pass to the NNCF MinMaxQuantization algorithm. """ self.mode = mode - self.wc_modes = [QuantizationMode.INT4_ASYM_WC,QuantizationMode.INT4_SYM_WC, QuantizationMode.INT8_ASYM_WC, QuantizationMode.INT8_SYM_WC] + self.wc_modes = [ + QuantizationMode.INT4_ASYM_WC, + QuantizationMode.INT4_SYM_WC, + QuantizationMode.INT8_ASYM_WC, + QuantizationMode.INT8_SYM_WC, + ] if mode == QuantizationMode.INT8_SYM: preset = quantization.structs.QuantizationPreset.PERFORMANCE model_type = None @@ -91,7 +99,7 @@ def __init__( else: preset = None model_type = nncf.parameters.ModelType.TRANSFORMER - if(self.mode not in self.wc_modes): + if self.mode not in self.wc_modes: self._min_max_algo = ( nncf.quantization.algorithms.min_max.algorithm.MinMaxQuantization( preset=preset, model_type=model_type, **kwargs @@ -100,16 +108,16 @@ def __init__( self._algo = self._min_max_algo else: weight_compression_configuration = get_weight_compression_configuration( - mode.value.replace("_wc", ""), # Mode value has to match NNCF CompressWeightsMode - **kwargs + mode.value.replace( + "_wc", "" + ), # Mode value has to match NNCF CompressWeightsMode + **kwargs, ) self._weight_compression_algo = nncf.quantization.algorithms.weight_compression.algorithm.WeightCompression( - subset_size=None, - **weight_compression_configuration + subset_size=None, **weight_compression_configuration ) self._algo = self._weight_compression_algo - def set_ignored_scope( self, names: Optional[List[str]] = None, @@ -153,20 +161,40 @@ def annotate(self, model: torch.fx.GraphModule) -> torch.fx.GraphModule: defaultdict(QuantizationAnnotation) ) # Serperate into annotation for quantize and compress - if(self.mode in self.wc_modes): + if self.mode in self.wc_modes: self._algo.set_backend_entity(model) nodes_to_compress = self._algo.get_nodes_to_compress(nncf_graph) for node in nodes_to_compress: - quantization_insertion_point = 
quantization.quantizer_setup.WeightQuantizationInsertionPoint(target_node_name=node.node_name) + quantization_insertion_point = ( + quantization.quantizer_setup.WeightQuantizationInsertionPoint( + target_node_name=node.node_name + ) + ) group_size = self._algo._group_size - num_bits = 4 if self.mode in [QuantizationMode.INT4_SYM_WC,QuantizationMode.INT4_ASYM_WC] else 8 - qmode = QuantizationScheme.SYMMETRIC if self.mode in [QuantizationMode.INT4_SYM_WC,QuantizationMode.INT8_SYM_WC] else QuantizationScheme.ASYMMETRIC + num_bits = ( + 4 + if self.mode + in [QuantizationMode.INT4_SYM_WC, QuantizationMode.INT4_ASYM_WC] + else 8 + ) + qmode = ( + QuantizationScheme.SYMMETRIC + if self.mode + in [QuantizationMode.INT4_SYM_WC, QuantizationMode.INT8_SYM_WC] + else QuantizationScheme.ASYMMETRIC + ) nncf_qconfig = QuantizerConfig(num_bits=num_bits, mode=qmode) - qp = quantization.quantizer_setup.SingleConfigQuantizationPoint(qip=quantization_insertion_point, qconfig=nncf_qconfig, directly_quantized_operator_node_names=[node]) + qp = quantization.quantizer_setup.SingleConfigQuantizationPoint( + qip=quantization_insertion_point, + qconfig=nncf_qconfig, + directly_quantized_operator_node_names=[node], + ) edge_or_node, annotation = self._get_edge_or_node_and_annotation( graph, nncf_graph, qp, node_vs_torch_annotation ) - qspec: QuantizationSpecBase = self._get_torch_ao_qspec_from_nncf_config(qp, group_size=group_size, weights_only=True) + qspec: QuantizationSpecBase = self._get_torch_ao_qspec_from_nncf_config( + qp, group_size=group_size, weights_only=True + ) self._fill_torch_ao_annotation(edge_or_node, qspec, annotation) else: quantization_setup = self.get_nncf_quantization_setup(model, nncf_graph) @@ -175,7 +203,9 @@ def annotate(self, model: torch.fx.GraphModule) -> torch.fx.GraphModule: edge_or_node, annotation = self._get_edge_or_node_and_annotation( graph, nncf_graph, qp, node_vs_torch_annotation ) - qspec: QuantizationSpecBase = 
self._get_torch_ao_qspec_from_nncf_config(qp) + qspec: QuantizationSpecBase = self._get_torch_ao_qspec_from_nncf_config( + qp + ) self._fill_torch_ao_annotation(edge_or_node, qspec, annotation) for quantizer_ids in quantization_setup.unified_scale_groups.values(): @@ -186,7 +216,8 @@ def annotate(self, model: torch.fx.GraphModule) -> torch.fx.GraphModule: root_qp = quantization_setup.quantization_points[root_quantizer_id] if any( - root_qp.qconfig != quantization_setup.quantization_points[q_id].qconfig + root_qp.qconfig + != quantization_setup.quantization_points[q_id].qconfig for q_id in quantizer_ids ): qps = [ @@ -340,7 +371,9 @@ def _fill_torch_ao_annotation( @staticmethod def _get_torch_ao_qspec_from_nncf_config( - qp: quantization.quantizer_setup.QuantizationPointBase, group_size=-1, weights_only=False + qp: quantization.quantizer_setup.QuantizationPointBase, + group_size=-1, + weights_only=False, ) -> QuantizationSpec: """ Retrieves the quantization configuration from the given quantization point and @@ -368,8 +401,12 @@ def _get_torch_ao_qspec_from_nncf_config( else torch.per_tensor_affine ) if is_weight: - mapping_type = MappingType.SYMMETRIC if qconfig.mode == QuantizationScheme.SYMMETRIC else MappingType.ASYMMETRIC - if qconfig.num_bits==4: + mapping_type = ( + MappingType.SYMMETRIC + if qconfig.mode == QuantizationScheme.SYMMETRIC + else MappingType.ASYMMETRIC + ) + if qconfig.num_bits == 4: extra_args["mapping_type"] = mapping_type extra_args["target_dtype"] = torch.int8 extra_args["granularity"] = PerGroup(group_size=group_size) @@ -378,16 +415,18 @@ def _get_torch_ao_qspec_from_nncf_config( quant_max = 7 dtype = torch.int8 channel_axis = 0 - elif qconfig.num_bits==8: - observer = NNCFInt8observer if weights_only else PerChannelMinMaxObserver + elif qconfig.num_bits == 8: + observer = ( + NNCFInt8observer if weights_only else PerChannelMinMaxObserver + ) quant_min = -128 quant_max = 127 dtype = torch.int8 channel_axis = 0 torch_qscheme = ( - 
torch.per_channel_symmetric - if qconfig.mode is quantization.structs.QuantizationScheme.SYMMETRIC - else torch.per_channel_affine + torch.per_channel_symmetric + if qconfig.mode is quantization.structs.QuantizationScheme.SYMMETRIC + else torch.per_channel_affine ) else: observer = ( diff --git a/backends/openvino/runtime/OpenvinoBackend.cpp b/backends/openvino/runtime/OpenvinoBackend.cpp index 546f4d68573..bac006ce916 100644 --- a/backends/openvino/runtime/OpenvinoBackend.cpp +++ b/backends/openvino/runtime/OpenvinoBackend.cpp @@ -116,23 +116,23 @@ exr::Error OpenvinoBackend::execute( infer_request->set_input_tensor(i, ov_input_tensor); if (args[i]->isInt()) { - int64_t *val = &(args[i]->payload.copyable_union.as_int); + int64_t* val = &(args[i]->payload.copyable_union.as_int); - // Create OpenVINO tensor from integer input - ov::Tensor ov_input_tensor(ov::element::i64, ov::Shape{1}, val); - infer_request->set_input_tensor(i, ov_input_tensor); + // Create OpenVINO tensor from integer input + ov::Tensor ov_input_tensor(ov::element::i64, ov::Shape{1}, val); + infer_request->set_input_tensor(i, ov_input_tensor); } else { - auto input_tensor = args[i]->toTensor(); - ov::Shape input_shape( - input_tensor.sizes().begin(), input_tensor.sizes().end()); + auto input_tensor = args[i]->toTensor(); + ov::Shape input_shape( + input_tensor.sizes().begin(), input_tensor.sizes().end()); - // Convert input tensor to OpenVINO tensor - ov::element::Type ov_type = - convert_to_openvino_type(input_tensor.scalar_type()); - ov::Tensor ov_input_tensor( - ov_type, input_shape, input_tensor.mutable_data_ptr()); + // Convert input tensor to OpenVINO tensor + ov::element::Type ov_type = + convert_to_openvino_type(input_tensor.scalar_type()); + ov::Tensor ov_input_tensor( + ov_type, input_shape, input_tensor.mutable_data_ptr()); - infer_request->set_input_tensor(i, ov_input_tensor); + infer_request->set_input_tensor(i, ov_input_tensor); } } diff --git 
a/examples/models/llama/export_llama_lib.py b/examples/models/llama/export_llama_lib.py index 7b74ee21f77..47527a326f9 100644 --- a/examples/models/llama/export_llama_lib.py +++ b/examples/models/llama/export_llama_lib.py @@ -906,7 +906,6 @@ def _to_edge_and_lower_llama_openvino( partitioners.append(get_openvino_partitioner(openvino_device)) modelname = f"openvino_{modelname}" - logging.info("Lowering model using following partitioner(s): ") for partitioner in partitioners: logging.info(f"--> {partitioner.__class__.__name__}") @@ -915,8 +914,9 @@ def _to_edge_and_lower_llama_openvino( # TODO: Enable passing OpenVINOQuantizer as a parameter to pt2e_quantize if nncf_compression: try: - import nncf from functools import partial + + import nncf from pytorch_tokenizers import get_tokenizer except ImportError: raise ImportError( @@ -924,9 +924,7 @@ def _to_edge_and_lower_llama_openvino( ) tokenizer = get_tokenizer(builder_exported.tokenizer_path) - def transform_fn( - prompts: str, tokenizer - ): + def transform_fn(prompts: str, tokenizer): tokenized_text = tokenizer.encode(prompts, bos=False, eos=False) logging.error(tokenized_text) @@ -938,20 +936,33 @@ def transform_fn( return inputs - builder_exported.calibration_data = [builder_exported.calibration_data] if isinstance(builder_exported.calibration_data, str) else builder_exported.calibration_data - builder_exported.calibration_data = [word for prompt in builder_exported.calibration_data for word in prompt.split()] if not builder_exported.dynamic_shapes else builder_exported.calibration_data + builder_exported.calibration_data = ( + [builder_exported.calibration_data] + if isinstance(builder_exported.calibration_data, str) + else builder_exported.calibration_data + ) + builder_exported.calibration_data = ( + [ + word + for prompt in builder_exported.calibration_data + for word in prompt.split() + ] + if not builder_exported.dynamic_shapes + else builder_exported.calibration_data + ) 
builder_exported.pre_autograd_graph_module = nncf.compress_weights( - builder_exported.pre_autograd_graph_module, - dataset=nncf.Dataset(builder_exported.calibration_data, transform_func=partial(transform_fn, tokenizer=tokenizer)), - mode=nncf.CompressWeightsMode.INT4_SYM, - ratio=0.8, - sensitivity_metric=nncf.SensitivityMetric.HESSIAN_INPUT_ACTIVATION, - ) - - builder = builder_exported.to_edge_transform_and_lower( - partitioners - ) + builder_exported.pre_autograd_graph_module, + dataset=nncf.Dataset( + builder_exported.calibration_data, + transform_func=partial(transform_fn, tokenizer=tokenizer), + ), + mode=nncf.CompressWeightsMode.INT4_SYM, + ratio=0.8, + sensitivity_metric=nncf.SensitivityMetric.HESSIAN_INPUT_ACTIVATION, + ) + + builder = builder_exported.to_edge_transform_and_lower(partitioners) if verbose: print_delegation_info(builder.edge_manager.exported_program().graph_module) diff --git a/extension/llm/export/partitioner_lib.py b/extension/llm/export/partitioner_lib.py index b34f0a85344..185bc011a32 100644 --- a/extension/llm/export/partitioner_lib.py +++ b/extension/llm/export/partitioner_lib.py @@ -63,12 +63,11 @@ def get_mps_partitioner(use_kv_cache: bool = False): compile_specs = [CompileSpec("use_fp16", bytes([True]))] return MPSPartitioner(compile_specs) # pyre-fixme[16] + def get_openvino_partitioner(device: str): try: + from executorch.backends.openvino.partitioner import OpenvinoPartitioner from executorch.exir.backend.backend_details import CompileSpec - from executorch.backends.openvino.partitioner import ( - OpenvinoPartitioner, - ) except ImportError: raise ImportError( "Please install the OpenVINO backend following https://github.com/pytorch/executorch/tree/main/backends/openvino" @@ -77,6 +76,7 @@ def get_openvino_partitioner(device: str): compile_specs = [CompileSpec("device", device.encode())] return OpenvinoPartitioner(compile_specs) + def get_coreml_partitioner( ios: int = 15, embedding_quantize: Optional[str] = None, From 
bf659439771f5a52ec40a00070ef5ac5c6237cfa Mon Sep 17 00:00:00 2001 From: Cavus Mustafa Date: Wed, 20 Aug 2025 18:54:57 -0700 Subject: [PATCH 14/85] code formating changes --- .../quantizer/observers/nncf_observers.py | 31 ++++++++++--------- backends/openvino/quantizer/quantizer.py | 9 ++++-- 2 files changed, 23 insertions(+), 17 deletions(-) diff --git a/backends/openvino/quantizer/observers/nncf_observers.py b/backends/openvino/quantizer/observers/nncf_observers.py index aa531336d0c..f6ac2a3cb91 100644 --- a/backends/openvino/quantizer/observers/nncf_observers.py +++ b/backends/openvino/quantizer/observers/nncf_observers.py @@ -4,41 +4,42 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. -from typing import Tuple - import torch -from nncf.experimental.torch.fx.node_utils import get_tensor_constant_from_node -from nncf.experimental.torch.fx.transformations import ( +from nncf.experimental.torch.fx.node_utils import ( # type: ignore[import-untyped] + get_tensor_constant_from_node, +) +from nncf.experimental.torch.fx.transformations import ( # type: ignore[import-untyped] constant_update_fn, module_insertion_transformation_builder, ) -from nncf.parameters import CompressWeightsMode -from nncf.quantization.algorithms.weight_compression.config import ( +from nncf.parameters import CompressWeightsMode # type: ignore[import-untyped] +from nncf.quantization.algorithms.weight_compression.config import ( # type: ignore[import-untyped] WeightCompressionConfig, ) -from nncf.quantization.algorithms.weight_compression.weight_lowering import ( +from nncf.quantization.algorithms.weight_compression.weight_lowering import ( # type: ignore[import-untyped] do_integer_quantization, ) -from nncf.tensor.tensor import Tensor -from nncf.torch.graph.transformations.commands import PTTargetPoint, TargetType -from nncf.torch.quantization.layers import ( +from nncf.tensor.tensor import Tensor # type: 
ignore[import-untyped] +from nncf.torch.graph.transformations.commands import ( # type: ignore[import-untyped] + PTTargetPoint, + TargetType, +) +from nncf.torch.quantization.layers import ( # type: ignore[import-untyped] INT4AsymmetricWeightsDecompressor, INT4SymmetricWeightsDecompressor, INT8AsymmetricWeightsDecompressor, INT8SymmetricWeightsDecompressor, ) -from torch.ao.quantization.observer import ( +from torchao.quantization.observer import AffineQuantizedMinMaxObserver +from torchao.quantization.pt2e import ( get_block_size, MappingType, PerAxis, PerChannelMinMaxObserver, PerGroup, ) -from torch.ao.quantization.pt2e._affine_quantization import ( - _get_reduction_params, - AffineQuantizedMinMaxObserver, -) +from torchao.quantization.quant_primitives import _get_reduction_params class PTPerBlockParamObserver(AffineQuantizedMinMaxObserver): diff --git a/backends/openvino/quantizer/quantizer.py b/backends/openvino/quantizer/quantizer.py index 820d5dd49ba..cd78f6907c7 100644 --- a/backends/openvino/quantizer/quantizer.py +++ b/backends/openvino/quantizer/quantizer.py @@ -21,8 +21,13 @@ ) from nncf.common.graph.graph import NNCFGraph # type: ignore[import-untyped] -from nncf.common.quantization.structs import QuantizationScheme, QuantizerConfig -from nncf.quantization.quantize_model import get_weight_compression_configuration +from nncf.common.quantization.structs import ( # type: ignore[import-untyped] + QuantizationScheme, + QuantizerConfig, +) +from nncf.quantization.quantize_model import ( # type: ignore[import-untyped] + get_weight_compression_configuration, +) from torchao.quantization.pt2e import ( HistogramObserver, MappingType, From 30a1a258b22d1471c0aae328f30a5910af6af118 Mon Sep 17 00:00:00 2001 From: anzr299 Date: Tue, 26 Aug 2025 12:31:49 +0400 Subject: [PATCH 15/85] openvino quantizer refactored --- backends/openvino/quantizer/__init__.py | 4 +- backends/openvino/quantizer/observers.py | 286 ++++++++++++ .../quantizer/observers/nncf_observers.py | 
176 -------- backends/openvino/quantizer/quantizer.py | 412 ++++++++++-------- examples/models/llama/export_llama_lib.py | 9 + extension/llm/export/quantizer_lib.py | 38 +- 6 files changed, 573 insertions(+), 352 deletions(-) create mode 100644 backends/openvino/quantizer/observers.py delete mode 100644 backends/openvino/quantizer/observers/nncf_observers.py diff --git a/backends/openvino/quantizer/__init__.py b/backends/openvino/quantizer/__init__.py index df038483f2f..0fd8c10b249 100644 --- a/backends/openvino/quantizer/__init__.py +++ b/backends/openvino/quantizer/__init__.py @@ -1,3 +1,3 @@ -from .quantizer import OpenVINOQuantizer, quantize_model +from .quantizer import OpenVINOQuantizer, quantize_model, QuantizationMode -__all__ = ["OpenVINOQuantizer", "quantize_model"] +__all__ = ["OpenVINOQuantizer", "quantize_model", "QuantizationMode"] diff --git a/backends/openvino/quantizer/observers.py b/backends/openvino/quantizer/observers.py new file mode 100644 index 00000000000..2ea66f11a55 --- /dev/null +++ b/backends/openvino/quantizer/observers.py @@ -0,0 +1,286 @@ +# Copyright (c) Intel Corporation +# +# Licensed under the BSD License (the "License"); you may not use this file +# except in compliance with the License. See the license file found in the +# LICENSE file in the root directory of this source tree. 
+ +# mypy: disable-error-code=import-not-found + +from abc import ABC, abstractmethod +from typing import Optional, Tuple + +import nncf.torch.graph.operator_metatypes as om # type: ignore[import-untyped] + +import torch +from nncf.experimental.torch.fx.nncf_graph_builder import ( # type: ignore[import-untyped] + GraphConverter, +) + +from nncf.experimental.torch.fx.node_utils import ( # type: ignore[import-untyped] + get_tensor_constant_from_node, +) +from nncf.experimental.torch.fx.transformations import ( # type: ignore[import-untyped] + constant_update_fn, + module_insertion_transformation_builder, +) +from nncf.parameters import CompressWeightsMode # type: ignore[import-untyped] +from nncf.quantization.algorithms.weight_compression.config import ( # type: ignore[import-untyped] + WeightCompressionConfig, +) +from nncf.quantization.algorithms.weight_compression.torch_fx_backend import ( # type: ignore[import-untyped] + FXWeightCompressionAlgoBackend, +) +from nncf.quantization.algorithms.weight_compression.weight_lowering import ( # type: ignore[import-untyped] + do_integer_quantization, +) +from nncf.tensor.tensor import Tensor # type: ignore[import-untyped] +from nncf.torch.graph.transformations.commands import ( # type: ignore[import-untyped] + PTTargetPoint, + TargetType, +) +from nncf.torch.quantization.layers import ( # type: ignore[import-untyped] + BaseWeightsDecompressor, + INT4AsymmetricWeightsDecompressor, + INT4SymmetricWeightsDecompressor, + INT8AsymmetricWeightsDecompressor, + INT8SymmetricWeightsDecompressor, +) +from torchao.quantization.pt2e import MappingType, ObserverBase +from nncf.torch.model_graph_manager import get_weight_compression_reduction_axes + +class WeightObserverBase(ObserverBase, ABC): + """ + Base implementation of an NNCF observer that defines the rules for compressing layer weights into the OpenVINO representation. 
+ """ + + def calculate_qparams( # type: ignore[override] + self, + weight: torch.Tensor, + observer_node: torch.fx.Node, + model: torch.fx.GraphModule, + ) -> Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]: + """ + Calculate quantization parameters such as scale, quantized weight and zero point. + + :param weight: FP weight to be used for calculating qparams. + :return: quantization params quantized weight, scale and zero point + """ + ndims = len(weight.size()) + node_with_weight, weight_port_id = ( + WeightObserverBase.get_node_with_weight_and_port_ids(observer_node, model) + ) + _, node_metatype = GraphConverter.get_node_type_and_metatype( + node_with_weight, model + ) + # Special case where embedding metatype has to be mapped to AtenEmbedding metatype + node_metatype = ( + om.PTAtenEmbeddingMetatype + if node_metatype == om.PTEmbeddingMetatype + else node_metatype + ) + reduction_dims = get_weight_compression_reduction_axes( + node_metatype, weight_port_id, ndims + ) + reduction_dims = tuple(reduction_dims) + + q_weight, scale, zp = do_integer_quantization( + Tensor(weight), self.wc_config, reduction_axes=reduction_dims + ) + zp = zp.data if zp is not None else None + return q_weight.data, scale.data, zp + + def forward(self, x: torch.Tensor) -> torch.Tensor: + return x + + @staticmethod + def get_node_with_weight_and_port_ids( + observer_node: torch.fx.Node, model: torch.fx.GraphModule + ) -> Tuple[torch.fx.Node, int]: + """ + Returns the node which contains the weight and the weight port id. + + :param observer_node: Observer node for the weight. + :param graph: The model. + :return: Node which contains the weight (for eg. Linear node) and the port ID for the weight. 
+ """ + for node in model.graph.nodes: + if observer_node in node.all_input_nodes: + return node, node.all_input_nodes.index(observer_node) + msg = f"Observer node {observer_node.name} has no consumer node" + raise RuntimeError(msg) + + def convert( + self, model: torch.fx.GraphModule, observer_node: torch.fx.Node + ) -> None: + """ + Converts the weight observer node into a decompression subgraph after calibration. + This method is responsible for transforming the model after the quantization preparation + and calibration phases. It replaces the observer node with the quantized weight and a decompression + module. + + :param model: A `torch.fx.GraphModule` representing the statically traced model + with observer nodes attached and calibrated. + :param observer_node: The `torch.fx.Node` corresponding to the observer module for + the weight that is being transformed into a compressed representation. + """ + weight_node = observer_node.args[0] + original_weight = get_tensor_constant_from_node(weight_node, model) + q_weight, scale, zero_point = self.calculate_qparams( + original_weight, observer_node, model + ) + + decompressor = self._create_decompressor( + scale, zero_point, q_weight, original_weight + ) + packed_q_weight = decompressor.pack_weight(q_weight) + + constant_update_fn(model, observer_node, packed_q_weight, input_port_id=0) + + compressed_weight_name = observer_node.all_input_nodes[0].name + decompressor_suffix = "_".join( + compressed_weight_name.replace(".", "_").split("_")[:-2] + ) + decompressor_name = f"{decompressor.quantization_mode}_weights_decompressor_{decompressor_suffix}" + + module_insertion_transformation_builder( + decompressor, + [ + PTTargetPoint( + TargetType.OPERATOR_POST_HOOK, + target_node_name=compressed_weight_name, + ) + ], + decompressor_name, + )(model) + + decomp_node = observer_node.args[0] + observer_node.replace_all_uses_with(decomp_node) # type: ignore[arg-type] + model.graph.erase_node(observer_node) + + @abstractmethod + 
def _create_decompressor( + self, + scale: torch.Tensor, + zero_point: Optional[torch.Tensor], + q_weight: torch.Tensor, + original_weight: torch.Tensor, + ) -> BaseWeightsDecompressor: + """ + Used to return the respective NNCF decompressor for different types of quantization. + + :param scale: Calculated scale quantization parameter. + :param zero_point: Calculated zero_point quantization parameter. + :param q_weight: Calculated quantized weight. + :param original_weight: FP weight. + :return: NNCF observer according to the qmode which creates the decompression subgraph supported by OpenVINO. + """ + pass + + @abstractmethod + def get_wc_config(self) -> WeightCompressionConfig: + """ + Used to return the respective NNCF Weight Compression Config. + + :return: Weight compression config with the compression information such as qmode, group_size etc. + """ + pass + + +class INT4WeightObserver(WeightObserverBase): + """ + This class defines the behavior for INT4 Weight Compression which has per-group granularity. + """ + + def __init__( + self, + group_size: int, + mapping_type: MappingType, + target_dtype: torch.dtype, + *args, + **kwargs, + ) -> None: + """ + :param group_size: Group size for group wise quantization. group_size=-1 means it is per-channel quantization. + :param mapping_type: MappingType.SYMMETRIC and MappingType.ASYMMETRIC are supported types for this argument for symmetric or asymmetric quantization. + :param target_dtype: target dtype for quantization such as int8, uint8, etc. 
+ """ + super().__init__(dtype=target_dtype, is_dynamic=False) + self.wc_config = None + self.mapping_type = mapping_type + + qmode = ( + CompressWeightsMode.INT4_ASYM + if self.mapping_type == MappingType.ASYMMETRIC + else CompressWeightsMode.INT4_SYM + ) + self.wc_config = WeightCompressionConfig(mode=qmode, group_size=group_size) + + def _create_decompressor( + self, + scale: torch.Tensor, + zero_point: Optional[torch.Tensor], + q_weight: torch.Tensor, + original_weight: torch.Tensor, + ) -> BaseWeightsDecompressor: + if zero_point is not None: + return INT4AsymmetricWeightsDecompressor( + scale, + zero_point, + q_weight.shape, + original_weight.shape, + original_weight.dtype, + ) + else: + return INT4SymmetricWeightsDecompressor( + scale, q_weight.shape, original_weight.shape, original_weight.dtype + ) + + def get_wc_config(self): + return self.wc_config + + +class INT8WeightObserver(WeightObserverBase): + """ + This class defines the behavior for Int8 WC which has per channel granularity. + """ + + def __init__( + self, + qscheme: torch.qscheme, + dtype: torch.dtype, + ch_axis: int = 0, + *args, + **kwargs, + ) -> None: + """ + :param qscheme: Quantization scheme which is per-channel for Int8 WC. + :param dtype: dtype for quantization such as int8, uint8, etc.. + :param ch_axis: Channel axis. 
+ """ + super().__init__(dtype=dtype, is_dynamic=False) + self.wc_config = None + self.qscheme = qscheme + + qmode = ( + CompressWeightsMode.INT8_SYM + if self.qscheme == torch.per_channel_symmetric + else CompressWeightsMode.INT8_ASYM + ) + self.wc_config = WeightCompressionConfig(mode=qmode) + + def _create_decompressor( + self, + scale: torch.Tensor, + zero_point: Optional[torch.Tensor], + q_weight: torch.Tensor, + original_weight: torch.Tensor, + ) -> BaseWeightsDecompressor: + if zero_point is not None: + return INT8AsymmetricWeightsDecompressor( + scale, zero_point, original_weight.dtype + ) + else: + return INT8SymmetricWeightsDecompressor(scale, original_weight.dtype) + + def get_wc_config(self): + return self.wc_config \ No newline at end of file diff --git a/backends/openvino/quantizer/observers/nncf_observers.py b/backends/openvino/quantizer/observers/nncf_observers.py deleted file mode 100644 index f6ac2a3cb91..00000000000 --- a/backends/openvino/quantizer/observers/nncf_observers.py +++ /dev/null @@ -1,176 +0,0 @@ -# Copyright (c) Qualcomm Innovation Center, Inc. -# All rights reserved -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. 
- -import torch -from nncf.experimental.torch.fx.node_utils import ( # type: ignore[import-untyped] - get_tensor_constant_from_node, -) -from nncf.experimental.torch.fx.transformations import ( # type: ignore[import-untyped] - constant_update_fn, - module_insertion_transformation_builder, -) -from nncf.parameters import CompressWeightsMode # type: ignore[import-untyped] -from nncf.quantization.algorithms.weight_compression.config import ( # type: ignore[import-untyped] - WeightCompressionConfig, -) - -from nncf.quantization.algorithms.weight_compression.weight_lowering import ( # type: ignore[import-untyped] - do_integer_quantization, -) -from nncf.tensor.tensor import Tensor # type: ignore[import-untyped] -from nncf.torch.graph.transformations.commands import ( # type: ignore[import-untyped] - PTTargetPoint, - TargetType, -) -from nncf.torch.quantization.layers import ( # type: ignore[import-untyped] - INT4AsymmetricWeightsDecompressor, - INT4SymmetricWeightsDecompressor, - INT8AsymmetricWeightsDecompressor, - INT8SymmetricWeightsDecompressor, -) -from torchao.quantization.observer import AffineQuantizedMinMaxObserver -from torchao.quantization.pt2e import ( - get_block_size, - MappingType, - PerAxis, - PerChannelMinMaxObserver, - PerGroup, -) -from torchao.quantization.quant_primitives import _get_reduction_params - - -class PTPerBlockParamObserver(AffineQuantizedMinMaxObserver): - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - qmode = ( - CompressWeightsMode.INT4_ASYM - if self.mapping_type == MappingType.ASYMMETRIC - else CompressWeightsMode.INT4_SYM - ) - assert isinstance( - self.granularity, PerGroup - ), "Only PerGroup granularity is supported" - self.wc_config = WeightCompressionConfig( - mode=qmode, group_size=self.granularity.group_size - ) - - def calculate_qparams(self, weight): - assert hasattr(self, "min_val") and hasattr( - self, "max_val" - ), "Expecting the observer has min_val and max_val, please run the observer 
before calling calculate_qparams" - _, reduction_dims = _get_reduction_params(self.block_size, weight.size()) - assert len(reduction_dims) == 1, "Only 1-D group size is supported" - reduction_dims = reduction_dims[0] - 1 - q_weight, scale, zp = do_integer_quantization( - Tensor(weight), self.wc_config, reduction_axes=reduction_dims - ) - zp = zp.data if zp is not None else None - return q_weight.data, scale.data, zp - - def convert(self, model: torch.fx.GraphModule, observer_node: torch.fx.Node): - print("calling convert") - assert ( - self.original_dtype is not None - ), "Expecting original_dtype to be populated" - weight_node = observer_node.args[0] - original_weight = get_tensor_constant_from_node(weight_node, model) - q_weight, scale, zero_point = self.calculate_qparams(original_weight) - - with model.graph.inserting_before(observer_node): - if zero_point is not None: - decompressor = INT4AsymmetricWeightsDecompressor( - scale, - zero_point, - q_weight.shape, - original_weight.shape, - original_weight.dtype, - ) - else: - decompressor = INT4SymmetricWeightsDecompressor( - scale, q_weight.shape, original_weight.shape, original_weight.dtype - ) - packed_q_weight = decompressor.pack_weight(q_weight) - constant_update_fn(model, observer_node, packed_q_weight, input_port_id=0) - compressed_weight_name = observer_node.all_input_nodes[0].name - decompressor_suffix = "_".join( - compressed_weight_name.replace(".", "_").split("_")[:-2] - ) - decompressor_name = f"{decompressor.quantization_mode}_weights_decompressor_{decompressor_suffix}" - - module_insertion_transformation_builder( - decompressor, - [ - PTTargetPoint( - TargetType.OPERATOR_POST_HOOK, - target_node_name=compressed_weight_name, - ) - ], - decompressor_name, - )(model) - decomp_node = observer_node.args[0] - observer_node.replace_all_uses_with(decomp_node) - model.graph.erase_node(observer_node) - - -class NNCFInt8observer(PerChannelMinMaxObserver): - def __init__(self, *args, **kwargs): - 
super().__init__(*args, **kwargs) - qmode = ( - CompressWeightsMode.INT8_SYM - if self.qscheme == torch.per_channel_symmetric - else CompressWeightsMode.INT8_ASYM - ) - self.wc_config = WeightCompressionConfig(mode=qmode) - - def calculate_qparams(self, weight): - assert hasattr(self, "min_val") and hasattr( - self, "max_val" - ), "Expecting the observer has min_val and max_val, please run the observer before calling calculate_qparams" - self.granularity = PerAxis(axis=self.ch_axis) - self.block_size = get_block_size(weight.shape, self.granularity) - _, reduction_dims = _get_reduction_params(self.block_size, weight.size()) - q_weight, scale, zp = do_integer_quantization( - Tensor(weight), self.wc_config, reduction_axes=reduction_dims - ) - zp = zp.data if zp is not None else None - return q_weight.data, scale.data, zp - - def convert(self, model: torch.fx.GraphModule, observer_node: torch.fx.Node): - print("calling convert") - weight_node = observer_node.args[0] - original_weight = get_tensor_constant_from_node(weight_node, model) - q_weight, scale, zero_point = self.calculate_qparams(original_weight) - - with model.graph.inserting_before(observer_node): - if zero_point is not None: - decompressor = INT8AsymmetricWeightsDecompressor( - scale, zero_point, original_weight.dtype - ) - else: - decompressor = INT8SymmetricWeightsDecompressor( - scale, original_weight.dtype - ) - packed_q_weight = decompressor.pack_weight(q_weight) - constant_update_fn(model, observer_node, packed_q_weight, input_port_id=0) - compressed_weight_name = observer_node.all_input_nodes[0].name - decompressor_suffix = "_".join( - compressed_weight_name.replace(".", "_").split("_")[:-2] - ) - decompressor_name = f"{decompressor.quantization_mode}_weights_decompressor_{decompressor_suffix}" - - module_insertion_transformation_builder( - decompressor, - [ - PTTargetPoint( - TargetType.OPERATOR_POST_HOOK, - target_node_name=compressed_weight_name, - ) - ], - decompressor_name, - )(model) - 
decomp_node = observer_node.args[0] - observer_node.replace_all_uses_with(decomp_node) - model.graph.erase_node(observer_node) diff --git a/backends/openvino/quantizer/quantizer.py b/backends/openvino/quantizer/quantizer.py index cd78f6907c7..31d41bff7be 100644 --- a/backends/openvino/quantizer/quantizer.py +++ b/backends/openvino/quantizer/quantizer.py @@ -15,16 +15,11 @@ import nncf.experimental.torch.fx as nncf_fx # type: ignore[import-untyped] import torch.fx -from executorch.backends.openvino.quantizer.observers.nncf_observers import ( - NNCFInt8observer, - PTPerBlockParamObserver, +from executorch.backends.openvino.quantizer.observers import ( + INT4WeightObserver, + INT8WeightObserver, ) - from nncf.common.graph.graph import NNCFGraph # type: ignore[import-untyped] -from nncf.common.quantization.structs import ( # type: ignore[import-untyped] - QuantizationScheme, - QuantizerConfig, -) from nncf.quantization.quantize_model import ( # type: ignore[import-untyped] get_weight_compression_configuration, ) @@ -32,7 +27,6 @@ HistogramObserver, MappingType, PerChannelMinMaxObserver, - PerGroup, UniformQuantizationObserverBase, ) from torchao.quantization.pt2e.quantizer import ( @@ -45,7 +39,6 @@ ) QUANT_ANNOTATION_KEY = "quantization_annotation" -from torchao.quantization.pt2e.quantizer.quantizer import Q_ANNOTATION_KEY class QuantizationMode(Enum): @@ -55,15 +48,19 @@ class QuantizationMode(Enum): - INT8_SYM: INT8 symmetric quantization for both activations and weights. - INT8_MIXED: INT8 asymmetric quantization for activations, symmetric for weights. - INT8_TRANSFORMER: Optimized INT8 quantization for transformer-based models + - INT8WO_SYM: INT8 symmetric quantization for weights only. + - INT8WO_ASYM: INT8 asymmetric quantization for weights only. + - INT4WO_SYM: INT4 symmetric quantization for weights only. 
+ - INT4WO_ASYM: INT4 asymmetric quantization for weights only """ INT8_SYM = "int8_sym" INT8_MIXED = "int8_mixed" INT8_TRANSFORMER = "int8_transformer" - INT8_SYM_WC = "int8_sym_wc" - INT8_ASYM_WC = "int8_asym_wc" - INT4_SYM_WC = "int4_sym" - INT4_ASYM_WC = "int4_asym" + INT8WO_SYM = "int8wo_sym" + INT8WO_ASYM = "int8wo_asym" + INT4WO_SYM = "int4wo_sym" + INT4WO_ASYM = "int4wo_asym" class OpenVINOQuantizer(Quantizer): @@ -72,10 +69,17 @@ class OpenVINOQuantizer(Quantizer): optimally for the inference via OpenVINO. """ + WEIGHTS_ONLY_COMPRESSION_MODES = ( + QuantizationMode.INT4WO_SYM, + QuantizationMode.INT4WO_ASYM, + QuantizationMode.INT8WO_SYM, + QuantizationMode.INT8WO_ASYM, + ) + def __init__( self, *, - mode: Optional[QuantizationMode] = QuantizationMode.INT8_SYM, + mode: QuantizationMode = QuantizationMode.INT8_SYM, **kwargs, ): """ @@ -89,28 +93,21 @@ def __init__( :param kwargs: Arguments to pass to the NNCF MinMaxQuantization algorithm. """ self.mode = mode - self.wc_modes = [ - QuantizationMode.INT4_ASYM_WC, - QuantizationMode.INT4_SYM_WC, - QuantizationMode.INT8_ASYM_WC, - QuantizationMode.INT8_SYM_WC, - ] - if mode == QuantizationMode.INT8_SYM: - preset = quantization.structs.QuantizationPreset.PERFORMANCE - model_type = None - elif mode == QuantizationMode.INT8_MIXED: - preset = quantization.structs.QuantizationPreset.MIXED - model_type = None - else: - preset = None - model_type = nncf.parameters.ModelType.TRANSFORMER - if self.mode not in self.wc_modes: - self._min_max_algo = ( + if self.mode not in OpenVINOQuantizer.WEIGHTS_ONLY_COMPRESSION_MODES: + if mode == QuantizationMode.INT8_SYM: + preset = quantization.structs.QuantizationPreset.PERFORMANCE + model_type = None + elif mode == QuantizationMode.INT8_MIXED: + preset = quantization.structs.QuantizationPreset.MIXED + model_type = None + else: + preset = None + model_type = nncf.parameters.ModelType.TRANSFORMER + self._algo = ( nncf.quantization.algorithms.min_max.algorithm.MinMaxQuantization( 
preset=preset, model_type=model_type, **kwargs ) ) - self._algo = self._min_max_algo else: weight_compression_configuration = get_weight_compression_configuration( mode.value.replace( @@ -118,10 +115,9 @@ def __init__( ), # Mode value has to match NNCF CompressWeightsMode **kwargs, ) - self._weight_compression_algo = nncf.quantization.algorithms.weight_compression.algorithm.WeightCompression( + self._algo = nncf.quantization.algorithms.weight_compression.algorithm.WeightCompression( subset_size=None, **weight_compression_configuration ) - self._algo = self._weight_compression_algo def set_ignored_scope( self, @@ -158,104 +154,131 @@ def get_nncf_quantization_setup( self._algo._set_backend_entity(model) return self._algo.find_quantization_setup(model, nncf_graph) - def annotate(self, model: torch.fx.GraphModule) -> torch.fx.GraphModule: - nncf_graph = nncf_fx.nncf_graph_builder.GraphConverter.create_nncf_graph(model) + def _annotate_weight_compression( + self, + model: torch.fx.GraphModule, + graph: torch.fx.Graph, + nncf_graph: NNCFGraph, + node_vs_torch_annotation: DefaultDict[torch.fx.Node, QuantizationAnnotation], + ) -> DefaultDict[torch.fx.Node, QuantizationAnnotation]: + """ + Annotates the model graph with weight-only quantization specs. 
- graph = model.graph - node_vs_torch_annotation: DefaultDict[torch.fx.Node, QuantizationAnnotation] = ( - defaultdict(QuantizationAnnotation) - ) - # Serperate into annotation for quantize and compress - if self.mode in self.wc_modes: - self._algo.set_backend_entity(model) - nodes_to_compress = self._algo.get_nodes_to_compress(nncf_graph) - for node in nodes_to_compress: - quantization_insertion_point = ( - quantization.quantizer_setup.WeightQuantizationInsertionPoint( - target_node_name=node.node_name - ) - ) - group_size = self._algo._group_size - num_bits = ( - 4 - if self.mode - in [QuantizationMode.INT4_SYM_WC, QuantizationMode.INT4_ASYM_WC] - else 8 - ) - qmode = ( - QuantizationScheme.SYMMETRIC - if self.mode - in [QuantizationMode.INT4_SYM_WC, QuantizationMode.INT8_SYM_WC] - else QuantizationScheme.ASYMMETRIC - ) - nncf_qconfig = QuantizerConfig(num_bits=num_bits, mode=qmode) - qp = quantization.quantizer_setup.SingleConfigQuantizationPoint( - qip=quantization_insertion_point, - qconfig=nncf_qconfig, - directly_quantized_operator_node_names=[node], - ) - edge_or_node, annotation = self._get_edge_or_node_and_annotation( - graph, nncf_graph, qp, node_vs_torch_annotation - ) - qspec: QuantizationSpecBase = self._get_torch_ao_qspec_from_nncf_config( - qp, group_size=group_size, weights_only=True + Identifies compressible nodes in the NNCF graph and attaches the corresponding + TorchAO quantization specifications to their weight edges for later transformation. + + :param model: The FX GraphModule to annotate. + :param graph: The underlying FX graph. + :param nncf_graph: The corresponding NNCF graph. + :param node_vs_torch_annotation: A mapping of FX nodes to quantization annotations. + + :return: Updated mapping of FX nodes with weight compression annotations. 
+ """ + self._algo.set_backend_entity(model) + nodes_to_compress = self._algo.get_nodes_to_compress(nncf_graph) + + for node in nodes_to_compress: + target_node = nncf_fx.node_utils.get_graph_node_by_name( + graph, node.node_name + ) + annotation = node_vs_torch_annotation[target_node] + edge_or_node = OpenVINOQuantizer._get_weight_edge(target_node, nncf_graph) + group_size = getattr(self._algo, "_group_size", -1) + qspec = self._get_torch_ao_qspec_from_nncf_config( + qp=None, group_size=group_size, qmode=self.mode, weights_only=True + ) + self._fill_torch_ao_annotation(edge_or_node, qspec, annotation) + + return node_vs_torch_annotation + + def _annotate_post_training_quantization( + self, + model: torch.fx.GraphModule, + graph: torch.fx.Graph, + nncf_graph: NNCFGraph, + node_vs_torch_annotation: DefaultDict[torch.fx.Node, QuantizationAnnotation], + ) -> DefaultDict[torch.fx.Node, QuantizationAnnotation]: + """ + Annotates the model graph with post-training quantization configurations. + + Converts NNCF quantization points into TorchAO-compatible quantization specs, + assigning them to corresponding nodes or edges. Also handles unified scale groups, + ensuring shared quantization specs across grouped quantizers with consistent configs. + + :param model: The FX GraphModule to annotate. + :param graph: The underlying FX graph. + :param nncf_graph: The corresponding NNCF graph. + :param node_vs_torch_annotation: A mapping of FX nodes to quantization annotations. + + :return: Updated mapping of FX nodes with post-training quantization annotations. 
+ """ + quantization_setup = self.get_nncf_quantization_setup(model, nncf_graph) + + for qp in quantization_setup.quantization_points.values(): + edge_or_node, annotation = self._get_edge_or_node_and_annotation( + graph, nncf_graph, qp, node_vs_torch_annotation + ) + qspec: QuantizationSpecBase = self._get_torch_ao_qspec_from_nncf_config(qp) + self._fill_torch_ao_annotation(edge_or_node, qspec, annotation) + + for quantizer_ids in quantization_setup.unified_scale_groups.values(): + root_quantizer_id = self._get_unified_scales_root_quantizer_id( + nncf_graph, quantizer_ids, quantization_setup + ) + root_qp = quantization_setup.quantization_points[root_quantizer_id] + + if any( + root_qp.qconfig != quantization_setup.quantization_points[q_id].qconfig + for q_id in quantizer_ids + ): + qps = [ + quantization_setup.quantization_points[qid] for qid in quantizer_ids + ] + raise nncf.InternalError( + "Different quantization configs are set to one unified scale group:" + f"{[(qp.insertion_point.__dict__, str(qp.qconfig)) for qp in qps]}" ) - self._fill_torch_ao_annotation(edge_or_node, qspec, annotation) - else: - quantization_setup = self.get_nncf_quantization_setup(model, nncf_graph) - for qp in quantization_setup.quantization_points.values(): + root_target_node = nncf_fx.node_utils.get_graph_node_by_name( + graph, root_qp.insertion_point.target_node_name + ) + root_edge_or_node = self._get_edge_or_node( + root_target_node, root_qp, nncf_graph + ) + + for quantizer_id in quantizer_ids: + if quantizer_id == root_quantizer_id: + continue + + qspec = SharedQuantizationSpec(root_edge_or_node) # type: ignore[assignment] + qp = quantization_setup.quantization_points[quantizer_id] edge_or_node, annotation = self._get_edge_or_node_and_annotation( graph, nncf_graph, qp, node_vs_torch_annotation ) - qspec: QuantizationSpecBase = self._get_torch_ao_qspec_from_nncf_config( - qp - ) self._fill_torch_ao_annotation(edge_or_node, qspec, annotation) - for quantizer_ids in 
quantization_setup.unified_scale_groups.values(): + return node_vs_torch_annotation - root_quantizer_id = self._get_unified_scales_root_quantizer_id( - nncf_graph, quantizer_ids, quantization_setup - ) - root_qp = quantization_setup.quantization_points[root_quantizer_id] - - if any( - root_qp.qconfig - != quantization_setup.quantization_points[q_id].qconfig - for q_id in quantizer_ids - ): - qps = [ - quantization_setup.quantization_points[q_id] - for q_id in quantizer_ids - ] - msg = ( - "Different quantization configs are set to one unified scale group:" - f"{[(qp.insertion_point.__dict__, str(qp.qconfig)) for qp in qps]}" - ) - raise nncf.InternalError(msg) - - root_target_node = nncf_fx.node_utils.get_graph_node_by_name( - graph, root_qp.insertion_point.target_node_name - ) - root_edge_or_node = self._get_edge_or_node( - root_target_node, root_qp, nncf_graph - ) - - for quantizer_id in quantizer_ids: - if quantizer_id == root_quantizer_id: - continue + def annotate(self, model: torch.fx.GraphModule) -> torch.fx.GraphModule: + nncf_graph = nncf_fx.nncf_graph_builder.GraphConverter.create_nncf_graph(model) + graph = model.graph + node_vs_torch_annotation: DefaultDict[torch.fx.Node, QuantizationAnnotation] = ( + defaultdict(QuantizationAnnotation) + ) - qspec = SharedQuantizationSpec(root_edge_or_node) - qp = quantization_setup.quantization_points[quantizer_id] - edge_or_node, annotation = self._get_edge_or_node_and_annotation( - graph, nncf_graph, qp, node_vs_torch_annotation - ) - self._fill_torch_ao_annotation(edge_or_node, qspec, annotation) + if self.mode in OpenVINOQuantizer.WEIGHTS_ONLY_COMPRESSION_MODES: + node_vs_torch_annotation = self._annotate_weight_compression( + model, graph, nncf_graph, node_vs_torch_annotation + ) + else: + node_vs_torch_annotation = self._annotate_post_training_quantization( + model, graph, nncf_graph, node_vs_torch_annotation + ) for node, annotation in node_vs_torch_annotation.items(): - assert Q_ANNOTATION_KEY not in node.meta 
- node.meta[Q_ANNOTATION_KEY] = annotation + assert QUANT_ANNOTATION_KEY not in node.meta + node.meta[QUANT_ANNOTATION_KEY] = annotation + return model @staticmethod @@ -317,6 +340,36 @@ def _get_edge_or_node_and_annotation( edge_or_node = OpenVINOQuantizer._get_edge_or_node(target_node, qp, nncf_graph) return edge_or_node, annotation + @staticmethod + def _get_weight_edge( + target_node: torch.fx.Node, + nncf_graph: NNCFGraph, + ): + """ + Returns the FX node corresponding to the weight tensor input of a given operator node. + Uses the NNCF graph to identify which input port of the target node holds the weight. + If multiple weight ports are present, a warning is issued and only the first one is used. + + :param target_node: FX node representing a weighted operation (e.g., Linear, Conv). + :param nncf_graph: NNCFGraph used to determine weight port indices. + + :return: Edge represented by a Tuple of (weight_node, target_node), where weight_node is the FX node supplying the weight. + """ + nncf_node = nncf_graph.get_node_by_name(target_node.name) + weights_ports_ids = nncf.torch.model_graph_manager.get_weight_tensor_port_ids( + nncf_node, nncf_graph + ) + if len(weights_ports_ids) > 1: + # TODO(dlyakhov): support quantization for nodes with several weights + nncf.common.logging.nncf_logger.warning( + f"Quantization of the weighted node {target_node.name}" + " is not yet supported by the OpenVINOQuantizer." + f" Only the weight on port ID {weights_ports_ids[0]} will be quantized." + f" Quantizable weights are located on ports: {weights_ports_ids}." 
+ ) + weight_node = target_node.all_input_nodes[weights_ports_ids[0]] + return (weight_node, target_node) + @staticmethod def _get_edge_or_node( target_node: torch.fx.Node, @@ -333,22 +386,7 @@ def _get_edge_or_node( """ ip = qp.insertion_point if qp.is_weight_quantization_point(): - nncf_node = nncf_graph.get_node_by_name(target_node.name) - weights_ports_ids = ( - nncf.torch.model_graph_manager.get_weight_tensor_port_ids( - nncf_node, nncf_graph - ) - ) - if len(weights_ports_ids) > 1: - # TODO(dlyakhov): support quantization for nodes with several weights - nncf.common.logging.nncf_logger.warning( - f"Quantization of the weighted node {target_node.name}" - " is not yet supported by the OpenVINOQuantizer." - f" Only the weight on port ID {weights_ports_ids[0]} will be quantized." - f" Quantizable weights are located on ports: {weights_ports_ids}." - ) - weight_node = target_node.all_input_nodes[weights_ports_ids[0]] - return (weight_node, target_node) + OpenVINOQuantizer._get_weight_edge(target_node, nncf_graph) if ip.input_port_id is None: return target_node @@ -377,22 +415,67 @@ def _fill_torch_ao_annotation( @staticmethod def _get_torch_ao_qspec_from_nncf_config( qp: quantization.quantizer_setup.QuantizationPointBase, - group_size=-1, - weights_only=False, + group_size: int = -1, + qmode: Optional[QuantizationMode] = None, + weights_only: bool = False, ) -> QuantizationSpec: """ - Retrieves the quantization configuration from the given quantization point and - converts it into a QuantizationSpec. - - :param qp: An instance of QuantizationPointBase. - :return: A QuantizationSpec retrieved and converted from the quantization point. + Returns a TorchAO QuantizationSpec based on NNCF quantization config and other arguments. + For weight-only quantization (e.g., INT4/INT8 compression), uses `qmode`, `group_size`, + and `weights_only`. For post-training quantization, only `qp` is required. + + :param qp: Quantization point from NNCF. 
+ :param group_size: Group size for INT4 group-wise quantization. + :param qmode: Quantization mode for weight compression. + :param weights_only: If True, applies weight-only quantization logic. + :return: A TorchAO QuantizationSpec. """ + observer: Type[UniformQuantizationObserverBase] + # Eps value is copied from nncf/torch/quantization/layers.py - extra_args = {"eps": 1e-16} + extra_args: Dict[str, Any] = {"eps": 1e-16} + + if weights_only: + mapping_type = ( + MappingType.SYMMETRIC + if qmode == QuantizationMode.INT4WO_SYM + else MappingType.ASYMMETRIC + ) + if qmode in [QuantizationMode.INT4WO_SYM, QuantizationMode.INT4WO_SYM]: + extra_args["mapping_type"] = mapping_type + extra_args["target_dtype"] = torch.int8 + extra_args["group_size"] = group_size + observer = INT4WeightObserver + quant_min = -8 if mapping_type == MappingType.SYMMETRIC else 0 + quant_max = 7 if mapping_type == MappingType.SYMMETRIC else 15 + dtype = torch.int8 + channel_axis = 0 + torch_qscheme = None + else: + observer = INT8WeightObserver + quant_min = -128 if mapping_type == MappingType.SYMMETRIC else 0 + quant_max = 1277 if mapping_type == MappingType.SYMMETRIC else 255 + dtype = torch.int8 + channel_axis = 0 + torch_qscheme = ( + torch.per_channel_symmetric + if qmode == QuantizationMode.INT8WO_SYM + else torch.per_channel_affine + ) + + return QuantizationSpec( + dtype=dtype, + observer_or_fake_quant_ctr=observer.with_args(**extra_args), + quant_min=quant_min, + quant_max=quant_max, + qscheme=torch_qscheme, + ch_axis=channel_axis, + is_dynamic=False, + ) + is_weight = qp.is_weight_quantization_point() qconfig = qp.qconfig - observer: Type[UniformQuantizationObserverBase] if qconfig.per_channel: torch_qscheme = ( torch.per_channel_symmetric @@ -406,33 +489,16 @@ def _get_torch_ao_qspec_from_nncf_config( else torch.per_tensor_affine ) if is_weight: - mapping_type = ( - MappingType.SYMMETRIC - if qconfig.mode == QuantizationScheme.SYMMETRIC - else MappingType.ASYMMETRIC + observer = 
PerChannelMinMaxObserver + quant_min = -128 + quant_max = 127 + dtype = torch.int8 + channel_axis = 0 + torch_qscheme = ( + torch.per_channel_symmetric + if qconfig.mode is quantization.structs.QuantizationScheme.SYMMETRIC + else torch.per_channel_affine ) - if qconfig.num_bits == 4: - extra_args["mapping_type"] = mapping_type - extra_args["target_dtype"] = torch.int8 - extra_args["granularity"] = PerGroup(group_size=group_size) - observer = PTPerBlockParamObserver - quant_min = -8 - quant_max = 7 - dtype = torch.int8 - channel_axis = 0 - elif qconfig.num_bits == 8: - observer = ( - NNCFInt8observer if weights_only else PerChannelMinMaxObserver - ) - quant_min = -128 - quant_max = 127 - dtype = torch.int8 - channel_axis = 0 - torch_qscheme = ( - torch.per_channel_symmetric - if qconfig.mode is quantization.structs.QuantizationScheme.SYMMETRIC - else torch.per_channel_affine - ) else: observer = ( HistogramObserver @@ -514,4 +580,4 @@ def quantize_model( smooth_quant=smooth_quant, **kwargs, ) - return quantized_model + return quantized_model \ No newline at end of file diff --git a/examples/models/llama/export_llama_lib.py b/examples/models/llama/export_llama_lib.py index 47527a326f9..54acf67a21d 100644 --- a/examples/models/llama/export_llama_lib.py +++ b/examples/models/llama/export_llama_lib.py @@ -50,6 +50,7 @@ get_pt2e_quantization_params, get_pt2e_quantizers, get_qnn_quantizer, + get_ov_quantizer, get_vulkan_quantizer, ) from executorch.util.activation_memory_profiler import generate_memory_trace @@ -205,6 +206,8 @@ def build_args_parser() -> argparse.ArgumentParser: choices=[ "xnnpack_dynamic", "xnnpack_dynamic_qc4", + "openvino_8da4w", + "openvino_8da8w", "qnn_8a8w", "qnn_16a16w", "qnn_16a4w", @@ -786,6 +789,12 @@ def get_quantizer_and_quant_params(llm_config): llm_config.quantization.pt2e_quantize.value, llm_config.quantization.qmode ) quantizers.append(qnn_quantizer) + if llm_config.backend.openvino.enabled and llm_config.quantization.pt2e_quantize: + 
assert len(quantizers) == 0, "Should not enable both xnnpack and openvino" + ov_quantizer = get_ov_quantizer( + llm_config.quantization.pt2e_quantize.value, llm_config.quantization.group_size + ) + quantizers.append(ov_quantizer) if llm_config.backend.coreml.enabled and llm_config.quantization.pt2e_quantize: assert len(quantizers) == 0, "Should not enable both xnnpack / qnn and coreml" coreml_quantizer = get_coreml_quantizer( diff --git a/extension/llm/export/quantizer_lib.py b/extension/llm/export/quantizer_lib.py index d87c722363f..4669d09e0e7 100644 --- a/extension/llm/export/quantizer_lib.py +++ b/extension/llm/export/quantizer_lib.py @@ -207,7 +207,7 @@ def get_qnn_quantizer( f"No support for quant type {quant_config}. Support 8a8w, 16a16w and 16a4w." ) - assert ( + assert (get_qnn_quantizer quantization_mode is None ), "Currently qnn backend only supports QnnQuantizer via pt2e flow" qnn_quantizer.add_custom_quant_annotations(custom_annotations) @@ -215,6 +215,42 @@ def get_qnn_quantizer( return qnn_quantizer, quant_dtype +def get_ov_quantizer( + pt2e_quantize: str, + group_size: int = 32, +): + try: + from executorch.backends.openvino.quantizer import OpenVINOQuantizer, QuantizationMode + + except ImportError: + raise ImportError( + "Please install nncf via backends/openvino/requirements.txt" + ) + + backend, quant_config = pt2e_quantize.split("_") + assert ( + backend == "openvino" + ), f"The quantization config is for backend {backend} instead of openvino." + ov_quantizer = OpenVINOQuantizer() + # Manually ignore MP layers. + # ov_quantizer.set_ignored_scope() + + extra_quantizer_options = {"group_size": group_size} + if quant_config == "8da4w": + mode = QuantizationMode.INT4WO_SYM + + elif quant_config == "8da8w": + mode = QuantizationMode.INT8WO_SYM + else: + raise AssertionError( + f"No support for quant type {quant_config}. Support 8a4w, 8a8w only." 
+ ) + + ov_quantizer = OpenVINOQuantizer(mode=mode, **extra_quantizer_options) + + return ov_quantizer + + def get_coreml_quantizer(pt2e_quantize: str): try: from coremltools.optimize.torch.quantization.quantization_config import ( From 4cc7694433b12f7c8afe4c61b785e5158e0798e0 Mon Sep 17 00:00:00 2001 From: anzr299 Date: Tue, 26 Aug 2025 18:32:27 +0400 Subject: [PATCH 16/85] fixes --- backends/openvino/quantizer/quantizer.py | 10 ++++-- examples/models/llama/export_llama_lib.py | 9 +++-- extension/llm/export/config/llm_config.py | 2 ++ extension/llm/export/quantizer_lib.py | 42 +++++++++++++++++++---- 4 files changed, 51 insertions(+), 12 deletions(-) diff --git a/backends/openvino/quantizer/quantizer.py b/backends/openvino/quantizer/quantizer.py index 31d41bff7be..f594c6fffa8 100644 --- a/backends/openvino/quantizer/quantizer.py +++ b/backends/openvino/quantizer/quantizer.py @@ -12,6 +12,7 @@ import nncf # type: ignore[import-untyped] import nncf.common.quantization as quantization # type: ignore[import-untyped] +from nncf.common.scopes import should_consider_scope # type: ignore[import-untyped] import nncf.experimental.torch.fx as nncf_fx # type: ignore[import-untyped] import torch.fx @@ -176,8 +177,12 @@ def _annotate_weight_compression( """ self._algo.set_backend_entity(model) nodes_to_compress = self._algo.get_nodes_to_compress(nncf_graph) + ignored_names = self._algo.get_ignored_node_names(nncf_graph) for node in nodes_to_compress: + is_target_node = should_consider_scope(node.node_name, ignored_names) + if not is_target_node: + continue target_node = nncf_fx.node_utils.get_graph_node_by_name( graph, node.node_name ) @@ -442,9 +447,9 @@ def _get_torch_ao_qspec_from_nncf_config( else MappingType.ASYMMETRIC ) if qmode in [QuantizationMode.INT4WO_SYM, QuantizationMode.INT4WO_SYM]: + extra_args["group_size"] = group_size extra_args["mapping_type"] = mapping_type extra_args["target_dtype"] = torch.int8 - extra_args["group_size"] = group_size observer = 
INT4WeightObserver quant_min = -8 if mapping_type == MappingType.SYMMETRIC else 0 quant_max = 7 if mapping_type == MappingType.SYMMETRIC else 15 @@ -454,7 +459,7 @@ def _get_torch_ao_qspec_from_nncf_config( else: observer = INT8WeightObserver quant_min = -128 if mapping_type == MappingType.SYMMETRIC else 0 - quant_max = 1277 if mapping_type == MappingType.SYMMETRIC else 255 + quant_max = 127 if mapping_type == MappingType.SYMMETRIC else 255 dtype = torch.int8 channel_axis = 0 torch_qscheme = ( @@ -462,7 +467,6 @@ def _get_torch_ao_qspec_from_nncf_config( if qmode == QuantizationMode.INT8WO_SYM else torch.per_channel_affine ) - return QuantizationSpec( dtype=dtype, observer_or_fake_quant_ctr=observer.with_args(**extra_args), diff --git a/examples/models/llama/export_llama_lib.py b/examples/models/llama/export_llama_lib.py index 54acf67a21d..269f927e9f6 100644 --- a/examples/models/llama/export_llama_lib.py +++ b/examples/models/llama/export_llama_lib.py @@ -791,8 +791,10 @@ def get_quantizer_and_quant_params(llm_config): quantizers.append(qnn_quantizer) if llm_config.backend.openvino.enabled and llm_config.quantization.pt2e_quantize: assert len(quantizers) == 0, "Should not enable both xnnpack and openvino" + group_size = llm_config.quantization.group_size + group_size = group_size if group_size else 32 ov_quantizer = get_ov_quantizer( - llm_config.quantization.pt2e_quantize.value, llm_config.quantization.group_size + llm_config.quantization.pt2e_quantize.value, ) quantizers.append(ov_quantizer) if llm_config.backend.coreml.enabled and llm_config.quantization.pt2e_quantize: @@ -904,6 +906,7 @@ def _to_edge_and_lower_llama_xnnpack( def _to_edge_and_lower_llama_openvino( builder_exported, modelname, + quantizers, additional_passes, openvino_device: str = "CPU", nncf_compression: bool = False, @@ -935,7 +938,6 @@ def _to_edge_and_lower_llama_openvino( def transform_fn(prompts: str, tokenizer): tokenized_text = tokenizer.encode(prompts, bos=False, eos=False) - 
logging.error(tokenized_text) inputs = () inputs = ( @@ -971,7 +973,7 @@ def transform_fn(prompts: str, tokenizer): sensitivity_metric=nncf.SensitivityMetric.HESSIAN_INPUT_ACTIVATION, ) - builder = builder_exported.to_edge_transform_and_lower(partitioners) + builder = builder_exported.pt2e_quantize(quantizers).to_edge_transform_and_lower(partitioners) if verbose: print_delegation_info(builder.edge_manager.exported_program().graph_module) @@ -1214,6 +1216,7 @@ def _export_llama(llm_config: LlmConfig) -> LLMEdgeManager: # noqa: C901 builder = _to_edge_and_lower_llama_openvino( builder_exported, modelname, + quantizers, additional_passes, openvino_device=llm_config.backend.openvino.device, nncf_compression=llm_config.backend.openvino.nncf_compression, diff --git a/extension/llm/export/config/llm_config.py b/extension/llm/export/config/llm_config.py index ab18c19159b..b4175d54cd7 100644 --- a/extension/llm/export/config/llm_config.py +++ b/extension/llm/export/config/llm_config.py @@ -275,6 +275,8 @@ class Pt2eQuantize(str, Enum): xnnpack_dynamic = "xnnpack_dynamic" xnnpack_dynamic_qc4 = "xnnpack_dynamic_qc4" + openvino_8da4w = "openvino_8da4w" + openvino_8da8w = "openvino_8da8w" qnn_8a8w = "qnn_8a8w" qnn_16a16w = "qnn_16a16w" qnn_16a4w = "qnn_16a4w" diff --git a/extension/llm/export/quantizer_lib.py b/extension/llm/export/quantizer_lib.py index 4669d09e0e7..2a20a90d55a 100644 --- a/extension/llm/export/quantizer_lib.py +++ b/extension/llm/export/quantizer_lib.py @@ -207,7 +207,7 @@ def get_qnn_quantizer( f"No support for quant type {quant_config}. Support 8a8w, 16a16w and 16a4w." ) - assert (get_qnn_quantizer + assert ( quantization_mode is None ), "Currently qnn backend only supports QnnQuantizer via pt2e flow" qnn_quantizer.add_custom_quant_annotations(custom_annotations) @@ -231,22 +231,52 @@ def get_ov_quantizer( assert ( backend == "openvino" ), f"The quantization config is for backend {backend} instead of openvino." 
- ov_quantizer = OpenVINOQuantizer() + assert group_size != None, "Group Size None is Not Supported. It should be set to -1 for per-channel." + # Manually ignore MP layers. - # ov_quantizer.set_ignored_scope() + fp_node_names = linear_list = [ + "embedding", # First embedding is kept in Full precision + "linear_14", + "linear_15", + "linear_35", + "linear_56", + "linear_57", + "linear_63", + "linear_70", + "linear_71", + "linear_77", + "linear_78", + "linear_81", + "linear_84", + "linear_85", + "linear_88", + "linear_89", + "linear_91", + "linear_92", + "linear_95", + "linear_96", + "linear_98", + "linear_99", + "linear_102", + "linear_103", + "linear_105", + "linear_106", + "linear_109", + "linear_110", + "linear_112",] - extra_quantizer_options = {"group_size": group_size} if quant_config == "8da4w": mode = QuantizationMode.INT4WO_SYM elif quant_config == "8da8w": + group_size = -1 mode = QuantizationMode.INT8WO_SYM else: raise AssertionError( f"No support for quant type {quant_config}. Support 8a4w, 8a8w only." 
) - - ov_quantizer = OpenVINOQuantizer(mode=mode, **extra_quantizer_options) + ov_quantizer = OpenVINOQuantizer(mode=mode, group_size=group_size) + ov_quantizer.set_ignored_scope(names=fp_node_names) return ov_quantizer From 5da40a57d7d42363b795d483630b00d9ce4b5f31 Mon Sep 17 00:00:00 2001 From: anzr299 Date: Wed, 27 Aug 2025 13:48:41 +0400 Subject: [PATCH 17/85] support all_layers, backup mode in OVQuantizer --- backends/openvino/quantizer/quantizer.py | 25 ++++--- examples/models/llama/export_llama_lib.py | 82 ++++++++++------------- extension/llm/export/quantizer_lib.py | 8 +-- 3 files changed, 55 insertions(+), 60 deletions(-) diff --git a/backends/openvino/quantizer/quantizer.py b/backends/openvino/quantizer/quantizer.py index f594c6fffa8..2ede04e53db 100644 --- a/backends/openvino/quantizer/quantizer.py +++ b/backends/openvino/quantizer/quantizer.py @@ -116,8 +116,14 @@ def __init__( ), # Mode value has to match NNCF CompressWeightsMode **kwargs, ) + subset_size = 1 # Doesn't really matter in this case since it is data-free. Should just be +ve + dataset = None # Only Data Free Quantization is Supported in OVQuantizer + compression_format = nncf.CompressionFormat.DQ + nncf.quantization.algorithms.weight_compression.algorithm.check_user_compression_configuration( + subset_size=subset_size, dataset=dataset, compression_format=compression_format, **weight_compression_configuration + ) self._algo = nncf.quantization.algorithms.weight_compression.algorithm.WeightCompression( - subset_size=None, **weight_compression_configuration + subset_size=subset_size, **weight_compression_configuration ) def set_ignored_scope( @@ -176,21 +182,20 @@ def _annotate_weight_compression( :return: Updated mapping of FX nodes with weight compression annotations. 
""" self._algo.set_backend_entity(model) - nodes_to_compress = self._algo.get_nodes_to_compress(nncf_graph) - ignored_names = self._algo.get_ignored_node_names(nncf_graph) + all_wc_params, _ = self._algo.get_processed_weight_compression_parameters(model, nncf_graph) - for node in nodes_to_compress: - is_target_node = should_consider_scope(node.node_name, ignored_names) - if not is_target_node: - continue + for wc_param in all_wc_params: + wc_config = wc_param.compression_config + node_with_weight = wc_param.node_with_weight target_node = nncf_fx.node_utils.get_graph_node_by_name( - graph, node.node_name + graph, node_with_weight.node_name ) annotation = node_vs_torch_annotation[target_node] edge_or_node = OpenVINOQuantizer._get_weight_edge(target_node, nncf_graph) - group_size = getattr(self._algo, "_group_size", -1) + group_size = wc_config.group_size + qmode = wc_config.mode qspec = self._get_torch_ao_qspec_from_nncf_config( - qp=None, group_size=group_size, qmode=self.mode, weights_only=True + qp=None, group_size=group_size, qmode=qmode, weights_only=True ) self._fill_torch_ao_annotation(edge_or_node, qspec, annotation) diff --git a/examples/models/llama/export_llama_lib.py b/examples/models/llama/export_llama_lib.py index 269f927e9f6..00785491100 100644 --- a/examples/models/llama/export_llama_lib.py +++ b/examples/models/llama/export_llama_lib.py @@ -792,9 +792,9 @@ def get_quantizer_and_quant_params(llm_config): if llm_config.backend.openvino.enabled and llm_config.quantization.pt2e_quantize: assert len(quantizers) == 0, "Should not enable both xnnpack and openvino" group_size = llm_config.quantization.group_size - group_size = group_size if group_size else 32 + group_size = group_size if group_size else 32 ov_quantizer = get_ov_quantizer( - llm_config.quantization.pt2e_quantize.value, + llm_config.quantization.pt2e_quantize.value, group_size ) quantizers.append(ov_quantizer) if llm_config.backend.coreml.enabled and llm_config.quantization.pt2e_quantize: @@ 
-921,59 +921,51 @@ def _to_edge_and_lower_llama_openvino( logging.info("Lowering model using following partitioner(s): ") for partitioner in partitioners: logging.info(f"--> {partitioner.__class__.__name__}") - + try: + import nncf + from functools import partial + from pytorch_tokenizers import get_tokenizer + except ImportError: + raise ImportError( + "Please install nncf via backends/openvino/requirements.txt" + ) + + tokenizer = get_tokenizer(builder_exported.tokenizer_path) + from datasets import load_dataset # Use NNCF compression if enabled # TODO: Enable passing OpenVINOQuantizer as a parameter to pt2e_quantize if nncf_compression: - try: - from functools import partial - - import nncf - from pytorch_tokenizers import get_tokenizer - except ImportError: - raise ImportError( - "Please install nncf via backends/openvino/requirements.txt" - ) - tokenizer = get_tokenizer(builder_exported.tokenizer_path) - - def transform_fn(prompts: str, tokenizer): - tokenized_text = tokenizer.encode(prompts, bos=False, eos=False) - + dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="test") + dataset = dataset.filter(lambda example: example['text'].strip() != "") + dataset = dataset.filter(lambda example: example['text'].strip() != "\n") + def transform_fn( + prompts: str, tokenizer + ): + tokenized_text = tokenizer.encode(prompts["text"], bos=False, eos=False) + device = torch.device("cpu") if openvino_device=="CPU" else torch.device("cuda") inputs = () inputs = ( - torch.tensor(tokenized_text).unsqueeze(0), - {"input_pos": torch.tensor([0])}, + torch.tensor(tokenized_text[:128], device=device).unsqueeze(0), + {"input_pos": torch.tensor([0], device=device)}, ) return inputs - - builder_exported.calibration_data = ( - [builder_exported.calibration_data] - if isinstance(builder_exported.calibration_data, str) - else builder_exported.calibration_data - ) - builder_exported.calibration_data = ( - [ - word - for prompt in builder_exported.calibration_data - for word 
in prompt.split() - ] - if not builder_exported.dynamic_shapes - else builder_exported.calibration_data - ) - + builder_exported.pre_autograd_graph_module = nncf.compress_weights( - builder_exported.pre_autograd_graph_module, - dataset=nncf.Dataset( - builder_exported.calibration_data, - transform_func=partial(transform_fn, tokenizer=tokenizer), - ), - mode=nncf.CompressWeightsMode.INT4_SYM, - ratio=0.8, - sensitivity_metric=nncf.SensitivityMetric.HESSIAN_INPUT_ACTIVATION, - ) + builder_exported.pre_autograd_graph_module, + dataset=nncf.Dataset(dataset, partial(transform_fn, tokenizer=tokenizer)), + mode=nncf.CompressWeightsMode.INT4_SYM, + group_size=32, + backup_mode=nncf.BackupMode.NONE, + ratio=0.8, + sensitivity_metric=nncf.SensitivityMetric.HESSIAN_INPUT_ACTIVATION, + ) + + builder = builder_exported.to_edge_transform_and_lower(partitioners) + + else: + builder = builder_exported.pt2e_quantize(quantizers).to_edge_transform_and_lower(partitioners) - builder = builder_exported.pt2e_quantize(quantizers).to_edge_transform_and_lower(partitioners) if verbose: print_delegation_info(builder.edge_manager.exported_program().graph_module) diff --git a/extension/llm/export/quantizer_lib.py b/extension/llm/export/quantizer_lib.py index 2a20a90d55a..9220c1efbdc 100644 --- a/extension/llm/export/quantizer_lib.py +++ b/extension/llm/export/quantizer_lib.py @@ -221,7 +221,7 @@ def get_ov_quantizer( ): try: from executorch.backends.openvino.quantizer import OpenVINOQuantizer, QuantizationMode - + import nncf except ImportError: raise ImportError( "Please install nncf via backends/openvino/requirements.txt" @@ -234,8 +234,7 @@ def get_ov_quantizer( assert group_size != None, "Group Size None is Not Supported. It should be set to -1 for per-channel." # Manually ignore MP layers. 
- fp_node_names = linear_list = [ - "embedding", # First embedding is kept in Full precision + fp_node_names = [ "linear_14", "linear_15", "linear_35", @@ -262,8 +261,7 @@ def get_ov_quantizer( "linear_105", "linear_106", "linear_109", - "linear_110", - "linear_112",] + "linear_110",] if quant_config == "8da4w": mode = QuantizationMode.INT4WO_SYM From 9e65a7ef860e5725522859bbf8d863c76e26503d Mon Sep 17 00:00:00 2001 From: anzr299 Date: Wed, 27 Aug 2025 17:29:05 +0400 Subject: [PATCH 18/85] clean up and use new nncf method for obtaining compression parameters --- backends/openvino/quantizer/observers.py | 127 ++++++----------------- backends/openvino/quantizer/quantizer.py | 52 ++++------ 2 files changed, 48 insertions(+), 131 deletions(-) diff --git a/backends/openvino/quantizer/observers.py b/backends/openvino/quantizer/observers.py index 2ea66f11a55..845a091d24b 100644 --- a/backends/openvino/quantizer/observers.py +++ b/backends/openvino/quantizer/observers.py @@ -25,10 +25,7 @@ ) from nncf.parameters import CompressWeightsMode # type: ignore[import-untyped] from nncf.quantization.algorithms.weight_compression.config import ( # type: ignore[import-untyped] - WeightCompressionConfig, -) -from nncf.quantization.algorithms.weight_compression.torch_fx_backend import ( # type: ignore[import-untyped] - FXWeightCompressionAlgoBackend, + WeightCompressionParameters, ) from nncf.quantization.algorithms.weight_compression.weight_lowering import ( # type: ignore[import-untyped] do_integer_quantization, @@ -45,19 +42,31 @@ INT8AsymmetricWeightsDecompressor, INT8SymmetricWeightsDecompressor, ) -from torchao.quantization.pt2e import MappingType, ObserverBase -from nncf.torch.model_graph_manager import get_weight_compression_reduction_axes +from torchao.quantization.pt2e import ObserverBase + class WeightObserverBase(ObserverBase, ABC): """ Base implementation of an NNCF observer that defines the rules for compressing layer weights into the OpenVINO representation. 
""" + def __init__( + self, + wc_param: WeightCompressionParameters, + dtype: torch.dtype, + **kwargs, + ) -> None: + """ + :param wc_param: Weight compression parameter which contains information such as group_size + reduction_axes, quantization mode etc. + :param dtype: target dtype for quantization such as int8, uint8, etc. + """ + super().__init__(dtype=dtype, is_dynamic=False) + self.wc_param = wc_param + def calculate_qparams( # type: ignore[override] self, weight: torch.Tensor, - observer_node: torch.fx.Node, - model: torch.fx.GraphModule, ) -> Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]: """ Calculate quantization parameters such as scale, quantized weight and zero point. @@ -65,26 +74,11 @@ def calculate_qparams( # type: ignore[override] :param weight: FP weight to be used for calculating qparams. :return: quantization params quantized weight, scale and zero point """ - ndims = len(weight.size()) - node_with_weight, weight_port_id = ( - WeightObserverBase.get_node_with_weight_and_port_ids(observer_node, model) - ) - _, node_metatype = GraphConverter.get_node_type_and_metatype( - node_with_weight, model - ) - # Special case where embedding metatype has to be mapped to AtenEmbedding metatype - node_metatype = ( - om.PTAtenEmbeddingMetatype - if node_metatype == om.PTEmbeddingMetatype - else node_metatype - ) - reduction_dims = get_weight_compression_reduction_axes( - node_metatype, weight_port_id, ndims - ) - reduction_dims = tuple(reduction_dims) - + wc_param = self.get_wc_param() + wc_config = wc_param.compression_config + reduction_axes = wc_param.reduction_axes q_weight, scale, zp = do_integer_quantization( - Tensor(weight), self.wc_config, reduction_axes=reduction_dims + Tensor(weight), wc_config, reduction_axes=reduction_axes ) zp = zp.data if zp is not None else None return q_weight.data, scale.data, zp @@ -92,23 +86,6 @@ def calculate_qparams( # type: ignore[override] def forward(self, x: torch.Tensor) -> torch.Tensor: return x - 
@staticmethod - def get_node_with_weight_and_port_ids( - observer_node: torch.fx.Node, model: torch.fx.GraphModule - ) -> Tuple[torch.fx.Node, int]: - """ - Returns the node which contains the weight and the weight port id. - - :param observer_node: Observer node for the weight. - :param graph: The model. - :return: Node which contains the weight (for eg. Linear node) and the port ID for the weight. - """ - for node in model.graph.nodes: - if observer_node in node.all_input_nodes: - return node, node.all_input_nodes.index(observer_node) - msg = f"Observer node {observer_node.name} has no consumer node" - raise RuntimeError(msg) - def convert( self, model: torch.fx.GraphModule, observer_node: torch.fx.Node ) -> None: @@ -126,7 +103,7 @@ def convert( weight_node = observer_node.args[0] original_weight = get_tensor_constant_from_node(weight_node, model) q_weight, scale, zero_point = self.calculate_qparams( - original_weight, observer_node, model + original_weight ) decompressor = self._create_decompressor( @@ -134,6 +111,7 @@ def convert( ) packed_q_weight = decompressor.pack_weight(q_weight) + # Weight port id is 0 since observer is inserted for a single weight only. constant_update_fn(model, observer_node, packed_q_weight, input_port_id=0) compressed_weight_name = observer_node.all_input_nodes[0].name @@ -177,7 +155,7 @@ def _create_decompressor( pass @abstractmethod - def get_wc_config(self) -> WeightCompressionConfig: + def get_wc_param(self) -> WeightCompressionParameters: """ Used to return the respective NNCF Weight Compression Config. @@ -191,30 +169,6 @@ class INT4WeightObserver(WeightObserverBase): This class defines the behavior for INT4 Weight Compression which has per-group granularity. """ - def __init__( - self, - group_size: int, - mapping_type: MappingType, - target_dtype: torch.dtype, - *args, - **kwargs, - ) -> None: - """ - :param group_size: Group size for group wise quantization. group_size=-1 means it is per-channel quantization. 
- :param mapping_type: MappingType.SYMMETRIC and MappingType.ASYMMETRIC are supported types for this argument for symmetric or asymmetric quantization. - :param target_dtype: target dtype for quantization such as int8, uint8, etc. - """ - super().__init__(dtype=target_dtype, is_dynamic=False) - self.wc_config = None - self.mapping_type = mapping_type - - qmode = ( - CompressWeightsMode.INT4_ASYM - if self.mapping_type == MappingType.ASYMMETRIC - else CompressWeightsMode.INT4_SYM - ) - self.wc_config = WeightCompressionConfig(mode=qmode, group_size=group_size) - def _create_decompressor( self, scale: torch.Tensor, @@ -235,8 +189,8 @@ def _create_decompressor( scale, q_weight.shape, original_weight.shape, original_weight.dtype ) - def get_wc_config(self): - return self.wc_config + def get_wc_param(self) -> WeightCompressionParameters: + return self.wc_param class INT8WeightObserver(WeightObserverBase): @@ -244,30 +198,6 @@ class INT8WeightObserver(WeightObserverBase): This class defines the behavior for Int8 WC which has per channel granularity. """ - def __init__( - self, - qscheme: torch.qscheme, - dtype: torch.dtype, - ch_axis: int = 0, - *args, - **kwargs, - ) -> None: - """ - :param qscheme: Quantization scheme which is per-channel for Int8 WC. - :param dtype: dtype for quantization such as int8, uint8, etc.. - :param ch_axis: Channel axis. 
- """ - super().__init__(dtype=dtype, is_dynamic=False) - self.wc_config = None - self.qscheme = qscheme - - qmode = ( - CompressWeightsMode.INT8_SYM - if self.qscheme == torch.per_channel_symmetric - else CompressWeightsMode.INT8_ASYM - ) - self.wc_config = WeightCompressionConfig(mode=qmode) - def _create_decompressor( self, scale: torch.Tensor, @@ -282,5 +212,6 @@ def _create_decompressor( else: return INT8SymmetricWeightsDecompressor(scale, original_weight.dtype) - def get_wc_config(self): - return self.wc_config \ No newline at end of file + def get_wc_param(self) -> WeightCompressionParameters: + return self.wc_param + \ No newline at end of file diff --git a/backends/openvino/quantizer/quantizer.py b/backends/openvino/quantizer/quantizer.py index 2ede04e53db..ef9a83ca77c 100644 --- a/backends/openvino/quantizer/quantizer.py +++ b/backends/openvino/quantizer/quantizer.py @@ -24,9 +24,11 @@ from nncf.quantization.quantize_model import ( # type: ignore[import-untyped] get_weight_compression_configuration, ) +from nncf.quantization.algorithms.weight_compression.config import ( # type: ignore[import-untyped] + WeightCompressionParameters, +) from torchao.quantization.pt2e import ( HistogramObserver, - MappingType, PerChannelMinMaxObserver, UniformQuantizationObserverBase, ) @@ -112,16 +114,11 @@ def __init__( else: weight_compression_configuration = get_weight_compression_configuration( mode.value.replace( - "_wc", "" + "wo", "" ), # Mode value has to match NNCF CompressWeightsMode **kwargs, ) subset_size = 1 # Doesn't really matter in this case since it is data-free. 
Should just be +ve - dataset = None # Only Data Free Quantization is Supported in OVQuantizer - compression_format = nncf.CompressionFormat.DQ - nncf.quantization.algorithms.weight_compression.algorithm.check_user_compression_configuration( - subset_size=subset_size, dataset=dataset, compression_format=compression_format, **weight_compression_configuration - ) self._algo = nncf.quantization.algorithms.weight_compression.algorithm.WeightCompression( subset_size=subset_size, **weight_compression_configuration ) @@ -185,17 +182,14 @@ def _annotate_weight_compression( all_wc_params, _ = self._algo.get_processed_weight_compression_parameters(model, nncf_graph) for wc_param in all_wc_params: - wc_config = wc_param.compression_config node_with_weight = wc_param.node_with_weight target_node = nncf_fx.node_utils.get_graph_node_by_name( graph, node_with_weight.node_name ) annotation = node_vs_torch_annotation[target_node] edge_or_node = OpenVINOQuantizer._get_weight_edge(target_node, nncf_graph) - group_size = wc_config.group_size - qmode = wc_config.mode qspec = self._get_torch_ao_qspec_from_nncf_config( - qp=None, group_size=group_size, qmode=qmode, weights_only=True + qp=None, wc_param=wc_param ) self._fill_torch_ao_annotation(edge_or_node, qspec, annotation) @@ -425,19 +419,16 @@ def _fill_torch_ao_annotation( @staticmethod def _get_torch_ao_qspec_from_nncf_config( qp: quantization.quantizer_setup.QuantizationPointBase, - group_size: int = -1, - qmode: Optional[QuantizationMode] = None, - weights_only: bool = False, + wc_param: WeightCompressionParameters = None, ) -> QuantizationSpec: """ Returns a TorchAO QuantizationSpec based on NNCF quantization config and other arguments. - For weight-only quantization (e.g., INT4/INT8 compression), uses `qmode`, `group_size`, - and `weights_only`. For post-training quantization, only `qp` is required. 
+ For weight-only quantization (e.g., INT4/INT8 compression), uses `wc_param` which carries + weight only quantization info such as group_size, reduction_axes etc. For post-training + quantization, only `qp` is required. :param qp: Quantization point from NNCF. - :param group_size: Group size for INT4 group-wise quantization. - :param qmode: Quantization mode for weight compression. - :param weights_only: If True, applies weight-only quantization logic. + :param wc_param: NNCF Weight compression parameters for the node. :return: A TorchAO QuantizationSpec. """ observer: Type[UniformQuantizationObserverBase] @@ -445,26 +436,21 @@ def _get_torch_ao_qspec_from_nncf_config( # Eps value is copied from nncf/torch/quantization/layers.py extra_args: Dict[str, Any] = {"eps": 1e-16} - if weights_only: - mapping_type = ( - MappingType.SYMMETRIC - if qmode == QuantizationMode.INT4WO_SYM - else MappingType.ASYMMETRIC - ) - if qmode in [QuantizationMode.INT4WO_SYM, QuantizationMode.INT4WO_SYM]: - extra_args["group_size"] = group_size - extra_args["mapping_type"] = mapping_type - extra_args["target_dtype"] = torch.int8 + if wc_param: + qmode = wc_param.compression_config.mode + if qmode in [nncf.CompressWeightsMode.INT4_ASYM, nncf.CompressWeightsMode.INT4_SYM]: + extra_args["wc_param"] = wc_param observer = INT4WeightObserver - quant_min = -8 if mapping_type == MappingType.SYMMETRIC else 0 - quant_max = 7 if mapping_type == MappingType.SYMMETRIC else 15 + quant_min = -8 if not wc_param.compression_config.is_asym_mode else 0 + quant_max = 7 if not wc_param.compression_config.is_asym_mode else 15 dtype = torch.int8 channel_axis = 0 torch_qscheme = None else: + extra_args["wc_param"] = wc_param observer = INT8WeightObserver - quant_min = -128 if mapping_type == MappingType.SYMMETRIC else 0 - quant_max = 127 if mapping_type == MappingType.SYMMETRIC else 255 + quant_min = -128 if not wc_param.compression_config.is_asym_mode else 0 + quant_max = 127 if not 
wc_param.compression_config.is_asym_mode else 255 dtype = torch.int8 channel_axis = 0 torch_qscheme = ( From 53e0f4cd0e01ed5a8adb85a7c08a2722d4a5a622 Mon Sep 17 00:00:00 2001 From: anzr299 Date: Mon, 1 Sep 2025 10:39:20 +0400 Subject: [PATCH 19/85] review changes & update method names according to wc algo --- backends/openvino/quantizer/observers.py | 4 ++-- backends/openvino/quantizer/quantizer.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/backends/openvino/quantizer/observers.py b/backends/openvino/quantizer/observers.py index 845a091d24b..50fcc673ed6 100644 --- a/backends/openvino/quantizer/observers.py +++ b/backends/openvino/quantizer/observers.py @@ -30,7 +30,7 @@ from nncf.quantization.algorithms.weight_compression.weight_lowering import ( # type: ignore[import-untyped] do_integer_quantization, ) -from nncf.tensor.tensor import Tensor # type: ignore[import-untyped] +from nncf.tensor.tensor import Tensor as NNCFTensor # type: ignore[import-untyped] from nncf.torch.graph.transformations.commands import ( # type: ignore[import-untyped] PTTargetPoint, TargetType, @@ -78,7 +78,7 @@ def calculate_qparams( # type: ignore[override] wc_config = wc_param.compression_config reduction_axes = wc_param.reduction_axes q_weight, scale, zp = do_integer_quantization( - Tensor(weight), wc_config, reduction_axes=reduction_axes + NNCFTensor(weight), wc_config, reduction_axes=reduction_axes ) zp = zp.data if zp is not None else None return q_weight.data, scale.data, zp diff --git a/backends/openvino/quantizer/quantizer.py b/backends/openvino/quantizer/quantizer.py index ef9a83ca77c..2e364424b16 100644 --- a/backends/openvino/quantizer/quantizer.py +++ b/backends/openvino/quantizer/quantizer.py @@ -179,7 +179,7 @@ def _annotate_weight_compression( :return: Updated mapping of FX nodes with weight compression annotations. 
""" self._algo.set_backend_entity(model) - all_wc_params, _ = self._algo.get_processed_weight_compression_parameters(model, nncf_graph) + all_wc_params, _ = self._algo.get_weight_compression_parameters(model, nncf_graph) for wc_param in all_wc_params: node_with_weight = wc_param.node_with_weight From bf959305dc210416f20c327509291db3655028e9 Mon Sep 17 00:00:00 2001 From: anzr299 Date: Mon, 1 Sep 2025 11:14:13 +0400 Subject: [PATCH 20/85] review changes --- backends/openvino/quantizer/observers.py | 2 +- backends/openvino/quantizer/quantizer.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/backends/openvino/quantizer/observers.py b/backends/openvino/quantizer/observers.py index 50fcc673ed6..b1054460a16 100644 --- a/backends/openvino/quantizer/observers.py +++ b/backends/openvino/quantizer/observers.py @@ -166,7 +166,7 @@ def get_wc_param(self) -> WeightCompressionParameters: class INT4WeightObserver(WeightObserverBase): """ - This class defines the behavior for INT4 Weight Compression which has per-group granularity. + OpenVINO INT4 Weight Compression observer. 
""" def _create_decompressor( diff --git a/backends/openvino/quantizer/quantizer.py b/backends/openvino/quantizer/quantizer.py index 2e364424b16..485d67e3bb9 100644 --- a/backends/openvino/quantizer/quantizer.py +++ b/backends/openvino/quantizer/quantizer.py @@ -187,7 +187,7 @@ def _annotate_weight_compression( graph, node_with_weight.node_name ) annotation = node_vs_torch_annotation[target_node] - edge_or_node = OpenVINOQuantizer._get_weight_edge(target_node, nncf_graph) + edge_or_node = self._get_weight_edge(target_node, nncf_graph) qspec = self._get_torch_ao_qspec_from_nncf_config( qp=None, wc_param=wc_param ) From 2d4bec7a4b0041ead027a6c651e00eee32343dc4 Mon Sep 17 00:00:00 2001 From: anzr299 Date: Mon, 1 Sep 2025 11:31:40 +0400 Subject: [PATCH 21/85] review changes --- backends/openvino/quantizer/observers.py | 38 ++++++----------------- backends/openvino/quantizer/quantizer.py | 7 +---- examples/models/llama/export_llama_lib.py | 2 +- 3 files changed, 12 insertions(+), 35 deletions(-) diff --git a/backends/openvino/quantizer/observers.py b/backends/openvino/quantizer/observers.py index b1054460a16..d44a22556dd 100644 --- a/backends/openvino/quantizer/observers.py +++ b/backends/openvino/quantizer/observers.py @@ -9,12 +9,7 @@ from abc import ABC, abstractmethod from typing import Optional, Tuple -import nncf.torch.graph.operator_metatypes as om # type: ignore[import-untyped] - import torch -from nncf.experimental.torch.fx.nncf_graph_builder import ( # type: ignore[import-untyped] - GraphConverter, -) from nncf.experimental.torch.fx.node_utils import ( # type: ignore[import-untyped] get_tensor_constant_from_node, @@ -23,7 +18,6 @@ constant_update_fn, module_insertion_transformation_builder, ) -from nncf.parameters import CompressWeightsMode # type: ignore[import-untyped] from nncf.quantization.algorithms.weight_compression.config import ( # type: ignore[import-untyped] WeightCompressionParameters, ) @@ -57,9 +51,8 @@ def __init__( **kwargs, ) -> None: """ - 
:param wc_param: Weight compression parameter which contains information such as group_size - reduction_axes, quantization mode etc. - :param dtype: target dtype for quantization such as int8, uint8, etc. + :param wc_param: Weight compression parameters container. + :param dtype: target dtype for the quantization. """ super().__init__(dtype=dtype, is_dynamic=False) self.wc_param = wc_param @@ -69,10 +62,10 @@ def calculate_qparams( # type: ignore[override] weight: torch.Tensor, ) -> Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]: """ - Calculate quantization parameters such as scale, quantized weight and zero point. + Calculates quantization parameters: quantized weight, quantization scale and quantization zero point. :param weight: FP weight to be used for calculating qparams. - :return: quantization params quantized weight, scale and zero point + :return: A tuple containing the quantized weight, quantization scale and quantization zero point. """ wc_param = self.get_wc_param() wc_config = wc_param.compression_config @@ -90,10 +83,8 @@ def convert( self, model: torch.fx.GraphModule, observer_node: torch.fx.Node ) -> None: """ - Converts the weight observer node into a decompression subgraph after calibration. - This method is responsible for transforming the model after the quantization preparation - and calibration phases. It replaces the observer node with the quantized weight and a decompression - module. + Replaces the given observer node from the given model with a quantized + weight and a OpenVINO specific decompression module. :param model: A `torch.fx.GraphModule` representing the statically traced model with observer nodes attached and calibrated. @@ -144,7 +135,7 @@ def _create_decompressor( original_weight: torch.Tensor, ) -> BaseWeightsDecompressor: """ - Used to return the respective NNCF decompressor for different types of quantization. + Returns a respective NNCF decompressor for different types of quantization. 
:param scale: Calculated scale quantization parameter. :param zero_point: Calculated zero_point quantization parameter. @@ -152,17 +143,14 @@ def _create_decompressor( :param original_weight: FP weight. :return: NNCF observer according to the qmode which creates the decompression subgraph supported by OpenVINO. """ - pass - @abstractmethod def get_wc_param(self) -> WeightCompressionParameters: """ - Used to return the respective NNCF Weight Compression Config. + Returns a respective NNCF Weight Compression Config. :return: Weight compression config with the compression information such as qmode, group_size etc. """ - pass - + return self.wc_param class INT4WeightObserver(WeightObserverBase): """ @@ -189,13 +177,10 @@ def _create_decompressor( scale, q_weight.shape, original_weight.shape, original_weight.dtype ) - def get_wc_param(self) -> WeightCompressionParameters: - return self.wc_param - class INT8WeightObserver(WeightObserverBase): """ - This class defines the behavior for Int8 WC which has per channel granularity. + OpenVINO INT8 Weight Compression per channel observer. """ def _create_decompressor( @@ -212,6 +197,3 @@ def _create_decompressor( else: return INT8SymmetricWeightsDecompressor(scale, original_weight.dtype) - def get_wc_param(self) -> WeightCompressionParameters: - return self.wc_param - \ No newline at end of file diff --git a/backends/openvino/quantizer/quantizer.py b/backends/openvino/quantizer/quantizer.py index 485d67e3bb9..7f86686d03c 100644 --- a/backends/openvino/quantizer/quantizer.py +++ b/backends/openvino/quantizer/quantizer.py @@ -205,15 +205,10 @@ def _annotate_post_training_quantization( """ Annotates the model graph with post-training quantization configurations. - Converts NNCF quantization points into TorchAO-compatible quantization specs, - assigning them to corresponding nodes or edges. Also handles unified scale groups, - ensuring shared quantization specs across grouped quantizers with consistent configs. 
- :param model: The FX GraphModule to annotate. :param graph: The underlying FX graph. :param nncf_graph: The corresponding NNCF graph. :param node_vs_torch_annotation: A mapping of FX nodes to quantization annotations. - :return: Updated mapping of FX nodes with post-training quantization annotations. """ quantization_setup = self.get_nncf_quantization_setup(model, nncf_graph) @@ -575,4 +570,4 @@ def quantize_model( smooth_quant=smooth_quant, **kwargs, ) - return quantized_model \ No newline at end of file + return quantized_model diff --git a/examples/models/llama/export_llama_lib.py b/examples/models/llama/export_llama_lib.py index 00785491100..269022f2cf7 100644 --- a/examples/models/llama/export_llama_lib.py +++ b/examples/models/llama/export_llama_lib.py @@ -790,7 +790,7 @@ def get_quantizer_and_quant_params(llm_config): ) quantizers.append(qnn_quantizer) if llm_config.backend.openvino.enabled and llm_config.quantization.pt2e_quantize: - assert len(quantizers) == 0, "Should not enable both xnnpack and openvino" + assert quantizers, "Should not enable both xnnpack and openvino" group_size = llm_config.quantization.group_size group_size = group_size if group_size else 32 ov_quantizer = get_ov_quantizer( From 0a2e361f04aa724c8af7d88c1dbd286b4c7556d6 Mon Sep 17 00:00:00 2001 From: Aamir Nazir Date: Wed, 3 Sep 2025 20:48:10 +0400 Subject: [PATCH 22/85] Update export_llama_lib.py --- examples/models/llama/export_llama_lib.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/models/llama/export_llama_lib.py b/examples/models/llama/export_llama_lib.py index 269022f2cf7..8eab3eefbc0 100644 --- a/examples/models/llama/export_llama_lib.py +++ b/examples/models/llama/export_llama_lib.py @@ -790,7 +790,7 @@ def get_quantizer_and_quant_params(llm_config): ) quantizers.append(qnn_quantizer) if llm_config.backend.openvino.enabled and llm_config.quantization.pt2e_quantize: - assert quantizers, "Should not enable both xnnpack and openvino" + assert not 
quantizers, "Should not enable both xnnpack and openvino" group_size = llm_config.quantization.group_size group_size = group_size if group_size else 32 ov_quantizer = get_ov_quantizer( From 4c86a9c91d6eeec8eca53ea66d4f5132cd007a6d Mon Sep 17 00:00:00 2001 From: Cavus Mustafa Date: Wed, 3 Sep 2025 13:32:08 -0700 Subject: [PATCH 23/85] enable group_size parameter for nncf compression --- backends/openvino/requirements.txt | 2 +- examples/models/llama/export_llama_lib.py | 3 +++ extension/llm/export/config/llm_config.py | 5 ++++- 3 files changed, 8 insertions(+), 2 deletions(-) diff --git a/backends/openvino/requirements.txt b/backends/openvino/requirements.txt index 316633e9004..2ada445414c 100644 --- a/backends/openvino/requirements.txt +++ b/backends/openvino/requirements.txt @@ -1,2 +1,2 @@ transformers -git+https://github.com/openvinotoolkit/nncf@6b0fc1c#egg=nncf +git+https://github.com/openvinotoolkit/nncf@5cb2b58#egg=nncf diff --git a/examples/models/llama/export_llama_lib.py b/examples/models/llama/export_llama_lib.py index 47527a326f9..417d25550ab 100644 --- a/examples/models/llama/export_llama_lib.py +++ b/examples/models/llama/export_llama_lib.py @@ -898,6 +898,7 @@ def _to_edge_and_lower_llama_openvino( additional_passes, openvino_device: str = "CPU", nncf_compression: bool = False, + nncf_compression_group_size: int = 32, verbose: bool = False, ) -> LLMEdgeManager: # noqa: C901 partitioners = [] @@ -959,6 +960,7 @@ def transform_fn(prompts: str, tokenizer): ), mode=nncf.CompressWeightsMode.INT4_SYM, ratio=0.8, + group_size=nncf_compression_group_size, sensitivity_metric=nncf.SensitivityMetric.HESSIAN_INPUT_ACTIVATION, ) @@ -1208,6 +1210,7 @@ def _export_llama(llm_config: LlmConfig) -> LLMEdgeManager: # noqa: C901 additional_passes, openvino_device=llm_config.backend.openvino.device, nncf_compression=llm_config.backend.openvino.nncf_compression, + nncf_compression_group_size=llm_config.backend.openvino.nncf_compression_group_size, 
verbose=llm_config.debug.verbose, ) else: diff --git a/extension/llm/export/config/llm_config.py b/extension/llm/export/config/llm_config.py index ab18c19159b..c8f15bc1f9a 100644 --- a/extension/llm/export/config/llm_config.py +++ b/extension/llm/export/config/llm_config.py @@ -456,7 +456,8 @@ class OpenvinoConfig: enabled: bool = False device: str = "CPU" - nncf_compression = False + nncf_compression: bool = False + nncf_compression_group_size: int = 32 @dataclass @@ -645,6 +646,8 @@ def from_args(cls, args: argparse.Namespace) -> "LlmConfig": # noqa: C901 llm_config.backend.openvino.device = args.openvino_device if hasattr(args, "nncf_compression"): llm_config.backend.openvino.nncf_compression = args.nncf_compression + if hasattr(args, "group_size") and args.group_size: + llm_config.backend.openvino.nncf_compression_group_size = args.group_size # DebugConfig if hasattr(args, "profile_memory"): From 46ed3f6d5ca71439c13c781eea1156bd4383ad3c Mon Sep 17 00:00:00 2001 From: Mustafa Cavus Date: Wed, 3 Sep 2025 15:09:13 -0700 Subject: [PATCH 24/85] Update README.md --- backends/openvino/README.md | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/backends/openvino/README.md b/backends/openvino/README.md index a67cf12eca2..73b6bd9b20a 100644 --- a/backends/openvino/README.md +++ b/backends/openvino/README.md @@ -42,11 +42,23 @@ executorch Before you begin, ensure you have openvino installed and configured on your system. -### Build OpenVINO from Source +### Use OpenVINO from Release Packages + +1. Download the OpenVINO release package from [here](https://docs.openvino.ai/2025/get-started/install-openvino.html). Make sure to select your configuration and click on **OpenVINO Archives** under the distribution section to download the appropriate archive for your platform. + +2. Extract the release package from the archive and set the environment variables. 
+ + ```bash + tar -zxf openvino_toolkit_.tgz + cd openvino_toolkit_ + source setupvars.sh + ``` + +### (Optional) Build OpenVINO from Source ```bash git clone https://github.com/openvinotoolkit/openvino.git -cd openvino && git checkout b16b776ac119dafda51f69a80f1e6b7376d02c3b +cd openvino git submodule update --init --recursive sudo ./install_build_dependencies.sh mkdir build && cd build @@ -59,18 +71,6 @@ cd source setupvars.sh ``` -### Use OpenVINO from Release Packages - -1. Download the OpenVINO release package from [here](https://docs.openvino.ai/2025/get-started/install-openvino.html). Make sure to select your configuration and click on **OpenVINO Archives** under the distribution section to download the appropriate archive for your platform. - -2. Extract the release package from the archive and set the environment variables. - - ```bash - tar -zxf openvino_toolkit_.tgz - cd openvino_toolkit_ - source setupvars.sh - ``` - For more information about OpenVINO build, refer to the [OpenVINO Build Instructions](https://github.com/openvinotoolkit/openvino/blob/master/docs/dev/build_linux.md). ### Setup From 0a1256eb351a5562e593f82ed921da2eeb9b245f Mon Sep 17 00:00:00 2001 From: Mustafa Cavus Date: Wed, 3 Sep 2025 15:26:08 -0700 Subject: [PATCH 25/85] Update README.md --- backends/openvino/README.md | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/backends/openvino/README.md b/backends/openvino/README.md index 73b6bd9b20a..ce10b902646 100644 --- a/backends/openvino/README.md +++ b/backends/openvino/README.md @@ -77,17 +77,27 @@ For more information about OpenVINO build, refer to the [OpenVINO Build Instruct Follow the steps below to setup your build environment: -1. **Setup ExecuTorch Environment**: Refer to the [Environment Setup](https://pytorch.org/executorch/main/getting-started-setup#environment-setup) guide for detailed instructions on setting up the ExecuTorch environment. -2. **Setup OpenVINO Backend Environment** +1. 
**Create a Virtual Environment** +- Create a virtual environment and activate it by executing the commands below. + ```bash + python -m venv env + source env/bin/activate + ``` +2. **Clone ExecuTorch Repository from Github** +- Clone Executorch repository by executing the command below. + ```bash + git clone --recurse-submodules https://github.com/pytorch/executorch.git + ``` +3. **Setup OpenVINO Backend Environment** - Install the dependent libs. Ensure that you are inside `executorch/backends/openvino/` directory ```bash pip install -r requirements.txt ``` Note: To achieve optimal performance with NNCF quantization, you should install the latest development version of NNCF (version 2.16.0.dev0+191b53d9 or higher). -3. Navigate to `scripts/` directory. +4. Navigate to `scripts/` directory. -4. **Build OpenVINO Backend C++ Libraries and Executor Runner**: Once the prerequisites are in place, run the `openvino_build.sh` script to start the build process. By default, OpenVINO backend will be built under `cmake-out/backends/openvino/` as `libopenvino_backend.a` +5. **Build OpenVINO Backend C++ Libraries and Executor Runner**: Once the prerequisites are in place, run the `openvino_build.sh` script to start the build process. By default, OpenVINO backend will be built under `cmake-out/backends/openvino/` as `libopenvino_backend.a` ```bash ./openvino_build.sh @@ -97,6 +107,7 @@ Follow the steps below to setup your build environment: ```bash ./openvino_build.sh --enable_python ``` +For more information about ExecuTorch environment setup, refer to the [Environment Setup](https://pytorch.org/executorch/main/getting-started-setup#environment-setup) guide. 
### Run From f2151e3baddd32003f5d0e5bb36e34830207a76c Mon Sep 17 00:00:00 2001 From: Mustafa Cavus Date: Wed, 3 Sep 2025 17:25:15 -0700 Subject: [PATCH 26/85] Update README.md --- backends/openvino/README.md | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/backends/openvino/README.md b/backends/openvino/README.md index ce10b902646..cc5b20cbab8 100644 --- a/backends/openvino/README.md +++ b/backends/openvino/README.md @@ -89,24 +89,25 @@ Follow the steps below to setup your build environment: ```bash git clone --recurse-submodules https://github.com/pytorch/executorch.git ``` -3. **Setup OpenVINO Backend Environment** -- Install the dependent libs. Ensure that you are inside `executorch/backends/openvino/` directory +3. **Build ExecuTorch with OpenVINO Backend** +- Ensure that you are inside `executorch/backends/openvino/scripts` directory. The following command builds and installs ExecuTorch with the OpenVINO backend, and also compiles the C++ runtime binaries into `/cmake-out` for quick inference testing. ```bash - pip install -r requirements.txt - ``` - Note: To achieve optimal performance with NNCF quantization, you should install the latest development version of NNCF (version 2.16.0.dev0+191b53d9 or higher). -4. Navigate to `scripts/` directory. - -5. **Build OpenVINO Backend C++ Libraries and Executor Runner**: Once the prerequisites are in place, run the `openvino_build.sh` script to start the build process. By default, OpenVINO backend will be built under `cmake-out/backends/openvino/` as `libopenvino_backend.a` - - ```bash - ./openvino_build.sh + openvino_build.sh ``` +- Optionally, `openvino_build.sh` script can be used to build python package or C++ bineries seperately. **Build OpenVINO Backend Python Package with Pybindings**: To build and install the OpenVINO backend Python package with Python bindings, run the `openvino_build.sh` script with the `--enable_python` argument. 
This will compile and install the ExecuTorch Python package with the OpenVINO backend into your Python environment. This option will also enable python bindings required to execute OpenVINO backend tests and `aot_optimize_and_infer.py` script inside `executorch/examples/openvino` folder. - - ```bash + ```bash ./openvino_build.sh --enable_python ``` + **Build C++ Runtime Libraries for OpenVINO Backend**: Run the `openvino_build.sh` script with the `--cpp_runtime` argument to build C++ runtime libraries into `/cmake-out` folder. `/cmake-out/backends/openvino/openvino_executor_runner` binary file can be used for quick inferencing with vision models. + ```bash + ./openvino_build.sh --cpp_runtime + ``` + **Build C++ Llama Runner**: This step requires first building the C++ runtime libraries by following the previous instructions. Then, run `openvino_build.sh` script with the `--llama_runner` argument to compile the llama runner to execute inference with models exported using `export_llama`. The compiled binary file is located in `/cmake-out/examples/models/llama/llama_main`. + ```bash + ./openvino_build.sh --llama_runner + ``` + For more information about ExecuTorch environment setup, refer to the [Environment Setup](https://pytorch.org/executorch/main/getting-started-setup#environment-setup) guide. 
### Run From dfc8eab6d862a9be10e95fd6ae82e122c9869574 Mon Sep 17 00:00:00 2001 From: Cavus Mustafa Date: Wed, 3 Sep 2025 17:55:26 -0700 Subject: [PATCH 27/85] openvino backend build script updates --- backends/openvino/scripts/openvino_build.sh | 155 ++++++++++++-------- 1 file changed, 91 insertions(+), 64 deletions(-) diff --git a/backends/openvino/scripts/openvino_build.sh b/backends/openvino/scripts/openvino_build.sh index 08741840ddb..b7e5f5270ab 100755 --- a/backends/openvino/scripts/openvino_build.sh +++ b/backends/openvino/scripts/openvino_build.sh @@ -7,79 +7,106 @@ set -e EXECUTORCH_ROOT=$(realpath "$(dirname "$0")/../../..") echo EXECUTORCH_ROOT=${EXECUTORCH_ROOT} -main() { - build_type=${1:-"--cpp_runtime"} - - # If the first arguments is --cpp_runtime (default), build libraries for C++ runtime - if [[ -z "$build_type" || "$build_type" == "--cpp_runtime" ]]; then - echo "Building C++ Runtime Libraries" - - # Set build directory - local build_dir="cmake-out" - - # Enter the Executorch root directory - cd "$EXECUTORCH_ROOT" - rm -rf "${build_dir}" - - # Configure the project with CMake - # Note: Add any additional configuration options you need here - cmake -DCMAKE_INSTALL_PREFIX="${build_dir}" \ - -DCMAKE_BUILD_TYPE=Release \ - -DEXECUTORCH_BUILD_OPENVINO=ON \ - -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ - -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ - -DEXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL=ON \ - -DEXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR=ON \ - -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ - -DEXECUTORCH_BUILD_OPENVINO_EXECUTOR_RUNNER=ON \ - -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \ - -DEXECUTORCH_BUILD_EXTENSION_LLM=ON \ - -DEXECUTORCH_BUILD_EXTENSION_LLM_RUNNER=ON \ - -B"${build_dir}" - - - # Build the project - cmake --build ${build_dir} --target install --config Release -j$(nproc) +install_requirements() { + echo "Installing Requirements For OpenVINO Backend" + cd "$EXECUTORCH_ROOT" + pip install -r backends/openvino/requirements.txt +} - # If the first 
arguments is --enable_python, build python package with python bindings - elif [[ "$build_type" == "--enable_python" ]]; then - echo "Building Python Package with Pybinding" +build_cpp_runtime() { + echo "Building C++ Runtime Libraries" + + # Set build directory + local build_dir="cmake-out" + + # Enter the Executorch root directory + cd "$EXECUTORCH_ROOT" + rm -rf "${build_dir}" + + # Configure the project with CMake + # Note: Add any additional configuration options you need here + cmake -DCMAKE_INSTALL_PREFIX="${build_dir}" \ + -DCMAKE_BUILD_TYPE=Release \ + -DEXECUTORCH_BUILD_OPENVINO=ON \ + -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ + -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ + -DEXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL=ON \ + -DEXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR=ON \ + -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ + -DEXECUTORCH_BUILD_OPENVINO_EXECUTOR_RUNNER=ON \ + -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \ + -DEXECUTORCH_BUILD_EXTENSION_LLM=ON \ + -DEXECUTORCH_BUILD_EXTENSION_LLM_RUNNER=ON \ + -B"${build_dir}" + + + # Build the project + cmake --build ${build_dir} --target install --config Release -j$(nproc) +} + +build_llama_runner() { + echo "Building Export Llama Runner" + + # Set build directory + local build_dir="cmake-out" + + # Enter the Executorch root directory + cd "$EXECUTORCH_ROOT" + + # Configure the project with CMake + # Note: Add any additional configuration options you need here + cmake -DCMAKE_INSTALL_PREFIX="${build_dir}" \ + -DCMAKE_BUILD_TYPE=Release \ + -B"${build_dir}"/examples/models/llama \ + examples/models/llama + # Build the export llama runner + cmake --build cmake-out/examples/models/llama -j$(nproc) --config Release +} - # Enter the Executorch root directory - cd "$EXECUTORCH_ROOT" - ./install_executorch.sh --clean +build_python_enabled() { + echo "Building Python Package with Pybinding" - # Set parameters to configure the project with CMake - # Note: Add any additional configuration options you need here - export 
CMAKE_ARGS="-DEXECUTORCH_BUILD_OPENVINO=ON \ - -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON" - export CMAKE_BUILD_ARGS="--target openvino_backend" + # Enter the Executorch root directory + cd "$EXECUTORCH_ROOT" + ./install_executorch.sh --clean - # Build the package - ./install_executorch.sh --minimal + # Set parameters to configure the project with CMake + # Note: Add any additional configuration options you need here + export CMAKE_ARGS="-DEXECUTORCH_BUILD_OPENVINO=ON \ + -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON" + export CMAKE_BUILD_ARGS="--target openvino_backend" - # Install torchao - pip install third-party/ao + # Build the package + ./install_executorch.sh --minimal + + # Install torchao + pip install third-party/ao +} + +main() { + build_type=${1:-"--build_all"} + + # If the first arguments is --build_all (default), build python package, C++ runtime, and llama runner binary + if [[ -z "$build_type" || "$build_type" == "--build_all" ]]; then + install_requirements + build_python_enabled + build_cpp_runtime + build_llama_runner + + # If the first arguments is --cpp_runtime, build libraries for C++ runtime + elif [[ "$build_type" == "--cpp_runtime" ]]; then + build_cpp_runtime # If the first arguments is --llama_runner, build export llama runner binary # Note: c++ runtime with openvino backend should be built before building export llama runner elif [[ "$build_type" == "--llama_runner" ]]; then - echo "Building Export Llama Runner" - - # Set build directory - local build_dir="cmake-out" - - # Enter the Executorch root directory - cd "$EXECUTORCH_ROOT" - - # Configure the project with CMake - # Note: Add any additional configuration options you need here - cmake -DCMAKE_INSTALL_PREFIX="${build_dir}" \ - -DCMAKE_BUILD_TYPE=Release \ - -B"${build_dir}"/examples/models/llama \ - examples/models/llama - # Build the export llama runner - cmake --build cmake-out/examples/models/llama -j$(nproc) --config Release + build_llama_runner + + # If the first arguments is 
--enable_python, build python package with python bindings + elif [[ "$build_type" == "--enable_python" ]]; then + install_requirements + build_python_enabled + else echo "Error: Argument is not valid: $build_type" exit 1 # Exit the script with an error code From 2ac8a8c0b7ea3f2e0b391b1b7cba9460b71dad86 Mon Sep 17 00:00:00 2001 From: Mustafa Cavus Date: Thu, 4 Sep 2025 15:41:46 -0700 Subject: [PATCH 28/85] Update README.md --- backends/openvino/README.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/backends/openvino/README.md b/backends/openvino/README.md index cc5b20cbab8..71bd27f6b50 100644 --- a/backends/openvino/README.md +++ b/backends/openvino/README.md @@ -18,6 +18,11 @@ For more information on the supported hardware, please refer to [OpenVINO System executorch ├── backends │ └── openvino +│ ├── quantizer +│ ├── observers +│ └── nncf_observers.py +│ ├── __init__.py +│ └── quantizer.py │ ├── runtime │ ├── OpenvinoBackend.cpp │ └── OpenvinoBackend.h @@ -95,6 +100,7 @@ Follow the steps below to setup your build environment: openvino_build.sh ``` - Optionally, `openvino_build.sh` script can be used to build python package or C++ bineries seperately. + **Build OpenVINO Backend Python Package with Pybindings**: To build and install the OpenVINO backend Python package with Python bindings, run the `openvino_build.sh` script with the `--enable_python` argument. This will compile and install the ExecuTorch Python package with the OpenVINO backend into your Python environment. This option will also enable python bindings required to execute OpenVINO backend tests and `aot_optimize_and_infer.py` script inside `executorch/examples/openvino` folder. 
```bash ./openvino_build.sh --enable_python From 35444aefa26b92b802305669fcef5a7ee857a654 Mon Sep 17 00:00:00 2001 From: Mustafa Cavus Date: Thu, 4 Sep 2025 15:59:36 -0700 Subject: [PATCH 29/85] Update README.md --- backends/openvino/README.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/backends/openvino/README.md b/backends/openvino/README.md index 71bd27f6b50..0046ad23486 100644 --- a/backends/openvino/README.md +++ b/backends/openvino/README.md @@ -95,21 +95,21 @@ Follow the steps below to setup your build environment: git clone --recurse-submodules https://github.com/pytorch/executorch.git ``` 3. **Build ExecuTorch with OpenVINO Backend** -- Ensure that you are inside `executorch/backends/openvino/scripts` directory. The following command builds and installs ExecuTorch with the OpenVINO backend, and also compiles the C++ runtime binaries into `/cmake-out` for quick inference testing. +- Ensure that you are inside `executorch/backends/openvino/scripts` directory. The following command builds and installs ExecuTorch with the OpenVINO backend, and also compiles the C++ runtime libraries and binaries into `/cmake-out` for quick inference testing. ```bash openvino_build.sh ``` -- Optionally, `openvino_build.sh` script can be used to build python package or C++ bineries seperately. +- Optionally, `openvino_build.sh` script can be used to build python package or C++ libraries/binaries separately. - **Build OpenVINO Backend Python Package with Pybindings**: To build and install the OpenVINO backend Python package with Python bindings, run the `openvino_build.sh` script with the `--enable_python` argument. This will compile and install the ExecuTorch Python package with the OpenVINO backend into your Python environment. This option will also enable python bindings required to execute OpenVINO backend tests and `aot_optimize_and_infer.py` script inside `executorch/examples/openvino` folder.
+ **Build OpenVINO Backend Python Package with Pybindings**: To build and install the OpenVINO backend Python package with Python bindings, run the `openvino_build.sh` script with the `--enable_python` argument as shown in the below command. This will compile and install the ExecuTorch Python package with the OpenVINO backend into your Python environment. This option will also enable python bindings required to execute OpenVINO backend tests and `aot_optimize_and_infer.py` script inside `executorch/examples/openvino` folder. ```bash ./openvino_build.sh --enable_python ``` - **Build C++ Runtime Libraries for OpenVINO Backend**: Run the `openvino_build.sh` script with the `--cpp_runtime` argument to build C++ runtime libraries into `/cmake-out` folder. `/cmake-out/backends/openvino/openvino_executor_runner` binary file can be used for quick inferencing with vision models. + **Build C++ Runtime Libraries for OpenVINO Backend**: Run the `openvino_build.sh` script with the `--cpp_runtime` flag to build the C++ runtime libraries as shown in the below command. The compiled library files and binaries can be found in the `/cmake-out` directory. The binary located at `/cmake-out/backends/openvino/openvino_executor_runner` can be used to run inference with vision models. ```bash ./openvino_build.sh --cpp_runtime ``` - **Build C++ Llama Runner**: This step requires first building the C++ runtime libraries by following the previous instructions. Then, run `openvino_build.sh` script with the `--llama_runner` argument to compile the llama runner to execute inference with models exported using `export_llama`. The compiled binary file is located in `/cmake-out/examples/models/llama/llama_main`. + **Build C++ Llama Runner**: First, ensure the C++ runtime libraries are built by following the earlier instructions.
Then, run the `openvino_build.sh` script with the `--llama_runner` flag to compile the Llama runner as shown in the below command, which enables executing inference with models exported using export_llama. The resulting binary is located at: `/cmake-out/examples/models/llama/llama_main` ```bash ./openvino_build.sh --llama_runner ``` From 5b8b633a94ca13b672db873e07725363c2e2014c Mon Sep 17 00:00:00 2001 From: Cavus Mustafa Date: Thu, 4 Sep 2025 17:18:03 -0700 Subject: [PATCH 30/85] formatting fix --- backends/openvino/partitioner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backends/openvino/partitioner.py b/backends/openvino/partitioner.py index a2920285f99..10d4b2b30a7 100644 --- a/backends/openvino/partitioner.py +++ b/backends/openvino/partitioner.py @@ -27,7 +27,7 @@ class PatternNode: - op_types = {} + op_types: dict[str, list] = {} def __init__(self): self.op_types = {} From f4a1423ddc5517495b0993d7d183450e4605f702 Mon Sep 17 00:00:00 2001 From: Cavus Mustafa Date: Thu, 4 Sep 2025 17:33:16 -0700 Subject: [PATCH 31/85] formatting fix --- backends/openvino/partitioner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backends/openvino/partitioner.py b/backends/openvino/partitioner.py index 10d4b2b30a7..4893a89bebb 100644 --- a/backends/openvino/partitioner.py +++ b/backends/openvino/partitioner.py @@ -114,7 +114,7 @@ def __init__( self.delegation_spec = DelegationSpec(OpenvinoBackend.__name__, compile_spec) self._op_types_to_skip = op_types_to_skip self._op_names_to_skip = op_names_to_skip - self._enabled_ops_by_name = set() + self._enabled_ops_by_name: set = set() def ops_to_not_decompose( self, From 44f08831df4d4707b1fba855299293ab435815f6 Mon Sep 17 00:00:00 2001 From: Cavus Mustafa Date: Thu, 4 Sep 2025 17:39:03 -0700 Subject: [PATCH 32/85] formatting fix --- backends/openvino/partitioner.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/backends/openvino/partitioner.py
b/backends/openvino/partitioner.py index 4893a89bebb..1d93ebd9cec 100644 --- a/backends/openvino/partitioner.py +++ b/backends/openvino/partitioner.py @@ -141,10 +141,10 @@ def check_pattern( self, node: torch.fx.Node, pattern: PatternNode, enabled_ops: list ) -> bool: if node.op == "call_function": - if ("call_function" + ":" + str(node.target.__name__)) in pattern.op_types: + if ("call_function" + ":" + str(node.target.__name__)) in pattern.op_types: # type: ignore[union-attr] pt_input_nodes = node.all_input_nodes pattern_input_ops = pattern.op_types[ - "call_function" + ":" + str(node.target.__name__) + "call_function" + ":" + str(node.target.__name__) # type: ignore[union-attr] ] if pattern_input_ops is None: enabled_ops.append(node) From 5f657d3ce8cdc34edf2c3129b274c02917a30231 Mon Sep 17 00:00:00 2001 From: Cavus Mustafa Date: Fri, 5 Sep 2025 10:02:55 -0700 Subject: [PATCH 33/85] formatting fix --- backends/openvino/partitioner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backends/openvino/partitioner.py b/backends/openvino/partitioner.py index 1d93ebd9cec..d4aff6fa7d3 100644 --- a/backends/openvino/partitioner.py +++ b/backends/openvino/partitioner.py @@ -27,7 +27,7 @@ class PatternNode: - op_types: dict[str, list] = {} + op_types: dict[str, Optional[list]] = {} def __init__(self): self.op_types = {} From eafcc33ab6bf99b0bfe8155f324af3e961cba279 Mon Sep 17 00:00:00 2001 From: Cavus Mustafa Date: Fri, 5 Sep 2025 10:06:29 -0700 Subject: [PATCH 34/85] formatting fix --- backends/openvino/partitioner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backends/openvino/partitioner.py b/backends/openvino/partitioner.py index d4aff6fa7d3..5ed9508ca89 100644 --- a/backends/openvino/partitioner.py +++ b/backends/openvino/partitioner.py @@ -193,7 +193,7 @@ def capture_nncf_patterns(self, graph_module: torch.fx.GraphModule): str(node.op) == "call_function" and str(node.target.__name__) == "aten.stack.default" ): - enabled_ops = 
[] + enabled_ops: list = [] pattern_match = self.check_pattern(node, stack_node, enabled_ops) if pattern_match: for pattern_op in enabled_ops: From 1763b99d7c7785a1b2f5c3152601924f97c07fea Mon Sep 17 00:00:00 2001 From: Cavus Mustafa Date: Fri, 5 Sep 2025 10:14:59 -0700 Subject: [PATCH 35/85] formatting fix --- backends/openvino/partitioner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backends/openvino/partitioner.py b/backends/openvino/partitioner.py index 5ed9508ca89..20841d6730b 100644 --- a/backends/openvino/partitioner.py +++ b/backends/openvino/partitioner.py @@ -138,7 +138,7 @@ def ops_to_not_decompose( return (ops_not_decompose, None) def check_pattern( - self, node: torch.fx.Node, pattern: PatternNode, enabled_ops: list + self, node: torch.fx.Node, pattern: type[PatternNode], enabled_ops: list ) -> bool: if node.op == "call_function": if ("call_function" + ":" + str(node.target.__name__)) in pattern.op_types: # type: ignore[union-attr] From 486382636b43a348512a934110f3215bbc67e842 Mon Sep 17 00:00:00 2001 From: Cavus Mustafa Date: Fri, 5 Sep 2025 10:23:35 -0700 Subject: [PATCH 36/85] formatting fix --- backends/openvino/quantizer/observers/nncf_observers.py | 4 ++-- backends/openvino/quantizer/quantizer.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/backends/openvino/quantizer/observers/nncf_observers.py b/backends/openvino/quantizer/observers/nncf_observers.py index f6ac2a3cb91..ac95b1bbef5 100644 --- a/backends/openvino/quantizer/observers/nncf_observers.py +++ b/backends/openvino/quantizer/observers/nncf_observers.py @@ -111,7 +111,7 @@ def convert(self, model: torch.fx.GraphModule, observer_node: torch.fx.Node): decompressor_name, )(model) decomp_node = observer_node.args[0] - observer_node.replace_all_uses_with(decomp_node) + observer_node.replace_all_uses_with(decomp_node) # type: ignore[arg-type] model.graph.erase_node(observer_node) @@ -172,5 +172,5 @@ def convert(self, model: torch.fx.GraphModule, 
observer_node: torch.fx.Node): decompressor_name, )(model) decomp_node = observer_node.args[0] - observer_node.replace_all_uses_with(decomp_node) + observer_node.replace_all_uses_with(decomp_node) # type: ignore[arg-type] model.graph.erase_node(observer_node) diff --git a/backends/openvino/quantizer/quantizer.py b/backends/openvino/quantizer/quantizer.py index cd78f6907c7..84e29239419 100644 --- a/backends/openvino/quantizer/quantizer.py +++ b/backends/openvino/quantizer/quantizer.py @@ -75,7 +75,7 @@ class OpenVINOQuantizer(Quantizer): def __init__( self, *, - mode: Optional[QuantizationMode] = QuantizationMode.INT8_SYM, + mode: QuantizationMode = QuantizationMode.INT8_SYM, **kwargs, ): """ From e24072fc68c7884b62a437de3d8d2b7f60cd9efe Mon Sep 17 00:00:00 2001 From: Cavus Mustafa Date: Fri, 5 Sep 2025 10:43:00 -0700 Subject: [PATCH 37/85] formatting fix --- backends/openvino/quantizer/quantizer.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/backends/openvino/quantizer/quantizer.py b/backends/openvino/quantizer/quantizer.py index 84e29239419..5cbd50c3136 100644 --- a/backends/openvino/quantizer/quantizer.py +++ b/backends/openvino/quantizer/quantizer.py @@ -208,7 +208,7 @@ def annotate(self, model: torch.fx.GraphModule) -> torch.fx.GraphModule: edge_or_node, annotation = self._get_edge_or_node_and_annotation( graph, nncf_graph, qp, node_vs_torch_annotation ) - qspec: QuantizationSpecBase = self._get_torch_ao_qspec_from_nncf_config( + qspec: QuantizationSpecBase = self._get_torch_ao_qspec_from_nncf_config( # type: ignore[no-redef] qp ) self._fill_torch_ao_annotation(edge_or_node, qspec, annotation) @@ -412,9 +412,9 @@ def _get_torch_ao_qspec_from_nncf_config( else MappingType.ASYMMETRIC ) if qconfig.num_bits == 4: - extra_args["mapping_type"] = mapping_type - extra_args["target_dtype"] = torch.int8 - extra_args["granularity"] = PerGroup(group_size=group_size) + extra_args["mapping_type"] = mapping_type # type: ignore[assignment] + 
extra_args["target_dtype"] = torch.int8 # type: ignore[assignment] + extra_args["granularity"] = PerGroup(group_size=group_size) # type: ignore[assignment] observer = PTPerBlockParamObserver quant_min = -8 quant_max = 7 From b9bb5f08224544f9f4e9a6896bf756fc41462ce3 Mon Sep 17 00:00:00 2001 From: Cavus Mustafa Date: Fri, 5 Sep 2025 10:51:16 -0700 Subject: [PATCH 38/85] formatting fix --- backends/openvino/quantizer/quantizer.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/backends/openvino/quantizer/quantizer.py b/backends/openvino/quantizer/quantizer.py index 5cbd50c3136..aef9e56876b 100644 --- a/backends/openvino/quantizer/quantizer.py +++ b/backends/openvino/quantizer/quantizer.py @@ -391,6 +391,10 @@ def _get_torch_ao_qspec_from_nncf_config( extra_args = {"eps": 1e-16} is_weight = qp.is_weight_quantization_point() qconfig = qp.qconfig + dtype = None + quant_min = None + quant_max = None + channel_axis = None observer: Type[UniformQuantizationObserverBase] if qconfig.per_channel: From 291dcd993e17136a3609e30919aa4d406ed54113 Mon Sep 17 00:00:00 2001 From: Cavus Mustafa Date: Fri, 5 Sep 2025 10:56:31 -0700 Subject: [PATCH 39/85] formatting fix --- backends/openvino/quantizer/quantizer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backends/openvino/quantizer/quantizer.py b/backends/openvino/quantizer/quantizer.py index aef9e56876b..f2011431a03 100644 --- a/backends/openvino/quantizer/quantizer.py +++ b/backends/openvino/quantizer/quantizer.py @@ -391,7 +391,7 @@ def _get_torch_ao_qspec_from_nncf_config( extra_args = {"eps": 1e-16} is_weight = qp.is_weight_quantization_point() qconfig = qp.qconfig - dtype = None + dtype = torch.int8 quant_min = None quant_max = None channel_axis = None From c8ea777098b8a812e6162b767dbfeabdd7c193c4 Mon Sep 17 00:00:00 2001 From: anzr299 Date: Sat, 6 Sep 2025 13:39:52 +0400 Subject: [PATCH 40/85] use new transformations --- backends/openvino/quantizer/observers.py | 17 ++++++++--------- 1 file changed, 
8 insertions(+), 9 deletions(-) diff --git a/backends/openvino/quantizer/observers.py b/backends/openvino/quantizer/observers.py index d44a22556dd..76ab33eb5c5 100644 --- a/backends/openvino/quantizer/observers.py +++ b/backends/openvino/quantizer/observers.py @@ -15,8 +15,9 @@ get_tensor_constant_from_node, ) from nncf.experimental.torch.fx.transformations import ( # type: ignore[import-untyped] - constant_update_fn, - module_insertion_transformation_builder, + constant_update, + module_insertion, + node_removal, ) from nncf.quantization.algorithms.weight_compression.config import ( # type: ignore[import-untyped] WeightCompressionParameters, @@ -103,7 +104,7 @@ def convert( packed_q_weight = decompressor.pack_weight(q_weight) # Weight port id is 0 since observer is inserted for a single weight only. - constant_update_fn(model, observer_node, packed_q_weight, input_port_id=0) + constant_update(model, observer_node, packed_q_weight, input_port_id=0) compressed_weight_name = observer_node.all_input_nodes[0].name decompressor_suffix = "_".join( @@ -111,7 +112,8 @@ def convert( ) decompressor_name = f"{decompressor.quantization_mode}_weights_decompressor_{decompressor_suffix}" - module_insertion_transformation_builder( + module_insertion( + model, decompressor, [ PTTargetPoint( @@ -120,11 +122,8 @@ def convert( ) ], decompressor_name, - )(model) - - decomp_node = observer_node.args[0] - observer_node.replace_all_uses_with(decomp_node) # type: ignore[arg-type] - model.graph.erase_node(observer_node) + ) + node_removal(model, observer_node, 0) @abstractmethod def _create_decompressor( From a6b605f41b5390ff9de70b2397a2d00003f34ff2 Mon Sep 17 00:00:00 2001 From: anzr299 Date: Sat, 6 Sep 2025 13:46:24 +0400 Subject: [PATCH 41/85] add comment for manual MP allocation --- extension/llm/export/quantizer_lib.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/extension/llm/export/quantizer_lib.py b/extension/llm/export/quantizer_lib.py index 
9220c1efbdc..e839827208c 100644 --- a/extension/llm/export/quantizer_lib.py +++ b/extension/llm/export/quantizer_lib.py @@ -233,7 +233,7 @@ def get_ov_quantizer( ), f"The quantization config is for backend {backend} instead of openvino." assert group_size != None, "Group Size None is Not Supported. It should be set to -1 for per-channel." - # Manually ignore MP layers. + # (TODO) Manually ignore MP layers. This is done manually for now till we use the dynamic allocation MP fp_node_names = [ "linear_14", "linear_15", From 9614fc4da170d76a39e047d0c364177bf96d0209 Mon Sep 17 00:00:00 2001 From: anzr299 Date: Sat, 6 Sep 2025 13:48:58 +0400 Subject: [PATCH 42/85] remove nncf_compression from export llama lib --- examples/models/llama/export_llama_lib.py | 54 +---------------------- 1 file changed, 1 insertion(+), 53 deletions(-) diff --git a/examples/models/llama/export_llama_lib.py b/examples/models/llama/export_llama_lib.py index 8eab3eefbc0..ac52893b99c 100644 --- a/examples/models/llama/export_llama_lib.py +++ b/examples/models/llama/export_llama_lib.py @@ -567,13 +567,6 @@ def build_args_parser() -> argparse.ArgumentParser: help="path to the input pruning token mapping file (token_map.json)", ) - parser.add_argument( - "--nncf_compression", - default=False, - action="store_true", - help="Enables nncf compression for openvino backend", - ) - parser.add_argument( "--export_only", default=False, @@ -909,7 +902,6 @@ def _to_edge_and_lower_llama_openvino( quantizers, additional_passes, openvino_device: str = "CPU", - nncf_compression: bool = False, verbose: bool = False, ) -> LLMEdgeManager: # noqa: C901 partitioners = [] @@ -921,51 +913,8 @@ def _to_edge_and_lower_llama_openvino( logging.info("Lowering model using following partitioner(s): ") for partitioner in partitioners: logging.info(f"--> {partitioner.__class__.__name__}") - try: - import nncf - from functools import partial - from pytorch_tokenizers import get_tokenizer - except ImportError: - raise ImportError( 
- "Please install nncf via backends/openvino/requirements.txt" - ) - - tokenizer = get_tokenizer(builder_exported.tokenizer_path) - from datasets import load_dataset - # Use NNCF compression if enabled - # TODO: Enable passing OpenVINOQuantizer as a parameter to pt2e_quantize - if nncf_compression: - dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="test") - dataset = dataset.filter(lambda example: example['text'].strip() != "") - dataset = dataset.filter(lambda example: example['text'].strip() != "\n") - def transform_fn( - prompts: str, tokenizer - ): - tokenized_text = tokenizer.encode(prompts["text"], bos=False, eos=False) - device = torch.device("cpu") if openvino_device=="CPU" else torch.device("cuda") - inputs = () - inputs = ( - torch.tensor(tokenized_text[:128], device=device).unsqueeze(0), - {"input_pos": torch.tensor([0], device=device)}, - ) - - return inputs - - builder_exported.pre_autograd_graph_module = nncf.compress_weights( - builder_exported.pre_autograd_graph_module, - dataset=nncf.Dataset(dataset, partial(transform_fn, tokenizer=tokenizer)), - mode=nncf.CompressWeightsMode.INT4_SYM, - group_size=32, - backup_mode=nncf.BackupMode.NONE, - ratio=0.8, - sensitivity_metric=nncf.SensitivityMetric.HESSIAN_INPUT_ACTIVATION, - ) - - builder = builder_exported.to_edge_transform_and_lower(partitioners) - - else: - builder = builder_exported.pt2e_quantize(quantizers).to_edge_transform_and_lower(partitioners) + builder = builder_exported.pt2e_quantize(quantizers).to_edge_transform_and_lower(partitioners) if verbose: print_delegation_info(builder.edge_manager.exported_program().graph_module) @@ -1211,7 +1160,6 @@ def _export_llama(llm_config: LlmConfig) -> LLMEdgeManager: # noqa: C901 quantizers, additional_passes, openvino_device=llm_config.backend.openvino.device, - nncf_compression=llm_config.backend.openvino.nncf_compression, verbose=llm_config.debug.verbose, ) else: From 45007cf90c054ccfd527874ae35d383fc34a4ee8 Mon Sep 17 00:00:00 2001 
From: anzr299 Date: Sat, 6 Sep 2025 13:52:58 +0400 Subject: [PATCH 43/85] change pt2e quantize flag to use openvino_4wo instead of openvino_8da4w and so on --- extension/llm/export/config/llm_config.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/extension/llm/export/config/llm_config.py b/extension/llm/export/config/llm_config.py index b4175d54cd7..49855d61e6e 100644 --- a/extension/llm/export/config/llm_config.py +++ b/extension/llm/export/config/llm_config.py @@ -275,8 +275,8 @@ class Pt2eQuantize(str, Enum): xnnpack_dynamic = "xnnpack_dynamic" xnnpack_dynamic_qc4 = "xnnpack_dynamic_qc4" - openvino_8da4w = "openvino_8da4w" - openvino_8da8w = "openvino_8da8w" + openvino_4wo = "openvino_4wo" + openvino_8wo = "openvino_8wo" qnn_8a8w = "qnn_8a8w" qnn_16a16w = "qnn_16a16w" qnn_16a4w = "qnn_16a4w" From 9d494147457e6696f7149e4b7cb69f95811cbd47 Mon Sep 17 00:00:00 2001 From: anzr299 Date: Sat, 6 Sep 2025 13:53:14 +0400 Subject: [PATCH 44/85] follow up to last commit --- examples/models/llama/export_llama_lib.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/models/llama/export_llama_lib.py b/examples/models/llama/export_llama_lib.py index ac52893b99c..ec03f4b26c9 100644 --- a/examples/models/llama/export_llama_lib.py +++ b/examples/models/llama/export_llama_lib.py @@ -206,8 +206,8 @@ def build_args_parser() -> argparse.ArgumentParser: choices=[ "xnnpack_dynamic", "xnnpack_dynamic_qc4", - "openvino_8da4w", - "openvino_8da8w", + "openvino_4wo", + "openvino_8wo", "qnn_8a8w", "qnn_16a16w", "qnn_16a4w", From d6727cfed609d07281fdea42358d2e234ac82f19 Mon Sep 17 00:00:00 2001 From: anzr299 Date: Sat, 6 Sep 2025 13:56:47 +0400 Subject: [PATCH 45/85] update quantizer lib with openvino_4wo --- extension/llm/export/quantizer_lib.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/extension/llm/export/quantizer_lib.py b/extension/llm/export/quantizer_lib.py index e839827208c..8a097f9b8f1 100644 --- 
a/extension/llm/export/quantizer_lib.py +++ b/extension/llm/export/quantizer_lib.py @@ -263,10 +263,10 @@ def get_ov_quantizer( "linear_109", "linear_110",] - if quant_config == "8da4w": + if quant_config == "4wo": mode = QuantizationMode.INT4WO_SYM - elif quant_config == "8da8w": + elif quant_config == "8wo": group_size = -1 mode = QuantizationMode.INT8WO_SYM else: From 4a0a7819ab69aa0d8fdfce70f3be219c14abc409 Mon Sep 17 00:00:00 2001 From: anzr299 Date: Sat, 6 Sep 2025 14:06:48 +0400 Subject: [PATCH 46/85] split qspec function into 2 parts; 1 for WC and other for PTQ qspecs --- backends/openvino/quantizer/quantizer.py | 92 +++++++++++++----------- 1 file changed, 50 insertions(+), 42 deletions(-) diff --git a/backends/openvino/quantizer/quantizer.py b/backends/openvino/quantizer/quantizer.py index 7f86686d03c..ef04ed0de46 100644 --- a/backends/openvino/quantizer/quantizer.py +++ b/backends/openvino/quantizer/quantizer.py @@ -188,8 +188,8 @@ def _annotate_weight_compression( ) annotation = node_vs_torch_annotation[target_node] edge_or_node = self._get_weight_edge(target_node, nncf_graph) - qspec = self._get_torch_ao_qspec_from_nncf_config( - qp=None, wc_param=wc_param + qspec = self._get_torch_ao_qspec_from_nncf_config_for_wc( + wc_param=wc_param ) self._fill_torch_ao_annotation(edge_or_node, qspec, annotation) @@ -217,7 +217,7 @@ def _annotate_post_training_quantization( edge_or_node, annotation = self._get_edge_or_node_and_annotation( graph, nncf_graph, qp, node_vs_torch_annotation ) - qspec: QuantizationSpecBase = self._get_torch_ao_qspec_from_nncf_config(qp) + qspec: QuantizationSpecBase = self._get_torch_ao_qspec_from_nncf_config_for_ptq(qp) self._fill_torch_ao_annotation(edge_or_node, qspec, annotation) for quantizer_ids in quantization_setup.unified_scale_groups.values(): @@ -412,18 +412,58 @@ def _fill_torch_ao_annotation( annotation_to_update.input_qspec_map[edge_or_node[0]] = qspec @staticmethod - def _get_torch_ao_qspec_from_nncf_config( + def 
_get_torch_ao_qspec_from_nncf_config_for_wc( + wc_param: WeightCompressionParameters, + ) -> QuantizationSpec: + """ + Returns a TorchAO QuantizationSpec based on NNCF weight compression parameter. + + :param wc_param: NNCF Weight compression parameters for the node. + :return: A TorchAO QuantizationSpec. + """ + observer: Type[UniformQuantizationObserverBase] + + extra_args: Dict[str, Any] = {} + + qmode = wc_param.compression_config.mode + if qmode in [nncf.CompressWeightsMode.INT4_ASYM, nncf.CompressWeightsMode.INT4_SYM]: + extra_args["wc_param"] = wc_param + observer = INT4WeightObserver + quant_min = -8 if not wc_param.compression_config.is_asym_mode else 0 + quant_max = 7 if not wc_param.compression_config.is_asym_mode else 15 + dtype = torch.int8 + channel_axis = 0 + torch_qscheme = None + else: + extra_args["wc_param"] = wc_param + observer = INT8WeightObserver + quant_min = -128 if not wc_param.compression_config.is_asym_mode else 0 + quant_max = 127 if not wc_param.compression_config.is_asym_mode else 255 + dtype = torch.int8 + channel_axis = 0 + torch_qscheme = ( + torch.per_channel_symmetric + if qmode == QuantizationMode.INT8WO_SYM + else torch.per_channel_affine + ) + return QuantizationSpec( + dtype=dtype, + observer_or_fake_quant_ctr=observer.with_args(**extra_args), + quant_min=quant_min, + quant_max=quant_max, + qscheme=torch_qscheme, + ch_axis=channel_axis, + is_dynamic=False, + ) + + @staticmethod + def _get_torch_ao_qspec_from_nncf_config_for_ptq( qp: quantization.quantizer_setup.QuantizationPointBase, - wc_param: WeightCompressionParameters = None, ) -> QuantizationSpec: """ - Returns a TorchAO QuantizationSpec based on NNCF quantization config and other arguments. - For weight-only quantization (e.g., INT4/INT8 compression), uses `wc_param` which carries - weight only quantization info such as group_size, reduction_axes etc. For post-training - quantization, only `qp` is required. 
+ Returns a TorchAO QuantizationSpec based on NNCF quantization point. :param qp: Quantization point from NNCF. - :param wc_param: NNCF Weight compression parameters for the node. :return: A TorchAO QuantizationSpec. """ observer: Type[UniformQuantizationObserverBase] @@ -431,38 +471,6 @@ def _get_torch_ao_qspec_from_nncf_config( # Eps value is copied from nncf/torch/quantization/layers.py extra_args: Dict[str, Any] = {"eps": 1e-16} - if wc_param: - qmode = wc_param.compression_config.mode - if qmode in [nncf.CompressWeightsMode.INT4_ASYM, nncf.CompressWeightsMode.INT4_SYM]: - extra_args["wc_param"] = wc_param - observer = INT4WeightObserver - quant_min = -8 if not wc_param.compression_config.is_asym_mode else 0 - quant_max = 7 if not wc_param.compression_config.is_asym_mode else 15 - dtype = torch.int8 - channel_axis = 0 - torch_qscheme = None - else: - extra_args["wc_param"] = wc_param - observer = INT8WeightObserver - quant_min = -128 if not wc_param.compression_config.is_asym_mode else 0 - quant_max = 127 if not wc_param.compression_config.is_asym_mode else 255 - dtype = torch.int8 - channel_axis = 0 - torch_qscheme = ( - torch.per_channel_symmetric - if qmode == QuantizationMode.INT8WO_SYM - else torch.per_channel_affine - ) - return QuantizationSpec( - dtype=dtype, - observer_or_fake_quant_ctr=observer.with_args(**extra_args), - quant_min=quant_min, - quant_max=quant_max, - qscheme=torch_qscheme, - ch_axis=channel_axis, - is_dynamic=False, - ) - is_weight = qp.is_weight_quantization_point() qconfig = qp.qconfig From f6a1ee3d708ca46fe495f081bc45872042b1bed6 Mon Sep 17 00:00:00 2001 From: anzr299 Date: Mon, 8 Sep 2025 12:14:34 +0400 Subject: [PATCH 47/85] micro fix --- backends/openvino/quantizer/quantizer.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/backends/openvino/quantizer/quantizer.py b/backends/openvino/quantizer/quantizer.py index ef04ed0de46..762ed2a9171 100644 --- a/backends/openvino/quantizer/quantizer.py 
+++ b/backends/openvino/quantizer/quantizer.py @@ -426,24 +426,29 @@ def _get_torch_ao_qspec_from_nncf_config_for_wc( extra_args: Dict[str, Any] = {} qmode = wc_param.compression_config.mode + is_asym_mode = wc_param.compression_config.is_asym_mode if qmode in [nncf.CompressWeightsMode.INT4_ASYM, nncf.CompressWeightsMode.INT4_SYM]: extra_args["wc_param"] = wc_param observer = INT4WeightObserver - quant_min = -8 if not wc_param.compression_config.is_asym_mode else 0 - quant_max = 7 if not wc_param.compression_config.is_asym_mode else 15 + quant_min = -8 if not is_asym_mode else 0 + quant_max = 7 if not is_asym_mode else 15 dtype = torch.int8 channel_axis = 0 - torch_qscheme = None + torch_qscheme = torch_qscheme = ( + torch.per_channel_symmetric + if not is_asym_mode + else torch.per_channel_affine + ) else: extra_args["wc_param"] = wc_param observer = INT8WeightObserver - quant_min = -128 if not wc_param.compression_config.is_asym_mode else 0 - quant_max = 127 if not wc_param.compression_config.is_asym_mode else 255 + quant_min = -128 if not is_asym_mode else 0 + quant_max = 127 if not is_asym_mode else 255 dtype = torch.int8 channel_axis = 0 torch_qscheme = ( torch.per_channel_symmetric - if qmode == QuantizationMode.INT8WO_SYM + if not is_asym_mode else torch.per_channel_affine ) return QuantizationSpec( From d285fcce354f8bde55e968892932cbe4a34421cd Mon Sep 17 00:00:00 2001 From: anzr299 Date: Mon, 8 Sep 2025 15:35:49 +0400 Subject: [PATCH 48/85] udpate mixed precision layers for higher accuracy. Change INT4 mode to Asymmetric --- extension/llm/export/quantizer_lib.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/extension/llm/export/quantizer_lib.py b/extension/llm/export/quantizer_lib.py index 8a097f9b8f1..46b10dcb960 100644 --- a/extension/llm/export/quantizer_lib.py +++ b/extension/llm/export/quantizer_lib.py @@ -235,21 +235,17 @@ def get_ov_quantizer( # (TODO) Manually ignore MP layers. 
This is done manually for now till we use the dynamic allocation MP fp_node_names = [ + "linear_13", "linear_14", - "linear_15", "linear_35", "linear_56", - "linear_57", - "linear_63", "linear_70", "linear_71", "linear_77", "linear_78", - "linear_81", "linear_84", "linear_85", "linear_88", - "linear_89", "linear_91", "linear_92", "linear_95", @@ -261,10 +257,11 @@ def get_ov_quantizer( "linear_105", "linear_106", "linear_109", - "linear_110",] + "linear_110", + "linear_111",] if quant_config == "4wo": - mode = QuantizationMode.INT4WO_SYM + mode = QuantizationMode.INT4WO_ASYM elif quant_config == "8wo": group_size = -1 From 4e66df1a52e40e90178f4c9fce815d364c5282f9 Mon Sep 17 00:00:00 2001 From: Aamir Nazir Date: Mon, 8 Sep 2025 18:12:37 +0400 Subject: [PATCH 49/85] Apply suggestions from code review Co-authored-by: Daniil Lyakhov --- backends/openvino/quantizer/observers.py | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/backends/openvino/quantizer/observers.py b/backends/openvino/quantizer/observers.py index 76ab33eb5c5..59a40f2be2d 100644 --- a/backends/openvino/quantizer/observers.py +++ b/backends/openvino/quantizer/observers.py @@ -56,9 +56,9 @@ def __init__( :param dtype: target dtype for the quantization. """ super().__init__(dtype=dtype, is_dynamic=False) - self.wc_param = wc_param + self._wc_param = wc_param - def calculate_qparams( # type: ignore[override] + def _calculate_qparams( # type: ignore[override] self, weight: torch.Tensor, ) -> Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]: @@ -68,7 +68,7 @@ def calculate_qparams( # type: ignore[override] :param weight: FP weight to be used for calculating qparams. :return: A tuple containing the quantized weight, quantization scale and quantization zero point. 
""" - wc_param = self.get_wc_param() + wc_param = self._wc_param wc_config = wc_param.compression_config reduction_axes = wc_param.reduction_axes q_weight, scale, zp = do_integer_quantization( @@ -143,13 +143,6 @@ def _create_decompressor( :return: NNCF observer according to the qmode which creates the decompression subgraph supported by OpenVINO. """ - def get_wc_param(self) -> WeightCompressionParameters: - """ - Returns a respective NNCF Weight Compression Config. - - :return: Weight compression config with the compression information such as qmode, group_size etc. - """ - return self.wc_param class INT4WeightObserver(WeightObserverBase): """ From e850e419cb313e86fd0f5669e7eaa1d115fcc10c Mon Sep 17 00:00:00 2001 From: anzr299 Date: Mon, 8 Sep 2025 18:13:28 +0400 Subject: [PATCH 50/85] Review changes --- backends/openvino/quantizer/observers.py | 30 ++++++++++++------------ 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/backends/openvino/quantizer/observers.py b/backends/openvino/quantizer/observers.py index 59a40f2be2d..457399117e0 100644 --- a/backends/openvino/quantizer/observers.py +++ b/backends/openvino/quantizer/observers.py @@ -94,7 +94,7 @@ def convert( """ weight_node = observer_node.args[0] original_weight = get_tensor_constant_from_node(weight_node, model) - q_weight, scale, zero_point = self.calculate_qparams( + q_weight, scale, zero_point = self._calculate_qparams( original_weight ) @@ -156,18 +156,17 @@ def _create_decompressor( q_weight: torch.Tensor, original_weight: torch.Tensor, ) -> BaseWeightsDecompressor: - if zero_point is not None: - return INT4AsymmetricWeightsDecompressor( - scale, - zero_point, - q_weight.shape, - original_weight.shape, - original_weight.dtype, - ) - else: + if zero_point is None: return INT4SymmetricWeightsDecompressor( scale, q_weight.shape, original_weight.shape, original_weight.dtype ) + return INT4AsymmetricWeightsDecompressor( + scale, + zero_point, + q_weight.shape, + original_weight.shape, + 
original_weight.dtype, + ) class INT8WeightObserver(WeightObserverBase): @@ -182,10 +181,11 @@ def _create_decompressor( q_weight: torch.Tensor, original_weight: torch.Tensor, ) -> BaseWeightsDecompressor: - if zero_point is not None: - return INT8AsymmetricWeightsDecompressor( - scale, zero_point, original_weight.dtype + if zero_point is None: + return INT8SymmetricWeightsDecompressor( + scale, original_weight.dtype ) - else: - return INT8SymmetricWeightsDecompressor(scale, original_weight.dtype) + return INT8AsymmetricWeightsDecompressor( + scale, zero_point, original_weight.dtype + ) From 204043f973ba928c3f2b73dc11e1db6572b7c4a7 Mon Sep 17 00:00:00 2001 From: anzr299 Date: Mon, 8 Sep 2025 18:33:16 +0400 Subject: [PATCH 51/85] review changes in quantizer --- backends/openvino/quantizer/quantizer.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/backends/openvino/quantizer/quantizer.py b/backends/openvino/quantizer/quantizer.py index 762ed2a9171..7e0e3c92af0 100644 --- a/backends/openvino/quantizer/quantizer.py +++ b/backends/openvino/quantizer/quantizer.py @@ -175,7 +175,6 @@ def _annotate_weight_compression( :param graph: The underlying FX graph. :param nncf_graph: The corresponding NNCF graph. :param node_vs_torch_annotation: A mapping of FX nodes to quantization annotations. - :return: Updated mapping of FX nodes with weight compression annotations. """ self._algo.set_backend_entity(model) @@ -343,7 +342,7 @@ def _get_edge_or_node_and_annotation( def _get_weight_edge( target_node: torch.fx.Node, nncf_graph: NNCFGraph, - ): + ) -> tuple[torch.fx.Node, torch.fx.Node]: """ Returns the FX node corresponding to the weight tensor input of a given operator node. Uses the NNCF graph to identify which input port of the target node holds the weight. @@ -351,7 +350,6 @@ def _get_weight_edge( :param target_node: FX node representing a weighted operation (e.g., Linear, Conv). :param nncf_graph: NNCFGraph used to determine weight port indices. 
- :return: Edge represented by a Tuple of (weight_node, target_node), where weight_node is the FX node supplying the weight. """ nncf_node = nncf_graph.get_node_by_name(target_node.name) @@ -428,7 +426,6 @@ def _get_torch_ao_qspec_from_nncf_config_for_wc( qmode = wc_param.compression_config.mode is_asym_mode = wc_param.compression_config.is_asym_mode if qmode in [nncf.CompressWeightsMode.INT4_ASYM, nncf.CompressWeightsMode.INT4_SYM]: - extra_args["wc_param"] = wc_param observer = INT4WeightObserver quant_min = -8 if not is_asym_mode else 0 quant_max = 7 if not is_asym_mode else 15 @@ -440,7 +437,6 @@ def _get_torch_ao_qspec_from_nncf_config_for_wc( else torch.per_channel_affine ) else: - extra_args["wc_param"] = wc_param observer = INT8WeightObserver quant_min = -128 if not is_asym_mode else 0 quant_max = 127 if not is_asym_mode else 255 @@ -453,7 +449,7 @@ def _get_torch_ao_qspec_from_nncf_config_for_wc( ) return QuantizationSpec( dtype=dtype, - observer_or_fake_quant_ctr=observer.with_args(**extra_args), + observer_or_fake_quant_ctr=observer.with_args(wc_param=wc_param), quant_min=quant_min, quant_max=quant_max, qscheme=torch_qscheme, From ae6b089f293d20248df4c3d8a0d0c5ddfed62c4c Mon Sep 17 00:00:00 2001 From: anzr299 Date: Mon, 8 Sep 2025 18:45:54 +0400 Subject: [PATCH 52/85] revert extra args changes --- backends/openvino/quantizer/quantizer.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/backends/openvino/quantizer/quantizer.py b/backends/openvino/quantizer/quantizer.py index 7e0e3c92af0..89d528f8d16 100644 --- a/backends/openvino/quantizer/quantizer.py +++ b/backends/openvino/quantizer/quantizer.py @@ -424,6 +424,7 @@ def _get_torch_ao_qspec_from_nncf_config_for_wc( extra_args: Dict[str, Any] = {} qmode = wc_param.compression_config.mode + extra_args["wc_param"] = wc_param is_asym_mode = wc_param.compression_config.is_asym_mode if qmode in [nncf.CompressWeightsMode.INT4_ASYM, nncf.CompressWeightsMode.INT4_SYM]: observer = 
INT4WeightObserver @@ -449,7 +450,7 @@ def _get_torch_ao_qspec_from_nncf_config_for_wc( ) return QuantizationSpec( dtype=dtype, - observer_or_fake_quant_ctr=observer.with_args(wc_param=wc_param), + observer_or_fake_quant_ctr=observer.with_args(**extra_args), quant_min=quant_min, quant_max=quant_max, qscheme=torch_qscheme, From 2de569398917362b9ffc02849037528c2a15efa7 Mon Sep 17 00:00:00 2001 From: anzr299 Date: Tue, 9 Sep 2025 11:43:00 +0400 Subject: [PATCH 53/85] precommit fixes --- backends/openvino/quantizer/observers.py | 11 +++------ backends/openvino/quantizer/quantizer.py | 30 +++++++++++++---------- examples/models/llama/export_llama_lib.py | 6 +++-- extension/llm/export/quantizer_lib.py | 21 +++++++++------- 4 files changed, 36 insertions(+), 32 deletions(-) diff --git a/backends/openvino/quantizer/observers.py b/backends/openvino/quantizer/observers.py index 457399117e0..faeb4fa7a60 100644 --- a/backends/openvino/quantizer/observers.py +++ b/backends/openvino/quantizer/observers.py @@ -84,7 +84,7 @@ def convert( self, model: torch.fx.GraphModule, observer_node: torch.fx.Node ) -> None: """ - Replaces the given observer node from the given model with a quantized + Replaces the given observer node from the given model with a quantized weight and a OpenVINO specific decompression module. 
:param model: A `torch.fx.GraphModule` representing the statically traced model @@ -94,9 +94,7 @@ def convert( """ weight_node = observer_node.args[0] original_weight = get_tensor_constant_from_node(weight_node, model) - q_weight, scale, zero_point = self._calculate_qparams( - original_weight - ) + q_weight, scale, zero_point = self._calculate_qparams(original_weight) decompressor = self._create_decompressor( scale, zero_point, q_weight, original_weight @@ -182,10 +180,7 @@ def _create_decompressor( original_weight: torch.Tensor, ) -> BaseWeightsDecompressor: if zero_point is None: - return INT8SymmetricWeightsDecompressor( - scale, original_weight.dtype - ) + return INT8SymmetricWeightsDecompressor(scale, original_weight.dtype) return INT8AsymmetricWeightsDecompressor( scale, zero_point, original_weight.dtype ) - diff --git a/backends/openvino/quantizer/quantizer.py b/backends/openvino/quantizer/quantizer.py index 9db79fce9f9..bef1ef3274f 100644 --- a/backends/openvino/quantizer/quantizer.py +++ b/backends/openvino/quantizer/quantizer.py @@ -12,7 +12,6 @@ import nncf # type: ignore[import-untyped] import nncf.common.quantization as quantization # type: ignore[import-untyped] -from nncf.common.scopes import should_consider_scope # type: ignore[import-untyped] import nncf.experimental.torch.fx as nncf_fx # type: ignore[import-untyped] import torch.fx @@ -21,12 +20,12 @@ INT8WeightObserver, ) from nncf.common.graph.graph import NNCFGraph # type: ignore[import-untyped] -from nncf.quantization.quantize_model import ( # type: ignore[import-untyped] - get_weight_compression_configuration, -) from nncf.quantization.algorithms.weight_compression.config import ( # type: ignore[import-untyped] WeightCompressionParameters, ) +from nncf.quantization.quantize_model import ( # type: ignore[import-untyped] + get_weight_compression_configuration, +) from torchao.quantization.pt2e import ( HistogramObserver, PerChannelMinMaxObserver, @@ -118,7 +117,7 @@ def __init__( ), # Mode 
value has to match NNCF CompressWeightsMode **kwargs, ) - subset_size = 1 # Doesn't really matter in this case since it is data-free. Should just be +ve + subset_size = 1 # Doesn't really matter in this case since it is data-free. Should just be +ve self._algo = nncf.quantization.algorithms.weight_compression.algorithm.WeightCompression( subset_size=subset_size, **weight_compression_configuration ) @@ -178,7 +177,9 @@ def _annotate_weight_compression( :return: Updated mapping of FX nodes with weight compression annotations. """ self._algo.set_backend_entity(model) - all_wc_params, _ = self._algo.get_weight_compression_parameters(model, nncf_graph) + all_wc_params, _ = self._algo.get_weight_compression_parameters( + model, nncf_graph + ) for wc_param in all_wc_params: node_with_weight = wc_param.node_with_weight @@ -187,9 +188,7 @@ def _annotate_weight_compression( ) annotation = node_vs_torch_annotation[target_node] edge_or_node = self._get_weight_edge(target_node, nncf_graph) - qspec = self._get_torch_ao_qspec_from_nncf_config_for_wc( - wc_param=wc_param - ) + qspec = self._get_torch_ao_qspec_from_nncf_config_for_wc(wc_param=wc_param) self._fill_torch_ao_annotation(edge_or_node, qspec, annotation) return node_vs_torch_annotation @@ -216,7 +215,9 @@ def _annotate_post_training_quantization( edge_or_node, annotation = self._get_edge_or_node_and_annotation( graph, nncf_graph, qp, node_vs_torch_annotation ) - qspec: QuantizationSpecBase = self._get_torch_ao_qspec_from_nncf_config_for_ptq(qp) + qspec: QuantizationSpecBase = ( + self._get_torch_ao_qspec_from_nncf_config_for_ptq(qp) + ) self._fill_torch_ao_annotation(edge_or_node, qspec, annotation) for quantizer_ids in quantization_setup.unified_scale_groups.values(): @@ -426,8 +427,11 @@ def _get_torch_ao_qspec_from_nncf_config_for_wc( qmode = wc_param.compression_config.mode extra_args["wc_param"] = wc_param is_asym_mode = wc_param.compression_config.is_asym_mode - if qmode in [nncf.CompressWeightsMode.INT4_ASYM, 
nncf.CompressWeightsMode.INT4_SYM]: - observer = INT4WeightObserver + if qmode in [ + nncf.CompressWeightsMode.INT4_ASYM, + nncf.CompressWeightsMode.INT4_SYM, + ]: + observer = INT4WeightObserver # type: ignore[type-abstract] quant_min = -8 if not is_asym_mode else 0 quant_max = 7 if not is_asym_mode else 15 dtype = torch.int8 @@ -438,7 +442,7 @@ def _get_torch_ao_qspec_from_nncf_config_for_wc( else torch.per_channel_affine ) else: - observer = INT8WeightObserver + observer = INT8WeightObserver # type: ignore[type-abstract] quant_min = -128 if not is_asym_mode else 0 quant_max = 127 if not is_asym_mode else 255 dtype = torch.int8 diff --git a/examples/models/llama/export_llama_lib.py b/examples/models/llama/export_llama_lib.py index 578fd0fea7b..d9c282888cc 100644 --- a/examples/models/llama/export_llama_lib.py +++ b/examples/models/llama/export_llama_lib.py @@ -43,10 +43,10 @@ ) from executorch.extension.llm.export.quantizer_lib import ( get_coreml_quantizer, + get_ov_quantizer, get_pt2e_quantization_params, get_pt2e_quantizers, get_qnn_quantizer, - get_ov_quantizer, get_vulkan_quantizer, ) from executorch.util.activation_memory_profiler import generate_memory_trace @@ -897,7 +897,9 @@ def _to_edge_and_lower_llama_openvino( for partitioner in partitioners: logging.info(f"--> {partitioner.__class__.__name__}") - builder = builder_exported.pt2e_quantize(quantizers).to_edge_transform_and_lower(partitioners) + builder = builder_exported.pt2e_quantize(quantizers).to_edge_transform_and_lower( + partitioners + ) if verbose: print_delegation_info(builder.edge_manager.exported_program().graph_module) diff --git a/extension/llm/export/quantizer_lib.py b/extension/llm/export/quantizer_lib.py index 83d4a84420d..df8c2a5e36c 100644 --- a/extension/llm/export/quantizer_lib.py +++ b/extension/llm/export/quantizer_lib.py @@ -220,20 +220,22 @@ def get_ov_quantizer( group_size: int = 32, ): try: - from executorch.backends.openvino.quantizer import OpenVINOQuantizer, QuantizationMode 
- import nncf - except ImportError: - raise ImportError( - "Please install nncf via backends/openvino/requirements.txt" + from executorch.backends.openvino.quantizer import ( + OpenVINOQuantizer, + QuantizationMode, ) - + except ImportError: + raise ImportError("Please install nncf via backends/openvino/requirements.txt") + backend, quant_config = pt2e_quantize.split("_") assert ( backend == "openvino" ), f"The quantization config is for backend {backend} instead of openvino." - assert group_size != None, "Group Size None is Not Supported. It should be set to -1 for per-channel." + assert ( + group_size + ), "Group Size None is Not Supported. It should be set to -1 for per-channel." - # (TODO) Manually ignore MP layers. This is done manually for now till we use the dynamic allocation MP + # (TODO) Manually ignore MP layers. This is done manually for now till we use the dynamic allocation MP fp_node_names = [ "linear_13", "linear_14", @@ -258,7 +260,8 @@ def get_ov_quantizer( "linear_106", "linear_109", "linear_110", - "linear_111",] + "linear_111", + ] if quant_config == "4wo": mode = QuantizationMode.INT4WO_ASYM From 0e10f28242129a3c332ccdbd7a3b9a4340a8e1a1 Mon Sep 17 00:00:00 2001 From: Aamir Nazir Date: Tue, 9 Sep 2025 21:52:23 +0400 Subject: [PATCH 54/85] revert _calculate_qparams back to calculate_qparams --- backends/openvino/quantizer/observers.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/backends/openvino/quantizer/observers.py b/backends/openvino/quantizer/observers.py index faeb4fa7a60..6cda4561604 100644 --- a/backends/openvino/quantizer/observers.py +++ b/backends/openvino/quantizer/observers.py @@ -58,7 +58,7 @@ def __init__( super().__init__(dtype=dtype, is_dynamic=False) self._wc_param = wc_param - def _calculate_qparams( # type: ignore[override] + def calculate_qparams( # type: ignore[override] self, weight: torch.Tensor, ) -> Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]: @@ -94,7 +94,7 @@ def convert( """ 
weight_node = observer_node.args[0] original_weight = get_tensor_constant_from_node(weight_node, model) - q_weight, scale, zero_point = self._calculate_qparams(original_weight) + q_weight, scale, zero_point = self.calculate_qparams(original_weight) decompressor = self._create_decompressor( scale, zero_point, q_weight, original_weight From 05f5a929c7c5b9a79859d9c9848ce37dd0c16b41 Mon Sep 17 00:00:00 2001 From: Aamir Nazir Date: Wed, 10 Sep 2025 18:49:08 +0400 Subject: [PATCH 55/85] remove manual ignored nodes --- extension/llm/export/quantizer_lib.py | 29 --------------------------- 1 file changed, 29 deletions(-) diff --git a/extension/llm/export/quantizer_lib.py b/extension/llm/export/quantizer_lib.py index df8c2a5e36c..870080a7549 100644 --- a/extension/llm/export/quantizer_lib.py +++ b/extension/llm/export/quantizer_lib.py @@ -235,34 +235,6 @@ def get_ov_quantizer( group_size ), "Group Size None is Not Supported. It should be set to -1 for per-channel." - # (TODO) Manually ignore MP layers. This is done manually for now till we use the dynamic allocation MP - fp_node_names = [ - "linear_13", - "linear_14", - "linear_35", - "linear_56", - "linear_70", - "linear_71", - "linear_77", - "linear_78", - "linear_84", - "linear_85", - "linear_88", - "linear_91", - "linear_92", - "linear_95", - "linear_96", - "linear_98", - "linear_99", - "linear_102", - "linear_103", - "linear_105", - "linear_106", - "linear_109", - "linear_110", - "linear_111", - ] - if quant_config == "4wo": mode = QuantizationMode.INT4WO_ASYM @@ -274,7 +246,6 @@ def get_ov_quantizer( f"No support for quant type {quant_config}. Support 8a4w, 8a8w only." 
) ov_quantizer = OpenVINOQuantizer(mode=mode, group_size=group_size) - ov_quantizer.set_ignored_scope(names=fp_node_names) return ov_quantizer From fbe0e21137ee9ebc8ea246e61fd9cfa252f57b15 Mon Sep 17 00:00:00 2001 From: Aamir Nazir Date: Wed, 10 Sep 2025 18:52:42 +0400 Subject: [PATCH 56/85] add ratio to quantizer initialization --- extension/llm/export/quantizer_lib.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/extension/llm/export/quantizer_lib.py b/extension/llm/export/quantizer_lib.py index 870080a7549..350e8b3ce7c 100644 --- a/extension/llm/export/quantizer_lib.py +++ b/extension/llm/export/quantizer_lib.py @@ -235,17 +235,23 @@ def get_ov_quantizer( group_size ), "Group Size None is Not Supported. It should be set to -1 for per-channel." + quantization_params = {} + if quant_config == "4wo": - mode = QuantizationMode.INT4WO_ASYM + quantization_params["mode"] = QuantizationMode.INT4WO_ASYM + quantization_params["group_size"] = group_size + quantization_params["ratio"] = 0.8 elif quant_config == "8wo": - group_size = -1 - mode = QuantizationMode.INT8WO_SYM + quantization_params["mode"] = QuantizationMode.INT8WO_ASYM + quantization_params["group_size"] = -1 + quantization_params["ratio"] = None + else: raise AssertionError( f"No support for quant type {quant_config}. Support 8a4w, 8a8w only." 
) - ov_quantizer = OpenVINOQuantizer(mode=mode, group_size=group_size) + ov_quantizer = OpenVINOQuantizer(**quantization_params) return ov_quantizer From 6bff1cdb00ebdae53b57ab706cab6e9e9ee7e335 Mon Sep 17 00:00:00 2001 From: Aamir Nazir Date: Thu, 11 Sep 2025 23:04:13 +0400 Subject: [PATCH 57/85] Update export_llama_lib.py --- examples/models/llama/export_llama_lib.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/models/llama/export_llama_lib.py b/examples/models/llama/export_llama_lib.py index d9c282888cc..cbbf169a085 100644 --- a/examples/models/llama/export_llama_lib.py +++ b/examples/models/llama/export_llama_lib.py @@ -768,7 +768,7 @@ def get_quantizer_and_quant_params(llm_config): if llm_config.backend.openvino.enabled and llm_config.quantization.pt2e_quantize: assert not quantizers, "Should not enable both xnnpack and openvino" group_size = llm_config.quantization.group_size - group_size = group_size if group_size else 32 + group_size = group_size if group_size else 128 ov_quantizer = get_ov_quantizer( llm_config.quantization.pt2e_quantize.value, group_size ) From d744ae95f3cf806278b12db346105e233a2daec5 Mon Sep 17 00:00:00 2001 From: Aamir Nazir Date: Thu, 11 Sep 2025 23:04:50 +0400 Subject: [PATCH 58/85] Update quantizer_lib.py --- extension/llm/export/quantizer_lib.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/extension/llm/export/quantizer_lib.py b/extension/llm/export/quantizer_lib.py index 350e8b3ce7c..f92c59cebd3 100644 --- a/extension/llm/export/quantizer_lib.py +++ b/extension/llm/export/quantizer_lib.py @@ -217,7 +217,7 @@ def get_qnn_quantizer( def get_ov_quantizer( pt2e_quantize: str, - group_size: int = 32, + group_size: int = 128, ): try: from executorch.backends.openvino.quantizer import ( From b874204d7d8eba9aa35dc8f9e55bd47bc0719cbb Mon Sep 17 00:00:00 2001 From: suryasidd Date: Thu, 11 Sep 2025 14:22:29 -0700 Subject: [PATCH 59/85] Updated NNCF commit id --- 
backends/openvino/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backends/openvino/requirements.txt b/backends/openvino/requirements.txt index 2ada445414c..519818d0aac 100644 --- a/backends/openvino/requirements.txt +++ b/backends/openvino/requirements.txt @@ -1,2 +1,2 @@ transformers -git+https://github.com/openvinotoolkit/nncf@5cb2b58#egg=nncf +git+https://github.com/openvinotoolkit/nncf@3d753ac#egg=nncf From 41ac36a8a513e2adbc3015d231f071b7530efae0 Mon Sep 17 00:00:00 2001 From: Cavus Mustafa Date: Thu, 11 Sep 2025 16:21:43 -0700 Subject: [PATCH 60/85] openvino llama export configuration - initial --- examples/openvino/llama/README.md | 11 ++++++++++ .../llama/llama3_2_ov_4wo_config.yaml | 20 +++++++++++++++++++ 2 files changed, 31 insertions(+) create mode 100644 examples/openvino/llama/README.md create mode 100644 examples/openvino/llama/llama3_2_ov_4wo_config.yaml diff --git a/examples/openvino/llama/README.md b/examples/openvino/llama/README.md new file mode 100644 index 00000000000..30644af3cde --- /dev/null +++ b/examples/openvino/llama/README.md @@ -0,0 +1,11 @@ + +LLAMA_CHECKPOINT=/consolidated.00.pth +LLAMA_PARAMS=/params.json +LLAMA_TOKENIZER=/tokenizer.model + +python -m extension.llm.export.export_llm \ + --config llama3_2_ov_4wo_config.yaml \ + +base.model_class="llama3_2" \ + +base.checkpoint="${LLAMA_CHECKPOINT:?}" \ + +base.params="${LLAMA_PARAMS:?}" \ + +base.tokenizer_path="${LLAMA_TOKENIZER:?}" \ diff --git a/examples/openvino/llama/llama3_2_ov_4wo_config.yaml b/examples/openvino/llama/llama3_2_ov_4wo_config.yaml new file mode 100644 index 00000000000..7f47f133216 --- /dev/null +++ b/examples/openvino/llama/llama3_2_ov_4wo_config.yaml @@ -0,0 +1,20 @@ +base: + metadata: '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' + +model: + use_kv_cache: True + dtype_override: fp32 + enable_dynamic_shape: False + +export: + output_dir: "../" + +quantization: + pt2e_quantize: "openvino_4wo" + +backend: + 
openvino: + enabled: True + +debug: + verbose: True From 08461ec1b54de22b279511669a862d20ecef0f5d Mon Sep 17 00:00:00 2001 From: Cavus Mustafa Date: Thu, 11 Sep 2025 16:32:20 -0700 Subject: [PATCH 61/85] updated ov llama config file --- .../{llama3_2_ov_4wo_config.yaml => llama3_2_ov_4wo.yaml} | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) rename examples/openvino/llama/{llama3_2_ov_4wo_config.yaml => llama3_2_ov_4wo.yaml} (90%) diff --git a/examples/openvino/llama/llama3_2_ov_4wo_config.yaml b/examples/openvino/llama/llama3_2_ov_4wo.yaml similarity index 90% rename from examples/openvino/llama/llama3_2_ov_4wo_config.yaml rename to examples/openvino/llama/llama3_2_ov_4wo.yaml index 7f47f133216..68a53708fb9 100644 --- a/examples/openvino/llama/llama3_2_ov_4wo_config.yaml +++ b/examples/openvino/llama/llama3_2_ov_4wo.yaml @@ -6,11 +6,9 @@ model: dtype_override: fp32 enable_dynamic_shape: False -export: - output_dir: "../" - quantization: pt2e_quantize: "openvino_4wo" + group_size: 128 backend: openvino: From be85af8b86b995b4879e4382cc00f00eb7584d16 Mon Sep 17 00:00:00 2001 From: Mustafa Cavus Date: Thu, 11 Sep 2025 16:14:11 -0700 Subject: [PATCH 62/85] Update README.md --- examples/openvino/llama/README.md | 40 +++++++++++++++++++++++++++---- 1 file changed, 35 insertions(+), 5 deletions(-) diff --git a/examples/openvino/llama/README.md b/examples/openvino/llama/README.md index 30644af3cde..e5571e3da79 100644 --- a/examples/openvino/llama/README.md +++ b/examples/openvino/llama/README.md @@ -1,11 +1,41 @@ -LLAMA_CHECKPOINT=/consolidated.00.pth -LLAMA_PARAMS=/params.json -LLAMA_TOKENIZER=/tokenizer.model +# Export Llama with OpenVINO Backend -python -m extension.llm.export.export_llm \ +## Download the Model +Follow the [instructions](../../examples/models/llama#step-2-prepare-model) to download the required model files. Export Llama with OpenVINO backend is only verified with Llama-3.2-1B variants at this time. 
+ +## Environment Setup +Follow the [instructions](../../backends/openvino/README.md) of **Prerequisites** and **Setup** in `backends/openvino/README.md` to set up the OpenVINO backend. + +## Export the model: +Navigate into `/examples/openvino/llama` and execute the commands below to export the model. Update the model file paths to match the location where your model is downloaded. + +``` +LLAMA_CHECKPOINT=/consolidated.00.pth +LLAMA_PARAMS=/params.json +LLAMA_TOKENIZER=/tokenizer.model + +python -m executorch.extension.llm.export.export_llm \ --config llama3_2_ov_4wo_config.yaml \ +base.model_class="llama3_2" \ +base.checkpoint="${LLAMA_CHECKPOINT:?}" \ +base.params="${LLAMA_PARAMS:?}" \ - +base.tokenizer_path="${LLAMA_TOKENIZER:?}" \ + +base.tokenizer_path="${LLAMA_TOKENIZER:?}" +``` + +## Build OpenVINO C++ Runtime with Llama Runner: +First, build the backend libraries by executing the script below in `/backends/openvino/scripts` folder: +```bash +./openvino_build.sh --cpp_runtime +``` +Then, build the llama runner by executing the script below (with `--llama_runner` argument) also in `/backends/openvino/scripts` folder: +```bash +./openvino_build.sh --llama_runner +``` +The executable is saved in `/cmake-out/examples/models/llama/llama_main` + +## Execute Inference Using Llama Runner +Update the model tokenizer file path to match the location where your model is downloaded and replace the prompt. 
+``` +./cmake-out/examples/models/llama/llama_main --model_path=llama3_2.pte --tokenizer_path=/tokenizer.model --prompt="Your custom prompt" +``` From 35f1d84b05b285f1cf041ac6e4c95b840e9631ca Mon Sep 17 00:00:00 2001 From: Mustafa Cavus Date: Thu, 11 Sep 2025 16:20:28 -0700 Subject: [PATCH 63/85] Update README.md --- examples/openvino/README.md | 53 +++---------------------------------- 1 file changed, 4 insertions(+), 49 deletions(-) diff --git a/examples/openvino/README.md b/examples/openvino/README.md index dbce5df1b55..0ecedde092c 100644 --- a/examples/openvino/README.md +++ b/examples/openvino/README.md @@ -9,7 +9,10 @@ Below is the layout of the `examples/openvino` directory, which includes the nec ``` examples/openvino ├── README.md # Documentation for examples (this file) -└── aot_optimize_and_infer.py # Example script to export and execute models +├── aot_optimize_and_infer.py # Example script to export and execute models +└── llama + ├── README.md # Documentation for Llama example + └── llama3_2_ov_4wo.yaml # Configuration file for exporting Llama3.2 with OpenVINO backend ``` # Build Instructions for Examples @@ -183,51 +186,3 @@ Run inference with a given model for 10 iterations: --model_path=model.pte \ --num_executions=10 ``` - -# Export Llama with OpenVINO Backend - -## Download the Model -Follow the [instructions](../../examples/models/llama#step-2-prepare-model) to download the required model files. Export Llama with OpenVINO backend is only verified with Llama-3.2-1B variants at this time. - -## Environment Setup -Follow the [instructions](../../backends/openvino/README.md) of **Prerequisites** and **Setup** in `backends/openvino/README.md` to set up the OpenVINO backend. - -## Export the model: -Execute the commands below to export the model. Update the model file paths to match the location where your model is downloaded. 
- -``` -LLAMA_CHECKPOINT=/consolidated.00.pth -LLAMA_PARAMS=/params.json -LLAMA_TOKENIZER=/tokenizer.model - -python -u -m examples.models.llama.export_llama \ - --model "llama3_2" \ - --checkpoint "${LLAMA_CHECKPOINT:?}" \ - --params "${LLAMA_PARAMS:?}" \ - -kv \ - --openvino \ - -d fp32 \ - --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' \ - --output_name="llama.pte" \ - --verbose \ - --disable_dynamic_shape \ - --tokenizer_path "${LLAMA_TOKENIZER:?}" \ - --nncf_compression -``` - -## Build OpenVINO C++ Runtime with Llama Runner: -First, build the backend libraries by executing the script below in `/backends/openvino/scripts` folder: -```bash -./openvino_build.sh -``` -Then, build the llama runner by executing the script below (with `--llama_runner` argument) also in `/backends/openvino/scripts` folder: -```bash -./openvino_build.sh --llama_runner -``` -The executable is saved in `/cmake-out/examples/models/llama/llama_main` - -## Execute Inference Using Llama Runner -Update the model tokenizer file path to match the location where your model is downloaded and replace the prompt. -``` -./cmake-out/examples/models/llama/llama_main --model_path=llama.pte --tokenizer_path=/tokenizer.model --prompt="Your custom prompt" -``` From 4426541d133b8d9c3148c06654b870f27b4123d0 Mon Sep 17 00:00:00 2001 From: Mustafa Cavus Date: Thu, 11 Sep 2025 16:25:34 -0700 Subject: [PATCH 64/85] Update README.md --- examples/openvino/llama/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/openvino/llama/README.md b/examples/openvino/llama/README.md index e5571e3da79..abb3f5179cb 100644 --- a/examples/openvino/llama/README.md +++ b/examples/openvino/llama/README.md @@ -8,7 +8,7 @@ Follow the [instructions](../../examples/models/llama#step-2-prepare-model) to d Follow the [instructions](../../backends/openvino/README.md) of **Prerequisites** and **Setup** in `backends/openvino/README.md` to set up the OpenVINO backend. 
## Export the model: -Navigate into `/examples/openvino/llama` and execute the commands below to export the model. Update the model file paths to match the location where your model is downloaded. +Navigate into `/examples/openvino/llama` and execute the commands below to export the model. Update the model file paths to match the location where your model is downloaded. The exported model will be generated in the same directory with the filename `llama3_2.pte`. ``` LLAMA_CHECKPOINT=/consolidated.00.pth @@ -37,5 +37,5 @@ The executable is saved in `/cmake-out/examples/models/llama/ll ## Execute Inference Using Llama Runner Update the model tokenizer file path to match the location where your model is downloaded and replace the prompt. ``` -./cmake-out/examples/models/llama/llama_main --model_path=llama3_2.pte --tokenizer_path=/tokenizer.model --prompt="Your custom prompt" +./cmake-out/examples/models/llama/llama_main --model_path=/examples/openvino/llama/llama3_2.pte --tokenizer_path=/tokenizer.model --prompt="Your custom prompt" ``` From 6b936c5ddf8ab6c356315fd67f293a331f1a4aaf Mon Sep 17 00:00:00 2001 From: Mustafa Cavus Date: Thu, 11 Sep 2025 16:26:51 -0700 Subject: [PATCH 65/85] Update README.md --- examples/openvino/llama/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/openvino/llama/README.md b/examples/openvino/llama/README.md index abb3f5179cb..4de20a0f061 100644 --- a/examples/openvino/llama/README.md +++ b/examples/openvino/llama/README.md @@ -16,7 +16,7 @@ LLAMA_PARAMS=/params.json LLAMA_TOKENIZER=/tokenizer.model python -m executorch.extension.llm.export.export_llm \ - --config llama3_2_ov_4wo_config.yaml \ + --config llama3_2_ov_4wo.yaml \ +base.model_class="llama3_2" \ +base.checkpoint="${LLAMA_CHECKPOINT:?}" \ +base.params="${LLAMA_PARAMS:?}" \ From bba4a01437ef5b1b6a6ddd7af5a406a9cc9842ca Mon Sep 17 00:00:00 2001 From: Mustafa Cavus Date: Thu, 11 Sep 2025 16:51:22 -0700 Subject: [PATCH 66/85] Update README.md --- 
examples/openvino/llama/README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/examples/openvino/llama/README.md b/examples/openvino/llama/README.md index 4de20a0f061..d357f038781 100644 --- a/examples/openvino/llama/README.md +++ b/examples/openvino/llama/README.md @@ -8,7 +8,7 @@ Follow the [instructions](../../examples/models/llama#step-2-prepare-model) to d Follow the [instructions](../../backends/openvino/README.md) of **Prerequisites** and **Setup** in `backends/openvino/README.md` to set up the OpenVINO backend. ## Export the model: -Navigate into `/examples/openvino/llama` and execute the commands below to export the model. Update the model file paths to match the location where your model is downloaded. The exported model will be generated in the same directory with the filename `llama3_2.pte`. +Navigate into `/examples/openvino/llama` and execute the commands below to export the model. Update the model file paths to match the location where your model is downloaded. Replace device with the target hardware you want to compile the model for (`CPU`, `GPU`, or `NPU`). The exported model will be generated in the same directory with the filename `llama3_2.pte`. 
``` LLAMA_CHECKPOINT=/consolidated.00.pth @@ -17,6 +17,7 @@ LLAMA_TOKENIZER=/tokenizer.model python -m executorch.extension.llm.export.export_llm \ --config llama3_2_ov_4wo.yaml \ + +backend.openvino.device="CPU" \ +base.model_class="llama3_2" \ +base.checkpoint="${LLAMA_CHECKPOINT:?}" \ +base.params="${LLAMA_PARAMS:?}" \ From 1421921da0a6b083c17c9fe85b5b5f8beebd7216 Mon Sep 17 00:00:00 2001 From: Aamir Nazir Date: Fri, 12 Sep 2025 13:05:24 +0400 Subject: [PATCH 67/85] Update README.md with quantization paragraph --- examples/openvino/llama/README.md | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/examples/openvino/llama/README.md b/examples/openvino/llama/README.md index d357f038781..7a97e27410c 100644 --- a/examples/openvino/llama/README.md +++ b/examples/openvino/llama/README.md @@ -24,6 +24,24 @@ python -m executorch.extension.llm.export.export_llm \ +base.tokenizer_path="${LLAMA_TOKENIZER:?}" ``` +### Compress Model Weights and Export +OpenVINO backend also offers Quantization support for llama models when exporting the model. The different quantization modes that are offered are INT4 groupwise & per-channel weights compression and INT8 per-channel weights compression. It can be achieved using the `--pt2e_quantize opevnino_4wo` flag. For modifying the group size `--group_size` can be used. By default group size 128 is used to achieve optimal performance with the NPU. 
+ +``` +LLAMA_CHECKPOINT=/consolidated.00.pth +LLAMA_PARAMS=/params.json +LLAMA_TOKENIZER=/tokenizer.model + +python -m executorch.extension.llm.export.export_llm \ + --config llama3_2_ov_4wo.yaml \ + +backend.openvino.device="CPU" \ + +base.model_class="llama3_2" \ + +pt2e_quantize opevnino_4wo \ + +base.checkpoint="${LLAMA_CHECKPOINT:?}" \ + +base.params="${LLAMA_PARAMS:?}" \ + +base.tokenizer_path="${LLAMA_TOKENIZER:?}" +``` + ## Build OpenVINO C++ Runtime with Llama Runner: First, build the backend libraries by executing the script below in `/backends/openvino/scripts` folder: ```bash From f050eeac96dd63c158afb526c1df1ac13beec0f6 Mon Sep 17 00:00:00 2001 From: Cavus Mustafa Date: Sun, 14 Sep 2025 20:39:13 -0700 Subject: [PATCH 68/85] formatting fix --- backends/openvino/quantizer/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backends/openvino/quantizer/__init__.py b/backends/openvino/quantizer/__init__.py index 0fd8c10b249..5aae52ef3e8 100644 --- a/backends/openvino/quantizer/__init__.py +++ b/backends/openvino/quantizer/__init__.py @@ -1,3 +1,3 @@ -from .quantizer import OpenVINOQuantizer, quantize_model, QuantizationMode +from .quantizer import OpenVINOQuantizer, QuantizationMode, quantize_model __all__ = ["OpenVINOQuantizer", "quantize_model", "QuantizationMode"] From 4bfdca9e95de0bbe41e3f0e8df8e4f1e8476d97f Mon Sep 17 00:00:00 2001 From: Mustafa Cavus Date: Sun, 14 Sep 2025 20:44:22 -0700 Subject: [PATCH 69/85] Update README.md --- examples/openvino/llama/README.md | 17 +---------------- 1 file changed, 1 insertion(+), 16 deletions(-) diff --git a/examples/openvino/llama/README.md b/examples/openvino/llama/README.md index 7a97e27410c..46dbfb8c2f0 100644 --- a/examples/openvino/llama/README.md +++ b/examples/openvino/llama/README.md @@ -25,22 +25,7 @@ python -m executorch.extension.llm.export.export_llm \ ``` ### Compress Model Weights and Export -OpenVINO backend also offers Quantization support for llama models when 
exporting the model. The different quantization modes that are offered are INT4 groupwise & per-channel weights compression and INT8 per-channel weights compression. It can be achieved using the `--pt2e_quantize opevnino_4wo` flag. For modifying the group size `--group_size` can be used. By default group size 128 is used to achieve optimal performance with the NPU. - -``` -LLAMA_CHECKPOINT=/consolidated.00.pth -LLAMA_PARAMS=/params.json -LLAMA_TOKENIZER=/tokenizer.model - -python -m executorch.extension.llm.export.export_llm \ - --config llama3_2_ov_4wo.yaml \ - +backend.openvino.device="CPU" \ - +base.model_class="llama3_2" \ - +pt2e_quantize opevnino_4wo \ - +base.checkpoint="${LLAMA_CHECKPOINT:?}" \ - +base.params="${LLAMA_PARAMS:?}" \ - +base.tokenizer_path="${LLAMA_TOKENIZER:?}" -``` +OpenVINO backend also offers Quantization support for llama models when exporting the model. The different quantization modes that are offered are INT4 groupwise & per-channel weights compression and INT8 per-channel weights compression. It can be achieved by setting `pt2e_quantize` option in `llama3_2_ov_4wo.yaml` file under `quantization`. Set this parameter to `openvino_4wo` for INT4 or `openvino_8wo` for INT8 weight compression. It is set to `openvino_4wo` in `llama3_2_ov_4wo.yaml` file by default. For modifying the group size, set `group_size` option in `llama3_2_ov_4wo.yaml` file under `quantization`. By default group size 128 is used to achieve optimal performance with the NPU. 
## Build OpenVINO C++ Runtime with Llama Runner: First, build the backend libraries by executing the script below in `/backends/openvino/scripts` folder: From 16aba1bb1bb52632829e5a84ef0dd15f0e01d464 Mon Sep 17 00:00:00 2001 From: Mustafa Cavus Date: Tue, 16 Sep 2025 10:21:59 -0700 Subject: [PATCH 70/85] Update non_cpu_backends.md for OpenVINO instructions --- examples/models/llama/non_cpu_backends.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/examples/models/llama/non_cpu_backends.md b/examples/models/llama/non_cpu_backends.md index f414582a3c1..6e5d0b63256 100644 --- a/examples/models/llama/non_cpu_backends.md +++ b/examples/models/llama/non_cpu_backends.md @@ -22,3 +22,6 @@ After exporting the CoreML model .pte file, please [follow the instruction to bu ### MTK Please [follow the instructions](https://github.com/pytorch/executorch/tree/main/examples/mediatek#llama-example-instructions) to deploy llama3 8b to an Android phones with MediaTek chip + +### OpenVINO +Please follow [the instructions](../../openvino/llama/README.md) to deploy Llama 3 1B to Intel CPUs, GPUs, and NPUs. From 155529f2a63bffeaa6539908dabda16e8d0e415f Mon Sep 17 00:00:00 2001 From: Mustafa Cavus Date: Tue, 16 Sep 2025 10:22:58 -0700 Subject: [PATCH 71/85] Update llama instructions link for OpenVINO backend --- examples/models/llama/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/models/llama/README.md b/examples/models/llama/README.md index 784142b61f1..aba3b255fee 100644 --- a/examples/models/llama/README.md +++ b/examples/models/llama/README.md @@ -136,7 +136,7 @@ Llama 3 8B performance was measured on the Samsung Galaxy S22, S24, and OnePlus

-[Please visit this section to try it on non-CPU backend, including CoreML, MPS, Qualcomm HTP or MediaTek](non_cpu_backends.md). +[Please visit this section to try it on non-CPU backend, including CoreML, MPS, Qualcomm HTP, MediaTek, or OpenVINO](non_cpu_backends.md). # Instructions From 5875aa8af0b07474b6d7d066164dc5a298b26d9a Mon Sep 17 00:00:00 2001 From: Mustafa Cavus Date: Tue, 16 Sep 2025 10:25:46 -0700 Subject: [PATCH 72/85] Remove OpenVINO from non_cpu_backends.md --- examples/models/llama/non_cpu_backends.md | 3 --- 1 file changed, 3 deletions(-) diff --git a/examples/models/llama/non_cpu_backends.md b/examples/models/llama/non_cpu_backends.md index 6e5d0b63256..f414582a3c1 100644 --- a/examples/models/llama/non_cpu_backends.md +++ b/examples/models/llama/non_cpu_backends.md @@ -22,6 +22,3 @@ After exporting the CoreML model .pte file, please [follow the instruction to bu ### MTK Please [follow the instructions](https://github.com/pytorch/executorch/tree/main/examples/mediatek#llama-example-instructions) to deploy llama3 8b to an Android phones with MediaTek chip - -### OpenVINO -Please follow [the instructions](../../openvino/llama/README.md) to deploy Llama 3 1B to Intel CPUs, GPUs, and NPUs. From 2630fd6c1db8f3e8eb5a840b34b96b48210c9362 Mon Sep 17 00:00:00 2001 From: Mustafa Cavus Date: Tue, 16 Sep 2025 11:03:51 -0700 Subject: [PATCH 73/85] Update llama instructions for OpenVINO backend --- examples/models/llama/README.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/examples/models/llama/README.md b/examples/models/llama/README.md index aba3b255fee..516f0073ef1 100644 --- a/examples/models/llama/README.md +++ b/examples/models/llama/README.md @@ -94,6 +94,8 @@ Llama 3.2 1B and 3B performance was measured on Android OnePlus 12 device. The p +[Please visit this section to try it on OpenVINO backend](../../openvino/llama/README.md). 
+ ## Llama 3/3.1 8B Since Llama 3 8B model needs at least 4-bit quantization to fit even within some of the highend phones, results presented here correspond to 4-bit groupwise post-training quantized (PTQ) model. @@ -136,7 +138,7 @@ Llama 3 8B performance was measured on the Samsung Galaxy S22, S24, and OnePlus

-[Please visit this section to try it on non-CPU backend, including CoreML, MPS, Qualcomm HTP, MediaTek, or OpenVINO](non_cpu_backends.md). +[Please visit this section to try it on non-CPU backend, including CoreML, MPS, Qualcomm HTP, or MediaTek](non_cpu_backends.md). # Instructions From 6d0cbc53a5143c0bf66333872fdecefbc66b60d0 Mon Sep 17 00:00:00 2001 From: Mustafa Cavus Date: Tue, 16 Sep 2025 11:11:17 -0700 Subject: [PATCH 74/85] Removed the comma which was added by mistake --- examples/models/llama/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/models/llama/README.md b/examples/models/llama/README.md index 516f0073ef1..d0e72234c54 100644 --- a/examples/models/llama/README.md +++ b/examples/models/llama/README.md @@ -138,7 +138,7 @@ Llama 3 8B performance was measured on the Samsung Galaxy S22, S24, and OnePlus

-[Please visit this section to try it on non-CPU backend, including CoreML, MPS, Qualcomm HTP, or MediaTek](non_cpu_backends.md). +[Please visit this section to try it on non-CPU backend, including CoreML, MPS, Qualcomm HTP or MediaTek](non_cpu_backends.md). # Instructions From 3fbefecb61e147114c2aabc02079e88fa6d7777f Mon Sep 17 00:00:00 2001 From: suryasidd Date: Tue, 16 Sep 2025 12:18:52 -0700 Subject: [PATCH 75/85] Added NPU in choices --- examples/models/llama/export_llama_lib.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/models/llama/export_llama_lib.py b/examples/models/llama/export_llama_lib.py index ed352c0997e..4f4ef2553aa 100644 --- a/examples/models/llama/export_llama_lib.py +++ b/examples/models/llama/export_llama_lib.py @@ -464,8 +464,8 @@ def build_args_parser() -> argparse.ArgumentParser: "--openvino_device", type=str, default="CPU", - choices=["CPU", "GPU"], - help="Specify the device for Openvino (CPU or GPU).", + choices=["CPU", "GPU", "NPU"], + help="Specify the device for Openvino (CPU, GPU or NPU).", ) parser.add_argument( From 12e51c72d6f184c1ee6902d6d8f895292a4d6d92 Mon Sep 17 00:00:00 2001 From: suryasidd Date: Tue, 16 Sep 2025 15:26:06 -0700 Subject: [PATCH 76/85] Fixed ref links --- examples/openvino/llama/README.md | 6 +++--- examples/openvino/llama/llama3_2_ov_4wo.yaml | 11 +++++++---- 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/examples/openvino/llama/README.md b/examples/openvino/llama/README.md index 46dbfb8c2f0..a98645b3918 100644 --- a/examples/openvino/llama/README.md +++ b/examples/openvino/llama/README.md @@ -2,13 +2,13 @@ # Export Llama with OpenVINO Backend ## Download the Model -Follow the [instructions](../../examples/models/llama#step-2-prepare-model) to download the required model files. Export Llama with OpenVINO backend is only verified with Llama-3.2-1B variants at this time. 
+Follow the [instructions](../../../examples/models/llama/README.md#step-2-prepare-model) to download the required model files. Export Llama with OpenVINO backend is only verified with Llama-3.2-1B variants at this time. ## Environment Setup -Follow the [instructions](../../backends/openvino/README.md) of **Prerequisites** and **Setup** in `backends/openvino/README.md` to set up the OpenVINO backend. +Follow the [instructions](../../../backends/openvino/README.md) of **Prerequisites** and **Setup** in `backends/openvino/README.md` to set up the OpenVINO backend. ## Export the model: -Navigate into `/examples/openvino/llama` and execute the commands below to export the model. Update the model file paths to match the location where your model is downloaded. Replace device with the target hardware you want to compile the model for (`CPU`, `GPU`, or `NPU`). The exported model will be generated in the same directory with the filename `llama3_2.pte`. +Navigate into `/examples/openvino/llama` and execute the commands below to export the model. Update the model file paths to match the location where your model is downloaded. Replace device with the target hardware you want to compile the model for (`CPU`, `GPU`, or `NPU`). The exported model will be generated in the same directory with the filename `llama3_2_ov.pte`. For modifying the output name, change `output_name` in `llama3_2_ov_4wo.yaml` file under `export`. 
``` LLAMA_CHECKPOINT=/consolidated.00.pth diff --git a/examples/openvino/llama/llama3_2_ov_4wo.yaml b/examples/openvino/llama/llama3_2_ov_4wo.yaml index 68a53708fb9..8fb1d7a1c09 100644 --- a/examples/openvino/llama/llama3_2_ov_4wo.yaml +++ b/examples/openvino/llama/llama3_2_ov_4wo.yaml @@ -2,17 +2,20 @@ base: metadata: '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' model: - use_kv_cache: True + use_kv_cache: true dtype_override: fp32 - enable_dynamic_shape: False + enable_dynamic_shape: false quantization: pt2e_quantize: "openvino_4wo" group_size: 128 +export: + output_name: "llama3_2_ov.pte" + backend: openvino: - enabled: True + enabled: true debug: - verbose: True + verbose: false From 72331f5d0feaea93cef7517fda0eba7942ac6dd2 Mon Sep 17 00:00:00 2001 From: suryasidd Date: Wed, 17 Sep 2025 13:16:49 -0700 Subject: [PATCH 77/85] Added Remove clone ops transformation to OpenVINO backend --- backends/openvino/preprocess.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/backends/openvino/preprocess.py b/backends/openvino/preprocess.py index c343f44a8b5..66d5ec97b0a 100644 --- a/backends/openvino/preprocess.py +++ b/backends/openvino/preprocess.py @@ -8,6 +8,7 @@ from typing import final, List +from executorch.backends.transforms.remove_clone_ops import RemoveCloneOpsTransform from executorch.exir.backend.backend_details import ( BackendDetails, ExportedProgram, @@ -36,6 +37,14 @@ def preprocess( Returns: PreprocessResult: The result of preprocessing, including the compiled model bytes. 
""" + # Apply RemoveCloneOpsTransform to eliminate unnecessary clone operations + remove_clone_transform = RemoveCloneOpsTransform() + transformed_result = remove_clone_transform(edge_program.graph_module) + + # Update the edge_program with the transformed graph + if transformed_result.graph_module is not None: + edge_program._graph_module = transformed_result.graph_module + input_names = edge_program.graph_signature.user_inputs args = [] for node in edge_program.graph.nodes: @@ -47,7 +56,9 @@ def preprocess( compile_options[spec.key] = spec.value.decode() compiled = openvino_compile( - edge_program.module(), *args, options=compile_options + edge_program.module(), + *args, + options=compile_options ) model_bytes = compiled.export_model() From 8016165619eee3777e2ef437e4b83de84b3582b6 Mon Sep 17 00:00:00 2001 From: suryasidd Date: Wed, 17 Sep 2025 13:28:50 -0700 Subject: [PATCH 78/85] Fixed variable names --- backends/openvino/preprocess.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/backends/openvino/preprocess.py b/backends/openvino/preprocess.py index 66d5ec97b0a..7fc9d61d68e 100644 --- a/backends/openvino/preprocess.py +++ b/backends/openvino/preprocess.py @@ -38,12 +38,11 @@ def preprocess( PreprocessResult: The result of preprocessing, including the compiled model bytes. 
""" # Apply RemoveCloneOpsTransform to eliminate unnecessary clone operations - remove_clone_transform = RemoveCloneOpsTransform() - transformed_result = remove_clone_transform(edge_program.graph_module) + transformed_ep = RemoveCloneOpsTransform()(edge_program.graph_module) # Update the edge_program with the transformed graph - if transformed_result.graph_module is not None: - edge_program._graph_module = transformed_result.graph_module + if transformed_ep.graph_module is not None: + edge_program._graph_module = transformed_ep.graph_module input_names = edge_program.graph_signature.user_inputs args = [] From f0d9fc72f504cb7e80ee34c02bca2e62977a1c9e Mon Sep 17 00:00:00 2001 From: Cavus Mustafa Date: Wed, 17 Sep 2025 15:30:48 -0700 Subject: [PATCH 79/85] Added extended support list for openvino backend --- backends/openvino/partitioner.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/backends/openvino/partitioner.py b/backends/openvino/partitioner.py index 20841d6730b..00107959412 100644 --- a/backends/openvino/partitioner.py +++ b/backends/openvino/partitioner.py @@ -34,6 +34,9 @@ def __init__(self): class OpenvinoOperatorsSupport(OperatorSupportBase): + extended_support_dict = { + "torch.ops.dim_order_ops._clone_dim_order.default": None, + } def __init__( self, @@ -77,7 +80,9 @@ def is_node_supported(self, _, node: torch.fx.Node) -> bool: if node.name in self._enabled_ops_by_name: return True - supported_ops = OperatorSupport(options)._support_dict + supported_ops = ( + OperatorSupport(options)._support_dict | self.extended_support_dict + ) if op_type == "getitem": return True From 9b41c28be3e266c10808ae07cc1cf1ff84112280 Mon Sep 17 00:00:00 2001 From: Cavus Mustafa Date: Wed, 17 Sep 2025 15:31:06 -0700 Subject: [PATCH 80/85] formating fix --- backends/openvino/preprocess.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/backends/openvino/preprocess.py b/backends/openvino/preprocess.py index 7fc9d61d68e..3ba693973e0 
100644 --- a/backends/openvino/preprocess.py +++ b/backends/openvino/preprocess.py @@ -55,9 +55,7 @@ def preprocess( compile_options[spec.key] = spec.value.decode() compiled = openvino_compile( - edge_program.module(), - *args, - options=compile_options + edge_program.module(), *args, options=compile_options ) model_bytes = compiled.export_model() From e7517263cdae812bf96941c6ececd73790f1c69a Mon Sep 17 00:00:00 2001 From: Cavus Mustafa Date: Wed, 17 Sep 2025 16:09:00 -0700 Subject: [PATCH 81/85] formatting fix --- backends/openvino/preprocess.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backends/openvino/preprocess.py b/backends/openvino/preprocess.py index 3ba693973e0..72c781c0fb3 100644 --- a/backends/openvino/preprocess.py +++ b/backends/openvino/preprocess.py @@ -41,7 +41,7 @@ def preprocess( transformed_ep = RemoveCloneOpsTransform()(edge_program.graph_module) # Update the edge_program with the transformed graph - if transformed_ep.graph_module is not None: + if transformed_ep and transformed_ep.graph_module: edge_program._graph_module = transformed_ep.graph_module input_names = edge_program.graph_signature.user_inputs From 8106204b8a4af557bc6d925b070d9202789c14b4 Mon Sep 17 00:00:00 2001 From: suryasidd Date: Tue, 30 Sep 2025 15:32:58 -0700 Subject: [PATCH 82/85] Added DimorderOpsRevertPass to Openvino backend --- backends/openvino/partitioner.py | 1 + backends/openvino/preprocess.py | 5 ++--- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/backends/openvino/partitioner.py b/backends/openvino/partitioner.py index 00107959412..0d407e33f6e 100644 --- a/backends/openvino/partitioner.py +++ b/backends/openvino/partitioner.py @@ -36,6 +36,7 @@ def __init__(self): class OpenvinoOperatorsSupport(OperatorSupportBase): extended_support_dict = { "torch.ops.dim_order_ops._clone_dim_order.default": None, + "torch.ops.dim_order_ops._to_dim_order_copy.default": None, } def __init__( diff --git a/backends/openvino/preprocess.py 
b/backends/openvino/preprocess.py index 72c781c0fb3..7d89e117dc6 100644 --- a/backends/openvino/preprocess.py +++ b/backends/openvino/preprocess.py @@ -8,7 +8,7 @@ from typing import final, List -from executorch.backends.transforms.remove_clone_ops import RemoveCloneOpsTransform +from executorch.exir.passes.memory_format_ops_pass import DimOrderOpsRevertPass from executorch.exir.backend.backend_details import ( BackendDetails, ExportedProgram, @@ -37,8 +37,7 @@ def preprocess( Returns: PreprocessResult: The result of preprocessing, including the compiled model bytes. """ - # Apply RemoveCloneOpsTransform to eliminate unnecessary clone operations - transformed_ep = RemoveCloneOpsTransform()(edge_program.graph_module) + transformed_ep = DimOrderOpsRevertPass()(edge_program.graph_module) # Update the edge_program with the transformed graph if transformed_ep and transformed_ep.graph_module: From eaf0e174f09e9cfa1584d8e77b8f06abf18b8e1b Mon Sep 17 00:00:00 2001 From: suryasidd Date: Wed, 1 Oct 2025 11:28:42 -0700 Subject: [PATCH 83/85] Fixed linter issues --- backends/openvino/preprocess.py | 3 ++- extension/llm/export/config/llm_config.py | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/backends/openvino/preprocess.py b/backends/openvino/preprocess.py index 7d89e117dc6..691115f6579 100644 --- a/backends/openvino/preprocess.py +++ b/backends/openvino/preprocess.py @@ -8,13 +8,14 @@ from typing import final, List -from executorch.exir.passes.memory_format_ops_pass import DimOrderOpsRevertPass from executorch.exir.backend.backend_details import ( BackendDetails, ExportedProgram, PreprocessResult, ) from executorch.exir.backend.compile_spec_schema import CompileSpec + +from executorch.exir.passes.memory_format_ops_pass import DimOrderOpsRevertPass from openvino.frontend.pytorch.torchdynamo.compile import ( # type: ignore[import-untyped] openvino_compile, ) diff --git a/extension/llm/export/config/llm_config.py b/extension/llm/export/config/llm_config.py 
index a176fa71dcc..0ac965b98cc 100644 --- a/extension/llm/export/config/llm_config.py +++ b/extension/llm/export/config/llm_config.py @@ -465,6 +465,7 @@ class OpenvinoConfig: nncf_compression: bool = False nncf_compression_group_size: int = 32 + @dataclass class TorchAOKernelsConfig: """ From 229bbd27dfb5a622b67377f66cb58fe5c3bc6d28 Mon Sep 17 00:00:00 2001 From: suryasidd Date: Tue, 7 Oct 2025 23:44:44 -0700 Subject: [PATCH 84/85] Use default runner for OpenVINO backend as well --- backends/openvino/CMakeLists.txt | 24 --------------------- backends/openvino/README.md | 2 +- backends/openvino/scripts/openvino_build.sh | 3 ++- docs/source/build-run-openvino.md | 4 ++-- examples/openvino/README.md | 8 +++---- 5 files changed, 9 insertions(+), 32 deletions(-) diff --git a/backends/openvino/CMakeLists.txt b/backends/openvino/CMakeLists.txt index f5b957da881..736ed6d8603 100644 --- a/backends/openvino/CMakeLists.txt +++ b/backends/openvino/CMakeLists.txt @@ -53,30 +53,6 @@ target_sources( executorch_target_link_options_shared_lib(openvino_backend) -if(EXECUTORCH_BUILD_OPENVINO_EXECUTOR_RUNNER) - # Build executor runner binary for openvino backend - list(APPEND openvino_executor_runner_libs openvino_backend executorch) - - set(_openvino_executor_runner__srcs - ${EXECUTORCH_ROOT}/examples/portable/executor_runner/executor_runner.cpp - ${EXECUTORCH_ROOT}/extension/data_loader/file_data_loader.cpp - ${EXECUTORCH_ROOT}/extension/evalue_util/print_evalue.cpp - ${EXECUTORCH_ROOT}/extension/runner_util/inputs.cpp - ${EXECUTORCH_ROOT}/extension/runner_util/inputs_portable.cpp - ) - add_executable(openvino_executor_runner ${_openvino_executor_runner__srcs}) - - list(APPEND openvino_executor_runner_libs) - - target_link_libraries( - openvino_executor_runner gflags portable_ops_lib - ${openvino_executor_runner_libs} - ) - target_compile_options( - openvino_executor_runner PUBLIC ${_common_compile_options} - ) -endif() - # Install OpenVINO backend library to the lib directory
install( TARGETS openvino_backend diff --git a/backends/openvino/README.md b/backends/openvino/README.md index 0046ad23486..5ce38ade56f 100644 --- a/backends/openvino/README.md +++ b/backends/openvino/README.md @@ -105,7 +105,7 @@ Follow the steps below to setup your build environment: ```bash ./openvino_build.sh --enable_python ``` - **Build C++ Runtime Libraries for OpenVINO Backend**: Run the `openvino_build.sh` script with the `--cpp_runtime` flag to build the C++ runtime libraries as shown in the below command. The compiled libraries files and binaries can be found in the `/cmake-out` directory. The binary located at `/cmake-out/backends/openvino/openvino_executor_runner` can be used to run inference with vision models. + **Build C++ Runtime Libraries for OpenVINO Backend**: Run the `openvino_build.sh` script with the `--cpp_runtime` flag to build the C++ runtime libraries as shown in the below command. The compiled libraries files and binaries can be found in the `/cmake-out` directory. The binary located at `/cmake-out/executor_runner` can be used to run inference with vision models. 
```bash ./openvino_build.sh --cpp_runtime ``` diff --git a/backends/openvino/scripts/openvino_build.sh b/backends/openvino/scripts/openvino_build.sh index b7e5f5270ab..6d7853b96e5 100755 --- a/backends/openvino/scripts/openvino_build.sh +++ b/backends/openvino/scripts/openvino_build.sh @@ -30,10 +30,11 @@ build_cpp_runtime() { -DEXECUTORCH_BUILD_OPENVINO=ON \ -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ + -DEXECUTORCH_BUILD_EXTENSION_NAMED_DATA_MAP=ON \ -DEXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL=ON \ -DEXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR=ON \ -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ - -DEXECUTORCH_BUILD_OPENVINO_EXECUTOR_RUNNER=ON \ + -DEXECUTORCH_BUILD_EXECUTOR_RUNNER=ON \ -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \ -DEXECUTORCH_BUILD_EXTENSION_LLM=ON \ -DEXECUTORCH_BUILD_EXTENSION_LLM_RUNNER=ON \ diff --git a/docs/source/build-run-openvino.md b/docs/source/build-run-openvino.md index dc6f098850f..12aa5df130d 100644 --- a/docs/source/build-run-openvino.md +++ b/docs/source/build-run-openvino.md @@ -92,7 +92,7 @@ The exported model will be saved as 'resnet50.pte' in the current directory. ### Build C++ OpenVINO Examples -After building the OpenVINO backend following the [instructions](#setup) above, the executable will be saved in `/cmake-out/backends/openvino/`. +After building the OpenVINO backend following the [instructions](#setup) above, the executable will be saved in `/cmake-out/`. The executable requires a model file (`.pte` file generated in the aot step) and the number of inference executions. 
@@ -101,7 +101,7 @@ The executable requires a model file (`.pte` file generated in the aot step) and Run inference with a given model for 10 executions: ``` -./openvino_executor_runner \ +./executor_runner \ --model_path=model.pte \ --num_executions=10 ``` diff --git a/examples/openvino/README.md b/examples/openvino/README.md index 0ecedde092c..83e3daf6849 100644 --- a/examples/openvino/README.md +++ b/examples/openvino/README.md @@ -157,7 +157,7 @@ Build the backend libraries and executor runner by executing the script below in ```bash ./openvino_build.sh ``` -The executable is saved in `/cmake-out/backends/openvino/` +The executable is saved in `/cmake-out/` ### Run the Example with Executor Runner @@ -166,9 +166,9 @@ Now, run the example using the executable generated in the above step. The execu #### Command Syntax: ``` -cd ../../cmake-out/backends/openvino +cd ../../cmake-out -./openvino_executor_runner \ +./executor_runner \ --model_path= \ --num_executions= ``` @@ -182,7 +182,7 @@ cd ../../cmake-out/backends/openvino Run inference with a given model for 10 iterations: ``` -./openvino_executor_runner \ +./executor_runner \ --model_path=model.pte \ --num_executions=10 ``` From 1428d81247f77e5b68d8c9dbfd20b7151f994751 Mon Sep 17 00:00:00 2001 From: suryasidd Date: Mon, 13 Oct 2025 13:37:37 -0700 Subject: [PATCH 85/85] Changed quantization scheme --- extension/llm/export/quantizer_lib.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/extension/llm/export/quantizer_lib.py b/extension/llm/export/quantizer_lib.py index f92c59cebd3..592a6666dfa 100644 --- a/extension/llm/export/quantizer_lib.py +++ b/extension/llm/export/quantizer_lib.py @@ -238,9 +238,9 @@ def get_ov_quantizer( quantization_params = {} if quant_config == "4wo": - quantization_params["mode"] = QuantizationMode.INT4WO_ASYM + quantization_params["mode"] = QuantizationMode.INT4WO_SYM quantization_params["group_size"] = group_size - quantization_params["ratio"] = 0.8 + 
quantization_params["ratio"] = 1 elif quant_config == "8wo": quantization_params["mode"] = QuantizationMode.INT8WO_ASYM