From fb3a2e0ed7b9b40271cf441a237a00dee47bf952 Mon Sep 17 00:00:00 2001 From: irenab Date: Sun, 2 Mar 2025 16:57:53 +0200 Subject: [PATCH 01/12] move virtual graph creation and search method call inside MP manager --- .../core/common/graph/base_graph.py | 18 +++- .../core/common/graph/base_node.py | 1 + .../mixed_precision_search_facade.py | 43 +++------ .../mixed_precision_search_manager.py | 89 ++++++++++++------- .../search_methods/linear_programming.py | 17 ++-- .../solution_refinement_procedure.py | 2 +- .../weights_mixed_precision_tests.py | 12 +-- 7 files changed, 93 insertions(+), 89 deletions(-) diff --git a/model_compression_toolkit/core/common/graph/base_graph.py b/model_compression_toolkit/core/common/graph/base_graph.py index cb54aac0e..7914559e6 100644 --- a/model_compression_toolkit/core/common/graph/base_graph.py +++ b/model_compression_toolkit/core/common/graph/base_graph.py @@ -706,14 +706,24 @@ def update_fused_nodes(self, fusion: List[Any]): """ self.fused_nodes.append(fusion) - def is_single_activation_cfg(self): + def has_any_configurable_activation(self) -> bool: """ - Checks whether all nodes in the graph that have activation quantization are quantized with the same bit-width. + Checks whether any node in the graph has a configurable activation quantization. - Returns: True if all quantization config candidates of all nodes have the same activation quantization bit-width. + Returns: + Whether any node in the graph has a configurable activation quantization. + """ + return any([n.has_configurable_activation() for n in self.nodes]) + + def has_any_configurable_weights(self): + """ + Checks whether any node in the graph has any configurable weights quantization. + Returns: + Whether any node in the graph has any configurable weights quantization. """ - return all([n.is_all_activation_candidates_equal() for n in self.nodes]) + + return any([n.has_any_configurable_weight() for n in self.nodes]) def replace_node(self, node_to_replace: BaseNode, new_node: BaseNode): """ diff --git a/model_compression_toolkit/core/common/graph/base_node.py b/model_compression_toolkit/core/common/graph/base_node.py index 1dfd1e533..d867fe578 100644 --- a/model_compression_toolkit/core/common/graph/base_node.py +++ b/model_compression_toolkit/core/common/graph/base_node.py @@ -170,6 +170,7 @@ def is_configurable_weight(self, attr_name: str) -> bool: def has_any_configurable_weight(self) -> bool: """ Check whether any of the node's weights is configurable. + Returns: Whether any of the node's weights is configurable. """ diff --git a/model_compression_toolkit/core/common/mixed_precision/mixed_precision_search_facade.py b/model_compression_toolkit/core/common/mixed_precision/mixed_precision_search_facade.py index 9a473cad0..41c1fdb32 100644 --- a/model_compression_toolkit/core/common/mixed_precision/mixed_precision_search_facade.py +++ b/model_compression_toolkit/core/common/mixed_precision/mixed_precision_search_facade.py @@ -34,16 +34,10 @@ class BitWidthSearchMethod(Enum): - # When adding a new search_methods MP configuration method, these enum and factory dictionary - # should be updated with it's kind and a search_method implementation. INTEGER_PROGRAMMING = 0 -search_methods = { - BitWidthSearchMethod.INTEGER_PROGRAMMING: mp_integer_programming_search} - - -def search_bit_width(graph_to_search_cfg: Graph, +def search_bit_width(graph: Graph, fw_info: FrameworkInfo, fw_impl: FrameworkImplementation, target_resource_utilization: ResourceUtilization, @@ -60,7 +54,7 @@ def search_bit_width(graph_to_search_cfg: Graph, target_resource_utilization have to be passed. If it was not passed, the facade is not supposed to get here by now. Args: - graph_to_search_cfg: Graph to search a MP configuration for. + graph: Graph to search a MP configuration for. fw_info: FrameworkInfo object about the specific framework (e.g., attributes of different layers' weights to quantize). fw_impl: FrameworkImplementation object with specific framework methods implementation. target_resource_utilization: Target Resource Utilization to bound our feasible solution space s.t the configuration does not violate it. @@ -75,51 +69,34 @@ def search_bit_width(graph_to_search_cfg: Graph, bit-width index on the node). """ - - # target_resource_utilization have to be passed. If it was not passed, the facade is not supposed to get here by now. - if target_resource_utilization is None: - Logger.critical("Target ResourceUtilization is required for the bit-width search method's configuration.") # pragma: no cover - - # Set graph for MP search - graph = copy.deepcopy(graph_to_search_cfg) # Copy graph before searching - if target_resource_utilization.bops_restricted(): - # TODO: we only need the virtual graph is both activations and weights are configurable - # Since Bit-operations count target resource utilization is set, we need to reconstruct the graph for the MP search - graph = substitute(graph, fw_impl.get_substitutions_virtual_weights_activation_coupling()) - # If we only run weights compression with MP than no need to consider activation quantization when computing the # MP metric (it adds noise to the computation) tru = target_resource_utilization weight_only_restricted = tru.weight_restricted() and not (tru.activation_restricted() or tru.total_mem_restricted() or tru.bops_restricted()) - disable_activation_for_metric = weight_only_restricted or graph_to_search_cfg.is_single_activation_cfg() + disable_activation_for_metric = weight_only_restricted or not graph.has_any_configurable_activation() # Set Sensitivity Evaluator for MP search. It should always work with the original MP graph, # even if a virtual graph was created (and is used only for BOPS utilization computation purposes) se = fw_impl.get_sensitivity_evaluator( - graph_to_search_cfg, + graph, mp_config, representative_data_gen=representative_data_gen, fw_info=fw_info, disable_activation_for_metric=disable_activation_for_metric, hessian_info_service=hessian_info_service) - # Instantiate a manager object + if search_method != BitWidthSearchMethod.INTEGER_PROGRAMMING: + raise NotImplementedError() + + # Search manager and LP are highly coupled, so LP search method was moved inside search manager. search_manager = MixedPrecisionSearchManager(graph, fw_info, fw_impl, se, - target_resource_utilization, - original_graph=graph_to_search_cfg) - - if search_method not in search_methods: - raise NotImplementedError() # pragma: no cover - - search_method_fn = search_methods[search_method] - # Search for the desired mixed-precision configuration - result_bit_cfg = search_method_fn(search_manager, - target_resource_utilization) + target_resource_utilization) + result_bit_cfg = search_manager.search() if mp_config.refine_mp_solution: result_bit_cfg = greedy_solution_refinement_procedure(result_bit_cfg, search_manager, target_resource_utilization) diff --git a/model_compression_toolkit/core/common/mixed_precision/mixed_precision_search_manager.py b/model_compression_toolkit/core/common/mixed_precision/mixed_precision_search_manager.py index 862896197..333a2a71e 100644 --- a/model_compression_toolkit/core/common/mixed_precision/mixed_precision_search_manager.py +++ b/model_compression_toolkit/core/common/mixed_precision/mixed_precision_search_manager.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== +import copy from typing import Callable, Dict, List @@ -30,6 +31,7 @@ from model_compression_toolkit.core.common.mixed_precision.mixed_precision_ru_helper import \ MixedPrecisionRUHelper from model_compression_toolkit.core.common.mixed_precision.sensitivity_evaluation import SensitivityEvaluation +from model_compression_toolkit.core.common.substitutions.apply_substitutions import substitute from model_compression_toolkit.logger import Logger @@ -43,8 +45,7 @@ def __init__(self, fw_info: FrameworkInfo, fw_impl: FrameworkImplementation, sensitivity_evaluator: SensitivityEvaluation, - target_resource_utilization: ResourceUtilization, - original_graph: Graph = None): + target_resource_utilization: ResourceUtilization): """ Args: @@ -54,18 +55,21 @@ def __init__(self, sensitivity_evaluator: A SensitivityEvaluation which provides a function that evaluates the sensitivity of a bit-width configuration for the MP model. target_resource_utilization: Target Resource Utilization to bound our feasible solution space s.t the configuration does not violate it. - original_graph: In case we have a search over a virtual graph (if we have BOPS utilization target), then this argument - will contain the original graph (for config reconstruction purposes). """ - self.graph = graph - self.original_graph = graph if original_graph is None else original_graph self.fw_info = fw_info self.fw_impl = fw_impl + + self.original_graph = graph + # graph for mp search + self.mp_graph, self.using_virtual_graph = self._get_mp_graph(graph, target_resource_utilization) + self.sensitivity_evaluator = sensitivity_evaluator + self.compute_metric_fn = sensitivity_evaluator.compute_metric + self.target_resource_utilization = target_resource_utilization + + self.mp_topo_configurable_nodes = self.mp_graph.get_configurable_sorted_nodes(fw_info) self.layer_to_bitwidth_mapping = self.get_search_space() - self.compute_metric_fn = self.get_sensitivity_metric() - self._cuts = None # To define RU Total constraints we need to compute weights and activations even if they have no constraints # TODO currently this logic is duplicated in linear_programming.py @@ -74,16 +78,53 @@ def __init__(self, targets = targets.union({RUTarget.ACTIVATION, RUTarget.WEIGHTS}) - {RUTarget.TOTAL} self.ru_targets_to_compute = targets - self.ru_helper = MixedPrecisionRUHelper(graph, fw_info, fw_impl) - self.target_resource_utilization = target_resource_utilization - self.min_ru_config = self.graph.get_min_candidates_config(fw_info) - self.max_ru_config = self.graph.get_max_candidates_config(fw_info) + self.ru_helper = MixedPrecisionRUHelper(self.mp_graph, fw_info, fw_impl) + + self.min_ru_config = self.mp_graph.get_min_candidates_config(fw_info) + self.max_ru_config = self.mp_graph.get_max_candidates_config(fw_info) self.min_ru = self.ru_helper.compute_utilization(self.ru_targets_to_compute, self.min_ru_config) self.non_conf_ru_dict = self.ru_helper.compute_utilization(self.ru_targets_to_compute, None) - self.config_reconstruction_helper = ConfigReconstructionHelper(virtual_graph=self.graph, + self.config_reconstruction_helper = ConfigReconstructionHelper(virtual_graph=self.mp_graph, original_graph=self.original_graph) + def search(self): + """ + Run mixed precision search. + + Returns: + Indices of the selected bit-widths candidates. + """ + # import here to prevent circular dependency + from model_compression_toolkit.core.common.mixed_precision.search_methods.linear_programming import \ + mp_integer_programming_search + config = mp_integer_programming_search(self, self.target_resource_utilization) + if self.mp_graph is self.original_graph: + return config + + return self.config_reconstruction_helper.reconstruct_config_from_virtual_graph(config) + + def _get_mp_graph(self, graph, target_resource_utilization): + """ + Get graph for mixed precision search. Virtual graph is built if bops is restricted and both activation and + weights are configurable. + + Args: + graph: input graph. + target_resource_utilization: target resource utilization. + + Returns: + Graph for mixed precision search (virtual or original). + """ + if (target_resource_utilization.bops_restricted() and + graph.has_any_configurable_activation() and + graph.has_any_configurable_weights()): + mp_graph = substitute(copy.deepcopy(graph), + self.fw_impl.get_substitutions_virtual_weights_activation_coupling()) + return mp_graph, True + + return graph, False + def get_search_space(self) -> Dict[int, List[int]]: """ The search space is a mapping from a node's index to a list of integers (possible bitwidths candidates indeces @@ -94,26 +135,12 @@ def get_search_space(self) -> Dict[int, List[int]]: """ indices_mapping = {} - nodes_to_configure = self.graph.get_configurable_sorted_nodes(self.fw_info) - for idx, n in enumerate(nodes_to_configure): + for idx, n in enumerate(self.mp_topo_configurable_nodes): # For each node, get all possible bitwidth indices for it # (which is a list from 0 to the length of the candidates mp_config list of the node). indices_mapping[idx] = list(range(len(n.candidates_quantization_cfg))) # all search_methods space return indices_mapping - def get_sensitivity_metric(self) -> Callable: - """ - - Returns: Return a function (from the framework implementation) to compute a metric that - indicates the similarity of the mixed-precision model (to the float model) for a given - mixed-precision configuration. - - """ - # Get from the framework an evaluation function on how a MP configuration, - # affects the expected loss. - - return self.sensitivity_evaluator.compute_metric - def compute_resource_utilization_matrix(self, target: RUTarget) -> np.ndarray: """ Computes and builds a resource utilization matrix, to be used for the mixed-precision search problem formalization. @@ -126,12 +153,8 @@ def compute_resource_utilization_matrix(self, target: RUTarget) -> np.ndarray: A resource utilization matrix of shape (num configurations, num memory elements). Num memory elements depends on the target, e.g. num nodes or num cuts, for which utilization is computed. """ - assert isinstance(target, RUTarget), f"{target} is not a valid resource target" - - configurable_sorted_nodes = self.graph.get_configurable_sorted_nodes(self.fw_info) - ru_matrix = [] - for c, c_n in enumerate(configurable_sorted_nodes): + for c, c_n in enumerate(self.mp_topo_configurable_nodes): for candidate_idx in range(len(c_n.candidates_quantization_cfg)): if candidate_idx == self.min_ru_config[c]: candidate_rus = self.min_ru[target] diff --git a/model_compression_toolkit/core/common/mixed_precision/search_methods/linear_programming.py b/model_compression_toolkit/core/common/mixed_precision/search_methods/linear_programming.py index 1a3b2102c..e6b772f68 100644 --- a/model_compression_toolkit/core/common/mixed_precision/search_methods/linear_programming.py +++ b/model_compression_toolkit/core/common/mixed_precision/search_methods/linear_programming.py @@ -27,7 +27,7 @@ def mp_integer_programming_search(search_manager: MixedPrecisionSearchManager, - target_resource_utilization: ResourceUtilization = None) -> np.ndarray: + target_resource_utilization: ResourceUtilization) -> List[int]: """ Searching and returning a mixed-precision configuration using an ILP optimization solution. It first builds a mapping from each layer's index (in the model) to a dictionary that maps the @@ -44,17 +44,13 @@ def mp_integer_programming_search(search_manager: MixedPrecisionSearchManager, consumption). Returns: - The mixed-precision configuration (1-D array of indices. Each indicates the bitwidth index of a node). + The mixed-precision configuration (A list of indices. Each indicates the bitwidth index of a node). """ # Build a mapping from each layer's index (in the model) to a dictionary that maps the # bitwidth index to the observed sensitivity of the model when using that bitwidth for that layer. - if target_resource_utilization is None or search_manager is None: - Logger.critical("Invalid parameters: 'target_resource_utilization' and 'search_manager' must not be 'None' " - "for mixed-precision search. Ensure valid inputs are provided.") - layer_to_metrics_mapping = _build_layer_to_metrics_mapping(search_manager, target_resource_utilization) # Init variables to find their values when solving the lp problem. @@ -82,10 +78,7 @@ def mp_integer_programming_search(search_manager: MixedPrecisionSearchManager, in layer_to_indicator_vars_mapping.values()] ).flatten() - if target_resource_utilization.bops_restricted(): - return search_manager.config_reconstruction_helper.reconstruct_config_from_virtual_graph(config) - else: - return config + return config.tolist() def _init_problem_vars(layer_to_metrics_mapping: Dict[int, Dict[int, float]]) -> Tuple[ @@ -289,7 +282,7 @@ def _build_layer_to_metrics_mapping(search_manager: MixedPrecisionSearchManager, Logger.info('Starting to evaluate metrics') layer_to_metrics_mapping = {} - if target_resource_utilization.bops_restricted(): + if search_manager.using_virtual_graph: origin_max_config = search_manager.config_reconstruction_helper.reconstruct_config_from_virtual_graph(search_manager.max_ru_config) max_config_value = search_manager.compute_metric_fn(origin_max_config) else: @@ -310,7 +303,7 @@ def _build_layer_to_metrics_mapping(search_manager: MixedPrecisionSearchManager, mp_model_configuration[node_idx] = bitwidth_idx # Build a distance matrix using the function we got from the framework implementation. - if target_resource_utilization.bops_restricted(): + if search_manager.using_virtual_graph: # Reconstructing original graph's configuration from virtual graph's configuration origin_mp_model_configuration = \ search_manager.config_reconstruction_helper.reconstruct_config_from_virtual_graph( diff --git a/model_compression_toolkit/core/common/mixed_precision/solution_refinement_procedure.py b/model_compression_toolkit/core/common/mixed_precision/solution_refinement_procedure.py index 16c49ad53..7277d662e 100644 --- a/model_compression_toolkit/core/common/mixed_precision/solution_refinement_procedure.py +++ b/model_compression_toolkit/core/common/mixed_precision/solution_refinement_procedure.py @@ -63,7 +63,7 @@ def greedy_solution_refinement_procedure(mp_solution: List[int], # layer has max config in the given solution, nothing to optimize continue - current_node = search_manager.graph.get_configurable_sorted_nodes(search_manager.fw_info)[node_idx] + current_node = search_manager.mp_topo_configurable_nodes[node_idx] node_candidates = current_node.candidates_quantization_cfg # only weights kernel attribute is quantized with weights mixed precision diff --git a/tests/keras_tests/feature_networks_tests/feature_networks/weights_mixed_precision_tests.py b/tests/keras_tests/feature_networks_tests/feature_networks/weights_mixed_precision_tests.py index 252a38d9a..5e4c2401a 100644 --- a/tests/keras_tests/feature_networks_tests/feature_networks/weights_mixed_precision_tests.py +++ b/tests/keras_tests/feature_networks_tests/feature_networks/weights_mixed_precision_tests.py @@ -259,7 +259,7 @@ def get_resource_utilization(self): def _compare(self, quantized_model, float_model, input_x=None, quantization_info=None): conv_layers = get_layers_from_model_by_type(quantized_model, layers.Conv2D) - assert (quantization_info.mixed_precision_cfg == [1, 1]).all() + assert quantization_info.mixed_precision_cfg == [1, 1] for i in range(32): # quantized per channel self.unit_test.assertTrue( np.unique(conv_layers[0].get_quantized_weights()['kernel'][:, :, :, i]).flatten().shape[0] <= 16) @@ -300,7 +300,7 @@ def create_networks(self): def _compare(self, quantized_model, float_model, input_x=None, quantization_info=None): conv_layers = get_layers_from_model_by_type(quantized_model, layers.Conv2D) - self.unit_test.assertTrue((quantization_info.mixed_precision_cfg != 0).any()) + self.unit_test.assertTrue(any(i for i in quantization_info.mixed_precision_cfg)) for i in range(32): # quantized per channel self.unit_test.assertTrue( @@ -325,7 +325,7 @@ def get_resource_utilization(self): def _compare(self, quantized_model, float_model, input_x=None, quantization_info=None): conv_layers = get_layers_from_model_by_type(quantized_model, layers.Conv2D) - assert (quantization_info.mixed_precision_cfg == [2, 2]).all() + assert quantization_info.mixed_precision_cfg == [2, 2] for i in range(32): # quantized per channel self.unit_test.assertTrue( np.unique(conv_layers[0].get_quantized_weights()['kernel'][:, :, :, i]).flatten().shape[0] <= 4) @@ -443,7 +443,7 @@ def get_resource_utilization(self): def _compare(self, quantized_model, float_model, input_x=None, quantization_info=None): conv_layers = get_layers_from_model_by_type(quantized_model, layers.Conv2D) - assert (quantization_info.mixed_precision_cfg == [0, 1]).all() + assert quantization_info.mixed_precision_cfg == [0, 1] for i in range(32): # quantized per channel self.unit_test.assertTrue( np.unique(conv_layers[0].get_quantized_weights()['kernel'][:, :, :, i]).flatten().shape[0] <= 256) @@ -466,8 +466,8 @@ def get_resource_utilization(self): def _compare(self, quantized_model, float_model, input_x=None, quantization_info=None): conv_layers = get_layers_from_model_by_type(quantized_model, layers.Conv2D) - assert any([(quantization_info.mixed_precision_cfg == [1, 0]).all(), - (quantization_info.mixed_precision_cfg == [0, 1]).all()]) + assert any([quantization_info.mixed_precision_cfg == [1, 0], + quantization_info.mixed_precision_cfg == [0, 1]]) for i in range(32): # quantized per channel self.unit_test.assertTrue( np.unique(conv_layers[0].get_quantized_weights()['kernel'][:, :, :, i]).flatten().shape[0] <= 256) From d663f28256d248040102c8c25fd48d9134ee4632 Mon Sep 17 00:00:00 2001 From: irenab Date: Sun, 2 Mar 2025 19:40:17 +0200 Subject: [PATCH 02/12] fix test --- .../feature_networks_tests/feature_networks/qat/qat_test.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/keras_tests/feature_networks_tests/feature_networks/qat/qat_test.py b/tests/keras_tests/feature_networks_tests/feature_networks/qat/qat_test.py index d9bbb6b8b..d3e343fe6 100644 --- a/tests/keras_tests/feature_networks_tests/feature_networks/qat/qat_test.py +++ b/tests/keras_tests/feature_networks_tests/feature_networks/qat/qat_test.py @@ -306,9 +306,9 @@ def run_test(self, **kwargs): self.compare(qat_ready_model, quantization_info) - def compare(self, qat_ready_model, quantization_info): + def _compare(self, qat_ready_model, quantization_info): - self.unit_test.assertTrue(all(quantization_info.mixed_precision_cfg == self.expected_mp_cfg)) + self.unit_test.assertTrue(quantization_info.mixed_precision_cfg == self.expected_mp_cfg) # check that quantizer gets multiple bits configuration for layer in qat_ready_model.layers: From 213270859e0b2242158767c4f4a389908ca5af2c Mon Sep 17 00:00:00 2001 From: irenab Date: Sun, 2 Mar 2025 19:41:09 +0200 Subject: [PATCH 03/12] remove unused imports --- .../mixed_precision_search_facade.py | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/model_compression_toolkit/core/common/mixed_precision/mixed_precision_search_facade.py b/model_compression_toolkit/core/common/mixed_precision/mixed_precision_search_facade.py index 41c1fdb32..a6f737606 100644 --- a/model_compression_toolkit/core/common/mixed_precision/mixed_precision_search_facade.py +++ b/model_compression_toolkit/core/common/mixed_precision/mixed_precision_search_facade.py @@ -13,24 +13,20 @@ # limitations under the License. # ============================================================================== -import copy from enum import Enum -import numpy as np -from typing import List, Callable, Dict +from typing import List, Callable from model_compression_toolkit.core import MixedPrecisionQuantizationConfig from model_compression_toolkit.core.common import Graph -from model_compression_toolkit.core.common.hessian import HessianInfoService -from model_compression_toolkit.core.common.mixed_precision.resource_utilization_tools.resource_utilization import ResourceUtilization, RUTarget from model_compression_toolkit.core.common.framework_implementation import FrameworkImplementation -from model_compression_toolkit.core.common.mixed_precision.mixed_precision_search_manager import MixedPrecisionSearchManager -from model_compression_toolkit.core.common.mixed_precision.search_methods.linear_programming import \ - mp_integer_programming_search from model_compression_toolkit.core.common.framework_info import FrameworkInfo +from model_compression_toolkit.core.common.hessian import HessianInfoService +from model_compression_toolkit.core.common.mixed_precision.mixed_precision_search_manager import \ + MixedPrecisionSearchManager +from model_compression_toolkit.core.common.mixed_precision.resource_utilization_tools.resource_utilization import \ + ResourceUtilization, RUTarget from model_compression_toolkit.core.common.mixed_precision.solution_refinement_procedure import \ greedy_solution_refinement_procedure -from model_compression_toolkit.core.common.substitutions.apply_substitutions import substitute -from model_compression_toolkit.logger import Logger class BitWidthSearchMethod(Enum): From 617e4762031a2f35026fdcbc4ca68f9356b8f1eb Mon Sep 17 00:00:00 2001 From: irenab Date: Sun, 2 Mar 2025 19:44:00 +0200 Subject: [PATCH 04/12] move sensitivity computation from linear_programming to MP search manager --- .../mixed_precision_search_manager.py | 73 ++++++++++++++++- .../search_methods/linear_programming.py | 79 +------------------ 2 files changed, 74 insertions(+), 78 deletions(-) diff --git a/model_compression_toolkit/core/common/mixed_precision/mixed_precision_search_manager.py b/model_compression_toolkit/core/common/mixed_precision/mixed_precision_search_manager.py index 333a2a71e..c245721c4 100644 --- a/model_compression_toolkit/core/common/mixed_precision/mixed_precision_search_manager.py +++ b/model_compression_toolkit/core/common/mixed_precision/mixed_precision_search_manager.py @@ -13,11 +13,13 @@ # limitations under the License. # ============================================================================== import copy +from tqdm import tqdm -from typing import Callable, Dict, List +from typing import Dict, List import numpy as np +from model_compression_toolkit.constants import EPS from model_compression_toolkit.core.common import BaseNode from model_compression_toolkit.core.common.framework_implementation import FrameworkImplementation from model_compression_toolkit.core.common.framework_info import FrameworkInfo @@ -104,6 +106,75 @@ def search(self): return self.config_reconstruction_helper.reconstruct_config_from_virtual_graph(config) + def build_sensitivity_mapping(self, eps: float = EPS) -> Dict[int, Dict[int, float]]: + """ + This function measures the sensitivity of a change in a bitwidth of a layer on the entire model. + It builds a mapping from a node's index, to its bitwidht's effect on the model sensitivity. + For each node and some possible node's bitwidth (according to the given search space), we use + the framework function compute_metric_fn in order to infer + a batch of images, and compute (using the inference results) the sensitivity metric of + the configured mixed-precision model. + + Args: + eps: Epsilon value to manually increase metric value (if necessary) for numerical stability + + Returns: + Mapping from each node's index in a graph, to a dictionary from the bitwidth index (of this node) to + the sensitivity of the model. + + """ + + Logger.info('Starting to evaluate metrics') + layer_to_metrics_mapping = {} + + if self.using_virtual_graph: + origin_max_config = self.config_reconstruction_helper.reconstruct_config_from_virtual_graph( + self.max_ru_config) + max_config_value = self.compute_metric_fn(origin_max_config) + else: + max_config_value = self.compute_metric_fn(self.max_ru_config) + + for node_idx, layer_possible_bitwidths_indices in tqdm(self.layer_to_bitwidth_mapping.items(), + total=len(self.layer_to_bitwidth_mapping)): + layer_to_metrics_mapping[node_idx] = {} + + for bitwidth_idx in layer_possible_bitwidths_indices: + if self.max_ru_config[node_idx] == bitwidth_idx: + # This is a computation of the metric for the max configuration, assign pre-calculated value + layer_to_metrics_mapping[node_idx][bitwidth_idx] = max_config_value + continue + + # Create a configuration that differs at one layer only from the baseline model + mp_model_configuration = self.max_ru_config.copy() + mp_model_configuration[node_idx] = bitwidth_idx + + # Build a distance matrix using the function we got from the framework implementation. + if self.using_virtual_graph: + # Reconstructing original graph's configuration from virtual graph's configuration + origin_mp_model_configuration = \ + self.config_reconstruction_helper.reconstruct_config_from_virtual_graph( + mp_model_configuration, + changed_virtual_nodes_idx=[node_idx], + original_base_config=origin_max_config) + origin_changed_nodes_indices = [i for i, c in enumerate(origin_max_config) if + c != origin_mp_model_configuration[i]] + metric_value = self.compute_metric_fn( + origin_mp_model_configuration, + origin_changed_nodes_indices, + origin_max_config) + else: + metric_value = self.compute_metric_fn( + mp_model_configuration, + [node_idx], + self.max_ru_config) + + layer_to_metrics_mapping[node_idx][bitwidth_idx] = max(metric_value, max_config_value + eps) + + # Finalize distance metric mapping + self.finalize_distance_metric(layer_to_metrics_mapping) + + return layer_to_metrics_mapping + def _get_mp_graph(self, graph, target_resource_utilization): """ Get graph for mixed precision search. Virtual graph is built if bops is restricted and both activation and diff --git a/model_compression_toolkit/core/common/mixed_precision/search_methods/linear_programming.py b/model_compression_toolkit/core/common/mixed_precision/search_methods/linear_programming.py index e6b772f68..3b3bbc0da 100644 --- a/model_compression_toolkit/core/common/mixed_precision/search_methods/linear_programming.py +++ b/model_compression_toolkit/core/common/mixed_precision/search_methods/linear_programming.py @@ -15,8 +15,7 @@ import numpy as np from pulp import * -from tqdm import tqdm -from typing import Dict, Tuple, Any, Optional +from typing import Dict, Tuple, Any from model_compression_toolkit.logger import Logger from model_compression_toolkit.core.common.mixed_precision.resource_utilization_tools.resource_utilization import ResourceUtilization, RUTarget @@ -51,7 +50,7 @@ def mp_integer_programming_search(search_manager: MixedPrecisionSearchManager, # Build a mapping from each layer's index (in the model) to a dictionary that maps the # bitwidth index to the observed sensitivity of the model when using that bitwidth for that layer. - layer_to_metrics_mapping = _build_layer_to_metrics_mapping(search_manager, target_resource_utilization) + layer_to_metrics_mapping = search_manager.build_sensitivity_mapping() # Init variables to find their values when solving the lp problem. layer_to_indicator_vars_mapping, layer_to_objective_vars_mapping = _init_problem_vars(layer_to_metrics_mapping) @@ -254,77 +253,3 @@ def _aggregate_for_lp(targets_ru_vec: Dict[RUTarget, Any], target: RUTarget) -> return list(targets_ru_vec[target]) raise ValueError(f'Unexpected target {target}.') # pragma: no cover - - -def _build_layer_to_metrics_mapping(search_manager: MixedPrecisionSearchManager, - target_resource_utilization: ResourceUtilization, - eps: float = EPS) -> Dict[int, Dict[int, float]]: - """ - This function measures the sensitivity of a change in a bitwidth of a layer on the entire model. - It builds a mapping from a node's index, to its bitwidht's effect on the model sensitivity. - For each node and some possible node's bitwidth (according to the given search space), we use - the framework function compute_metric_fn in order to infer - a batch of images, and compute (using the inference results) the sensitivity metric of - the configured mixed-precision model. - - Args: - search_manager: MixedPrecisionSearchManager object to be used for problem formalization. - target_resource_utilization: ResourceUtilization to constrain our LP problem with some resources limitations - (like model' weights memory consumption). - eps: Epsilon value to manually increase metric value (if necessary) for numerical stability - - Returns: - Mapping from each node's index in a graph, to a dictionary from the bitwidth index (of this node) to - the sensitivity of the model. - - """ - - Logger.info('Starting to evaluate metrics') - layer_to_metrics_mapping = {} - - if search_manager.using_virtual_graph: - origin_max_config = search_manager.config_reconstruction_helper.reconstruct_config_from_virtual_graph(search_manager.max_ru_config) - max_config_value = search_manager.compute_metric_fn(origin_max_config) - else: - max_config_value = search_manager.compute_metric_fn(search_manager.max_ru_config) - - for node_idx, layer_possible_bitwidths_indices in tqdm(search_manager.layer_to_bitwidth_mapping.items(), - total=len(search_manager.layer_to_bitwidth_mapping)): - layer_to_metrics_mapping[node_idx] = {} - - for bitwidth_idx in layer_possible_bitwidths_indices: - if search_manager.max_ru_config[node_idx] == bitwidth_idx: - # This is a computation of the metric for the max configuration, assign pre-calculated value - layer_to_metrics_mapping[node_idx][bitwidth_idx] = max_config_value - continue - - # Create a configuration that differs at one layer only from the baseline model - mp_model_configuration = search_manager.max_ru_config.copy() - mp_model_configuration[node_idx] = bitwidth_idx - - # Build a distance matrix using the function we got from the framework implementation. - if search_manager.using_virtual_graph: - # Reconstructing original graph's configuration from virtual graph's configuration - origin_mp_model_configuration = \ - search_manager.config_reconstruction_helper.reconstruct_config_from_virtual_graph( - mp_model_configuration, - changed_virtual_nodes_idx=[node_idx], - original_base_config=origin_max_config) - origin_changed_nodes_indices = [i for i, c in enumerate(origin_max_config) if - c != origin_mp_model_configuration[i]] - metric_value = search_manager.compute_metric_fn( - origin_mp_model_configuration, - origin_changed_nodes_indices, - origin_max_config) - else: - metric_value = search_manager.compute_metric_fn( - mp_model_configuration, - [node_idx], - search_manager.max_ru_config) - - layer_to_metrics_mapping[node_idx][bitwidth_idx] = max(metric_value, max_config_value + eps) - - # Finalize distance metric mapping - search_manager.finalize_distance_metric(layer_to_metrics_mapping) - - return layer_to_metrics_mapping From 266f3f97296e254640aa0c865fe46d122851cdeb Mon Sep 17 00:00:00 2001 From: irenab Date: Sun, 2 Mar 2025 19:59:11 +0200 Subject: [PATCH 05/12] remove separate computation for non-configurable nodes --- .../mixed_precision_ru_helper.py | 53 ++++++------------- .../mixed_precision_search_manager.py | 1 - .../search_methods/linear_programming.py | 12 +---- 3 files changed, 19 insertions(+), 47 deletions(-) diff --git a/model_compression_toolkit/core/common/mixed_precision/mixed_precision_ru_helper.py b/model_compression_toolkit/core/common/mixed_precision/mixed_precision_ru_helper.py index 400cbb9e0..2cfcef336 100644 --- a/model_compression_toolkit/core/common/mixed_precision/mixed_precision_ru_helper.py +++ b/model_compression_toolkit/core/common/mixed_precision/mixed_precision_ru_helper.py @@ -17,7 +17,7 @@ import numpy as np from model_compression_toolkit.core import FrameworkInfo -from model_compression_toolkit.core.common import Graph, BaseNode +from model_compression_toolkit.core.common import Graph from model_compression_toolkit.core.common.framework_implementation import FrameworkImplementation from model_compression_toolkit.core.common.mixed_precision.resource_utilization_tools.resource_utilization import \ RUTarget @@ -36,7 +36,7 @@ def __init__(self, graph: Graph, fw_info: FrameworkInfo, fw_impl: FrameworkImple self.fw_impl = fw_impl self.ru_calculator = ResourceUtilizationCalculator(graph, fw_impl, fw_info) - def compute_utilization(self, ru_targets: Set[RUTarget], mp_cfg: Optional[List[int]]) -> Dict[RUTarget, np.ndarray]: + def compute_utilization(self, ru_targets: Set[RUTarget], mp_cfg: List[int]) -> Dict[RUTarget, np.ndarray]: """ Compute utilization of requested targets for a specific configuration in the format expected by LP problem formulation namely a vector of ru values for relevant memory elements (nodes or cuts) in a constant order @@ -51,7 +51,7 @@ def compute_utilization(self, ru_targets: Set[RUTarget], mp_cfg: Optional[List[i """ ru = {} - act_qcs, w_qcs = self.get_quantization_candidates(mp_cfg) if mp_cfg else (None, None) + act_qcs, w_qcs = self.get_quantization_candidates(mp_cfg) if RUTarget.WEIGHTS in ru_targets: wu = self._weights_utilization(w_qcs) ru[RUTarget.WEIGHTS] = np.array(list(wu.values())) @@ -71,7 +71,7 @@ def compute_utilization(self, ru_targets: Set[RUTarget], mp_cfg: Optional[List[i return ru def get_quantization_candidates(self, mp_cfg) \ - -> Tuple[Dict[BaseNode, NodeActivationQuantizationConfig], Dict[BaseNode, NodeWeightsQuantizationConfig]]: + -> Tuple[Dict[str, NodeActivationQuantizationConfig], Dict[str, NodeWeightsQuantizationConfig]]: """ Retrieve quantization candidates objects for weights and activations from the configuration list. @@ -88,70 +88,51 @@ def get_quantization_candidates(self, mp_cfg) \ w_qcs = {n.name: cfg.weights_quantization_cfg for n, cfg in node_qcs.items()} return act_qcs, w_qcs - def _weights_utilization(self, w_qcs: Optional[Dict[BaseNode, NodeWeightsQuantizationConfig]]) -> Dict[BaseNode, float]: + def _weights_utilization(self, w_qcs: Dict[str, NodeWeightsQuantizationConfig]) -> Dict[str, float]: """ - Compute weights utilization for configurable weights if configuration is passed, - or for non-configurable nodes otherwise. + Compute weights utilization for configurable weights. Args: - w_qcs: nodes quantization configuration to compute, or None. + w_qcs: nodes quantization configuration to compute. Returns: Weight utilization per node. """ - if w_qcs: - target_criterion = TargetInclusionCriterion.QConfigurable - bitwidth_mode = BitwidthMode.QCustom - else: - target_criterion = TargetInclusionCriterion.QNonConfigurable - bitwidth_mode = BitwidthMode.QDefaultSP - - _, nodes_util, _ = self.ru_calculator.compute_weights_utilization(target_criterion=target_criterion, - bitwidth_mode=bitwidth_mode, + _, nodes_util, _ = self.ru_calculator.compute_weights_utilization(target_criterion=TargetInclusionCriterion.AnyQuantized, + bitwidth_mode=BitwidthMode.QCustom, w_qcs=w_qcs) nodes_util = {n: u.bytes for n, u in nodes_util.items()} return nodes_util - def _activation_utilization(self, act_qcs: Optional[Dict[BaseNode, NodeActivationQuantizationConfig]]) \ - -> Optional[Dict[Any, float]]: + def _activation_utilization(self, act_qcs: Dict[str, NodeActivationQuantizationConfig]) -> Dict[Any, float]: """ - Compute activation utilization using MaxCut for all quantized nodes if configuration is passed. + Compute activation utilization using MaxCut for all quantized nodes. Args: - act_qcs: nodes activation configuration or None. + act_qcs: nodes activation configuration. Returns: - Activation utilization per cut, or empty dict if no configuration was passed. + Activation utilization per cut. """ - # Maxcut activation utilization is computed for all quantized nodes, so non-configurable memory is already - # covered by the computation of configurable activations. - if not act_qcs: - return {} - _, cuts_util, *_ = self.ru_calculator.compute_activation_utilization_by_cut( TargetInclusionCriterion.AnyQuantized, bitwidth_mode=BitwidthMode.QCustom, act_qcs=act_qcs) cuts_util = {c: u.bytes for c, u in cuts_util.items()} return cuts_util def _bops_utilization(self, - act_qcs: Optional[Dict[BaseNode, NodeActivationQuantizationConfig]], - w_qcs: Optional[Dict[BaseNode, NodeWeightsQuantizationConfig]]) -> np.ndarray: + act_qcs: Optional[Dict[str, NodeActivationQuantizationConfig]], + w_qcs: Optional[Dict[str, NodeWeightsQuantizationConfig]]) -> np.ndarray: """ Computes a resource utilization vector with the respective bit-operations (BOPS) count according to the given mixed-precision configuration. Args: - act_qcs: nodes activation configuration or None. - w_qcs: nodes quantization configuration to compute, or None. - Either both are provided, or both are None. + act_qcs: nodes activation configuration. + w_qcs: nodes quantization configuration to compute. Returns: A vector of node's BOPS count. """ - assert [act_qcs, w_qcs].count(None) in [0, 2], 'act_qcs and w_qcs should both be provided or both be None.' - if act_qcs is None: - return np.array([]) - _, detailed_bops = self.ru_calculator.compute_bops(TargetInclusionCriterion.Any, BitwidthMode.QCustom, act_qcs=act_qcs, w_qcs=w_qcs) return np.array(list(detailed_bops.values())) diff --git a/model_compression_toolkit/core/common/mixed_precision/mixed_precision_search_manager.py b/model_compression_toolkit/core/common/mixed_precision/mixed_precision_search_manager.py index c245721c4..cbbbb3303 100644 --- a/model_compression_toolkit/core/common/mixed_precision/mixed_precision_search_manager.py +++ b/model_compression_toolkit/core/common/mixed_precision/mixed_precision_search_manager.py @@ -85,7 +85,6 @@ def __init__(self, self.min_ru_config = self.mp_graph.get_min_candidates_config(fw_info) self.max_ru_config = self.mp_graph.get_max_candidates_config(fw_info) self.min_ru = self.ru_helper.compute_utilization(self.ru_targets_to_compute, self.min_ru_config) - self.non_conf_ru_dict = self.ru_helper.compute_utilization(self.ru_targets_to_compute, None) self.config_reconstruction_helper = ConfigReconstructionHelper(virtual_graph=self.mp_graph, original_graph=self.original_graph) diff --git a/model_compression_toolkit/core/common/mixed_precision/search_methods/linear_programming.py b/model_compression_toolkit/core/common/mixed_precision/search_methods/linear_programming.py index 3b3bbc0da..73ba3f297 100644 --- a/model_compression_toolkit/core/common/mixed_precision/search_methods/linear_programming.py +++ b/model_compression_toolkit/core/common/mixed_precision/search_methods/linear_programming.py @@ -162,8 +162,7 @@ def _formalize_problem(layer_to_indicator_vars_mapping: Dict[int, Dict[int, LpVa _add_ru_constraints(search_manager=search_manager, target_resource_utilization=target_resource_utilization, indicators_matrix=indicators_matrix, - lp_problem=lp_problem, - non_conf_ru_dict=search_manager.non_conf_ru_dict) + lp_problem=lp_problem) else: # pragma: no cover Logger.critical("Unable to execute mixed-precision search: 'target_resource_utilization' is None. " "A valid 'target_resource_utilization' is required.") @@ -173,8 +172,7 @@ def _formalize_problem(layer_to_indicator_vars_mapping: Dict[int, Dict[int, LpVa def _add_ru_constraints(search_manager: MixedPrecisionSearchManager, target_resource_utilization: ResourceUtilization, indicators_matrix: np.ndarray, - lp_problem: LpProblem, - non_conf_ru_dict: Dict[RUTarget, np.ndarray]): + lp_problem: LpProblem): """ Adding targets constraints for the Lp problem for the given target resource utilization. The update to the Lp problem object is done inplace. @@ -184,7 +182,6 @@ def _add_ru_constraints(search_manager: MixedPrecisionSearchManager, target_resource_utilization: Target resource utilization. indicators_matrix: A diagonal matrix of the Lp problem's indicators. lp_problem: An Lp problem object to add constraint to. - non_conf_ru_dict: A non-configurable nodes' resource utilization vectors for the constrained targets. """ ru_indicated_vectors = {} # targets to add constraints for @@ -203,11 +200,6 @@ def _add_ru_constraints(search_manager: MixedPrecisionSearchManager, # that would be required if that configuration is selected). # Each element in a vector is an lp object representing the configurations sum term for a memory element. ru_vec = indicated_ru_matrix.sum(axis=1) + search_manager.min_ru[target] - - non_conf_ru_vec = non_conf_ru_dict[target] - if non_conf_ru_vec is not None and non_conf_ru_vec.size: - # add non-conf value as additional mem elements so that they get aggregated - ru_vec = np.concatenate([ru_vec, non_conf_ru_vec]) ru_indicated_vectors[target] = ru_vec # Add constraints only for the restricted targets in target resource utilization. From 5c619ad4ed82d05c7b6749a647f35016a5c89299 Mon Sep 17 00:00:00 2001 From: irenab Date: Mon, 3 Mar 2025 09:27:33 +0200 Subject: [PATCH 06/12] fix tests --- .../feature_networks_tests/feature_networks/qat/qat_test.py | 2 +- .../feature_models/mixed_precision_activation_test.py | 2 +- .../feature_models/mixed_precision_weights_test.py | 2 +- tests/pytorch_tests/model_tests/feature_models/qat_test.py | 4 ++-- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/keras_tests/feature_networks_tests/feature_networks/qat/qat_test.py b/tests/keras_tests/feature_networks_tests/feature_networks/qat/qat_test.py index d3e343fe6..60b728065 100644 --- a/tests/keras_tests/feature_networks_tests/feature_networks/qat/qat_test.py +++ b/tests/keras_tests/feature_networks_tests/feature_networks/qat/qat_test.py @@ -306,7 +306,7 @@ def run_test(self, **kwargs): self.compare(qat_ready_model, quantization_info) - def _compare(self, qat_ready_model, quantization_info): + def compare(self, qat_ready_model, quantization_info): self.unit_test.assertTrue(quantization_info.mixed_precision_cfg == self.expected_mp_cfg) diff --git a/tests/pytorch_tests/model_tests/feature_models/mixed_precision_activation_test.py b/tests/pytorch_tests/model_tests/feature_models/mixed_precision_activation_test.py index dfc9edb13..9e2fbf6d0 100644 --- a/tests/pytorch_tests/model_tests/feature_models/mixed_precision_activation_test.py +++ b/tests/pytorch_tests/model_tests/feature_models/mixed_precision_activation_test.py @@ -72,7 +72,7 @@ def compare(self, quantized_model, float_model, input_x=None, quantization_info: raise NotImplementedError def verify_config(self, result_config, expected_config): - self.unit_test.assertTrue(all(result_config == expected_config), + self.unit_test.assertTrue(result_config == expected_config, f"Configuration mismatch: expected {expected_config} but got {result_config}.") diff --git a/tests/pytorch_tests/model_tests/feature_models/mixed_precision_weights_test.py b/tests/pytorch_tests/model_tests/feature_models/mixed_precision_weights_test.py index f09fb5b53..1dd065fe0 100644 --- a/tests/pytorch_tests/model_tests/feature_models/mixed_precision_weights_test.py +++ b/tests/pytorch_tests/model_tests/feature_models/mixed_precision_weights_test.py @@ -335,7 +335,7 @@ def get_resource_utilization(self): return ResourceUtilization(80) def compare(self, quantized_models, float_model, input_x=None, quantization_info=None): - self.unit_test.assertTrue(all(quantization_info.mixed_precision_cfg == self.expected_config)) + self.unit_test.assertTrue(quantization_info.mixed_precision_cfg == self.expected_config) class MixedPrecisionWeightsTestNet(torch.nn.Module): diff --git a/tests/pytorch_tests/model_tests/feature_models/qat_test.py b/tests/pytorch_tests/model_tests/feature_models/qat_test.py index 73154073c..e650feb44 100644 --- a/tests/pytorch_tests/model_tests/feature_models/qat_test.py +++ b/tests/pytorch_tests/model_tests/feature_models/qat_test.py @@ -289,7 +289,7 @@ def run_test(self): input_x=self.representative_data_gen(), quantization_info=quantization_info) - self.unit_test.assertTrue(all(quantization_info.mixed_precision_cfg == [1, 0, 0, 1, 0])) + self.unit_test.assertTrue(quantization_info.mixed_precision_cfg == [1, 0, 0, 1, 0]) # check that quantizer gets multiple bits configuration for _, layer in qat_ready_model.named_children(): @@ -336,7 +336,7 @@ def run_test(self): quantization_info=quantization_info) # check that MP search doesn't return 8 bits configuration for all layers - self.unit_test.assertTrue(all(quantization_info.mixed_precision_cfg == [1, 1, 0, 0, 0])) + self.unit_test.assertTrue(quantization_info.mixed_precision_cfg == [1, 1, 0, 0, 0]) # check that quantizer gets multiple bits configuration for _, layer in qat_ready_model.named_children(): From 6fdd3d646cad333246a2cdbde68328650637e69c Mon Sep 17 00:00:00 2001 From: irenab Date: Mon, 3 Mar 2025 15:29:35 +0200 Subject: [PATCH 07/12] simplify ru constraints construction in mixed precision --- .../mixed_precision_ru_helper.py | 99 ++++---------- .../mixed_precision_search_facade.py | 2 + .../mixed_precision_search_manager.py | 54 ++++---- .../resource_utilization.py | 32 +++-- .../search_methods/linear_programming.py | 128 ++++++------------ .../test_lp_search_bitwidth.py | 119 ++++++---------- .../test_tensorboard_writer.py | 3 +- 7 files changed, 167 insertions(+), 270 deletions(-) diff --git a/model_compression_toolkit/core/common/mixed_precision/mixed_precision_ru_helper.py b/model_compression_toolkit/core/common/mixed_precision/mixed_precision_ru_helper.py index 2cfcef336..56d969d1c 100644 --- a/model_compression_toolkit/core/common/mixed_precision/mixed_precision_ru_helper.py +++ b/model_compression_toolkit/core/common/mixed_precision/mixed_precision_ru_helper.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -from typing import List, Set, Dict, Optional, Tuple, Any +from typing import List, Set, Dict, Optional, Tuple, Any, Union import numpy as np @@ -38,37 +38,41 @@ def __init__(self, graph: Graph, fw_info: FrameworkInfo, fw_impl: FrameworkImple def compute_utilization(self, ru_targets: Set[RUTarget], mp_cfg: List[int]) -> Dict[RUTarget, np.ndarray]: """ - Compute utilization of requested targets for a specific configuration in the format expected by LP problem - formulation namely a vector of ru values for relevant memory elements (nodes or cuts) in a constant order - (between calls). + Compute utilization of requested targets for a specific configuration: + for weights and bops - total utilization, + for activations and total - utilization per cut. Args: ru_targets: resource utilization targets to compute. mp_cfg: a list of candidates indices for configurable layers. Returns: - Dict of the computed utilization per target. + Dict of the computed utilization per target, as 1d vector. """ - - ru = {} act_qcs, w_qcs = self.get_quantization_candidates(mp_cfg) - if RUTarget.WEIGHTS in ru_targets: - wu = self._weights_utilization(w_qcs) - ru[RUTarget.WEIGHTS] = np.array(list(wu.values())) - - if RUTarget.ACTIVATION in ru_targets: - au = self._activation_utilization(act_qcs) - ru[RUTarget.ACTIVATION] = np.array(list(au.values())) - - if RUTarget.BOPS in ru_targets: - ru[RUTarget.BOPS] = self._bops_utilization(act_qcs=act_qcs, w_qcs=w_qcs) - - if RUTarget.TOTAL in ru_targets: - raise ValueError('Total target should be computed based on weights and activations targets.') - assert len(ru) == len(ru_targets), (f'Mismatch between the number of computed and requested metrics.' - f'Requested {ru_targets}') - return ru + ru, detailed_ru = self.ru_calculator.compute_resource_utilization(TargetInclusionCriterion.AnyQuantized, + BitwidthMode.QCustom, + act_qcs=act_qcs, + w_qcs=w_qcs, + ru_targets=ru_targets, + allow_unused_qcs=True, + return_detailed=True) + + ru_dict = {k: np.array([v]) for k, v in ru.get_resource_utilization_dict(restricted_only=True).items()} + # For activation and total we need utilization per cut, as different mp configurations might result in + # different cuts to be maximal. + for target in [RUTarget.ACTIVATION, RUTarget.TOTAL]: + if target in ru_dict: + ru_dict[target] = np.array(list(detailed_ru[target].values())) + + assert all(v.ndim == 1 for v in ru_dict.values()) + if RUTarget.ACTIVATION in ru_targets and RUTarget.TOTAL in ru_targets: + assert ru_dict[RUTarget.ACTIVATION].shape == ru_dict[RUTarget.TOTAL].shape + + assert len(ru_dict) == len(ru_targets), (f'Mismatch between the number of computed and requested metrics.' + f'Requested {ru_targets}') + return ru_dict def get_quantization_candidates(self, mp_cfg) \ -> Tuple[Dict[str, NodeActivationQuantizationConfig], Dict[str, NodeWeightsQuantizationConfig]]: @@ -87,52 +91,3 @@ def get_quantization_candidates(self, mp_cfg) \ act_qcs = {n.name: cfg.activation_quantization_cfg for n, cfg in node_qcs.items()} w_qcs = {n.name: cfg.weights_quantization_cfg for n, cfg in node_qcs.items()} return act_qcs, w_qcs - - def _weights_utilization(self, w_qcs: Dict[str, NodeWeightsQuantizationConfig]) -> Dict[str, float]: - """ - Compute weights utilization for configurable weights. - - Args: - w_qcs: nodes quantization configuration to compute. - - Returns: - Weight utilization per node. - """ - _, nodes_util, _ = self.ru_calculator.compute_weights_utilization(target_criterion=TargetInclusionCriterion.AnyQuantized, - bitwidth_mode=BitwidthMode.QCustom, - w_qcs=w_qcs) - nodes_util = {n: u.bytes for n, u in nodes_util.items()} - return nodes_util - - def _activation_utilization(self, act_qcs: Dict[str, NodeActivationQuantizationConfig]) -> Dict[Any, float]: - """ - Compute activation utilization using MaxCut for all quantized nodes. - - Args: - act_qcs: nodes activation configuration. - - Returns: - Activation utilization per cut. - """ - _, cuts_util, *_ = self.ru_calculator.compute_activation_utilization_by_cut( - TargetInclusionCriterion.AnyQuantized, bitwidth_mode=BitwidthMode.QCustom, act_qcs=act_qcs) - cuts_util = {c: u.bytes for c, u in cuts_util.items()} - return cuts_util - - def _bops_utilization(self, - act_qcs: Optional[Dict[str, NodeActivationQuantizationConfig]], - w_qcs: Optional[Dict[str, NodeWeightsQuantizationConfig]]) -> np.ndarray: - """ - Computes a resource utilization vector with the respective bit-operations (BOPS) count - according to the given mixed-precision configuration. - - Args: - act_qcs: nodes activation configuration. - w_qcs: nodes quantization configuration to compute. - - Returns: - A vector of node's BOPS count. - """ - _, detailed_bops = self.ru_calculator.compute_bops(TargetInclusionCriterion.Any, BitwidthMode.QCustom, - act_qcs=act_qcs, w_qcs=w_qcs) - return np.array(list(detailed_bops.values())) diff --git a/model_compression_toolkit/core/common/mixed_precision/mixed_precision_search_facade.py b/model_compression_toolkit/core/common/mixed_precision/mixed_precision_search_facade.py index a6f737606..93beee95d 100644 --- a/model_compression_toolkit/core/common/mixed_precision/mixed_precision_search_facade.py +++ b/model_compression_toolkit/core/common/mixed_precision/mixed_precision_search_facade.py @@ -65,6 +65,8 @@ def search_bit_width(graph: Graph, bit-width index on the node). """ + assert target_resource_utilization.is_any_restricted() + # If we only run weights compression with MP than no need to consider activation quantization when computing the # MP metric (it adds noise to the computation) tru = target_resource_utilization diff --git a/model_compression_toolkit/core/common/mixed_precision/mixed_precision_search_manager.py b/model_compression_toolkit/core/common/mixed_precision/mixed_precision_search_manager.py index cbbbb3303..1750cdf1f 100644 --- a/model_compression_toolkit/core/common/mixed_precision/mixed_precision_search_manager.py +++ b/model_compression_toolkit/core/common/mixed_precision/mixed_precision_search_manager.py @@ -13,6 +13,8 @@ # limitations under the License. # ============================================================================== import copy +from collections import defaultdict + from tqdm import tqdm from typing import Dict, List @@ -73,18 +75,12 @@ def __init__(self, self.mp_topo_configurable_nodes = self.mp_graph.get_configurable_sorted_nodes(fw_info) self.layer_to_bitwidth_mapping = self.get_search_space() - # To define RU Total constraints we need to compute weights and activations even if they have no constraints - # TODO currently this logic is duplicated in linear_programming.py - targets = target_resource_utilization.get_restricted_targets() - if RUTarget.TOTAL in targets: - targets = targets.union({RUTarget.ACTIVATION, RUTarget.WEIGHTS}) - {RUTarget.TOTAL} - self.ru_targets_to_compute = targets - + self.ru_targets = target_resource_utilization.get_restricted_targets() self.ru_helper = MixedPrecisionRUHelper(self.mp_graph, fw_info, fw_impl) self.min_ru_config = self.mp_graph.get_min_candidates_config(fw_info) self.max_ru_config = self.mp_graph.get_max_candidates_config(fw_info) - self.min_ru = self.ru_helper.compute_utilization(self.ru_targets_to_compute, self.min_ru_config) + self.min_ru = self.ru_helper.compute_utilization(self.ru_targets, self.min_ru_config) self.config_reconstruction_helper = ConfigReconstructionHelper(virtual_graph=self.mp_graph, original_graph=self.original_graph) @@ -100,10 +96,10 @@ def search(self): from model_compression_toolkit.core.common.mixed_precision.search_methods.linear_programming import \ mp_integer_programming_search config = mp_integer_programming_search(self, self.target_resource_utilization) - if self.mp_graph is self.original_graph: - return config - return self.config_reconstruction_helper.reconstruct_config_from_virtual_graph(config) + if self.using_virtual_graph: + config = self.config_reconstruction_helper.reconstruct_config_from_virtual_graph(config) + return config def build_sensitivity_mapping(self, eps: float = EPS) -> Dict[int, Dict[int, float]]: """ @@ -211,32 +207,32 @@ def get_search_space(self) -> Dict[int, List[int]]: indices_mapping[idx] = list(range(len(n.candidates_quantization_cfg))) # all search_methods space return indices_mapping - def compute_resource_utilization_matrix(self, target: RUTarget) -> np.ndarray: + def compute_resource_utilization_matrices(self) -> Dict[RUTarget, np.ndarray]: """ - Computes and builds a resource utilization matrix, to be used for the mixed-precision search problem formalization. + Computes and builds a resource utilization matrix for all restricted targets, to be used for the + mixed-precision search problem formalization. Utilization is computed relative to the minimal configuration, i.e. utilization for it will be 0. - Args: - target: The resource target for which the resource utilization is calculated (a RUTarget value). - Returns: - A resource utilization matrix of shape (num configurations, num memory elements). Num memory elements - depends on the target, e.g. num nodes or num cuts, for which utilization is computed. + A dictionary containing resource utilization matrix of shape (num configurations, num memory elements) + per ru target. Num memory elements depends on the target, e.g. num cuts or 1 for cumulative metrics. """ - ru_matrix = [] + rus_per_candidate = defaultdict(list) for c, c_n in enumerate(self.mp_topo_configurable_nodes): for candidate_idx in range(len(c_n.candidates_quantization_cfg)): if candidate_idx == self.min_ru_config[c]: - candidate_rus = self.min_ru[target] + candidate_rus = self.min_ru else: - candidate_rus = self.compute_node_ru_for_candidate(c, candidate_idx, target) + candidate_rus = self.compute_ru_for_candidate(c, candidate_idx) - ru_matrix.append(np.asarray(candidate_rus)) + for target, ru in candidate_rus.items(): + rus_per_candidate[target].append(ru) - np_ru_matrix = np.array(ru_matrix) - self.min_ru[target] # num configurations X num elements - return np_ru_matrix + # Each target contains a matrix of num configurations X num elements + relative_rus = {target: np.array(ru) - self.min_ru[target] for target, ru in rus_per_candidate.items()} + return relative_rus - def compute_node_ru_for_candidate(self, conf_node_idx: int, candidate_idx: int, target: RUTarget) -> np.ndarray: + def compute_ru_for_candidate(self, conf_node_idx: int, candidate_idx: int) -> Dict[RUTarget, np.ndarray]: """ Computes a resource utilization vector after replacing the given node's configuration candidate in the minimal target configuration with the given candidate index. @@ -244,13 +240,13 @@ def compute_node_ru_for_candidate(self, conf_node_idx: int, candidate_idx: int, Args: conf_node_idx: The index of a node in a sorted configurable nodes list. candidate_idx: Quantization config candidate to be used for the node's resource utilization computation. - target: The target for which the resource utilization is calculated (a RUTarget value). - Returns: Node's resource utilization vector. + Returns: + Node's resource utilization vector. """ cfg = self.replace_config_in_index(self.min_ru_config, conf_node_idx, candidate_idx) - return self.ru_helper.compute_utilization({target}, cfg)[target] + return self.ru_helper.compute_utilization(self.ru_targets, cfg) @staticmethod def replace_config_in_index(mp_cfg: List[int], idx: int, value: int) -> List[int]: @@ -284,7 +280,7 @@ def compute_resource_utilization_for_config(self, config: List[int]) -> Resource act_qcs, w_qcs = self.ru_helper.get_quantization_candidates(config) ru = self.ru_helper.ru_calculator.compute_resource_utilization( target_criterion=TargetInclusionCriterion.AnyQuantized, bitwidth_mode=BitwidthMode.QCustom, act_qcs=act_qcs, - w_qcs=w_qcs, ru_targets=self.ru_targets_to_compute, allow_unused_qcs=True) + w_qcs=w_qcs, ru_targets=self.ru_targets, allow_unused_qcs=True) return ru def finalize_distance_metric(self, layer_to_metrics_mapping: Dict[int, Dict[int, float]]): diff --git a/model_compression_toolkit/core/common/mixed_precision/resource_utilization_tools/resource_utilization.py b/model_compression_toolkit/core/common/mixed_precision/resource_utilization_tools/resource_utilization.py index d2746da1b..afb03f06a 100644 --- a/model_compression_toolkit/core/common/mixed_precision/resource_utilization_tools/resource_utilization.py +++ b/model_compression_toolkit/core/common/mixed_precision/resource_utilization_tools/resource_utilization.py @@ -51,25 +51,34 @@ class ResourceUtilization: bops: float = np.inf def weight_restricted(self): - return self.weights_memory < np.inf + return self._is_restricted(self.weights_memory) def activation_restricted(self): - return self.activation_memory < np.inf + return self._is_restricted(self.activation_memory) def total_mem_restricted(self): - return self.total_memory < np.inf + return self._is_restricted(self.total_memory) def bops_restricted(self): - return self.bops < np.inf + return self._is_restricted(self.bops) - def get_resource_utilization_dict(self) -> Dict[RUTarget, float]: + def get_resource_utilization_dict(self, restricted_only: bool = False) -> Dict[RUTarget, float]: """ - Returns: a dictionary with the ResourceUtilization object's values for each resource utilization target. + Get resource utilization as a dictionary. + + Args: + restricted_only: whether to include only targets with restricted utilization. + + Returns: + A dictionary containing the resource utilization with targets as keys. """ - return {RUTarget.WEIGHTS: self.weights_memory, - RUTarget.ACTIVATION: self.activation_memory, - RUTarget.TOTAL: self.total_memory, - RUTarget.BOPS: self.bops} + ru_dict = {RUTarget.WEIGHTS: self.weights_memory, + RUTarget.ACTIVATION: self.activation_memory, + RUTarget.TOTAL: self.total_memory, + RUTarget.BOPS: self.bops} + if restricted_only: + ru_dict = {k: v for k, v in ru_dict.items() if self._is_restricted(v)} + return ru_dict def is_satisfied_by(self, ru: 'ResourceUtilization') -> bool: """ @@ -114,3 +123,6 @@ def get_summary_str(self, restricted: bool): if RUTarget.BOPS in targets: summary.append(f"BOPS: {self.bops}") return ', '.join(summary) + + def _is_restricted(self, v): + return v < np.inf diff --git a/model_compression_toolkit/core/common/mixed_precision/search_methods/linear_programming.py b/model_compression_toolkit/core/common/mixed_precision/search_methods/linear_programming.py index 73ba3f297..331bd6b00 100644 --- a/model_compression_toolkit/core/common/mixed_precision/search_methods/linear_programming.py +++ b/model_compression_toolkit/core/common/mixed_precision/search_methods/linear_programming.py @@ -50,14 +50,14 @@ def mp_integer_programming_search(search_manager: MixedPrecisionSearchManager, # Build a mapping from each layer's index (in the model) to a dictionary that maps the # bitwidth index to the observed sensitivity of the model when using that bitwidth for that layer. - layer_to_metrics_mapping = search_manager.build_sensitivity_mapping() + layer_to_sensitivity_mapping = search_manager.build_sensitivity_mapping() # Init variables to find their values when solving the lp problem. - layer_to_indicator_vars_mapping, layer_to_objective_vars_mapping = _init_problem_vars(layer_to_metrics_mapping) + layer_to_indicator_vars_mapping, layer_to_objective_vars_mapping = _init_problem_vars(layer_to_sensitivity_mapping) # Add all equations and inequalities that define the problem. lp_problem = _formalize_problem(layer_to_indicator_vars_mapping, - layer_to_metrics_mapping, + layer_to_sensitivity_mapping, layer_to_objective_vars_mapping, target_resource_utilization, search_manager) @@ -150,28 +150,25 @@ def _formalize_problem(layer_to_indicator_vars_mapping: Dict[int, Dict[int, LpVa # Bound the feasible solution space with the desired resource utilization values. # Creates separate constraints for weights utilization and activation utilization. - if target_resource_utilization is not None: - indicators = [] - for layer in layer_to_metrics_mapping.keys(): - for _, indicator in layer_to_indicator_vars_mapping[layer].items(): - indicators.append(indicator) - - indicators_arr = np.array(indicators) - indicators_matrix = np.diag(indicators_arr) - - _add_ru_constraints(search_manager=search_manager, - target_resource_utilization=target_resource_utilization, - indicators_matrix=indicators_matrix, - lp_problem=lp_problem) - else: # pragma: no cover - Logger.critical("Unable to execute mixed-precision search: 'target_resource_utilization' is None. " - "A valid 'target_resource_utilization' is required.") + assert target_resource_utilization and target_resource_utilization.is_any_restricted() + + indicators = [] + for layer in layer_to_metrics_mapping.keys(): + for _, indicator in layer_to_indicator_vars_mapping[layer].items(): + indicators.append(indicator) + + indicators_vec = np.array(indicators) + + _add_ru_constraints(search_manager=search_manager, + target_resource_utilization=target_resource_utilization, + indicators_vec=indicators_vec, + lp_problem=lp_problem) return lp_problem def _add_ru_constraints(search_manager: MixedPrecisionSearchManager, target_resource_utilization: ResourceUtilization, - indicators_matrix: np.ndarray, + indicators_vec: np.ndarray, lp_problem: LpProblem): """ Adding targets constraints for the Lp problem for the given target resource utilization. @@ -180,68 +177,33 @@ def _add_ru_constraints(search_manager: MixedPrecisionSearchManager, Args: search_manager: MixedPrecisionSearchManager object to be used for resource utilization constraints formalization. target_resource_utilization: Target resource utilization. - indicators_matrix: A diagonal matrix of the Lp problem's indicators. + indicators_vec: A vector of the Lp problem's indicators. lp_problem: An Lp problem object to add constraint to. """ - ru_indicated_vectors = {} - # targets to add constraints for - constraints_targets = target_resource_utilization.get_restricted_targets() - # to add constraints for Total target we need to compute weight and activation - targets_to_compute = constraints_targets - if RUTarget.TOTAL in constraints_targets: - targets_to_compute = targets_to_compute.union({RUTarget.ACTIVATION, RUTarget.WEIGHTS}) - {RUTarget.TOTAL} - - for target in targets_to_compute: - ru_matrix = search_manager.compute_resource_utilization_matrix(target) # num elements X num configurations - indicated_ru_matrix = np.matmul(ru_matrix.T, indicators_matrix) # num elements X num configurations - - # Sum the indicated values over all configurations, and add the value for minimal configuration once. - # Indicated utilization values are relative to the minimal configuration, i.e. they represent the extra memory - # that would be required if that configuration is selected). - # Each element in a vector is an lp object representing the configurations sum term for a memory element. - ru_vec = indicated_ru_matrix.sum(axis=1) + search_manager.min_ru[target] - ru_indicated_vectors[target] = ru_vec - - # Add constraints only for the restricted targets in target resource utilization. - # Adding activation constraints modifies the lp term in ru_indicated_vectors, so if both activation and total - # are restricted we first add the constraints for total. - if RUTarget.TOTAL in constraints_targets and RUTarget.ACTIVATION in constraints_targets: - constraints_targets.remove(RUTarget.ACTIVATION) - constraints_targets = list(constraints_targets) + [RUTarget.ACTIVATION] - for target in constraints_targets: - target_resource_utilization_value = target_resource_utilization.get_resource_utilization_dict()[target] - aggr_ru = _aggregate_for_lp(ru_indicated_vectors, target) - for v in aggr_ru: - if isinstance(v, float): - if v > target_resource_utilization_value: - Logger.critical( - f"The model cannot be quantized to meet the specified target resource utilization {target.value} " - f"with the value {target_resource_utilization_value}.") # pragma: no cover - else: - lp_problem += v <= target_resource_utilization_value - - -def _aggregate_for_lp(targets_ru_vec: Dict[RUTarget, Any], target: RUTarget) -> list: - """ - Aggregate resource utilization values for the LP. - - Args: - targets_ru_vec: resource utilization vectors for all precomputed targets. - target: resource utilization target. - - Returns: - Aggregated resource utilization. - """ - if target == RUTarget.TOTAL: - w = lpSum(targets_ru_vec[RUTarget.WEIGHTS]) - act_ru_vec = targets_ru_vec[RUTarget.ACTIVATION] - return [w + v for v in act_ru_vec] - - if target in [RUTarget.WEIGHTS, RUTarget.BOPS]: - return [lpSum(targets_ru_vec[target])] - - if target == RUTarget.ACTIVATION: - # for max aggregation, each value constitutes a separate constraint - return list(targets_ru_vec[target]) - - raise ValueError(f'Unexpected target {target}.') # pragma: no cover + candidates_ru = search_manager.compute_resource_utilization_matrices() + min_ru = search_manager.min_ru + target_ru = target_resource_utilization.get_resource_utilization_dict(restricted_only=True) + assert candidates_ru.keys() == target_ru.keys() + + for target, ru_matrix in candidates_ru.items(): + # We expect 2d matrix of shape (num candidates, m). For cumulative metrics (weights, bops) m=1 - overall + # utilization. For max metrics (activation, total) m=num memory elements (max element depends on configuration) + assert ru_matrix.ndim == 2 + if target in [RUTarget.WEIGHTS, RUTarget.BOPS]: + assert ru_matrix.shape[1] == 1 + + # ru values are relative to the minimal configuration, so we adjust the target ru accordingly + ru_constraint = target_ru[target] - min_ru[target] + if any(ru_constraint < 0): + raise ValueError(f"The model cannot be quantized to meet the specified target resource utilization " + f"{target.value} with the value {target_ru[target]}.") + + indicated_ru_matrix = ru_matrix.T * indicators_vec + # build lp sum term over all candidates + ru_vec = indicated_ru_matrix.sum(axis=1) + + # For cumulative metrics a single constraint is added, for max metrics a separate constraint + # is added for each memory element (each element < target => max element < target). + assert len(ru_vec) == len(ru_constraint) + for v, c in zip(ru_vec, ru_constraint): + lp_problem += v <= c diff --git a/tests/keras_tests/non_parallel_tests/test_lp_search_bitwidth.py b/tests/keras_tests/non_parallel_tests/test_lp_search_bitwidth.py index 36de90950..d304b4f69 100644 --- a/tests/keras_tests/non_parallel_tests/test_lp_search_bitwidth.py +++ b/tests/keras_tests/non_parallel_tests/test_lp_search_bitwidth.py @@ -12,12 +12,16 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== +from unittest.mock import Mock + import numpy as np import unittest import keras from model_compression_toolkit.core import DEFAULTCONFIG from model_compression_toolkit.core.common.mixed_precision.distance_weighting import MpDistanceWeighting +from model_compression_toolkit.core.common.mixed_precision.mixed_precision_search_manager import \ + MixedPrecisionSearchManager from model_compression_toolkit.core.common.mixed_precision.resource_utilization_tools.resource_utilization import \ ResourceUtilization, RUTarget from model_compression_toolkit.core.common.mixed_precision.mixed_precision_quantization_config import \ @@ -57,32 +61,31 @@ def reconstruct_config_from_virtual_graph(self, class MockMixedPrecisionSearchManager: - def __init__(self, layer_to_ru_mapping): + def __init__(self, layer_to_ru_mapping, ru_targets): + self.ru_targets = ru_targets self.layer_to_bitwidth_mapping = {0: [0, 1, 2]} self.layer_to_ru_mapping = layer_to_ru_mapping - self.compute_metric_fn = lambda x, y=None, z=None: {0: 2, 1: 1, 2: 0}[x[0]] - self.min_ru = {RUTarget.WEIGHTS: [1], - RUTarget.ACTIVATION: [1], - RUTarget.BOPS: [1]} # minimal resource utilization in the tests layer_to_ru_mapping + self.min_ru = {t: np.array([1]) for t in ru_targets} # minimal resource utilization in the tests layer_to_ru_mapping self.max_ru_config = [0] self.config_reconstruction_helper = MockReconstructionHelper() - self.non_conf_ru_dict = {RUTarget.WEIGHTS: None, RUTarget.ACTIVATION: None, RUTarget.BOPS: None} - def compute_resource_utilization_matrix(self, target): - # minus 1 is normalization by the minimal resource utilization (which is always 1 in this test) - if target == RUTarget.WEIGHTS: - ru_matrix = [np.flip(np.array([ru.weights_memory - 1 for _, ru in self.layer_to_ru_mapping[0].items()]))] - elif target == RUTarget.ACTIVATION: - ru_matrix = [np.flip(np.array([ru.activation_memory - 1 for _, ru in self.layer_to_ru_mapping[0].items()]))] - elif target == RUTarget.BOPS: - ru_matrix = [np.flip(np.array([ru.bops - 1 for _, ru in self.layer_to_ru_mapping[0].items()]))] - else: - raise ValueError('Not supposed to get here') - return np.array(ru_matrix).T + def build_sensitivity_mapping(self): + return {0: {0: 0, 1: 1, 2: 2}} - def finalize_distance_metric(self, d): - return d + def compute_resource_utilization_matrices(self): + # minus 1 is normalization by the minimal resource utilization (which is always 1 in this test) + ru = { + RUTarget.WEIGHTS: + [np.flip(np.array([ru.weights_memory - 1 for _, ru in self.layer_to_ru_mapping[0].items()]))], + RUTarget.ACTIVATION: + [np.flip(np.array([ru.activation_memory - 1 for _, ru in self.layer_to_ru_mapping[0].items()]))], + RUTarget.BOPS: + [np.flip(np.array([ru.bops - 1 for _, ru in self.layer_to_ru_mapping[0].items()]))], + RUTarget.TOTAL: + [np.flip(np.array([ru.total_memory - 1 for _, ru in self.layer_to_ru_mapping[0].items()]))] + } + return {k: np.array(v).T for k, v in ru.items() if k in self.ru_targets} class TestLpSearchBitwidth(unittest.TestCase): @@ -92,7 +95,7 @@ def test_search_weights_only(self): layer_to_ru_mapping = {0: {2: ResourceUtilization(weights_memory=1), 1: ResourceUtilization(weights_memory=2), 0: ResourceUtilization(weights_memory=3)}} - mock_search_manager = MockMixedPrecisionSearchManager(layer_to_ru_mapping) + mock_search_manager = MockMixedPrecisionSearchManager(layer_to_ru_mapping, {RUTarget.WEIGHTS}) bit_cfg = mp_integer_programming_search(mock_search_manager, target_resource_utilization=target_resource_utilization) @@ -106,42 +109,26 @@ def test_search_weights_only(self): target_resource_utilization=target_resource_utilization) bit_cfg = mp_integer_programming_search(mock_search_manager, - target_resource_utilization=ResourceUtilization(weights_memory=np.inf)) + target_resource_utilization=ResourceUtilization(weights_memory=1000)) self.assertTrue(len(bit_cfg) == 1) - self.assertTrue(bit_cfg[0] == 0) # ResourceUtilization is Inf so expecting for the maximal bit-width result + self.assertTrue(bit_cfg[0] == 0) # expecting for the maximal bit-width result target_resource_utilization = None # target ResourceUtilization is not defined! with self.assertRaises(Exception): - bit_cfg = mp_integer_programming_search(mock_search_manager, - target_resource_utilization=target_resource_utilization) - - def test_search_weights_only_with_non_conf(self): - target_resource_utilization = ResourceUtilization(weights_memory=2+11) - layer_to_ru_mapping = {0: {2: ResourceUtilization(weights_memory=1), - 1: ResourceUtilization(weights_memory=2), - 0: ResourceUtilization(weights_memory=3)} - } - mock_search_manager = MockMixedPrecisionSearchManager(layer_to_ru_mapping) - mock_search_manager.non_conf_ru_dict = {RUTarget.WEIGHTS: np.array([5, 6])} - bit_cfg = mp_integer_programming_search(mock_search_manager, - target_resource_utilization=target_resource_utilization) + mp_integer_programming_search(mock_search_manager, + target_resource_utilization=target_resource_utilization) - self.assertTrue(len(bit_cfg) == 1) - self.assertTrue(bit_cfg[0] == 1) - - # make sure non_conf was taken into account and lower target has a different solution - target_resource_utilization = ResourceUtilization(weights_memory=2 + 10.9) - bit_cfg = mp_integer_programming_search(mock_search_manager, - target_resource_utilization=target_resource_utilization) - self.assertFalse(bit_cfg[0] == 1) + with self.assertRaises(Exception): + mp_integer_programming_search(mock_search_manager, + target_resource_utilization=ResourceUtilization(weights_memory=np.inf)) def test_search_activation_only(self): target_resource_utilization = ResourceUtilization(activation_memory=2) layer_to_ru_mapping = {0: {2: ResourceUtilization(activation_memory=1), 1: ResourceUtilization(activation_memory=2), 0: ResourceUtilization(activation_memory=3)}} - mock_search_manager = MockMixedPrecisionSearchManager(layer_to_ru_mapping) + mock_search_manager = MockMixedPrecisionSearchManager(layer_to_ru_mapping, {RUTarget.ACTIVATION}) bit_cfg = mp_integer_programming_search(mock_search_manager, target_resource_utilization=target_resource_utilization) @@ -156,17 +143,17 @@ def test_search_activation_only(self): bit_cfg = mp_integer_programming_search(mock_search_manager, target_resource_utilization=ResourceUtilization( - activation_memory=np.inf)) + activation_memory=1000)) self.assertTrue(len(bit_cfg) == 1) - self.assertTrue(bit_cfg[0] == 0) # ResourceUtilization is Inf so expecting for the maximal bit-width result + self.assertTrue(bit_cfg[0] == 0) # expecting for the maximal bit-width result def test_search_weights_and_activation(self): target_resource_utilization = ResourceUtilization(weights_memory=2, activation_memory=2) layer_to_ru_mapping = {0: {2: ResourceUtilization(weights_memory=1, activation_memory=1), 1: ResourceUtilization(weights_memory=2, activation_memory=2), 0: ResourceUtilization(weights_memory=3, activation_memory=3)}} - mock_search_manager = MockMixedPrecisionSearchManager(layer_to_ru_mapping) + mock_search_manager = MockMixedPrecisionSearchManager(layer_to_ru_mapping, {RUTarget.WEIGHTS, RUTarget.ACTIVATION}) bit_cfg = mp_integer_programming_search(mock_search_manager, target_resource_utilization=target_resource_utilization) @@ -180,18 +167,18 @@ def test_search_weights_and_activation(self): target_resource_utilization=target_resource_utilization) bit_cfg = mp_integer_programming_search(mock_search_manager, - target_resource_utilization=ResourceUtilization(weights_memory=np.inf, - activation_memory=np.inf)) + target_resource_utilization=ResourceUtilization(weights_memory=1000, + activation_memory=1000)) self.assertTrue(len(bit_cfg) == 1) - self.assertTrue(bit_cfg[0] == 0) # ResourceUtilization is Inf so expecting for the maximal bit-width result + self.assertTrue(bit_cfg[0] == 0) # expecting for the maximal bit-width result def test_search_total_resource_utilization(self): target_resource_utilization = ResourceUtilization(total_memory=4) - layer_to_ru_mapping = {0: {2: ResourceUtilization(weights_memory=1, activation_memory=1), - 1: ResourceUtilization(weights_memory=2, activation_memory=2), - 0: ResourceUtilization(weights_memory=3, activation_memory=3)}} - mock_search_manager = MockMixedPrecisionSearchManager(layer_to_ru_mapping) + layer_to_ru_mapping = {0: {2: ResourceUtilization(weights_memory=1, activation_memory=1, total_memory=2), + 1: ResourceUtilization(weights_memory=2, activation_memory=2, total_memory=4), + 0: ResourceUtilization(weights_memory=3, activation_memory=3, total_memory=6)}} + mock_search_manager = MockMixedPrecisionSearchManager(layer_to_ru_mapping, {RUTarget.TOTAL}) bit_cfg = mp_integer_programming_search(mock_search_manager, target_resource_utilization=target_resource_utilization) @@ -204,7 +191,7 @@ def test_search_bops_ru(self): layer_to_ru_mapping = {0: {2: ResourceUtilization(bops=1), 1: ResourceUtilization(bops=2), 0: ResourceUtilization(bops=3)}} - mock_search_manager = MockMixedPrecisionSearchManager(layer_to_ru_mapping) + mock_search_manager = MockMixedPrecisionSearchManager(layer_to_ru_mapping, {RUTarget.BOPS}) bit_cfg = mp_integer_programming_search(mock_search_manager, target_resource_utilization=target_resource_utilization) @@ -272,32 +259,14 @@ def representative_data_gen(): representative_data_gen, fw_info=fw_info) - cfg = search_bit_width(graph_to_search_cfg=graph, + cfg = search_bit_width(graph=graph, fw_info=DEFAULT_KERAS_INFO, fw_impl=keras_impl, - target_resource_utilization=ResourceUtilization(np.inf), + target_resource_utilization=ResourceUtilization(weights_memory=100), mp_config=core_config.mixed_precision_config, representative_data_gen=representative_data_gen, search_method=BitWidthSearchMethod.INTEGER_PROGRAMMING) - with self.assertRaises(Exception): - cfg = search_bit_width(graph_to_search_cfg=graph, - fw_info=DEFAULT_KERAS_INFO, - fw_impl=keras_impl, - target_resource_utilization=ResourceUtilization(np.inf), - mp_config=core_config.mixed_precision_config, - representative_data_gen=representative_data_gen, - search_method=None) - - with self.assertRaises(Exception): - cfg = search_bit_width(graph_to_search_cfg=graph, - fw_info=DEFAULT_KERAS_INFO, - fw_impl=keras_impl, - target_resource_utilization=None, - mp_config=core_config.mixed_precision_config, - representative_data_gen=representative_data_gen, - search_method=BitWidthSearchMethod.INTEGER_PROGRAMMING) - def test_mixed_precision_search_facade(self): core_config_avg_weights = CoreConfig(quantization_config=DEFAULTCONFIG, mixed_precision_config=MixedPrecisionQuantizationConfig(compute_mse, diff --git a/tests/keras_tests/non_parallel_tests/test_tensorboard_writer.py b/tests/keras_tests/non_parallel_tests/test_tensorboard_writer.py index 120ef70a9..7c830a4b6 100644 --- a/tests/keras_tests/non_parallel_tests/test_tensorboard_writer.py +++ b/tests/keras_tests/non_parallel_tests/test_tensorboard_writer.py @@ -162,7 +162,8 @@ def plot_tensor_sizes(self, core_config): fqc=fqc, network_editor=[], quant_config=cfg, - target_resource_utilization=mct.core.ResourceUtilization(), + target_resource_utilization=mct.core.ResourceUtilization(weights_memory=73, + activation_memory=191), n_iter=1, analyze_similarity=True, mp_cfg=mp_cfg) From 91f2aa5010a54292d853569460d2cbeaff2b3bae Mon Sep 17 00:00:00 2001 From: irenab Date: Tue, 4 Mar 2025 18:50:40 +0200 Subject: [PATCH 08/12] fix test --- .../core/mixed_precision/test_greedy_solution_refinement.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests_pytest/common_tests/unit_tests/core/mixed_precision/test_greedy_solution_refinement.py b/tests_pytest/common_tests/unit_tests/core/mixed_precision/test_greedy_solution_refinement.py index 5a5fbeee5..7d29842af 100644 --- a/tests_pytest/common_tests/unit_tests/core/mixed_precision/test_greedy_solution_refinement.py +++ b/tests_pytest/common_tests/unit_tests/core/mixed_precision/test_greedy_solution_refinement.py @@ -25,7 +25,7 @@ @pytest.fixture def search_manager(): manager = Mock() - manager.graph.get_configurable_sorted_nodes = MagicMock() + manager.mp_topo_configurable_nodes = MagicMock() manager.fw_info.get_kernel_op_attributes = MagicMock() manager.replace_config_in_index = MagicMock( side_effect=lambda config, idx, candidate: ( @@ -105,7 +105,7 @@ def test_greedy_solution_refinement_procedure( node_mock = Mock() node_mock.candidates_quantization_cfg = candidate_configs(weight_bits_dict_0, act_bits_0, weight_bits_dict_1, act_bits_1) - search_manager.graph.get_configurable_sorted_nodes.return_value = [node_mock] + search_manager.mp_topo_configurable_nodes = [node_mock] search_manager.compute_resource_utilization_for_config = MagicMock(side_effect=lambda config: { 0: ResourceUtilization(**alternative_candidate_resources_usage), From 56665422de428325b474e2ea0a61869a66d6a363 Mon Sep 17 00:00:00 2001 From: irenab Date: Sun, 9 Mar 2025 18:30:42 +0200 Subject: [PATCH 09/12] tiny updates --- .../core/common/graph/base_node.py | 1 - .../mixed_precision/mixed_precision_ru_helper.py | 2 +- .../mixed_precision/mixed_precision_search_facade.py | 2 +- .../mixed_precision_search_manager.py | 12 +++++++----- .../resource_utilization_calculator.py | 6 +----- 5 files changed, 10 insertions(+), 13 deletions(-) diff --git a/model_compression_toolkit/core/common/graph/base_node.py b/model_compression_toolkit/core/common/graph/base_node.py index d867fe578..1dfd1e533 100644 --- a/model_compression_toolkit/core/common/graph/base_node.py +++ b/model_compression_toolkit/core/common/graph/base_node.py @@ -170,7 +170,6 @@ def is_configurable_weight(self, attr_name: str) -> bool: def has_any_configurable_weight(self) -> bool: """ Check whether any of the node's weights is configurable. - Returns: Whether any of the node's weights is configurable. """ diff --git a/model_compression_toolkit/core/common/mixed_precision/mixed_precision_ru_helper.py b/model_compression_toolkit/core/common/mixed_precision/mixed_precision_ru_helper.py index 56d969d1c..4bd9134bb 100644 --- a/model_compression_toolkit/core/common/mixed_precision/mixed_precision_ru_helper.py +++ b/model_compression_toolkit/core/common/mixed_precision/mixed_precision_ru_helper.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -from typing import List, Set, Dict, Optional, Tuple, Any, Union +from typing import List, Set, Dict, Tuple import numpy as np diff --git a/model_compression_toolkit/core/common/mixed_precision/mixed_precision_search_facade.py b/model_compression_toolkit/core/common/mixed_precision/mixed_precision_search_facade.py index 93beee95d..4189cc37a 100644 --- a/model_compression_toolkit/core/common/mixed_precision/mixed_precision_search_facade.py +++ b/model_compression_toolkit/core/common/mixed_precision/mixed_precision_search_facade.py @@ -24,7 +24,7 @@ from model_compression_toolkit.core.common.mixed_precision.mixed_precision_search_manager import \ MixedPrecisionSearchManager from model_compression_toolkit.core.common.mixed_precision.resource_utilization_tools.resource_utilization import \ - ResourceUtilization, RUTarget + ResourceUtilization from model_compression_toolkit.core.common.mixed_precision.solution_refinement_procedure import \ greedy_solution_refinement_procedure diff --git a/model_compression_toolkit/core/common/mixed_precision/mixed_precision_search_manager.py b/model_compression_toolkit/core/common/mixed_precision/mixed_precision_search_manager.py index 1750cdf1f..5ec783b11 100644 --- a/model_compression_toolkit/core/common/mixed_precision/mixed_precision_search_manager.py +++ b/model_compression_toolkit/core/common/mixed_precision/mixed_precision_search_manager.py @@ -67,9 +67,9 @@ def __init__(self, self.original_graph = graph # graph for mp search self.mp_graph, self.using_virtual_graph = self._get_mp_graph(graph, target_resource_utilization) + del graph # so that it's not used by mistake self.sensitivity_evaluator = sensitivity_evaluator - self.compute_metric_fn = sensitivity_evaluator.compute_metric self.target_resource_utilization = target_resource_utilization self.mp_topo_configurable_nodes = self.mp_graph.get_configurable_sorted_nodes(fw_info) @@ -93,6 +93,7 @@ def search(self): Indices of the selected bit-widths candidates. """ # import here to prevent circular dependency + # TODO: remove search manager dependency from linear_programming from model_compression_toolkit.core.common.mixed_precision.search_methods.linear_programming import \ mp_integer_programming_search config = mp_integer_programming_search(self, self.target_resource_utilization) @@ -122,12 +123,13 @@ def build_sensitivity_mapping(self, eps: float = EPS) -> Dict[int, Dict[int, flo Logger.info('Starting to evaluate metrics') layer_to_metrics_mapping = {} + compute_metric = self.sensitivity_evaluator.compute_metric if self.using_virtual_graph: origin_max_config = self.config_reconstruction_helper.reconstruct_config_from_virtual_graph( self.max_ru_config) - max_config_value = self.compute_metric_fn(origin_max_config) + max_config_value = compute_metric(origin_max_config) else: - max_config_value = self.compute_metric_fn(self.max_ru_config) + max_config_value = compute_metric(self.max_ru_config) for node_idx, layer_possible_bitwidths_indices in tqdm(self.layer_to_bitwidth_mapping.items(), total=len(self.layer_to_bitwidth_mapping)): @@ -153,12 +155,12 @@ def build_sensitivity_mapping(self, eps: float = EPS) -> Dict[int, Dict[int, flo original_base_config=origin_max_config) origin_changed_nodes_indices = [i for i, c in enumerate(origin_max_config) if c != origin_mp_model_configuration[i]] - metric_value = self.compute_metric_fn( + metric_value = compute_metric( origin_mp_model_configuration, origin_changed_nodes_indices, origin_max_config) else: - metric_value = self.compute_metric_fn( + metric_value = compute_metric( mp_model_configuration, [node_idx], self.max_ru_config) diff --git a/model_compression_toolkit/core/common/mixed_precision/resource_utilization_tools/resource_utilization_calculator.py b/model_compression_toolkit/core/common/mixed_precision/resource_utilization_tools/resource_utilization_calculator.py index 408e5a598..07f350d53 100644 --- a/model_compression_toolkit/core/common/mixed_precision/resource_utilization_tools/resource_utilization_calculator.py +++ b/model_compression_toolkit/core/common/mixed_precision/resource_utilization_tools/resource_utilization_calculator.py @@ -431,8 +431,7 @@ def compute_node_activation_tensor_utilization(self, Returns: Node's activation utilization. """ - if qc and bitwidth_mode != BitwidthMode.QCustom: - raise ValueError(self.unexpected_qc_error) + self._validate_custom_qcs(qc, bitwidth_mode) if target_criterion: # only check whether the node meets the criterion @@ -470,9 +469,6 @@ def compute_bops(self, - Total BOPS count of the network. - Detailed BOPS count per node. """ - self._validate_custom_qcs(act_qcs, bitwidth_mode) - self._validate_custom_qcs(w_qcs, bitwidth_mode) - nodes_bops = {} for n in self.graph.get_topo_sorted_nodes(): w_qc = w_qcs.get(n.name) if w_qcs else None From 42aeae853dbd38955c7d6095159bfcbff230e111 Mon Sep 17 00:00:00 2001 From: irenab Date: Sun, 16 Mar 2025 18:53:15 +0200 Subject: [PATCH 10/12] convert LP functions into class, remove dependency on MPSearchManager, call LP from MPSearchManager with precomputed metrics --- .../mixed_precision_search_manager.py | 43 ++- .../search_methods/linear_programming.py | 318 ++++++++---------- 2 files changed, 168 insertions(+), 193 deletions(-) diff --git a/model_compression_toolkit/core/common/mixed_precision/mixed_precision_search_manager.py b/model_compression_toolkit/core/common/mixed_precision/mixed_precision_search_manager.py index 5ec783b11..124e0e317 100644 --- a/model_compression_toolkit/core/common/mixed_precision/mixed_precision_search_manager.py +++ b/model_compression_toolkit/core/common/mixed_precision/mixed_precision_search_manager.py @@ -34,6 +34,8 @@ TargetInclusionCriterion, BitwidthMode from model_compression_toolkit.core.common.mixed_precision.mixed_precision_ru_helper import \ MixedPrecisionRUHelper +from model_compression_toolkit.core.common.mixed_precision.search_methods.linear_programming import \ + MixedPrecisionIntegerLPSolver from model_compression_toolkit.core.common.mixed_precision.sensitivity_evaluation import SensitivityEvaluation from model_compression_toolkit.core.common.substitutions.apply_substitutions import substitute from model_compression_toolkit.logger import Logger @@ -85,24 +87,49 @@ def __init__(self, self.config_reconstruction_helper = ConfigReconstructionHelper(virtual_graph=self.mp_graph, original_graph=self.original_graph) - def search(self): + def search(self) -> List[int]: """ Run mixed precision search. Returns: Indices of the selected bit-widths candidates. """ - # import here to prevent circular dependency - # TODO: remove search manager dependency from linear_programming - from model_compression_toolkit.core.common.mixed_precision.search_methods.linear_programming import \ - mp_integer_programming_search - config = mp_integer_programming_search(self, self.target_resource_utilization) + candidates_sensitivity = self._build_sensitivity_mapping() + candidates_ru = self._compute_relative_ru_matrices() + rel_target_ru = self._get_relative_ru_constraint_per_mem_element() + solver = MixedPrecisionIntegerLPSolver(candidates_sensitivity, candidates_ru, rel_target_ru) + config = solver.run() if self.using_virtual_graph: config = self.config_reconstruction_helper.reconstruct_config_from_virtual_graph(config) return config - def build_sensitivity_mapping(self, eps: float = EPS) -> Dict[int, Dict[int, float]]: + def _get_relative_ru_constraint_per_mem_element(self) -> Dict[RUTarget, np.ndarray]: + """ + Computes resource utilization constraint with respect to the minimal bit configuration, i.e. corresponding + constraint for each memory element is the relative utilization between the target utilization and + element's utilization for min-bit configuration. + + Returns: + A dictionary of relative resource utilization constraints per ru target. + + Raises: + ValueError: if target resource utilization cannot be satisfied (utilization for the minimal bit + configuration exceeds the requested target utilization for any target). + """ + target_ru = self.target_resource_utilization.get_resource_utilization_dict(restricted_only=True) + rel_target_ru = { + ru_target: ru - self.min_ru[ru_target] for ru_target, ru in target_ru.items() + } + unsatisfiable_targets = { + ru_target.value: target_ru[ru_target] for ru_target, ru in rel_target_ru.items() if any(ru < 0) + } + if unsatisfiable_targets: + raise ValueError(f"The model cannot be quantized to meet the specified resource utilization for the " + f"following targets: {unsatisfiable_targets}") + return rel_target_ru + + def _build_sensitivity_mapping(self, eps: float = EPS) -> Dict[int, Dict[int, float]]: """ This function measures the sensitivity of a change in a bitwidth of a layer on the entire model. It builds a mapping from a node's index, to its bitwidht's effect on the model sensitivity. @@ -209,7 +236,7 @@ def get_search_space(self) -> Dict[int, List[int]]: indices_mapping[idx] = list(range(len(n.candidates_quantization_cfg))) # all search_methods space return indices_mapping - def compute_resource_utilization_matrices(self) -> Dict[RUTarget, np.ndarray]: + def _compute_relative_ru_matrices(self) -> Dict[RUTarget, np.ndarray]: """ Computes and builds a resource utilization matrix for all restricted targets, to be used for the mixed-precision search problem formalization. diff --git a/model_compression_toolkit/core/common/mixed_precision/search_methods/linear_programming.py b/model_compression_toolkit/core/common/mixed_precision/search_methods/linear_programming.py index 331bd6b00..4e5155ad4 100644 --- a/model_compression_toolkit/core/common/mixed_precision/search_methods/linear_programming.py +++ b/model_compression_toolkit/core/common/mixed_precision/search_methods/linear_programming.py @@ -12,198 +12,146 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== - import numpy as np from pulp import * -from typing import Dict, Tuple, Any +from typing import Dict, Tuple -from model_compression_toolkit.logger import Logger -from model_compression_toolkit.core.common.mixed_precision.resource_utilization_tools.resource_utilization import ResourceUtilization, RUTarget -from model_compression_toolkit.core.common.mixed_precision.mixed_precision_search_manager import MixedPrecisionSearchManager +from model_compression_toolkit.core.common.mixed_precision.resource_utilization_tools.resource_utilization import RUTarget # Limit ILP solver runtime in seconds SOLVER_TIME_LIMIT = 60 -def mp_integer_programming_search(search_manager: MixedPrecisionSearchManager, - target_resource_utilization: ResourceUtilization) -> List[int]: - """ - Searching and returning a mixed-precision configuration using an ILP optimization solution. - It first builds a mapping from each layer's index (in the model) to a dictionary that maps the - bitwidth index to the observed sensitivity of the model when using that bitwidth for that layer. - Then, it creates a mapping from each node's index (in the graph) to a dictionary - that maps the bitwidth index to the contribution of configuring this node with this - bitwidth to the minimal possible resource utilization of the model. - Then, and using these mappings, it builds an LP problem and finds an optimal solution. - If a solution could not be found, exception is thrown. - - Args: - search_manager: MixedPrecisionSearchManager object to be used for problem formalization. - target_resource_utilization: Target resource utilization to constrain our LP problem with some resources limitations (like model' weights memory - consumption). - - Returns: - The mixed-precision configuration (A list of indices. Each indicates the bitwidth index of a node). - - """ - - # Build a mapping from each layer's index (in the model) to a dictionary that maps the - # bitwidth index to the observed sensitivity of the model when using that bitwidth for that layer. - - layer_to_sensitivity_mapping = search_manager.build_sensitivity_mapping() - - # Init variables to find their values when solving the lp problem. - layer_to_indicator_vars_mapping, layer_to_objective_vars_mapping = _init_problem_vars(layer_to_sensitivity_mapping) - - # Add all equations and inequalities that define the problem. - lp_problem = _formalize_problem(layer_to_indicator_vars_mapping, - layer_to_sensitivity_mapping, - layer_to_objective_vars_mapping, - target_resource_utilization, - search_manager) - - # Use default PULP solver. Limit runtime in seconds - solver = PULP_CBC_CMD(timeLimit=SOLVER_TIME_LIMIT) - lp_problem.solve(solver=solver) # Try to solve the problem. - - assert lp_problem.status == LpStatusOptimal, Logger.critical( - "No solution was found during solving the LP problem") - Logger.info(f"ILP status: {LpStatus[lp_problem.status]}") - - # Take the bitwidth index only if its corresponding indicator is one. - config = np.asarray( - [[nbits for nbits, indicator in nbits_to_indicator.items() if indicator.varValue == 1.0] for - nbits_to_indicator - in layer_to_indicator_vars_mapping.values()] - ).flatten() - - return config.tolist() - - -def _init_problem_vars(layer_to_metrics_mapping: Dict[int, Dict[int, float]]) -> Tuple[ - Dict[int, Dict[int, LpVariable]], Dict[int, LpVariable]]: - """ - Initialize the LP problem variables: Variable for each layer as to the index of the bitwidth it should use, - and a variable for each indicator for whether we use the former variable or not. - - Args: - layer_to_metrics_mapping: Mapping from each layer's index (in the model) to a dictionary that maps the - bitwidth index to the observed sensitivity of the model. - - Returns: - A tuple of two dictionaries: One from a layer to the variable for the bitwidth problem, - and the second for indicators for each variable. - """ - - layer_to_indicator_vars_mapping = dict() - layer_to_objective_vars_mapping = dict() - - for layer, nbits_to_metric in layer_to_metrics_mapping.items(): - layer_to_indicator_vars_mapping[layer] = dict() - - for nbits in nbits_to_metric.keys(): - layer_to_indicator_vars_mapping[layer][nbits] = LpVariable(f"layer_{layer}_{nbits}", - lowBound=0, - upBound=1, - cat=LpInteger) - - layer_to_objective_vars_mapping[layer] = LpVariable(f"s_{layer}", 0) - - return layer_to_indicator_vars_mapping, layer_to_objective_vars_mapping - - -def _formalize_problem(layer_to_indicator_vars_mapping: Dict[int, Dict[int, LpVariable]], - layer_to_metrics_mapping: Dict[int, Dict[int, float]], - layer_to_objective_vars_mapping: Dict[int, LpVariable], - target_resource_utilization: ResourceUtilization, - search_manager: MixedPrecisionSearchManager) -> LpProblem: - """ - Formalize the LP problem by defining all inequalities that define the solution space. - - Args: - layer_to_indicator_vars_mapping: Dictionary that maps each node's index to a dictionary of bitwidth to - indicator variable. - layer_to_metrics_mapping: Dictionary that maps each node's index to a dictionary of bitwidth to sensitivity - evaluation. - layer_to_objective_vars_mapping: Dictionary that maps each node's index to a bitwidth variable we find its - value. - target_resource_utilization: Target resource utilization to reduce our feasible solution space. - search_manager: MixedPrecisionSearchManager object to be used for resource utilization constraints formalization. - - Returns: - The formalized LP problem. - """ - - lp_problem = LpProblem() # minimization problem by default - lp_problem += lpSum([layer_to_objective_vars_mapping[layer] for layer in - layer_to_metrics_mapping.keys()]) # Objective (minimize acc loss) - - for layer in layer_to_metrics_mapping.keys(): - # Use every bitwidth for every layer with its indicator. - lp_problem += lpSum([indicator * layer_to_metrics_mapping[layer][nbits] - for nbits, indicator in layer_to_indicator_vars_mapping[layer].items()]) == \ - layer_to_objective_vars_mapping[layer] - - # Constraint of only one indicator==1 - lp_problem += lpSum( - [v for v in layer_to_indicator_vars_mapping[layer].values()]) == 1 - - # Bound the feasible solution space with the desired resource utilization values. - # Creates separate constraints for weights utilization and activation utilization. - assert target_resource_utilization and target_resource_utilization.is_any_restricted() - - indicators = [] - for layer in layer_to_metrics_mapping.keys(): - for _, indicator in layer_to_indicator_vars_mapping[layer].items(): - indicators.append(indicator) +class MixedPrecisionIntegerLPSolver: + """ Integer Linear Programming solver for Mixed Precision. - indicators_vec = np.array(indicators) - - _add_ru_constraints(search_manager=search_manager, - target_resource_utilization=target_resource_utilization, - indicators_vec=indicators_vec, - lp_problem=lp_problem) - return lp_problem - - -def _add_ru_constraints(search_manager: MixedPrecisionSearchManager, - target_resource_utilization: ResourceUtilization, - indicators_vec: np.ndarray, - lp_problem: LpProblem): - """ - Adding targets constraints for the Lp problem for the given target resource utilization. - The update to the Lp problem object is done inplace. - - Args: - search_manager: MixedPrecisionSearchManager object to be used for resource utilization constraints formalization. - target_resource_utilization: Target resource utilization. - indicators_vec: A vector of the Lp problem's indicators. - lp_problem: An Lp problem object to add constraint to. + Args: + layer_to_sensitivity_mapping: sensitivity per candidate per layer. + candidates_ru: resource utilization per candidate. + ru_constraints: resource utilization constraints corresponding to 'candidates_ru'. """ - candidates_ru = search_manager.compute_resource_utilization_matrices() - min_ru = search_manager.min_ru - target_ru = target_resource_utilization.get_resource_utilization_dict(restricted_only=True) - assert candidates_ru.keys() == target_ru.keys() - - for target, ru_matrix in candidates_ru.items(): - # We expect 2d matrix of shape (num candidates, m). For cumulative metrics (weights, bops) m=1 - overall - # utilization. For max metrics (activation, total) m=num memory elements (max element depends on configuration) - assert ru_matrix.ndim == 2 - if target in [RUTarget.WEIGHTS, RUTarget.BOPS]: - assert ru_matrix.shape[1] == 1 - - # ru values are relative to the minimal configuration, so we adjust the target ru accordingly - ru_constraint = target_ru[target] - min_ru[target] - if any(ru_constraint < 0): - raise ValueError(f"The model cannot be quantized to meet the specified target resource utilization " - f"{target.value} with the value {target_ru[target]}.") - - indicated_ru_matrix = ru_matrix.T * indicators_vec - # build lp sum term over all candidates - ru_vec = indicated_ru_matrix.sum(axis=1) - - # For cumulative metrics a single constraint is added, for max metrics a separate constraint - # is added for each memory element (each element < target => max element < target). - assert len(ru_vec) == len(ru_constraint) - for v, c in zip(ru_vec, ru_constraint): - lp_problem += v <= c + def __init__(self, layer_to_sensitivity_mapping: Dict[int, Dict[int, float]], + candidates_ru: Dict[RUTarget, np.ndarray], + ru_constraints: Dict[RUTarget, np.ndarray]): + self.layer_to_sensitivity_mapping = layer_to_sensitivity_mapping + self.candidates_ru = candidates_ru + self.ru_constraints = ru_constraints + + self.layer_to_indicator_vars_mapping, self.layer_to_objective_vars_mapping = ( + self._init_problem_vars(layer_to_sensitivity_mapping)) + + def run(self) -> List[int]: + """ + Build and solve an ILP optimization problem. + + Returns: + The mixed-precision configuration (A list of indices. Each indicates the bitwidth index of a node). + + """ + # Add all equations and inequalities that define the problem. + lp_problem = self._formalize_problem() + + # Use default PULP solver. Limit runtime in seconds + solver = PULP_CBC_CMD(timeLimit=SOLVER_TIME_LIMIT) + lp_problem.solve(solver=solver) # Try to solve the problem. + + if lp_problem.status != LpStatusOptimal: + raise RuntimeError(f'No solution was found for the LP problem, with status {lp_problem.status}') + + # Take the bitwidth index only if its corresponding indicator is one. + config = np.asarray( + [[nbits for nbits, indicator in nbits_to_indicator.items() if indicator.varValue == 1.0] for + nbits_to_indicator + in self.layer_to_indicator_vars_mapping.values()] + ).flatten() + + return config.tolist() + + @staticmethod + def _init_problem_vars(layer_to_metrics_mapping: Dict[int, Dict[int, float]]) -> Tuple[ + Dict[int, Dict[int, LpVariable]], Dict[int, LpVariable]]: + """ + Initialize the LP problem variables: Variable for each layer as to the index of the bitwidth it should use, + and a variable for each indicator for whether we use the former variable or not. + + Args: + layer_to_metrics_mapping: Mapping from each layer's index (in the model) to a dictionary that maps the + bitwidth index to the observed sensitivity of the model. + + Returns: + A tuple of two dictionaries: One from a layer to the variable for the bitwidth problem, + and the second for indicators for each variable. + """ + + layer_to_indicator_vars_mapping = dict() + layer_to_objective_vars_mapping = dict() + + for layer, nbits_to_metric in layer_to_metrics_mapping.items(): + layer_to_indicator_vars_mapping[layer] = dict() + + for nbits in nbits_to_metric.keys(): + layer_to_indicator_vars_mapping[layer][nbits] = LpVariable(f"layer_{layer}_{nbits}", + lowBound=0, + upBound=1, + cat=LpInteger) + + layer_to_objective_vars_mapping[layer] = LpVariable(f"s_{layer}", 0) + + return layer_to_indicator_vars_mapping, layer_to_objective_vars_mapping + + def _formalize_problem(self) -> LpProblem: + """ + Formalize the LP problem by defining all inequalities that define the solution space. + + Returns: + The formalized LP problem. + """ + + lp_problem = LpProblem() # minimization problem by default + lp_problem += lpSum([self.layer_to_objective_vars_mapping[layer] for layer in + self.layer_to_sensitivity_mapping.keys()]) # Objective (minimize acc loss) + + for layer in self.layer_to_sensitivity_mapping.keys(): + # Use every bitwidth for every layer with its indicator. + lp_problem += lpSum([indicator * self.layer_to_sensitivity_mapping[layer][nbits] + for nbits, indicator in self.layer_to_indicator_vars_mapping[layer].items()]) == \ + self.layer_to_objective_vars_mapping[layer] + + # Constraint of only one indicator==1 + lp_problem += lpSum( + [v for v in self.layer_to_indicator_vars_mapping[layer].values()]) == 1 + + # Bound the feasible solution space with the desired resource utilization values. + self._add_ru_constraints(lp_problem=lp_problem) + + return lp_problem + + def _add_ru_constraints(self, lp_problem: LpProblem): + """ + Adding targets constraints for the Lp problem for the given target resource utilization. + The update to the Lp problem object is done inplace. + + Args: + lp_problem: An Lp problem object to add constraint to. + """ + indicators = [] + for layer in self.layer_to_sensitivity_mapping: + indicators.extend(list(self.layer_to_indicator_vars_mapping[layer].values())) + indicators_vec = np.array(indicators) + + for target, ru_matrix in self.candidates_ru.items(): + # We expect 2d matrix of shape (num candidates, m). For cumulative metrics (weights, bops) m=1 - overall + # utilization. For max metrics (activation, total) m=num memory elements (max element depends on configuration) + assert ru_matrix.ndim == 2 + if target in [RUTarget.WEIGHTS, RUTarget.BOPS]: + assert ru_matrix.shape[1] == 1 + + indicated_ru_matrix = ru_matrix.T * indicators_vec + # build lp sum term over all candidates + ru_vec = indicated_ru_matrix.sum(axis=1) + + # For cumulative metrics a single constraint is added, for max metrics a separate constraint + # is added for each memory element (each element < target => max element < target). + assert len(ru_vec) == len(self.ru_constraints[target]) + for v, c in zip(ru_vec, self.ru_constraints[target]): + lp_problem += v <= c From 07c94861b5226840cbd41627f2249848620c84f3 Mon Sep 17 00:00:00 2001 From: irenab Date: Sun, 16 Mar 2025 20:15:27 +0200 Subject: [PATCH 11/12] fix lp_search test --- .../test_lp_search_bitwidth.py | 56 ++++++++----------- 1 file changed, 23 insertions(+), 33 deletions(-) diff --git a/tests/keras_tests/non_parallel_tests/test_lp_search_bitwidth.py b/tests/keras_tests/non_parallel_tests/test_lp_search_bitwidth.py index d304b4f69..b9c94bde1 100644 --- a/tests/keras_tests/non_parallel_tests/test_lp_search_bitwidth.py +++ b/tests/keras_tests/non_parallel_tests/test_lp_search_bitwidth.py @@ -20,8 +20,6 @@ import keras from model_compression_toolkit.core import DEFAULTCONFIG from model_compression_toolkit.core.common.mixed_precision.distance_weighting import MpDistanceWeighting -from model_compression_toolkit.core.common.mixed_precision.mixed_precision_search_manager import \ - MixedPrecisionSearchManager from model_compression_toolkit.core.common.mixed_precision.resource_utilization_tools.resource_utilization import \ ResourceUtilization, RUTarget from model_compression_toolkit.core.common.mixed_precision.mixed_precision_quantization_config import \ @@ -29,9 +27,8 @@ from model_compression_toolkit.core.common.mixed_precision.mixed_precision_search_facade import search_bit_width, \ BitWidthSearchMethod from model_compression_toolkit.core.common.mixed_precision.search_methods.linear_programming import \ - mp_integer_programming_search + MixedPrecisionIntegerLPSolver from model_compression_toolkit.core.common.model_collector import ModelCollector -from model_compression_toolkit.core.common.quantization.bit_width_config import BitWidthConfig from model_compression_toolkit.core.common.quantization.core_config import CoreConfig from model_compression_toolkit.core.common.quantization.quantization_params_generation.qparams_computation import \ calculate_quantization_params @@ -46,7 +43,6 @@ from model_compression_toolkit.target_platform_capabilities.tpc_models.imx500_tpc.latest import \ get_op_quantization_configs from tests.keras_tests.tpc_keras import get_weights_only_mp_tpc_keras -from pulp import lpSum class MockReconstructionHelper: @@ -90,38 +86,40 @@ def compute_resource_utilization_matrices(self): class TestLpSearchBitwidth(unittest.TestCase): + def _execute(self, mock_search_mgr, target_resource_utilization): + candidates_sensitivity = mock_search_mgr.build_sensitivity_mapping() + candidates_ru = mock_search_mgr.compute_resource_utilization_matrices() + min_ru = mock_search_mgr.min_ru + ru_constraints = {k: v - min_ru[k] for k, v in target_resource_utilization.get_resource_utilization_dict(restricted_only=True).items()} + lp_solver = MixedPrecisionIntegerLPSolver(candidates_sensitivity, candidates_ru, ru_constraints) + return lp_solver.run() + def test_search_weights_only(self): target_resource_utilization = ResourceUtilization(weights_memory=2) layer_to_ru_mapping = {0: {2: ResourceUtilization(weights_memory=1), 1: ResourceUtilization(weights_memory=2), 0: ResourceUtilization(weights_memory=3)}} mock_search_manager = MockMixedPrecisionSearchManager(layer_to_ru_mapping, {RUTarget.WEIGHTS}) - - bit_cfg = mp_integer_programming_search(mock_search_manager, - target_resource_utilization=target_resource_utilization) + bit_cfg = self._execute(mock_search_manager, target_resource_utilization) self.assertTrue(len(bit_cfg) == 1) self.assertTrue(bit_cfg[0] == 1) target_resource_utilization = ResourceUtilization(weights_memory=0) # Infeasible solution! with self.assertRaises(Exception): - bit_cfg = mp_integer_programming_search(mock_search_manager, - target_resource_utilization=target_resource_utilization) + self._execute(mock_search_manager, target_resource_utilization=target_resource_utilization) - bit_cfg = mp_integer_programming_search(mock_search_manager, - target_resource_utilization=ResourceUtilization(weights_memory=1000)) + bit_cfg = self._execute(mock_search_manager, target_resource_utilization=ResourceUtilization(weights_memory=1000)) self.assertTrue(len(bit_cfg) == 1) self.assertTrue(bit_cfg[0] == 0) # expecting for the maximal bit-width result target_resource_utilization = None # target ResourceUtilization is not defined! with self.assertRaises(Exception): - mp_integer_programming_search(mock_search_manager, - target_resource_utilization=target_resource_utilization) + self._execute(mock_search_manager, target_resource_utilization=target_resource_utilization) with self.assertRaises(Exception): - mp_integer_programming_search(mock_search_manager, - target_resource_utilization=ResourceUtilization(weights_memory=np.inf)) + self._execute(mock_search_manager, target_resource_utilization=ResourceUtilization(weights_memory=np.inf)) def test_search_activation_only(self): target_resource_utilization = ResourceUtilization(activation_memory=2) @@ -130,20 +128,17 @@ def test_search_activation_only(self): 0: ResourceUtilization(activation_memory=3)}} mock_search_manager = MockMixedPrecisionSearchManager(layer_to_ru_mapping, {RUTarget.ACTIVATION}) - bit_cfg = mp_integer_programming_search(mock_search_manager, - target_resource_utilization=target_resource_utilization) + bit_cfg = self._execute(mock_search_manager, target_resource_utilization=target_resource_utilization) self.assertTrue(len(bit_cfg) == 1) self.assertTrue(bit_cfg[0] == 1) target_resource_utilization = ResourceUtilization(activation_memory=0) # Infeasible solution! with self.assertRaises(Exception): - bit_cfg = mp_integer_programming_search(mock_search_manager, - target_resource_utilization=target_resource_utilization) + bit_cfg = self._execute(mock_search_manager, target_resource_utilization=target_resource_utilization) - bit_cfg = mp_integer_programming_search(mock_search_manager, - target_resource_utilization=ResourceUtilization( - activation_memory=1000)) + bit_cfg = self._execute(mock_search_manager, + target_resource_utilization=ResourceUtilization(activation_memory=1000)) self.assertTrue(len(bit_cfg) == 1) self.assertTrue(bit_cfg[0] == 0) # expecting for the maximal bit-width result @@ -155,19 +150,16 @@ def test_search_weights_and_activation(self): 0: ResourceUtilization(weights_memory=3, activation_memory=3)}} mock_search_manager = MockMixedPrecisionSearchManager(layer_to_ru_mapping, {RUTarget.WEIGHTS, RUTarget.ACTIVATION}) - bit_cfg = mp_integer_programming_search(mock_search_manager, - target_resource_utilization=target_resource_utilization) + bit_cfg = self._execute(mock_search_manager, target_resource_utilization=target_resource_utilization) self.assertTrue(len(bit_cfg) == 1) self.assertTrue(bit_cfg[0] == 1) target_resource_utilization = ResourceUtilization(weights_memory=0, activation_memory=0) # Infeasible solution! with self.assertRaises(Exception): - bit_cfg = mp_integer_programming_search(mock_search_manager, - target_resource_utilization=target_resource_utilization) + bit_cfg = self._execute(mock_search_manager, target_resource_utilization=target_resource_utilization) - bit_cfg = mp_integer_programming_search(mock_search_manager, - target_resource_utilization=ResourceUtilization(weights_memory=1000, + bit_cfg = self._execute(mock_search_manager, target_resource_utilization=ResourceUtilization(weights_memory=1000, activation_memory=1000)) self.assertTrue(len(bit_cfg) == 1) @@ -180,8 +172,7 @@ def test_search_total_resource_utilization(self): 0: ResourceUtilization(weights_memory=3, activation_memory=3, total_memory=6)}} mock_search_manager = MockMixedPrecisionSearchManager(layer_to_ru_mapping, {RUTarget.TOTAL}) - bit_cfg = mp_integer_programming_search(mock_search_manager, - target_resource_utilization=target_resource_utilization) + bit_cfg = self._execute(mock_search_manager, target_resource_utilization=target_resource_utilization) self.assertTrue(len(bit_cfg) == 1) self.assertTrue(bit_cfg[0] == 1) @@ -193,8 +184,7 @@ def test_search_bops_ru(self): 0: ResourceUtilization(bops=3)}} mock_search_manager = MockMixedPrecisionSearchManager(layer_to_ru_mapping, {RUTarget.BOPS}) - bit_cfg = mp_integer_programming_search(mock_search_manager, - target_resource_utilization=target_resource_utilization) + bit_cfg = self._execute(mock_search_manager, target_resource_utilization=target_resource_utilization) self.assertTrue(len(bit_cfg) == 1) self.assertTrue(bit_cfg[0] == 1) From ab69efbf1f6304bded657f1508a6c2c80b795e60 Mon Sep 17 00:00:00 2001 From: irenab Date: Sun, 23 Mar 2025 11:20:03 +0200 Subject: [PATCH 12/12] add missing type hints --- .../mixed_precision/mixed_precision_search_manager.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/model_compression_toolkit/core/common/mixed_precision/mixed_precision_search_manager.py b/model_compression_toolkit/core/common/mixed_precision/mixed_precision_search_manager.py index 124e0e317..c878dccfb 100644 --- a/model_compression_toolkit/core/common/mixed_precision/mixed_precision_search_manager.py +++ b/model_compression_toolkit/core/common/mixed_precision/mixed_precision_search_manager.py @@ -17,7 +17,7 @@ from tqdm import tqdm -from typing import Dict, List +from typing import Dict, List, Tuple import numpy as np @@ -199,7 +199,7 @@ def _build_sensitivity_mapping(self, eps: float = EPS) -> Dict[int, Dict[int, fl return layer_to_metrics_mapping - def _get_mp_graph(self, graph, target_resource_utilization): + def _get_mp_graph(self, graph: Graph, target_resource_utilization: ResourceUtilization) -> Tuple[Graph, bool]: """ Get graph for mixed precision search. Virtual graph is built if bops is restricted and both activation and weights are configurable. @@ -209,7 +209,8 @@ def _get_mp_graph(self, graph, target_resource_utilization): target_resource_utilization: target resource utilization. Returns: - Graph for mixed precision search (virtual or original). + Graph for mixed precision search (virtual or original), and a boolean flag whether a virtual graph has been + constructed. """ if (target_resource_utilization.bops_restricted() and graph.has_any_configurable_activation() and