Skip to content

Commit 2011098

Browse files
author
reuvenp
committed
merge from main
2 parents d097e77 + 1056e36 commit 2011098

File tree

7 files changed

+127
-23
lines changed

7 files changed

+127
-23
lines changed

model_compression_toolkit/core/common/graph/base_graph.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -740,6 +740,22 @@ def get_final_activation_config(self) -> List[Tuple[BaseNode, int]]:
740740
sorted_conf_activation = self.get_sorted_activation_configurable_nodes()
741741
return [(n, n.final_activation_quantization_cfg.activation_n_bits) for n in sorted_conf_activation]
742742

743+
def retrieve_preserved_quantization_node(self, node: BaseNode) -> BaseNode:
744+
"""
745+
For a node with quantization_preserving == True, get the previous non-quantization_preserving node
746+
to get activation quantization config from. If quantization_preserving is False, return node.
747+
Args:
748+
node: quantization preserving node.
749+
750+
Returns:
751+
The node that the quantization preserving node should get the activation quantization from.
752+
753+
"""
754+
while node.is_quantization_preserving():
755+
prev_nodes = self.get_prev_nodes(node)
756+
assert len(prev_nodes) == 1, "Activation preserving node should have only 1 input."
757+
node = prev_nodes[0]
758+
return node
743759

744760
def has_any_configurable_activation(self) -> bool:
745761
"""

model_compression_toolkit/core/common/graph/base_node.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -131,6 +131,19 @@ def is_activation_quantization_enabled(self) -> bool:
131131
qc.activation_quantization_cfg.enable_activation_quantization
132132
return self.candidates_quantization_cfg[0].activation_quantization_cfg.enable_activation_quantization
133133

134+
def is_quantization_preserving(self) -> bool:
135+
"""
136+
Returns: Whether node activation quantization information is preserved from its inputs.
137+
"""
138+
if self.final_activation_quantization_cfg:
139+
# if we have a final configuration, then we only care to check if it enables activation quantization.
140+
return self.final_activation_quantization_cfg.quantization_preserving
141+
142+
for qc in self.candidates_quantization_cfg:
143+
assert self.candidates_quantization_cfg[0].activation_quantization_cfg.quantization_preserving == \
144+
qc.activation_quantization_cfg.quantization_preserving
145+
return self.candidates_quantization_cfg[0].activation_quantization_cfg.quantization_preserving
146+
134147
def is_weights_quantization_enabled(self, attr_name: str) -> bool:
135148
"""
136149
Checks whether a node's weights attribute quantization is enabled.

model_compression_toolkit/core/common/mixed_precision/resource_utilization_tools/resource_utilization_calculator.py

Lines changed: 35 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -335,13 +335,35 @@ def compute_activations_utilization(self,
335335
"""
336336
return self.compute_activation_utilization_by_cut(target_criterion, bitwidth_mode, act_qcs)
337337

338+
def _extract_qc(self, n: BaseNode, act_qcs: Optional[ActivationQCfgPerNode] = None
339+
) -> Union[NodeActivationQuantizationConfig, None]:
340+
"""
341+
Extract quantization config if the activation configs dictionary is provided. If node is quantization
342+
preserving, extract the quantization config from the preceding activation quantized node (i.e.
343+
the quantization that the original node preserves).
344+
345+
Args:
346+
n: Node to extract qc for.
347+
act_qcs: custom activations quantization configuration. If not provided, the default
348+
configuration will be extracted from the node.
349+
350+
Returns:
351+
The relevant quantization config.
352+
"""
353+
if act_qcs:
354+
assert not (n.is_quantization_preserving() and act_qcs.get(n.name) is not None), \
355+
f"Quantization preserving node {n.name} should not have a qc for this computation."
356+
return act_qcs.get(self.graph.retrieve_preserved_quantization_node(n).name)
357+
return None
358+
338359
def compute_activation_utilization_by_cut(self,
339360
target_criterion: TargetInclusionCriterion,
340361
bitwidth_mode: BitwidthMode,
341362
act_qcs: Optional[ActivationQCfgPerNode] = None) \
342363
-> Tuple[float, Dict[Cut, Utilization], Dict[Cut, Dict[BaseNode, Utilization]]]:
343364
"""
344-
Compute graph activation cuts utilization.
365+
Compute graph activation cuts utilization. If activation quantization configs are provided, then for
366+
quantization preserving nodes, get the previous quantized activation node bit-width.
345367
346368
Args:
347369
target_criterion: criterion to include weights for computation.
@@ -369,7 +391,7 @@ def compute_activation_utilization_by_cut(self,
369391
if not cut_target_nodes:
370392
continue
371393
for n in cut_target_nodes:
372-
qc = act_qcs.get(n.name) if act_qcs else None
394+
qc = self._extract_qc(n, act_qcs)
373395
util_per_cut_per_node[cut][n.name] = self.compute_node_activation_tensor_utilization(n, target_criterion,
374396
bitwidth_mode, qc)
375397
util_per_cut[cut] = sum(util_per_cut_per_node[cut].values()) # type: ignore
@@ -384,7 +406,8 @@ def compute_activation_tensors_utilization(self,
384406
include_reused=False) \
385407
-> Tuple[float, Dict[NodeName, Utilization]]:
386408
"""
387-
Compute resource utilization for graph's activations tensors.
409+
Compute resource utilization for graph's activations tensors. If activation quantization configs are provided, then for
410+
quantization preserving nodes, get the previous quantized activation node bit-width.
388411
389412
Args:
390413
target_criterion: criterion to include weights for computation.
@@ -405,7 +428,7 @@ def compute_activation_tensors_utilization(self,
405428

406429
util_per_node: Dict[NodeName, Utilization] = {}
407430
for n in self._topo_sort(nodes):
408-
qc = act_qcs.get(n.name) if act_qcs else None
431+
qc = self._extract_qc(n, act_qcs)
409432
util = self.compute_node_activation_tensor_utilization(n, None, bitwidth_mode, qc)
410433
util_per_node[n.name] = util
411434

@@ -659,7 +682,7 @@ def _get_target_activation_nodes(self,
659682
if target_criterion == TargetInclusionCriterion.QConfigurable:
660683
nodes = [n for n in nodes if n.has_configurable_activation()]
661684
elif target_criterion == TargetInclusionCriterion.AnyQuantized:
662-
nodes = [n for n in nodes if n.is_activation_quantization_enabled()]
685+
nodes = [n for n in nodes if n.is_activation_quantization_enabled() or n.is_quantization_preserving()]
663686
elif target_criterion == TargetInclusionCriterion.QNonConfigurable:
664687
nodes = [n for n in nodes if n.is_activation_quantization_enabled() and not n.has_configurable_activation()]
665688
elif target_criterion != TargetInclusionCriterion.Any: # pragma: no cover
@@ -668,8 +691,7 @@ def _get_target_activation_nodes(self,
668691
nodes = [n for n in nodes if not n.reuse]
669692
return nodes
670693

671-
@classmethod
672-
def _get_activation_nbits(cls,
694+
def _get_activation_nbits(self,
673695
n: BaseNode,
674696
bitwidth_mode: BitwidthMode,
675697
act_qc: Optional[NodeActivationQuantizationConfig]) -> int:
@@ -690,21 +712,22 @@ def _get_activation_nbits(cls,
690712
assert bitwidth_mode == BitwidthMode.QCustom
691713
return act_qc.activation_n_bits if act_qc.enable_activation_quantization else FLOAT_BITWIDTH
692714

693-
if bitwidth_mode == BitwidthMode.Float or not n.is_activation_quantization_enabled():
715+
if bitwidth_mode == BitwidthMode.Float or not (n.is_activation_quantization_enabled() or
716+
n.is_quantization_preserving()):
694717
return FLOAT_BITWIDTH
695718

696719
if bitwidth_mode == BitwidthMode.Q8Bit:
697720
return 8
698721

699-
if bitwidth_mode in cls._bitwidth_mode_fn:
722+
if bitwidth_mode in self._bitwidth_mode_fn:
700723
candidates_nbits = [c.activation_quantization_cfg.activation_n_bits for c in n.candidates_quantization_cfg]
701-
return cls._bitwidth_mode_fn[bitwidth_mode](candidates_nbits)
724+
return self._bitwidth_mode_fn[bitwidth_mode](candidates_nbits)
702725

703726
if bitwidth_mode in [BitwidthMode.QCustom, BitwidthMode.QDefaultSP]:
704-
qcs = n.get_unique_activation_candidates()
727+
qcs = self.graph.retrieve_preserved_quantization_node(n).get_unique_activation_candidates()
705728
if len(qcs) != 1:
706729
raise ValueError(f'Could not retrieve the activation quantization candidate for node {n} '
707-
f'as it has {len(qcs)}!=1 unique candidates .')
730+
f'as it has {len(qcs)}!=1 unique candidates.')
708731
return qcs[0].activation_quantization_cfg.activation_n_bits
709732

710733
raise ValueError(f'Unknown mode {bitwidth_mode}') # pragma: no cover

tests_pytest/_test_util/graph_builder_utils.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,7 @@ def full_attr_name(canonical_name: Union[str, dict, Iterable]):
7070

7171

7272
def build_nbits_qc(a_nbits=8, a_enable=True, w_attr=None, pos_attr=(32, False, ()),
73-
convert_canonical_attr=True) -> CandidateNodeQuantizationConfig:
73+
convert_canonical_attr=True, q_preserving=False) -> CandidateNodeQuantizationConfig:
7474
"""
7575
Build quantization config with configurable nbits and enabling/disabling quantization only.
7676
@@ -87,6 +87,8 @@ def build_nbits_qc(a_nbits=8, a_enable=True, w_attr=None, pos_attr=(32, False, (
8787
Returns:
8888
8989
"""
90+
assert not(a_enable and q_preserving)
91+
9092
w_attr = w_attr or {}
9193
attr_weights_configs_mapping = {
9294
k: AttributeQuantizationConfig(weights_n_bits=v[0], enable_weights_quantization=v[1])
@@ -102,7 +104,7 @@ def build_nbits_qc(a_nbits=8, a_enable=True, w_attr=None, pos_attr=(32, False, (
102104
default_weight_attr_config=AttributeQuantizationConfig(weights_n_bits=pos_attr[0],
103105
enable_weights_quantization=pos_attr[1]),
104106
activation_quantization_method=QuantizationMethod.POWER_OF_TWO,
105-
quantization_preserving=False,
107+
quantization_preserving=q_preserving,
106108
supported_input_activation_n_bits=[2, 4, 8],
107109
fixed_scale=None,
108110
fixed_zero_point=None,
Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
# Copyright 2025 Sony Semiconductor Israel, Inc. All rights reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
# ==============================================================================
15+
from model_compression_toolkit.core.common import Graph
16+
from model_compression_toolkit.core.common.graph.edge import Edge
17+
18+
from tests_pytest._test_util.graph_builder_utils import build_node, build_nbits_qc
19+
20+
21+
class TestQuantizationPreservingNode:
22+
23+
def test_activation_preserving_candidate(self):
24+
""" Tests that the correct activation quantization candidate is selected. """
25+
n1 = build_node('qact_node', qcs=[build_nbits_qc()])
26+
n2 = build_node('qp1a_node', qcs=[build_nbits_qc(a_enable=False, q_preserving=True)])
27+
n3 = build_node('qp1b_node', qcs=[build_nbits_qc(a_enable=False, q_preserving=True)])
28+
n4 = build_node('qp2a_node', qcs=[build_nbits_qc()])
29+
n5 = build_node('qp2b_node', qcs=[build_nbits_qc(a_enable=False, q_preserving=True)])
30+
graph = Graph('g', input_nodes=[n1], nodes=[n2, n4], output_nodes=[n3, n5],
31+
edge_list=[Edge(n1, n2, 0, 0), Edge(n2, n3, 0, 0),
32+
Edge(n1, n4, 0, 0), Edge(n4, n5, 0, 0)])
33+
34+
assert graph.retrieve_preserved_quantization_node(n2) is n1
35+
assert graph.retrieve_preserved_quantization_node(n3) is n1
36+
assert graph.retrieve_preserved_quantization_node(n4) is n4
37+
assert graph.retrieve_preserved_quantization_node(n5) is n4

tests_pytest/common_tests/unit_tests/core/mixed_precision/resource_utilization_tools/test_resource_utilization_calculator.py

Lines changed: 21 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,8 @@
3737
BM = BitwidthMode
3838
TIC = TargetInclusionCriterion
3939

40+
_identity_func = lambda x: x
41+
4042

4143
class TestUtilization:
4244
def test_operations(self):
@@ -296,8 +298,10 @@ class TestComputeActivationTensorsUtilization:
296298
""" Tests for activation tensors utilization public apis. """
297299
def test_compute_node_activation_tensor_utilization(self, graph_mock, fw_impl_mock, fw_info_mock):
298300
mp_reuse = build_node('mp_reuse', output_shape=(None, 3, 14), qcs=[build_qc(4), build_qc(16)], reuse=True)
301+
qp = build_node('qp', output_shape=(None, 15, 9), qcs=[build_qc(a_enable=False, q_preserving=True)])
299302
noq = build_node('noq', output_shape=(None, 15, 9), qcs=[build_qc(a_enable=False)])
300-
graph_mock.nodes = [mp_reuse, noq]
303+
graph_mock.nodes = [mp_reuse, qp, noq]
304+
graph_mock.retrieve_preserved_quantization_node = lambda n: mp_reuse if n is qp else n
301305

302306
ru_calc = ResourceUtilizationCalculator(graph_mock, fw_impl_mock, fw_info_mock)
303307
# _get_activation_nbits is already fully checked, just make sure we use it, and use correctly
@@ -310,6 +314,9 @@ def test_compute_node_activation_tensor_utilization(self, graph_mock, fw_impl_mo
310314
# reused is not ignored
311315
res = ru_calc.compute_node_activation_tensor_utilization(mp_reuse, TIC.QConfigurable, BM.QMinBit)
312316
assert res == Utilization(42, 21.)
317+
# quantization preserving uses custom_qc.
318+
res = ru_calc.compute_node_activation_tensor_utilization(qp, TIC.AnyQuantized, BM.QCustom, custom_qc)
319+
assert res == Utilization(135, 270.)
313320
# not a target node
314321
res = ru_calc.compute_node_activation_tensor_utilization(noq, TIC.AnyQuantized, BM.QCustom, custom_qc)
315322
assert res == Utilization(0, 0)
@@ -391,11 +398,14 @@ def test_compute_cuts_integration(self, graph_mock, fw_impl_mock, fw_info_mock,
391398
""" Test integration with max cut computation. """
392399
# Test a simple linear dummy graph with the real max cut computation.
393400
n1 = build_node('n1', qcs=[build_qc()], input_shape=(None, 10, 20, 3), output_shape=(None, 10, 20, 3))
401+
n1_qp = build_node('n1_qp', qcs=[build_qc(a_enable=False, q_preserving=True)],
402+
input_shape=(None, 10, 20, 3), output_shape=(None, 10, 20, 3))
394403
n2 = build_node('n2', qcs=[build_qc()], input_shape=(None, 10, 20, 3), output_shape=(None, 5, 10))
395404
n3 = build_node('n3', qcs=[build_qc()], input_shape=(None, 5, 10), output_shape=(None, 5, 10))
396405
n4 = build_node('n4', qcs=[build_qc()], input_shape=(None, 5, 10, 32), output_shape=(None, 5, 10, 32))
397-
edges = [Edge(n1, n2, 0, 0), Edge(n2, n3, 0, 0), Edge(n3, n4, 0, 0)]
398-
graph = Graph('g', input_nodes=[n1], nodes=[n2, n3], output_nodes=[n4], edge_list=edges)
406+
edges = [Edge(n1, n1_qp, 0, 0), Edge(n1_qp, n2, 0, 0),
407+
Edge(n2, n3, 0, 0), Edge(n3, n4, 0, 0)]
408+
graph = Graph('g', input_nodes=[n1], nodes=[n1_qp, n2, n3], output_nodes=[n4], edge_list=edges)
399409
ru_calc = ResourceUtilizationCalculator(graph, fw_impl_mock, fw_info_mock)
400410
# wrap the real implementation
401411
maxcut_spy = mocker.patch('model_compression_toolkit.core.common.mixed_precision.resource_utilization_tools.'
@@ -405,11 +415,11 @@ def test_compute_cuts_integration(self, graph_mock, fw_impl_mock, fw_info_mock,
405415
cuts_cache = ru_calc.cuts
406416

407417
# verify the cache
408-
assert len(cuts_cache) == 5
418+
assert len(cuts_cache) == 6
409419
assert all(isinstance(k, Cut) for k in cuts_cache.keys())
410420
# for each cut we save a list of its nodes
411421
cuts_nodes = {tuple(sorted(n.name for n in nodes)) for nodes in cuts_cache.values()}
412-
assert cuts_nodes == {('n1',), ('n4',), ('n1', 'n2'), ('n2', 'n3'), ('n3', 'n4')}
422+
assert cuts_nodes == {('n1',), ('n4',), ('n1', 'n1_qp'), ('n1_qp', 'n2'), ('n2', 'n3'), ('n3', 'n4')}
413423

414424
# verify cuts computation only happens the first time
415425
cuts_cache2 = ru_calc.cuts
@@ -420,7 +430,8 @@ def test_compute_cuts_integration(self, graph_mock, fw_impl_mock, fw_info_mock,
420430
nodes_to_cuts = {tuple(sorted(elem.node_name for elem in cut.mem_elements.elements)): cut
421431
for cut in cuts_cache.keys()}
422432
cut1 = nodes_to_cuts[('n1',)]
423-
cut12 = nodes_to_cuts[('n1', 'n2')]
433+
cut11 = nodes_to_cuts[('n1', 'n1_qp')]
434+
cut12 = nodes_to_cuts[('n1_qp', 'n2')]
424435
cut23 = nodes_to_cuts[('n2', 'n3')]
425436
cut34 = nodes_to_cuts[('n3', 'n4')]
426437
cut4 = nodes_to_cuts[('n4',)]
@@ -430,7 +441,8 @@ def test_compute_cuts_integration(self, graph_mock, fw_impl_mock, fw_info_mock,
430441
bitwidth_mode=BM.QDefaultSP)
431442

432443
assert per_cut_per_node == {cut1: {'n1': Utilization(10 * 20 * 3, 600)},
433-
cut12: {'n1': Utilization(10 * 20 * 3, 600),
444+
cut11: {'n1': Utilization(10 * 20 * 3, 600), 'n1_qp': Utilization(10 * 20 * 3, 600)},
445+
cut12: {'n1_qp': Utilization(10 * 20 * 3, 600),
434446
'n2': Utilization(5 * 10, 50)},
435447
cut23: {'n2': Utilization(5*10, 50),
436448
'n3': Utilization(5*10, 50)},
@@ -439,7 +451,8 @@ def test_compute_cuts_integration(self, graph_mock, fw_impl_mock, fw_info_mock,
439451
cut4: {'n4': Utilization(5 * 10 * 32, 1600)}}
440452
assert per_cut == {
441453
nodes_to_cuts[('n1',)]: Utilization(600, 600),
442-
nodes_to_cuts[('n1', 'n2')]: Utilization(650, 650),
454+
nodes_to_cuts[('n1', 'n1_qp')]: Utilization(1200, 1200),
455+
nodes_to_cuts[('n1_qp', 'n2')]: Utilization(650, 650),
443456
nodes_to_cuts[('n2', 'n3')]: Utilization(100, 100),
444457
nodes_to_cuts[('n3', 'n4')]: Utilization(1650, 1650),
445458
nodes_to_cuts[('n4',)]: Utilization(1600, 1600)

tests_pytest/conftest.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ def minimal_tpc():
3131
@fixture
3232
def graph_mock():
3333
""" Basic Graph mock. """
34-
return Mock(spec_set=Graph, nodes=[])
34+
return Mock(spec_set=Graph, nodes=[], retrieve_preserved_quantization_node=lambda x: x)
3535

3636

3737
@fixture

0 commit comments

Comments
 (0)