Skip to content

Commit d10483b

Browse files
irenab
authored and committed
dont compute bops for virtual weights node
1 parent 616631c commit d10483b

File tree

4 files changed

+110
-43
lines changed

4 files changed

+110
-43
lines changed

model_compression_toolkit/core/common/mixed_precision/resource_utilization_tools/resource_utilization_calculator.py

Lines changed: 15 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,8 @@
3232
RUTarget, ResourceUtilization
3333
from model_compression_toolkit.core.common.quantization.node_quantization_config import NodeWeightsQuantizationConfig, \
3434
NodeActivationQuantizationConfig
35+
from model_compression_toolkit.core.common.substitutions.virtual_activation_weights_composition import \
36+
BaseVirtualActivationWeightsComposition, get_input_activation_if_composable
3537

3638

3739
class BitwidthMode(Enum):
@@ -510,13 +512,19 @@ def compute_node_bops(self,
510512
if w_qc and bitwidth_mode != BitwidthMode.QCustom:
511513
raise ValueError(self.unexpected_qc_error)
512514

513-
# extract the original weight node for mac computation
515+
if isinstance(n, VirtualSplitWeightsNode):
516+
# Virtual weights node can only be present if it couldn't be merged into VirtualActivationWeightsNode.
517+
# This means that during MP search we cannot compute bops for all A/W nbits combinations. To prevent
518+
# inconsistencies we ignore such nodes for bops computation.
519+
return 0
520+
521+
# Fetch the original weights node for mac computation (VirtualActivationWeightsNode input/output shapes are
522+
# based on the activation original node, not weights original node)
514523
orig_w_node = n
515524
if isinstance(n, VirtualActivationWeightsNode):
516525
orig_w_node = n.original_weights_node
517-
518-
if isinstance(orig_w_node, VirtualSplitWeightsNode):
519-
orig_w_node = orig_w_node.origin_node
526+
if isinstance(orig_w_node, VirtualSplitWeightsNode):
527+
orig_w_node = orig_w_node.origin_node
520528

521529
# check if the node has kernel
522530
kernel_attrs = self.fw_info.get_kernel_op_attributes(n.type)
@@ -535,10 +543,9 @@ def compute_node_bops(self,
535543
# we don't need the original node (and cannot use it for custom configuration anyway)
536544
a_node = n
537545
else:
538-
incoming_edges = self.graph.incoming_edges(n)
539-
assert len(incoming_edges) == 1, \
540-
f'Unexpected number of inputs {len(incoming_edges)} for BOPS calculation. Expected 1.'
541-
a_node = incoming_edges[0].source_node
546+
a_node = get_input_activation_if_composable(self.graph, n, warn=False)
547+
if a_node is None:
548+
return 0
542549

543550
if (target_criterion == TargetInclusionCriterion.AnyQuantized and
544551
not (a_node.is_activation_quantization_enabled() or n.is_weights_quantization_enabled(kernel_attr))):

model_compression_toolkit/core/common/substitutions/virtual_activation_weights_composition.py

Lines changed: 29 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
# See the License for the specific language governing permissions and
1313
# limitations under the License.
1414
# ==============================================================================
15+
from typing import Optional
1516

1617
from model_compression_toolkit.core.common import BaseNode, Graph, BaseSubstitution
1718
from model_compression_toolkit.logger import Logger
@@ -44,14 +45,8 @@ def substitute(self,
4445
raise TypeError(f'Matched node {weights_node} was expected to be of type VirtualSplitWeightsNode. '
4546
f'This substitution is expected to be called after activation-weights split.')
4647

47-
predecessors = graph.get_prev_nodes(weights_node)
48-
assert len(predecessors) == 1, (f'Matched node for {self.__class__.__name__} substitution is expected to have'
49-
f'exactly one input, node {weights_node} has {len(predecessors)}')
50-
act_node = predecessors[0]
51-
if len(graph.out_edges(act_node)) > 1:
52-
Logger.warning(f"Node {act_node.name} has multiple outgoing edges, which is not supported with "
53-
f"mixed-precision search under bit-operations constraint. In such case, it might result in "
54-
f"incorrect resource utilization computation and suboptimal bits selection.")
48+
act_node = get_input_activation_if_composable(graph, weights_node, warn=True)
49+
if act_node is None:
5550
return graph
5651

5752
# Virtual composed activation-weights node
@@ -70,3 +65,29 @@ def substitute(self,
7065
graph.remove_node(act_node)
7166

7267
return graph
68+
69+
70+
def get_input_activation_if_composable(graph: Graph, weights_node: BaseNode, warn: bool) -> Optional[BaseNode]:
71+
"""
72+
Get input activation node for composition, or None if not composable.
73+
74+
Args:
75+
graph: graph.
76+
weights_node: weights node for composition.
77+
warn: whether to log a warning if not composable.
78+
79+
Returns:
80+
Input activation node or None.
81+
"""
82+
predecessors = graph.get_prev_nodes(weights_node)
83+
assert len(predecessors) == 1, (f'Weights node is expected to have exactly one input, '
84+
f'node {weights_node} has {len(predecessors)}')
85+
act_node = predecessors[0]
86+
if len(graph.out_edges(act_node)) > 1:
87+
if warn:
88+
Logger.warning(f"Node {act_node.name} has multiple outgoing edges, which is not supported with "
89+
f"mixed-precision search under bit-operations constraint. In such case, it might result in "
90+
f"incorrect resource utilization computation and suboptimal bits selection.")
91+
return None
92+
93+
return act_node

tests_pytest/common/core/common/mixed_precision/resource_utilization_tools/test_resource_utilization_calculator.py

Lines changed: 58 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -1066,9 +1066,9 @@ def test_compute_node_bops_default_qc(self, fw_impl_mock, fw_info_mock):
10661066

10671067
def test_compute_virtual_aw_node_bops_fully_quantized(self, fw_impl_mock, fw_info_mock):
10681068
# all quantized
1069-
g, a1w2, a2, a2w3, w3, a3 = self._build_virtual_node_graph(fw_impl_mock, fw_info_mock,
1070-
quantize_a1=True, quantize_w1=True,
1071-
quantize_a2=True, quantize_w2=True)
1069+
g, _, a1w2, a2, a2w3, w3, a3 = self._build_virtual_node_graph(fw_impl_mock, fw_info_mock,
1070+
quantize_a1=True, quantize_w1=True,
1071+
quantize_a2=True, quantize_w2=True)
10721072
ru_calc = ResourceUtilizationCalculator(g, fw_impl_mock, fw_info_mock)
10731073

10741074
assert ru_calc.compute_node_bops(a1w2, TIC.AnyQuantized, BM.Float) == 42 * 32 * 32
@@ -1083,18 +1083,16 @@ def test_compute_virtual_aw_node_bops_fully_quantized(self, fw_impl_mock, fw_inf
10831083

10841084
assert ru_calc.compute_node_bops(a2, TIC.AnyQuantized, BM.Float) == 0
10851085

1086-
assert ru_calc.compute_node_bops(w3, TIC.AnyQuantized, BM.QMaxBit) == 142 * 7 * 6
1087-
10881086
def test_compute_virtual_aw_node_bops_half_quantized(self, fw_impl_mock, fw_info_mock):
1089-
g, a1w2, a2, a2w3, w3, a3 = self._build_virtual_node_graph(fw_impl_mock, fw_info_mock,
1087+
g, _, a1w2, a2, a2w3, w3, a3 = self._build_virtual_node_graph(fw_impl_mock, fw_info_mock,
10901088
quantize_a1=True, quantize_w1=False,
10911089
quantize_a2=False, quantize_w2=True)
10921090
ru_calc = ResourceUtilizationCalculator(g, fw_impl_mock, fw_info_mock)
10931091
assert ru_calc.compute_node_bops(a1w2, TIC.AnyQuantized, BM.QMaxBit) == 42 * 16 * 32
10941092
assert ru_calc.compute_node_bops(a2w3, TIC.AnyQuantized, BM.QMaxBit) == 142 * 32 * 6
10951093

10961094
def test_compute_virtual_aw_node_bops_unquantized(self, fw_impl_mock, fw_info_mock):
1097-
g, a1w2, a2, a2w3, w3, a3 = self._build_virtual_node_graph(fw_impl_mock, fw_info_mock,
1095+
g, _, a1w2, a2, a2w3, w3, a3 = self._build_virtual_node_graph(fw_impl_mock, fw_info_mock,
10981096
quantize_a1=False, quantize_w1=False,
10991097
quantize_a2=False, quantize_w2=False)
11001098
ru_calc = ResourceUtilizationCalculator(g, fw_impl_mock, fw_info_mock)
@@ -1105,7 +1103,7 @@ def test_compute_virtual_aw_node_bops_unquantized(self, fw_impl_mock, fw_info_mo
11051103
assert ru_calc.compute_node_bops(a2w3, TIC.Any, BM.QMaxBit) == 142 * 32 * 32
11061104

11071105
def test_compute_virtual_aw_node_bops_custom(self, fw_impl_mock, fw_info_mock):
1108-
g, a1w2, a2, a2w3, w3, a3 = self._build_virtual_node_graph(fw_impl_mock, fw_info_mock,
1106+
g, _, a1w2, a2, a2w3, w3, a3 = self._build_virtual_node_graph(fw_impl_mock, fw_info_mock,
11091107
quantize_a1=False, quantize_w1=False,
11101108
quantize_a2=True, quantize_w2=True)
11111109
custom_qc_a1w2 = build_qc(5, w_attr={'foo': (6, True)})
@@ -1179,17 +1177,57 @@ class BOPNode2:
11791177
'n3': 630 * 7 * 5}
11801178

11811179
def test_compute_virtual_graph_resources(self, fw_impl_mock, fw_info_mock):
1182-
g, a1w2, a2, a2w3, w3, a3 = self._build_virtual_node_graph(fw_impl_mock, fw_info_mock, True, True, True, True)
1180+
g, _, a1w2, a2, a2w3, w3, a3 = self._build_virtual_node_graph(fw_impl_mock, fw_info_mock, True, True, True, True)
11831181
ru_calc = ResourceUtilizationCalculator(g, fw_impl_mock, fw_info_mock)
11841182
ru, detailed = ru_calc.compute_resource_utilization(TIC.Any, BM.QMaxBit, return_detailed=True)
11851183
assert (sorted(list(detailed[RUTarget.ACTIVATION].values())) ==
1186-
sorted([24, 24 + 50*2, 50*2+88*5/8, 88*5/8 + 24*7/8, 24*7/8, 0])), detailed[RUTarget.ACTIVATION]
1187-
assert detailed[RUTarget.WEIGHTS] == {a1w2.name: 42*2, a2w3.name: 142*6/8, w3.name: 142*6/8}
1188-
assert detailed[RUTarget.BOPS] == {a1w2.name: 42*16*16, a2w3.name: 142*5*6, w3.name: 142*7*6}
1189-
assert ru == ResourceUtilization(weights_memory=84 + 142*1.5,
1184+
sorted([24, 24 + 50*2, 50*2+88*5/8, 88*5/8 + 28*7/8, 28*7/8])), detailed[RUTarget.ACTIVATION]
1185+
assert detailed[RUTarget.WEIGHTS] == {a1w2.name: 42*2, a2w3.name: 142*6/8}
1186+
assert detailed[RUTarget.BOPS] == {a1w2.name: 42*16*16, a2w3.name: 142*5*6}
1187+
assert ru == ResourceUtilization(weights_memory=84 + 142*6/8,
11901188
activation_memory=155,
1191-
total_memory=155+297,
1192-
bops=42*256+142*30+142*42)
1189+
total_memory=155 + (84 + 142*6/8),
1190+
bops=42*256+142*30)
1191+
1192+
def test_virtual_graph_with_virtual_weight(self, fw_impl_mock, fw_info_mock):
1193+
# virtual weight node wasn't merged into virtual composed node
1194+
_, n_in, a1w2, a2, a2w3, w3, a3 = self._build_virtual_node_graph(fw_impl_mock, fw_info_mock, True, True, True, True)
1195+
g = Graph('g', nodes=[w3], input_nodes=[n_in], output_nodes=[a3],
1196+
edge_list=[Edge(n_in, w3, 0, 0), Edge(w3, a3, 0, 0)])
1197+
ru_calc = ResourceUtilizationCalculator(g, fw_impl_mock, fw_info_mock)
1198+
ru, detailed = ru_calc.compute_resource_utilization(TIC.Any, BM.QMaxBit, return_detailed=True)
1199+
assert list(detailed[RUTarget.WEIGHTS].values()) == [142 * 6 / 8]
1200+
assert detailed[RUTarget.BOPS] == {}
1201+
# the extra cut that is created by virtual weight node. The rest of the cuts must be correct.
1202+
wa_cut = 2*28*7/8
1203+
assert sorted(list(detailed[RUTarget.ACTIVATION].values())) == sorted([24, 24+28*7/8, 28*7/8, wa_cut])
1204+
assert ru == ResourceUtilization(weights_memory=142 * 6 / 8,
1205+
activation_memory=wa_cut,
1206+
total_memory=wa_cut + (142 * 6 / 8),
1207+
bops=0)
1208+
1209+
def test_multi_output_input_activation(self, fw_impl_mock, fw_info_mock):
1210+
""" No bops should be calculated for weight node if its input activation has multiple outputs. """
1211+
n_in = build_node('in', qcs=[build_qc()], output_shape=(None, 2, 3, 4))
1212+
n2 = build_node('n2', layer_class=BOPNode, output_shape=(None, 2, 44),
1213+
canonical_weights={'foo': np.zeros((3, 14))},
1214+
qcs=[
1215+
build_qc(2, w_attr={'foo': (16, True)}),
1216+
build_qc(3, w_attr={'foo': (10, True)}),
1217+
build_qc(4, w_attr={'foo': (7, True)}),
1218+
build_qc(5, w_attr={'foo': (6, True)}),
1219+
])
1220+
n_out = build_node('out', qcs=[build_qc()], output_shape=(None, 27))
1221+
g = Graph('g', input_nodes=[n_in], nodes=[n2], output_nodes=[n_out],
1222+
edge_list=[Edge(n_in, n2, 0, 0), Edge(n_in, n_out, 0, 0)])
1223+
1224+
def get_kernel_attr(node_type):
1225+
return {BOPNode: ['foo']}.get(node_type) or []
1226+
fw_info_mock.get_kernel_op_attributes = get_kernel_attr
1227+
fw_impl_mock.get_node_mac_operations = lambda n, fw_info: {n2: 42}.get(n, 0)
1228+
1229+
ru_calc = ResourceUtilizationCalculator(g, fw_impl_mock, fw_info_mock)
1230+
assert ru_calc.compute_bops(TIC.Any, BM.Float) == (0, {})
11931231

11941232
def _build_regular_node_graph(self, enable_aq, enable_wq):
11951233
n1 = build_node('n1', qcs=[build_qc(16, enable_aq), build_qc(7, enable_aq)], output_shape=(None, 5, 10))
@@ -1225,7 +1263,7 @@ class ActType:
12251263
build_qc(4, quantize_a2, w_attr={'foo': (7, quantize_w1)}),
12261264
build_qc(5, quantize_a2, w_attr={'foo': (6, quantize_w1)}),
12271265
])
1228-
n3 = build_node('n3', layer_class=BOPNode2, output_shape=(None, 24),
1266+
n3 = build_node('n3', layer_class=BOPNode2, output_shape=(None, 28),
12291267
canonical_weights={'bar': np.zeros((2, 71))},
12301268
qcs=[
12311269
build_qc(4, w_attr={'bar': (6, quantize_w2)}),
@@ -1240,12 +1278,12 @@ def get_kernel_attr(node_type):
12401278
fw_impl_mock.get_node_mac_operations = lambda n, fw_info: {n2: 42, n3: 142}.get(n, 0)
12411279

12421280
# virtual aw node made of original nodes
1243-
a1w2 = VirtualActivationWeightsNode(act_node=n1, weights_node=n2, fw_info=fw_info_mock, **n2.__dict__)
1281+
a1w2 = VirtualActivationWeightsNode(act_node=n1, weights_node=n2, fw_info=fw_info_mock)
12441282
a2 = VirtualSplitActivationNode(n2, ActType, {})
12451283
w3 = VirtualSplitWeightsNode(n3, 'bar')
12461284
# virtual aw node made of virtual split a, w nodes
1247-
a2w3 = VirtualActivationWeightsNode(act_node=a2, weights_node=w3, fw_info=fw_info_mock, **n3.__dict__)
1285+
a2w3 = VirtualActivationWeightsNode(act_node=a2, weights_node=w3, fw_info=fw_info_mock)
12481286
a3 = VirtualSplitActivationNode(n3, ActType, {})
12491287
g = Graph('g', nodes=[a1w2, a2w3, a3], input_nodes=[n_in], output_nodes=[w3],
1250-
edge_list=[Edge(n_in, a1w2, 0, 0), Edge(a1w2, a2w3, 0, 0), Edge(a2w3, a3, 0, 0), Edge(a3, w3, 0, 0)])
1251-
return g, a1w2, a2, a2w3, w3, a3
1288+
edge_list=[Edge(n_in, a1w2, 0, 0), Edge(a1w2, a2w3, 0, 0), Edge(a2w3, a3, 0, 0)])
1289+
return g, n_in, a1w2, a2, a2w3, w3, a3

tests_pytest/keras/core/mixed_precision/test_resource_utilization.py

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -109,6 +109,7 @@ def data_gen():
109109

110110
class TestRUIntegration:
111111
def test_orig_vs_virtual_sequential_graph(self):
112+
""" Test detailed ru computation on original and corresponding virtual graph. """
112113
inputs = Input(shape=(18, 18, 3))
113114
x = Conv2D(filters=8, kernel_size=5)(inputs)
114115
x = tf.add(x, np.ones((14, 8))) # => activation with const in the composed node
@@ -175,6 +176,8 @@ def test_orig_vs_virtual_sequential_graph(self):
175176
assert self._extract_values(detailed_virtual[RUTarget.BOPS]) == exp_bops
176177

177178
def test_mult_output_activation(self):
179+
""" Tests the case when input activation has multiple outputs -> virtual weights nodes are not merged
180+
into VirtualActivationWeightsNode. """
178181
inputs = Input(shape=(16, 16, 3))
179182
x1 = Conv2D(filters=15, kernel_size=3, groups=3)(inputs)
180183
x2 = DepthwiseConv2D(kernel_size=3, depth_multiplier=5)(inputs)
@@ -197,20 +200,19 @@ def test_mult_output_activation(self):
197200
(14 * 14 * 15 * binary_out_a_bit + 14 * 14 * 10 * linear_a_min_nbit) / 8,
198201
14 * 14 * 10 * linear_a_min_nbit / 8]
199202

200-
# the order of conv and dwconv is not guaranteed but they have same values
203+
# the order of conv and dwconv is not guaranteed, but they have same values
201204
exp_w_ru = [3*3*1*15*linear_w_min_nbit/8,
202205
3*3*3*5*linear_w_min_nbit/8,
203206
15 * 10 * linear_w_min_nbit/8]
204-
exp_bops = [(3*3*1*15)*(14*14)*default_a_nbit*linear_w_min_nbit,
205-
(3*3*3*5)*(14*14)*default_a_nbit*linear_w_min_nbit,
206-
(15*10)*(14*14)*binary_out_a_bit*linear_w_min_nbit]
207+
# bops are not computed for virtual weights nodes
208+
exp_bops = [(15*10)*(14*14)*binary_out_a_bit*linear_w_min_nbit]
207209

208210
assert self._extract_values(detailed_orig[RUTarget.ACTIVATION], sort=True) == sorted(exp_cuts_ru)
209211
assert self._extract_values(detailed_orig[RUTarget.WEIGHTS]) == exp_w_ru
210212
assert self._extract_values(detailed_orig[RUTarget.BOPS]) == exp_bops
211213

212214
virtual_graph = substitute(copy.deepcopy(graph),
213-
fw_impl.get_substitutions_virtual_weights_activation_coupling())
215+
self.fw_impl.get_substitutions_virtual_weights_activation_coupling())
214216
assert len(virtual_graph.nodes) == 7
215217
assert len([n for n in virtual_graph.nodes if isinstance(n, VirtualActivationWeightsNode)]) == 1
216218
assert len([n for n in virtual_graph.nodes if isinstance(n, VirtualSplitActivationNode)]) == 3
@@ -222,8 +224,7 @@ def test_mult_output_activation(self):
222224
return_detailed=True)
223225
assert ru_virtual == ru_orig
224226
# conv and dwconv each remain as a pair of virtual W and virtual A nodes. Remaining virtual W nodes mess up the
225-
# cuts. However, this should only add virtualW-virtualA cuts, all cuts from the original graph should be
226-
# identical
227+
# cuts - but this should only add virtualW-virtualA cuts, all cuts from the original graph should stay identical
227228
assert not set(exp_cuts_ru) - set(detailed_virtual[RUTarget.ACTIVATION].values())
228229
assert self._extract_values(detailed_virtual[RUTarget.WEIGHTS]) == exp_w_ru
229230
assert self._extract_values(detailed_virtual[RUTarget.BOPS]) == exp_bops

0 commit comments

Comments
 (0)