Skip to content
Merged
Original file line number Diff line number Diff line change
Expand Up @@ -38,9 +38,6 @@ def __init__(self, model_graph: Graph):
Args:
model_graph: A graph representation of a model.
"""

self.model_graph = model_graph

nodes = list(model_graph.nodes)
memory_tensors = []
node_to_tensor = []
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -78,14 +78,18 @@ def __init__(self,
self.mp_topo_configurable_nodes = self.mp_graph.get_configurable_sorted_nodes(fw_info)

self.ru_targets = target_resource_utilization.get_restricted_targets()
self.ru_helper = MixedPrecisionRUHelper(self.mp_graph, fw_info, fw_impl)
self.ru_helper = MixedPrecisionRUHelper(self.original_graph, fw_info, fw_impl)

self.min_ru_config: Dict[BaseNode, int] = self.mp_graph.get_min_candidates_config(fw_info)
self.max_ru_config: Dict[BaseNode, int] = self.mp_graph.get_max_candidates_config(fw_info)
self.min_ru = self.ru_helper.compute_utilization(self.ru_targets, self.min_ru_config)

self.config_reconstruction_helper = ConfigReconstructionHelper(virtual_graph=self.mp_graph,
original_graph=self.original_graph)
if self.using_virtual_graph:
real_min_ru_config: Dict[BaseNode, int] = self.config_reconstruction_helper.reconstruct_config_from_virtual_graph(self.min_ru_config)
self.min_ru = self.ru_helper.compute_utilization(self.ru_targets, real_min_ru_config)
else:
self.min_ru = self.ru_helper.compute_utilization(self.ru_targets, self.min_ru_config)

def search(self) -> Dict[BaseNode, int]:
"""
Expand Down Expand Up @@ -251,7 +255,8 @@ def _compute_relative_ru_matrices(self) -> Dict[RUTarget, np.ndarray]:
else:
cfg = self.min_ru_config.copy()
cfg[node] = candidate_idx
candidate_rus = self.ru_helper.compute_utilization(self.ru_targets, cfg)
real_cfg = self.config_reconstruction_helper.reconstruct_config_from_virtual_graph(cfg)
candidate_rus = self.ru_helper.compute_utilization(self.ru_targets, real_cfg)

for target, ru in candidate_rus.items():
rus_per_candidate[target].append(ru)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@
from enum import Enum, auto
from typing import Dict, NamedTuple, Optional, Tuple, List, Iterable, Union, Literal, Sequence

from model_compression_toolkit.core.common.fusion.graph_fuser import GraphFuser

from model_compression_toolkit.constants import FLOAT_BITWIDTH
from model_compression_toolkit.core import FrameworkInfo
from model_compression_toolkit.core.common import Graph, BaseNode
Expand Down Expand Up @@ -145,8 +147,14 @@ def cuts(self) -> Dict[Cut, List[BaseNode]]:
raise RuntimeError("Failed to calculate activation memory cuts for graph.")
cuts = [cut for cut in cuts if cut.mem_elements.elements]
# cache cuts nodes for future use, so do not filter by target
self._cuts = {cut: [self.graph.find_node_by_name(m.node_name)[0] for m in cut.mem_elements.elements]
for cut in cuts}
self._cuts = {
cut: [
node
for m in cut.mem_elements.elements
for node in (self.graph.fusing_info.get_fused_nodes(m.node_name) or (self.graph.find_node_by_name(m.node_name)[0],))
]
for cut in cuts
}
return self._cuts

def compute_resource_utilization(self,
Expand Down Expand Up @@ -580,7 +588,9 @@ def compute_node_bops(self,

def _compute_cuts(self):
""" Compute activation cuts of the graph. """
memory_graph = MemoryGraph(deepcopy(self.graph))
# Compute memory graph on fused graph with fused nodes
graph = GraphFuser().apply_node_fusion(self.graph)
memory_graph = MemoryGraph(deepcopy(graph))
_, _, cuts = compute_graph_max_cut(memory_graph)
return cuts

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
import math
import typing

import abc
Expand Down Expand Up @@ -53,7 +54,8 @@ def get_base_mp_nbits_candidates():
class MixedPrecisionActivationBaseTest(BaseKerasFeatureNetworkTest):
def __init__(self, unit_test, activation_layers_idx, num_calibration_iter=1):
super().__init__(unit_test, num_calibration_iter=num_calibration_iter)

# for the model that is used here, the two last tensors compose the max cut
self.max_cut = 10 * 10 * 32 + 13 * 13 * 32
self.activation_layers_idx = activation_layers_idx

def get_core_config(self):
Expand Down Expand Up @@ -135,14 +137,15 @@ def __init__(self, unit_test):
super().__init__(unit_test, activation_layers_idx=[1, 2, 4])

def get_resource_utilization(self):
return ResourceUtilization(weights_memory=17919, activation_memory=5407)
return ResourceUtilization(weights_memory=17919, activation_memory=self.max_cut-1)

def compare(self, quantized_model, float_model, input_x=None, quantization_info=None):
# verify chosen activation bitwidth config
# activation resource utilization is restricted to just below the full 8-bit requirement -> not all activations can stay at 8 bits
holder_layers = get_layers_from_model_by_type(quantized_model, KerasActivationQuantizationHolder)
activation_bits = [layer.activation_holder_quantizer.get_config()['num_bits'] for layer in holder_layers]
self.unit_test.assertTrue((activation_bits == [8, 4, 8]))
# Since the max cut is composed of the last two tensors, one of them has to get 4 bits
self.unit_test.assertIn(activation_bits, ([8, 4, 8], [8, 8, 4]))

self.verify_quantization(quantized_model, input_x,
weights_layers_idx=[2, 3],
Expand All @@ -157,7 +160,7 @@ def __init__(self, unit_test):

def get_resource_utilization(self):
# resource utilization is for 4 bits on average
return ResourceUtilization(weights_memory=17920 * 4 / 8, activation_memory=4300)
return ResourceUtilization(weights_memory=17920 * 4 / 8, activation_memory=math.ceil(self.max_cut*4/8))

def get_tpc(self):
eight_bits = generate_test_op_qc(**generate_test_attr_configs())
Expand All @@ -180,7 +183,7 @@ def compare(self, quantized_model, float_model, input_x=None, quantization_info=
# then there is no guarantee that the activation bitwidth for each layer would be 4-bit,
# this assertion tests the expected result for this specific
# test with its current setup (therefore, we don't check the input layer's bitwidth)
self.unit_test.assertTrue((activation_bits == [4, 8]))
self.unit_test.assertTrue((activation_bits == [4, 4]))


class MixedPrecisionActivationSearch2BitsAvgTest(MixedPrecisionActivationBaseTest):
Expand All @@ -189,7 +192,7 @@ def __init__(self, unit_test):

def get_resource_utilization(self):
# resource utilization is for 2 bits on average
return ResourceUtilization(weights_memory=17920.0 * 2 / 8, activation_memory=1544)
return ResourceUtilization(weights_memory=17920.0 * 2 / 8, activation_memory=math.ceil(self.max_cut * 2 / 8))

def compare(self, quantized_model, float_model, input_x=None, quantization_info=None):
# verify chosen activation bitwidth config
Expand All @@ -213,7 +216,8 @@ def __init__(self, unit_test):
super().__init__(unit_test, activation_layers_idx=[1, 3])

def get_resource_utilization(self):
return ResourceUtilization(47, 767)
# 638 = round_up((16*16*3+13*13*3)/2) -> so it must choose (4,4)
return ResourceUtilization(47, 638)

def create_networks(self):
inputs = layers.Input(shape=self.get_input_shapes()[0][1:])
Expand All @@ -225,18 +229,17 @@ def create_networks(self):

def compare(self, quantized_model, float_model, input_x=None, quantization_info=None):
# verify chosen activation bitwidth config
# resource utilization is infinity -> should give best model - 8bits
holder_layers = get_layers_from_model_by_type(quantized_model, KerasActivationQuantizationHolder)
activation_bits = [layer.activation_holder_quantizer.get_config()['num_bits'] for layer in holder_layers]
self.unit_test.assertTrue((activation_bits == [4, 8]))
self.unit_test.assertTrue((activation_bits == [4, 4]))


class MixedPrecisionActivationDepthwise4BitTest(MixedPrecisionActivationBaseTest):
def __init__(self, unit_test):
super().__init__(unit_test, activation_layers_idx=[1])

def get_resource_utilization(self):
return ResourceUtilization(48.0 * 4 / 8, 768.0 * 4 / 8)
return ResourceUtilization(48.0 * 4 / 8, math.ceil((16*16*3+13*13*3) * 4 / 8))

def get_tpc(self):
eight_bits = generate_test_op_qc(**generate_test_attr_configs())
Expand Down Expand Up @@ -464,7 +467,7 @@ def __init__(self, unit_test):

def get_resource_utilization(self):
# 17920: 8-bit weights, 6176: max cut of input+conv_bn
return ResourceUtilization(np.inf, np.inf, total_memory=(17920 + 6176) * 4 / 8)
return ResourceUtilization(np.inf, np.inf, total_memory=(17920 + self.max_cut) * 4 / 8)

def _compare(self, quantized_model, float_model, input_x=None, quantization_info: UserInformation = None):
# verify chosen activation bitwidth config
Expand All @@ -485,7 +488,7 @@ def __init__(self, unit_test):

def get_resource_utilization(self):
weights = 17920 * 4 / 8
activation = 6176 * 4 / 8
activation = math.ceil(self.max_cut * 4 / 8)
return ResourceUtilization(weights, activation, total_memory=weights + activation)

def _compare(self, quantized_model, float_model, input_x=None, quantization_info: UserInformation = None):
Expand Down Expand Up @@ -514,7 +517,7 @@ def __init__(self, unit_test):

def get_resource_utilization(self):
weights = 17920 * 4 / 8
activation = 6176 * 4 / 8 # max cut of input + conv_bn
activation = math.ceil(self.max_cut * 4 / 8)
return ResourceUtilization(weights, activation, total_memory=(weights + activation) / 2)

def _compare(self, quantized_model, float_model, input_x=None, quantization_info: UserInformation = None):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -286,11 +286,13 @@ def compare(self, qat_model, finalize=False, input_x=None, quantization_info=Non


class QATWrappersMixedPrecisionCfgTest(MixedPrecisionActivationBaseTest):
def __init__(self, unit_test, ru_weights=17919, ru_activation=5407, expected_mp_cfg=[0, 4, 0, 0]):
self.ru_weights = ru_weights
self.ru_activation = ru_activation
self.expected_mp_cfg = expected_mp_cfg
def __init__(self, unit_test, ru_weights=17919, ru_activation=None, expected_mp_cfg=None):
super().__init__(unit_test, activation_layers_idx=[1, 3, 6])
self.ru_weights = ru_weights
# The default test case is that the max cut (which is the fused conv-relu layer tensors, input and output)
# must be reduced to 4 bits on average.
self.ru_activation = ru_activation or (self.max_cut * 4 / 8)
self.expected_mp_cfg = expected_mp_cfg or [0, 4, 0, 1] # input, conv, conv2, relu

def run_test(self, **kwargs):
model_float = self.create_networks()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
# limitations under the License.
# ==============================================================================
import abc
import math

import typing

Expand Down Expand Up @@ -46,6 +47,7 @@
class MixedPrecisionBaseTest(BaseKerasFeatureNetworkTest):
def __init__(self, unit_test, val_batch_size=1, num_calibration_iter=1):
super().__init__(unit_test, val_batch_size=val_batch_size, num_calibration_iter=num_calibration_iter)
self.max_cut = 10 * 10 * 32 + 13 * 13 * 32

def get_quantization_config(self):
return mct.core.QuantizationConfig(mct.core.QuantizationErrorMethod.MSE, mct.core.QuantizationErrorMethod.MSE,
Expand Down Expand Up @@ -361,7 +363,7 @@ class MixedPrecisionSearchTotalMemoryNonConfNodesTest(MixedPrecisionBaseTest):
def __init__(self, unit_test):
super().__init__(unit_test)
# Total ResourceUtilization for weights in 2 bit avg and non-configurable activation in 8 bit
self.target_total_ru = ResourceUtilization(total_memory=17920 * 2 / 8 + 6176)
self.target_total_ru = ResourceUtilization(total_memory=17920 * 2 / 8 + math.ceil(self.max_cut * 8 / 8))

def get_resource_utilization(self):
return self.target_total_ru
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -831,8 +831,8 @@ def test_qat(self):
QuantizationAwareTrainingQuantizersTest(self).run_test()
QuantizationAwareTrainingQuantizerHolderTest(self).run_test()
QATWrappersMixedPrecisionCfgTest(self).run_test()
QATWrappersMixedPrecisionCfgTest(self, ru_weights=17920 * 4 / 8, ru_activation=5408 * 4 / 8,
expected_mp_cfg=[0, 5, 1, 1]).run_test()
QATWrappersMixedPrecisionCfgTest(self, ru_weights=17920 * 4 / 8, ru_activation=8608 * 4 / 8,
expected_mp_cfg=[0, 4, 1, 1]).run_test()

def test_bn_attributes_quantization(self):
BNAttributesQuantization(self, quantize_linear=False).run_test()
Expand Down
6 changes: 3 additions & 3 deletions tests/keras_tests/graph_tests/test_memory_graph.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,7 @@ def test_memory_graph_build(self):

self.assertTrue(len(memory_graph.a_nodes) == 4)
self.assertTrue(len(memory_graph.b_nodes) == 4)
self.assertTrue(graph.get_topo_sorted_nodes()[0] in memory_graph.sources_a)
self.assertTrue(graph.get_topo_sorted_nodes()[0].name in [node.name for node in memory_graph.sources_a])
self.assertTrue(len(memory_graph.sinks_b) == 1)
self.assertTrue(memory_graph.memory_lbound_single_op == 264)

Expand All @@ -99,7 +99,7 @@ def test_memory_graph_node_with_multiple_outputs(self):

self.assertTrue(len(memory_graph.a_nodes) == 5)
self.assertTrue(len(memory_graph.b_nodes) == 6)
self.assertTrue(graph.get_topo_sorted_nodes()[0] in memory_graph.sources_a)
self.assertTrue(graph.get_topo_sorted_nodes()[0].name in [node.name for node in memory_graph.sources_a])
self.assertTrue(len(memory_graph.sinks_b) == 1)
self.assertTrue(memory_graph.memory_lbound_single_op == 576)

Expand All @@ -117,7 +117,7 @@ def test_memory_graph_with_residual(self):

self.assertTrue(len(memory_graph.a_nodes) == 5)
self.assertTrue(len(memory_graph.b_nodes) == 5)
self.assertTrue(graph.get_topo_sorted_nodes()[0] in memory_graph.sources_a)
self.assertTrue(graph.get_topo_sorted_nodes()[0].name in [node.name for node in memory_graph.sources_a])
self.assertTrue(len(memory_graph.sinks_b) == 1)
self.assertTrue(memory_graph.memory_lbound_single_op == 199)

Expand Down
Loading