@@ -17,12 +17,13 @@
from nncf.tensor import functions as fns


-def process_stats(stats: WCTensorStatistic, subset_size: int) -> tuple[Tensor, Tensor]:
+def process_stats(stats: WCTensorStatistic, subset_size: int, act_ch_axis: int = -1) -> tuple[Tensor, Tensor]:
"""
A function for processing activations. Shared between AWQ, Scale Estimation and LoRA Correction algorithms.

:param stats: An object containing statistics for the layer.
:param subset_size: The number of samples for AWQ.
+:param act_ch_axis: The activation channel axis.
:return: tuple of the following tensors:
s - maximum channel magnitude across samples [HiddenDim]
X - average channel magnitude across tokens in the sequence [HiddenDim, min(SampleSize, ~subset_size)]
@@ -41,7 +42,9 @@ def process_stats(stats: WCTensorStatistic, subset_size: int) -> tuple[Tensor, Tensor]:

# Prevent high memory and time consumption by sampling
if X_full.shape[sample_axis] > subset_size:
-lens = [reduce(mul, shape[:-1], 1) for shape in stats.shape_values]
+lens = [
+reduce(mul, shape[:act_ch_axis] + shape[act_ch_axis % len(shape) + 1 :], 1) for shape in stats.shape_values
+]
step = X_full.shape[sample_axis] // subset_size
idxs = [i[0] for i in sorted(enumerate(lens), key=lambda x: -x[1])][::step]
X = X_full[..., idxs]
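Note: to sanity-check the generalized `lens` expression above, here is a minimal standalone sketch; the shape `(2, 128, 768)` is an illustrative assumption, not taken from the PR.

```python
from functools import reduce
from operator import mul

def elements_outside_channel(shape: tuple[int, ...], act_ch_axis: int) -> int:
    # Product of all dimensions except the activation channel axis,
    # mirroring shape[:act_ch_axis] + shape[act_ch_axis % len(shape) + 1:].
    pos = act_ch_axis % len(shape)
    return reduce(mul, shape[:pos] + shape[pos + 1:], 1)

shape = (2, 128, 768)  # assumed (batch, sequence, hidden) activation shape
print(elements_outside_channel(shape, -1))  # 256 -- the old shape[:-1] behavior
print(elements_outside_channel(shape, 1))   # 1536 -- channel axis in the middle
```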
48 changes: 31 additions & 17 deletions src/nncf/quantization/algorithms/weight_compression/algorithm.py
@@ -959,9 +959,9 @@ def get_weight_compression_parameters(
# MoE operations are usually matmuls, so the check for matmul metatype is done
# This is to avoid raising the error for non-MoE cases with 3D weights.
parsed_ov_version = f"{ov_version[0]}.{ov_version[1]}.{ov_version[2]}-{ov_version[3]}"
msg = f"""NNCF compression algorithms do not support 3D weights with current version of
OpenVINO {parsed_ov_version} due to a known issue in statistics collection
Ticket - 176465. Please update to the latest OpenVINO nightly version.
msg = f"""NNCF compression algorithms do not support 3D weights with current version of
OpenVINO {parsed_ov_version} due to a known issue in statistics collection
Ticket - 176465. Please update to the latest OpenVINO nightly version.
Node with weight: {node.node_name}."""
raise nncf.UnsupportedModelError(msg)

@@ -1087,6 +1087,11 @@ def apply_with_parameters(
)

if self._lora_correction:
+for wc_params in all_weight_params:
+if self._backend_entity.matmul_has_transposed_activations(wc_params.node_with_weight, graph):
+msg = "Transposed activations are not supported yet for the LoRA correction algorithm"
+raise nncf.UnsupportedModelError(msg)
+
lora_correction_params = self._advanced_parameters.lora_correction_params
lora_correction_algo = LoraCorrectionAlgorithm(statistics, lora_correction_params)
description += " with correction of low-rank adapters"
@@ -1128,19 +1133,21 @@ def apply_with_parameters(
)
return transformed_model

-def _get_activation_node_and_port(self, node: NNCFNode, nncf_graph: NNCFGraph) -> tuple[NNCFNode, int]:
+def _get_activation_node_port_and_channel(self, node: NNCFNode, nncf_graph: NNCFGraph) -> tuple[NNCFNode, int, int]:
"""
-This method returns the activation layer and corresponding port id for the node.
+This method returns the activation layer, corresponding port id and channel axis for the given node.

:param node: NNCFGraph node for which the activation is sought.
:param nncf_graph: NNCFGraph instance with the node.
-:return: Tuple with the activation node and port id.
+:return: Tuple with the activation node, port id and channel axis.
"""
activation_port = self._backend_entity.get_activation_port_id(node, nncf_graph)
activation_edge = nncf_graph.get_input_edge_by_port_id(node, activation_port)
activation_node = activation_edge.from_node
-port_id = activation_edge.output_port_id
-return activation_node, port_id
+activation_channel_axis = self._backend_entity.get_activation_channel_axis(
+node, activation_edge.input_port_id, activation_edge.tensor_shape
+)
+return activation_node, activation_edge.output_port_id, activation_channel_axis

def get_matmul_input_to_output_nodes_map(
self, matmul_nodes: list[NNCFNode], graph: NNCFGraph
@@ -1161,8 +1168,8 @@ def get_matmul_input_to_output_nodes_map(
"""
matmul_input_to_output_nodes_map = defaultdict(list)
for node in matmul_nodes:
-act_node, output_port_id = self._get_activation_node_and_port(node, graph)
-matmul_input_to_output_nodes_map[(act_node, output_port_id)].append(node)
+act_node, output_port_id, act_channel_axis = self._get_activation_node_port_and_channel(node, graph)
+matmul_input_to_output_nodes_map[(act_node, output_port_id, act_channel_axis)].append(node)
return matmul_input_to_output_nodes_map

def get_compression_nodes_info(
@@ -1230,7 +1237,11 @@ def get_statistic_points(

# Statistics for data aware algorithms
if self._data_aware_compression:
-for (node, output_port_id), node_with_weights in matmul_input_to_output_nodes_map.items():
+for (
+node,
+output_port_id,
+input_channel_axis,
+), node_with_weights in matmul_input_to_output_nodes_map.items():
statistic_point = self._backend_entity.target_point(
TargetType.POST_LAYER_OPERATION, node.node_name, port_id=output_port_id
)
@@ -1245,13 +1256,16 @@
]
all_weight_dims.extend(weight_dims)

-# by default, reduce activations across all but the last dimension. The last dimension is
-# assumed to be the hidden size dimension.
+# Reduce activations across all but the hidden dimension.
n_dims = len(graph.get_output_edges_by_port_id(node, output_port_id)[0].tensor_shape)
-reduction_axes = tuple(range(n_dims - 1))
+# A negative axis (e.g. -1 for the last axis) is converted into the corresponding positive value
+input_channel_axis = input_channel_axis % n_dims
+reduction_axes = tuple(i for i in range(n_dims) if i != input_channel_axis)

-# For 3D weights, hidden dimension is the second dimension. Reduce by all other dimensions
-reduction_axes = (1,) if any(weight_dim == 3 for weight_dim in all_weight_dims) else reduction_axes
+# For 3D weights, keep the batch dimension
+if any(weight_dim == 3 for weight_dim in all_weight_dims):
+assert len(reduction_axes) == 2
+reduction_axes = reduction_axes[1:]

stat_collector = self._backend_entity.mean_statistic_collector(
reduction_axes=reduction_axes, subset_size=self._subset_size
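Note: a small sketch of what the new reduction-axes logic computes, under the assumption of a 3D activation; it reproduces both the default branch and the 3D-weight branch above.

```python
def reduction_axes_for(n_dims: int, input_channel_axis: int, has_3d_weight: bool) -> tuple[int, ...]:
    # Normalize a possibly negative channel axis, then reduce over every other axis.
    input_channel_axis %= n_dims
    axes = tuple(i for i in range(n_dims) if i != input_channel_axis)
    if has_3d_weight:
        # 3D weights: keep the batch dimension, as in the branch above.
        assert len(axes) == 2
        axes = axes[1:]
    return axes

print(reduction_axes_for(3, -1, False))  # (0, 1) -- same as the old tuple(range(n_dims - 1))
print(reduction_axes_for(3, -1, True))   # (1,)  -- batch axis 0 is preserved
```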
@@ -1291,7 +1305,7 @@ def _get_statistics_for_weights_compression(
# Where mean_value is a 1D tensor representing an activation reduced over batch and sequence length dimensions,
# shape is an original shape of an activation before reduction, n is the size of the dataset (or subset_size).
statistics = {}
-for (act_node, output_port_id), matmul_nodes in matmul_input_to_output_nodes_map.items():
+for (act_node, output_port_id, _), matmul_nodes in matmul_input_to_output_nodes_map.items():
tensor_collectors = list(
statistic_points.get_algo_statistics_for_node(
act_node.node_name,
30 changes: 23 additions & 7 deletions src/nncf/quantization/algorithms/weight_compression/awq.py
@@ -170,6 +170,8 @@ def apply(
weight_dtype = weight.dtype
weight = weight.astype(TensorDataType.float32)

+act_ch_axis, act_shape = self._get_activation_channel_axis_and_shape(graph, wp)
+
if is_data_free:
scale = self._data_free_step(weight, 1 - wp.reduction_axes[0])
else:
@@ -181,24 +183,28 @@
prev_weight = self._backend_entity.get_weight(merge_node, prev_weight_port_id, model, graph)

prev_statistics = statistics[merge_node.node_name]
-scale = self._data_aware_step(wp, weight, statistics[k], prev_weight, prev_statistics)
+scale = self._data_aware_step(wp, weight, statistics[k], act_ch_axis, prev_weight, prev_statistics)

w_scale = fns.unsqueeze(scale, 1 - wp.reduction_axes[0])
-a_scale = fns.unsqueeze(1.0 / scale, wp.reduction_axes[0])
+a_scale = 1.0 / scale

scaled_weight = (weight * w_scale).astype(weight_dtype)
self._backend_entity.set_weight(wp.node_with_weight, weight_port_id, model, graph, scaled_weight)

if is_mergeable: # for MatMul->Multiply->MatMul pattern the scale is merged to the first MatMul
for _, port_id in self._backend_entity.get_weight_names_and_port_ids(merge_node, graph):
merge_weight = self._backend_entity.get_weight(merge_node, port_id, model, graph)
+a_scale = fns.unsqueeze(a_scale, wp.reduction_axes[0])
merge_weight = (merge_weight * a_scale).astype(weight_dtype)
self._backend_entity.set_weight(merge_node, port_id, model, graph, merge_weight)
-a_scale = fns.transpose(a_scale)
else: # for Act->Multiply->MatMul and Act->MatMul patterns scale inserted after Act as extra node
-a_scale = fns.transpose(a_scale).astype(weight_dtype)
+# Calculate the activation scale shape
+a_scale_shape = [scale.shape[0] if axis == act_ch_axis else 1 for axis in range(len(act_shape))]
+a_scale = fns.reshape(a_scale, tuple(a_scale_shape))
+
next_nodes = graph.get_next_nodes(merge_node)
source_node_output_port = graph.get_output_edges(merge_node)[0].output_port_id
+
scale_insertion_command = self._backend_entity.scale_insertion_command(
merge_node, next_nodes, source_node_output_port, a_scale.data
)
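Note: the else-branch now builds a broadcast-ready shape for the inserted scale instead of transposing; a quick sketch with an assumed activation shape:

```python
# Assumed values for illustration: a per-channel scale of size 768 and an
# activation of shape (1, 128, 768) whose channel axis is already normalized.
scale_len = 768
act_shape = (1, 128, 768)
act_ch_axis = 2

a_scale_shape = [scale_len if axis == act_ch_axis else 1 for axis in range(len(act_shape))]
print(a_scale_shape)  # [1, 1, 768] -- broadcasts over the batch and sequence axes
```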
@@ -210,10 +216,10 @@

return transformed_model

-def _data_aware_step(self, wp, weight, statistics, prev_weight=None, prev_statistics=None):
+def _data_aware_step(self, wp, weight, statistics, act_ch_axis, prev_weight=None, prev_statistics=None):
alpha_step = (self._alpha_max - self._alpha_min) / self._steps
config = wp.compression_config
-s, X = process_stats(statistics, self._subset_size)
+s, X = process_stats(statistics, self._subset_size, act_ch_axis)
s = s.astype(TensorDataType.float32)
X = X.astype(TensorDataType.float32)

@@ -222,7 +228,7 @@ def _data_aware_step(self, wp, weight, statistics, prev_weight=None, prev_statistics=None):

prev_s, prev_w = None, None
if prev_statistics is not None and prev_weight is not None:
-prev_s, _ = process_stats(prev_statistics, self._subset_size)
+prev_s, _ = process_stats(prev_statistics, self._subset_size, act_ch_axis)
prev_s = prev_s.astype(TensorDataType.float32).max().item()
prev_w = fns.mean(fns.abs(prev_weight), axis=reduction_axis)

@@ -311,6 +317,16 @@ def _data_aware_step(self, wp, weight, statistics, prev_weight=None, prev_statistics=None):

return scale

+def _get_activation_channel_axis_and_shape(
+self, graph: NNCFGraph, wp: WeightCompressionParameters
+) -> tuple[int, tuple[int, ...]]:
+activation_port_id = self._backend_entity.get_activation_port_id(wp.node_with_weight, graph)
+act_shape = graph.get_input_edge_by_port_id(wp.node_with_weight, activation_port_id).tensor_shape
+act_ch_axis = self._backend_entity.get_activation_channel_axis(
+wp.node_with_weight, activation_port_id, act_shape
+)
+return act_ch_axis % len(act_shape), act_shape
+
@staticmethod
def _clamp_scale(magnitudes, threshold, scale, clamped_scale):
return fns.where(magnitudes < threshold, scale, clamped_scale)
23 changes: 23 additions & 0 deletions src/nncf/quantization/algorithms/weight_compression/backend.py
@@ -110,6 +110,17 @@ def get_weight(self, node_with_weight: NNCFNode, weight_port_id: int, model: TModel, graph: NNCFGraph) -> Tensor:
:return: The weight tensor.
"""

+@abstractmethod
+def matmul_has_transposed_activations(self, matmul: NNCFNode, graph: NNCFGraph) -> bool:
+"""
+Checks whether the activation input of a MatMul operation is transposed.
+
+:param matmul: MatMul NNCFGraph node.
+:param graph: The model graph associated with the model.
+:return: True if the node is a MatMul node and its activation input is transposed,
+False otherwise.
+"""
+
@abstractmethod
def get_weight_dtype(
self, node_with_weight: NNCFNode, weight_port_id: int, model: TModel, graph: NNCFGraph
@@ -273,6 +284,18 @@ def get_ignored_patterns() -> GraphPattern:
:return: backend-specific ignored patterns.
"""

+@staticmethod
+@abstractmethod
+def get_activation_channel_axis(node: NNCFNode, port_id: int, input_shape: tuple[int]) -> int:
+"""
+Returns the axis of the activation tensor that corresponds to its channel dimension.
+
+:param node: NNCFNode instance.
+:param port_id: Port ID for input.
+:param input_shape: Shape of the input.
+:return: Channel axis number.
+"""
+

class AWQAlgoBackend(WeightCompressionAlgoBackend):
@staticmethod
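Note: for backend authors picking up the two new abstract methods, a hedged sketch of a minimal implementation; the class and attribute names below are hypothetical, not part of this PR.

```python
class DummyBackend(WeightCompressionAlgoBackend):  # hypothetical backend
    # ... remaining abstract methods omitted for brevity

    def matmul_has_transposed_activations(self, matmul: NNCFNode, graph: NNCFGraph) -> bool:
        # Report True only when this matmul's activation input is transposed;
        # a backend with no such notion can simply return False.
        return bool(getattr(matmul.layer_attributes, "transpose_a", False))  # assumed attribute

    @staticmethod
    def get_activation_channel_axis(node: NNCFNode, port_id: int, input_shape: tuple[int]) -> int:
        # Many frameworks keep activation channels on the last axis.
        return -1
```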
5 changes: 5 additions & 0 deletions src/nncf/quantization/algorithms/weight_compression/gptq.py
@@ -124,6 +124,11 @@ def apply(
CompressWeightsMode.INT8_SYM,
]:
continue
+
+if self._backend_entity.matmul_has_transposed_activations(wc_params.node_with_weight, graph):
+msg = "Transposed activations are not supported yet for the GPTQ algorithm"
+raise nncf.UnsupportedModelError(msg)
+
_, input_tensors = next(iter(inputs.items()))
hessian = self._calculate_hessian(node, input_tensors)
scale, zero_point = self._quantize_weights(model, graph, wc_params, hessian, input_tensors)
Expand Down
@@ -279,7 +279,7 @@ def get_statistic_points(
self._set_backend_entity(model)

statistic_container = StatisticPointsContainer()
-for act_node, output_port_id in nodes_and_port_ids:
+for act_node, output_port_id, _ in nodes_and_port_ids:
n_dims = len(graph.get_output_edges_by_port_id(act_node, output_port_id)[0].tensor_shape)
if n_dims < 2:
msg = (
@@ -38,6 +38,7 @@
from nncf.onnx.graph.model_transformer import remove_initializer
from nncf.onnx.graph.model_transformer import remove_node
from nncf.onnx.graph.model_transformer import set_initializer
+from nncf.onnx.graph.node_utils import get_act_quantization_axis
from nncf.onnx.graph.node_utils import get_weight_quantization_axis
from nncf.onnx.graph.onnx_helper import ONNX_DTYPE_TO_NNCF_DTYPE
from nncf.onnx.graph.onnx_helper import get_name_to_node_map
@@ -186,6 +187,13 @@ def get_weight(
weight_tensor = get_tensor_value(model, weight_name)
return Tensor(weight_tensor)

+def matmul_has_transposed_activations(self, matmul: NNCFNode, graph: NNCFGraph) -> bool:
+if matmul.metatype != metatypes.ONNXGemmMetatype:
+return False
+act_port_id = self.get_activation_port_id(matmul, graph)
+trans_attr = "transB" if act_port_id else "transA"
+return matmul.layer_attributes.node_attrs[trans_attr]
+
def get_weight_dtype(
self, node_with_weight: NNCFNode, weight_port_id: int, model: onnx.ModelProto, graph: NNCFGraph
) -> TensorDataType:
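Note: the Gemm branch above keys off ONNX's transA/transB attributes (Gemm computes Y = alpha * A' * B' + beta * C, where the primes are controlled by those flags). A standalone sketch of the attribute lookup, with the node constructed purely for illustration:

```python
from onnx import helper

gemm = helper.make_node("Gemm", inputs=["A", "B"], outputs=["Y"], transA=1)
attrs = {a.name: a.i for a in gemm.attribute}

act_port_id = 0  # assume the activation feeds input A (port 0)
trans_attr = "transB" if act_port_id else "transA"
print(attrs.get(trans_attr, 0))  # -> 1: the activation input is transposed
```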
@@ -301,6 +309,10 @@ def filter_func(point: StatisticPoint) -> bool:

return filter_func

+@staticmethod
+def get_activation_channel_axis(node: NNCFNode, port_id: int, input_shape: tuple[int]) -> int:
+return get_act_quantization_axis(node, port_id)
+
def insert_adapters(
self, wc_params: WeightCompressionParameters, lora_A: Tensor, lora_B: Tensor, int8_lora: bool
) -> None:
@@ -503,9 +515,13 @@ def get_ignored_patterns() -> GraphPattern:
class ONNXAWQAlgoAlgoBackend(AWQAlgoBackend, ONNXWeightCompressionAlgoBackend):
@staticmethod
def get_awq_patterns() -> dict[str, Callable]:
-return get_awq_patterns(
-onnx_metatypes.ONNXMatMulMetatype, onnx_metatypes.ONNXMulLayerMetatype, ATOMIC_ACTIVATIONS_OPERATIONS
-)
+patterns = {}
+for mm_metatype in (onnx_metatypes.ONNXMatMulMetatype, onnx_metatypes.ONNXGemmMetatype):
+p = get_awq_patterns(mm_metatype, onnx_metatypes.ONNXMulLayerMetatype, ATOMIC_ACTIVATIONS_OPERATIONS)
+p = {f"{mm_metatype.__name__}_{k}": v for k, v in p.items()}
+patterns.update(p)
+
+return patterns

@staticmethod
def scale_insertion_command(
@@ -13,7 +13,6 @@
import openvino as ov
from openvino import opset13 as opset

-import nncf
from nncf.common.graph import NNCFGraph
from nncf.common.graph import NNCFNode
from nncf.common.graph.operator_metatypes import OperatorMetatype
@@ -35,6 +34,7 @@
from nncf.openvino.graph.node_utils import convert_op
from nncf.openvino.graph.node_utils import create_ov_codebook_subgraph
from nncf.openvino.graph.node_utils import create_ov_const_from_tensor
+from nncf.openvino.graph.node_utils import get_activation_channel_axis
from nncf.openvino.graph.node_utils import get_const_value_as_numpy_tensor
from nncf.openvino.graph.node_utils import get_const_value_as_ov_tensor
from nncf.openvino.graph.node_utils import get_weight_channel_axes
@@ -119,9 +119,6 @@ def mean_statistic_collector(

@staticmethod
def get_activation_port_id(node: NNCFNode, nncf_graph: NNCFGraph) -> int:
-if node.layer_attributes.input_attributes["transpose"]:
-msg = "Transposed input is not supported"
-raise nncf.UnsupportedModelError(msg)
constant_ports = node.layer_attributes.get_const_port_ids()
activation_ports = [
e.input_port_id for e in nncf_graph.get_input_edges(node) if e.input_port_id not in constant_ports
@@ -143,6 +140,11 @@ def get_weight(self, node_with_weight: NNCFNode, weight_port_id: int, model: ov.Model, graph: NNCFGraph) -> Tensor:
weight_tensor = get_const_value_as_numpy_tensor(weight_node)
return Tensor(weight_tensor)

+def matmul_has_transposed_activations(self, matmul: NNCFNode, graph: NNCFGraph) -> bool:
+if matmul.metatype != om.OVMatMulMetatype:
+return False
+return matmul.layer_attributes.input_attributes["transpose"]
+
def get_weight_dtype(
self, node_with_weight: NNCFNode, weight_port_id: int, model: ov.Model, graph: NNCFGraph
) -> TensorDataType:
@@ -378,6 +380,10 @@ def get_ignored_patterns() -> GraphPattern:
pattern.add_pattern_alternative(create_sam_pe())
return pattern

+@staticmethod
+def get_activation_channel_axis(node: NNCFNode, port_id: int, input_shape: tuple[int]) -> int:
+return get_activation_channel_axis(node, port_id, input_shape)
+

class OVTensorWeightCompressionAlgoBackend(OVWeightCompressionAlgoBackend):
"""
@@ -139,6 +139,10 @@ def apply(
continue
_, weight_port_id = weight_data[0]

+if self._backend_entity.matmul_has_transposed_activations(wp.node_with_weight, graph):
+msg = "Transposed activations are not supported yet for the Scale Estimation algorithm"
+raise nncf.UnsupportedModelError(msg)
+
weight = self._backend_entity.get_weight(wp.node_with_weight, weight_port_id, model, graph)

scale, zero_point = self.calculate_quantization_params(