22 changes: 19 additions & 3 deletions src/nncf/quantization/advanced_parameters.py
@@ -369,6 +369,22 @@ class AdvancedCompressionParameters:

:param statistics_path: Directory path to dump statistics.
:type statistics_path: str
:param lora_adapter_rank: Rank of lora adapters for FQ_LORA format. Defaults to 256.
:type lora_adapter_rank: int
:param enable_flexible_group_size: Whether to enable flexible group size search. When enabled, each weight
whose channel size is not divisible by the configured group size value is compressed with a newly
calculated group size. The new group size is the maximal power of two (i.e., 2^k) such that:
- the channel size is divisible by it;
- it is less than the originally specified group size value;
- it is greater than or equal to `min_flexible_group_size`.

If no value satisfying these requirements can be found, the weight is compressed in the backup
precision. If ratio < 1.0 and some weights fall back to the backup precision because of group size
issues, these weights do not count toward the backup-mode share implied by the ratio.
:type enable_flexible_group_size: bool
:param min_flexible_group_size: Minimum group size allowed during flexible group size search. Defaults to 16.
This lower bound prevents excessively small group size values, which may lead to performance issues.
:type min_flexible_group_size: int
:param awq_params: Advanced parameters for AWQ algorithm.
:type awq_params: AdvancedAWQParameters
:param scale_estimation_params: Advanced parameters for Scale Estimation algorithm.
@@ -377,8 +393,6 @@ class AdvancedCompressionParameters:
:type gptq_params: AdvancedGPTQParameters
:param lora_correction_params: Advanced parameters for Lora Correction algorithm.
:type lora_correction_params: AdvancedLoraCorrectionParameters
:param lora_adapter_rank: Rank of lora adapters for FQ_LORA format. Defaults to 256.
:type lora_adapter_rank: int
:param backend_params: Backend-specific parameters.
:type backend_params: dict[str, Any]
:param codebook: The codebook (LUT) for the weight compression.
@@ -387,13 +401,15 @@
"""

statistics_path: Optional[str] = None
lora_adapter_rank: int = 256
enable_flexible_group_size: bool = False
min_flexible_group_size: int = 16
awq_params: AdvancedAWQParameters = field(default_factory=AdvancedAWQParameters)
scale_estimation_params: AdvancedScaleEstimationParameters = field(
default_factory=AdvancedScaleEstimationParameters
)
gptq_params: AdvancedGPTQParameters = field(default_factory=AdvancedGPTQParameters)
lora_correction_params: AdvancedLoraCorrectionParameters = field(default_factory=AdvancedLoraCorrectionParameters)
lora_adapter_rank: int = 256
backend_params: dict[str, Any] = field(default_factory=dict)
codebook: Optional[TTensor] = None

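For reference, a minimal usage sketch of the new parameters. This is an editorial illustration, not part of the diff: it assumes the public nncf.compress_weights entry point and an already-loaded supported model; the mode, ratio, and group size values are arbitrary examples.

import nncf
from nncf import CompressWeightsMode
from nncf.quantization.advanced_parameters import AdvancedCompressionParameters

# Weights whose channel size is not divisible by group_size=128 get the largest
# power of two that divides the channel size, as long as it is >= 32; otherwise
# they fall back to the backup precision.
compressed_model = nncf.compress_weights(
    model,
    mode=CompressWeightsMode.INT4_ASYM,
    ratio=0.9,
    group_size=128,
    advanced_parameters=AdvancedCompressionParameters(
        enable_flexible_group_size=True,
        min_flexible_group_size=32,
    ),
)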
114 changes: 102 additions & 12 deletions src/nncf/quantization/algorithms/weight_compression/algorithm.py
@@ -45,6 +45,7 @@
from nncf.quantization.algorithms.weight_compression.mixed_precision import MIXED_PRECISION_CRITERIA
from nncf.quantization.algorithms.weight_compression.scale_estimation import ScaleEstimation
from nncf.quantization.algorithms.weight_compression.weight_lowering import WeightCompressionConfig
from nncf.quantization.algorithms.weight_compression.weight_lowering import get_reduction_channel_size
from nncf.scopes import IgnoredScope
from nncf.scopes import get_ignored_node_names_from_ignored_scope
from nncf.tensor import Tensor
@@ -318,11 +319,13 @@ def __init__(
advanced_parameters if advanced_parameters is not None else AdvancedCompressionParameters()
)

primary_config = self._get_primary_config()
criterion_cls = MIXED_PRECISION_CRITERIA.get(self._sensitivity_metric)
self._mixed_precision_algo = criterion_cls(primary_config, self._ratio, self._subset_size)
self._mixed_precision_algo = criterion_cls(self._ratio, self._subset_size)
self._statistics_path = self._advanced_parameters.statistics_path

self._enable_flexible_group_size = self._advanced_parameters.enable_flexible_group_size
self._min_flexible_group_size = self._advanced_parameters.min_flexible_group_size

if self._awq:
awq_params = self._advanced_parameters.awq_params
self.awq_algo = AWQ(
@@ -454,7 +457,7 @@ def _get_ratio_defining_params(

return ratio_defining_params

def _get_primary_config(self):
def _get_primary_config(self, group_size: int) -> WeightCompressionConfig:
codebook_values = None

if self._mode == CompressWeightsMode.CB4_F8E4M3:
@@ -464,7 +467,7 @@ def _get_primary_config(self):

return WeightCompressionConfig(
mode=self._mode,
group_size=self._group_size,
group_size=group_size,
codebook_values=codebook_values,
)

@@ -474,6 +477,7 @@ def _set_weight_compression_config(
model: TModel,
graph: NNCFGraph,
statistics_points: StatisticPointsContainer,
group_size_values: dict[str, int],
) -> None:
"""
Sets the appropriate compression configuration for weights based on some criteria.
@@ -483,13 +487,92 @@
:param model: The model.
:param graph: The model graph associated with the model.
:param statistics_points: Statistics points.
:param group_size_values: A dictionary mapping weight names to their group size values.
"""
primary_config = self._get_primary_config()
if self._ratio == 1:
for weight_param in ratio_defining_params:
weight_param.compression_config = primary_config
if self._ratio < 1 and len(ratio_defining_params) > 0:
primary_precision_weight_params = self._mixed_precision_algo.apply(
model, graph, statistics_points, weight_params=ratio_defining_params
)
else:
self._mixed_precision_algo.apply(model, graph, statistics_points, weight_params=ratio_defining_params)
primary_precision_weight_params = ratio_defining_params

for weight_param in primary_precision_weight_params:
weight_param.compression_config = self._get_primary_config(group_size_values[weight_param.weight_name])

# Check if group size is valid for each weight in ratio_defining_params
failed_nodes = []
for w_params in ratio_defining_params:
if w_params.compression_config is None or w_params.compression_config.group_size == -1:
continue
reduction_channel_size, _ = get_reduction_channel_size(w_params.weight_shape, w_params.reduction_axes)
if reduction_channel_size % w_params.compression_config.group_size != 0:
failed_nodes.append((w_params.node_with_weight.node_name, reduction_channel_size))
if len(failed_nodes) > 0:
names = ",".join(f'"{name}"' for name, _ in failed_nodes)
msg = (
"Failed to apply group-wise quantization with "
f"group size value {self._group_size} and channel size value {failed_nodes[0][1]}.\n"
"Ensure that the group size is divisible by the channel size, "
"or include this node and others with similar issues in the ignored scope:\n"
f"nncf.compress_weight(\n\t..., \n\tignored_scope=IgnoredScope(names=[{names}]\n\t)\n)"
)
raise nncf.InvalidGroupSizeError(msg)
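# Editorial illustration (not part of this PR): a worked example of when the
# check above raises. A weight of shape (4096, 1280) reduced along axis 1 has
# reduction channel size 1280; with group_size=512 the divisibility check fails:
channel_size, group_size = 1280, 512
assert channel_size % group_size != 0  # 1280 = 2 * 512 + 256 -> InvalidGroupSizeError
# With enable_flexible_group_size=True this weight would instead get the
# largest power of two dividing 1280, i.e. 256 (1280 = 2**8 * 5).
assert channel_size & -channel_size == 256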

def _get_flexible_group_size_data(
self, weight_params: list[WeightCompressionParameters]
) -> list[tuple[WeightCompressionParameters, int]]:
"""
Compute flexible group size values.

:param weight_params: Weight parameters for which to compute flexible group sizes.
:return: A list of tuples, where each tuple contains a WeightCompressionParameters object and the
group size value associated with it. If no group size can be assigned to a weight parameter, it is
not included in the result.
"""
flexible_group_size_not_found_weight_params = []
group_size_data = []
for w_params in weight_params:
reduction_channel_size, _ = get_reduction_channel_size(w_params.weight_shape, w_params.reduction_axes)
if reduction_channel_size % self._group_size == 0:
# The weight can be compressed with the given group size, nothing else to do
group_size_data.append((w_params, self._group_size))
continue

# Find the maximal power of two that divides reduction_channel_size
flexible_group_size = reduction_channel_size & (~reduction_channel_size + 1)

if flexible_group_size < self._min_flexible_group_size:
flexible_group_size_not_found_weight_params.append(w_params)
else:
group_size_data.append((w_params, flexible_group_size))

node_strings = []
for w_params, new_group_size in group_size_data:
if new_group_size == self._group_size:
continue
weight_shape = w_params.weight_shape
node_strings.append(
f"{w_params.node_with_weight.node_name} "
f"(weight shape: {weight_shape}, adjusted group size: {new_group_size})"
)
if len(node_strings) > 0:
nncf_logger.info(
f"Wasn't able to set the specified group size value ({self._group_size}) to some nodes. These nodes "
f"will have an adjusted group size value:\n\t" + "\n\t".join(node_strings)
)

if len(flexible_group_size_not_found_weight_params) > 0:
node_strings = [""] * len(flexible_group_size_not_found_weight_params)
for i, w_params in enumerate(flexible_group_size_not_found_weight_params):
weight_shape = w_params.weight_shape
reduction_channel_size, _ = get_reduction_channel_size(weight_shape, w_params.reduction_axes)
node_strings[i] = f"{w_params.node_with_weight.node_name} (weight shape: {weight_shape})"
nncf_logger.warning(
"Large enough flexible group size value cannot be found for some nodes. They will be compressed "
"according to the backup mode. Nodes:\n\t" + "\n\t".join(node_strings)
)

return group_size_data
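# Editorial sketch (not part of this PR): `n & (~n + 1)` used above equals
# `n & -n` in two's complement and isolates the lowest set bit of n, which for
# a positive integer is exactly the largest power of two dividing n. A
# pure-Python equivalent for clarity:
def largest_power_of_two_divisor(n: int) -> int:
    # Strip factors of two until n is odd; the stripped product is the answer.
    power = 1
    while n % 2 == 0:
        n //= 2
        power *= 2
    return power

assert 1280 & (~1280 + 1) == 256
assert largest_power_of_two_divisor(1280) == 256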

@staticmethod
def _proportion_str(num_weights_list: list[int], total_num_weights: int, total_num_params: int) -> str:
@@ -625,7 +708,6 @@ def apply(
if weight_dtype not in SUPPORTED_DATA_TYPES:
continue
weight_shape = self._backend_entity.get_weight_shape(node, weight_port_id, graph)
weight_size = reduce(operator.mul, weight_shape, 1)
reduction_axes = self._backend_entity.get_reduction_axes(node, weight_port_id, graph)
if (
self._group_size != -1
@@ -654,13 +736,21 @@
)
wc_config = WeightCompressionConfig(mode=mode)
weight_params = WeightCompressionParameters(
weight_name, node, weight_port_id, weight_size, reduction_axes, wc_config
weight_name, node, weight_port_id, weight_shape, reduction_axes, wc_config
)
all_weight_params.append(weight_params)
weight_names.add(weight_name)

ratio_defining_params = self._get_ratio_defining_params(all_weight_params, is_last_layer_shared)
self._set_weight_compression_config(ratio_defining_params, model, graph, statistic_points)
if self._enable_flexible_group_size and self._group_size != -1:
# Compute flexible group size values if enabled
flexible_group_size_data = self._get_flexible_group_size_data(ratio_defining_params)
group_size_values = {w_param.weight_name: group_size for w_param, group_size in flexible_group_size_data}
# Select a subset of ratio_defining_params that can be compressed with some group size
ratio_defining_params = [w_param for w_param, _ in flexible_group_size_data]
else:
group_size_values = {w_param.weight_name: self._group_size for w_param in ratio_defining_params}
self._set_weight_compression_config(ratio_defining_params, model, graph, statistic_points, group_size_values)
ignored_scope_weight_statistics = self._get_ignored_scope_weight_statistics(model, graph)
nncf_logger.info(
self._get_bitwidth_distribution_str(
15 changes: 9 additions & 6 deletions src/nncf/quantization/algorithms/weight_compression/config.py
@@ -8,8 +8,10 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import operator
from dataclasses import dataclass
from dataclasses import field
from functools import reduce
from typing import Optional, TypeVar

import numpy as np
@@ -86,19 +88,20 @@ class WeightCompressionParameters:
:param weight_name: Unique weight name.
:param node_with_weight: Node with weight in the NNCF graph.
:param weight_port_id: ID of the weight port.
:param num_weights: Number of elements in the weight array.
:param weight_shape: Shape of the weight array.
:param reduction_axes: Axes, along which to reduce (collect) different statistics (e.g. min, max).
:param compression_config: Configuration of weight compression for the weight node.
"""

weight_name: str
node_with_weight: NNCFNode
weight_port_id: int
num_weights: np.uint64
weight_shape: tuple[int, ...]
reduction_axes: tuple[int, ...]
compression_config: Optional[WeightCompressionConfig] = field(default_factory=WeightCompressionConfig)

def __post_init__(self):
# Explicitly cast num_weights to avoid overflow on finding total number of weights.
# The issue happens on Windows, because np.ndarray.size() returns np.int32 and sum of weights is more than 2^32.
self.num_weights = np.uint64(self.num_weights)
@property
def num_weights(self) -> np.uint64:
if not hasattr(self, "_num_weights"):
# Cast to np.uint64 so that summing per-weight counts across layers cannot overflow
# 32-bit integers (np.ndarray.size returns np.int32 on Windows, and the total can exceed 2^32).
self._num_weights = np.uint64(reduce(operator.mul, self.weight_shape, 1))
return self._num_weights
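The property above lazily computes and caches the element count from the shape. A standalone sketch of the same pattern follows; hedged: this illustration uses functools.cached_property and math.prod rather than the hasattr-based caching and reduce call in the diff, and the class name is hypothetical.

import math
from functools import cached_property

import numpy as np

class WeightInfo:
    def __init__(self, weight_shape: tuple[int, ...]):
        self.weight_shape = weight_shape

    @cached_property
    def num_weights(self) -> np.uint64:
        # np.uint64 keeps later sums of per-weight counts from overflowing
        # 32-bit integer types.
        return np.uint64(math.prod(self.weight_shape))

print(WeightInfo((4096, 4096)).num_weights)  # 16777216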

This file was deleted.

src/nncf/quantization/algorithms/weight_compression/mixed_precision.py
@@ -41,18 +41,16 @@

class MixedPrecisionCriterion(Algorithm):
"""
Assigns mixed quantization scheme (e.g. uniform int8 or uniform int4/non-uniform fp4)
Computes mixed quantization scheme (e.g. uniform int8 or uniform int4/non-uniform fp4)
for weights based on some criteria.
"""

def __init__(self, primary_config: WeightCompressionConfig, ratio: float, subset_size: Optional[int] = None):
def __init__(self, ratio: float, subset_size: Optional[int] = None):
"""
:param primary_config: Configuration on how to compress (quantize) weights to primary precision.
:param ratio: The ratio between primary and backup precisions (e.g. 0.9 means 90% of layers quantized to NF4
and the rest to INT8_ASYM).
:param subset_size: Size of dataset subset for statistics.
"""
self._primary_config = primary_config
self._ratio = ratio
self._subset_size = subset_size
self._algorithm_key = f"MPC_{hash(self)}"
@@ -79,15 +77,17 @@ def apply(
statistic_points: Optional[StatisticPointsContainer] = None,
dataset: Optional[Dataset] = None,
weight_params: list[WeightCompressionParameters] = None,
) -> None:
) -> list[WeightCompressionParameters]:
"""
Assigns quantization precision based on computed layers' sensitivities, ratio of parameters.
Selects which weights should be compressed to the primary (4-bit) precision based on the computed
layer sensitivities and the ratio of parameters.

:return: Weight compression parameters selected for the primary precision.
"""
self._set_backend_entity(model)

scores = self._calc_sensitivity(model, graph, weight_params, statistic_points)
num_all_weights = sum(wp.num_weights for wp in weight_params)

primary_precision_weight_params = []
indexes_of_layers_in_ascending_order_of_scores = [
i[0] for i in sorted(enumerate(scores), reverse=False, key=lambda x: x[1])
]
@@ -97,8 +97,9 @@
current_ratio = (num_weights_in_4bit + weight_param.num_weights) / num_all_weights
if current_ratio >= self._ratio:
break
weight_param.compression_config = self._primary_config
primary_precision_weight_params.append(weight_param)
num_weights_in_4bit += weight_param.num_weights
return primary_precision_weight_params
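# Editorial sketch (not part of this PR): the selection loop above in
# isolation. Weights are visited from least to most sensitive, and each is
# assigned to the primary precision until the running parameter share would
# reach `ratio`.
def select_primary(scores: list[float], num_weights: list[int], ratio: float) -> list[int]:
    total = sum(num_weights)
    order = sorted(range(len(scores)), key=lambda i: scores[i])
    selected, num_in_4bit = [], 0
    for i in order:
        if (num_in_4bit + num_weights[i]) / total >= ratio:
            break
        selected.append(i)
        num_in_4bit += num_weights[i]
    return selected

# Layers 0 and 2 are least sensitive; adding layer 1 would push the share to 1.0 >= 0.7.
assert select_primary([0.1, 0.9, 0.5], [100, 100, 100], ratio=0.7) == [0, 2]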

@abstractmethod
def _set_backend_entity(self, model: TModel) -> None: