Commit 12c9995
Author: Nikita Savelyev

[WC] Introduce flexible group size value search (#3556)
### Changes

Introduce flexible group size search logic as a part of the mixed precision algorithm. When enabled, each weight whose channel size is not divisible by the general group size value will be compressed with a newly calculated group size. The new group size value is the maximal power of two (i.e., 2^k) such that:

- the channel size is divisible by it;
- it is less than the originally specified group size value;
- it is greater than or equal to `min_flexible_group_size` (16 by default).

If no value satisfies these requirements, the weight is compressed to the backup precision. If ratio < 1.0 and some weights have to be compressed to the backup precision because of group size issues, these weights do not contribute to the ratio of the backup mode group. This method is disabled by default.

### Reason for changes

Some models have channel sizes that are not divisible by the default group size. In such cases a user can now provide the `nncf.AdvancedCompressionParameters(enable_flexible_group_size=True)` advanced parameter instead of an ignored scope. Example models:

- `microsoft/Phi-4-multimodal-instruct` (lm_model and vision_embeddings_model)
- `HuggingFaceH4/Qwen2.5-Math-1.5B-Instruct-PRM-0.2`

### Metrics

Results for phi4-multimodal are below.

| Language Model Precision | Vision Embed. Model Precision | WWB Similarity | Time of image-to-text request (sec.) | Time of audio-to-text request (sec.) |
|---|---|---|---|---|
| FP16 | FP16 | 99.19% | 31.21 | 17.76 |
| Mixed precision: int4 or bf16 | Mixed precision: int4 or bf16 | 77.51% | 22.37 | 10.93 |
| Mixed precision: int4 or int8 | Mixed precision: int4 or int8 | 79.03% | 19.95 | 9.47 |
| int4 with mixed group size: 128 or 64 | int4 with mixed group size: 128 or 16 | 81.36% | 19.89 | 9.16 |

The last row corresponds to `nncf.AdvancedCompressionParameters(enable_flexible_group_size=True)`. The third row corresponds to `nncf.AdvancedCompressionParameters(enable_flexible_group_size=True, min_flexible_group_size=128)`. The second row corresponds to `nncf.AdvancedCompressionParameters(enable_flexible_group_size=True, min_flexible_group_size=128)` with `backup_mode="none"`. The inference time results are as expected; the similarity results less so, but there is still no degradation for the group size 16 case.

### Related tickets

167337

### Tests

Added test cases which assert that the expected log messages are printed.

https://github.com/openvinotoolkit/nncf/actions/runs/15852358755
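The group size selection rule described above can be sketched as a small standalone function. This is an illustration only (the hypothetical `find_flexible_group_size` name and signature are not part of NNCF; the real logic lives inside the weight compression algorithm), assuming the specified group size is itself a power of two, as in the examples above:

```python
from typing import Optional


def find_flexible_group_size(
    channel_size: int, group_size: int, min_flexible_group_size: int = 16
) -> Optional[int]:
    """Return a usable group size for the weight, or None if it must fall back to backup precision."""
    if channel_size % group_size == 0:
        # The originally specified group size already fits.
        return group_size
    # Maximal power of two dividing channel_size (lowest-set-bit trick).
    flexible = channel_size & -channel_size
    # When group_size is a power of two and does not divide channel_size, this
    # divisor is necessarily smaller than group_size; reject it only if it is
    # below the allowed minimum.
    if flexible < min_flexible_group_size:
        return None
    return flexible


print(find_flexible_group_size(3072, 128))  # divisible: keeps 128
print(find_flexible_group_size(2880, 128))  # 2880 = 2^6 * 45 -> 64
print(find_flexible_group_size(3000, 128))  # 3000 = 2^3 * 375 -> 8 < 16 -> None
```

The third call shows the backup-precision case: the only power-of-two divisor of 3000 below 128 is 8, which is under the default `min_flexible_group_size` of 16.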
1 parent 71ae2c1 commit 12c9995

20 files changed: +408 −178 lines changed

src/nncf/quantization/advanced_parameters.py — 19 additions & 3 deletions

```diff
@@ -369,6 +369,22 @@ class AdvancedCompressionParameters:
     :param statistics_path: Directory path to dump statistics.
     :type statistics_path: str
+    :param lora_adapter_rank: Rank of lora adapters for FQ_LORA format. Defaults to 256.
+    :type lora_adapter_rank: int
+    :param enable_flexible_group_size: Whether to enable flexible group size searching. When enabled, each weight
+        for which the channel size is not divisible by the general group size value will be compressed to a newly
+        calculated group size. The new group size value is the maximal power of two (i.e., 2^k) such that:
+        - channel size is divisible by it;
+        - it is less than the originally specified group size value;
+        - it is greater than or equal to `min_flexible_group_size`.
+
+        If it's not possible to find a value satisfying these requirements, such weight is compressed to the backup
+        precision. If ratio < 1.0 and some weights have to be compressed to the backup precision because of group size
+        issues, then these weights won't contribute to the ratio of backup mode group.
+    :type enable_flexible_group_size: bool
+    :param min_flexible_group_size: Minimum group size for flexible group size searching. Defaults to 16. The reason
+        behind this argument is to avoid too small group size values, which may lead to performance issues.
+    :type min_flexible_group_size: int
     :param awq_params: Advanced parameters for AWQ algorithm.
     :type awq_params: AdvancedAWQParameters
     :param scale_estimation_params: Advanced parameters for Scale Estimation algorithm.
@@ -377,8 +393,6 @@
     :type gptq_params: AdvancedGPTQParameters
     :param lora_correction_params: Advanced parameters for Lora Correction algorithm.
     :type lora_correction_params: AdvancedLoraCorrectionParameters
-    :param lora_adapter_rank: Rank of lora adapters for FQ_LORA format. Defaults to 256.
-    :type lora_adapter_rank: int
     :param backend_params: Backend-specific parameters.
     :type backend_params: dict[str, Any]
     :param codebook: The codebook (LUT) for the weight compression.
@@ -387,13 +401,15 @@
     """

     statistics_path: Optional[str] = None
+    lora_adapter_rank: int = 256
+    enable_flexible_group_size: bool = False
+    min_flexible_group_size: int = 16
     awq_params: AdvancedAWQParameters = field(default_factory=AdvancedAWQParameters)
     scale_estimation_params: AdvancedScaleEstimationParameters = field(
         default_factory=AdvancedScaleEstimationParameters
     )
     gptq_params: AdvancedGPTQParameters = field(default_factory=AdvancedGPTQParameters)
     lora_correction_params: AdvancedLoraCorrectionParameters = field(default_factory=AdvancedLoraCorrectionParameters)
-    lora_adapter_rank: int = 256
     backend_params: dict[str, Any] = field(default_factory=dict)
     codebook: Optional[TTensor] = None
```
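With these new dataclass fields, opting in looks roughly like the following sketch (not runnable as-is: the model loading is omitted, and the mode/group size values are placeholders; the parameter names follow this commit and the public `nncf.compress_weights` API):

```python
import nncf

compressed = nncf.compress_weights(
    model,
    mode=nncf.CompressWeightsMode.INT4_ASYM,
    group_size=128,
    advanced_parameters=nncf.AdvancedCompressionParameters(
        enable_flexible_group_size=True,
        min_flexible_group_size=16,
    ),
)
```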

src/nncf/quantization/algorithms/weight_compression/algorithm.py — 102 additions & 12 deletions

```diff
@@ -45,6 +45,7 @@
 from nncf.quantization.algorithms.weight_compression.mixed_precision import MIXED_PRECISION_CRITERIA
 from nncf.quantization.algorithms.weight_compression.scale_estimation import ScaleEstimation
 from nncf.quantization.algorithms.weight_compression.weight_lowering import WeightCompressionConfig
+from nncf.quantization.algorithms.weight_compression.weight_lowering import get_reduction_channel_size
 from nncf.scopes import IgnoredScope
 from nncf.scopes import get_ignored_node_names_from_ignored_scope
 from nncf.tensor import Tensor
@@ -318,11 +319,13 @@ def __init__(
             advanced_parameters if advanced_parameters is not None else AdvancedCompressionParameters()
         )

-        primary_config = self._get_primary_config()
         criterion_cls = MIXED_PRECISION_CRITERIA.get(self._sensitivity_metric)
-        self._mixed_precision_algo = criterion_cls(primary_config, self._ratio, self._subset_size)
+        self._mixed_precision_algo = criterion_cls(self._ratio, self._subset_size)
         self._statistics_path = self._advanced_parameters.statistics_path

+        self._enable_flexible_group_size = self._advanced_parameters.enable_flexible_group_size
+        self._min_flexible_group_size = self._advanced_parameters.min_flexible_group_size
+
         if self._awq:
             awq_params = self._advanced_parameters.awq_params
             self.awq_algo = AWQ(
@@ -454,7 +457,7 @@ def _get_ratio_defining_params(

         return ratio_defining_params

-    def _get_primary_config(self):
+    def _get_primary_config(self, group_size: int) -> WeightCompressionConfig:
         codebook_values = None

         if self._mode == CompressWeightsMode.CB4_F8E4M3:
@@ -464,7 +467,7 @@

         return WeightCompressionConfig(
             mode=self._mode,
-            group_size=self._group_size,
+            group_size=group_size,
             codebook_values=codebook_values,
         )

@@ -474,6 +477,7 @@ def _set_weight_compression_config(
         model: TModel,
         graph: NNCFGraph,
         statistics_points: StatisticPointsContainer,
+        group_size_values: dict[str, int],
     ) -> None:
         """
         Sets the appropriate compression configuration for weights based on some criteria.
@@ -483,13 +487,92 @@
         :param model: The model.
         :param graph: The model graph associated with the model.
         :param statistics_points: Statistics points.
+        :param group_size_values: A dictionary mapping weight names to their group size values.
         """
-        primary_config = self._get_primary_config()
-        if self._ratio == 1:
-            for weight_param in ratio_defining_params:
-                weight_param.compression_config = primary_config
+        if self._ratio < 1 and len(ratio_defining_params) > 0:
+            primary_precision_weight_params = self._mixed_precision_algo.apply(
+                model, graph, statistics_points, weight_params=ratio_defining_params
+            )
         else:
-            self._mixed_precision_algo.apply(model, graph, statistics_points, weight_params=ratio_defining_params)
+            primary_precision_weight_params = ratio_defining_params
+
+        for weight_param in primary_precision_weight_params:
+            weight_param.compression_config = self._get_primary_config(group_size_values[weight_param.weight_name])
+
+        # Check if group size is valid for each weight in ratio_defining_params
+        failed_nodes = []
+        for w_params in ratio_defining_params:
+            if w_params.compression_config is None or w_params.compression_config.group_size == -1:
+                continue
+            reduction_channel_size, _ = get_reduction_channel_size(w_params.weight_shape, w_params.reduction_axes)
+            if reduction_channel_size % w_params.compression_config.group_size != 0:
+                failed_nodes.append((w_params.node_with_weight.node_name, reduction_channel_size))
+        if len(failed_nodes) > 0:
+            names = ",".join(f'"{name}"' for name, _ in failed_nodes)
+            msg = (
+                "Failed to apply group-wise quantization with "
+                f"group size value {self._group_size} and channel size value {failed_nodes[0][1]}.\n"
+                "Ensure that the channel size is divisible by the group size, "
+                "or include this node and others with similar issues in the ignored scope:\n"
+                f"nncf.compress_weight(\n\t..., \n\tignored_scope=IgnoredScope(names=[{names}]\n\t)\n)"
+            )
+            raise nncf.InvalidGroupSizeError(msg)
+
+    def _get_flexible_group_size_data(
+        self, weight_params: list[WeightCompressionParameters]
+    ) -> list[tuple[WeightCompressionParameters, int]]:
+        """
+        Compute flexible group size values.
+
+        :param weight_params: Weight parameters for which to compute flexible group size.
+        :return: A list of tuples, where each pair contains a WeightCompressionParameters object and the
+            group size value associated with it. If a group size can't be assigned to some weight parameter,
+            it won't be included in the result.
+        """
+        flexible_group_size_not_found_weight_params = []
+        group_size_data = []
+        for w_params in weight_params:
+            reduction_channel_size, _ = get_reduction_channel_size(w_params.weight_shape, w_params.reduction_axes)
+            if reduction_channel_size % self._group_size == 0:
+                # The weight can be compressed with the given group size, nothing else to do
+                group_size_data.append((w_params, self._group_size))
+                continue
+
+            # Find the maximal power of two that divides reduction_channel_size
+            flexible_group_size = reduction_channel_size & (~reduction_channel_size + 1)
+
+            if flexible_group_size < self._min_flexible_group_size:
+                flexible_group_size_not_found_weight_params.append(w_params)
+            else:
+                group_size_data.append((w_params, flexible_group_size))
+
+        node_strings = []
+        for w_params, new_group_size in group_size_data:
+            if new_group_size == self._group_size:
+                continue
+            node_strings.append(
+                f"{w_params.node_with_weight.node_name} "
+                f"(weight shape: {w_params.weight_shape}, adjusted group size: {new_group_size})"
+            )
+        if len(node_strings) > 0:
+            nncf_logger.info(
+                f"Wasn't able to set the specified group size value ({self._group_size}) to some nodes. These nodes "
+                f"will have an adjusted group size value:\n\t" + "\n\t".join(node_strings)
+            )
+
+        if len(flexible_group_size_not_found_weight_params) > 0:
+            node_strings = [
+                f"{w_params.node_with_weight.node_name} (weight shape: {w_params.weight_shape})"
+                for w_params in flexible_group_size_not_found_weight_params
+            ]
+            nncf_logger.warning(
+                "A large enough flexible group size value cannot be found for some nodes. They will be compressed "
+                "according to the backup mode. Nodes:\n\t" + "\n\t".join(node_strings)
+            )
+
+        return group_size_data

     @staticmethod
     def _proportion_str(num_weights_list: list[int], total_num_weights: int, total_num_params: int) -> str:
@@ -625,7 +708,6 @@ def apply(
                 if weight_dtype not in SUPPORTED_DATA_TYPES:
                     continue
                 weight_shape = self._backend_entity.get_weight_shape(node, weight_port_id, graph)
-                weight_size = reduce(operator.mul, weight_shape, 1)
                 reduction_axes = self._backend_entity.get_reduction_axes(node, weight_port_id, graph)
                 if (
                     self._group_size != -1
@@ -654,13 +736,21 @@
                     )
                     wc_config = WeightCompressionConfig(mode=mode)
                 weight_params = WeightCompressionParameters(
-                    weight_name, node, weight_port_id, weight_size, reduction_axes, wc_config
+                    weight_name, node, weight_port_id, weight_shape, reduction_axes, wc_config
                 )
                 all_weight_params.append(weight_params)
                 weight_names.add(weight_name)

         ratio_defining_params = self._get_ratio_defining_params(all_weight_params, is_last_layer_shared)
-        self._set_weight_compression_config(ratio_defining_params, model, graph, statistic_points)
+        if self._enable_flexible_group_size and self._group_size != -1:
+            # Compute flexible group size values if enabled
+            flexible_group_size_data = self._get_flexible_group_size_data(ratio_defining_params)
+            group_size_values = {w_param.weight_name: group_size for w_param, group_size in flexible_group_size_data}
+            # Select the subset of ratio_defining_params that can be compressed with some group size
+            ratio_defining_params = [w_param for w_param, _ in flexible_group_size_data]
+        else:
+            group_size_values = {w_param.weight_name: self._group_size for w_param in ratio_defining_params}
+        self._set_weight_compression_config(ratio_defining_params, model, graph, statistic_points, group_size_values)
         ignored_scope_weight_statistics = self._get_ignored_scope_weight_statistics(model, graph)
         nncf_logger.info(
             self._get_bitwidth_distribution_str(
```
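The expression `reduction_channel_size & (~reduction_channel_size + 1)` in `_get_flexible_group_size_data` is the classic two's-complement trick for isolating the lowest set bit of an integer, which equals the largest power of two dividing it. A quick standalone check (illustration only, not NNCF code):

```python
def largest_power_of_two_divisor(n: int) -> int:
    # ~n + 1 equals -n in two's complement, so n & (~n + 1) == n & -n:
    # all bits below the lowest set bit cancel out, leaving exactly that bit.
    return n & (~n + 1)


for n in (2880, 3000, 4096, 7):
    d = largest_power_of_two_divisor(n)
    # d divides n, and the quotient is odd, so no larger power of two divides n.
    assert n % d == 0 and (n // d) % 2 == 1
    print(n, "->", d)
```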

src/nncf/quantization/algorithms/weight_compression/config.py — 9 additions & 6 deletions

```diff
@@ -8,8 +8,10 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import operator
 from dataclasses import dataclass
 from dataclasses import field
+from functools import reduce
 from typing import Optional, TypeVar

 import numpy as np
@@ -86,19 +88,20 @@ class WeightCompressionParameters:
     :param weight_name: Unique weight name.
     :param node_with_weight: Node with weight in the NNCF graph.
     :param weight_port_id: Port ID of the weight in the node.
-    :param num_weights: Number of elements in the weight array.
+    :param weight_shape: Shape of the weight array.
     :param reduction_axes: Axes, along which to reduce (collect) different statistics (e.g. min, max).
     :param compression_config: Configuration of weight compression for the weight node.
     """

     weight_name: str
     node_with_weight: NNCFNode
     weight_port_id: int
-    num_weights: np.uint64
+    weight_shape: tuple[int, ...]
     reduction_axes: tuple[int, ...]
     compression_config: Optional[WeightCompressionConfig] = field(default_factory=WeightCompressionConfig)

-    def __post_init__(self):
-        # Explicitly cast num_weights to avoid overflow on finding total number of weights.
-        # The issue happens on Windows, because np.ndarray.size() returns np.int32 and sum of weights is more than 2^32.
-        self.num_weights = np.uint64(self.num_weights)
+    @property
+    def num_weights(self) -> np.uint64:
+        if not hasattr(self, "_num_weights"):
+            self._num_weights = np.uint64(reduce(operator.mul, self.weight_shape, 1))
+        return self._num_weights
```
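The refactor above replaces the stored `num_weights` field with a lazily computed property derived from `weight_shape`. A minimal standalone sketch of the same pattern (a simplified hypothetical dataclass, not the NNCF class; plain Python ints are arbitrary-precision, so the sketch skips the `np.uint64` cast the NNCF version uses against Windows `np.int32` overflow):

```python
import operator
from dataclasses import dataclass
from functools import reduce


@dataclass
class WeightParams:
    weight_shape: tuple[int, ...]

    @property
    def num_weights(self) -> int:
        # Computed from the shape on first access, then cached on the instance.
        if not hasattr(self, "_num_weights"):
            self._num_weights = reduce(operator.mul, self.weight_shape, 1)
        return self._num_weights


p = WeightParams((4096, 11008))
print(p.num_weights)  # 45088768
```

Since only the shape is stored, constructing the parameters no longer needs the caller to precompute the element count, and the shape stays available for the group size divisibility checks.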

src/nncf/quantization/algorithms/weight_compression/handle_errors.py — 0 additions & 32 deletions

This file was deleted.

src/nncf/quantization/algorithms/weight_compression/mixed_precision.py — 8 additions & 7 deletions

```diff
@@ -41,18 +41,16 @@

 class MixedPrecisionCriterion(Algorithm):
     """
-    Assigns mixed quantization scheme (e.g. uniform int8 or uniform int4/non-uniform fp4)
+    Computes mixed quantization scheme (e.g. uniform int8 or uniform int4/non-uniform fp4)
     for weights based on some criteria.
     """

-    def __init__(self, primary_config: WeightCompressionConfig, ratio: float, subset_size: Optional[int] = None):
+    def __init__(self, ratio: float, subset_size: Optional[int] = None):
         """
-        :param primary_config: Configuration on how to compress (quantize) weights to primary precision.
         :param ratio: The ratio between primary and backup precisions (e.g. 0.9 means 90% of layers quantized to NF4
             and the rest to INT8_ASYM).
         :param subset_size: Size of dataset subset for statistics.
         """
-        self._primary_config = primary_config
         self._ratio = ratio
         self._subset_size = subset_size
         self._algorithm_key = f"MPC_{hash(self)}"
@@ -79,15 +77,17 @@ def apply(
         statistic_points: Optional[StatisticPointsContainer] = None,
         dataset: Optional[Dataset] = None,
         weight_params: list[WeightCompressionParameters] = None,
-    ) -> None:
+    ) -> list[WeightCompressionParameters]:
         """
-        Assigns quantization precision based on computed layers' sensitivities, ratio of parameters.
+        Selects which weights should be compressed to a primary (4 bit) precision based on computed layers'
+        sensitivities, ratio of parameters.
         """
         self._set_backend_entity(model)

         scores = self._calc_sensitivity(model, graph, weight_params, statistic_points)
         num_all_weights = sum(wp.num_weights for wp in weight_params)

+        primary_precision_weight_params = []
         indexes_of_layers_in_ascending_order_of_scores = [
             i[0] for i in sorted(enumerate(scores), reverse=False, key=lambda x: x[1])
         ]
@@ -97,8 +97,9 @@ def apply(
             current_ratio = (num_weights_in_4bit + weight_param.num_weights) / num_all_weights
             if current_ratio >= self._ratio:
                 break
-            weight_param.compression_config = self._primary_config
+            primary_precision_weight_params.append(weight_param)
             num_weights_in_4bit += weight_param.num_weights
+        return primary_precision_weight_params

     @abstractmethod
     def _set_backend_entity(self, model: TModel) -> None:
```
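The changed `apply` loop selects primary-precision weights by walking layers in ascending order of sensitivity and accumulating parameters until the requested ratio is reached. A toy standalone sketch of that selection (hypothetical names, heavily simplified from the diff):

```python
def select_primary_precision(weights, ratio):
    """weights: list of (name, num_weights, sensitivity_score) tuples.

    Returns the names kept in primary (4-bit) precision."""
    num_all_weights = sum(n for _, n, _ in weights)
    selected = []
    num_weights_in_4bit = 0
    # Least sensitive layers are moved to the primary precision first.
    for name, num_weights, _ in sorted(weights, key=lambda w: w[2]):
        current_ratio = (num_weights_in_4bit + num_weights) / num_all_weights
        if current_ratio >= ratio:
            break
        selected.append(name)
        num_weights_in_4bit += num_weights
    return selected


layers = [("a", 100, 0.1), ("b", 100, 0.9), ("c", 100, 0.2), ("d", 100, 0.5)]
print(select_primary_precision(layers, 0.6))  # ['a', 'c']
```

Note the strict `>=` break mirrors the diff: at ratio 1.0 the loop would select nothing, which is why `algorithm.py` only calls the mixed precision criterion when `self._ratio < 1` and assigns all weights to the primary precision otherwise.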
