Skip to content
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,12 @@
from nncf.tensor import functions as fns


def process_stats(stats: WCTensorStatistic, subset_size: int, act_ch_axis: int = -1) -> tuple[Tensor, Tensor]:
def process_stats(
stats: WCTensorStatistic,
subset_size: int,
act_ch_axis: int = -1,
transpose_a: bool = False,
) -> tuple[Tensor, Tensor]:
"""
A function for processing activations. Shared between AWQ, Scale Estimation and LoRA Correction algorithms.

Expand All @@ -37,6 +42,11 @@ def process_stats(stats: WCTensorStatistic, subset_size: int, act_ch_axis: int =
axes = list(range(1, len(X.shape))) + [0]
X_full = fns.transpose(X, axes=axes)

if transpose_a:
axes = list(range(len(X_full.shape)))
axes[-1], axes[-2] = axes[-2], axes[-1]
X_full = fns.transpose(X_full, axes=axes)

# The sample dimension is always the last axis after transpose
sample_axis = -1

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,7 @@ def calculate_adapters(
layer_name = wc_params.node_with_weight.node_name
layer_statistics = self._statistics[layer_name]
is_debug = self._debug_interface is not None
transpose_a_flag = getattr(wc_params.node_with_weight, "transpose_a", False)
lora_A, lora_B, mean_noises = self.calculate_low_rank_matrices(
weight,
compressed_weight,
Expand All @@ -129,6 +130,7 @@ def calculate_adapters(
self._lora_correction_params,
layer_statistics,
is_debug,
transpose_a=transpose_a_flag,
)
if is_debug:
self._debug_interface.add_noises(layer_name, mean_noises)
Expand All @@ -143,6 +145,7 @@ def calculate_low_rank_matrices(
lora_correction_params: AdvancedLoraCorrectionParameters,
layer_statistics: WCTensorStatistic,
is_debug: Optional[bool] = False,
transpose_a: bool = False,
):
"""
Calculates low rank matrices for a given original and compressed weights.
Expand Down Expand Up @@ -170,7 +173,15 @@ def calculate_low_rank_matrices(
)
mode = compression_config.mode
assert len(reduction_axes) == 1, "Assumed a single reduction axis"
reduction_axis = reduction_axes[0] if compression_config.group_size != -1 else -1

if compression_config.group_size != -1:
reduction_axis = reduction_axes[0]
else:
reduction_axis = -1

if transpose_a and reduction_axis != -1:
reduction_axis = 1

if mode in (CompressWeightsMode.INT4_SYM, CompressWeightsMode.INT4_ASYM):
fq_weights = do_integer_dequantization(
compressed_weight.tensor,
Expand All @@ -192,9 +203,8 @@ def calculate_low_rank_matrices(
# reduction axes is all axes except output dimension in linear/conv layers.
if reduction_axes[0] == 1:
svd_residual = fns.transpose(svd_residual)
residual = svd_residual.clone() # [H, O]

s, X = process_stats(layer_statistics, subset_size) # [H], [H, SS]
residual = fns.transpose(svd_residual) if transpose_a else svd_residual # [H, O] or [O, H]
s, X = process_stats(layer_statistics, subset_size, act_ch_axis=-1, transpose_a=transpose_a)
X = fns.transpose(X) # [SS, H]
if compression_config.group_size > 0:
# Multiply residual of weights by maximum channel magnitude of activations normalized per quantization
Expand Down
78 changes: 78 additions & 0 deletions tests/openvino/native/quantization/test_weights_compression.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
from nncf import SensitivityMetric
from nncf.common.factory import build_graph
from nncf.common.tensor_statistics.collectors import AggregatorBase
from nncf.common.tensor_statistics.statistics import WCTensorStatistic
from nncf.common.utils.debug import nncf_debug
from nncf.common.utils.helpers import set_env_variable
from nncf.data.dataset import Dataset
Expand All @@ -42,6 +43,7 @@
from nncf.quantization.advanced_parameters import AdvancedGPTQParameters as GPTQParams
from nncf.quantization.advanced_parameters import AdvancedLoraCorrectionParameters as LoraParams
from nncf.quantization.advanced_parameters import GroupSizeFallbackMode
from nncf.quantization.algorithms.weight_compression.activation_stats import process_stats
from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig
from nncf.quantization.algorithms.weight_compression.config import WeightCompressionParameters
from nncf.quantization.algorithms.weight_compression.mixed_precision import MIXED_PRECISION_CRITERIA
Expand Down Expand Up @@ -2574,3 +2576,79 @@ def test_awq_scale_ref() -> list[dict[str, Tensor]]:
@pytest.fixture
def transpose_a_supported(self) -> bool:
    """Fixture flag: this test class exercises backends where transposed
    activations (``transpose_a``) are supported, so it always returns True."""
    return True


def test_process_stats_with_transpose_a_changes_layout():
    """Check that ``process_stats`` with ``transpose_a=True`` alters the
    activation layout while keeping rank, the reduction (seq_len) dimension
    and the total element count unchanged."""
    acts = np.random.randn(10, 3, 8)

    statistic = WCTensorStatistic(
        Tensor(acts),
        shape_values=acts.shape,
    )

    n_samples = 10

    s_plain, x_plain = process_stats(
        statistic,
        subset_size=n_samples,
        act_ch_axis=-1,
        transpose_a=False,
    )

    s_swapped, x_swapped = process_stats(
        statistic,
        subset_size=n_samples,
        act_ch_axis=-1,
        transpose_a=True,
    )

    # The transpose must not change the rank of the per-channel statistic.
    assert len(s_plain.shape) == len(s_swapped.shape)

    # The reduction dimension (seq_len == 3) is preserved in both layouts.
    assert s_plain.shape[0] == 3
    assert s_swapped.shape[0] == 3

    # The activation layout itself must differ ...
    assert x_plain.shape != x_swapped.shape

    # ... while no elements are gained or lost.
    assert np.prod(x_plain.shape) == np.prod(x_swapped.shape)


@pytest.mark.parametrize(
"transpose_a,transpose_b",
[
(False, False),
(False, True),
],
)
def test_lora_transpose_a_fix(transpose_a, transpose_b):
"""
Test LoRA correction insertion only with transpose_a=False
because transposed activations are not yet supported by LoRA.
"""
# Setup LoRA parameters
params = LoraParams(adapter_rank=4, use_int8_adapters=False)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, that makes sense. I can update the existing tests to cover the act_ch_axis/transpose handling instead of adding separate ones, so the verification of LoRA Correction with transposed inputs is integrated with the current test suite.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please don't forget to update the tests

advanced_parameters = CompressionParams(lora_correction_params=params)

# Initialize model with given transpose configuration
model = LMLinearModel(transpose_b=transpose_b, transpose_a=transpose_a)
ov_model = model.ov_model

# Use dummy dataset with same shape as model input
dataset = Dataset(np.ones(inp.shape) for inp in ov_model.inputs)

# Compress weights with LoRA correction enabled
compressed_model = compress_weights(
ov_model,
mode=CompressWeightsMode.INT4_SYM,
ratio=1.0,
group_size=8,
dataset=dataset,
all_layers=True,
lora_correction=True,
advanced_parameters=advanced_parameters,
)

# Simple assertion: compressed model is returned
assert compressed_model is not None