@@ -61,6 +61,27 @@ def __init__(self, layer):
61
61
self .H = ops .zeros ((self .rows , self .rows ), dtype = "float32" )
62
62
63
63
def update_hessian_with_batch (self , inp ):
64
+ """
65
+ Updates the running average of the Hessian matrix with a new batch.
66
+
67
+ This method computes the Hessian matrix for a given batch of input
68
+ activations and updates the accumulated Hessian (`self.H`) using a
69
+ numerically stable running average. This allows the Hessian to be
70
+ computed over a large dataset without loading all samples into memory
71
+ at once.
72
+
73
+ The input tensor is first reshaped into a 2D matrix [num_samples,
74
+ num_features] before the Hessian is calculated.
75
+
76
+ Args:
77
+ inp: A 2D or higher-dimensional tensor of input activations from a
78
+ calibration batch.
79
+
80
+ Raises:
81
+ ValueError: If the feature dimension of the input tensor `inp` does
82
+ not match the dimensions of the pre-initialized Hessian matrix
83
+ `self.H`.
84
+ """
64
85
if len (inp .shape ) > 2 :
65
86
inp = ops .reshape (inp , (- 1 , inp .shape [- 1 ]))
66
87
inp = ops .cast (inp , "float32" )
@@ -85,6 +106,51 @@ def update_hessian_with_batch(self, inp):
85
106
def quantize_and_correct_block (
86
107
self , blocksize = 128 , percdamp = 0.01 , groupsize = - 1 , actorder = False
87
108
):
109
+ """
110
+ Performs GPTQ quantization and correction on the layer's weights.
111
+
112
+ This method implements the core logic of the "Optimal Brain Quantization"
113
+ (OBQ) method, as applied by GPTQ, to quantize the weights of a single
114
+ layer. It iteratively quantizes blocks of weights and corrects for the
115
+ quantization error by updating the remaining weights.
116
+
117
+ The algorithm follows these main steps:
118
+ 1. **Initialization**: It optionally reorders the weight columns based
119
+ on activation magnitudes (`actorder=True`) to protect more salient
120
+ weights.
121
+ 2. **Hessian Modification**: The Hessian matrix `H`, pre-computed from
122
+ calibration data, is dampened to ensure its invertibility and
123
+ stability.
124
+ 3. **Iterative Quantization**: The function iterates through the
125
+ weight columns in blocks (`blocksize`). In each iteration, it:
126
+ a. Quantizes one column (`w`).
127
+ b. Calculates the quantization error (`err`).
128
+ c. Updates the remaining weights in the *current* block by
129
+ distributing the error, using the inverse Hessian (`Hinv`).
130
+ 4. **Block-wise Correction**: After a block is quantized, the total
131
+ error from that block is propagated to the *next* block of weights
132
+ to be processed.
133
+ 5. **Finalization**: The quantized weights (`Q`) are reordered back if
134
+ `actorder` was used, and the layer's weights are updated.
135
+
136
+ This implementation is based on the official GPTQ paper and repository.
137
+ For more details, see:
138
+ - Paper: https://arxiv.org/abs/2210.17323
139
+ - Original Code: https://github.com/IST-DASLab/gptq
140
+
141
+ Args:
142
+ blocksize (int, optional): The size of the weight block to process
143
+ at a time. Defaults to 128.
144
+ percdamp (float, optional): The percentage of dampening to add to the
145
+ Hessian's diagonal. A value of 0.01 is recommended.
146
+ Defaults to 0.01.
147
+ groupsize (int, optional): The number of weights that share the
148
+ same quantization parameters (scale and zero-point).
149
+ A value of -1 indicates per-channel quantization.
150
+ actorder (bool, optional): If True, reorders weight columns based
151
+ on their activation's second-order information.
152
+ """
153
+
88
154
W = ops .transpose (ops .cast (self .layer .kernel , "float32" ))
89
155
H = ops .cast (self .H , "float32" )
90
156
@@ -94,26 +160,32 @@ def quantize_and_correct_block(
94
160
H = ops .take (ops .take (H , perm , axis = 0 ), perm , axis = 1 )
95
161
invperm = ops .argsort (perm )
96
162
163
+ # Dampen the Hessian for stability
97
164
diag_H = ops .diagonal (H )
98
165
dead = ops .equal (diag_H , 0.0 )
99
166
diag_H = ops .where (dead , 1.0 , diag_H )
100
167
H = H + ops .diag (ops .where (dead , 1.0 , ops .zeros_like (diag_H )))
168
+
169
+ # Add dampening factor to the Hessian diagonal
101
170
damp = percdamp * ops .mean (diag_H )
102
171
diag_H = diag_H + damp
103
172
H = (H - ops .diag (ops .diagonal (H ))) + ops .diag (diag_H )
104
173
174
+ # Compute the inverse Hessian, which is used for error correction
105
175
Hinv = ops .linalg .inv (H )
106
176
Q = ops .zeros_like (W )
107
177
108
178
for i1 in range (0 , self .rows , blocksize ):
109
179
i2 = min (i1 + blocksize , self .rows )
110
180
count = i2 - i1
111
-
181
+ # Extract the current block of weights and its corresponding
182
+ # Hessian
112
183
W1 = W [:, i1 :i2 ]
113
184
Q1 = ops .zeros_like (W1 )
114
185
Err1 = ops .zeros_like (W1 )
115
186
Hinv1 = Hinv [i1 :i2 , i1 :i2 ]
116
187
188
+ # Process one column at a time within the block
117
189
for i in range (count ):
118
190
w = W1 [:, i ]
119
191
d = Hinv1 [i , i ]
@@ -128,6 +200,7 @@ def quantize_and_correct_block(
128
200
ops .expand_dims (w , 1 ), weight = True
129
201
)
130
202
203
+ # Quantize the current weight column
131
204
q = quantize (
132
205
ops .expand_dims (w , 1 ),
133
206
self .quantizer .scale ,
@@ -148,11 +221,11 @@ def quantize_and_correct_block(
148
221
)
149
222
150
223
# Efficiently update the remaining part of the W1 tensor.
151
- # This is equivalent to W1[:, i + 1 :] -= update
152
224
slice_to_update = W1 [:, i + 1 :]
153
225
updated_slice = slice_to_update - update
154
226
W1 = ops .slice_update (W1 , (0 , i + 1 ), updated_slice )
155
227
228
+ # Update the full quantized matrix Q with the processed block
156
229
Q = ops .concatenate ([Q [:, :i1 ], Q1 , Q [:, i2 :]], axis = 1 )
157
230
158
231
if i2 < self .rows :
@@ -169,6 +242,7 @@ def quantize_and_correct_block(
169
242
if isinstance (self .original_layer , EinsumDense ):
170
243
Q = ops .reshape (Q , self .kernel_shape )
171
244
245
+ # Set the new quantized weights in the original layer
172
246
new_weights = [ops .convert_to_numpy (Q )]
173
247
if self .original_layer .bias is not None :
174
248
new_weights .append (ops .convert_to_numpy (self .original_layer .bias ))
0 commit comments