
Commit baa0682

reworked the comments
1 parent c9ff4d1 commit baa0682

5 files changed: 82 additions & 116 deletions


keras/src/models/model_test.py

Lines changed: 36 additions & 72 deletions
@@ -1,6 +1,7 @@
 import os
 import pickle
 from collections import namedtuple
+from collections.abc import Callable

 import numpy as np
 import pytest
@@ -1250,9 +1251,8 @@ def dummy_dataset_generator(nsamples, seqlen, vocab_size=1000):
         yield rng.integers(low=0, high=vocab_size, size=(1, seqlen))


-# Helper function to build a simple transformer model that uses standard
-# Keras `Dense` layers for its attention projections.
-def _get_model_with_dense_attention():
+# Helper function to build a simple transformer model.
+def get_model_with_dense_attention():
     """Builds a simple transformer model using Dense for attention."""
     vocab_size = 1000
     embed_dim = 32
@@ -1262,8 +1262,6 @@ def _get_model_with_dense_attention():
     class SimpleTransformerBlock(layers.Layer):
         def __init__(self, embed_dim, num_heads, ff_dim, **kwargs):
             super().__init__(**kwargs)
-            # The standard MultiHeadAttention layer uses Dense layers
-            # for its projections.
             self.att = layers.MultiHeadAttention(
                 num_heads=num_heads, key_dim=embed_dim
             )
@@ -1292,22 +1290,44 @@ def call(self, inputs):
     return model


+# Define parameters for the tests
+long_text = """gptq is an easy-to-use model quantization library..."""
+DATASETS = {
+    "string_dataset": [long_text],
+    "generator_dataset": lambda: dummy_dataset_generator(
+        nsamples=16, seqlen=128
+    ),
+}
+CONFIGS = {
+    "default": {},
+    "per_channel": {"group_size": -1},
+    "act_order": {"act_order": True},
+    "symmetric": {"symmetric": True},
+}
+
+
 @pytest.mark.requires_trainable_backend
-class ModelQuantizationTest(testing.TestCase):
+class TestModelQuantization:
     def _run_gptq_test_on_dataset(self, dataset, **config_kwargs):
         """Helper function to run a full GPTQ quantization test."""
-        model = _get_model_with_dense_attention()
+        if isinstance(dataset, Callable):
+            dataset = dataset()
+        model = get_model_with_dense_attention()
         rng = np.random.default_rng(seed=42)

-        # 1. Common setup
         NUM_SAMPLES = 16
         SEQUENCE_LENGTH = 128
         VOCAB_SIZE = 1000
         W_BITS = 4

-        # Default config that can be overridden by config_kwargs
+        mock_tokenizer = lambda text: np.array(
+            [ord(c) % VOCAB_SIZE for c in text]
+        )
+        mock_tokenizer.tokenize = mock_tokenizer
+
         base_config = {
             "dataset": dataset,
+            "tokenizer": mock_tokenizer,
             "wbits": W_BITS,
             "nsamples": NUM_SAMPLES,
             "seqlen": SEQUENCE_LENGTH,
@@ -1316,82 +1336,26 @@ def _run_gptq_test_on_dataset(self, dataset, **config_kwargs):
             "act_order": False,
         }

-        mock_tokenizer = lambda text: np.array(
-            [ord(c) % VOCAB_SIZE for c in text]
-        )
-        mock_tokenizer.tokenize = mock_tokenizer
-        base_config["tokenizer"] = mock_tokenizer
-
-        # Find target layer and get original weights
         target_layer = model.layers[2].ffn.layers[0]
-        self.assertIsNotNone(
-            target_layer,
-            "Test setup failed: No Dense layer found in 'ffn' block.",
-        )
+        assert target_layer is not None
         original_weights = np.copy(target_layer.kernel)

-        # Configure and run quantization
         final_config = {**base_config, **config_kwargs}
         gptq_config = GPTQConfig(**final_config)

         model.quantize("gptq", config=gptq_config)

-        # Assertions and verification
         quantized_weights = target_layer.kernel

-        self.assertNotAllClose(
-            original_weights,
-            quantized_weights,
-            msg=f"Weights not changed by GPTQ for config: {config_kwargs}",
-        )
+        assert not np.allclose(original_weights, quantized_weights)

         dummy_sample = rng.integers(
             low=0, high=VOCAB_SIZE, size=(1, SEQUENCE_LENGTH)
         )
         _ = model.predict(dummy_sample)

-    def test_quantize_gptq_on_different_datasets(self):
-        """Tests GPTQ with various dataset types (string list, generator)."""
-
-        # Define the datasets to be tested
-        long_text = """gptq is an easy-to-use model quantization library
-        with user-friendly apis, based on GPTQ algorithm. The goal is to
-        quantize pre-trained models to 4-bit or even 3-bit precision with
-        minimal performance degradation.
-        This allows for running larger models on less powerful hardware,
-        reducing memory footprint and increasing inference speed.
-        The process involves calibrating the model on a small dataset
-        to determine the quantization parameters.
-        This technique is particularly useful for deploying large language
-        models in resource-constrained environments where every bit of memory
-        and every millisecond of latency counts."""
-
-        datasets_to_test = {
-            "string_dataset": [long_text],
-            "generator_dataset": dummy_dataset_generator(
-                nsamples=16, seqlen=128, vocab_size=1000
-            ),
-        }
-
-        # Loop through the datasets and run each as a sub-test
-        for dataset_name, dataset in datasets_to_test.items():
-            with self.subTest(dataset_type=dataset_name):
-                self._run_gptq_test_on_dataset(dataset)
-
-    def test_quantize_gptq_with_config_variations(self):
-        """Tests GPTQ with specific config variations."""
-        config_variations = {
-            "per_channel": {"group_size": -1},
-            "act_order": {"act_order": True},
-            "symmetric": {"symmetric": True},
-            "all_options_enabled": {
-                "group_size": -1,
-                "act_order": True,
-                "symmetric": True,
-            },
-        }
-
-        dataset = ["This is the calibration data for the test."]
-        for config_name, config_overrides in config_variations.items():
-            with self.subTest(config_type=config_name):
-                self._run_gptq_test_on_dataset(dataset, **config_overrides)
+    @pytest.mark.parametrize("dataset", DATASETS.values(), ids=DATASETS.keys())
+    @pytest.mark.parametrize("config", CONFIGS.values(), ids=CONFIGS.keys())
+    def test_quantize_gptq_combinations(self, dataset, config):
+        """Runs GPTQ tests across different datasets and config variations."""
+        self._run_gptq_test_on_dataset(dataset, **config)
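The two stacked `parametrize` decorators replace the earlier `subTest` loops: pytest now generates one test case per (config, dataset) pair, 4 x 2 = 8 cases in total, each individually reported and selectable by id. Note also why the generator dataset is stored behind a `lambda` and only materialized inside the helper (`if isinstance(dataset, Callable): dataset = dataset()`): a bare generator would be exhausted after the first parametrized run. A standalone sketch of the pattern, with illustrative names and values:

import pytest

DATASETS = {"list": [1, 2], "generator": lambda: iter([1, 2])}
CONFIGS = {"default": {}, "symmetric": {"symmetric": True}}

@pytest.mark.parametrize("dataset", DATASETS.values(), ids=DATASETS.keys())
@pytest.mark.parametrize("config", CONFIGS.values(), ids=CONFIGS.keys())
def test_combinations(dataset, config):
    if callable(dataset):
        dataset = dataset()  # build a fresh iterator for this test case
    # pytest runs this 2 x 2 = 4 times, with ids like
    # test_combinations[symmetric-generator]
    assert list(dataset) == [1, 2] and isinstance(config, dict)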

keras/src/quantizers/gptq.py

Lines changed: 1 addition & 1 deletion
@@ -1,7 +1,7 @@
 from keras.src import ops
 from keras.src.layers import Dense
 from keras.src.layers import EinsumDense
-from keras.src.quantizers.gptqquant import dequantize
+from keras.src.quantizers.gptq_quant import dequantize


 class GPTQ:

keras/src/quantizers/gptq_config.py

Lines changed: 1 addition & 1 deletion
@@ -1,7 +1,7 @@
 from absl import logging

 from keras.src.api_export import keras_export
-from keras.src.quantizers.gptqutils import quantize_model
+from keras.src.quantizers.gptq_core import quantize_model


 @keras_export(["keras.GPTQConfig", "keras.quantizers.GPTQConfig"])

keras/src/quantizers/gptqutils.py renamed to keras/src/quantizers/gptq_core.py

Lines changed: 35 additions & 21 deletions
@@ -9,7 +9,7 @@
 from keras.src.layers import EinsumDense
 from keras.src.layers import Embedding
 from keras.src.quantizers.gptq import GPTQ
-from keras.src.quantizers.gptqquant import GPTQQuant
+from keras.src.quantizers.gptq_quant import GPTQQuant


 def get_dataloader(tokenizer, seqlen, dataset, nsamples=128):
@@ -20,9 +20,9 @@ def get_dataloader(tokenizer, seqlen, dataset, nsamples=128):

     if isinstance(dataset, str):
         raise TypeError(
-            "The `dataset` argument must be an iterable (e.g., a list or "
-            "generator) of strings or pre-tokenized tensors. Loading "
-            "datasets by name is no longer supported."
+            "The `dataset` argument must be an iterable (e.g., a list of "
+            "strings or a generator). Providing a dataset name as a string "
+            "is not supported. Please pass the loaded dataset directly."
         )

     logging.info("Using pre-made dataset/generator...")
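The reworded error pins down the contract: `dataset` must already be loaded, either as an iterable of strings or of pre-tokenized samples. A hedged sketch of the call-site behavior (the tokenizer is a stand-in, and `dummy_dataset_generator` is borrowed from the tests above for illustration):

# An iterable of strings is accepted:
calib = get_dataloader(tokenizer, seqlen=128, dataset=["some calibration text"])

# So is a generator of pre-tokenized samples:
calib = get_dataloader(
    tokenizer, seqlen=128, dataset=dummy_dataset_generator(nsamples=16, seqlen=128)
)

# A bare dataset name is not loaded for you:
get_dataloader(tokenizer, seqlen=128, dataset="my_dataset_name")  # raises TypeError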
@@ -37,10 +37,9 @@ def get_dataloader(tokenizer, seqlen, dataset, nsamples=128):
         all_tokens = tokenizer.tokenize(full_text)
     else:
         logging.info("(Dataset is pre-tokenized, concatenating...)")
-        concatenated_tokens = ops.concatenate(
-            [ops.reshape(s, [-1]) for s in dataset_list], axis=0
+        all_tokens = np.concatenate(
+            [ops.convert_to_numpy(s).reshape(-1) for s in dataset_list], axis=0
         )
-        all_tokens = ops.convert_to_numpy(concatenated_tokens)

     all_tokens = np.array(all_tokens, dtype=np.int32)

@@ -62,10 +61,10 @@ def get_dataloader(tokenizer, seqlen, dataset, nsamples=128):
         start_index = random.randint(0, len(all_tokens) - seqlen - 1)
         end_index = start_index + seqlen
         sample = all_tokens[start_index:end_index]
-        calibration_samples.append(ops.reshape(sample, (1, seqlen)))
+        calibration_samples.append(np.reshape(sample, (1, seqlen)))

-    final_array = ops.stack(calibration_samples, axis=0)
-    return ops.convert_to_numpy(final_array)
+    final_array = np.stack(calibration_samples, axis=0)
+    return final_array


 def _find_layers_recursive(layer, prefix, found_layers):
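With these two hunks, `get_dataloader` stays in NumPy after the initial per-sample `ops.convert_to_numpy`: pre-tokenized inputs are flattened into one token stream, random `seqlen` windows are cut from it, and `np.stack` of the `(1, seqlen)` windows yields a `(nsamples, 1, seqlen)` array with no final backend round-trip. A quick shape check under those assumptions, with illustrative sizes:

import numpy as np

nsamples, seqlen = 3, 4
all_tokens = np.arange(100, dtype=np.int32)  # the concatenated token stream

rng = np.random.default_rng(0)
calibration_samples = []
for _ in range(nsamples):
    # Cut a random contiguous window and keep the (1, seqlen) batch shape.
    start = rng.integers(0, len(all_tokens) - seqlen - 1)
    window = all_tokens[start : start + seqlen]
    calibration_samples.append(np.reshape(window, (1, seqlen)))

final_array = np.stack(calibration_samples, axis=0)
print(final_array.shape)  # (3, 1, 4)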
@@ -106,9 +105,15 @@ def apply_gptq_layerwise(
 ):
     """Applies GPTQ quantization layer-by-layer to a Keras model.

-    This function performs a sequential, model-agnostic quantization process. It
-    dynamically identifies quantizable layers (e.g., Dense, EinsumDense)
-    within larger "transformer blocks" of a model.
+    This function is designed to work with common transformer architectures,
+    like those provided by KerasNLP and KerasHub. It automatically discovers
+    the model's structure by first looking for the standard KerasNLP format:
+    a `model.backbone` attribute that contains a `transformer_layers` list.
+
+    If a standard backbone is not found, it falls back to a heuristic for
+    custom models, where it assumes the first `keras.layers.Embedding` layer
+    is the input embedding and any subsequent container layers are the
+    transformer blocks to be quantized.

     The core logic operates as follows:
     1. It automatically detects the model's structure, identifying the main
@@ -154,7 +159,17 @@ def apply_gptq_layerwise(
     if hasattr(model, "backbone"):
         logging.info("Detected KerasNLP model structure.")
         backbone = model.backbone
-        transformer_blocks = backbone.transformer_layers
+
+        # Add the check for the 'transformer_layers' attribute.
+        if hasattr(backbone, "transformer_layers"):
+            transformer_blocks = backbone.transformer_layers
+        else:
+            # Raise a specific error if the attribute is missing.
+            raise ValueError(
+                "The model's backbone does not have a 'transformer_layers' "
+                "attribute. Please ensure you are using a standard KerasNLP "
+                "transformer model."
+            )
         # Find the embedding layer by checking for common names or by type.
         if hasattr(backbone, "token_embedding"):
             embedding_layer = backbone.token_embedding
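The effect of the new guard: a model that exposes a `backbone` without the expected `transformer_layers` list used to fail later with an opaque `AttributeError`; it now fails fast with the descriptive `ValueError`. A hypothetical repro (class names invented for illustration):

import keras

class CustomBackbone(keras.layers.Layer):
    """A backbone that lacks the `transformer_layers` attribute."""

class CustomModel(keras.Model):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self.backbone = CustomBackbone()

# Quantizing such a model now stops immediately with:
# ValueError: The model's backbone does not have a 'transformer_layers'
# attribute. Please ensure you are using a standard KerasNLP transformer model.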
@@ -256,13 +271,12 @@ def hook(*args, **kwargs):
             inp_reshaped = ops.reshape(layer_inputs, (-1, num_features))
             gptq_object.update_hessian_with_batch(inp_reshaped)

-        quantizer = GPTQQuant()
-        quantizer.configure(
-            wbits,
-            perchannel=True,
-            symmetric=symmetric,
-            group_size=group_size,
-        )
+        quantizer = GPTQQuant(
+            wbits,
+            perchannel=True,
+            symmetric=symmetric,
+            group_size=group_size,
+        )
         for name, gptq_object in gptq_objects.items():
             logging.info(f"Quantizing {name}...")
             gptq_object.quantizer = quantizer
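Folding `configure()` into the constructor means a `GPTQQuant` can no longer exist in a half-configured state; the single instance built here carries all settings and is shared by every `GPTQ` object in the block. A minimal sketch of the new construction (argument values are illustrative):

from keras.src.quantizers.gptq_quant import GPTQQuant

quantizer = GPTQQuant(4, perchannel=True, symmetric=False, group_size=128)

# Quantization parameters are deferred until calibration runs find_params():
assert quantizer.scale is None and quantizer.zero is None
# maxq is derived from wbits at construction time: 2**4 - 1 = 15.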

keras/src/quantizers/gptqquant.py renamed to keras/src/quantizers/gptq_quant.py

Lines changed: 9 additions & 21 deletions
@@ -2,21 +2,15 @@


 def dequantize(x, scale, zero, maxq):
-    """The core quantization function with correct broadcasting."""
-    # Ensure scale is broadcastable with the input tensor x
-    if scale.shape != x.shape:
-        scale = ops.broadcast_to(scale, x.shape)
-
-    # Ensure zero-point is also broadcastable
-    if zero.shape != x.shape:
-        zero = ops.broadcast_to(zero, x.shape)
-
-    epsilon = 1e-8
+    """The core quantization function."""
+    epsilon = ops.cast(1e-8, dtype=scale.dtype)
     scale = ops.where(ops.equal(scale, 0), epsilon, scale)
+
     quantized_x = ops.divide(x, scale)
     quantized_x = ops.round(quantized_x)
     q = ops.add(quantized_x, zero)
     q = ops.clip(q, 0, maxq)
+
     dequantized_x = ops.subtract(q, zero)
     return ops.multiply(scale, dequantized_x)

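The explicit `broadcast_to` calls could be dropped because element-wise `keras.ops` functions broadcast `scale` and `zero` against `x` under NumPy rules; only the zero-scale guard needed to stay, now with an epsilon cast to `scale`'s dtype. A round-trip sanity check, assuming one scale and zero-point per column (values illustrative):

import numpy as np
from keras import ops
from keras.src.quantizers.gptq_quant import dequantize

x = np.array([[0.5, -1.0], [1.5, 2.0]], dtype="float32")
scale = np.array([[0.5, 1.0]], dtype="float32")  # broadcasts across rows
zero = np.array([[8.0, 8.0]], dtype="float32")   # mid-point for 4 bits
maxq = 15.0                                      # 2**4 - 1

x_hat = ops.convert_to_numpy(dequantize(x, scale, zero, maxq))
# These inputs sit exactly on the 4-bit grid, so they survive the
# round(x / scale) -> clip -> rescale round trip unchanged:
print(np.allclose(x_hat, x))  # True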
@@ -48,23 +42,17 @@ class GPTQQuant:
             Defaults to -1.
     """

-    def __init__(self):
-        self.scale = None
-        self.zero = None
-        self.maxq = None
-        self.wbits = None
-        self.perchannel = False
-        self.symmetric = False
-        self.group_size = -1
-
-    def configure(self, wbits, perchannel=True, symmetric=False, group_size=-1):
-        """Configures the quantizer settings."""
+    def __init__(self, wbits, perchannel=True, symmetric=False, group_size=-1):
         self.wbits = wbits
         self.maxq = ops.cast((2**wbits) - 1, "float32")
         self.perchannel = perchannel
         self.symmetric = symmetric
         self.group_size = group_size

+        # These are now determined later by `find_params`
+        self.scale = None
+        self.zero = None
+
     def find_params(self, x, weight=False):
         """Finds quantization parameters (scale and zero) for a given tensor."""