 from auto_round import envs
 from auto_round.auto_scheme.gen_auto_scheme import AutoScheme
 from auto_round.compressors.utils import (
+    IndexSampler,
     block_forward,
     check_need_act_calibration,
     check_skippable_keywords,
@@ -196,7 +197,7 @@ def __init__(
             disable_opt_rtn (bool, optional): Disable RTN-mode optimization (iters=0). Defaults to False.
             enable_alg_ext (bool, optional): Enable algorithm extension (primarily for INT2). Defaults to False.
             **kwargs: Backward compatible options:
-                - enable_alg_ext, quant_lm_head, lr, lr_scheduler, sampler, not_use_best_mse, dynamic_max_gap,
+                - enable_alg_ext, quant_lm_head, lr, lr_scheduler, not_use_best_mse, dynamic_max_gap,
                 super_group_size, super_bits, scale_dtype ("fp16" etc.),
                 nblocks, to_quant_block_names,
                 enable_norm_bias_tuning, enable_quanted_input,
@@ -259,7 +260,6 @@ def __init__(
         enable_minmax_tuning = kwargs.pop("enable_minmax_tuning", True)
         minmax_lr = kwargs.pop("minmax_lr", None)
         lr_scheduler = kwargs.pop("lr_scheduler", None)
-        sampler = kwargs.pop("sampler", "rand")
         not_use_best_mse = kwargs.pop("not_use_best_mse", False)
         dynamic_max_gap = kwargs.pop("dynamic_max_gap", -1)
         nblocks = kwargs.pop("nblocks", 1)
@@ -350,7 +350,6 @@ def __init__(
         self.lr = lr
         self.minmax_lr = minmax_lr or self.lr
         self.enable_alg_ext = enable_alg_ext
-        self.sampler = sampler
         self.not_use_best_mse = not_use_best_mse
         self.dynamic_max_gap = dynamic_max_gap
         self.lr_scheduler = lr_scheduler
@@ -2487,29 +2486,33 @@ def _quantize_layer(
         scaler = self._get_scaler()  # pylint: disable=assignment-from-none
         init_loss = None
         gradient_accumulate_steps = self.batch_size  # Force to low gpu
-        batch_size = 1  # Force to low gpu
-        global_batch_size = batch_size * gradient_accumulate_steps
-        global_batch_size = min(nsamples, global_batch_size)
-        if self.sampler != "rand":
-            whole_indices = torch.randperm(nsamples)[:global_batch_size]
+
         total_loss = 0
         num_elm = 1
         mse_reduction = "mean"
         if gradient_accumulate_steps != 1:
             mse_reduction = "sum"
         mse_loss = torch.nn.MSELoss(reduction=mse_reduction).to(device)
+        batch_size = 1  # Force to low gpu
+        global_batch_size = self.batch_size * gradient_accumulate_steps
+        global_batch_size = min(nsamples, global_batch_size)
+        if gradient_accumulate_steps != 1 and not self.attention_mask:
+            whole_indices = torch.arange(global_batch_size)
+            if q_inputs is not None:
+                num_elm = self._get_current_num_elm(q_inputs, whole_indices)
+            else:
+                num_elm = self._get_current_num_elm(inputs, whole_indices)
+
+        index_sampler = IndexSampler(nsamples, global_batch_size)
 
         for i in range(self.iters):
             total_loss = 0
-            if self.sampler == "rand":
-                whole_indices = torch.randperm(nsamples)[:global_batch_size]
-                if gradient_accumulate_steps != 1:
-                    if q_inputs is not None:
-                        num_elm = self._get_current_num_elm(q_inputs, whole_indices)
-                    else:
-                        num_elm = self._get_current_num_elm(inputs, whole_indices)
+            global_indices = index_sampler.next_batch()
+            if self.attention_mask:
+                num_elm = self._get_non_zero_cnt(self.attention_mask, global_indices)
+
             for tmp_step in range(gradient_accumulate_steps):
-                indices = whole_indices[tmp_step * batch_size : (tmp_step + 1) * batch_size]
+                indices = global_indices[tmp_step * batch_size : (tmp_step + 1) * batch_size]
                 if q_inputs is not None:
                     current_input = [q_inputs[i] for i in indices]
                     current_input = torch.cat(current_input, dim=0).to(device)
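
Note: IndexSampler lives in auto_round/compressors/utils.py and its implementation is not part of this diff. Based only on how it is used above (constructed as IndexSampler(nsamples, global_batch_size) and queried each iteration with next_batch() for a fresh set of indices), a minimal shuffle-and-cycle sketch could look like this; treat the internals as an assumption, not the library's actual code.

import torch

class IndexSampler:
    """Hypothetical sketch: yields `batch_size` sample indices per call and
    reshuffles the index pool once it has been exhausted."""

    def __init__(self, nsamples: int, batch_size: int):
        self.nsamples = nsamples
        self.batch_size = batch_size
        self._pool = torch.randperm(nsamples)
        self._pos = 0

    def next_batch(self) -> torch.Tensor:
        # Reshuffle when fewer than batch_size unused indices remain.
        if self._pos + self.batch_size > self.nsamples:
            self._pool = torch.randperm(self.nsamples)
            self._pos = 0
        batch = self._pool[self._pos : self._pos + self.batch_size]
        self._pos += self.batch_size
        return batch

Compared with the removed per-iteration torch.randperm(nsamples)[:global_batch_size], a sampler like this visits every calibration sample before repeating any of them.
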
@@ -2551,7 +2554,7 @@ def _quantize_layer(
                 loss = mse_loss(  # pylint: disable=not-callable
                     output_q.to(torch.float32), current_output.to(torch.float32)
                 )
-
+                num_elm = 1 if num_elm <= 0 else num_elm
                 total_loss += loss.item() / num_elm
 
                 self._scale_loss_and_backward(scaler, loss)
@@ -2615,6 +2618,13 @@ def _get_current_num_elm(
         current_input_ids = [input_ids[i] for i in indices]
         return sum(id.numel() for id in current_input_ids)
 
+    def _get_non_zero_cnt(self, tensor: list[torch.Tensor], indices: list[int]) -> int:
+        current_tensors = [tensor[i] for i in indices]
+        non_zero_cnt = 0
+        for t in current_tensors:
+            non_zero_cnt += torch.count_nonzero(t).item()
+        return non_zero_cnt
+
     def quantize_block(
         self,
         block: torch.nn.Module,
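
For context, _get_non_zero_cnt replaces the raw element count with the number of non-padded tokens whenever an attention mask is available, so padding does not dilute the normalized loss. A small self-contained illustration of the counting (the masks below are made up, not taken from the PR):

import torch

# Two right-padded attention masks of length 6: 4 and 2 real tokens.
attention_mask = [
    torch.tensor([[1, 1, 1, 1, 0, 0]]),
    torch.tensor([[1, 1, 0, 0, 0, 0]]),
]

indices = [0, 1]
# Same computation as _get_non_zero_cnt: count non-zero mask entries per sample.
num_elm = sum(torch.count_nonzero(attention_mask[i]).item() for i in indices)
print(num_elm)  # 6 -> the sum-reduced loss is divided by 6, not by 12
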
@@ -2808,7 +2818,7 @@ def _quantize_block(
                 f"layers in the block"
             )
             logger.info(dump_info)
-            unwrapper_block(block, {})  # TODO Quant layer should change
+            unwrapper_block(block, {})
             mv_module_from_gpu(block)
             return output, output
 
@@ -2823,11 +2833,6 @@ def _quantize_block(
             nsamples = len(input_ids["hidden_states"])
         else:
             nsamples = len(input_ids)
-
-        global_batch_size = self.batch_size * self.gradient_accumulate_steps
-        global_batch_size = min(nsamples, global_batch_size)
-        if self.sampler != "rand":
-            whole_indices = torch.randperm(nsamples)[:global_batch_size]
         last_best_iter = 0
         best_loss = torch.finfo(torch.float).max
         num_elm = 1
@@ -2839,30 +2844,31 @@ def _quantize_block(
         init_loss = None
         best_params = {}
         total_loss = 0
+        global_batch_size = self.batch_size * self.gradient_accumulate_steps
+        global_batch_size = min(nsamples, global_batch_size)
         # We assume the block input and output shape is same
-        if self.gradient_accumulate_steps != 1:
+        if self.gradient_accumulate_steps != 1 and not self.attention_mask:
             whole_indices = torch.arange(global_batch_size)
             num_elm = self._get_current_num_elm(input_ids, whole_indices)
 
+        index_sampler = IndexSampler(nsamples, global_batch_size)
+        batch_size = self.batch_size
         for i in range(self.iters):
             if self.enable_alg_ext and self.data_type.endswith("dq"):
                 for n, m in block.named_modules():
                     m.cur_iter = i
             total_loss = 0
-            if self.sampler == "rand":
-                whole_indices = torch.randperm(nsamples)[:global_batch_size]
+            global_indices = index_sampler.next_batch()
+            if self.attention_mask:
+                num_elm = self._get_non_zero_cnt(self.attention_mask, global_indices)
 
             for tmp_step in range(self.gradient_accumulate_steps):
-                indices = whole_indices[tmp_step * self.batch_size : (tmp_step + 1) * self.batch_size]
-
+                indices = global_indices[tmp_step * batch_size : (tmp_step + 1) * batch_size]
                 current_output = self._get_current_output(output, indices)
-
                 current_output = to_device(current_output, loss_device)
-
                 output_q = self._get_current_q_output(block, input_ids, input_others, indices, device, loss_device)
-
                 loss = self._get_loss(output_q, current_output, indices, mse_loss, device)
-
+                num_elm = 1 if num_elm <= 0 else num_elm
                 total_loss += loss.item() / num_elm
 
                 if self.low_gpu_mem_usage and card_0_in_high_risk:
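
With gradient accumulation the MSE reduction is "sum" and every micro-batch loss is divided by num_elm, the element (or non-padded-token) count of the whole global batch, so the accumulated total_loss reported per iteration is equivalent to a mean over that global batch. A quick numeric check of that equivalence, using made-up tensors rather than real block outputs:

import torch

torch.manual_seed(0)
pred = torch.randn(4, 8)    # pretend global batch: 4 samples, 8 elements each
target = torch.randn(4, 8)

num_elm = pred.numel()      # 32, analogous to _get_current_num_elm / _get_non_zero_cnt
sum_mse = torch.nn.MSELoss(reduction="sum")

# Accumulate sum-reduced losses over batch_size=1 micro-batches, then normalize ...
total = sum(sum_mse(pred[i : i + 1], target[i : i + 1]).item() for i in range(4)) / num_elm

# ... which matches the mean-reduced loss over the full global batch.
mean = torch.nn.MSELoss(reduction="mean")(pred, target).item()
assert abs(total - mean) < 1e-6
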