import gc
import hashlib
import json
+ import random
import types
import warnings
from collections import defaultdict
@@ -44,6 +45,31 @@
from .utils import is_quantized_linear, multi_context


+ def get_total_weight_size(modules):
+     """Get the total weight size of the modules."""
+     return sum(
+         (module.weight.numel() if AutoQuantizeSearcher._is_auto_quantize_module(module) else 0)
+         for module in modules
+     )
+
+
+ def get_base_time_cost(modules):
+     """Get the base time cost of the modules."""
+     return 42  # TODO: Implement this
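+     # NOTE: placeholder constant; presumably to be replaced by measured per-module latency (see TODO above).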
+
+
+ def get_total_linear_time(modules):
+     """Get the total linear time of the modules."""
+     return sum(
+         (
+             get_base_time_cost(modules)
+             if AutoQuantizeSearcher._is_auto_quantize_module(module)
+             else 0
+         )
+         for module in modules
+     )
+
+
def estimate_quant_compression(quant_cfg: QuantizeConfig) -> float:
    """Estimate the compression ratio of a quantization configuration.
@@ -221,6 +247,22 @@ def importance(self) -> dict:
            for quant_recipe, importance_dict in self._importance_dict.items()
        }

+     @property
+     def weight_sizes(self) -> list[float]:
+         """Return the compressed weight size for each quantization recipe choice."""
+         return [
+             get_total_weight_size(self.nn_modules) * quant_recipe.compression
+             for quant_recipe in self.choices
+         ]
+
+     @property
+     def time_costs(self) -> list[float]:
+         """Return the estimated time cost for each quantization recipe choice."""
+         return [
+             get_base_time_cost(self.nn_modules) * (0.1 + 0.9 * random.random())
+             for quant_recipe in self.choices
+         ]
+

class AutoQuantizeSearcher(BaseSearcher):
    """A searcher for AutoQuantize algorithm.
@@ -238,7 +280,7 @@ class AutoQuantizeSearcher(BaseSearcher):
    for other models such as ResNet.
    """

-     candidate_stats: dict[str, dict[str, list[float]]]
+     candidate_stats: dict[str, dict[str, list]]
    best: dict[str, Any]
    gradient_checkpointing_enable_contexts: list[tuple[Callable, Callable]] = []

@@ -462,21 +504,19 @@ def insert_hparams_after_merge_rules(cls, model, quant_recipes):
        for module in modules:
            module._register_hparam("quant_recipe", hparam)

-     def _get_formatted_weight_compression_constraint(self):
-         effective_bits = self.constraints["effective_bits"]
-         assert effective_bits > 0 and effective_bits <= 16, (
-             "effective_bits should be between 0 and 16."
-         )
-         weight_compression = self.constraints["effective_bits"] / 16.0
-
-         return weight_compression
-
    def _verify_constraint(self, search_recipes):
        assert self.constraints["effective_bits"] >= search_recipes[0].num_bits, (
            f"The effective_bits {self.constraints['effective_bits']} constraint cannot be lower than the "
            f"num_bits of the most aggressive quantization format for this search, which is "
            f"{search_recipes[0]} whose num_bits = {search_recipes[0].num_bits}."
        )
+         assert (
+             self.constraints["effective_bits"] > 0 and self.constraints["effective_bits"] <= 16
+         ), "effective_bits should be between 0 and 16."
+         assert "effective_bits" in self.constraints and set(self.constraints) <= {"effective_bits", "linear_speedup"}, (
+             f"`constraints` must contain 'effective_bits' and may optionally include 'linear_speedup'. "
+             f"Got {self.constraints.keys()}"
+         )

    def _run_func(self, func, num_iters=1, desc=""):
        for i, data in tqdm(
@@ -544,51 +584,15 @@ def forward_loop(model):
        ):
            self._estimate_auto_quantize_scores()

-     def run_search(self):
-         """Search for the best per-layer quantization configuration and return the best model and configuration.
-
-         AutoQuantize uses Linear Programming Solver to find the optimal quantization configuration which
-         minimizes the sum of per-layer auto_quantize scores while meeting the specified constraint.
-         """
-
-         def get_total_weight_size(modules):
-             return sum(
-                 (module.weight.numel() if self._is_auto_quantize_module(module) else 0)
-                 for module in modules
-             )
-
-         def _get_constraints_for_search(lower_bound=None):
-             total_model_weight_size = get_total_weight_size(self.model.modules())
-
-             upper_bound = self._get_formatted_weight_compression_constraint()
-
-             if lower_bound:
-                 lower_bound = lower_bound * upper_bound
-
-             constraints = {
-                 "weight_size_after_compression": (
-                     lower_bound * total_model_weight_size if lower_bound else lower_bound,
-                     upper_bound * total_model_weight_size,
-                 )
-             }
-             return constraints, "weight_size_after_compression"
-
-         verbose = self.config["verbose"]
-         assert len(self.constraints) == 1 and "effective_bits" in self.constraints, (
-             f"`constraints` must contain only 'effective_bits' constraint. "
-             f"Got {self.constraints.keys()}"
-         )
-
-         search_recipes = self._get_search_recipes(self.config["quantization_formats"])
+     def _get_candidate_stats(self):
+         """Collect per-hparam candidate stats: recipe choices, scores, weight sizes, and time costs."""
        for name, hparam in named_hparams(self.model, configurable=True):
            if not isinstance(hparam, QuantRecipeHparam):
                continue
-             formats, scores, costs = [], [], []
+             scores = []
            prev_score = float("inf")
-             for recipe in search_recipes:
-                 formats.append(recipe)
+             for recipe in hparam.choices:
                score = hparam.importance[recipe]
-                 cost = get_total_weight_size(hparam.nn_modules) * recipe.compression

                # Let's get the score across Data Parallel (DP) and Tensor Parallel (TP) groups.
                # This way we constrain the same quantization format for the same layer across the DP/TP groups
@@ -600,27 +604,71 @@ def _get_constraints_for_search(lower_bound=None):
                )

                scores.append(min(score, prev_score))
-                 costs.append(cost)
                prev_score = score
-             self.candidate_stats[name]["formats"] = formats
-             self.candidate_stats[name]["scores"] = scores
-             self.candidate_stats[name]["costs"] = costs
+
+             self.candidate_stats[name] = {
+                 "choices": list(hparam.choices),
+                 "scores": scores,
+                 "weight_sizes": hparam.weight_sizes,
+                 "time_costs": hparam.time_costs,
+             }
+         return self.candidate_stats
+
+     def _get_constraints_kwargs(self, lower_bound=None):
+         """Build the LP constraint bounds and the per-constraint candidate cost lists."""
+         constraints, constraints_to_candidate_costs = {}, {}
+
+         if "effective_bits" in self.constraints:
+             upper_bound = self.constraints["effective_bits"] / 16.0
+             if lower_bound:
+                 lower_bound = lower_bound * upper_bound
+             constraints["total_weight_size"] = (
+                 lower_bound * get_total_weight_size(self.model.modules())
+                 if lower_bound
+                 else lower_bound,
+                 upper_bound * get_total_weight_size(self.model.modules()),
+             )
+             constraints_to_candidate_costs["total_weight_size"] = [
+                 candidate_stat["weight_sizes"] for candidate_stat in self.candidate_stats.values()
+             ]
+
+         if "linear_speedup" in self.constraints:
+             upper_bound = self.constraints["linear_speedup"]
+             if lower_bound:
+                 lower_bound = lower_bound * upper_bound
+             constraints["total_linear_time"] = (
+                 (1 / lower_bound) * get_total_linear_time(self.model.modules())
+                 if lower_bound
+                 else lower_bound,
+                 (1 / upper_bound) * get_total_linear_time(self.model.modules()),
+             )
+             constraints_to_candidate_costs["total_linear_time"] = [
+                 candidate_stat["time_costs"] for candidate_stat in self.candidate_stats.values()
+             ]
+
+         return constraints, constraints_to_candidate_costs
+
+     def run_search(self):
+         """Search for the best per-layer quantization configuration and return the best model and configuration.
+
+         AutoQuantize uses a Linear Programming solver to find the optimal quantization configuration which
+         minimizes the sum of per-layer auto_quantize scores while meeting the specified constraints.
+         """
+         verbose = self.config["verbose"]
+
+         self.candidate_stats = self._get_candidate_stats()

        for lower_bound in [None, 0.99, 0.90]:
            # The LP solver for auto_quantize sometimes fails to find a solution if a lower bound is not
            # specified. I don't know why this happens.
            # As a workaround, let's specify a lower bound for the weight compression if the previous
            # search without a lower bound fails.
-             constraints, constraint_name = _get_constraints_for_search(lower_bound)
+             constraints, constraints_to_candidate_costs = self._get_constraints_kwargs(lower_bound)

            lps = LPS(
                name="AutoQuantize",
                constraints=constraints,
-                 constraints_to_candidate_costs={
-                     constraint_name: [
-                         candidate_stat["costs"] for candidate_stat in self.candidate_stats.values()
-                     ]
-                 },
+                 constraints_to_candidate_costs=constraints_to_candidate_costs,
                candidate_scores=[
                    candidate_stat["scores"] for candidate_stat in self.candidate_stats.values()
                ],
@@ -642,9 +690,9 @@ def _get_constraints_for_search(lower_bound=None):
            self.best["is_satisfied"] = True

        best_recipe = {}
-         best_constraints, best_scores = 0, 0
+         best_weight_size, best_linear_time, best_scores = 0, 0, 0
        for name, selected_idx in zip(self.candidate_stats.keys(), selections):
-             best_recipe_for_name = self.candidate_stats[name]["formats"][selected_idx]
+             best_recipe_for_name = self.candidate_stats[name]["choices"][selected_idx]

            # The LP solver could give different solutions for the same layer across DP/TP groups even though
            # the scores and costs are the same. Let's make sure the same quantization format is selected across DP/TP
@@ -657,15 +705,19 @@ def _get_constraints_for_search(lower_bound=None):

            best_recipe[name] = best_recipe_for_name
            get_hparam(self.model, name).active = best_recipe_for_name
-             best_constraints += self.candidate_stats[name]["costs"][selected_idx]
+             best_weight_size += self.candidate_stats[name]["weight_sizes"][selected_idx]
+             best_linear_time += self.candidate_stats[name]["time_costs"][selected_idx]
            best_scores += self.candidate_stats[name]["scores"][selected_idx]
            if verbose:
                print_rank_0(
                    f"AutoQuantize best recipe for {name.replace('.quant_recipe', '')}: {best_recipe[name]}"
                )

        self.best["recipe"] = best_recipe
-         self.best["constraints"] = {constraint_name: best_constraints}
+         self.best["constraints"] = {
+             "total_weight_size": best_weight_size,
+             "total_linear_time": best_linear_time,
+         }
        self.best["score"] = best_scores

        QuantRecipe.fold_pqs_to_weights(self.model)