Commit 3d58d14

Merge branch 'pytorch:main' into exynos-quantize-support
2 parents: 44a5e9e + d39992f

File tree: 3 files changed (+40, -17 lines)

  examples/models/llama/export_llama_lib.py
  examples/models/llama/source_transformation/quantize.py
  third-party/ao

examples/models/llama/export_llama_lib.py

15 additions, 7 deletions

@@ -1238,12 +1238,15 @@ def _load_llama_model(llm_config: LlmConfig) -> "LLMEdgeManager":
     else:
         raise ValueError(f"{modelname} is not a valid Llama model.")
 
-    model, example_inputs, example_kwarg_inputs, dynamic_shapes = (
-        EagerModelFactory.create_model(
-            module_name,
-            model_class_name,
-            llm_config=llm_config,
-        )
+    (
+        model,
+        example_inputs,
+        example_kwarg_inputs,
+        dynamic_shapes,
+    ) = EagerModelFactory.create_model(
+        module_name,
+        model_class_name,
+        llm_config=llm_config,
     )
     # Convert dtype override string to actual type.
     dtype_override = DType[llm_config.model.dtype_override.value]

@@ -1322,6 +1325,7 @@ def _get_source_transforms( # noqa
     local_global_attention: Optional[List[int]] = None,
     use_torchao_kernels_linear: bool = False,
     use_torchao_kernels_tied_embedding: bool = False,
+    quantize_with_hqq: bool = True,
 ) -> List[Callable[[torch.nn.Module], torch.nn.Module]]:
     """
     Return a list of functions that transform a graph.

@@ -1391,7 +1395,10 @@ def _get_source_transforms( # noqa
         """
         transforms.append(
             get_quant_embedding_transform(
-                embedding_quantize, use_shared_embedding, checkpoint_dtype
+                embedding_quantize,
+                use_shared_embedding,
+                checkpoint_dtype,
+                quantize_with_hqq,
             )
         )

@@ -1422,6 +1429,7 @@ def _get_source_transforms( # noqa
                 calibration_tasks=calibration_tasks,
                 calibration_limit=calibration_limit,
                 calibration_seq_length=calibration_seq_length,
+                quantize_with_hqq=quantize_with_hqq,
             )
         )
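Taken together, these hunks thread one new knob through the export path: _get_source_transforms gains quantize_with_hqq: bool = True and forwards it into both the embedding and the weight quantization transform factories (the first hunk is only a formatting change to the tuple unpacking). Below is a minimal, self-contained sketch of that threading pattern; the factory bodies are illustrative stand-ins, and only the flag's name, its default, and the list-of-callables return shape come from this diff.

from typing import Callable, List

import torch.nn as nn


def get_quant_embedding_transform(
    quantize_with_hqq: bool = True,
) -> Callable[[nn.Module], nn.Module]:
    # Stand-in for the real factory in quantize.py: capture the flag once,
    # then return a Module -> Module transform that would apply it.
    algorithm = "hqq_scale_only" if quantize_with_hqq else "affine"

    def transform(model: nn.Module) -> nn.Module:
        # The real transform calls into torchao using `algorithm`;
        # here we just tag the module for demonstration.
        model.qparams_algorithm = algorithm
        return model

    return transform


def _get_source_transforms(
    quantize_with_hqq: bool = True,
) -> List[Callable[[nn.Module], nn.Module]]:
    # As in the diff: the flag is forwarded verbatim to each factory.
    return [get_quant_embedding_transform(quantize_with_hqq)]


# The export pipeline applies the returned transforms in order.
model: nn.Module = nn.Embedding(16, 8)
for t in _get_source_transforms(quantize_with_hqq=False):
    model = t(model)
print(model.qparams_algorithm)  # "affine"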
examples/models/llama/source_transformation/quantize.py

24 additions, 9 deletions

@@ -49,6 +49,7 @@ def quantize( # noqa C901
     blocksize: int = 128,
     tokenizer_path: Optional[Path] = None,
     verbose: bool = False,
+    quantize_with_hqq: bool = True,
 ) -> torch.nn.Module:
     """
     Quantizes a model by converting all weights to int8.

@@ -119,7 +120,6 @@ def quantize( # noqa C901
         from torchao.quantization.granularity import PerAxis, PerGroup
         from torchao.quantization.quant_api import (
             Int8DynamicActivationIntxWeightConfig,
-            MappingType,
             quantize_,
         )
         from torchao.utils import unwrap_tensor_subclass

@@ -134,9 +134,12 @@ def quantize( # noqa C901
                 weight_granularity=(
                     PerAxis(0) if group_size == 0 else PerGroup(group_size)
                 ),
-                weight_mapping_type=MappingType.SYMMETRIC,
                 # pyre-ignore[6]
                 intx_packing_format="opaque_torchao_auto",
+                # pyre-ignore[6]
+                intx_choose_qparams_algorithm=(
+                    "hqq_scale_only" if quantize_with_hqq else "affine"
+                ),
             ),
         )
         model = unwrap_tensor_subclass(model)

@@ -170,6 +173,10 @@ def filter_fn(m, fqn):
                 # pyre-ignore[16]
                 weight_dtype=torch.int4,
                 weight_granularity=PerGroup(group_size),
+                # pyre-ignore[6]
+                intx_choose_qparams_algorithm=(
+                    "hqq_scale_only" if quantize_with_hqq else "affine"
+                ),
             ),
             filter_fn=filter_fn,
         )

@@ -191,6 +198,10 @@ def filter_fn(m, fqn):
             # pyre-ignore[16]
             weight_dtype=torch.int4,
             granularity=PerGroup(q_group_size),
+            # pyre-ignore[6]
+            intx_choose_qparams_algorithm=(
+                "hqq_scale_only" if quantize_with_hqq else "affine"
+            ),
         )
         quantize_(model, q_config)
         model = unwrap_tensor_subclass(model)

@@ -580,6 +591,7 @@ def __init__(
         group_size: Optional[int] = None,
         packed=False,
         precision: Optional[torch.dtype] = None,
+        quantize_with_hqq: bool = True,
     ):
         if isinstance(packed, str):
             packed = packed == "True"

@@ -592,15 +604,12 @@
         self.precision = precision
         if (bitwidth not in [2, 4]) and packed:
             raise RuntimeError("pack only works with bitsize 2, 4")
+        self.quantize_with_hqq = quantize_with_hqq
 
     @torch.no_grad()
     def create_quantized_state_dict(self, packed=False) -> Dict:
         from torchao.quantization.granularity import PerAxis, PerGroup
-        from torchao.quantization.quant_api import (
-            IntxWeightOnlyConfig,
-            MappingType,
-            quantize_,
-        )
+        from torchao.quantization.quant_api import IntxWeightOnlyConfig, quantize_
 
         cur_state_dict = self.mod.state_dict()

@@ -627,7 +636,10 @@ def create_quantized_state_dict(self, packed=False) -> Dict:
                     if (self.group_size is None or self.group_size == 0)
                     else PerGroup(self.group_size)
                 ),
-                mapping_type=MappingType.SYMMETRIC,
+                # pyre-ignore[6]
+                intx_choose_qparams_algorithm=(
+                    "hqq_scale_only" if self.quantize_with_hqq else "affine"
+                ),
             )
             quantize_(tmp_model, config, lambda m, fqn: isinstance(m, nn.Embedding))
             weight = tmp_model.weight.qdata  # pyre-ignore[16]

@@ -765,6 +777,7 @@ def get_quant_embedding_transform(
     embedding_quantize: str,
     use_shared_embedding: bool = False,
     dtype_override: Optional[DType] = None,
+    quantize_with_hqq: bool = True,
 ):
     if embedding_quantize.startswith("torchao:"):
         from torchao.prototype.quantization.embedding.api import (

@@ -825,6 +838,7 @@ def _torchao_embedding_quantizer(model):
             group_size=group_size,
             packed=(bitwidth in [2, 4]),
             precision=torch_dtype,
+            quantize_with_hqq=quantize_with_hqq,
         ).quantized_model()

@@ -838,6 +852,7 @@ def get_quant_weight_transform(
     calibration_tasks: Optional[list] = None,
     calibration_limit: Optional[int] = None,
     calibration_seq_length: Optional[int] = None,
+    quantize_with_hqq: bool = True,
 ):
     return partial(
         quantize,

@@ -850,6 +865,7 @@
         calibration_limit=calibration_limit,
         calibration_seq_length=calibration_seq_length,
         tokenizer_path=(Path(path) if (path := tokenizer_path) is not None else None),
+        quantize_with_hqq=quantize_with_hqq,
     )

@@ -877,7 +893,6 @@ def _load_torchao_aten_lib(libname):
 def set_8da4w_computation_dtype(
     module: nn.Module, computation_dtype: torch.dtype
 ) -> nn.Module:
-
     from torchao.quantization.linear_quant_modules import Int8DynActInt4WeightLinear
 
     def _set_8da4w_computation_dtype(module: nn.Module, dtype: torch.dtype) -> None:
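The recurring edit in this file swaps the hard-coded MappingType.SYMMETRIC arguments for an intx_choose_qparams_algorithm string selected by the new flag: "hqq_scale_only" uses HQQ's error-minimizing scale search, while "affine" keeps the plain affine qparams. A minimal sketch of the embedding path follows, assuming a torchao build matching the updated third-party/ao submodule (where IntxWeightOnlyConfig accepts this argument); the function name quantize_embeddings and its defaults are illustrative, not from the diff.

import torch
import torch.nn as nn
from torchao.quantization.granularity import PerGroup
from torchao.quantization.quant_api import IntxWeightOnlyConfig, quantize_


def quantize_embeddings(
    model: nn.Module,
    group_size: int = 32,
    quantize_with_hqq: bool = True,
) -> nn.Module:
    config = IntxWeightOnlyConfig(
        weight_dtype=torch.int4,
        granularity=PerGroup(group_size),
        # Same selection as in the diff: HQQ's scale search when the flag
        # is set, plain affine qparams otherwise.
        intx_choose_qparams_algorithm=(
            "hqq_scale_only" if quantize_with_hqq else "affine"
        ),
    )
    # Quantize only the embedding tables, as the diff's filter lambda does.
    quantize_(model, config, lambda m, fqn: isinstance(m, nn.Embedding))
    return model


model = quantize_embeddings(nn.Embedding(1000, 64), group_size=32)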

third-party/ao

Submodule ao updated 67 files
