3 files changed (+14, -26 lines)
File 1 of 3 (filename not captured in this view):

 from cache_dit.profiler import create_profiler_context
 from cache_dit.profiler import get_profiler_output_dir
 from cache_dit.profiler import set_profiler_output_dir
-
-try:
-    from cache_dit.quantize import quantize
-except ImportError as e:  # noqa: F841
-    err_msg = str(e)
-
-    def _raise_import_error(func_name: str):
-        raise ImportError(
-            f"{func_name} requires additional dependencies. "
-            "Please install cache-dit[quantization] or cache-dit[all] "
-            f"to use this feature. Error message: {err_msg}"
-        )
-
-    def quantize(*args, **kwargs):
-        _raise_import_error("quantize")
-
+from cache_dit.quantize import quantize
 
 NONE = CacheType.NONE
 DBCache = CacheType.DBCache
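With the import-time fallback shim removed, `import cache_dit` no longer needs torchao; the missing-dependency error now surfaces when quantization is actually invoked (see the check added in the third file below). A minimal sketch of the new behavior, assuming `quantize` is re-exported at the package root as this hunk shows; the module is a hypothetical placeholder:

import torch
import cache_dit  # succeeds even if torchao is not installed

model = torch.nn.Linear(8, 8)  # placeholder for a real transformer module
cache_dit.quantize(model, quant_type="float8")  # ImportError raised here if torchao is missing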
File 2 of 3 (filename not captured in this view):

-try:
-    import torchao
-except ImportError:
-    raise ImportError(
-        "Quantization functionality requires the 'quantization' extra dependencies. "
-        "Install with: pip install cache-dit[quantization]"
-    )
 import torch
 from typing import Callable, Optional, List
 from cache_dit.logger import init_logger
@@ -16,6 +9,8 @@ def quantize(
     module: torch.nn.Module,
     quant_type: Optional[str] = None,
     backend: str = "ao",
+    # Parameters specific to the torchao backend
+    per_row: bool = True,
     exclude_layers: List[str] = [
         "embedder",
         "embed",
@@ -35,7 +30,7 @@ def quantize(
         return quantize_ao(
             module,
             quant_type=quant_type,
-            per_row=kwargs.pop("per_row", True),
+            per_row=per_row,
             exclude_layers=exclude_layers,
             filter_fn=filter_fn,
             **kwargs,
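`per_row` is now an explicit keyword argument instead of being popped out of `**kwargs`, so it appears in the signature and in editor tooltips. A hedged usage sketch (the model is a placeholder; the quant_type value "fp8_w8a8_dq" is taken from the alias map in the third file):

import torch
from cache_dit.quantize import quantize

model = torch.nn.Sequential(torch.nn.Linear(64, 64))  # placeholder model
# Use per-tensor FP8 scaling instead of the per-row default:
quantized = quantize(model, quant_type="fp8_w8a8_dq", per_row=False)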
File 3 of 3 (filename not captured in this view):

 def quantize_ao(
     module: torch.nn.Module,
     quant_type: str = "float8_weight_only",
+    # Parameters for FP8 DQ quantization:
+    # whether to quantize per row (True) or per tensor (False).
+    per_row: bool = True,
     exclude_layers: List[str] = [
         "embedder",
         "embed",
     ],
     filter_fn: Optional[Callable] = None,
-    # paramters for fp8 quantization
-    per_row: bool = True,
     **kwargs,
 ) -> torch.nn.Module:
     # Apply FP8 DQ to the module, skipping any `embed` modules by default
     # to avoid a non-trivial precision downgrade. Set `exclude_layers`
     # to `[]` if you don't want this behavior.
     assert isinstance(module, torch.nn.Module)
+    try:
+        import torchao  # noqa: F401
+    except ImportError:
+        raise ImportError(
+            "Quantization functionality requires the 'quantization' extra dependencies. "
+            "Install with: pip install cache-dit[quantization]"
+        )
 
     alias_map = {
         "float8": "fp8_w8a8_dq",