
Commit 4dbda87

WOQ: set default act_quant_mode to PER_BATCH_IC_BLOCK_SYM (#3321)
1 parent 1c51f3f commit 4dbda87

File tree

7 files changed: +71 -12 lines changed

examples/cpu/llm/inference/distributed/run_accuracy_with_deepspeed.py

Lines changed: 27 additions & 2 deletions
@@ -145,8 +145,17 @@ def decorator(func):
 )
 parser.add_argument(
     "--act-quant-mode",
-    choices=["PER_TENSOR", "PER_IC_BLOCK", "PER_BATCH", "PER_BATCH_IC_BLOCK"],
-    default="PER_IC_BLOCK",
+    choices=[
+        "PER_TENSOR",
+        "PER_IC_BLOCK",
+        "PER_BATCH",
+        "PER_BATCH_IC_BLOCK",
+        "PER_TENSOR_SYM",
+        "PER_IC_BLOCK_SYM",
+        "PER_BATCH_SYM",
+        "PER_BATCH_IC_BLOCK_SYM",
+    ],
+    default="PER_BATCH_IC_BLOCK_SYM",
     type=str,
     help="Quantization mode for activation with different granularity. "
     "For lowp-mode=INT8 only. For other cases, it has no effect. "
@@ -155,6 +164,10 @@ def decorator(func):
     "PER_IC_BLOCK(1): quantize per group along IC with group size = IC_BLOCK; "
     "PER_BATCH(2): quantize per batch; "
     "PER_BATCH_IC_BLOCK(3): quantize per block of size 1 x IC_BLOCK. "
+    "PER_TENSOR_SYM(4): symmetrically quantize per tensor; "
+    "PER_IC_BLOCK_SYM(5): symmetrically quantize per group along IC with group size = IC_BLOCK; "
+    "PER_BATCH_SYM(6): symmetrically quantize per batch; "
+    "PER_BATCH_IC_BLOCK_SYM(7): symmetrically quantize per block of size 1 x IC_BLOCK. "
     "IC_BLOCK is determined by IC automatically.",
 )
 parser.add_argument(
@@ -417,6 +430,10 @@ def write_checkpoints_json():
     "PER_IC_BLOCK": ipex.quantization.WoqActQuantMode.PER_IC_BLOCK,
     "PER_BATCH": ipex.quantization.WoqActQuantMode.PER_BATCH,
     "PER_BATCH_IC_BLOCK": ipex.quantization.WoqActQuantMode.PER_BATCH_IC_BLOCK,
+    "PER_TENSOR_SYM": ipex.quantization.WoqActQuantMode.PER_TENSOR_SYM,
+    "PER_IC_BLOCK_SYM": ipex.quantization.WoqActQuantMode.PER_IC_BLOCK_SYM,
+    "PER_BATCH_SYM": ipex.quantization.WoqActQuantMode.PER_BATCH_SYM,
+    "PER_BATCH_IC_BLOCK_SYM": ipex.quantization.WoqActQuantMode.PER_BATCH_IC_BLOCK_SYM,
 }
 weight_qscheme = (
     WoqWeightQScheme.SYMMETRIC
@@ -1196,6 +1213,10 @@ def write_checkpoints_json():
     "PER_IC_BLOCK": ipex.quantization.WoqActQuantMode.PER_IC_BLOCK,
     "PER_BATCH": ipex.quantization.WoqActQuantMode.PER_BATCH,
     "PER_BATCH_IC_BLOCK": ipex.quantization.WoqActQuantMode.PER_BATCH_IC_BLOCK,
+    "PER_TENSOR_SYM": ipex.quantization.WoqActQuantMode.PER_TENSOR_SYM,
+    "PER_IC_BLOCK_SYM": ipex.quantization.WoqActQuantMode.PER_IC_BLOCK_SYM,
+    "PER_BATCH_SYM": ipex.quantization.WoqActQuantMode.PER_BATCH_SYM,
+    "PER_BATCH_IC_BLOCK_SYM": ipex.quantization.WoqActQuantMode.PER_BATCH_IC_BLOCK_SYM,
 }
 weight_qscheme = (
     WoqWeightQScheme.SYMMETRIC
@@ -1849,6 +1870,10 @@ def write_checkpoints_json():
     "PER_IC_BLOCK": ipex.quantization.WoqActQuantMode.PER_IC_BLOCK,
     "PER_BATCH": ipex.quantization.WoqActQuantMode.PER_BATCH,
     "PER_BATCH_IC_BLOCK": ipex.quantization.WoqActQuantMode.PER_BATCH_IC_BLOCK,
+    "PER_TENSOR_SYM": ipex.quantization.WoqActQuantMode.PER_TENSOR_SYM,
+    "PER_IC_BLOCK_SYM": ipex.quantization.WoqActQuantMode.PER_IC_BLOCK_SYM,
+    "PER_BATCH_SYM": ipex.quantization.WoqActQuantMode.PER_BATCH_SYM,
+    "PER_BATCH_IC_BLOCK_SYM": ipex.quantization.WoqActQuantMode.PER_BATCH_IC_BLOCK_SYM,
 }
 weight_qscheme = (
     WoqWeightQScheme.SYMMETRIC
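
The help text above describes the activation quantization granularities used when lowp-mode=INT8. Below is a minimal conceptual sketch of the new default, PER_BATCH_IC_BLOCK_SYM, on a [batch, IC] activation: one symmetric int8 scale per 1 x IC_BLOCK block, zero point fixed at 0. The function name, the ic_block value, and the int8 clamping range are illustrative assumptions only; IPEX determines IC_BLOCK from IC automatically and its kernels do not take this form.

# Conceptual sketch of PER_BATCH_IC_BLOCK_SYM (illustrative, not the IPEX kernel):
# split each batch row into blocks of IC_BLOCK input channels and give every
# block its own symmetric int8 scale (zero point is always 0).
import torch


def quantize_per_batch_ic_block_sym(x: torch.Tensor, ic_block: int = 64):
    # x: [batch, IC] activation; ic_block is an assumed block size here,
    # whereas IPEX derives the real IC_BLOCK from IC automatically.
    batch, ic = x.shape
    assert ic % ic_block == 0, "sketch assumes IC is a multiple of ic_block"
    blocks = x.reshape(batch, ic // ic_block, ic_block)
    # Symmetric quantization: scale from the per-block absolute maximum.
    scales = blocks.abs().amax(dim=-1, keepdim=True).clamp(min=1e-8) / 127.0
    q = torch.clamp(torch.round(blocks / scales), -127, 127).to(torch.int8)
    return q.reshape(batch, ic), scales.squeeze(-1)  # scales: [batch, IC // ic_block]


x = torch.randn(4, 256)
q, scales = quantize_per_batch_ic_block_sym(x)
print(q.shape, scales.shape)  # torch.Size([4, 256]) torch.Size([4, 4])

By contrast, PER_BATCH_SYM would use one scale per row and PER_TENSOR_SYM a single scale for the whole tensor; the asymmetric variants additionally carry a zero point alongside each scale.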

examples/cpu/llm/inference/distributed/run_generation_tp.py

Lines changed: 19 additions & 2 deletions
@@ -146,8 +146,17 @@
 )
 parser.add_argument(
     "--act-quant-mode",
-    choices=["PER_TENSOR", "PER_IC_BLOCK", "PER_BATCH", "PER_BATCH_IC_BLOCK"],
-    default="PER_IC_BLOCK",
+    choices=[
+        "PER_TENSOR",
+        "PER_IC_BLOCK",
+        "PER_BATCH",
+        "PER_BATCH_IC_BLOCK",
+        "PER_TENSOR_SYM",
+        "PER_IC_BLOCK_SYM",
+        "PER_BATCH_SYM",
+        "PER_BATCH_IC_BLOCK_SYM",
+    ],
+    default="PER_BATCH_IC_BLOCK_SYM",
     type=str,
     help="Quantization mode for activation with different granularity. "
     "For lowp-mode=INT8 only. For other cases, it has no effect. "
@@ -156,6 +165,10 @@
     "PER_IC_BLOCK(1): quantize per group along IC with group size = IC_BLOCK; "
     "PER_BATCH(2): quantize per batch; "
     "PER_BATCH_IC_BLOCK(3): quantize per block of size 1 x IC_BLOCK. "
+    "PER_TENSOR_SYM(4): symmetrically quantize per tensor; "
+    "PER_IC_BLOCK_SYM(5): symmetrically quantize per group along IC with group size = IC_BLOCK; "
+    "PER_BATCH_SYM(6): symmetrically quantize per batch; "
+    "PER_BATCH_IC_BLOCK_SYM(7): symmetrically quantize per block of size 1 x IC_BLOCK. "
     "IC_BLOCK is determined by IC automatically.",
 )
 parser.add_argument(
@@ -339,6 +352,10 @@ def trace_handler(prof):
     "PER_IC_BLOCK": ipex.quantization.WoqActQuantMode.PER_IC_BLOCK,
     "PER_BATCH": ipex.quantization.WoqActQuantMode.PER_BATCH,
     "PER_BATCH_IC_BLOCK": ipex.quantization.WoqActQuantMode.PER_BATCH_IC_BLOCK,
+    "PER_TENSOR_SYM": ipex.quantization.WoqActQuantMode.PER_TENSOR_SYM,
+    "PER_IC_BLOCK_SYM": ipex.quantization.WoqActQuantMode.PER_IC_BLOCK_SYM,
+    "PER_BATCH_SYM": ipex.quantization.WoqActQuantMode.PER_BATCH_SYM,
+    "PER_BATCH_IC_BLOCK_SYM": ipex.quantization.WoqActQuantMode.PER_BATCH_IC_BLOCK_SYM,
 }
 weight_qscheme = (
     WoqWeightQScheme.SYMMETRIC

examples/cpu/llm/inference/distributed/run_generation_with_deepspeed.py

Lines changed: 19 additions & 2 deletions
@@ -156,8 +156,17 @@
 )
 parser.add_argument(
     "--act-quant-mode",
-    choices=["PER_TENSOR", "PER_IC_BLOCK", "PER_BATCH", "PER_BATCH_IC_BLOCK"],
-    default="PER_IC_BLOCK",
+    choices=[
+        "PER_TENSOR",
+        "PER_IC_BLOCK",
+        "PER_BATCH",
+        "PER_BATCH_IC_BLOCK",
+        "PER_TENSOR_SYM",
+        "PER_IC_BLOCK_SYM",
+        "PER_BATCH_SYM",
+        "PER_BATCH_IC_BLOCK_SYM",
+    ],
+    default="PER_BATCH_IC_BLOCK_SYM",
     type=str,
     help="Quantization mode for activation with different granularity. "
     "For lowp-mode=INT8 only. For other cases, it has no effect. "
@@ -166,6 +175,10 @@
     "PER_IC_BLOCK(1): quantize per group along IC with group size = IC_BLOCK; "
     "PER_BATCH(2): quantize per batch; "
     "PER_BATCH_IC_BLOCK(3): quantize per block of size 1 x IC_BLOCK. "
+    "PER_TENSOR_SYM(4): symmetrically quantize per tensor; "
+    "PER_IC_BLOCK_SYM(5): symmetrically quantize per group along IC with group size = IC_BLOCK; "
+    "PER_BATCH_SYM(6): symmetrically quantize per batch; "
+    "PER_BATCH_IC_BLOCK_SYM(7): symmetrically quantize per block of size 1 x IC_BLOCK. "
     "IC_BLOCK is determined by IC automatically.",
 )
 parser.add_argument(
@@ -489,6 +502,10 @@ def write_checkpoints_json():
     "PER_IC_BLOCK": ipex.quantization.WoqActQuantMode.PER_IC_BLOCK,
     "PER_BATCH": ipex.quantization.WoqActQuantMode.PER_BATCH,
     "PER_BATCH_IC_BLOCK": ipex.quantization.WoqActQuantMode.PER_BATCH_IC_BLOCK,
+    "PER_TENSOR_SYM": ipex.quantization.WoqActQuantMode.PER_TENSOR_SYM,
+    "PER_IC_BLOCK_SYM": ipex.quantization.WoqActQuantMode.PER_IC_BLOCK_SYM,
+    "PER_BATCH_SYM": ipex.quantization.WoqActQuantMode.PER_BATCH_SYM,
+    "PER_BATCH_IC_BLOCK_SYM": ipex.quantization.WoqActQuantMode.PER_BATCH_IC_BLOCK_SYM,
 }
 weight_qscheme = (
     WoqWeightQScheme.SYMMETRIC

examples/cpu/llm/inference/run.py

Lines changed: 1 addition & 1 deletion
@@ -195,7 +195,7 @@ def main(args_in: Optional[List[str]] = None) -> None:
             "PER_BATCH_SYM",
             "PER_BATCH_IC_BLOCK_SYM",
         ],
-        default="PER_IC_BLOCK",
+        default="PER_BATCH_IC_BLOCK_SYM",
         type=str,
         help="Quantization mode for activation with different granularity. "
         "For lowp-mode=INT8 only. For other cases, it has no effect. "

examples/cpu/llm/inference/single_instance/run_quantization.py

Lines changed: 1 addition & 1 deletion
@@ -230,7 +230,7 @@
         "PER_BATCH_SYM",
         "PER_BATCH_IC_BLOCK_SYM",
     ],
-    default="PER_IC_BLOCK",
+    default="PER_BATCH_IC_BLOCK_SYM",
     type=str,
     help="Quantization mode for activation with different granularity. "
     "For lowp-mode=INT8 only. For other cases, it has no effect. "

intel_extension_for_pytorch/quantization/_qconfig.py

Lines changed: 3 additions & 3 deletions
@@ -188,7 +188,7 @@ def get_weight_only_quant_qconfig_mapping(
     *,
     weight_dtype: int = WoqWeightDtype.INT8,
     lowp_mode: int = WoqLowpMode.NONE,
-    act_quant_mode: int = WoqActQuantMode.PER_IC_BLOCK,
+    act_quant_mode: int = WoqActQuantMode.PER_BATCH_IC_BLOCK_SYM,
     group_size: int = -1,
     weight_qscheme: int = WoqWeightQScheme.UNDEFINED,
 ):
@@ -222,8 +222,8 @@ def get_weight_only_quant_qconfig_mapping(
             No grouping along IC for weight. For activation,
             IC_BLOCK is determined automatically by IC.
         If group_size > 0:
-            act_quant_mode can be any. If act_quant_mode is PER_IC_BLOCK
-            or PER_BATCH_IC_BLOCK, weight is grouped along IC by group_size.
+            act_quant_mode can be any. If act_quant_mode is PER_IC_BLOCK(_SYM)
+            or PER_BATCH_IC_BLOCK(_SYM), weight is grouped along IC by group_size.
             The IC_BLOCK for activation is determined by group_size automatically.
             Each group has its own quantization parameters.
     weight_qscheme: Specify how to quantize weight, asymmetrically or symmetrically. Generally,
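
For reference, a minimal usage sketch of get_weight_only_quant_qconfig_mapping after this change. It assumes the Woq* enums are importable from intel_extension_for_pytorch.quantization (the example scripts above reach them via ipex.quantization) and that WoqLowpMode.INT8 selects the lowp-mode=INT8 path mentioned in the help text; only the qconfig construction is shown, not the subsequent model optimization.

# Sketch: with the new default, the two mappings below are equivalent.
import intel_extension_for_pytorch as ipex
from intel_extension_for_pytorch.quantization import (
    WoqActQuantMode,
    WoqLowpMode,
    WoqWeightDtype,
)

# Relies on the new default act_quant_mode (PER_BATCH_IC_BLOCK_SYM).
qconfig_default = ipex.quantization.get_weight_only_quant_qconfig_mapping(
    weight_dtype=WoqWeightDtype.INT8,
    lowp_mode=WoqLowpMode.INT8,  # act_quant_mode only takes effect for lowp-mode=INT8
)

# Spells the default out explicitly.
qconfig_explicit = ipex.quantization.get_weight_only_quant_qconfig_mapping(
    weight_dtype=WoqWeightDtype.INT8,
    lowp_mode=WoqLowpMode.INT8,
    act_quant_mode=WoqActQuantMode.PER_BATCH_IC_BLOCK_SYM,
)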

tests/cpu/test_quantization_default_recipe.py

Lines changed: 1 addition & 1 deletion
@@ -2155,7 +2155,7 @@ def test(feature, has_bias, w_dtype, lowp_mode, enable_amp):
                 None,
                 group_size,
                 lowp_mode,
-                WoqActQuantMode.PER_IC_BLOCK,
+                WoqActQuantMode.PER_BATCH_IC_BLOCK_SYM,
                 compensation,
             )
             torch.testing.assert_close(output, output_ref)
