
Commit 4a1201e

Merge branch 'main' into int-triton-kernel-adj
Signed-off-by: chichun-charlie-liu <[email protected]>
2 parents 553c7a6 + 418f682 commit 4a1201e

7 files changed: 126 additions, 10 deletions


.spellcheck-en-custom.txt

Lines changed: 8 additions & 1 deletion

```diff
@@ -1,6 +1,10 @@
 activations
 acc
 ADR
+aiu
+AIU
+Spyre
+spyre
 Args
 AutoGPTQ
 autoregressive
@@ -91,8 +95,11 @@ quantizes
 Quantizing
 QW
 rceil
+recomputation
 repo
 representable
+roberta
+RoBERTa
 runtime
 Runtime
 SAWB
@@ -112,9 +119,9 @@ Tokenizer
 toml
 triton
 Unquantized
+utils
 vals
 venv
 vllm
 xs
 zp
-
```

examples/AIU_CONVERSION/README.md

Lines changed: 69 additions & 0 deletions (new file; full contents below)

# Train and prepare INT8 checkpoint for the AIU using Direct Quantization

This example builds on the [Direct Quantization (DQ) example](../DQ_SQ/README.md). We assume the user is already familiar with the DQ quantization process and would like to generate an INT8-quantized checkpoint that is compliant with the requirements of the AIU/Spyre accelerator.

Once created, this checkpoint can be run on the AIU using an inference script from [aiu-fms-testing-utils](https://github.com/foundation-model-stack/aiu-fms-testing-utils).

For more information on the AIU/Spyre accelerator, see the following blogs:
- [Introducing the IBM Spyre AI Accelerator chip](https://research.ibm.com/blog/spyre-for-z)
- [IBM Power modernizes infrastructure and accelerates innovation with AI in the year ahead](https://newsroom.ibm.com/blog-ibm-power-modernizes-infrastructure-and-accelerates-innovation-with-ai-in-the-year-ahead)

## Requirements

- [FMS Model Optimizer requirements](../../README.md#requirements)

## QuickStart

**1. Prepare Data** as per the DQ quantization process ([link](../DQ_SQ/README.md)). In this example, we assume the user wants to quantize the RoBERTa-base model and has prepared the DQ data for it, stored under the folders `data_train` and `data_test`, by adapting the DQ example accordingly.

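As a rough illustration only (the authoritative steps live in the DQ example), the two folders could be produced along these lines; the corpus (`wikitext-2`), sequence length, and column names below are assumptions for the sake of the sketch, not part of this repository:

```python
# Hypothetical data-preparation sketch; follow the DQ example for the real steps.
from datasets import load_dataset
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("roberta-base")
raw = load_dataset("wikitext", "wikitext-2-raw-v1")  # illustrative corpus choice

def tokenize(batch):
    return tokenizer(batch["text"], truncation=True, padding="max_length", max_length=512)

tokenized = raw.map(tokenize, batched=True, remove_columns=["text"])
tokenized["train"].save_to_disk("data_train")  # consumed via --training_data_path
tokenized["test"].save_to_disk("data_test")    # consumed via --test_data_path
```
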
**2. Apply DQ with conversion** by providing the desired quantization parameters, as well as the flags `--save_ckpt_for_aiu` and `--recompute_narrow_weights`.

```bash
python -m fms_mo.run_quant \
    --model_name_or_path "roberta-base" \
    --training_data_path data_train \
    --test_data_path data_test \
    --torch_dtype "float16" \
    --quant_method dq \
    --nbits_w 8 \
    --nbits_a 8 \
    --nbits_kvcache 32 \
    --qa_mode "pertokenmax" \
    --qw_mode "maxperCh" \
    --qmodel_calibration_new 1 \
    --output_dir "dq_test" \
    --save_ckpt_for_aiu \
    --recompute_narrow_weights
```

> [!TIP]
> - In this example we do not evaluate the perplexity of the quantized model; if desired, add the `--eval_ppl` flag.
> - We use a single calibration example because the quantizers in use do not need calibration: weights remain static during DQ, so a single example initializes the quantizer correctly, and the `pertokenmax` activation quantizer dynamically recomputes the quantization range at inference time, when running on the AIU (see the illustrative sketch below).

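For intuition, here is a minimal sketch (not fms_mo code) of what a per-token symmetric max quantizer does; the function name and tensor shapes are illustrative:

```python
# Conceptual illustration of per-token symmetric int8 quantization ("pertokenmax"-style).
import torch

def quantize_per_token_max(x: torch.Tensor):
    # x: [num_tokens, hidden]; each token row gets its own scale from its max magnitude,
    # so the range is recomputed dynamically and no calibrated clip value is needed.
    qmax = 127
    scale = x.abs().amax(dim=-1, keepdim=True).clamp(min=1e-8) / qmax
    x_int8 = torch.clamp(torch.round(x / scale), -128, 127).to(torch.int8)
    return x_int8, scale

acts = torch.randn(4, 768)
acts_int8, scales = quantize_per_token_max(acts)
acts_dq = acts_int8.float() * scales  # dequantized approximation of the original activations
```
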
**3. Reload checkpoint for testing** and validate its content (optional).

```python
import torch

sd = torch.load("dq_test/qmodel_for_aiu.pt", weights_only=True)
```

Check that all quantized layers have been converted to `torch.int8`, while the rest remain `torch.float16`.

```python
# select quantized layers by name
roberta_qlayers = [
    "attention.self.query",
    "attention.self.key",
    "attention.self.value",
    "attention.output.dense",
    "intermediate.dense",
    "output.dense",
]
# assert all quantized weights are int8
assert all(
    v.dtype == torch.int8
    for k, v in sd.items()
    if any(n in k for n in roberta_qlayers) and k.endswith(".weight")
)
# assert all other parameters are fp16
assert all(
    v.dtype == torch.float16
    for k, v in sd.items()
    if all(n not in k for n in roberta_qlayers) or not k.endswith(".weight")
)
```

> [!TIP]
> - We trained the model with a symmetric quantizer for activations (`qa_mode`). If an asymmetric quantizer is used, the checkpoint will also carry `zero_shift` parameters, which are `torch.float32`, so this validation step should be modified accordingly (a possible adjustment is sketched below).

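For example, a possible adjustment of the dtype checks (the key matching on `zero_shift` is an assumption about how such entries are named in the checkpoint):

```python
# Hypothetical adjustment for an asymmetric activation quantizer:
# exempt float32 zero_shift entries from the float16 check above.
zero_shift_keys = {k for k in sd if "zero_shift" in k}
assert all(sd[k].dtype == torch.float32 for k in zero_shift_keys)
assert all(
    v.dtype == torch.float16
    for k, v in sd.items()
    if k not in zero_shift_keys
    and (all(n not in k for n in roberta_qlayers) or not k.endswith(".weight"))
)
```
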
Because we used the `--recompute_narrow_weights` option along with a `maxperCh` (max per-channel) weight quantizer, the distributions of the INT weight matrices have been widened. Most per-channel standard deviations should surpass the empirical threshold of 20.

```python
[
    f"{v.to(torch.float32).std(dim=-1).mean():.4f}"
    for k, v in sd.items()
    if k.endswith(".weight") and any(n in k for n in roberta_qlayers)
]
```

> [!TIP]
> - We cast the `torch.int8` weights to `torch.float32` so that `torch.std` can be applied.
> - For per-channel weights, the recomputation is applied per channel. Here we print the mean across channels for ease of visualization; a per-channel variant is sketched below.
> - The recomputed weights are not guaranteed to exceed the empirical threshold, but they do for several common models in the BERT, RoBERTa, Llama, and Granite families.
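
If a finer-grained view than the mean is desired, the fraction of channels per layer that clears the threshold can also be reported. This is a small sketch reusing `sd` and `roberta_qlayers` from above; the threshold of 20 is the empirical value quoted earlier.

```python
# Optional finer-grained check: fraction of channels per quantized layer whose
# per-channel std exceeds the empirical threshold of 20 (not guaranteed to be 100%).
for k, v in sd.items():
    if k.endswith(".weight") and any(n in k for n in roberta_qlayers):
        std_per_ch = v.to(torch.float32).std(dim=-1)
        print(f"{k}: {(std_per_ch > 20).float().mean().item():.1%} of channels above 20")
```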

fms_mo/dq.py

Lines changed: 19 additions & 6 deletions

```diff
@@ -43,6 +43,7 @@
     get_act_scales,
     get_act_scales_1gpu,
 )
+from fms_mo.utils.aiu_utils import save_for_aiu
 from fms_mo.utils.dq_utils import config_quantize_smooth_layers
 from fms_mo.utils.eval_utils import Evaluator, eval_llm_1GPU
 from fms_mo.utils.utils import patch_torch_bmm, prepare_input
@@ -172,7 +173,7 @@ def run_dq(model_args, data_args, opt_args, fms_mo_args):
 
     qcfg["seq_len"] = block_size
     qcfg["model"] = model_args.model_name_or_path
-    qcfg["smoothq"] = fms_mo_args.smoothq_alpha != -1
+    qcfg["smoothq"] = qcfg.get("smoothq_alpha", -1) >= 0
     qcfg["plotsvg"] = False
 
     calibration_dataset = load_from_disk(data_args.training_data_path)
@@ -199,7 +200,11 @@ def run_dq(model_args, data_args, opt_args, fms_mo_args):
         scale_file.parent.mkdir(exist_ok=False)
 
     if scale_file.exists():
-        act_scales = torch.load(scale_file, map_location=getattr(model, "device", dev))
+        act_scales = torch.load(
+            scale_file,
+            map_location=getattr(model, "device", dev),
+            weights_only=True,
+        )
     else:
         logger.info("Generate activation scales")
         if qcfg["large_model"]:
@@ -217,11 +222,13 @@ def run_dq(model_args, data_args, opt_args, fms_mo_args):
         save_fname="dq",
     )
     logger.info(f"Quantized model {model}")
+    logger.info("==" * 20)
+
     if qcfg["smoothq"]:
         logger.info("Starting to apply smooth scale")
         dq_llm(model, act_scales, qcfg)
         logger.info("Finished applying smooth scale")
-        logger.info("==" * 20)
+
     if qcfg["qmodel_calibration_new"] > 0:
         logger.info("Starting to calibrate activation clip_val")
         if qcfg["large_model"]:
@@ -238,9 +245,15 @@ def run_dq(model_args, data_args, opt_args, fms_mo_args):
         with patch_torch_bmm(qcfg):
             model(**data_mb)
 
-    logger.info(f"Saving quantized model and tokenizer to {opt_args.output_dir}")
-    model.save_pretrained(opt_args.output_dir, use_safetensors=True)
-    tokenizer.save_pretrained(opt_args.output_dir)
+    if opt_args.save_ckpt_for_aiu:
+        logger.info(
+            f"Saving model processed for AIU and tokenizer to {opt_args.output_dir}"
+        )
+        save_for_aiu(model, qcfg, output_dir=opt_args.output_dir, verbose=True)
+    elif opt_args.save_ckpt:
+        logger.info(f"Saving quantized model and tokenizer to {opt_args.output_dir}")
+        model.save_pretrained(opt_args.output_dir, use_safetensors=True)
+        tokenizer.save_pretrained(opt_args.output_dir)
 
     if fms_mo_args.eval_ppl:
         path_test = Path(data_args.test_data_path)
```

fms_mo/run_quant.py

Lines changed: 5 additions & 3 deletions

```diff
@@ -315,9 +315,11 @@ def main():
 
         logger = set_log_level(opt_args.log_level, __name__)
 
-        logger.debug(f"Input args parsed: \nmodel_args {model_args}, data_args {data_args}, \
-            opt_args {opt_args}, fms_mo_args {fms_mo_args}, gptq_args {gptq_args}, \
-            fp8_args {fp8_args}")
+        logger.debug(
+            f"Input args parsed: \nmodel_args {model_args}, data_args {data_args}, "
+            f"opt_args {opt_args}, fms_mo_args {fms_mo_args}, gptq_args {gptq_args}, "
+            f"fp8_args {fp8_args}"
+        )
     except Exception as e:  # pylint: disable=broad-except
         logger.error(traceback.format_exc())
         write_termination_log(
```

fms_mo/training_args.py

Lines changed: 12 additions & 0 deletions

```diff
@@ -138,6 +138,14 @@ class OptArguments(TypeChecker):
         default="INFO",
         metadata={"help": "The log level to adopt during optimization."},
     )
+    save_ckpt: bool = field(
+        default=True,
+        metadata={"help": "Save quantized checkpoint."},
+    )
+    save_ckpt_for_aiu: bool = field(
+        default=False,
+        metadata={"help": "Prepare and save AIU-compliant checkpoint."},
+    )
 
 
 @dataclass
@@ -176,6 +184,10 @@ class FMSMOArguments(TypeChecker):
     aiu_sim_triton: bool = field(
         default=False, metadata={"help": ("AIU simulation with triton kernel")}
     )
+    recompute_narrow_weights: bool = field(
+        default=False,
+        metadata={"help": "Apply recomputation during checkpoint saving for AIU."},
+    )
 
 
 @dataclass
```

fms_mo/utils/dq_utils.py

Lines changed: 10 additions & 0 deletions

```diff
@@ -115,5 +115,15 @@ def config_quantize_smooth_layers(qcfg: dict):
         qcfg["smoothq_act_scale_path"] = "./act_scales/graniteCodeHF_34b_base12.pt"
         if "granite-34b-code-instruct" in qcfg["model"]:
             qcfg["smoothq_act_scale_path"] = "./act_scales/graniteCodeHF_34b_base12.pt"
+    elif "roberta" in qcfg["model"]:
+        qcfg["act_scale_path"] = "./act_scales"
+        qcfg["smoothq_scale_layers"] = [
+            "attention.self.query",
+            "attention.self.key",
+            "attention.self.value",
+            "intermediate.dense",
+        ]
+        qcfg["qskip_layer_name"] = []
+        qcfg["qlayer_name_pattern"] = ["roberta.encoder"]
     else:
         raise ValueError("The model architecture is not supported for DQ.")
```

fms_mo/utils/qconfig_utils.py

Lines changed: 3 additions & 0 deletions

```diff
@@ -114,6 +114,7 @@ def config_defaults() -> dict:
         "extend_act_range": False,
         "plotsvg": False,
         "qskip_large_mag_layers": False,
+        "recompute_narrow_weights": False,
         # Iterable vars
         "qlayer_name_pattern": [],
         "qskip_layer_name": [],
@@ -306,6 +307,7 @@ def qconfig_init(recipe: str = None, args: Any = None) -> dict:
     qcfg["qlayer_name_pattern"] = []
     qcfg["qskip_layer_name"] = []
     qcfg["qskip_large_mag_layers"] = False
+    qcfg["recompute_narrow_weights"] = False
     qcfg["qspecial_layers"] = {}
 
     # settings about quantizing bmm/matmul
@@ -878,6 +880,7 @@ def check_config(config: dict, model_dtype: torch.dtype = None) -> None:
         "ptq_freezecvs",
         "ptq_qdrop",
         "qskip_large_mag_layers",
+        "recompute_narrow_weights",
         "smoothq",
     ]
     for boolean_var_str in boolean_vars_str:
```
