
Commit 308e5e2

Merge pull request #3 from BrandonGroth/mx_impl_brandon
fix: Setting mx_specs outside qconfig_init
2 parents dd17104 + 5694a5a commit 308e5e2

File tree

6 files changed: +55 -9 lines changed


.spellcheck-en-custom.txt

Lines changed: 4 additions & 1 deletion
@@ -117,4 +117,7 @@ venv
 vllm
 xs
 zp
-
+microxcaling
+MX
+MXINT
+MXFP

examples/MX/README.md

Lines changed: 2 additions & 2 deletions
@@ -1,6 +1,6 @@
 # `microscaling` Examples Using a Toy Model and Direct Quantization (DQ)
 Here, we provide two simple examples of using MX format in `fms-mo`.
-"MX format", such as `MXFP8`, is a different format compared to typical IEEE formats, e.g. PyTorch FP8s (`e4m3` or `e5m2`, see our other [FP8 example](../FP8_QUANT/README.md).) Mainly all the `mx` format are group-based where each member of the group is using the specified format, e.g. FP8 for MXFP8 while each group has a shared (usualy 8-bit) "scale". Group size could be as small as 32 or 16, depending on hardware design.
+"MX format", such as `MXFP8`, is a different format compared to typical IEEE formats, e.g. PyTorch FP8s (`e4m3` or `e5m2`, see our other [FP8 example](../FP8_QUANT/README.md).) Mainly all the `mx` format are group-based where each member of the group is using the specified format, e.g. FP8 for MXFP8 while each group has a shared (usually 8-bit) "scale". Group size could be as small as 32 or 16, depending on hardware design.
 > [!NOTE]
 It is important to keep in mind that `mx` is not natively supported by Hopper GPUs yet (some will be supported by Blackwell), which means the quantization configurations and corresponding behavior are simulated, i.e. no real "speed up" should be expected.

@@ -23,7 +23,7 @@ Expected output includes:

 ```

-The second example is the same as in the [DQ](../DQ_SQ/README.md) folder, except using [microscaling](https://arxiv.org/abs/2310.10537) format. We demonstrate the effect of MXINT8, MXFP8, MXFP6, MXFP4 for weights, activations, and/or KV-cache.
+The second example is the same as in the [DQ](../DQ_SQ/README.md) folder, except using [microxcaling](https://arxiv.org/abs/2310.10537) format. We demonstrate the effect of MXINT8, MXFP8, MXFP6, MXFP4 for weights, activations, and/or KV-cache.

 **1. Prepare Data** for calibration process by converting into its tokenized form. An example of tokenization using `LLAMA-3-8B`'s tokenizer is below.

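The group-plus-shared-scale idea described in this README can be made concrete with a small simulation. Below is a minimal PyTorch sketch of MXINT8-style quantization: groups of 32 elements share one power-of-two scale while each member is stored in the low-precision element format. The function name `fake_quant_mx_groups` and the exact scale derivation are illustrative assumptions, not fms-mo or microxcaling API.

```python
# Minimal MX-style group quantization sketch (simulation only).
# Group size 32 and the shared power-of-two scale follow the README's
# description; all details here are assumptions, not fms-mo API.
import torch

def fake_quant_mx_groups(x: torch.Tensor, group_size: int = 32) -> torch.Tensor:
    """Quantize-dequantize x with one shared scale per group of elements."""
    groups = x.reshape(-1, group_size)
    # One shared power-of-two scale per group, derived from the group's
    # largest magnitude (this plays the role of the ~8-bit shared "scale").
    max_abs = groups.abs().amax(dim=1, keepdim=True).clamp(min=1e-12)
    scale = torch.pow(2.0, torch.floor(torch.log2(max_abs)) - 6)
    # Every group member is rounded into the INT8 element format.
    q = torch.clamp(torch.round(groups / scale), -128, 127)
    return (q * scale).reshape(x.shape)

x = torch.randn(4, 64)
print((x - fake_quant_mx_groups(x)).abs().max())  # small round-off error
```

Since this is all simulated tensor math, it also illustrates why no real speed-up should be expected on hardware without native MX support.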

fms_mo/prep.py

Lines changed: 13 additions & 1 deletion
@@ -28,7 +28,8 @@
 from fms_mo.calib import qmodel_calib
 from fms_mo.modules import QBmm_modules, QConv2d_modules, QLinear_modules, QLSTM_modules
 from fms_mo.quant.quantizers import Qbypass
-from fms_mo.utils.qconfig_utils import check_config, qconfig_save
+from fms_mo.utils.import_utils import available_packages
+from fms_mo.utils.qconfig_utils import check_config, qconfig_save, set_mx_specs
 from fms_mo.utils.utils import prepare_inputs

 # import numpy as np # only used in experimental func
@@ -197,6 +198,17 @@ def make_quant_module(module, curr_full_name, qcfg, verbose=False):
     qa_mode = qcfg.get("qa_mode", "pact+")
     qw_mode = qcfg.get("qw_mode", "sawb+")

+    # Check if MX has been set outside of qconfig_init without mx_specs being created
+    if (
+        available_packages["mx"]
+        and "mx_specs" not in qcfg
+        and (
+            (qcfg["qa_mode"].startswith("mx_") and qcfg["qw_mode"].startswith("mx_"))
+            or any(key.startswith("mx_") for key in qcfg.keys())
+        )
+    ):
+        set_mx_specs(qcfg, use_mx=True)
+
     # check if on "black list" (need to be exact match), can be skipped or quantized those
     # to slightly higher "default" precision, or use qspecial_layers to have fine control
     if curr_full_name in qcfg["qskip_layer_name"]:
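In user-facing terms, the guard added above covers configs where MX modes are assigned after `qconfig_init` returns. A minimal sketch of that flow, mirroring the test added in this commit (`qconfig_init` and `qmodel_prep` are the entry points the test exercises; the `from fms_mo import ...` path is an assumption):

```python
# Sketch of the scenario the new guard in make_quant_module handles.
# The flow mirrors this commit's test; import path is an assumption.
import torch
from fms_mo import qconfig_init, qmodel_prep  # assumed import path

qcfg = qconfig_init()              # default config, no MX settings yet
qcfg["qa_mode"] = "mx_fp8_e5m2"    # MX modes set *outside* qconfig_init
qcfg["qw_mode"] = "mx_fp8_e5m2"
assert "mx_specs" not in qcfg      # mx_specs was never created

model = torch.nn.Linear(128, 128)
qmodel_prep(model, torch.randn(16, 128), qcfg, use_dynamo=True)

assert "mx_specs" in qcfg  # the guard called set_mx_specs(qcfg, use_mx=True)
```

Note the guard only fires when the `mx` package is importable (`available_packages["mx"]`) and `mx_specs` is absent, so configs that already enabled MX inside `qconfig_init` are untouched.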

fms_mo/utils/qconfig_utils.py

Lines changed: 8 additions & 4 deletions
@@ -57,7 +57,7 @@ def config_defaults():
         ("bmm1_qm1_mode", "pact"),
         ("bmm1_qm2_mode", "pact"),
         ("bmm2_qm1_mode", "pact"),
-        ("bmm1_qm2_mode", "pact"),
+        ("bmm2_qm2_mode", "pact"),
         # mode_calib vars
         ("qa_mode_calib", "percentile"),
         ("qw_mode_calib", "percentile"),
@@ -1193,10 +1193,14 @@ def check_config(config, model_dtype=None):
     # 1. can use .func pointer to find the original class
     # 2. QBmm is optional, could be partial(QBmmMX,) or QBmm
     if mapping is not None:
-        if not mapping[nn.Linear].func is QLinearMX:
+        if mapping[nn.Linear].func is not QLinearMX:
            raise ValueError("MX mapping for nn.Linear is not QLinearMX")

         qbmm_map = mapping["matmul_or_bmm"]
-        if not (qbmm_map is QBmm or getattr(qbmm_map, "func", None) is QBmmMX):
-            raise ValueError("MX mapping for matmul_or_bmm is not QBmmMX")
+        if bmm_mode_consistency > 0:
+            if getattr(qbmm_map, "func", None) is not QBmmMX:
+                raise ValueError("MX mapping for matmul_or_bmm is not QBmmMX")
+        else:
+            if qbmm_map is not QBmm:
+                raise ValueError("Mapping for matmul_or_bmm is not QBmm")
     # End mx_specs checks
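The one-character `config_defaults` fix is easy to underrate: the defaults are written as key/value pairs, so a duplicated `bmm1_qm2_mode` key silently collapses and `bmm2_qm2_mode` never receives a default. A minimal illustration, assuming the pairs are ultimately folded into a dict:

```python
# Duplicate keys collapse when pairs become a dict, so the misspelled
# tuple left "bmm2_qm2_mode" without any default (hypothetical fold).
pairs_before_fix = [
    ("bmm1_qm1_mode", "pact"),
    ("bmm1_qm2_mode", "pact"),
    ("bmm2_qm1_mode", "pact"),
    ("bmm1_qm2_mode", "pact"),  # typo: repeats bmm1_qm2_mode
]
defaults = dict(pairs_before_fix)
print("bmm2_qm2_mode" in defaults)  # False: the default was missing
print(len(defaults))                # 3, not 4
```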

patches/README.md

Lines changed: 1 addition & 1 deletion
@@ -15,7 +15,7 @@ To make a git diff patch file, first make your desired changes to the repository
 ```
 git diff > <package>.patch
 ```
-Packages may include files that differ by whitespaces even if you didn't change them.
+Packages may include files that differ by white spaces even if you didn't change them.
 To address this, add `--ignore-all-spaces` to the `git diff` command.

 To test the patch file, copy the `<package>.patch` file to `fms-model-optimizer/patches`.

tests/models/test_mx.py

Lines changed: 27 additions & 0 deletions
@@ -134,3 +134,30 @@ def test_residualMLP(
             assert module.mx_specs["a_elem_format"] == mx_format

     assert found_qmodule_mx
+
+
+@pytest.mark.skipif(
+    not available_packages["mx"],
+    reason="Skipping mx_specs error test; No package found",
+)
+def test_mx_specs_after_qconfig_init(
+    model_residualMLP: torch.nn.Module,
+    input_residualMLP: torch.FloatTensor,
+    config_fp32: dict,
+):
+    """
+    Test if a default config w/ MX qmodes trigger setting mx_specs inside qmodel_prep
+
+    Args:
+        model_residualMLP (torch.nn.Module): Single fp32 model.
+        input_residualMLP (torch.FloatTensor): Random 16x128 tensor.
+        config_fp32 (dict): Config w/ fp32 settings.
+    """
+    config_fp32["qa_mode"] = "mx_fp8_e5m2"
+    config_fp32["qw_mode"] = "mx_fp8_e5m2"
+
+    assert "mx_specs" not in config_fp32
+
+    qmodel_prep(model_residualMLP, input_residualMLP, config_fp32, use_dynamo=True)
+
+    assert "mx_specs" in config_fp32
