
Commit 51d4cf1

Merge pull request #87 from chichun-charlie-liu/main
fix: fix QBmm detection and default behavior
2 parents dc2ad5d + f12edec commit 51d4cf1

File tree: 7 files changed (+165, -20 lines)

.gitignore

Lines changed: 1 addition & 0 deletions

@@ -45,3 +45,4 @@ fms_mo.log
 data*_train/
 data*_test/
 act_scales/
+examples/

fms_mo/fx/dynamo_utils.py

Lines changed: 61 additions & 6 deletions

@@ -32,6 +32,14 @@
 
 logger = logging.getLogger(__name__)
 
+# From PyTorch 2.5+, the graphModule received in a dynamo custom backend will be Aten IR instead
+# of FX IR, i.e. no "call_module" nodes, all parameter tensors become "placeholder" nodes, etc.
+# The following flag makes dynamo behave like PyTorch 2.4. Only use it when model_analyzer()
+# really stops working and is hard to recover.
+# Ref: https://pytorch.org/tutorials/recipes/regional_compilation.html
+
+# torch._dynamo.config.inline_inbuilt_nn_modules = False
+
 
 def run_fwd_once(model, sample_inp):
     """Convenient function to run model once using correct input unpack."""

@@ -836,14 +844,16 @@ def find_and_prep_bmm_gm(gm, lut_fx_mod_name_to_org: Optional[Dict[str, str]] =
         return_dict["which2patch_contextmanager"] = "torch.matmul"
         LUT2sort = all_matmuls
     else:
-        warn_msg = None
         if Nbmm_found > 0 and Nmatmul_found > 0:
-            warn_msg = "Both bmm and matmul are found. Not sure which to patch."
-        elif Nbmm_found == 0 and Nmatmul_found == 0 and len(all_sdpas) > 0:
-            warn_msg = "No bmm and matmul are found. Likely SDPA is enabled."
+            raise RuntimeError(
+                "Both bmm and matmul are found. Not sure which to patch."
+            )
+        if Nbmm_found == 0 and Nmatmul_found == 0 and len(all_sdpas) > 0:
+            logger.warning(
+                "No bmm and matmul are found. Likely SDPA is enabled. "
+                "Will patch nothing!"
+            )
 
-        if warn_msg:
-            logger.warning(f"{warn_msg} Will patch nothing.")
         return return_dict
 
     LUTmodname2linenum = {}  # see Note 4

@@ -1085,6 +1095,25 @@ def cus_backend_model_analyzer(
                 "which2patch_contextmanager"
             ]
             qcfg["bmm_prep"]["layers_with_bmm"].update(temp_dict["layers_with_bmm"])
+            # make sure there are ONLY 2 bmms per layer (self-attention). Some models may use
+            # additional bmm/matmuls; raise a warning if that's the case.
+            num_layers = len(temp_dict["layers_with_bmm"])
+            num_bmms = 0
+            seen_line_num = []
+            for line_nums in temp_dict["layers_with_bmm"].values():
+                num_bmms += len(line_nums)
+                for line_num in line_nums:
+                    if line_num not in seen_line_num:
+                        seen_line_num.append(line_num)
+            qcfg["bmm_prep"]["bmm_only_in_self_attn"] = True
+            if num_bmms != num_layers * 2 or len(seen_line_num) != 2:
+                qcfg["bmm_prep"]["bmm_only_in_self_attn"] = False
+                logger.warning(
+                    "This model uses additional matmul/bmm other than those in self-attention. "
+                    "If you plan to quantize self-attention, please note that the additional "
+                    "bmms may also be quantized!\n"
+                    f"{temp_dict['layers_with_bmm']}\n"
+                )
 
             # Check 7: QKV
             temp_dict = find_qkvsync_candidates_gm(

@@ -1213,6 +1242,32 @@ def call_seq_hook(mod, *_args, **_kwargs):
                )
                setattr(mod_bmm_happened, f"QBmm{ln}", newQBmm)
 
+        # add an auto QBmm check to the last layer if any QBmms are in the model (transformers only)
+        def qbmm_auto_check(_mod, *_args, **_kwargs):
+            """Automatic QBmm check. This hook will be attached to the last module and will check
+            only once, at the end of the first forward() call. Throws a "warning" if the model has
+            QBmm attached but not called (as that could be intentional).
+            """
+            num_called_qbmms = []
+            for lay, line_nums in qcfg["bmm_prep"]["layers_with_bmm"].items():
+                for ln in line_nums:
+                    qbmm_i = model.get_submodule(f"{lay}.QBmm{ln}")
+                    num_called_qbmms.append(qbmm_i.num_module_called == 1)
+
+            if not all(num_called_qbmms):
+                err_msg = (
+                    "QBmms were attached but not called during forward(). "
+                    "Possibly the patch_torch_bmm() context manager is missing."
+                )
+                if qcfg["force_stop_if_qbmm_auto_check_failed"]:
+                    raise RuntimeError(err_msg)
+                logger.warning(err_msg)
+
+            qcfg["hook_qbmm_auto_check"].remove()
+
+        last_mod = model.get_submodule(qcfg["mod_call_seq"][-1])
+        qcfg["hook_qbmm_auto_check"] = last_mod.register_forward_hook(qbmm_auto_check)
+
         # c) identify RPN/FPN
         # TODO this hack only works for torchvision models. will use find_rpn_fpn_gm()

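The auto check added above relies on a standard PyTorch idiom: register a forward hook on the last module in the call sequence, let it run once after the first forward(), then detach it through its handle. A minimal, standalone sketch of that idiom follows; the toy model and names here are illustrative, not part of fms_mo.

import torch
from torch import nn

toy_model = nn.Sequential(nn.Linear(4, 4), nn.Linear(4, 2))
state = {}

def run_once_check(_mod, *_args, **_kwargs):
    # fires after the first forward() of the hooked module, then removes itself,
    # mirroring how qbmm_auto_check calls qcfg["hook_qbmm_auto_check"].remove()
    print("post-forward check ran once")
    state["handle"].remove()

# attach to the last submodule, like hooking the last entry of qcfg["mod_call_seq"]
last_mod = list(toy_model.children())[-1]
state["handle"] = last_mod.register_forward_hook(run_once_check)

toy_model(torch.randn(1, 4))  # the check runs here
toy_model(torch.randn(1, 4))  # hook already removed, nothing printed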
fms_mo/utils/qconfig_utils.py

Lines changed: 2 additions & 0 deletions

@@ -198,6 +198,7 @@ def qconfig_init(recipe: str = None, args: Any = None):
     qcfg["which2patch_contextmanager"] = (
         None  # an internal var that should not be set by user
     )
+    qcfg["force_stop_if_qbmm_auto_check_failed"] = False
 
     # LSTM related, if any of these is not None, then last layer (FC) will not be skipped.
     qcfg["nbits_w_lstm"] = None

@@ -372,6 +373,7 @@ def remove_unwanted_from_config(config):
         "LUTmodule_name",
         "qkvsync_my_1st_sibling",
         "graph_in_out",
+        "hook_qbmm_auto_check",
     ]
     len_before = len(config)
     dump = {k: config.pop(k) for k in unwanted_items if k in config}

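Because the new qcfg["force_stop_if_qbmm_auto_check_failed"] entry defaults to False, the auto check only logs a warning. If a hard failure is preferred, the flag can be flipped on the config before tracing; a brief sketch, assuming the qconfig_init() entry point from this file is used to build the config:

from fms_mo.utils.qconfig_utils import qconfig_init

qcfg = qconfig_init()
# raise RuntimeError instead of only warning when QBmms are attached but never called
qcfg["force_stop_if_qbmm_auto_check_failed"] = True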
fms_mo/utils/utils.py

Lines changed: 24 additions & 12 deletions

@@ -71,7 +71,7 @@ def move_to(obj, device):
     return obj
 
 
-def mockbmm(mat1, mat2):
+def mockbmm(mat1, mat2, default_to_torch=False):
     """
     This function is used to mock the behavior of the bmm function in PyTorch.
     It is used to work around the fact that the bmm function in PyTorch is not

@@ -86,20 +86,23 @@ def mockbmm(mat1, mat2):
     """
     cf = sys._getframe()
     qbmm_mod = None
+    qbmm_lineno = cf.f_back.f_lineno
     while cf.f_back and qbmm_mod is None:
         # First frame is QBmm's forward itself, can start searching from previous stack
         cf = cf.f_back
-        if "forward" in cf.f_code.co_name or "_attn" in cf.f_code.co_name:
+        if (
+            "forward" in cf.f_code.co_name or "_attn" in cf.f_code.co_name
+        ) and "self" in cf.f_locals:
             mod_calling_bmm_function = cf.f_locals["self"]
             # If not found -> default to torch.matmul
-            qbmm_mod = getattr(
-                mod_calling_bmm_function, "QBmm" + str(cf.f_lineno), torch.matmul
-            )
+            qbmm_mod = getattr(mod_calling_bmm_function, f"QBmm{qbmm_lineno}", None)
     del cf
+    if qbmm_mod is None and default_to_torch:
+        qbmm_mod = torch.matmul
     return qbmm_mod(mat1, mat2)
 
 
-def mockmatmul(mat1, mat2):
+def mockmatmul(mat1, mat2, default_to_torch=False):
     """
     Patches torch.matmul() with QBmm( torch.bmm() )
 

@@ -109,31 +112,37 @@ def mockmatmul(mat1, mat2):
 
     Returns:
         torch.Tensor: The result of the mock matrix multiplication.
+    NOTE:
+    1. First frame is mockmatmul itself. One frame back (cf.f_back) is where torch.matmul
+       happened, whose line number is the one used for QBmm<xxx>.
+    2. The QBmm module may not be attached to the immediate frame where torch.matmul happened;
+       trace back to find the frame with both "forward" in its name and "self" in its locals,
+       i.e. a class (nn.Module) with a method named "forward".
+    3. Keep default_to_torch=False unless really needed, otherwise if something goes wrong with
+       QBmm detection it could fall back to the default silently, which is very hard to debug.
     """
     cf = sys._getframe()
     qbmm_mod = None
+    qbmm_lineno = cf.f_back.f_lineno
     while cf.f_back and qbmm_mod is None:
-        # First frame is QBmm's forward itself, can start searching from previous stack
         cf = cf.f_back
         if (
             "forward" in cf.f_code.co_name or "_attn" in cf.f_code.co_name
         ) and "self" in cf.f_locals:
             mod_calling_bmm_function = cf.f_locals["self"]
             # If not found -> default to torch.bmm
-            qbmm_mod = getattr(
-                mod_calling_bmm_function, "QBmm" + str(cf.f_lineno), torch.bmm
-            )
+            qbmm_mod = getattr(mod_calling_bmm_function, f"QBmm{qbmm_lineno}", None)
     del cf
 
     # Didn't find the corresponding QBmm, default the call to torch.bmm
-    if qbmm_mod == torch.bmm:
+    if qbmm_mod is None and default_to_torch:
         org_batch_header = mat1.shape[:2]
         # Need to double check m1/m2 are 3d, otherwise reshape
         if len(mat1.shape) > 3:
             mat1 = mat1.reshape([-1, mat1.shape[-2], mat1.shape[-1]])
         if len(mat2.shape) > 3:
             mat2 = mat2.reshape([-1, mat2.shape[-2], mat2.shape[-1]])
-        output = qbmm_mod(mat1, mat2)
+        output = torch.bmm(mat1, mat2)
         output = output.reshape([*org_batch_header, *output.shape[1:]])
         return output
     return qbmm_mod(mat1, mat2)

@@ -149,6 +158,9 @@ def patch_torch_bmm(qcfg):
     if qcfg is not None:
         # could be 'torch.bmm', 'torch.matmul', or None
         ops_to_patch = qcfg.get("which2patch_contextmanager", None)
+        # If qcfg["bmm_prep"]["bmm_only_in_self_attn"] is False, may need to enable default_to_torch
+        # in the mock functions, e.g. partial(mockmatmul, default_to_torch=True). This is in case a
+        # model uses extra matmuls and QBmmXXX is not found or attached properly.
         new_target = (
             mockbmm
             if ops_to_patch == "torch.bmm"

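The key change in mockbmm/mockmatmul is where the line number comes from: it is now captured once from the immediate caller (cf.f_back.f_lineno, i.e. the line that invoked torch.matmul/torch.bmm) before walking up the stack, and the lookup returns None instead of silently falling back to the torch op. A tiny standalone illustration of that frame introspection, with hypothetical names and no fms_mo code involved:

import sys

def where_was_i_called():
    # one frame back is the call site; its line number is what the mock functions
    # would use to build the "QBmm<lineno>" attribute name
    caller = sys._getframe().f_back
    return caller.f_code.co_name, caller.f_lineno

def forward():
    name, lineno = where_was_i_called()  # the lookup key would be f"QBmm{lineno}"
    print(f"called from {name}() at line {lineno}")

forward()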
tests/models/conftest.py

Lines changed: 13 additions & 0 deletions

@@ -1092,3 +1092,16 @@ def model_bert():
         transformers.models.bert.modeling_bert.BertModel: BERT model
     """
     return BertModel.from_pretrained("google-bert/bert-base-uncased", torchscript=True)
+
+
+@pytest.fixture(scope="function")
+def model_bert_eager():
+    """
+    Get a BERT model with eager attention implementation
+
+    Returns:
+        transformers.models.bert.modeling_bert.BertModel: BERT model
+    """
+    return BertModel.from_pretrained(
+        "google-bert/bert-base-uncased", torchscript=True, attn_implementation="eager"
+    )

tests/models/test_model_utils.py

Lines changed: 2 additions & 1 deletion

@@ -26,6 +26,7 @@
 import torch
 
 # Local
+from fms_mo.modules.bmm import QBmm
 from fms_mo.modules.conv import DetQConv2d, QConv2d, QConv2dPTQ, QConv2dPTQv2
 from fms_mo.modules.linear import QLinear
 from fms_mo.utils.qconfig_utils import serialize_config

@@ -99,7 +100,7 @@ def count_qmodules(model: torch.nn.Module):
     """
     torch_modules, fms_qmodules = [], []
     for n, m in model.named_modules():
-        if isinstance(m, (QConv2d, QLinear)):
+        if isinstance(m, (QConv2d, QLinear, QBmm)):
             fms_qmodules.append((n, m))
         elif isinstance(m, (Conv2d, Linear)):
             torch_modules.append((n, m))

tests/models/test_qmodelprep.py

Lines changed: 62 additions & 1 deletion

@@ -26,7 +26,8 @@
 # fms_mo imports
 from fms_mo import qmodel_prep
 from fms_mo.prep import has_quantized_module
-from tests.models.test_model_utils import delete_config, qmodule_error
+from fms_mo.utils.utils import patch_torch_bmm
+from tests.models.test_model_utils import count_qmodules, delete_config, qmodule_error
 
 ################
 # Qmodel tests #

@@ -257,3 +258,63 @@ def test_bert_dynamo(
     delete_config()
     qmodel_prep(model_bert, input_bert, config_int8, use_dynamo=True)
     qmodule_error(model_bert, 1, 72)
+
+
+def test_bert_dynamo_wi_qbmm(
+    model_bert_eager: transformers.models.bert.modeling_bert.BertModel,
+    input_bert: torch.FloatTensor,
+    config_int8: dict,
+):
+    """
+    Perform int8 quantization on BERT w/ Dynamo tracer and QBmm modules. QBmms will run in place
+    of torch.matmul/torch.bmm automatically, if everything is set up correctly. See the 3 checks
+    below for more details.
+    NOTE:
+    1. QBmm modules will be added after qmodel_prep(), see check 1.
+    2. The self-attention forward() will still call torch.matmul as written in the original
+       Python code, i.e. if we check QLinear.num_called and QBmm.num_called, they will be 1 and
+       0, respectively, meaning QBmms were attached but not called.
+    3. By using the patch_torch_bmm() context manager, QBmm modules will be triggered and those
+       torch.matmul calls (usually 2 per attn module) will be redirected to QBmm's forward.
+
+    Args:
+        model_bert_eager (transformers.models.bert.modeling_bert.BertModel): BERT model + weights
+        input_bert (torch.FloatTensor): Tokenized input for BERT
+        config_int8 (dict): Recipe Config w/ int8 settings
+    """
+    delete_config()
+    config_int8["nbits_bmm1"] = 8
+    config_int8["nbits_bmm2"] = 8
+    qmodel_prep(model_bert_eager, input_bert, config_int8, use_dynamo=True)
+
+    # check 1: make sure QBmms are added, i.e. 72 QLinear + 24 QBmm
+    qmodule_error(model_bert_eager, 1, 96)
+
+    _, fms_qmodules = count_qmodules(model_bert_eager)
+    qbmms = []
+    other_qmodules = []
+    for n, m in fms_qmodules:
+        if "QBmm" in n:
+            qbmms.append(m)
+        else:
+            other_qmodules.append(m)
+
+    # check 2: a model call without our "patch" context manager will not reach QBmm.
+    # An auto check is in place, but it only logs a warning unless
+    # qcfg["force_stop_if_qbmm_auto_check_failed"] = True
+    with torch.no_grad():
+        model_bert_eager(**input_bert)
+    assert all(
+        m.num_module_called == 0 for m in qbmms
+    ), "Some QBmm was called when they shouldn't be."
+
+    # check 3: a model call with the context manager will reach QBmm
+    with torch.no_grad(), patch_torch_bmm(config_int8):
+        model_bert_eager(**input_bert)
+    assert all(
+        m.num_module_called == 1 for m in qbmms
+    ), "Some QBmm was not called properly."
+
+    assert all(
+        m.num_module_called == 2 for m in other_qmodules
+    ), "Modules other than QBmm were not called properly."

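Outside the test suite, the flow exercised by test_bert_dynamo_wi_qbmm is the intended usage pattern: prepare the model with the bmm bit-widths set, then wrap inference in patch_torch_bmm() so the attention matmuls are routed through QBmm. A rough sketch under stated assumptions: the tokenizer/model setup is illustrative, and qconfig_init() stands in for the recipe-based config_int8 used by the test.

import torch
from transformers import AutoModel, AutoTokenizer

from fms_mo import qmodel_prep
from fms_mo.utils.qconfig_utils import qconfig_init
from fms_mo.utils.utils import patch_torch_bmm

tok = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")
model = AutoModel.from_pretrained(
    "google-bert/bert-base-uncased", attn_implementation="eager"
)
inputs = tok("hello world", return_tensors="pt")

qcfg = qconfig_init()  # assumption: a default config; the test uses config_int8 instead
qcfg["nbits_bmm1"] = 8
qcfg["nbits_bmm2"] = 8
qmodel_prep(model, inputs, qcfg, use_dynamo=True)

with torch.no_grad(), patch_torch_bmm(qcfg):
    out = model(**inputs)  # torch.matmul calls in attention are redirected to QBmm here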