
Commit dd53de5

fix QBmm detection and default behavior
Signed-off-by: cliu-us <[email protected]>
1 parent a0c2aae commit dd53de5

File tree: 3 files changed, +60 -18 lines changed
  .gitignore
  fms_mo/fx/dynamo_utils.py
  fms_mo/utils/utils.py

.gitignore (1 addition, 0 deletions)

@@ -45,3 +45,4 @@ fms_mo.log
 data*_train/
 data*_test/
 act_scales/
+examples/

fms_mo/fx/dynamo_utils.py (35 additions, 6 deletions)

@@ -32,6 +32,14 @@
 
 logger = logging.getLogger(__name__)
 
+# From PyTorch 2.5+, graphModule received in dynamo custom backend will be Aten IR instead of FX IR,
+# i.e. no "call_module" nodes, all parameter tensors become "placeholder" nodes, and etc...
+# This following flag will make dynamo behaves like PyTorch 2.4. Only use it when model_analyzer()
+# really stop working and hard to recover.
+# Ref: https://pytorch.org/tutorials/recipes/regional_compilation.html
+
+# torch._dynamo.config.inline_inbuilt_nn_modules = False
+
 
 def run_fwd_once(model, sample_inp):
     """Convenient function to run model once using correct input unpack."""
@@ -836,14 +844,16 @@ def find_and_prep_bmm_gm(gm, lut_fx_mod_name_to_org: Optional[Dict[str, str]] =
         return_dict["which2patch_contextmanager"] = "torch.matmul"
         LUT2sort = all_matmuls
     else:
-        warn_msg = None
         if Nbmm_found > 0 and Nmatmul_found > 0:
-            warn_msg = "Both bmm and matmul are found. Not sure which to patch."
-        elif Nbmm_found == 0 and Nmatmul_found == 0 and len(all_sdpas) > 0:
-            warn_msg = "No bmm and matmul are found. Likely SDPA is enabled."
+            raise RuntimeError(
+                "Both bmm and matmul are found. Not sure which to patch."
+            )
+        if Nbmm_found == 0 and Nmatmul_found == 0 and len(all_sdpas) > 0:
+            logger.warning(
+                "No bmm and matmul are found. Likely SDPA is enabled. "
+                "Will patch nothing!"
+            )
 
-        if warn_msg:
-            logger.warning(f"{warn_msg} Will patch nothing.")
         return return_dict
 
     LUTmodname2linenum = {}  # see Note 4
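
To make the new behavior concrete, here is a self-contained sketch of the same branching on a toy FX graph. This is not the fms_mo implementation (which also inspects SDPA nodes); the toy module and counting are illustrative only.

import logging
import torch
import torch.fx

logger = logging.getLogger(__name__)

class ToyAttn(torch.nn.Module):  # toy module for illustration
    def forward(self, q, k):
        return torch.matmul(q, k.transpose(-1, -2))

gm = torch.fx.symbolic_trace(ToyAttn())
Nbmm_found = sum(n.op == "call_function" and n.target is torch.bmm for n in gm.graph.nodes)
Nmatmul_found = sum(n.op == "call_function" and n.target is torch.matmul for n in gm.graph.nodes)

if Nbmm_found > 0 and Nmatmul_found > 0:
    # ambiguous graphs now fail fast instead of patching nothing behind a warning
    raise RuntimeError("Both bmm and matmul are found. Not sure which to patch.")
if Nbmm_found == 0 and Nmatmul_found == 0:
    logger.warning("No bmm and matmul are found. Likely SDPA is enabled. Will patch nothing!")
print(Nbmm_found, Nmatmul_found)  # 0 1 for this toy module
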
@@ -1085,6 +1095,25 @@ def cus_backend_model_analyzer(
                 "which2patch_contextmanager"
             ]
             qcfg["bmm_prep"]["layers_with_bmm"].update(temp_dict["layers_with_bmm"])
+            # make sure there are ONLY 2 bmm per layer (self_attention). some models may use
+            # additional bmm/matmuls. Raise warning if that's the case.
+            num_layers = len(temp_dict["layers_with_bmm"])
+            num_bmms = 0
+            seen_line_num = []
+            for line_nums in temp_dict["layers_with_bmm"].values():
+                num_bmms += len(line_nums)
+                for line_num in line_nums:
+                    if line_num not in seen_line_num:
+                        seen_line_num.append(line_num)
+            qcfg["bmm_prep"]["bmm_only_in_self_attn"] = True
+            if num_bmms != num_layers * 2 or len(seen_line_num) != 2:
+                qcfg["bmm_prep"]["bmm_only_in_self_attn"] = False
+                logger.warning(
+                    "This model uses additional matmul/bmm other than those in self-attention. "
+                    "If you plan to quantize self-attention, please note that the additional bmms "
+                    "may also be quantized!"
+                    f"{temp_dict['layers_with_bmm']}\n"
+                )
 
     # Check 7: QKV
     temp_dict = find_qkvsync_candidates_gm(
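
As a quick illustration of the check added above: with exactly two matmul call sites repeated across every decoder layer the flag stays True, and any extra call site flips it to False. The layer names and line numbers below are made up.

# Toy stand-in for temp_dict["layers_with_bmm"]: module name -> matmul call-site line numbers
layers_with_bmm = {
    "model.layers.0.self_attn": [241, 262],
    "model.layers.1.self_attn": [241, 262],
}

num_layers = len(layers_with_bmm)
num_bmms = sum(len(line_nums) for line_nums in layers_with_bmm.values())
seen_line_num = {ln for line_nums in layers_with_bmm.values() for ln in line_nums}

# 2 bmms per layer and only 2 distinct call sites -> bmms live only in self-attention
bmm_only_in_self_attn = num_bmms == num_layers * 2 and len(seen_line_num) == 2
print(bmm_only_in_self_attn)  # True for this toy dict
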

fms_mo/utils/utils.py (24 additions, 12 deletions)

@@ -71,7 +71,7 @@ def move_to(obj, device):
     return obj
 
 
-def mockbmm(mat1, mat2):
+def mockbmm(mat1, mat2, default_to_torch=False):
     """
     This function is used to mock the behavior of the bmm function in PyTorch.
     It is used to work around the fact that the bmm function in PyTorch is not
@@ -86,20 +86,23 @@ def mockbmm(mat1, mat2):
     """
     cf = sys._getframe()
     qbmm_mod = None
+    qbmm_lineno = cf.f_back.f_lineno
     while cf.f_back and qbmm_mod is None:
         # First frame is QBmm's forward itself, can start searching from previous stack
         cf = cf.f_back
-        if "forward" in cf.f_code.co_name or "_attn" in cf.f_code.co_name:
+        if (
+            "forward" in cf.f_code.co_name or "_attn" in cf.f_code.co_name
+        ) and "self" in cf.f_locals:
             mod_calling_bmm_function = cf.f_locals["self"]
             # If not found -> default to torch.matmul
-            qbmm_mod = getattr(
-                mod_calling_bmm_function, "QBmm" + str(cf.f_lineno), torch.matmul
-            )
+            qbmm_mod = getattr(mod_calling_bmm_function, f"QBmm{qbmm_lineno}", None)
     del cf
+    if qbmm_mod is None and default_to_torch:
+        qbmm_mod = torch.matmul
     return qbmm_mod(mat1, mat2)
 
 
-def mockmatmul(mat1, mat2):
+def mockmatmul(mat1, mat2, default_to_torch=False):
     """
     Patches torch.matmul() with QBmm( torch.bmm() )
 
@@ -109,31 +112,37 @@ def mockmatmul(mat1, mat2):
 
     Returns:
         torch.Tensor: The result of the mock matrix multiplication.
+    NOTE:
+    1. First frame is mockmatmul itself. One frame back (cf.f_back) is where torch.matmul
+        happened, whose line number is the one used for QBmm<xxx>
+    2. QBmm module may not be attached to the immediate frame where torch.matmul happened. Need
+        to trace back and find the frame with both "forward" in name and "self" in locals, i.e.
+        a class (nn.module) has a function named "forward" something
+    3. Keep default_to_torch=False unless really needed, otherwise if something went wrong with
+        QBmm detection, it could go to default silently, which would be very difficult to debug.
     """
     cf = sys._getframe()
     qbmm_mod = None
+    qbmm_lineno = cf.f_back.f_lineno
     while cf.f_back and qbmm_mod is None:
-        # First frame is QBmm's forward itself, can start searching from previous stack
         cf = cf.f_back
         if (
             "forward" in cf.f_code.co_name or "_attn" in cf.f_code.co_name
         ) and "self" in cf.f_locals:
             mod_calling_bmm_function = cf.f_locals["self"]
             # If not found -> default to torch.bmm
-            qbmm_mod = getattr(
-                mod_calling_bmm_function, "QBmm" + str(cf.f_lineno), torch.bmm
-            )
+            qbmm_mod = getattr(mod_calling_bmm_function, f"QBmm{qbmm_lineno}", None)
     del cf
 
     # Didn't find the corresponding QBmm, default the call to torch.bmm
-    if qbmm_mod == torch.bmm:
+    if qbmm_mod is None and default_to_torch:
         org_batch_header = mat1.shape[:2]
         # Need to double check m1/m2 are 3d, otherwise reshape
         if len(mat1.shape) > 3:
             mat1 = mat1.reshape([-1, mat1.shape[-2], mat1.shape[-1]])
         if len(mat2.shape) > 3:
             mat2 = mat2.reshape([-1, mat2.shape[-2], mat2.shape[-1]])
-        output = qbmm_mod(mat1, mat2)
+        output = torch.bmm(mat1, mat2)
         output = output.reshape([*org_batch_header, *output.shape[1:]])
         return output
     return qbmm_mod(mat1, mat2)
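
The lookup above is easiest to see end to end. Below is a hedged, self-contained sketch (not the fms_mo code): a toy module gets a QBmm<lineno> attribute named after the torch.matmul call-site line, torch.matmul is temporarily swapped for a simplified mock, and the frame walk resolves the attribute. The class name, the attribute value (plain torch.bmm standing in for a real QBmm module), and the manual monkey-patching are all illustrative assumptions.

import inspect
import sys

import torch

def mockmatmul_sketch(mat1, mat2, default_to_torch=False):
    """Simplified stand-in for mockmatmul above (sketch only)."""
    cf = sys._getframe()
    qbmm_lineno = cf.f_back.f_lineno  # line of the original torch.matmul call site
    qbmm_mod = None
    while cf.f_back and qbmm_mod is None:
        cf = cf.f_back
        if (
            "forward" in cf.f_code.co_name or "_attn" in cf.f_code.co_name
        ) and "self" in cf.f_locals:
            qbmm_mod = getattr(cf.f_locals["self"], f"QBmm{qbmm_lineno}", None)
    del cf
    if qbmm_mod is None and default_to_torch:
        qbmm_mod = torch.bmm
    return qbmm_mod(mat1, mat2)

class TinyAttn(torch.nn.Module):  # toy attention-like module; 3D inputs so bmm is valid
    def forward(self, q, k):
        return torch.matmul(q, k)  # this call site's line number names the QBmm attribute

attn = TinyAttn()
src, start = inspect.getsourcelines(TinyAttn.forward)
call_line = start + next(i for i, s in enumerate(src) if "torch.matmul(q, k)" in s)
setattr(attn, f"QBmm{call_line}", torch.bmm)  # a real QBmm module would be attached here

orig_matmul = torch.matmul
torch.matmul = mockmatmul_sketch  # roughly what the patch_torch_bmm context manager does
try:
    out = attn(torch.randn(2, 4, 8), torch.randn(2, 8, 4))
finally:
    torch.matmul = orig_matmul
print(out.shape)  # torch.Size([2, 4, 4])

With default_to_torch left at False, a failed QBmm lookup makes qbmm_mod stay None and the final call raise immediately, which is exactly the loud-failure behavior the NOTE above argues for.
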
@@ -149,6 +158,9 @@ def patch_torch_bmm(qcfg):
     if qcfg is not None:
         # could be 'torch.bmm', 'torch.matmul', or None
         ops_to_patch = qcfg.get("which2patch_contextmanager", None)
+        # if qcfg["bmm_prep"]["bmm_only_in_self_attn"] is False, may need to enable default_to_torch
+        # in mock functions, e.g. partial(mockmatmul, default_to_torch=True)
+        # This is in case a model uses extra matmuls, and QBmmXXX is not found or attached properly.
         new_target = (
             mockbmm
             if ops_to_patch == "torch.bmm"
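
Following the comment above, a hedged sketch of how that fallback could be opted into with functools.partial. The toy qcfg dict and the choice to key the fallback off bmm_only_in_self_attn are assumptions for illustration, not the shipped patch_torch_bmm behavior.

from functools import partial

from fms_mo.utils.utils import mockbmm, mockmatmul  # module shown in this diff

# Toy qcfg for illustration; real values are filled in by cus_backend_model_analyzer.
qcfg = {
    "which2patch_contextmanager": "torch.matmul",
    "bmm_prep": {"bmm_only_in_self_attn": False},
}

# Enable the torch fallback only when extra matmul/bmm call sites were detected,
# keeping the stricter default (fail loudly) for the common self-attention-only case.
allow_fallback = not qcfg["bmm_prep"]["bmm_only_in_self_attn"]
ops_to_patch = qcfg.get("which2patch_contextmanager", None)
new_target = (
    partial(mockbmm, default_to_torch=allow_fallback)
    if ops_to_patch == "torch.bmm"
    else partial(mockmatmul, default_to_torch=allow_fallback)
)
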
