
Commit 357622d

add a unit test to cover QBmm attachment and verify all QBmm modules are reachable
Signed-off-by: cliu-us <[email protected]>
1 parent dd53de5

File tree

3 files changed: +52 -2 lines changed

tests/models/conftest.py

Lines changed: 13 additions & 0 deletions

@@ -1092,3 +1092,16 @@ def model_bert():
         transformers.models.bert.modeling_bert.BertModel: BERT model
     """
     return BertModel.from_pretrained("google-bert/bert-base-uncased", torchscript=True)
+
+
+@pytest.fixture(scope="function")
+def model_bert_eager():
+    """
+    Get a BERT model using the eager attention implementation
+
+    Returns:
+        transformers.models.bert.modeling_bert.BertModel: BERT model
+    """
+    return BertModel.from_pretrained(
+        "google-bert/bert-base-uncased", torchscript=True, attn_implementation="eager"
+    )
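
Why the new fixture pins attn_implementation="eager": with the default SDPA path, Hugging Face BERT routes attention through torch.nn.functional.scaled_dot_product_attention, a fused kernel that leaves no discrete torch.matmul/torch.bmm call sites for a bmm-patching context manager to intercept. A minimal sketch of the difference (editor's illustration, not part of the commit; the exact class names depend on your transformers version):

from transformers import BertModel

# Eager attention computes Q @ K^T and attn_probs @ V with explicit matmul
# calls, which is what QBmm patching relies on; SDPA fuses them into one op.
eager = BertModel.from_pretrained(
    "google-bert/bert-base-uncased", attn_implementation="eager"
)
sdpa = BertModel.from_pretrained(
    "google-bert/bert-base-uncased", attn_implementation="sdpa"
)
print(type(eager.encoder.layer[0].attention.self).__name__)  # e.g. BertSelfAttention
print(type(sdpa.encoder.layer[0].attention.self).__name__)   # e.g. BertSdpaSelfAttention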

tests/models/test_model_utils.py

Lines changed: 2 additions & 1 deletion

@@ -26,6 +26,7 @@
 import torch

 # Local
+from fms_mo.modules.bmm import QBmm
 from fms_mo.modules.conv import DetQConv2d, QConv2d, QConv2dPTQ, QConv2dPTQv2
 from fms_mo.modules.linear import QLinear
 from fms_mo.utils.qconfig_utils import serialize_config
@@ -99,7 +100,7 @@ def count_qmodules(model: torch.nn.Module):
     """
     torch_modules, fms_qmodules = [], []
     for n, m in model.named_modules():
-        if isinstance(m, (QConv2d, QLinear)):
+        if isinstance(m, (QConv2d, QLinear, QBmm)):
             fms_qmodules.append((n, m))
         elif isinstance(m, (Conv2d, Linear)):
             torch_modules.append((n, m))
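
With this change, count_qmodules classifies QBmm alongside QConv2d and QLinear when partitioning a model's modules. Note that the fms_mo classes are checked before the plain torch ones, which matters if they subclass Conv2d/Linear, as quantized drop-in replacements typically do. A usage sketch (editor's illustration, not part of the commit; assumes model has already been through qmodel_prep):

from tests.models.test_model_utils import count_qmodules

# Partition named modules into fms_mo quantized (QConv2d/QLinear/QBmm)
# and remaining plain torch (Conv2d/Linear) modules.
torch_modules, fms_qmodules = count_qmodules(model)
print(f"{len(fms_qmodules)} quantized vs. {len(torch_modules)} plain modules")
for name, module in fms_qmodules:
    print(name, type(module).__name__)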

tests/models/test_qmodelprep.py

Lines changed: 37 additions & 1 deletion

@@ -26,7 +26,8 @@
 # fms_mo imports
 from fms_mo import qmodel_prep
 from fms_mo.prep import has_quantized_module
-from tests.models.test_model_utils import delete_config, qmodule_error
+from fms_mo.utils.utils import patch_torch_bmm
+from tests.models.test_model_utils import count_qmodules, delete_config, qmodule_error

 ################
 # Qmodel tests #
@@ -257,3 +258,38 @@ def test_bert_dynamo(
     delete_config()
     qmodel_prep(model_bert, input_bert, config_int8, use_dynamo=True)
     qmodule_error(model_bert, 1, 72)
+
+
+def test_bert_dynamo_wi_qbmm(
+    model_bert_eager: transformers.models.bert.modeling_bert.BertModel,
+    input_bert: torch.FloatTensor,
+    config_int8: dict,
+):
+    """
+    Perform int8 quantization on BERT w/ Dynamo tracer and QBmm modules
+
+    Args:
+        model_bert_eager (transformers.models.bert.modeling_bert.BertModel): BERT model + weights
+        input_bert (torch.FloatTensor): Tokenized input for BERT
+        config_int8 (dict): Recipe Config w/ int8 settings
+    """
+    delete_config()
+    config_int8["nbits_bmm1"] = 8
+    config_int8["nbits_bmm2"] = 8
+    qmodel_prep(model_bert_eager, input_bert, config_int8, use_dynamo=True)
+
+    # check 1: make sure QBmm modules were added, i.e. 72 QLinear + 24 QBmm
+    qmodule_error(model_bert_eager, 1, 96)
+
+    # check 2: make sure the context manager can reach every QBmm
+    _, fms_qmodules = count_qmodules(model_bert_eager)
+    with torch.no_grad(), patch_torch_bmm(config_int8):
+        model_bert_eager(**input_bert)
+    qbmms = [m for n, m in fms_qmodules if "QBmm" in n]
+
+    assert all(
+        m.num_module_called == 1 for m in qbmms
+    ), "Some QBmm was not called properly."
+    assert all(
+        m.num_module_called == 1 for _, m in fms_qmodules
+    ), "Some module was not called properly."
