
Commit ca3f763

improve transformers tracing for last layers
Signed-off-by: cliu-us <[email protected]>
1 parent 136c12f commit ca3f763

File tree: 3 files changed, +41 −22 lines

fms_mo/dq.py

Lines changed: 11 additions & 8 deletions
@@ -105,6 +105,12 @@ def run_dq(model_args, data_args, opt_args, fms_mo_args):
         or not isinstance(model_args.torch_dtype, str)
         else getattr(torch, model_args.torch_dtype)
     )
+    # NOTE for models that cannot fit in 1 GPU, keep it on CPU and use block-wise calibration.
+    # or leverage HF's device_map="auto", BUT tracing will not work properly with "auto"
+    total_gpu_memory = 1e-5
+    if torch.cuda.is_available():
+        total_gpu_memory = torch.cuda.get_device_properties(0).total_memory / 1e9
+
     model = AutoModelForCausalLM.from_pretrained(
         model_args.model_name_or_path,
         from_tf=bool(".ckpt" in model_args.model_name_or_path),
@@ -113,8 +119,8 @@ def run_dq(model_args, data_args, opt_args, fms_mo_args):
         revision="main",
         use_auth_token=True if model_args.use_auth_token else None,
         torch_dtype=torch_dtype,
-        low_cpu_mem_usage=model_args.low_cpu_mem_usage,
-        device_map="auto" if model_args.low_cpu_mem_usage else None,
+        device_map=model_args.device_map,
+        low_cpu_mem_usage=bool(model_args.device_map),
     )
 
     embedding_size = model.get_input_embeddings().weight.shape[0]
@@ -125,11 +131,7 @@ def run_dq(model_args, data_args, opt_args, fms_mo_args):
     logger.info(f"Model is at {model.device} after intialization")
     logger.info(f"Tokenizer is {tokenizer}, block size is {block_size}")
     qcfg = qconfig_init(recipe="dq", args=fms_mo_args)
-    # for models that cannot fit in 1 GPU, keep it on CPU and use block-wise calibration.
-    # or leverage HF's device_map="auto"
-    total_gpu_memory = 1e-5
-    if torch.cuda.is_available():
-        total_gpu_memory = torch.cuda.get_device_properties(0).total_memory / 1e9
+
     model_size = model_size_Wb(model, unit="GB")
     gpu_mem_util_per = model_size / total_gpu_memory
 
@@ -145,7 +147,8 @@ def run_dq(model_args, data_args, opt_args, fms_mo_args):
         name in model_args.model_name_or_path for name in known_large_models
     ) or (gpu_mem_util_per > 0.7)
     dev = "cpu" if qcfg["large_model"] else "cuda"
-    model.to(dev)
+    if model_args.device_map is None:
+        model.to(dev)
 
     if hasattr(model.config, "model_type"):
         qcfg["model_type"] = model.config.model_type

fms_mo/fx/dynamo_utils.py

Lines changed: 25 additions & 10 deletions
@@ -1006,17 +1006,10 @@ def cus_backend_model_analyzer(
     if len(all_conv) > 0:
         skip_candidates += find_conv_on_shortcut_gm(gm_fx, lut_fx_mod_name_to_org)
 
-    # Check 2. first/last, see Note 2 and 3
+    # Check 2. first/last, see Note 2 and 3, NOTE that transformers are handled differently
     if qcfg["N_backend_called"] > 1:
         skip_candidates += []
-    elif is_transformers:
-        _, last_only = find_1st_last_gm(
-            gm_fx,
-            return_1st_last_sep=True,
-            lut_fx_mod_name_to_org=lut_fx_mod_name_to_org,
-        )
-        skip_candidates += last_only
-    else:
+    elif not is_transformers:
         # see Note 4
         skip_candidates += find_1st_last_gm(
             gm_fx, lut_fx_mod_name_to_org=lut_fx_mod_name_to_org
@@ -1082,6 +1075,7 @@ def cus_backend_model_analyzer(
     model_to_be_traced = model
     model_param_size = 999
 
+    is_transformers = issubclass(type(model), PreTrainedModel)
     if model_param_size > 1:
         # Standard
         import sys
@@ -1091,7 +1085,7 @@ def cus_backend_model_analyzer(
 
     cus_bknd = partial(
         cus_backend_model_analyzer,
-        is_transformers=issubclass(type(model), PreTrainedModel),
+        is_transformers=is_transformers,
         plotsvg=plotsvg,
     )
 
@@ -1104,6 +1098,27 @@ def cus_backend_model_analyzer(
     if "bmm_prep" not in qcfg:
         qcfg["bmm_prep"] = {"which2patch_contextmanager": None, "layers_with_bmm": {}}
 
+    if is_transformers:
+        # NOTE simplified method to determine 1st/last modules for transformers.
+        # will not work if model has multiple parallel heads at the end, e.g. obj det
+        def call_seq_hook(mod, *_args, **_kwargs):
+            qcfg["mod_call_seq"].append(lut_weight2modname[mod.weight])
+
+        h_hooks = []
+        qcfg["mod_call_seq"] = []
+        for n, m in model.named_modules():
+            if isinstance(m, (torch.nn.Linear, torch.nn.Conv2d)):
+                h_hooks.append(m.register_forward_hook(call_seq_hook))
+
+        with torch.no_grad():
+            model(**sample_inp)
+
+        for h in h_hooks:
+            h.remove()
+
+        # only add last layer
+        qcfg["qskip_layer_name"] += qcfg["mod_call_seq"][-1]
+
     with torch.no_grad():
         model_opt = torch.compile(
             model_to_be_traced,
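
The forward-hook trick added above can be exercised outside the analyzer. Below is a self-contained sketch on a made-up toy model: it registers hooks on every Linear/Conv2d, runs one forward pass, and treats the last module called as the candidate to exclude from quantization. The toy module and the module-to-name lookup are illustrative only; the commit itself maps weights to names via lut_weight2modname and stores the sequence in qcfg.

import torch

class ToyLM(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.proj_in = torch.nn.Linear(16, 32)
        self.ffn = torch.nn.Linear(32, 32)
        self.lm_head = torch.nn.Linear(32, 100)

    def forward(self, x):
        return self.lm_head(self.ffn(self.proj_in(x)))

model = ToyLM()
name_of = {m: n for n, m in model.named_modules()}  # module -> qualified name
call_seq, hooks = [], []

def call_seq_hook(mod, *_args, **_kwargs):
    # forward hooks fire in execution order, so call_seq ends with the last layer used
    call_seq.append(name_of[mod])

for _, m in model.named_modules():
    if isinstance(m, (torch.nn.Linear, torch.nn.Conv2d)):
        hooks.append(m.register_forward_hook(call_seq_hook))

with torch.no_grad():
    model(torch.randn(2, 16))

for h in hooks:
    h.remove()

print(call_seq[-1])  # "lm_head" -> skip candidate, matching the commit's intent

As the in-code NOTE warns, this simplification breaks down when several heads run in parallel at the end of the model, since only one of them can be last in the recorded call sequence.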

fms_mo/training_args.py

Lines changed: 5 additions & 4 deletions
@@ -56,11 +56,12 @@ class ModelArguments(TypeChecker):
 
     model_name_or_path: str = field(default="facebook/opt-125m")
     torch_dtype: str = field(default="bfloat16")
-    low_cpu_mem_usage: bool = field(
-        default=False,
+    device_map: Optional[str] = field(
+        default=None,
         metadata={
-            "help": "When set to True, leverage device_map='auto' and let HF to move modules"
-            "between cpu and cuda automatically during inference."
+            "help": "can be 'auto', 'balanced', 'balanced_low_0', 'sequential' or something like"
+            " {'encoder':'cuda:1', 'decoder': 'cuda:2'}.\n"
+            "HF will try to move modules between cpu and cuda automatically during inference."
         },
     )
     use_fast_tokenizer: bool = field(
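
A quick sanity check of how the new field parses from the command line, assuming HF's HfArgumentParser is used to build ModelArguments; the minimal dataclass below drops the TypeChecker base and the other fields, so it is a sketch rather than the repo's class.

from dataclasses import dataclass, field
from typing import Optional

from transformers import HfArgumentParser

@dataclass
class ModelArguments:
    model_name_or_path: str = field(default="facebook/opt-125m")
    device_map: Optional[str] = field(
        default=None,
        metadata={"help": "e.g. 'auto', 'balanced', 'balanced_low_0', 'sequential'"},
    )

parser = HfArgumentParser(ModelArguments)
(args,) = parser.parse_args_into_dataclasses(args=["--device_map", "auto"])
print(args.device_map)        # "auto"
print(bool(args.device_map))  # True, which is what run_dq now passes as low_cpu_mem_usage

Note that the string presets are the straightforward path from the CLI; a dict-style map like {'encoder': 'cuda:1', 'decoder': 'cuda:2'} from the help text would need extra parsing before reaching from_pretrained, since the field is declared as Optional[str].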
