Commit 78214ae
Merge pull request #165 from bayo-ibm/eval_1gpu
fix: enabling block-by-block evaluation for granite-3.x-models
2 parents: 7777b49 + 3d5b342

File tree

4 files changed: +18, -5 lines

fms_mo/quant/ptq.py

Lines changed: 9 additions & 1 deletion

@@ -2140,14 +2140,22 @@ def get_blocks(model, model_type=None):
         None,
         "lm_head",
     ),
-    "granite": (
+    "granite_gptbigcode": (
         "transformer.h",
         "transformer.wte",
         "transformer.wpe",
         None,
         "transformer.ln_f",
         "lm_head",
     ),
+    "granite": (
+        "model.layers",
+        "model.embed_tokens",
+        "model.rotary_emb",
+        None,
+        "model.norm",
+        "lm_head",
+    ),
     "llama": (
         "model.layers",
         "model.embed_tokens",

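For context: each tuple in the get_blocks lookup table above names, by dotted attribute path, a model's decoder-block container and the modules around it (token embeddings, positional or rotary embeddings, final norm, LM head). Resolving those paths is what lets the evaluator run the model one block at a time. A minimal sketch of the path resolution, assuming only standard attribute access; the helper name resolve_module is hypothetical, not part of this repo:

from functools import reduce

def resolve_module(model, dotted_path):
    # Walk a dotted path such as "model.embed_tokens" down from the
    # top-level module, one getattr per segment.
    return reduce(getattr, dotted_path.split("."), model)

# With the new "granite" entry, the decoder blocks would be fetched as:
# blocks = resolve_module(model, "model.layers")
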
fms_mo/run_quant.py

Lines changed: 0 additions & 1 deletion

@@ -155,7 +155,6 @@ def run_gptq(model_args, data_args, opt_args, gptq_args):
         v2_memory_device="cpu",
     )
 
-
     # Add custom model_type mapping to gptqmodel LUT so GPTQModel can recognize them.
     for mtype, cls in custom_gptq_classes.items():
         if mtype in MODEL_MAP:
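
The comment in this hunk describes registering custom per-architecture classes with GPTQModel's model-type lookup table. One plausible shape of that loop, assuming MODEL_MAP is a plain dict keyed by model_type strings; the body of the if branch is not shown in this diff, so the skip-if-already-known behavior below is an assumption:

# Assumed: MODEL_MAP maps a model_type string to its quantizer class.
for mtype, cls in custom_gptq_classes.items():
    if mtype in MODEL_MAP:
        # Assumption: GPTQModel already knows this architecture; keep it.
        continue
    MODEL_MAP[mtype] = cls  # register the custom class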

fms_mo/training_args.py

Lines changed: 2 additions & 3 deletions

@@ -139,8 +139,8 @@ class OptArguments(TypeChecker):
 
     quant_method: str = field(
         metadata={
-            "choices": ["gptq", "gptqv2", "fp8", "dq"],
-            "help": "Quantization technique"
+            "choices": ["gptq", "gptqv2", "fp8", "dq"],
+            "help": "Quantization technique",
         }
     )
     output_dir: str = field(
@@ -229,7 +229,6 @@ class GPTQArguments(TypeChecker):
     cache_examples_on_gpu: bool = True
 
 
-
 @dataclass
 class FP8Arguments(TypeChecker):
     """Dataclass for FP8 related arguments that will be used by llm-compressor."""

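The substantive edit above is a trailing comma in the metadata dict, but the field illustrates the choices/help metadata convention consumed by HuggingFace-style argument parsing. A minimal sketch, assuming transformers.HfArgumentParser is the consumer (an assumption; fms-mo may wire these dataclasses up differently):

from dataclasses import dataclass, field

from transformers import HfArgumentParser

@dataclass
class OptArgs:
    # "choices" constrains accepted CLI values; "help" becomes usage text.
    quant_method: str = field(
        metadata={
            "choices": ["gptq", "gptqv2", "fp8", "dq"],
            "help": "Quantization technique",
        }
    )

parser = HfArgumentParser(OptArgs)
(opt_args,) = parser.parse_args_into_dataclasses(["--quant_method", "gptq"])
print(opt_args.quant_method)  # -> gptq
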
fms_mo/utils/eval_utils.py

Lines changed: 7 additions & 0 deletions

@@ -98,6 +98,10 @@ def eval_llm_1GPU(qcfg, model, test_dataset, pre_cache_func=None, **kwargs): #
         logger.info("All blocks are computed for evaluation")
 
     nlls = []
+
+    # Required by Granite-3.X (and potentially other models) to scale their logits
+    logits_scaling = getattr(model.config, "logits_scaling", 1)
+
     # for i, data_mb in enumerate(dloader): #if using dloader.
     for i in tqdm(range(qcfg["n_samples"]), desc="Final Evaluating..."):
         hidden_states = qcfg["cached_input"][i].to(dev)
@@ -107,6 +111,9 @@ def eval_llm_1GPU(qcfg, model, test_dataset, pre_cache_func=None, **kwargs): #
         lm_head.to(dev)
         lm_logits = lm_head(hidden_states)
 
+        # Scaling of the lm_head outputs to obtain the correct logits
+        lm_logits /= logits_scaling
+
         # Shift so that tokens < n predict n
         shift_logits = lm_logits[:, :-1, :].contiguous().float()
         shift_labels = test_dataset.input_ids[:, (i * seq_len) : ((i + 1) * seq_len)][
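
Why the new lines matter: Granite-3.x checkpoints store a logits_scaling factor in their config, and the model divides the lm_head output by it during a normal forward pass. Since this block-by-block evaluator applies lm_head manually, it must repeat that division or the cross-entropy, and hence the perplexity, comes out wrong; getattr with a default of 1 leaves other models unaffected. A self-contained sketch of the surrounding perplexity computation under those assumptions (the function name and argument layout are illustrative, not the repo's API):

import torch
import torch.nn as nn

def perplexity_1gpu(lm_head, hidden_list, labels_list, seq_len, logits_scaling=1.0):
    # hidden_list[i]: final hidden states for sample i, shape (1, seq_len, d).
    # labels_list[i]: the matching input_ids slice, shape (1, seq_len).
    loss_fct = nn.CrossEntropyLoss()
    nlls = []
    for hidden_states, labels in zip(hidden_list, labels_list):
        # Undo Granite's logits scaling; a factor of 1.0 is a no-op.
        lm_logits = lm_head(hidden_states) / logits_scaling
        # Shift so that tokens < n predict token n.
        shift_logits = lm_logits[:, :-1, :].contiguous().float()
        shift_labels = labels[:, 1:].contiguous()
        loss = loss_fct(
            shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)
        )
        nlls.append(loss * seq_len)
    return torch.exp(torch.stack(nlls).sum() / (len(nlls) * seq_len))

This follows the standard GPTQ-style perplexity recipe: each sample's mean NLL is re-weighted by sequence length, summed, and normalized by total token count before exponentiating.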
