1515"""
1616Post-Training Quantization (PTQ) functions
1717
18- Class StraightThrough, function _fold_bn, fold_bn_into_conv, reset_bn, and
18+ Class StraightThrough, function _fold_bn, fold_bn_into_conv, reset_bn, and
1919search_fold_and_remove_bn are modified from QDROP repo https://github.com/wimh966/QDrop
2020
2121
2222"""
2323
2424# Standard
2525from functools import partial
26+ from typing import Optional
2627import logging
2728import math
2829import random
@@ -2383,14 +2384,26 @@ def input_stats_hook(m, x, _y, name, act_scales):


@torch.no_grad()
- def get_act_scales(model, dloader, qcfg):
-     """
-     To get max() of activations for linear layers on one device.
-     Model size will be limited by memory (GPU) or speed (cpu)
+ def get_act_scales(
+     model,
+     dloader,
+     qcfg: dict,
+     device: Optional[str | torch.device] = None,
+ ):
+     """Compute smoothquant activation scales for quantized linear layers.
+     Model and examples are moved to the selected device, if provided.
    """

    model.eval()
-     model.cuda()
+
+     if device is None:
+         device = next(model.parameters()).device
+     else:
+         logger.info(
+             f"Moving model to {device} to compute smoothquant activation scales"
+         )
+         model.to(device)
+
    dev = next(model.parameters()).device
    act_scales = {}
    qcfg["sample_id"] = 0
@@ -2408,7 +2421,6 @@ def get_act_scales(model, dloader, qcfg):

    for data_mb, _ in zip(pbar, range(n_samples)):
        qcfg["sample_id"] += 1
-         # logger.info("Now for sample: ", qcfg["sample_id"])
        data_mb = move_to(data_mb, dev)
        if (
            qcfg["nbits_bmm1"] < 32
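
Note on the change above: the hard-coded model.cuda() is replaced by an opt-in device move, so CPU-only or multi-GPU hosts can run calibration without an implicit jump to the default GPU. Below is a minimal, self-contained sketch of that idiom; the helper name resolve_device and the toy model are illustrative assumptions, not code from this repo.

import torch


def resolve_device(model: torch.nn.Module, device=None) -> torch.device:
    # Same pattern as the diff: with no device given, keep the model where it
    # already is; otherwise move it to the requested device before calibration.
    if device is None:
        device = next(model.parameters()).device
    else:
        model.to(device)
    return next(model.parameters()).device


toy = torch.nn.Linear(4, 4)
print(resolve_device(toy))         # falls back to the model's current device (cpu here)
print(resolve_device(toy, "cpu"))  # explicit request; a no-op for this toy model
# resolve_device(toy, "cuda:0")    # would move the model when a GPU is present

Callers of get_act_scales that previously relied on the implicit model.cuda() would now pass device="cuda" (or a torch.device) explicitly, while omitting the argument keeps the model on whatever device it already occupies.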