
Commit 16fc615

Merge pull request #47 from foundation-model-stack/qbmm_fix_amend
fix Qbmm tracing issue
2 parents 1c74779 + 6bb97c0 commit 16fc615

5 files changed: +152 -48 lines changed


.gitignore

Lines changed: 5 additions & 0 deletions
@@ -33,3 +33,8 @@ venv/
 # Generated by spelling check
 dictionary.dic
 
+# Files generated from running examples
+fms_mo.log
+data_train/
+data_test/
+act_scales/

fms_mo/dq.py

Lines changed: 12 additions & 3 deletions
@@ -66,14 +66,19 @@ def run_dq(model_args, data_args, opt_args, fms_mo_args):
         opt_args (fms_mo.training_args.OptArguments): Generic optimization arguments to be used
             during DQ
         fms_mo_args (fms_mo.training_args.FMSMOArguments): Parameters to use for DQ quantization
+
+    NOTE:
+        use dynamo tracing instead of torchscript by default. if torchscript is needed, change
+        1) config_kwarks and 2) use_dynamo in qmodel_prep()
+
     """
     # for attention or kv-cache quantization, need to use eager attention
     attn_bits = [
         fms_mo_args.nbits_bmm1,
         fms_mo_args.nbits_bmm2,
         fms_mo_args.nbits_kvcache,
     ]
-    if any(attn_bits) != 32:
+    if any(x != 32 for x in attn_bits):
         attn_implementation = "eager"
     else:
         attn_implementation = None
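
The replaced condition above was a latent bug: any(attn_bits) collapses the list to a single bool, and a bool never equals 32, so the old check was always true and eager attention was always selected. The new generator expression tests each bit-width individually. A minimal sketch with illustrative values (not part of the commit):

# illustrative values only
attn_bits = [32, 32, 32]                  # nbits_bmm1, nbits_bmm2, nbits_kvcache all at full precision
print(any(attn_bits) != 32)               # True  -- old check: a bool never equals 32, so always True
print(any(x != 32 for x in attn_bits))    # False -- new check: nothing is actually quantized

attn_bits = [32, 8, 32]                   # quantize one of the attention bmm ops
print(any(x != 32 for x in attn_bits))    # True  -- eager attention is required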
@@ -120,10 +125,11 @@ def run_dq(model_args, data_args, opt_args, fms_mo_args):
     logger.info(f"Tokenizer is {tokenizer}, block size is {block_size}")
     qcfg = qconfig_init(recipe="dq", args=fms_mo_args)
     # for models that cannot fit in 1 GPU, keep it in CPU and use block-wise calibration.
-    total_gpu_memory = 0.0
+    total_gpu_memory = 1e-5
     if torch.cuda.is_available():
         total_gpu_memory = torch.cuda.get_device_properties(0).total_memory / 1e9
     model_size = model_size_Wb(model, unit="GB")
+    gpu_mem_util_per = model_size / total_gpu_memory

     known_large_models = [
         "Llama-2-70b",
@@ -135,7 +141,7 @@ def run_dq(model_args, data_args, opt_args, fms_mo_args):
     ]
     qcfg["large_model"] = any(
         name in model_args.model_name_or_path for name in known_large_models
-    ) or (model_size > 0.7 * total_gpu_memory)
+    ) or (gpu_mem_util_per > 0.7)
     dev = "cpu" if qcfg["large_model"] else "cuda:0"

     if hasattr(model.config, "model_type"):
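
The two hunks above work together: total_gpu_memory now starts at 1e-5 instead of 0.0, so the new ratio gpu_mem_util_per = model_size / total_gpu_memory cannot raise ZeroDivisionError on a CPU-only machine, and the "large model" decision compares that ratio against 0.7 rather than comparing absolute sizes. A short sketch with made-up numbers (assumptions, not taken from the commit):

total_gpu_memory = 1e-5                              # sentinel when no CUDA device is present
model_size = 14.0                                    # GB of weights, e.g. roughly a 7B model in fp16
gpu_mem_util_per = model_size / total_gpu_memory     # enormous ratio on a CPU-only machine
print(gpu_mem_util_per > 0.7)                        # True -> treat as a large model, keep it on CPU

total_gpu_memory = 80.0                              # e.g. a single 80 GB GPU is available
gpu_mem_util_per = model_size / total_gpu_memory     # 0.175
print(gpu_mem_util_per > 0.7)                        # False -> the model fits comfortably on cuda:0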
@@ -185,6 +191,9 @@ def run_dq(model_args, data_args, opt_args, fms_mo_args):
     if qcfg["large_model"]:
         act_scales = get_act_scales_1gpu(model, dq_dataloader, qcfg)
     else:
+        if gpu_mem_util_per < 0.7:
+            model.to(dev)
+
         act_scales = get_act_scales(model, dq_dataloader, qcfg)
     scale_file = f"{act_scale_directory}/{qcfg['model'].replace('/', '-')}" + ".pt"
     torch.save(act_scales, scale_file)
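
In the final hunk the model is moved to the GPU before calibration only when it occupies less than 70% of device memory; the resulting activation scales are then saved under act_scale_directory, presumably the act_scales/ directory newly added to .gitignore. A hedged example of reading such a file back later (the model id is a hypothetical placeholder, and the layout of the saved object is not specified by this commit):

import torch

qcfg = {"model": "meta-llama/Llama-2-7b-hf"}   # hypothetical model id
act_scale_directory = "act_scales"             # assumed to match the new .gitignore entry

# same path construction as in the diff above
scale_file = f"{act_scale_directory}/{qcfg['model'].replace('/', '-')}" + ".pt"
act_scales = torch.load(scale_file)            # whatever object torch.save() stored above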
