
Commit 16fc615

Merge pull request #47 from foundation-model-stack/qbmm_fix_amend
fix Qbmm tracing issue
2 parents 1c74779 + 6bb97c0 commit 16fc615

5 files changed: +152 -48 lines changed


.gitignore

Lines changed: 5 additions & 0 deletions
@@ -33,3 +33,8 @@ venv/
 # Generated by spelling check
 dictionary.dic
 
+# Files generated from running examples
+fms_mo.log
+data_train/
+data_test/
+act_scales/

fms_mo/dq.py

Lines changed: 12 additions & 3 deletions
@@ -66,14 +66,19 @@ def run_dq(model_args, data_args, opt_args, fms_mo_args):
         opt_args (fms_mo.training_args.OptArguments): Generic optimization arguments to be used
             during DQ
         fms_mo_args (fms_mo.training_args.FMSMOArguments): Parameters to use for DQ quantization
+
+    NOTE:
+        use dynamo tracing instead of torchscript by default. if torchscript is needed, change
+        1) config_kwarks and 2) use_dynamo in qmodel_prep()
+
     """
     # for attention or kv-cache quantization, need to use eager attention
     attn_bits = [
         fms_mo_args.nbits_bmm1,
         fms_mo_args.nbits_bmm2,
         fms_mo_args.nbits_kvcache,
     ]
-    if any(attn_bits) != 32:
+    if any(x != 32 for x in attn_bits):
         attn_implementation = "eager"
     else:
         attn_implementation = None
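
The replaced condition above was a latent bug: any(attn_bits) collapses the list to a single bool, and a bool never equals 32, so the old check was always true and eager attention was always selected. The new generator expression tests each bit-width individually. A minimal sketch with illustrative values (not part of the commit):

# illustrative values only
attn_bits = [32, 32, 32]                  # nbits_bmm1, nbits_bmm2, nbits_kvcache all at full precision
print(any(attn_bits) != 32)               # True  -- old check: a bool never equals 32, so always True
print(any(x != 32 for x in attn_bits))    # False -- new check: nothing is actually quantized

attn_bits = [32, 8, 32]                   # quantize one of the attention bmm ops
print(any(x != 32 for x in attn_bits))    # True  -- eager attention is required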
@@ -120,10 +125,11 @@ def run_dq(model_args, data_args, opt_args, fms_mo_args):
     logger.info(f"Tokenizer is {tokenizer}, block size is {block_size}")
     qcfg = qconfig_init(recipe="dq", args=fms_mo_args)
     # for models that cannot fit in 1 GPU, keep it in CPU and use block-wise calibration.
-    total_gpu_memory = 0.0
+    total_gpu_memory = 1e-5
     if torch.cuda.is_available():
         total_gpu_memory = torch.cuda.get_device_properties(0).total_memory / 1e9
     model_size = model_size_Wb(model, unit="GB")
+    gpu_mem_util_per = model_size / total_gpu_memory

     known_large_models = [
         "Llama-2-70b",
@@ -135,7 +141,7 @@ def run_dq(model_args, data_args, opt_args, fms_mo_args):
     ]
     qcfg["large_model"] = any(
         name in model_args.model_name_or_path for name in known_large_models
-    ) or (model_size > 0.7 * total_gpu_memory)
+    ) or (gpu_mem_util_per > 0.7)
     dev = "cpu" if qcfg["large_model"] else "cuda:0"

     if hasattr(model.config, "model_type"):
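
The two hunks above work together: total_gpu_memory now starts at 1e-5 instead of 0.0, so the new ratio gpu_mem_util_per = model_size / total_gpu_memory cannot raise ZeroDivisionError on a CPU-only machine, and the "large model" decision compares that ratio against 0.7 rather than comparing absolute sizes. A short sketch with made-up numbers (assumptions, not taken from the commit):

total_gpu_memory = 1e-5                              # sentinel when no CUDA device is present
model_size = 14.0                                    # GB of weights, e.g. roughly a 7B model in fp16
gpu_mem_util_per = model_size / total_gpu_memory     # enormous ratio on a CPU-only machine
print(gpu_mem_util_per > 0.7)                        # True -> treat as a large model, keep it on CPU

total_gpu_memory = 80.0                              # e.g. a single 80 GB GPU is available
gpu_mem_util_per = model_size / total_gpu_memory     # 0.175
print(gpu_mem_util_per > 0.7)                        # False -> the model fits comfortably on cuda:0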
@@ -185,6 +191,9 @@ def run_dq(model_args, data_args, opt_args, fms_mo_args):
     if qcfg["large_model"]:
         act_scales = get_act_scales_1gpu(model, dq_dataloader, qcfg)
     else:
+        if gpu_mem_util_per < 0.7:
+            model.to(dev)
+
         act_scales = get_act_scales(model, dq_dataloader, qcfg)
     scale_file = f"{act_scale_directory}/{qcfg['model'].replace('/', '-')}" + ".pt"
     torch.save(act_scales, scale_file)
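
In the final hunk the model is moved to the GPU before calibration only when it occupies less than 70% of device memory; the resulting activation scales are then saved under act_scale_directory, presumably the act_scales/ directory newly added to .gitignore. A hedged example of reading such a file back later (the model id is a hypothetical placeholder, and the layout of the saved object is not specified by this commit):

import torch

qcfg = {"model": "meta-llama/Llama-2-7b-hf"}   # hypothetical model id
act_scale_directory = "act_scales"             # assumed to match the new .gitignore entry

# same path construction as in the diff above
scale_file = f"{act_scale_directory}/{qcfg['model'].replace('/', '-')}" + ".pt"
act_scales = torch.load(scale_file)            # whatever object torch.save() stored above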
