@@ -66,14 +66,19 @@ def run_dq(model_args, data_args, opt_args, fms_mo_args):
         opt_args (fms_mo.training_args.OptArguments): Generic optimization arguments to be used
             during DQ
         fms_mo_args (fms_mo.training_args.FMSMOArguments): Parameters to use for DQ quantization
+
+    NOTE:
+        Use dynamo tracing instead of TorchScript by default. If TorchScript is needed, change
+        1) config_kwargs and 2) use_dynamo in qmodel_prep().
+
     """
     # for attention or kv-cache quantization, need to use eager attention
     attn_bits = [
         fms_mo_args.nbits_bmm1,
         fms_mo_args.nbits_bmm2,
         fms_mo_args.nbits_kvcache,
     ]
-    if any(attn_bits) != 32:
+    if any(x != 32 for x in attn_bits):
         attn_implementation = "eager"
     else:
         attn_implementation = None
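The condition being replaced here is a classic `any()` pitfall: `any(attn_bits)` collapses the list to a single bool before the comparison, so the intended question "is any bit width not 32?" was never actually asked. A minimal standalone repro (the values are illustrative only):

```python
# `any(attn_bits)` returns a bool, so comparing that bool to 32 never
# inspects the individual bit widths.
attn_bits = [32, 32, 32]  # full precision everywhere; eager attention not needed

buggy = any(attn_bits) != 32             # any(...) -> True, and True != 32 -> True (always eager)
fixed = any(x != 32 for x in attn_bits)  # no element differs from 32 -> False (correct)

print(buggy, fixed)  # True False
```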
@@ -120,10 +125,11 @@ def run_dq(model_args, data_args, opt_args, fms_mo_args):
     logger.info(f"Tokenizer is {tokenizer}, block size is {block_size}")
     qcfg = qconfig_init(recipe="dq", args=fms_mo_args)
     # for models that cannot fit in 1 GPU, keep it in CPU and use block-wise calibration.
-    total_gpu_memory = 0.0
+    total_gpu_memory = 1e-5  # tiny non-zero default avoids div-by-zero on CPU-only hosts
     if torch.cuda.is_available():
         total_gpu_memory = torch.cuda.get_device_properties(0).total_memory / 1e9
     model_size = model_size_Wb(model, unit="GB")
+    gpu_mem_util_per = model_size / total_gpu_memory  # fraction of GPU 0's memory the model needs
 
     known_large_models = [
         "Llama-2-70b",
@@ -135,7 +141,7 @@ def run_dq(model_args, data_args, opt_args, fms_mo_args):
     ]
     qcfg["large_model"] = any(
         name in model_args.model_name_or_path for name in known_large_models
-    ) or (model_size > 0.7 * total_gpu_memory)
+    ) or (gpu_mem_util_per > 0.7)
     dev = "cpu" if qcfg["large_model"] else "cuda:0"
 
     if hasattr(model.config, "model_type"):
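Taken together, the two hunks above replace an absolute byte comparison with a reusable utilization fraction. A hedged standalone sketch of the decision (`estimate_large_model` is a hypothetical helper, not fms-mo API; the 0.7 threshold and the 1e-5 fallback come from the diff):

```python
import torch

def estimate_large_model(model_size_gb: float, model_name: str,
                         known_large_models: list[str]) -> bool:
    """Hypothetical restatement of the logic above: a model is 'large' if its
    name matches a known-large entry or it would need >70% of GPU 0's memory."""
    total_gpu_memory = 1e-5  # GB; effectively zero, so CPU-only hosts always read as 'large'
    if torch.cuda.is_available():
        total_gpu_memory = torch.cuda.get_device_properties(0).total_memory / 1e9
    gpu_mem_util_per = model_size_gb / total_gpu_memory
    return any(name in model_name for name in known_large_models) or gpu_mem_util_per > 0.7

# e.g. a ~130 GB fp16 70B checkpoint on an 80 GB card: 130 / 80 = 1.625 > 0.7,
# so calibration stays on "cpu"; a 13 GB 7B model (0.16) runs on "cuda:0".
```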
@@ -185,6 +191,9 @@ def run_dq(model_args, data_args, opt_args, fms_mo_args):
     if qcfg["large_model"]:
         act_scales = get_act_scales_1gpu(model, dq_dataloader, qcfg)
     else:
+        if gpu_mem_util_per < 0.7:
+            model.to(dev)  # model fits on one GPU; move it there before calibration
+
         act_scales = get_act_scales(model, dq_dataloader, qcfg)
     scale_file = f"{act_scale_directory}/{qcfg['model'].replace('/', '-')}" + ".pt"
     torch.save(act_scales, scale_file)
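The saved filename flattens the Hugging Face repo id into a single path component. A quick standalone check of that expression (model id and output directory are illustrative):

```python
qcfg = {"model": "meta-llama/Llama-2-70b-hf"}  # illustrative HF repo id
act_scale_directory = "act_scales"             # hypothetical output directory

# same expression as in the diff: '/' in the repo id becomes '-'
scale_file = f"{act_scale_directory}/{qcfg['model'].replace('/', '-')}" + ".pt"
print(scale_file)  # act_scales/meta-llama-Llama-2-70b-hf.pt
```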