generated from amazon-archives/__template_MIT-0
-
Notifications
You must be signed in to change notification settings - Fork 19
Open
Labels
Description
Hello. I was trying to quantize Qwen-2.5-1.5B.
However, it seems that quantization for Qwen-2.5-1.5B is not supported in the latest version of neuronx-distributed (link).
I have attached the compilation script and the error log below.
Is there any workaround to quantize Qwen-2.5-1.5B?
Compile script
# setup config
neuron_config = NeuronConfig(
tp_degree=12,
batch_size=1,
max_context_length=1024,
seq_len=2048,
quantized=True,
quantized_checkpoints_path=quantized_model_path,
quantization_dtype="int8",
quantization_type="per_tensor_symmetric"
)
config = Qwen2InferenceConfig(
neuron_config,
load_config=load_pretrained_config(model_path)
)
# Quantize the model and save it to `quantized_checkpoints_path`.
NeuronQwen2ForCausalLM.save_quantized_state_dict(model_path, config)
model = NeuronQwen2ForCausalLM(model_path, config)
model.compile(traced_model_path)
# Test compiled model
model.load(traced_model_path)
Error log
Traceback (most recent call last):
File "/home/ubuntu/xpu/vllm_nxd/compile_quantized_qwen2.5.py", line 67, in <module>
model.load(traced_model_path)
File "/home/ubuntu/xpu/lib/neuronx-distributed-inference/src/neuronx_distributed_inference/models/application_base.py", line 298, in load
self.load_weights(
File "/home/ubuntu/xpu/lib/neuronx-distributed-inference/src/neuronx_distributed_inference/models/application_base.py", line 371, in load_weights
weights = self.get_builder().shard_checkpoint()
File "/home/ubuntu/xpu/lib/neuronx-distributed/src/neuronx_distributed/trace/model_builder.py", line 1688, in shard_checkpoint
preprocess_checkpoint(model, checkpoint)
File "/home/ubuntu/xpu/lib/neuronx-distributed/src/neuronx_distributed/trace/trace.py", line 634, in preprocess_checkpoint
invoke_preshard_hook(model, checkpoint, "")
File "/home/ubuntu/xpu/lib/neuronx-distributed/src/neuronx_distributed/trace/trace.py", line 723, in invoke_preshard_hook
invoke_preshard_hook(child, checkpoint, prefix + name + ".")
File "/home/ubuntu/xpu/lib/neuronx-distributed/src/neuronx_distributed/trace/trace.py", line 723, in invoke_preshard_hook
invoke_preshard_hook(child, checkpoint, prefix + name + ".")
File "/home/ubuntu/xpu/lib/neuronx-distributed/src/neuronx_distributed/trace/trace.py", line 723, in invoke_preshard_hook
invoke_preshard_hook(child, checkpoint, prefix + name + ".")
[Previous line repeated 1 more time]
File "/home/ubuntu/xpu/lib/neuronx-distributed/src/neuronx_distributed/trace/trace.py", line 716, in invoke_preshard_hook
module.preshard_hook(checkpoint, prefix + "weight")
File "/home/ubuntu/xpu/lib/neuronx-distributed-inference/src/neuronx_distributed_inference/modules/attention/gqa.py", line 1021, in preshard_hook
self.set_bias(
File "/home/ubuntu/xpu/lib/neuronx-distributed-inference/src/neuronx_distributed_inference/modules/attention/gqa.py", line 768, in set_bias
layer.set_bias_to_state_dict(
File "/home/ubuntu/xpu/lib/neuronx-distributed/src/neuronx_distributed/quantization/quantization_layers.py", line 271, in set_bias_to_state_dict
QuantizedParallelLinearLayerStateDictAdaptor.set_bias_to_state_dict(
File "/home/ubuntu/xpu/lib/neuronx-distributed/src/neuronx_distributed/quantization/quantization_layers.py", line 360, in set_bias_to_state_dict
raise NotImplementedError()
NotImplementedError