2 files changed: +18 −1 lines changed

tests/models/quantization

@@ -47,6 +47,12 @@ def gguf_model(self):
     gguf_filename="qwen2.5-1.5b-instruct-q6_k.gguf",
 )
 
+QWEN3_CONFIG = GGUFTestConfig(
+    original_model="Qwen/Qwen3-0.6B",
+    gguf_repo="unsloth/Qwen3-0.6B-GGUF",
+    gguf_filename="Qwen3-0.6B-BF16.gguf",
+)
+
 PHI3_CONFIG = GGUFTestConfig(
     original_model="microsoft/Phi-3.5-mini-instruct",
     gguf_repo="bartowski/Phi-3.5-mini-instruct-GGUF",
@@ -87,6 +93,7 @@ def gguf_model(self):
 MODELS = [
     # LLAMA_CONFIG, # broken: https://github.com/vllm-project/vllm/issues/19458
     QWEN2_CONFIG,
+    QWEN3_CONFIG,
     PHI3_CONFIG,
     GPT2_CONFIG,
     STABLELM_CONFIG,
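For context, a plausible sketch of what these configs look like and how `gguf_model` resolves a local file path. The `GGUFTestConfig` definition sits above this hunk and is not shown, so its exact shape here is an assumption reconstructed from the fields used; `hf_hub_download` is the standard huggingface_hub download helper:

    from dataclasses import dataclass

    from huggingface_hub import hf_hub_download


    @dataclass
    class GGUFTestConfig:
        """Assumed shape: pairs an original HF checkpoint with its GGUF
        conversion so outputs from the two can be compared in the test."""
        original_model: str
        gguf_repo: str
        gguf_filename: str

        @property
        def gguf_model(self) -> str:
            # Download the GGUF file from the Hub and return its local path.
            return hf_hub_download(self.gguf_repo, filename=self.gguf_filename)


    QWEN3_CONFIG = GGUFTestConfig(
        original_model="Qwen/Qwen3-0.6B",
        gguf_repo="unsloth/Qwen3-0.6B-GGUF",
        gguf_filename="Qwen3-0.6B-BF16.gguf",
    )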
vllm/model_executor/model_loader

@@ -921,7 +921,17 @@ def gguf_quant_weights_iterator(
         name = gguf_to_hf_name_map[tensor.name]
         if weight_type.name not in ("F32", "BF16", "F16"):
             name = name.replace("weight", "qweight")
-        param = torch.tensor(weight)
+        if weight_type.name == "BF16" and tensor.data.dtype == np.uint8:
+            # BF16 is currently the only "quantization" type that isn't
+            # actually quantized but is read as a raw byte tensor.
+            # Reinterpret as `torch.bfloat16` tensor.
+            weight = weight.view(np.uint16)
+            if reader.byte_order == "S":
+                # GGUF endianness != system endianness
+                weight = weight.byteswap()
+            param = torch.tensor(weight).view(torch.bfloat16)
+        else:
+            param = torch.tensor(weight)
         yield name, param
 
 
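As a standalone illustration of the reinterpretation above (not vLLM code; the bit patterns below are hand-computed, and building a `torch.uint16` tensor from NumPy assumes a recent PyTorch, the same assumption the patch itself makes): GGUF stores BF16 tensors as raw bytes, so the reader yields a `np.uint8` array with two bytes per element. Viewing it as `np.uint16` and then as `torch.bfloat16` recovers the values without converting through `float32`:

    import numpy as np
    import torch

    # bf16 bit patterns for [1.0, -2.5, 3.25]: the top 16 bits of the
    # corresponding float32 encodings (0x3F800000, 0xC0200000, 0x40500000).
    raw = np.array([0x3F80, 0xC020, 0x4050], dtype=np.uint16).view(np.uint8)

    # What the GGUF reader hands back: a flat byte tensor, 2 bytes/element.
    as_u16 = raw.view(np.uint16)
    # If the file's endianness differed from the host's (byte_order == "S"
    # in the gguf reader), the bytes would need swapping first:
    #     as_u16 = as_u16.byteswap()
    param = torch.tensor(as_u16).view(torch.bfloat16)
    print(param)  # tensor([ 1.0000, -2.5000,  3.2500], dtype=torch.bfloat16)

The key design point mirrored from the patch: the bytes are reinterpreted in place rather than decoded, so loading a BF16 GGUF costs no precision and no float32 round-trip.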