@@ -428,9 +428,57 @@ def dequantize_blocks_Q2_K(blocks, block_size, type_size, dtype=None):
 def dequantize_blocks_BF16(blocks, block_size, type_size, dtype=None):
     return (blocks.view(torch.int16).to(torch.int32) << 16).view(torch.float32)
 
+# this part is from calcuis (gguf.org)
+# more info: https://github.com/calcuis/gguf-connector/blob/main/src/gguf_connector/quant2c.py
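+# Both IQ4 variants below pack two 4-bit indices per byte; each index selects
+# an entry from a 16-value non-linear codebook (kvalues), which is then scaled
+# by a per-block (IQ4_NL) or per-sub-block (IQ4_XS) fp16 scale.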
+
+def dequantize_blocks_IQ4_NL(blocks, block_size, type_size, dtype=None):
+    # 16-entry non-linear codebook shared by the IQ4 quant types
+    kvalues = torch.tensor(
+        [-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113],
+        dtype=torch.float32, device=blocks.device
+    )
+    n_blocks = blocks.shape[0]
+    # block layout: 2-byte fp16 scale d, then block_size // 2 bytes of packed nibbles
+    d, qs = split_block_dims(blocks, 2)
+    d = d.view(torch.float16).to(dtype)
+    # split the low and high nibble of each byte into separate 4-bit indices
+    qs = qs.reshape((n_blocks, -1, 1, block_size // 2)) >> torch.tensor(
+        [0, 4], device=blocks.device, dtype=torch.uint8
+    ).reshape((1, 1, 2, 1))
+    qs = (qs & 15).reshape((n_blocks, -1)).to(torch.int64)
+    # look each index up in the codebook
+    kvalues = kvalues.view(1, 1, 16)
+    qs = qs.unsqueeze(-1)
+    qs = torch.gather(kvalues.expand(qs.shape[0], qs.shape[1], 16), 2, qs)
+    qs = qs.squeeze(-1).to(dtype)
+    return d * qs
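+
+# A minimal usage sketch, assuming `raw` is a uint8 tensor of packed IQ4_NL
+# data (the (block_size, type_size) pair comes from gguf.GGML_QUANT_SIZES):
+#   bs, ts = gguf.GGML_QUANT_SIZES[gguf.GGMLQuantizationType.IQ4_NL]
+#   out = dequantize_blocks_IQ4_NL(raw.reshape(-1, ts), bs, ts, torch.float32)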
+
+def dequantize_blocks_IQ4_XS(blocks, block_size, type_size, dtype=None):
+    kvalues = torch.tensor(
+        [-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113],
+        dtype=torch.float32, device=blocks.device
+    )
+    n_blocks = blocks.shape[0]
+    # block layout: fp16 d, 16-bit scales_h, QK_K // 64 bytes of scales_l, packed nibbles
+    d, scales_h, scales_l, qs = split_block_dims(blocks, 2, 2, QK_K // 64)
+    d = d.view(torch.float16).to(dtype)
+    scales_h = scales_h.view(torch.int16)
+    # rebuild one 6-bit scale per 32-weight sub-block: low 4 bits from scales_l,
+    # high 2 bits from scales_h, re-centered by subtracting 32
+    scales_l = scales_l.reshape((n_blocks, -1, 1)) >> torch.tensor(
+        [0, 4], device=blocks.device, dtype=torch.uint8).reshape((1, 1, 2))
+    scales_h = scales_h.reshape((n_blocks, 1, -1)) >> torch.tensor(
+        [2 * i for i in range(QK_K // 32)], device=blocks.device, dtype=torch.uint8).reshape((1, -1, 1))
+    scales_l = scales_l.reshape((n_blocks, -1)) & 0x0F
+    scales_h = scales_h.reshape((n_blocks, -1)) & 0x03
+    scales = (scales_l | (scales_h << 4)) - 32
+    dl = (d * scales.to(dtype)).reshape((n_blocks, -1, 1))
+    # unpack two 4-bit indices per byte, then look them up in the codebook
+    shifts_q = torch.tensor([0, 4], device=blocks.device, dtype=torch.uint8).reshape(1, 1, 2, 1)
+    qs = qs.reshape((n_blocks, -1, 1, 16)) >> shifts_q
+    qs = (qs & 15).reshape((n_blocks, -1, 32)).to(torch.int64)
+    kvalues = kvalues.view(1, 1, 1, 16)
+    qs = qs.unsqueeze(-1)
+    qs = torch.gather(kvalues.expand(qs.shape[0], qs.shape[1], qs.shape[2], 16), 3, qs)
+    qs = qs.squeeze(-1).to(dtype)
+    return (dl * qs).reshape(n_blocks, -1)
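+
+# Size sanity note, assuming the (block_size, type_size) pairs registered in
+# gguf.GGML_QUANT_SIZES are (32, 18) for IQ4_NL and (256, 136) for IQ4_XS:
+# both functions return exactly block_size dequantized weights per block.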
 
 GGML_QUANT_SIZES = gguf.GGML_QUANT_SIZES
 dequantize_functions = {
+    gguf.GGMLQuantizationType.IQ4_NL: dequantize_blocks_IQ4_NL,
+    gguf.GGMLQuantizationType.IQ4_XS: dequantize_blocks_IQ4_XS,
     gguf.GGMLQuantizationType.BF16: dequantize_blocks_BF16,
     gguf.GGMLQuantizationType.Q8_0: dequantize_blocks_Q8_0,
     gguf.GGMLQuantizationType.Q5_1: dequantize_blocks_Q5_1,