@@ -1171,47 +1171,86 @@ class PoolingType(IntEnum):
11711171
11721172
class GGMLQuantizationType(IntEnum):
    """GGML tensor quantization / storage types.

    Numeric values identify the on-disk tensor type in GGUF files and must
    stay in sync with the corresponding C enum (presumably ``ggml_type`` in
    ggml.h — TODO confirm against the C header). The gaps in the numbering
    (e.g. 4–5, 34–35, 100–132) are intentional: those ids are removed,
    reserved, or used only by other forks, and must never be reused.
    """
    F32        = 0
    F16        = 1
    Q4_0       = 2
    Q4_1       = 3
    Q5_0       = 6
    Q5_1       = 7
    Q8_0       = 8
    Q8_1       = 9
    Q2_K       = 10
    Q3_K       = 11
    Q4_K       = 12
    Q5_K       = 13
    Q6_K       = 14
    Q8_K       = 15
    IQ2_XXS    = 16
    IQ2_XS     = 17
    IQ3_XXS    = 18
    IQ1_S      = 19
    IQ4_NL     = 20
    IQ3_S      = 21
    IQ2_S      = 22
    IQ4_XS     = 23
    I8         = 24
    I16        = 25
    I32        = 26
    I64        = 27
    F64        = 28
    IQ1_M      = 29
    BF16       = 30
    Q4_0_4_4   = 31
    Q4_0_4_8   = 32
    Q4_0_8_8   = 33
    I2_S       = 36
    Q8_0_X4    = 97
    Q8_1_X4    = 98
    Q8_2_X4    = 99
    Q6_0       = 133
    IQ1_BN     = 134
    IQ2_BN     = 135
    Q8_K64     = 136
    IQ2_K      = 137
    IQ3_K      = 138
    IQ4_K      = 139
    IQ5_K      = 140
    IQ6_K      = 141
    IQ4_KS     = 144
    IQ2_KS     = 145
    IQ4_KSS    = 146
    Q8_K16     = 147
    Q8_K32     = 148
    Q8_KR8     = 149
    Q8_K128    = 150
    Q8_KV      = 151
    # Row-interleaved ("_R4"/"_R8"/"_R16") repacked variants.
    Q4_0_R8    = 202
    Q5_0_R4    = 206
    Q8_0_R8    = 208
    Q2_K_R4    = 210
    Q3_K_R4    = 211
    Q4_K_R4    = 212
    Q5_K_R4    = 213
    Q6_K_R4    = 214
    IQ2_XXS_R4 = 216
    IQ2_XS_R4  = 217
    IQ3_XXS_R4 = 218
    IQ1_S_R4   = 219
    IQ4_NL_R4  = 220
    IQ3_S_R4   = 221
    IQ2_S_R4   = 222
    IQ4_XS_R8  = 223
    IQ1_M_R4   = 229
    BF16_R16   = 230
    Q6_0_R4    = 233
    IQ2_BN_R4  = 335
    IQ2_K_R4   = 337
    IQ3_K_R4   = 338
    IQ4_K_R4   = 339
    IQ5_K_R4   = 340
    IQ4_KS_R4  = 344
    Q8_KV_R8   = 398
    Q8_K_R8    = 399
12151254
12161255
12171256class ExpertGatingFuncType (IntEnum ):
@@ -1225,50 +1264,71 @@ class ExpertGatingFuncType(IntEnum):
# from llama_ftype in llama.h
# ALL VALUES SHOULD BE THE SAME HERE AS THEY ARE OVER THERE.
class LlamaFileType(IntEnum):
    """Overall model file quantization type (``llama_ftype`` in llama.h).

    Describes the dominant tensor type of a model file; "except 1d tensors"
    means 1-dimensional tensors (norms, biases) are typically kept at higher
    precision. Gaps in the numbering (5–6, 28–126, ...) belong to removed or
    fork-specific types and must not be reused.
    """
    ALL_F32              = 0
    MOSTLY_F16           = 1    # except 1d tensors
    MOSTLY_Q4_0          = 2    # except 1d tensors
    MOSTLY_Q4_1          = 3    # except 1d tensors
    MOSTLY_Q4_1_SOME_F16 = 4    # tok_embeddings.weight and output.weight are F16
    MOSTLY_Q8_0          = 7    # except 1d tensors
    MOSTLY_Q5_0          = 8    # except 1d tensors
    MOSTLY_Q5_1          = 9    # except 1d tensors
    MOSTLY_Q2_K          = 10   # except 1d tensors
    MOSTLY_Q3_K          = 11   # except 1d tensors
    MOSTLY_Q4_K          = 12   # except 1d tensors
    MOSTLY_Q5_K          = 13   # except 1d tensors
    MOSTLY_Q6_K          = 14   # except 1d tensors
    MOSTLY_IQ2_XXS       = 15   # except 1d tensors
    MOSTLY_IQ2_XS        = 16   # except 1d tensors
    MOSTLY_IQ3_XXS       = 17   # except 1d tensors
    MOSTLY_IQ1_S         = 18   # except 1d tensors
    MOSTLY_IQ4_NL        = 19   # except 1d tensors
    MOSTLY_IQ3_S         = 20   # except 1d tensors
    MOSTLY_IQ2_S         = 21   # except 1d tensors
    MOSTLY_IQ4_XS        = 22   # except 1d tensors
    MOSTLY_IQ1_M         = 23   # except 1d tensors
    MOSTLY_BF16          = 24   # except 1d tensors
    MOSTLY_Q4_0_4_4      = 25   # except 1d tensors
    MOSTLY_Q4_0_4_8      = 26   # except 1d tensors
    MOSTLY_Q4_0_8_8      = 27   # except 1d tensors
    MOSTLY_Q6_0          = 127  # except 1d tensors
    MOSTLY_IQ1_BN        = 128  # except 1d tensors
    MOSTLY_IQ2_BN        = 129  # except 1d tensors
    MOSTLY_IQ2_K         = 130  # except 1d tensors
    MOSTLY_IQ3_K         = 131  # except 1d tensors
    MOSTLY_IQ4_K         = 132  # except 1d tensors
    MOSTLY_IQ5_K         = 133  # except 1d tensors
    MOSTLY_IQ6_K         = 134  # except 1d tensors
    MOSTLY_IQ4_KS        = 137  # except 1d tensors
    MOSTLY_IQ2_KS        = 138  # except 1d tensors
    MOSTLY_IQ4_KSS       = 139  # except 1d tensors
    MOSTLY_Q8_KV         = 140  # except 1d tensors
    MOSTLY_Q4_0_R8       = 202  # except 1d tensors
    MOSTLY_Q8_0_R8       = 207  # except 1d tensors
    MOSTLY_Q5_0_R4       = 208  # except 1d tensors
    MOSTLY_Q2_K_R4       = 210  # except 1d tensors
    MOSTLY_Q3_K_R4       = 211  # except 1d tensors
    MOSTLY_Q4_K_R4       = 212  # except 1d tensors
    MOSTLY_Q5_K_R4       = 213  # except 1d tensors
    MOSTLY_Q6_K_R4       = 214  # except 1d tensors
    MOSTLY_IQ2_XXS_R4    = 215  # except 1d tensors
    MOSTLY_IQ2_XS_R4     = 216  # except 1d tensors
    MOSTLY_IQ3_XXS_R4    = 217  # except 1d tensors
    MOSTLY_IQ1_S_R4      = 218  # except 1d tensors
    MOSTLY_IQ4_NL_R4     = 219  # except 1d tensors
    MOSTLY_IQ3_S_R4      = 220  # except 1d tensors
    MOSTLY_IQ2_S_R4      = 221  # except 1d tensors
    MOSTLY_IQ4_XS_R8     = 222  # except 1d tensors
    MOSTLY_IQ1_M_R4      = 223  # except 1d tensors
    MOSTLY_BF16_R16      = 224  # except 1d tensors
    MOSTLY_Q6_0_R4       = 227  # except 1d tensors
    MOSTLY_IQ2_BN_R4     = 329  # except 1d tensors
    MOSTLY_IQ2_K_R4      = 330  # except 1d tensors
    MOSTLY_IQ3_K_R4      = 331  # except 1d tensors
    MOSTLY_IQ4_K_R4      = 332  # except 1d tensors
    MOSTLY_IQ5_K_R4      = 333  # except 1d tensors
    MOSTLY_IQ4_KS_R4     = 337  # except 1d tensors
    MOSTLY_Q8_KV_R8      = 398  # except 1d tensors
    MOSTLY_Q8_K_R8       = 399  # except 1d tensors

    GUESSED              = 1024  # not specified in the model file
@@ -1313,39 +1373,89 @@ def get_type(val: Any) -> GGUFValueType:
13131373
# Items here are (block size, type size)
QK_K = 256  # super-block size (in weights) shared by all *_K quant types

# Values generated programmatically.
# Maps each quantization type to (elements per block, bytes per block);
# used to convert tensor element counts to on-disk byte sizes.
# Must match the C-side type traits — TODO confirm against ggml.h.
GGML_QUANT_SIZES: dict[GGMLQuantizationType, tuple[int, int]] = {
    GGMLQuantizationType.F32:        (1, 4),
    GGMLQuantizationType.F16:        (1, 2),
    GGMLQuantizationType.Q4_0:       (32, 18),
    GGMLQuantizationType.Q4_1:       (32, 20),
    GGMLQuantizationType.Q5_0:       (32, 22),
    GGMLQuantizationType.Q5_1:       (32, 24),
    GGMLQuantizationType.Q8_0:       (32, 34),
    GGMLQuantizationType.Q8_1:       (32, 36),
    GGMLQuantizationType.Q2_K:       (256, 84),
    GGMLQuantizationType.Q3_K:       (256, 110),
    GGMLQuantizationType.Q4_K:       (256, 144),
    GGMLQuantizationType.Q5_K:       (256, 176),
    GGMLQuantizationType.Q6_K:       (256, 210),
    GGMLQuantizationType.Q8_K:       (256, 292),
    GGMLQuantizationType.IQ2_XXS:    (256, 66),
    GGMLQuantizationType.IQ2_XS:     (256, 74),
    GGMLQuantizationType.IQ3_XXS:    (256, 98),
    GGMLQuantizationType.IQ1_S:      (256, 50),
    GGMLQuantizationType.IQ4_NL:     (32, 18),
    GGMLQuantizationType.IQ3_S:      (256, 110),
    GGMLQuantizationType.IQ2_S:      (256, 82),
    GGMLQuantizationType.IQ4_XS:     (256, 136),
    GGMLQuantizationType.I8:         (1, 1),
    GGMLQuantizationType.I16:        (1, 2),
    GGMLQuantizationType.I32:        (1, 4),
    GGMLQuantizationType.I64:        (1, 8),
    GGMLQuantizationType.F64:        (1, 8),
    GGMLQuantizationType.IQ1_M:      (256, 56),
    GGMLQuantizationType.BF16:       (1, 2),
    GGMLQuantizationType.Q4_0_4_4:   (32, 18),
    GGMLQuantizationType.Q4_0_4_8:   (32, 18),
    GGMLQuantizationType.Q4_0_8_8:   (32, 18),
    GGMLQuantizationType.I2_S:       (1, 1),
    GGMLQuantizationType.Q8_0_X4:    (32, 34),
    GGMLQuantizationType.Q8_1_X4:    (32, 36),
    GGMLQuantizationType.Q8_2_X4:    (32, 36),
    GGMLQuantizationType.Q6_0:       (32, 26),
    GGMLQuantizationType.IQ1_BN:     (64, 13),
    GGMLQuantizationType.IQ2_BN:     (64, 16),
    GGMLQuantizationType.Q8_K64:     (64, 68),
    GGMLQuantizationType.IQ2_K:      (256, 76),
    GGMLQuantizationType.IQ3_K:      (256, 110),
    GGMLQuantizationType.IQ4_K:      (256, 144),
    GGMLQuantizationType.IQ5_K:      (256, 176),
    GGMLQuantizationType.IQ6_K:      (256, 212),
    GGMLQuantizationType.IQ4_KS:     (256, 136),
    GGMLQuantizationType.IQ2_KS:     (256, 70),
    GGMLQuantizationType.IQ4_KSS:    (256, 128),
    GGMLQuantizationType.Q8_K16:     (64, 64),
    GGMLQuantizationType.Q8_K32:     (256, 292),
    GGMLQuantizationType.Q8_KR8:     (256, 292),
    GGMLQuantizationType.Q8_K128:    (128, 140),
    GGMLQuantizationType.Q8_KV:      (32, 32),
    GGMLQuantizationType.Q4_0_R8:    (32, 18),
    GGMLQuantizationType.Q5_0_R4:    (32, 22),
    GGMLQuantizationType.Q8_0_R8:    (32, 34),
    GGMLQuantizationType.Q2_K_R4:    (256, 84),
    GGMLQuantizationType.Q3_K_R4:    (256, 110),
    GGMLQuantizationType.Q4_K_R4:    (256, 144),
    GGMLQuantizationType.Q5_K_R4:    (256, 176),
    GGMLQuantizationType.Q6_K_R4:    (256, 210),
    GGMLQuantizationType.IQ2_XXS_R4: (256, 66),
    GGMLQuantizationType.IQ2_XS_R4:  (256, 74),
    GGMLQuantizationType.IQ3_XXS_R4: (256, 98),
    GGMLQuantizationType.IQ1_S_R4:   (32, 6),
    GGMLQuantizationType.IQ4_NL_R4:  (32, 18),
    GGMLQuantizationType.IQ3_S_R4:   (256, 110),
    GGMLQuantizationType.IQ2_S_R4:   (256, 82),
    GGMLQuantizationType.IQ4_XS_R8:  (256, 136),
    GGMLQuantizationType.IQ1_M_R4:   (32, 7),
    GGMLQuantizationType.BF16_R16:   (1, 2),
    GGMLQuantizationType.Q6_0_R4:    (32, 26),
    GGMLQuantizationType.IQ2_BN_R4:  (64, 16),
    GGMLQuantizationType.IQ2_K_R4:   (256, 76),
    GGMLQuantizationType.IQ3_K_R4:   (256, 110),
    GGMLQuantizationType.IQ4_K_R4:   (256, 144),
    GGMLQuantizationType.IQ5_K_R4:   (256, 176),
    GGMLQuantizationType.IQ4_KS_R4:  (256, 136),
    GGMLQuantizationType.Q8_KV_R8:   (32, 32),
    GGMLQuantizationType.Q8_K_R8:    (256, 258),
}
13501460
13511461
0 commit comments