|
| 1 | +// This list is copied from gguf/types.ts, but will all types available (for backward compatibility) |
| 2 | +// NOT to be confused with GGMLQuantizationType, a FileQuantization can contain multiple GGMLQuantizationType |
| 3 | +// For example, Q4_K_M model can contains Q4_K and Q6_K tensors |
| 4 | +export enum GGMLFileQuantizationType { |
| 5 | + F32 = 0, |
| 6 | + F16 = 1, |
| 7 | + Q4_0 = 2, |
| 8 | + Q4_1 = 3, |
| 9 | + Q4_1_SOME_F16 = 4, |
| 10 | + Q4_2 = 5, |
| 11 | + Q4_3 = 6, |
| 12 | + Q8_0 = 7, |
| 13 | + Q5_0 = 8, |
| 14 | + Q5_1 = 9, |
| 15 | + Q2_K = 10, |
| 16 | + Q3_K_S = 11, |
| 17 | + Q3_K_M = 12, |
| 18 | + Q3_K_L = 13, |
| 19 | + Q4_K_S = 14, |
| 20 | + Q4_K_M = 15, |
| 21 | + Q5_K_S = 16, |
| 22 | + Q5_K_M = 17, |
| 23 | + Q6_K = 18, |
| 24 | + IQ2_XXS = 19, |
| 25 | + IQ2_XS = 20, |
| 26 | + Q2_K_S = 21, |
| 27 | + IQ3_XS = 22, |
| 28 | + IQ3_XXS = 23, |
| 29 | + IQ1_S = 24, |
| 30 | + IQ4_NL = 25, |
| 31 | + IQ3_S = 26, |
| 32 | + IQ3_M = 27, |
| 33 | + IQ2_S = 28, |
| 34 | + IQ2_M = 29, |
| 35 | + IQ4_XS = 30, |
| 36 | + IQ1_M = 31, |
| 37 | + BF16 = 32, |
| 38 | + Q4_0_4_4 = 33, |
| 39 | + Q4_0_4_8 = 34, |
| 40 | + Q4_0_8_8 = 35, |
| 41 | + TQ1_0 = 36, |
| 42 | + TQ2_0 = 37, |
| 43 | +} |
| 44 | + |
| 45 | +const ggufQuants = Object.values(GGMLFileQuantizationType).filter((v): v is string => typeof v === "string"); |
| 46 | +export const GGUF_QUANT_RE = new RegExp(`(?<quant>${ggufQuants.join("|")})` + "(_(?<sizeVariation>[A-Z]+))?"); |
| 47 | +export const GGUF_QUANT_RE_GLOBAL = new RegExp(GGUF_QUANT_RE, "g"); |
| 48 | + |
| 49 | +export function parseGGUFQuantLabel(fname: string): string | undefined { |
| 50 | + const quantLabel = fname.toUpperCase().match(GGUF_QUANT_RE_GLOBAL)?.at(-1); // if there is multiple quant substrings in a name, we prefer the last one |
| 51 | + return quantLabel; |
| 52 | +} |
| 53 | + |
// Order of quantizations, from biggest/most precise to smallest.
// This list must be kept in sync with the members of GGMLFileQuantizationType;
// the gguf.spec.ts tests are used to verify that the order is correct.
export const GGUF_QUANT_ORDER: GGMLFileQuantizationType[] = [
	GGMLFileQuantizationType.F32,
	GGMLFileQuantizationType.BF16,
	GGMLFileQuantizationType.F16,
	GGMLFileQuantizationType.Q8_0,

	// 6-bit quantizations
	GGMLFileQuantizationType.Q6_K,

	// 5-bit quantizations
	GGMLFileQuantizationType.Q5_0,
	GGMLFileQuantizationType.Q5_1,
	GGMLFileQuantizationType.Q5_K_M,
	GGMLFileQuantizationType.Q5_K_S,

	// 4-bit quantizations
	GGMLFileQuantizationType.Q4_K_M,
	GGMLFileQuantizationType.Q4_K_S,
	GGMLFileQuantizationType.IQ4_NL,
	GGMLFileQuantizationType.IQ4_XS,
	GGMLFileQuantizationType.Q4_0_4_4,
	GGMLFileQuantizationType.Q4_0_4_8,
	GGMLFileQuantizationType.Q4_0_8_8,
	GGMLFileQuantizationType.Q4_0,
	GGMLFileQuantizationType.Q4_1_SOME_F16,
	GGMLFileQuantizationType.Q4_1,
	GGMLFileQuantizationType.Q4_2,
	GGMLFileQuantizationType.Q4_3,

	// 3-bit quantizations
	GGMLFileQuantizationType.Q3_K_L,
	GGMLFileQuantizationType.Q3_K_M,
	GGMLFileQuantizationType.Q3_K_S,
	GGMLFileQuantizationType.IQ3_M,
	GGMLFileQuantizationType.IQ3_S,
	GGMLFileQuantizationType.IQ3_XS,
	GGMLFileQuantizationType.IQ3_XXS,

	// 2-bit quantizations
	GGMLFileQuantizationType.Q2_K,
	GGMLFileQuantizationType.Q2_K_S,
	GGMLFileQuantizationType.IQ2_M,
	GGMLFileQuantizationType.IQ2_S,
	GGMLFileQuantizationType.IQ2_XS,
	GGMLFileQuantizationType.IQ2_XXS,

	// 1-bit quantizations (TQ1_0/TQ2_0 are ternary types, grouped here as the smallest)
	GGMLFileQuantizationType.IQ1_S,
	GGMLFileQuantizationType.IQ1_M,
	GGMLFileQuantizationType.TQ1_0,
	GGMLFileQuantizationType.TQ2_0,
];
| 109 | + |
| 110 | +// This function finds the nearest quantization type that is less than or equal to the given quantization type. |
| 111 | +// It returns undefined if no such quantization type is found. |
| 112 | +export function findNearestQuantType( |
| 113 | + quant: GGMLFileQuantizationType, |
| 114 | + availableQuants: GGMLFileQuantizationType[] |
| 115 | +): GGMLFileQuantizationType | undefined { |
| 116 | + // Create a map for quick index lookup from the defined order |
| 117 | + const orderMap = new Map<GGMLFileQuantizationType, number>(); |
| 118 | + GGUF_QUANT_ORDER.forEach((q, index) => { |
| 119 | + orderMap.set(q, index); |
| 120 | + }); |
| 121 | + |
| 122 | + const targetIndex = orderMap.get(quant) ?? 0; // the 0 case should never happen |
| 123 | + |
| 124 | + // Filter the available quantizations to include only those defined in the order map, |
| 125 | + // then sort them according to the GGUF_QUANT_ORDER (from largest/index 0 to smallest/highest index). |
| 126 | + const sortedAvailable = availableQuants |
| 127 | + .filter((q) => orderMap.has(q)) |
| 128 | + .sort((a, b) => (orderMap.get(a) ?? Infinity) - (orderMap.get(b) ?? Infinity)); |
| 129 | + |
| 130 | + // If no valid quantizations are available after filtering |
| 131 | + if (sortedAvailable.length === 0) { |
| 132 | + return undefined; |
| 133 | + } |
| 134 | + |
| 135 | + // Iterate through the sorted available quantizations (largest to smallest). |
| 136 | + // Find the first one whose order index is >= the target index. |
| 137 | + // This means finding the largest quantization that is smaller than or equal to the target. |
| 138 | + for (const availableQuant of sortedAvailable) { |
| 139 | + // We know the key exists due to the filter above. |
| 140 | + const availableIndex = orderMap.get(availableQuant)!; |
| 141 | + if (availableIndex >= targetIndex) { |
| 142 | + return availableQuant; |
| 143 | + } |
| 144 | + } |
| 145 | + |
| 146 | + // If the loop completes, it means all available quantizations are larger (have a smaller index) |
| 147 | + // than the target quantization. In this case, return the "smallest" available quantization, |
| 148 | + // which is the last element in the sorted list (highest index among available). |
| 149 | + return sortedAvailable[sortedAvailable.length - 1]; |
| 150 | +} |
| 151 | + |
| 152 | +// This list is only used to calculate the size of the model, NOT to be confused with the quantization FILE type |
1 | 153 | export enum GGMLQuantizationType { |
2 | 154 | F32 = 0, |
3 | 155 | F16 = 1, |
@@ -28,13 +180,6 @@ export enum GGMLQuantizationType { |
28 | 180 | F64 = 28, |
29 | 181 | IQ1_M = 29, |
30 | 182 | BF16 = 30, |
31 | | -} |
32 | | - |
33 | | -const ggufQuants = Object.values(GGMLQuantizationType).filter((v): v is string => typeof v === "string"); |
34 | | -export const GGUF_QUANT_RE = new RegExp(`(?<quant>${ggufQuants.join("|")})` + "(_(?<sizeVariation>[A-Z]+))?"); |
35 | | -export const GGUF_QUANT_RE_GLOBAL = new RegExp(GGUF_QUANT_RE, "g"); |
36 | | - |
37 | | -export function parseGGUFQuantLabel(fname: string): string | undefined { |
38 | | - const quantLabel = fname.toUpperCase().match(GGUF_QUANT_RE_GLOBAL)?.at(-1); // if there is multiple quant substrings in a name, we prefer the last one |
39 | | - return quantLabel; |
| 183 | + TQ1_0 = 34, |
| 184 | + TQ2_0 = 35, |
40 | 185 | } |
0 commit comments