
Commit 040456a

Add support for IQ2_XXS, IQ2_XS and Q2_K_S
1 parent ea51840 commit 040456a

2 files changed (+11 −17 lines)

README.md

Lines changed: 6 additions & 8 deletions
@@ -6,7 +6,7 @@ Think batch quantization like https://huggingface.co/TheBloke does it, but on yo
 
 ## Features
 
-- Easy configuration via a `.env` file
+- Easy configuration via one `.env` file
 - Automates the synchronization of Git repositories containing large files (LFS)
 - Only fetches one LFS object at a time
 - Displays a progress indicator on downloading LFS objects
@@ -18,7 +18,7 @@ Think batch quantization like https://huggingface.co/TheBloke does it, but on yo
 
 ### Prerequisites
 
-Use https://github.com/countzero/windows_llama.cpp to compile a specific version of the [llama.cpp](https://github.com/ggerganov/llama.cpp) project on your machine.
+Use https://github.com/countzero/windows_llama.cpp to compile a specific version of the [llama.cpp](https://github.com/ggerganov/llama.cpp) project on your machine. This also makes training data available.
 
 
 ### Clone the repository from GitHub
@@ -35,7 +35,7 @@ Create the following `.env` file in the project directory. Make sure to change t
 
 ```Env
 # Path to the llama.cpp project that contains the
-# convert.py script and the quantize.exe binary.
+# required conversion and quantization programs.
 LLAMA_CPP_DIRECTORY=C:\windows_llama.cpp\vendor\llama.cpp
 
 # Path to the training data for computing the importance matrix.
@@ -53,9 +53,6 @@ TARGET_DIRECTORY=.\gguf
 # physical drive to improve the quantization speed.
 CACHE_DIRECTORY=.\cache
 
-# Automatic removal of intermediate files in the cache directory.
-CLEAN_CACHE=True
-
 #
 # Comma separated list of quantization types.
 #
@@ -84,9 +81,10 @@ CLEAN_CACHE=True
 # F32 : 26.00G @ 7B
 # COPY : only copy tensors, no quantizing
 #
-# Hint: The sweet spot is Q5_K_M.
+# Hint: The sweet spot is Q5_K_M. The smallest quantization
+# without the need for an importance matrix is IQ3_XXS.
 #
-QUANTIZATION_TYPES=Q5_K_M,Q3_K_XS
+QUANTIZATION_TYPES=Q5_K_M,IQ3_XXS
 ```
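
For orientation only, here is a hypothetical PowerShell sketch (not part of this commit) showing how a `QUANTIZATION_TYPES` value could be split and checked against the quantization types that now require an importance matrix. The list of imatrix-dependent types comes from the diff; the example value (with IQ2_XS added) and the variable names are illustrative assumptions.

```powershell
# Hypothetical sketch: split a QUANTIZATION_TYPES value from the .env file
# and report which types depend on an importance matrix.
$quantizationTypes = "Q5_K_M,IQ3_XXS,IQ2_XS" -split ","

# Types this commit treats as imatrix-dependent.
$imatrixTypes = "IQ2_XXS", "IQ2_XS", "Q2_K_S"

foreach ($type in $quantizationTypes) {

    if ($imatrixTypes -contains $type) {
        Write-Host "${type}: requires an importance matrix (training data must be configured)."
    }
    else {
        Write-Host "${type}: can be quantized without an importance matrix."
    }
}
```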
quantize_weights_for_llama.cpp.ps1

Lines changed: 5 additions & 9 deletions
@@ -56,7 +56,7 @@ ForEach ($repositoryName in $repositoryDirectories) {
 Invoke-Expression "$convertCommand --outfile `"${unquantizedModelPath}`" `"${sourceDirectoryPath}`""
 }
 
-# We do need to compute an importance matrix for 2-bit quantized models:
+# We do need to compute an importance matrix for some 2-bit quantized models:
 # https://github.com/ggerganov/llama.cpp/tree/master/examples/imatrix
 $requiresImportanceMatrix = "IQ2_XXS IQ2_XS Q2_K_S".Contains($type)
 
@@ -76,24 +76,20 @@ ForEach ($repositoryName in $repositoryDirectories) {
 $quantizeCommand = "${llamaCppDirectory}\build\bin\Release\quantize.exe"
 
 if ($requiresImportanceMatrix) {
-$quantizeCommand = "${quantizeCommand} --imatrix=`"${importanceMatrixPath}`""
+$quantizeCommand = "${quantizeCommand} --imatrix `"${importanceMatrixPath}`""
 }
 
 Invoke-Expression "$quantizeCommand `"${unquantizedModelPath}`" `"${quantizedModelPath}`" `"${type}`""
 }
 }
 
-if ($cleanCache -and (Test-Path -Path $unquantizedModelPath)) {
+# Note that we are not removing *.importance-matrix.dat files because
+# they are relatively small but take a _very_ long time to compute.
+if (Test-Path -Path $unquantizedModelPath) {
 
 Write-Host "Removing intermediate unquantized model ${unquantizedModelPath}..." -ForegroundColor "DarkYellow"
 Remove-Item "${unquantizedModelPath}" -Recurse -Force
 }
-
-if ($cleanCache -and (Test-Path -Path $importanceMatrixPath)) {
-
-Write-Host "Removing intermediate unquantized model ${importanceMatrixPath}..." -ForegroundColor "DarkYellow"
-Remove-Item "${importanceMatrixPath}" -Recurse -Force
-}
 }
 
 $stopwatch.Stop()
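
For context, here is a hedged sketch of how the importance-matrix step referenced above (https://github.com/ggerganov/llama.cpp/tree/master/examples/imatrix) could feed the corrected `--imatrix` call for a type such as IQ2_XS. The `imatrix.exe` location, its `-m`/`-f`/`-o` parameters, and all file paths are assumptions about the local llama.cpp build, not something taken from this commit.

```powershell
# Hypothetical sketch; paths and imatrix.exe flags are assumptions.
$llamaCppDirectory    = "C:\windows_llama.cpp\vendor\llama.cpp"
$trainingDataPath     = ".\training_data\calibration.txt"
$unquantizedModelPath = ".\cache\example-7B.unquantized.gguf"
$importanceMatrixPath = ".\cache\example-7B.importance-matrix.dat"
$quantizedModelPath   = ".\gguf\example-7B.IQ2_XS.gguf"

# Compute the importance matrix once; the result is small but slow to produce.
$imatrixCommand = "${llamaCppDirectory}\build\bin\Release\imatrix.exe"
Invoke-Expression "$imatrixCommand -m `"${unquantizedModelPath}`" -f `"${trainingDataPath}`" -o `"${importanceMatrixPath}`""

# Quantize with the importance matrix, mirroring the corrected --imatrix usage above.
$quantizeCommand = "${llamaCppDirectory}\build\bin\Release\quantize.exe --imatrix `"${importanceMatrixPath}`""
Invoke-Expression "$quantizeCommand `"${unquantizedModelPath}`" `"${quantizedModelPath}`" `"IQ2_XS`""
```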
