
Commit e74c1ca

feat: Add support for 4-bit quantization
1 parent 912e11e commit e74c1ca

8 files changed: +107 -40 lines changed


.gitignore

Lines changed: 3 additions & 0 deletions
@@ -3,6 +3,7 @@ data/
 runs/
 runs_opt/
 backup/
+batchtest/
 *.obj
 *.dll
 *.exp
@@ -16,6 +17,8 @@ backup/
 *.lst
 *.bkp
 *.pdf
+*.log
+*.elog
 # python cache
 __pycache__/
 venv/

BitNetMCU.py

Lines changed: 32 additions & 29 deletions
@@ -14,20 +14,21 @@ class FCMNIST(nn.Module):
     @cpldcpu 2024-March-24

     """
-    def __init__(self,network_width1=64,network_width2=64,network_width3=64,QuantType='Binary',WScale='PerTensor',NormType='RMS'):
+    def __init__(self,network_width1=64,network_width2=64,network_width3=64,QuantType='Binary',WScale='PerTensor',NormType='RMS', quantscale=0.25):
         super(FCMNIST, self).__init__()

         self.network_width1 = network_width1
         self.network_width2 = network_width2
         self.network_width3 = network_width3
+        self.quantscale = quantscale

-        self.fc1 = BitLinear(1* 1 *16 *16, network_width1,QuantType=QuantType,NormType=NormType, WScale=WScale)
-        self.fc2 = BitLinear(network_width1, network_width2,QuantType=QuantType,NormType=NormType, WScale=WScale)
+        self.fc1 = BitLinear(1* 1 *16 *16, network_width1,QuantType=QuantType,NormType=NormType, WScale=WScale, quantscale=quantscale)
+        self.fc2 = BitLinear(network_width1, network_width2,QuantType=QuantType,NormType=NormType, WScale=WScale, quantscale=quantscale)
         if network_width3>0:
-            self.fc3 = BitLinear(network_width2, network_width3,QuantType=QuantType,NormType=NormType, WScale=WScale)
-            self.fcl = BitLinear(network_width3, 10,QuantType=QuantType,NormType=NormType, WScale=WScale)
+            self.fc3 = BitLinear(network_width2, network_width3,QuantType=QuantType,NormType=NormType, WScale=WScale, quantscale=quantscale)
+            self.fcl = BitLinear(network_width3, 10,QuantType=QuantType,NormType=NormType, WScale=WScale, quantscale=quantscale)
         else:
-            self.fcl = BitLinear(network_width2, 10,QuantType=QuantType,NormType=NormType, WScale=WScale)
+            self.fcl = BitLinear(network_width2, 10,QuantType=QuantType,NormType=NormType, WScale=WScale, quantscale=quantscale)

         # self.dropout = nn.Dropout(0.10)

@@ -64,18 +65,23 @@ class BitLinear(nn.Linear):
        - PerTensor : The weight scaling is calculated per Tensor
        - PerOutput : The weight scaling is calculated per Output

+    quantscale
+        - scalar : The scale factor for the weight quantization; the default of 0.25
+          biases the stddev of the weights toward 25% of the maximum scale
+
    Implementation based on:
    https://github.com/microsoft/unilm/blob/master/bitnet/The-Era-of-1-bit-LLMs__Training_Tips_Code_FAQ.pdf

    This is not optimized for speed or efficiency...

    @cpldcpu 2024-March-24
    """
-    def __init__(self, in_features, out_features, bias=False, QuantType='Binary', WScale='PerTensor', NormType='RMS'):
+    def __init__(self, in_features, out_features, bias=False, QuantType='Binary', WScale='PerTensor', NormType='RMS', quantscale=0.25):
        super(BitLinear, self).__init__(in_features, out_features, bias=False)
        self.QuantType = QuantType
        self.NormType = NormType
        self.WScale = WScale
+       self.quantscale = quantscale

        # flat init - does not help so keep default
        # fan_in, _ = init._calculate_fan_in_and_fan_out(self.weight)
@@ -132,12 +138,6 @@ def weight_quant(self, w):
         if self.QuantType == 'Ternary': # 1.58bits
             scale = 1.0 / mag
             u = (w * scale).round().clamp_(-1, 1) / scale
-        elif self.QuantType == 'Ternary06': # 1 bit
-            scale = 0.6 / mag
-            u = (w * scale).round().clamp_(-1, 1) / scale
-        elif self.QuantType == 'Ternary4': # 1 bit
-            scale = 4 / mag
-            u = (w * scale).round().clamp_(-1, 1) / scale
         elif self.QuantType == 'Binary': # 1 bit
             scale = mag
             e = w.mean()
@@ -146,27 +146,24 @@ def weight_quant(self, w):
             scale = mag
             # e = w.mean()
             u = w.sign() * scale
-        elif self.QuantType == 'BinarySymHS': # 1 bit
-            scale = mag
-            u = w.sign() * scale * 0.5
-        elif self.QuantType == 'BinarySymDS': # 1 bit
-            scale = mag
-            u = w.sign() * scale * 2.0
         elif self.QuantType == '2bitsym':
             scale = 1.0 / mag # 2 worst, 1 better, 1.5 almost as bad as 2
             u = ((w * scale - 0.5).round().clamp_(-2, 1) + 0.5) / scale
+        elif self.QuantType == '4bit': # 4 bit in one-complement encoding for inference with multiplication
+            scale = self.quantscale * 8.0 / mag # 2.0 for tensor, 6.5 for output
+            u = ((w * scale).round().clamp_(-8, 7)) / scale
         elif self.QuantType == '4bitsym':
-            scale = 2.0 / mag # 2.0 for tensor, 6.5 for output
+            scale = self.quantscale * 8.0 / mag # 2.0 for tensor, 6.5 for output
             u = ((w * scale - 0.5).round().clamp_(-8, 7) + 0.5) / scale
-        elif self.QuantType == 'FP130': # encoding (F1.3.0) : S * ( 2^E3 + 1) -> min 2^0 = 1, max 2^7 = 127
-            scale = 16.0 / mag
+        elif self.QuantType == 'FP130': # encoding (F1.3.0) : S * ( 2^E3 + 1) -> min 2^0 = 1, max 2^7 = 128
+            scale = 128.0 * self.quantscale / mag
             e = ((w * scale).abs()).log2().floor().clamp_(0, 7)
             u = w.sign()*(e.exp2()) / scale
         elif self.QuantType == '5bitsym':
-            scale = 4.0 / mag # 4.0 for tensor, 13 for output
+            scale = 16.0 * self.quantscale / mag # 4.0 for tensor, 13 for output
             u = ((w * scale - 0.5).round().clamp_(-16, 15) + 0.5) / scale
         elif self.QuantType == '8bit': # -128 to 127
-            scale = 32.0 / mag
+            scale = 128.0 * self.quantscale / mag
             u = (w * scale).round().clamp_(-128, 127) / scale
         else:
             raise AssertionError(f"Invalid QuantType: {self.QuantType}. Expected one of: 'Binary', 'BinaryBalanced', '2bitsym', '4bitsym', '8bit'")
@@ -197,13 +194,15 @@ class QuantizedModel:
    This class represents a quantized model. It provides functionality to quantize a given model.
    """

-    def __init__(self, model = None, force_quantization = None):
+    def __init__(self, model = None, force_quantization = None, quantscale=0.25):
         self.quantized_model=None
         self.total_bits=0
         self.force_quantization = force_quantization
+        self.quantscale = quantscale

         if model is not None:
             self.quantized_model, _ = self.quantize(model)
+            self.quantscale = model.quantscale

    def totalbits(self):
        """
@@ -263,21 +262,25 @@ def quantize(self,model):
                scale = 1.0 / mag # 2 worst, 1 better, 1.5 almost as bad as 2
                u = ((w * scale - 0.5).round().clamp_(-2, 1) + 0.5)
                bpw = 2
+            elif QuantType == '4bit': # 4 bit in one-complement encoding for inference with multiplication
+                scale = 8.0 * self.quantscale / mag # 2.0 for tensor, 6.5 for output
+                u = ((w * scale).round().clamp_(-8, 7))
+                bpw = 4
            elif QuantType == '4bitsym':
-                scale = 2.0 / mag # 2.0 for tensor, 6.5 for output
+                scale = 8.0 * self.quantscale / mag # 2.0 for tensor, 6.5 for output
                u = ((w * scale - 0.5).round().clamp_(-8, 7) + 0.5)
                bpw = 4
            elif QuantType == 'FP130':
-                scale = 16.0 / mag
+                scale = 128.0 * self.quantscale / mag
                e = ((w * scale ).abs()).log2().floor().clamp_(0, 7)
                u = w.sign()*(e.exp2() )
                bpw = 4
            elif QuantType == '5bitsym':
-                scale = 4.0 / mag # 4.0 for tensor, 14 for output
+                scale = 16.0 * self.quantscale / mag # 4.0 for tensor, 14 for output
                u = ((w * scale - 0.5).round().clamp_(-16, 15) + 0.5)
                bpw = 5
            elif QuantType == '8bit':
-                scale = 32.0 / mag
+                scale = 128.0 * self.quantscale / mag
                u = (w * scale).round().clamp_(-128, 127)
                bpw = 8
            elif QuantType == 'None':
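
Taken together, the new `quantscale` parameter folds the previously hard-coded per-tensor factors into one knob: with the default of 0.25, `quantscale * 8.0 / mag` reproduces the old `2.0 / mag` for '4bitsym', and `128.0 * 0.25 = 32.0` matches the old '8bit' factor. A minimal PyTorch sketch of the new '4bit' branch (the per-tensor `mag` computation and the straight-through pass are assumptions based on the surrounding BitLinear code, which is not part of this diff):

```
import torch

def quant_4bit(w: torch.Tensor, quantscale: float = 0.25) -> torch.Tensor:
    # Per-tensor magnitude; assumed here to be the mean absolute weight
    # (computed as `mag` outside the hunks shown above).
    mag = w.abs().mean().clamp(min=1e-5)
    scale = quantscale * 8.0 / mag                   # 0.25 * 8.0 / mag == the old 2.0 / mag
    u = (w * scale).round().clamp_(-8, 7) / scale    # integer codes -8..7, rescaled back
    # Straight-through estimator so rounding is transparent to gradients
    # (assumed, mirroring the usual BitNet-style BitLinear forward).
    return w + (u - w).detach()

w = torch.randn(64, 256) * 0.05
wq = quant_4bit(w)
codes = (wq * (0.25 * 8.0 / w.abs().mean())).round()
print(int(codes.min()), int(codes.max()))            # stays within [-8, 7]
```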

BitNetMCU_inference.c

Lines changed: 26 additions & 0 deletions
@@ -111,6 +111,8 @@ void processfclayer( int8_t *activations, const uint32_t *weights, int32_t bits
                 weightChunk <<= 2;
             }
         }
+    // Multiplier-less inference for RB32EC
+    #if defined(__riscv) && !defined(__riscv_mul)
     } else if (bits_per_weight == 4 ) {
         for (uint32_t k = 0; k < n_input; k+=8) {
             uint32_t weightChunk = *weightidx++;
@@ -126,6 +128,30 @@ void processfclayer( int8_t *activations, const uint32_t *weights, int32_t bits
                 weightChunk <<= 4;
             }
         }
+    #else
+    } else if (bits_per_weight == 4 ) {
+        for (uint32_t k = 0; k < n_input; k+=8) {
+            uint32_t weightChunk = *weightidx++;
+            for (uint32_t j = 0; j < 8; j++) {
+                int32_t in=*activations_idx++;
+                if (in != 0) { // Skip zero activations to speed up inference in layers after first layer
+                    int32_t tmpsum = (weightChunk & 0x80000000) ? -in : in; // one complements sign (bit set equals negative)
+                    sum += tmpsum * ((weightChunk>>(32-4))&7); // sign*in*1
+                }
+                weightChunk <<= 4;
+            }
+        }
+    #endif
+    } else if (bits_per_weight == 8 + 4 ) { // 4 bit twos-complement
+        for (uint32_t k = 0; k < n_input; k+=8) {
+            int32_t weightChunk = *weightidx++;
+            for (uint32_t j = 0; j < 8; j++) {
+                int32_t in=*activations_idx++;
+                int32_t weight = (weightChunk) >> (32-4); // extend sign, cut off lower bits
+                sum += in*weight;
+                weightChunk <<= 4;
+            }
+        }
    } else if (bits_per_weight == 16 + 4 ) { // 4 bit shift
        for (uint32_t k = 0; k < n_input; k+=8) {
            uint32_t weightChunk = *weightidx++;
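
The new `bits_per_weight == 8 + 4` path multiplies each activation by a sign-extended two's-complement nibble taken from the top of a 32-bit chunk. A small Python sketch that emulates that decode (chunk layout as implied by the C loop above; this is an illustration, not the exporter's packing routine):

```
def unpack_4bit_twos_complement(chunk: int) -> list:
    """Emulate the C decode: eight 4-bit two's-complement weights packed
    MSB-first in one 32-bit word, each sign-extended before the multiply."""
    weights = []
    for _ in range(8):
        nibble = (chunk >> 28) & 0xF                              # weightChunk >> (32 - 4)
        weights.append(nibble - 16 if nibble & 0x8 else nibble)   # sign-extend
        chunk = (chunk << 4) & 0xFFFFFFFF                         # weightChunk <<= 4
    return weights

print(unpack_4bit_twos_complement(0x8F700000)[:4])   # [-8, -1, 7, 0]
```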

docs/documentation.md

Lines changed: 18 additions & 1 deletion
@@ -538,7 +538,10 @@ By simplifying the model architecture and using a full-custom implementation, I

 While this project focused on MNIST inference as a test case, I plan to apply this approach to other applications in the future.

-# Addendum: FP1.3.0 Quantization
+# Addendum: Additional quantization schemes
+
+
+## FP1.3.0 Quantization

 <div align="center">
 <img src="first_layer_weights_fp130.png" width="60%">
@@ -550,6 +553,20 @@ While this project focused on MNIST inference as a test case, I plan to apply th

 TODO

+```
+1ee:  00170483  lb    s1,1(a4)
+1f2:  00035463  bgez  t1,1fa <processfclayer+0x4a>
+1f6:  409004b3  neg   s1,s1
+
+1fa:  01c35313  srli  t1,t1,0x1c
+1fe:  00737313  andi  t1,t1,7
+202:  006494b3  sll   s1,s1,t1
+
+206:  00879313  slli  t1,a5,0x8
+
+20a:  9626      add   a2,a2,s1
+```
+
 # References

 References and further reading:
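
The disassembly added to the FP1.3.0 addendum above shows the multiplier-less inner loop: load an activation byte, negate it when the weight's sign bit is set, shift it left by the 3-bit exponent, and accumulate. One step does roughly the following in Python terms (a sketch of the decode, assuming the exporter's nibble layout of sign in bit 3 and exponent in bits 0-2):

```
def fp130_mac_step(acc: int, activation: int, nibble: int) -> int:
    """One FP1.3.0 multiply-accumulate: weight = sign * 2**exponent,
    realised as a conditional negate plus a left shift (no multiplier)."""
    signed_act = -activation if nibble & 0x8 else activation   # bgez / neg
    exponent = nibble & 0x7                                     # andi t1,t1,7
    return acc + (signed_act << exponent)                       # sll + add

print(fp130_mac_step(0, 3, 0b1010))   # sign set, exp = 2 -> 0 + (-3 << 2) = -12
```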

exportquant.py

Lines changed: 10 additions & 3 deletions
@@ -8,6 +8,7 @@
 import matplotlib.pyplot as plt
 import argparse
 import yaml
+import seaborn as sns

 # Export quantized model from saved checkpoint
 # cpldcpu 2024-04-14
@@ -80,6 +81,9 @@ def export_to_hfile(quantized_model, filename, runname):
        elif quantization_type == '4bitsym':
            encoded_weights = ((weights < 0).astype(data_type) << 3) | (np.floor(np.abs(weights))).astype(data_type) # use bitwise operations to encode the weights
            QuantID = 4
+        elif quantization_type == '4bit':
+            encoded_weights = np.floor(weights).astype(int) & 15 # twos complement encoding
+            QuantID = 8 + 4
        elif quantization_type == 'FP130': # FP1.3.0 encoding (sign * 2^exp)
            encoded_weights = ((weights < 0).astype(data_type) << 3) | (np.floor(np.log2(np.abs(weights)))).astype(data_type)
            QuantID = 16 + 4
@@ -213,12 +217,14 @@ def plot_weight_histograms(quantized_model):

    for layer_index, layer in enumerate(quantized_model.quantized_model):
        layer_weights = np.array(layer['quantized_weights'])
+        bpw = layer['bpw']

        flattened_weights = layer_weights.flatten()

        ax = fig.add_subplot(len(quantized_model.quantized_model), 1, layer_index + 1)

-        ax.hist(flattened_weights, bins='auto')
+        # ax.hist(flattened_weights, width=1, bins='auto')
+        sns.histplot(flattened_weights, bins=2**bpw, ax=ax, kde=True)
        ax.set_title(f'Layer {layer_index+1} Weight Distribution')

    plt.tight_layout()
@@ -266,7 +272,8 @@ def plot_weight_histograms(quantized_model):
        network_width3=hyperparameters["network_width3"],
        QuantType=hyperparameters["QuantType"],
        NormType=hyperparameters["NormType"],
-       WScale=hyperparameters["WScale"]
+       WScale=hyperparameters["WScale"],
+       quantscale=hyperparameters["quantscale"]
    ).to(device)

    print('Loading model...')
@@ -292,7 +299,7 @@ def plot_weight_histograms(quantized_model):

    print('Quantizing model...')
    # Quantize the model
-   quantized_model = QuantizedModel(model)
+   quantized_model = QuantizedModel(model, quantscale=hyperparameters["quantscale"])

    # Print statistics
    print_stats(quantized_model)
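
The new '4bit' export branch stores each signed code in [-8, 7] as a raw two's-complement nibble (`& 15`), which is exactly what the `QuantID = 8 + 4` path in BitNetMCU_inference.c sign-extends back during inference. A quick numpy illustration of the round trip (example values chosen arbitrarily):

```
import numpy as np

weights = np.array([-8.0, -1.0, 0.0, 7.0])                # quantized codes from the '4bit' scheme
encoded = np.floor(weights).astype(int) & 15               # as in export_to_hfile: two's-complement nibbles
print(encoded)                                             # [ 8 15  0  7]

decoded = np.where(encoded >= 8, encoded - 16, encoded)    # what the C sign-extension recovers
print(decoded)                                             # [-8 -1  0  7]
```
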
Binary file not shown.

training.py

Lines changed: 2 additions & 1 deletion
@@ -189,7 +189,8 @@ def train_model(model, device, hyperparameters, train_data, test_data):
        network_width3=hyperparameters["network_width3"],
        QuantType=hyperparameters["QuantType"],
        NormType=hyperparameters["NormType"],
-       WScale=hyperparameters["WScale"]
+       WScale=hyperparameters["WScale"],
+       quantscale=hyperparameters["quantscale"]
    ).to(device)

    print('training...')

trainingparameters.yaml

Lines changed: 16 additions & 6 deletions
@@ -1,17 +1,27 @@
-num_epochs: 60
-QuantType: '4bitsym' # 'Ternary', 'Binary', 'BinaryBalanced', '2bitsym', '4bitsym', '8bit', 'None', 'FP130'
+# Quantization settings
+QuantType: '4bitsym' # 'Ternary', 'Binary', 'BinaryBalanced', '2bitsym', '4bit', '4bitsym', '8bit', 'None', 'FP130'
 BPW : 4
 NormType: 'RMS' # 'RMS', 'Lin', 'BatchNorm'
-WScale: 'PerTensor' # 'PerTensor', 'PerOutput', 'PerOutputLog2'
+WScale: 'PerTensor' # 'PerTensor', 'PerOutput'
+quantscale: 0.25 # How to scale the stddev of each tensor relative to the max value
+
+# Learning parameters
 batch_size: 128
+num_epochs: 60
 scheduler: "Cosine" # "StepLR", "Cosine"
 learning_rate: 0.001
 lr_decay: 0.1 # lr_decay and step size are not used with cosine scheduler
 step_size: 10
-network_width1: 64
-network_width2: 64
-network_width3: 64
+
+# Data augmentation
 augmentation: True
 rotation1: 10 # rotation1 and rotation2 are used for data augmentation
 rotation2: 10
+
+# Model parameters
+network_width1: 64
+network_width2: 64
+network_width3: 64
+
+# name
 runtag: "opt_" # runtag is prefix for runname
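
Putting the pieces together, the new `quantscale` key travels from the YAML file into both training and export. A sketch of that flow (the `yaml.safe_load` call and the standalone usage are assumptions; the actual argument handling in training.py and exportquant.py is only partially shown in this diff):

```
import yaml
from BitNetMCU import FCMNIST, QuantizedModel

with open("trainingparameters.yaml") as f:
    hyperparameters = yaml.safe_load(f)

model = FCMNIST(
    network_width1=hyperparameters["network_width1"],
    network_width2=hyperparameters["network_width2"],
    network_width3=hyperparameters["network_width3"],
    QuantType=hyperparameters["QuantType"],
    NormType=hyperparameters["NormType"],
    WScale=hyperparameters["WScale"],
    quantscale=hyperparameters["quantscale"],   # new in this commit
)
# QuantizedModel also receives quantscale, as in exportquant.py above
quantized = QuantizedModel(model, quantscale=hyperparameters["quantscale"])
```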
