Skip to content

Commit 69bb9d6

Browse files
committed
Add optional noise embeddings during quantization
1 parent 5857ea9 commit 69bb9d6

File tree

5 files changed

+34
-8
lines changed

5 files changed

+34
-8
lines changed

exllamav2/architecture.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -241,6 +241,9 @@ class Params:
241241
# Tensors are transposed in original model weights
242242
self.orig_weights_transposed = False
243243

244+
# Add noise rows to calibration while quantizing
245+
self.standard_calib_noise = None
246+
244247
# Mistral
245248

246249
if arch_string == "MistralForCausalLM":

exllamav2/conversion/convert_exl2.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -240,7 +240,8 @@ def save_job():
240240
else:
241241

242242
print(f" -- Tokenizing samples (measurement)...")
243-
tokenize(job, save_job, tokenizer, measure = True)
243+
noise_rows = config.arch.standard_calib_noise
244+
tokenize(job, save_job, tokenizer, measure = True, noise_rows = noise_rows)
244245
job["progress"] = "initial_embeddings"
245246
save_job()
246247

@@ -285,7 +286,8 @@ def save_job():
285286
if progress == "tokens_cal":
286287

287288
print(f" -- Tokenizing samples...")
288-
tokenize(job, save_job, tokenizer)
289+
noise_rows = config.arch.standard_calib_noise
290+
tokenize(job, save_job, tokenizer, noise_rows = noise_rows)
289291
job["progress"] = "embeddings"
290292
save_job()
291293

exllamav2/conversion/measure.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -80,7 +80,7 @@ def embeddings(job, save_fn, model, measure = False):
8080

8181
module.load()
8282
input_ids[input_ids >= module.native_vocab_size] = 0
83-
hidden_state = module.forward(input_ids)
83+
hidden_state = module.forward(input_ids, negative_ids_noise = True)
8484
module.unload()
8585

8686
embeddings_dict = { f"row.{i:05}": hidden_state[i:i+1, :, :].contiguous() for i in range(hidden_state.shape[0]) }

exllamav2/conversion/tokenize.py

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ def get_tokens(num_rows, length, filename, tokenizer):
3636
return all_tokens
3737

3838

39-
def tokenize(job, save_fn, tokenizer, measure = False):
39+
def tokenize(job, save_fn, tokenizer, measure = False, noise_rows = None):
4040

4141
print_stage(job, "Tokenizing (1)" if measure else "Tokenizing (2)", 0, 1)
4242

@@ -47,7 +47,7 @@ def tokenize(job, save_fn, tokenizer, measure = False):
4747
length = job["measurement_length"] if measure else job["length"]
4848
cal_tokens = get_tokens(rows, length, cal_ds, tokenizer)
4949
else:
50-
cal_tokens = get_standard_calibration(job, measure, tokenizer)
50+
cal_tokens = get_standard_calibration(job, measure, tokenizer, noise_rows)
5151
if measure:
5252
job["measurement_rows"] = cal_tokens.shape[0]
5353
else:
@@ -61,7 +61,7 @@ def tokenize(job, save_fn, tokenizer, measure = False):
6161
print_stage(job, "Tokenizing (1)" if measure else "Tokenizing (2)", 1, 1)
6262

6363

64-
def get_standard_calibration(job, measure, tokenizer):
64+
def get_standard_calibration(job, measure, tokenizer, noise_rows = None):
6565

6666
data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "standard_cal_data")
6767
file_c4 =os.path.join(data_dir, "c4.utf8")
@@ -80,6 +80,10 @@ def get_standard_calibration(job, measure, tokenizer):
8080
rows_multilingual_s = 1 if measure else 5
8181
rows_technical = 2 if measure else 10
8282
rows_random = 2
83+
if noise_rows is not None:
84+
rows_noise = noise_rows[0] if measure else noise_rows[1]
85+
else:
86+
rows_noise = 0
8387

8488
ctx = min(2048, job["measurement_length"] if measure else job["length"])
8589

@@ -189,6 +193,11 @@ def get_standard_calibration(job, measure, tokenizer):
189193
for i in range(rows_technical):
190194
rows.append(tokenized_rows[i:i+1])
191195

196+
# Noise: rows_noise rows (0 unless the architecture defines standard_calib_noise)
197+
198+
for i in range(rows_noise):
199+
rows.append(torch.neg(torch.ones_like(rows[-1])))
200+
192201
# for idx, r in enumerate(rows):
193202
# print("------------------------------------------------------------------------------")
194203
# print(idx)

exllamav2/embedding.py

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -109,9 +109,10 @@ def forward(
109109

110110
cfg = self.model.config
111111

112-
# If input IDs contain negative values, assume they are padding tokens from a model with not pad_token_id
113-
# defined
112+
# If input IDs contain negative values, assume they are padding tokens from a model with no pad_token_id
113+
# defined or noise values for quantizing
114114

115+
input_ids = hidden_states
115116
hidden_states = hidden_states.clamp(min = 0)
116117

117118
# Apply indexed embeddings
@@ -185,6 +186,17 @@ def forward(
185186
if self.archparams.normalize_embeddings:
186187
hidden_states *= cfg.hidden_size ** 0.5
187188

189+
# Negative tokens during quantization are noise tokens
190+
191+
if kwargs.get("negative_ids_noise"):
192+
mask = (input_ids < 0).unsqueeze(-1)
193+
unmasked_values = hidden_states[~mask.expand_as(hidden_states)].float()
194+
mean, std = unmasked_values.mean(), unmasked_values.std()
195+
noise = torch.randn_like(hidden_states, dtype = torch.float)
196+
noise = noise * std + mean
197+
noise = noise.half()
198+
hidden_states = torch.where(mask, noise, hidden_states)
199+
188200
# Move to pinned temp buffer for TP
189201

190202
if self.is_tp:

0 commit comments

Comments (0)