Commit a30af69 (1 parent: e3b4e46)

fix vram leak in calibration

Signed-off-by: AnyISalIn <[email protected]>

1 file changed: quantize.py (+7 additions, -3 deletions)
```diff
@@ -6,6 +6,7 @@
 import torch
 import torch.functional as F
 import transformers
+import tqdm
 from datasets import load_dataset
 from transformers import AutoModelForCausalLM, AutoTokenizer

@@ -162,7 +163,7 @@ def forward(self, x):
 def replace_module(model, name, new_module):
     if "." in name:
         parent_name = name.rsplit(".", 1)[0]
-        child_name = name[len(parent_name) + 1 :]
+        child_name = name[len(parent_name) + 1:]
         parent = model.model.get_submodule(parent_name)
     else:
         parent_name = ""

@@ -197,8 +198,11 @@ def quantize_activations(model, calibration_tokens):
     cleanup_memory()

     # Calibration.
-    for row_idx in range(calibration_tokens.shape[0]):
-        _ = model(calibration_tokens[row_idx].reshape(1, -1))
+    with tqdm.tqdm(total=calibration_tokens.shape[0], desc="Calibrating") as pbar:
+        for row_idx in range(calibration_tokens.shape[0]):
+            model(calibration_tokens[row_idx].reshape(1, -1))
+            torch.cuda.empty_cache()
+            pbar.update(1)

     # Replace quantizer with StaticLayer.
     for name, quantizer in model.model.named_modules():
```
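
The change in `replace_module` is cosmetic (it drops the space before the slice colon); the substantive fix is in the calibration loop, which now calls `torch.cuda.empty_cache()` after every forward pass and reports progress through `tqdm`. Below is a minimal standalone sketch of that pattern, assuming a hypothetical `calibrate` helper, a causal LM `model`, and a 2-D `calibration_tokens` tensor of token IDs; the `torch.no_grad()` wrapper is not part of this commit and is included only on the assumption that calibration does not need gradients.

```python
import torch
import tqdm


def calibrate(model, calibration_tokens):
    """Feed calibration rows through the model one at a time,
    freeing cached VRAM after each forward pass."""
    with tqdm.tqdm(total=calibration_tokens.shape[0], desc="Calibrating") as pbar:
        for row_idx in range(calibration_tokens.shape[0]):
            # Assumption (not in the commit): calibration only observes
            # activations, so autograd can be disabled.
            with torch.no_grad():
                model(calibration_tokens[row_idx].reshape(1, -1))
            # Return unreferenced cached blocks to the allocator so peak
            # usage tracks one forward pass instead of accumulating.
            torch.cuda.empty_cache()
            pbar.update(1)
```

Note that `empty_cache()` only releases memory that no tensor still references, which is likely why the commit also drops the `_ =` binding: keeping the model output bound to a name holds it (and any state it references) alive across iterations, while discarding it lets PyTorch free that memory before the cache is emptied.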
