Skip to content

Commit e6c2225

Browse files
committed
Use torch.inference_mode() for lower memory usage during calibration (#20)
1 parent 0eac983 commit e6c2225

File tree

1 file changed

+14
-0
lines changed

1 file changed

+14
-0
lines changed

auto_fp8/quantize.py

Lines changed: 14 additions & 0 deletions

NOTE (review): the 14 added lines include unresolved Git merge conflict markers (`<<<<<<< HEAD`, `=======`, `>>>>>>> …`), so this commit leaves auto_fp8/quantize.py syntactically invalid Python. The conflict between the `torch.inference_mode()` change and the kv-cache-calibration change must be resolved (keep a single `with torch.inference_mode():` wrapping the tqdm calibration loop) before this file will import.
Original file line number | Diff line number | Diff line change
@@ -407,6 +407,7 @@ def quantize_activations(
407407
cleanup_memory()
408408

409409
# Pass through calibration data to measure activation scales
410+
<<<<<<< HEAD
410411
<<<<<<< HEAD
411412
with torch.inference_mode():
412413
with tqdm.tqdm(total=calibration_tokens.shape[0], desc="Calibrating activation scales") as pbar:
@@ -415,14 +416,27 @@ def quantize_activations(
415416
cleanup_memory()
416417
pbar.update(1)
417418
=======
419+
=======
420+
>>>>>>> 57c31bb (Use `torch.inference_mode()` for lower memory usage during calibration (#20))
418421
with tqdm.tqdm(
419422
total=calibration_tokens.shape[0], desc="Calibrating activation scales"
420423
) as pbar:
421424
for row_idx in range(calibration_tokens.shape[0]):
422425
model(calibration_tokens[row_idx].reshape(1, -1))
423426
cleanup_memory()
424427
pbar.update(1)
428+
<<<<<<< HEAD
425429
>>>>>>> 3ee9283 (Support calibrating kv cache scales)
430+
=======
431+
=======
432+
with torch.inference_mode():
433+
with tqdm.tqdm(total=calibration_tokens.shape[0], desc="Calibrating activation scales") as pbar:
434+
for row_idx in range(calibration_tokens.shape[0]):
435+
model(calibration_tokens[row_idx].reshape(1, -1))
436+
cleanup_memory()
437+
pbar.update(1)
438+
>>>>>>> b1c6ad6 (Use `torch.inference_mode()` for lower memory usage during calibration (#20))
439+
>>>>>>> 57c31bb (Use `torch.inference_mode()` for lower memory usage during calibration (#20))
426440

427441
# Replace dynamic quantizer observer with StaticLinear for export
428442
for name, quantizer in model.named_modules():

0 commit comments

Comments (0)