@@ -31,6 +31,7 @@
 parser.add_argument("-ml", "--measurement_length", type = int, default = 2048, help = "Max no. tokens per sample when measuring")
 parser.add_argument("-so", "--status_output", action = "store_true", help = "Include machine-parseable status updates in console output")
 parser.add_argument("-hsol", "--hidden_state_offload_layers", type = int, default = 0, help = "Number of hidden/target states to keep in VRAM. Speed-up but increases VRAM usage")
+parser.add_argument("-fst", "--fast_safetensors", action = "store_true", help = "Use fast-safetensors to load layers of the unquantized model. This can help alleviate some out-of-memory issues, especially on Windows.")

 args = parser.parse_args()

@@ -112,6 +113,7 @@ def save_job():
        "rope_scale": args.rope_scale,
        "rope_alpha": args.rope_alpha,
        "output_measurement": output_measurement,
+       "fast_safetensors": args.fast_safetensors,
        "progress": "begin"}

 if args.measurement is not None:
@@ -160,6 +162,8 @@ def save_job():
 else:
     print(f" -- Measurement will be saved to {job['output_measurement']}")
     print(f" !! Conversion script will end after measurement pass")
+if job.get("fast_safetensors"):
+    print(f" -- Enabled fast_safetensors option.")

 if job['rope_scale']: print(f" -- RoPE scale: {job['rope_scale']:.2f}")
 if job['rope_alpha']: print(f" -- RoPE alpha: {job['rope_alpha']:.2f}")
@@ -190,6 +194,10 @@ def save_job():

 tokenizer = ExLlamaV2Tokenizer(config)

+# Set fast_safetensors in config
+
+if job.get("fast_safetensors"): config.fasttensors = True
+
 # Set scaling for input model

 if job["rope_scale"] is not None: config.scale_pos_emb = job["rope_scale"]
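Taken together, the change wires one new command-line flag (-fst / --fast_safetensors) through the job file and into the loader config. Note that the later hunks read the flag with job.get("fast_safetensors") rather than indexing directly, so job files saved before this change still resume cleanly. As a minimal sketch of the same effect when driving ExLlamaV2 directly (the model path below is a hypothetical placeholder; fasttensors is the config attribute this commit sets):

# Sketch: enable fast safetensors loading on an ExLlamaV2Config,
# mirroring what convert.py now does when --fast_safetensors is given.
from exllamav2 import ExLlamaV2Config

config = ExLlamaV2Config()
config.model_dir = "/path/to/unquantized-model"  # hypothetical path, not from this commit
config.prepare()                                 # read the model's config.json etc.
config.fasttensors = True                        # same attribute the diff sets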