@@ -31,6 +31,7 @@
 parser.add_argument("-ml", "--measurement_length", type = int, default = 2048, help = "Max no. tokens per sample when measuring")
 parser.add_argument("-so", "--status_output", action = "store_true", help = "Include machine-parseable status updates in console output")
 parser.add_argument("-hsol", "--hidden_state_offload_layers", type = int, default = 0, help = "Number of hidden/target states to keep in VRAM. Speed-up but increases VRAM usage")
+parser.add_argument("-fst", "--fast_safetensors", action = "store_true", help = "Use fast-safetensors to load layers of the unquantized model. This can help alleviate some out-of-memory issues, especially on Windows.")

 args = parser.parse_args()

@@ -112,6 +113,7 @@ def save_job():
        "rope_scale": args.rope_scale,
        "rope_alpha": args.rope_alpha,
        "output_measurement": output_measurement,
+       "fast_safetensors": args.fast_safetensors,
        "progress": "begin"}

 if args.measurement is not None:
@@ -160,6 +162,8 @@ def save_job():
 else:
     print(f" -- Measurement will be saved to {job['output_measurement']}")
     print(f" !! Conversion script will end after measurement pass")
+if job.get("fast_safetensors"):
+    print(f" -- Enabled fast_safetensors option.")

 if job['rope_scale']: print(f" -- RoPE scale: {job['rope_scale']:.2f}")
 if job['rope_alpha']: print(f" -- RoPE alpha: {job['rope_alpha']:.2f}")
@@ -190,6 +194,10 @@ def save_job():

 tokenizer = ExLlamaV2Tokenizer(config)

+# Set fast_safetensors in config
+
+if job.get("fast_safetensors"): config.fasttensors = True
+
 # Set scaling for input model

 if job["rope_scale"] is not None: config.scale_pos_emb = job["rope_scale"]
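Taken together, the change wires one new command-line flag (-fst / --fast_safetensors) through the job file and into the loader config. Note that the later hunks read the flag with job.get("fast_safetensors") rather than indexing directly, so job files saved before this change still resume cleanly. As a minimal sketch of the same effect when driving ExLlamaV2 directly (the model path below is a hypothetical placeholder; fasttensors is the config attribute this commit sets):

# Sketch: enable fast safetensors loading on an ExLlamaV2Config,
# mirroring what convert.py now does when --fast_safetensors is given.
from exllamav2 import ExLlamaV2Config

config = ExLlamaV2Config()
config.model_dir = "/path/to/unquantized-model"  # hypothetical path, not from this commit
config.prepare()                                 # read the model's config.json etc.
config.fasttensors = True                        # same attribute the diff sets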