|
20 | 20 |
|
21 | 21 | # Select number of samples. 512 samples is a good place to start. |
22 | 22 | # Increasing the number of samples can improve accuracy. |
23 | | -NUM_CALIBRATION_SAMPLES = 1 |
| 23 | +NUM_CALIBRATION_SAMPLES = 512 |
24 | 24 | MAX_SEQUENCE_LENGTH = 2048 |
25 | 25 |
|
26 | 26 | # Load dataset and preprocess. |
@@ -66,15 +66,15 @@ def tokenize(sample): |
66 | 66 | num_calibration_samples=NUM_CALIBRATION_SAMPLES, |
67 | 67 | ) |
68 | 68 |
|
69 | | -# Confirm generations of the quantized model look sane. |
70 | | -print("\n\n") |
71 | | -print("========== SAMPLE GENERATION ==============") |
72 | | -input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to( |
73 | | - model.device |
74 | | -) |
75 | | -output = model.generate(input_ids, max_new_tokens=100) |
76 | | -print(tokenizer.decode(output[0])) |
77 | | -print("==========================================\n\n") |
| 69 | +# # Confirm generations of the quantized model look sane. |
| 70 | +# print("\n\n") |
| 71 | +# print("========== SAMPLE GENERATION ==============") |
| 72 | +# input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to( |
| 73 | +# model.device |
| 74 | +# ) |
| 75 | +# output = model.generate(input_ids, max_new_tokens=100) |
| 76 | +# print(tokenizer.decode(output[0])) |
| 77 | +# print("==========================================\n\n") |
78 | 78 |
|
79 | 79 | # Save to disk compressed. |
80 | 80 | SAVE_DIR = MODEL_ID.split("/")[1] + "-W4A16-G128" |
|
0 commit comments