
Commit e15d944

Add quantization methods.
1 parent 54b07e7 commit e15d944

File tree: 6 files changed, 468 insertions(+), 13 deletions(-)


.gitignore

Lines changed: 2 additions & 0 deletions
@@ -9,6 +9,8 @@ __pycache__/
 
 upcoming.md
 examples/
+*.ipynb
+
 
 logs
 main.py

quantllm/quant/awq.py

Lines changed: 42 additions & 7 deletions
@@ -77,8 +77,7 @@ def quantize(
         setattr(self.model, name, quantized)
 
         return self.model
-
-    def _collect_activation_stats(
+    def _collect_activation_stats(
         self,
         data: torch.Tensor,
         num_steps: int
@@ -94,23 +93,59 @@ def fn(module, input, output):
                 if name not in self.act_scales:
                     self.act_scales[name] = []
                 x = input[0].detach()
-                scale = torch.max(torch.abs(x))
-                self.act_scales[name].append(scale)
+                # Handle both 2D and 3D inputs
+                if len(x.shape) == 3:
+                    # For 3D input (batch_size, seq_len, hidden_size)
+                    scale = torch.max(torch.abs(x.view(-1, x.size(-1))))
+                else:
+                    scale = torch.max(torch.abs(x))
+                self.act_scales[name].append(scale.cpu())  # Move to CPU to save memory
             return fn
 
             handles.append(
                 module.register_forward_hook(hook_fn(name))
             )
 
-        # Run calibration
+        # Run calibration in smaller batches
         with torch.no_grad():
-            for _ in range(num_steps):
-                self.model(data)
+            batch_size = 2  # Small batch size to prevent OOM
+            for step in range(num_steps):
+                # Clear CUDA cache periodically
+                if step % 10 == 0:
+                    torch.cuda.empty_cache()
+
+                # Process a small batch
+                start_idx = (step * batch_size) % len(data)
+                end_idx = min(start_idx + batch_size, len(data))
+                batch = data[start_idx:end_idx]
+
+                # Move batch to appropriate device
+                device = next(self.model.parameters()).device
+                batch = batch.to(device)
+
+                self.model(batch)
+
+                # Move batch back to CPU to free GPU memory
+                batch = batch.cpu()
 
         # Remove hooks
         for handle in handles:
             handle.remove()
 
+        # Move model to CPU temporarily to free GPU memory
+        self.model = self.model.cpu()
+        torch.cuda.empty_cache()
+
+        # Process collected statistics on CPU
+        for name in self.act_scales:
+            scales = torch.stack(self.act_scales[name])
+            # Use 99.9th percentile for more robust statistics
+            self.act_scales[name] = torch.quantile(scales, 0.999)
+
+        # Move model back to GPU
+        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        self.model = self.model.to(device)
+
         # Process collected statistics
         for name in self.act_scales:
            scales = torch.stack(self.act_scales[name])

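The AWQ change above replaces a single max-abs activation scale with per-batch scales that are later reduced with a 99.9th-percentile. Below is a minimal, self-contained sketch of that hook-and-percentile pattern; the helper name collect_scales and its arguments are illustrative only, not part of quantllm's API.

# Illustrative sketch of the calibration pattern used in awq.py above.
import torch
import torch.nn as nn

def collect_scales(model: nn.Module, batches, percentile: float = 0.999):
    """Record a max-abs scale per Linear input on each batch, then reduce with a percentile."""
    raw = {}      # layer name -> list of per-batch max-abs scales (kept on CPU)
    handles = []

    def make_hook(name):
        def hook(module, inputs, output):
            x = inputs[0].detach()
            if x.dim() == 3:                      # (batch, seq_len, hidden)
                x = x.reshape(-1, x.size(-1))
            raw.setdefault(name, []).append(x.abs().max().cpu())
        return hook

    for name, module in model.named_modules():
        if isinstance(module, nn.Linear):
            handles.append(module.register_forward_hook(make_hook(name)))

    with torch.no_grad():
        for batch in batches:                     # small batches keep peak memory low
            model(batch)

    for h in handles:
        h.remove()

    # A high percentile is more robust to outlier activations than a plain max
    return {n: torch.quantile(torch.stack(s), percentile) for n, s in raw.items()}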
quantllm/quant/gguf.py

Lines changed: 2 additions & 3 deletions
@@ -135,8 +135,7 @@ def fn(module, input, output):
         }
 
         return stats
-
-    def _quantize_layer(
+    def _quantize_layer(
         self,
         layer: nn.Linear,
         stats: Optional[Dict[str, torch.Tensor]] = None
@@ -152,7 +151,7 @@ def _quantize_layer(
             config=QuantizationConfig(
                 bits=self.bits,
                 scheme="symmetric",
-                granularity="per-group" if self.group_size > 0 else "per-tensor",
+                granularity="per-channel" if self.group_size > 0 else "per-tensor",
                 calibration="minmax",
                 channel_wise=self.group_size > 0,
                 dtype=f"int{self.bits}",

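The GGUF change swaps the granularity label from "per-group" to "per-channel" when group_size > 0. The sketch below illustrates what the two granularities mean for a weight matrix under symmetric int-N quantization; symmetric_scales is a hypothetical helper, not the package's QuantizationConfig.

# Hedged sketch of per-channel vs. per-tensor symmetric scales.
import torch

def symmetric_scales(weight: torch.Tensor, bits: int = 4, per_channel: bool = True):
    """Return symmetric scales: one per output channel, or one for the whole tensor."""
    qmax = 2 ** (bits - 1) - 1                       # e.g. 7 for int4
    if per_channel:
        # one scale per row (output channel); shape (out_features, 1)
        max_abs = weight.abs().amax(dim=1, keepdim=True)
    else:
        # a single scale for the whole tensor
        max_abs = weight.abs().amax()
    return max_abs.clamp(min=1e-8) / qmax

w = torch.randn(16, 64)
print(symmetric_scales(w, per_channel=True).shape)   # torch.Size([16, 1])
print(symmetric_scales(w, per_channel=False).shape)  # torch.Size([])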
quantllm/quant/gptq.py

Lines changed: 9 additions & 3 deletions
@@ -68,8 +68,7 @@ def quantize(self, calibration_data: Optional[torch.Tensor] = None) -> PreTrainedModel:
         setattr(self.model, name, quantized)
 
         return self.model
-
-    def _compute_hessian(self, layer: nn.Linear, data: torch.Tensor) -> torch.Tensor:
+    def _compute_hessian(self, layer: nn.Linear, data: torch.Tensor) -> torch.Tensor:
         """Compute Hessian approximation for a layer."""
         device = next(layer.parameters()).device
 
@@ -79,6 +78,9 @@ def _compute_hessian(self, layer: nn.Linear, data: torch.Tensor) -> torch.Tensor:
 
         def hook_fn(module, input, output):
             x = input[0].detach()
+            # Reshape input if needed (batch_size * seq_len, hidden_size)
+            if len(x.shape) == 3:
+                x = x.view(-1, x.size(-1))
             with torch.no_grad():
                 # Accumulate x^T x for Hessian approximation
                 H.add_(torch.matmul(x.t(), x))
@@ -88,7 +90,11 @@ def hook_fn(module, input, output):
 
         # Run calibration data through model
        with torch.no_grad():
-            self.model(data)
+            # Process in smaller batches to save memory
+            batch_size = 4  # Adjust based on available memory
+            for i in range(0, len(data), batch_size):
+                batch = data[i:i+batch_size]
+                self.model(batch)
 
         # Remove hook
         handle.remove()

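The GPTQ change flattens 3D activations and feeds calibration data in small batches while a forward hook accumulates the Hessian approximation H ≈ Σ XᵀX. Below is a standalone sketch of that accumulation, under the assumption that the calibration tensor already matches the layer's input features; approx_hessian is an illustrative name, not quantllm's method.

# Illustrative sketch of the batched Hessian approximation for one Linear layer.
import torch
import torch.nn as nn

def approx_hessian(layer: nn.Linear, calib: torch.Tensor, batch_size: int = 4):
    """Accumulate H = sum_i X_i^T X_i over small calibration batches."""
    device = layer.weight.device
    H = torch.zeros(layer.in_features, layer.in_features, device=device)

    def hook(module, inputs, output):
        x = inputs[0].detach()
        if x.dim() == 3:                          # (batch, seq_len, hidden) -> (batch*seq_len, hidden)
            x = x.reshape(-1, x.size(-1))
        H.add_(x.t() @ x)                         # accumulate X^T X

    handle = layer.register_forward_hook(hook)
    with torch.no_grad():
        for i in range(0, len(calib), batch_size):    # small batches limit peak memory
            layer(calib[i:i + batch_size].to(device))
    handle.remove()
    return H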
quantllm/utils/benchmark.py

Lines changed: 205 additions & 0 deletions
@@ -0,0 +1,205 @@
+"""Benchmarking utilities for quantization methods."""
+
+import time
+import torch
+import pandas as pd
+from typing import Dict, List, Tuple
+from transformers import PreTrainedModel
+from quantllm.quant import (
+    GPTQQuantizer,
+    AWQQuantizer,
+    GGUFQuantizer
+)
+
+class QuantizationBenchmark:
+    """Benchmark different quantization methods."""
+
+    def __init__(
+        self,
+        model: PreTrainedModel,
+        calibration_data: torch.Tensor,
+        input_shape: Tuple[int, ...] = (1, 32),
+        num_inference_steps: int = 100,
+        device: str = "cuda" if torch.cuda.is_available() else "cpu"
+    ):
+        self.model = model
+        self.calibration_data = calibration_data
+        self.input_shape = input_shape
+        self.num_inference_steps = num_inference_steps
+        self.device = device
+        self.results = {}
+
+    def benchmark_quantizer(
+        self,
+        name: str,
+        quantizer_class,
+        quantizer_args: Dict
+    ) -> Dict[str, float]:
+        """Benchmark a specific quantizer."""
+        try:
+            # Initialize quantizer
+            quantizer = quantizer_class(model=self.model.clone(), **quantizer_args)
+
+            # Measure quantization time
+            start_time = time.time()
+            quantized_model = quantizer.quantize(calibration_data=self.calibration_data)
+            quant_time = time.time() - start_time
+
+            # Move to appropriate device
+            quantized_model = quantized_model.to(self.device)
+
+            # Generate test input
+            test_input = torch.randint(
+                0, 1000,
+                self.input_shape,
+                device=self.device
+            )
+
+            # Warmup
+            for _ in range(10):
+                with torch.no_grad():
+                    quantized_model(test_input)
+            torch.cuda.synchronize() if self.device == "cuda" else None
+
+            # Measure inference latency
+            latencies = []
+            for _ in range(self.num_inference_steps):
+                start = time.perf_counter()
+                with torch.no_grad():
+                    quantized_model(test_input)
+                torch.cuda.synchronize() if self.device == "cuda" else None
+                latencies.append((time.perf_counter() - start) * 1000)  # Convert to ms
+
+            latencies = torch.tensor(latencies)
+
+            # Calculate memory usage
+            if self.device == "cuda":
+                memory_allocated = torch.cuda.memory_allocated() / (1024 * 1024)  # MB
+                peak_memory = torch.cuda.max_memory_allocated() / (1024 * 1024)  # MB
+            else:
+                memory_allocated = 0
+                peak_memory = 0
+
+            # Calculate model size
+            model_size = sum(p.numel() * p.element_size() for p in quantized_model.parameters()) / (1024 * 1024)  # MB
+
+            results = {
+                "quantization_time": quant_time,
+                "mean_latency": latencies.mean().item(),
+                "p95_latency": torch.quantile(latencies, 0.95).item(),
+                "min_latency": latencies.min().item(),
+                "max_latency": latencies.max().item(),
+                "memory_allocated": memory_allocated,
+                "peak_memory": peak_memory,
+                "model_size": model_size
+            }
+
+            self.results[name] = results
+            return results
+
+        except Exception as e:
+            print(f"Error benchmarking {name}: {str(e)}")
+            return {}
+
+    def run_all_benchmarks(self) -> pd.DataFrame:
+        """Run benchmarks for all quantization methods."""
+        # Common config
+        config = {
+            "bits": 4,
+            "group_size": 128
+        }
+
+        # GPTQ
+        self.benchmark_quantizer(
+            "GPTQ",
+            GPTQQuantizer,
+            {**config, "actorder": True, "use_triton": False}
+        )
+
+        # AWQ
+        self.benchmark_quantizer(
+            "AWQ",
+            AWQQuantizer,
+            {**config, "zero_point": True}
+        )
+
+        # GGUF
+        self.benchmark_quantizer(
+            "GGUF",
+            GGUFQuantizer,
+            {**config, "use_packed": True}
+        )
+
+        # Convert results to DataFrame
+        df = pd.DataFrame.from_dict(self.results, orient='index')
+
+        # Add compression ratio
+        original_size = sum(p.numel() * p.element_size() for p in self.model.parameters()) / (1024 * 1024)
+        df['compression_ratio'] = original_size / df['model_size']
+
+        return df
+
+    def print_report(self):
+        """Print a formatted benchmark report."""
+        df = self.run_all_benchmarks()
+
+        print("\nQuantization Benchmark Results")
+        print("=" * 80)
+
+        # Format metrics
+        metrics = {
+            'quantization_time': ('Quantization Time (s)', '{:.2f}'),
+            'mean_latency': ('Mean Inference Latency (ms)', '{:.2f}'),
+            'p95_latency': ('P95 Inference Latency (ms)', '{:.2f}'),
+            'memory_allocated': ('Memory Used (MB)', '{:.1f}'),
+            'model_size': ('Model Size (MB)', '{:.1f}'),
+            'compression_ratio': ('Compression Ratio', '{:.1f}x')
+        }
+
+        for method in df.index:
+            print(f"\n{method}")
+            print("-" * 40)
+            for metric, (name, fmt) in metrics.items():
+                value = df.loc[method, metric]
+                print(f"{name:<30} {fmt.format(value)}")
+
+    def plot_comparison(self, save_path: str = None):
+        """Generate comparison plots."""
+        try:
+            import matplotlib.pyplot as plt
+        except ImportError:
+            print("matplotlib is required for plotting")
+            return
+
+        df = pd.DataFrame.from_dict(self.results, orient='index')
+
+        # Create subplots
+        fig, axes = plt.subplots(2, 2, figsize=(12, 10))
+        fig.suptitle('Quantization Method Comparison')
+
+        # Latency comparison
+        axes[0, 0].bar(df.index, df['mean_latency'])
+        axes[0, 0].set_title('Mean Inference Latency (ms)')
+        axes[0, 0].tick_params(axis='x', rotation=45)
+
+        # Memory usage
+        axes[0, 1].bar(df.index, df['memory_allocated'])
+        axes[0, 1].set_title('Memory Usage (MB)')
+        axes[0, 1].tick_params(axis='x', rotation=45)
+
+        # Model size
+        axes[1, 0].bar(df.index, df['model_size'])
+        axes[1, 0].set_title('Model Size (MB)')
+        axes[1, 0].tick_params(axis='x', rotation=45)
+
+        # Quantization time
+        axes[1, 1].bar(df.index, df['quantization_time'])
+        axes[1, 1].set_title('Quantization Time (s)')
+        axes[1, 1].tick_params(axis='x', rotation=45)
+
+        plt.tight_layout()
+
+        if save_path:
+            plt.savefig(save_path)
+        else:
+            plt.show()

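A possible way to drive the new benchmarking utility is sketched below, assuming a small causal LM and dummy calibration token ids; the checkpoint name and calibration tensor are placeholders, only the QuantizationBenchmark API comes from this commit.

# Hypothetical usage of quantllm.utils.benchmark.QuantizationBenchmark.
import torch
from transformers import AutoModelForCausalLM
from quantllm.utils.benchmark import QuantizationBenchmark

model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m")  # assumed small test model
calibration_data = torch.randint(0, 1000, (32, 32))                # dummy token ids

bench = QuantizationBenchmark(
    model=model,
    calibration_data=calibration_data,
    input_shape=(1, 32),
    num_inference_steps=50,
)
results = bench.run_all_benchmarks()   # pandas DataFrame, one row per method
bench.print_report()
bench.plot_comparison(save_path="quant_benchmark.png")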