#!/usr/bin/env python3
"""
TurboQuant definitive proof. Two separate subprocesses:
1. Baseline vLLM
2. TurboQuant + free_kv_cache
Hard numbers side by side.
"""
import os, sys, subprocess, json
MODEL = os.environ.get("MODEL", "Qwen/Qwen3.5-27B")
TP = int(os.environ.get("TP", "4"))
GPU_MEM = float(os.environ.get("GPU_MEM", "0.90"))
MAX_MODEL_LEN = int(os.environ.get("MAX_MODEL_LEN", "131072"))
GPUS = os.environ.get("CUDA_VISIBLE_DEVICES", "0,1,4,6")
PYTHON = sys.executable
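
# Example invocation (illustrative values; adjust model, devices, and limits for your setup):
#   MODEL=Qwen/Qwen3.5-27B TP=4 GPU_MEM=0.90 MAX_MODEL_LEN=131072 \
#   CUDA_VISIBLE_DEVICES=0,1,4,6 python proof.py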
def run_phase(name, script):
    path = f"/tmp/tq_{name}.py"
    with open(path, "w") as f:
        f.write(script)
    env = os.environ.copy()
    env["CUDA_VISIBLE_DEVICES"] = GPUS
    env["VLLM_ENABLE_V1_MULTIPROCESSING"] = "0"
    env["TOKENIZERS_PARALLELISM"] = "false"
    r = subprocess.run([PYTHON, path], capture_output=True, text=True, env=env, timeout=600)
    if r.returncode != 0:
        print(f"=== {name} FAILED ===")
        # Find the actual error
        for line in r.stderr.split("\n"):
            if "Error" in line or "error" in line:
                print(f" {line.strip()}")
        return None
    for line in reversed(r.stdout.strip().split("\n")):
        try:
            return json.loads(line)
        except json.JSONDecodeError:
            continue
    return None
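
# Each phase script below reports its measurements by printing a single JSON object
# as its last stdout line; run_phase() scans stdout bottom-up for that line.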
BASELINE = f'''
import os, json, subprocess
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["VLLM_ENABLE_V1_MULTIPROCESSING"] = "0"

def main():
    import sys
    from vllm import LLM, SamplingParams
    llm = LLM(
        model="{MODEL}", dtype="bfloat16",
        gpu_memory_utilization={GPU_MEM},
        max_model_len={MAX_MODEL_LEN},
        tensor_parallel_size={TP},
        trust_remote_code=True, max_num_seqs=1,
    )
    blocks = llm.llm_engine.vllm_config.cache_config.num_gpu_blocks
    r = subprocess.run(["nvidia-smi","--query-gpu=index,memory.used","--format=csv,noheader,nounits"],
                       capture_output=True, text=True)
    vram = [int(l.split(",")[1].strip()) for l in r.stdout.strip().split("\\n") if l.strip()]
    out = llm.generate(["Explain KV cache compression in LLM inference."],
                       SamplingParams(temperature=0, max_tokens=64))
    r2 = subprocess.run(["nvidia-smi","--query-gpu=index,memory.used","--format=csv,noheader,nounits"],
                        capture_output=True, text=True)
    vram2 = [int(l.split(",")[1].strip()) for l in r2.stdout.strip().split("\\n") if l.strip()]
    print(json.dumps({{"blocks": blocks, "vram_load": vram, "vram_gen": vram2,
                      "text": out[0].outputs[0].text[:100]}}))

if __name__ == "__main__":
    main()
'''
TQ = f'''
import os, json, subprocess
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["VLLM_ENABLE_V1_MULTIPROCESSING"] = "0"

def main():
    import sys
    from vllm import LLM, SamplingParams
    llm = LLM(
        model="{MODEL}", dtype="bfloat16",
        gpu_memory_utilization={GPU_MEM},
        max_model_len={MAX_MODEL_LEN},
        tensor_parallel_size={TP},
        trust_remote_code=True, max_num_seqs=1,
    )
    blocks = llm.llm_engine.vllm_config.cache_config.num_gpu_blocks
    engine = llm.llm_engine
    core = getattr(engine, "engine_core", engine)
    inner = getattr(core, "engine_core", core)
    executor = inner.model_executor

    def _install(worker):
        from turboquant.vllm_attn_backend import install_turboquant_hooks, MODE_ACTIVE
        return len(install_turboquant_hooks(worker.model_runner, key_bits=3, value_bits=2,
                                            buffer_size=128, mode=MODE_ACTIVE))

    hooks = executor.collective_rpc(_install)
    out = llm.generate(["Explain KV cache compression in LLM inference."],
                       SamplingParams(temperature=0, max_tokens=64))
    r = subprocess.run(["nvidia-smi","--query-gpu=index,memory.used","--format=csv,noheader,nounits"],
                       capture_output=True, text=True)
    vram_gen = [int(l.split(",")[1].strip()) for l in r.stdout.strip().split("\\n") if l.strip()]

    def _free(worker):
        from turboquant.vllm_attn_backend import free_kv_cache
        return free_kv_cache(worker.model_runner)

    freed = executor.collective_rpc(_free)
    r2 = subprocess.run(["nvidia-smi","--query-gpu=index,memory.used","--format=csv,noheader,nounits"],
                        capture_output=True, text=True)
    vram_freed = [int(l.split(",")[1].strip()) for l in r2.stdout.strip().split("\\n") if l.strip()]
    print(json.dumps({{"blocks": blocks, "hooks": hooks[0], "vram_gen": vram_gen,
                      "vram_freed": vram_freed, "freed_bytes": freed,
                      "text": out[0].outputs[0].text[:100]}}))

if __name__ == "__main__":
    main()
'''
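
# Note: executor.collective_rpc() fans _install/_free out to each tensor-parallel worker,
# so "hooks" and "freed_bytes" come back as per-worker lists (one entry per GPU); the TQ
# script reports hooks[0] and the driver below sums freed_bytes across GPUs.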
def main():
    print(f"Model: {MODEL}")
    print(f"TP={TP}, GPU_MEM={GPU_MEM}, MAX_MODEL_LEN={MAX_MODEL_LEN}")
    print(f"GPUs: {GPUS}")
    print()

    print(">>> Phase 1: Baseline ...", flush=True)
    bl = run_phase("baseline", BASELINE)
    if not bl:
        return

    print(">>> Phase 2: TurboQuant ...", flush=True)
    tq = run_phase("tq", TQ)
    if not tq:
        return

    n = len(GPUS.split(","))
    bl_v = bl["vram_gen"][:n]
    tq_v = tq["vram_gen"][:n]
    tq_f = tq["vram_freed"][:n]
    freed_total = sum(tq["freed_bytes"])
    freed_per = tq["freed_bytes"][0]
    block_size = 784  # Qwen3.5-27B: attention block aligned to mamba
    bl_tokens = bl["blocks"] * block_size
    # Extra capacity from freed KV cache
    # full_attn: 16 layers, kv_heads=1/gpu, head_dim=256, bf16=2, K+V=2
    bytes_per_block_full = 2 * 1 * 256 * 2 * block_size * tq["hooks"]
    extra_blocks = int(freed_per / max(bytes_per_block_full, 1))
    new_tokens = bl_tokens + extra_blocks * block_size
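    # Worked example (assuming tq["hooks"] == 16 full-attention layers, per the comment
    # above): 2 (K+V) * 1 kv_head * 256 head_dim * 2 B (bf16) * 784 tokens * 16 layers
    # ≈ 12.8 MB of bf16 KV per block, i.e. roughly one extra block per ~13 MB freed per GPU.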
    print()
    print("=" * 70)
    print(f" MODEL: {MODEL}")
    print(f" TP={TP}, max_model_len={MAX_MODEL_LEN}, gpu_mem={GPU_MEM}")
    print()
    print(f" BASELINE (vanilla vLLM)")
    print(f" KV cache blocks: {bl['blocks']}")
    print(f" Max tokens: {bl_tokens:,}")
    print(f" VRAM/GPU after gen: {bl_v} MB")
    print()
    print(f" TURBOQUANT (3-bit key, 2-bit value, {tq['hooks']} full_attn layers)")
    print(f" KV cache blocks: {tq['blocks']} (same initial alloc)")
    print(f" VRAM/GPU after gen: {tq_v} MB")
    print(f" VRAM/GPU after free: {tq_f} MB")
    print(f" Tensor freed/GPU: {freed_per/1e6:.0f} MB")
    print(f" Total tensor freed: {freed_total/1e6:.0f} MB ({freed_total/1e9:.1f} GB)")
    print()
    print(f" RESULT")
    print(f" KV VRAM saved/GPU: {freed_per/1e6:.0f} MB")
    print(f" Extra blocks possible: {extra_blocks}")
    print(f" Baseline capacity: {bl_tokens:,} tokens")
    print(f" With TQ capacity: {new_tokens:,} tokens")
    print(f" Improvement: {new_tokens/bl_tokens:.2f}x context length")
    print()
    print(f" OUTPUT COMPARISON")
    print(f" Baseline: {bl['text']}")
    print(f" TQ: {tq['text']}")
    print("=" * 70)
if __name__ == "__main__":
    main()