
Commit 1ebadc5 (1 parent: 091eb36)

add streaming support for oai tools (+2 squashed commits)
Squashed commit: [4d080b37] qwen2.5vl surgery script [4bebe7e5] add streaming support for oai tools

File tree: 2 files changed, 130 additions and 67 deletions


examples/llava/qwen2_vl_surgery.py (104 additions, 64 deletions)
@@ -5,10 +5,12 @@
 import numpy as np
 from gguf import *
 from transformers import (
+    AutoProcessor,
     Qwen2VLForConditionalGeneration,
+    Qwen2_5_VLForConditionalGeneration,
     Qwen2VLProcessor,
-    AutoProcessor,
-    Qwen2VLConfig
+    Qwen2VLConfig,
+    Qwen2_5_VLConfig,
 )
 
 
@@ -18,62 +20,80 @@
 def k(raw_key: str, arch: str) -> str:
     return raw_key.format(arch=arch)
 
+class VL2:
+
+    @staticmethod
+    def to_gguf_name(name: str) -> str:
+        og = name
+        name = name.replace("text_model", "t").replace("vision_model", "v")
+        name = name.replace("blocks", "blk").replace("embeddings.", "")
+        name = name.replace("attn.", "attn_")
+        name = name.replace("mlp.fc1", "ffn_down").replace("mlp.fc2", "ffn_up").replace("proj.", "out.")
+        # name = name.replace("layrnorm", "ln").replace("layer_norm", "ln").replace("layernorm", "ln")
+        name = name.replace("norm1", "ln1").replace("norm2", "ln2")
+        name = name.replace("merger.mlp", 'mm')
+        print(f"[to_gguf_name] {og} --> {name}")
+        return name
+
+    @classmethod
+    def find_vision_tensors(cls, qwen2vl, dtype) -> Dict[str, np.ndarray]:
+        vision_model = qwen2vl.visual
+        tensor_map = {}
+        for name, ten in vision_model.state_dict().items():
+            ten = ten.numpy()
+            if 'qkv' in name:
+                if ten.ndim == 2:  # weight
+                    c3, _ = ten.shape
+                else:  # bias
+                    c3 = ten.shape[0]
+                assert c3 % 3 == 0
+                c = c3 // 3
+                wq = ten[:c]
+                wk = ten[c: c * 2]
+                wv = ten[c * 2:]
+                tensor_map[cls.to_gguf_name(f"vision_model.{name}").replace("qkv", "q")] = wq
+                tensor_map[cls.to_gguf_name(f"vision_model.{name}").replace("qkv", "k")] = wk
+                tensor_map[cls.to_gguf_name(f"vision_model.{name}").replace("qkv", "v")] = wv
+            elif 'merger' in name:
+                if name.endswith("ln_q.weight"):
+                    tensor_map['v.post_ln.weight'] = ten
+                elif name.endswith("ln_q.bias"):
+                    tensor_map['v.post_ln.bias'] = ten
+                else:
+                    # "merger.mlp.%d.weight/bias" --> "mm.%d.weight/bias"
+                    tensor_map[cls.to_gguf_name(name)] = ten
+            elif 'patch_embed.proj.weight' in name:
+                # NOTE: split Conv3D into Conv2Ds
+                c1, c2, kt, kh, kw = ten.shape
+                assert kt == 2, "Current implementation only supports temporal_patch_size of 2"
+                tensor_map["v.patch_embd.weight"] = ten[:, :, 0, ...]
+                tensor_map["v.patch_embd.weight.1"] = ten[:, :, 1, ...]
+            else:
+                tensor_map[cls.to_gguf_name(f"vision_model.{name}")] = ten
 
-def to_gguf_name(name: str) -> str:
-    og = name
-    name = name.replace("text_model", "t").replace("vision_model", "v")
-    name = name.replace("blocks", "blk").replace("embeddings.", "")
-    name = name.replace("attn.", "attn_")
-    name = name.replace("mlp.fc1", "ffn_down").replace("mlp.fc2", "ffn_up").replace("proj.", "out.")
-    # name = name.replace("layrnorm", "ln").replace("layer_norm", "ln").replace("layernorm", "ln")
-    name = name.replace("norm1", "ln1").replace("norm2", "ln2")
-    name = name.replace("merger.mlp", 'mm')
-    print(f"[to_gguf_name] {og} --> {name}")
-    return name
-
-
-def find_vision_tensors(qwen2vl, dtype) -> Dict[str, np.ndarray]:
-    vision_model = qwen2vl.visual
-    tensor_map = {}
-    for name, ten in vision_model.state_dict().items():
-        ten = ten.numpy()
-        if 'qkv' in name:
-            if ten.ndim == 2:  # weight
-                c3, _ = ten.shape
-            else:  # bias
-                c3 = ten.shape[0]
-            assert c3 % 3 == 0
-            c = c3 // 3
-            wq = ten[:c]
-            wk = ten[c: c * 2]
-            wv = ten[c * 2:]
-            tensor_map[to_gguf_name(f"vision_model.{name}").replace("qkv", "q")] = wq
-            tensor_map[to_gguf_name(f"vision_model.{name}").replace("qkv", "k")] = wk
-            tensor_map[to_gguf_name(f"vision_model.{name}").replace("qkv", "v")] = wv
-        elif 'merger' in name:
-            if name.endswith("ln_q.weight"):
-                tensor_map['v.post_ln.weight'] = ten
-            elif name.endswith("ln_q.bias"):
-                tensor_map['v.post_ln.bias'] = ten
+        for new_name, ten in tensor_map.items():
+            if ten.ndim <= 1 or new_name.endswith("_norm.weight"):
+                tensor_map[new_name] = ten.astype(np.float32)
             else:
-                # "merger.mlp.%d.weight/bias" --> "mm.%d.weight/bias"
-                tensor_map[to_gguf_name(name)] = ten
-        elif 'patch_embed.proj.weight' in name:
-            # NOTE: split Conv3D into Conv2Ds
-            c1, c2, kt, kh, kw = ten.shape
-            assert kt == 2, "Current implmentation only support temporal_patch_size of 2"
-            tensor_map["v.patch_embd.weight"] = ten[:, :, 0, ...]
-            tensor_map["v.patch_embd.weight.1"] = ten[:, :, 1, ...]
-        else:
-            tensor_map[to_gguf_name(f"vision_model.{name}")] = ten
-
-    for new_name, ten in tensor_map.items():
-        if ten.ndim <= 1 or new_name.endswith("_norm.weight"):
-            tensor_map[new_name] = ten.astype(np.float32)
-        else:
-            tensor_map[new_name] = ten.astype(dtype)
-    tensor_map["v.position_embd.weight"] = np.zeros([10, 10], dtype=np.float32)  # dummy tensor, just here as a placeholder
-    return tensor_map
+                tensor_map[new_name] = ten.astype(dtype)
+        tensor_map["v.position_embd.weight"] = np.zeros([10, 10], dtype=np.float32)  # dummy tensor, just here as a placeholder
+        return tensor_map
+
+
+class VL25(VL2):
+
+    @staticmethod
+    def to_gguf_name(name: str) -> str:
+        og = name
+        name = name.replace("text_model", "t").replace("vision_model", "v")
+        name = name.replace("blocks", "blk").replace("embeddings.", "")
+        name = name.replace("attn.", "attn_")
+        name = name.replace("mlp.down_proj", "ffn_down").replace("mlp.up_proj", "ffn_up")
+        name = name.replace("mlp.gate_proj", "ffn_gate").replace("proj.", "out.")
+        name = name.replace("norm1", "ln1").replace("norm2", "ln2")
+        name = name.replace("merger.mlp", 'mm')
+        print(f"[vl25][to_gguf_name] {og} --> {name}")
+        return name
 
 
 def main(args):
@@ -92,11 +112,18 @@ def main(args):
     model_path = ""
     model_name = args.model_name
    print("model_name: ", model_name)
-    qwen2vl = Qwen2VLForConditionalGeneration.from_pretrained(
-        model_name, torch_dtype=dtype, device_map="cpu"
-    )
-    cfg: Qwen2VLConfig = qwen2vl.config  # type: ignore[reportAssignmentType]
-    vcfg = cfg.vision_config
+    if args.model_type == "qwen2vl":
+        qwen2vl = Qwen2VLForConditionalGeneration.from_pretrained(
+            model_name, torch_dtype=dtype, device_map="cpu"
+        )
+        cfg: Qwen2VLConfig = qwen2vl.config  # type: ignore[reportAssignmentType]
+        vcfg = cfg.vision_config
+    else:
+        qwen2vl = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+            model_name, torch_dtype=dtype, device_map="cpu"
+        )
+        cfg: Qwen2_5_VLConfig = qwen2vl.config  # type: ignore[reportAssignmentType]
+        vcfg = cfg.vision_config
 
     if os.path.isdir(model_name):
         local_model = True
@@ -125,14 +152,26 @@ def main(args):
     else:
         raise ValueError()
 
-    tensor_map = find_vision_tensors(qwen2vl, np_dtype)
+    if args.model_type == "qwen2.5vl":
+        fout.add_bool("clip.use_glu_mlp", True)  # gated linear unit MLP layer in vision model
+        fout.add_bool("clip.use_rms_norm", True)
+        fout.add_array("clip.vision.fullatt_block_indexes", vcfg.fullatt_block_indexes)
+        fout.add_uint32("clip.vision.window_size", vcfg.window_size)
+        fout.add_uint32(k(KEY_EMBEDDING_LENGTH, VISION), vcfg.hidden_size)
+        fout.add_uint32("clip.vision.projection_dim", vcfg.out_hidden_size)
+    else:
+        fout.add_uint32(k(KEY_EMBEDDING_LENGTH, VISION), vcfg.embed_dim)
+        fout.add_uint32("clip.vision.projection_dim", vcfg.hidden_size)
+
+    if args.model_type == "qwen2.5vl":
+        tensor_map = VL25.find_vision_tensors(qwen2vl, np_dtype)
+    else:
+        tensor_map = VL2.find_vision_tensors(qwen2vl, np_dtype)
     for name, data in tensor_map.items():
         fout.add_tensor(name, data)
 
     fout.add_uint32("clip.vision.patch_size", vcfg.patch_size)
     fout.add_uint32("clip.vision.image_size", 14 * 40)  # some reasonable size that is divisible by (14*2)
-    fout.add_uint32(k(KEY_EMBEDDING_LENGTH, VISION), vcfg.embed_dim)
-    fout.add_uint32("clip.vision.projection_dim", vcfg.hidden_size)
     fout.add_uint32(k(KEY_ATTENTION_HEAD_COUNT, VISION), vcfg.num_heads)
     fout.add_float32(k(KEY_ATTENTION_LAYERNORM_EPS, VISION), 1e-6)
     fout.add_uint32(k(KEY_BLOCK_COUNT, VISION), vcfg.depth)
@@ -160,6 +199,7 @@ def main(args):
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     parser.add_argument("model_name", nargs='?', default="Qwen/Qwen2-VL-2B-Instruct")
+    parser.add_argument("--model_type", nargs='?', choices=['qwen2vl', 'qwen2.5vl'], default="qwen2vl")
     parser.add_argument("--data_type", nargs='?', choices=['fp32', 'fp16'], default="fp32")
     args = parser.parse_args()
     main(args)
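
The two tensor transformations in find_vision_tensors are easy to sanity-check in isolation. Below is a minimal numpy sketch of what the surgery does; all shapes here are invented for illustration and do not come from a real checkpoint. The fused qkv tensor is split row-wise into equal thirds (rows are ordered q, then k, then v), and the Conv3D patch-embedding kernel with temporal_patch_size 2 is stored as two separate Conv2D weights.

import numpy as np

# hypothetical fused qkv weight of shape (3 * hidden, in_features)
hidden, in_features = 4, 6
qkv = np.arange(3 * hidden * in_features, dtype=np.float32).reshape(3 * hidden, in_features)
c3 = qkv.shape[0]
assert c3 % 3 == 0
c = c3 // 3
wq, wk, wv = qkv[:c], qkv[c:c * 2], qkv[c * 2:]  # same slicing as the script
assert wq.shape == wk.shape == wv.shape == (hidden, in_features)

# hypothetical Conv3D patch-embedding kernel (out_c, in_c, kt, kh, kw) with kt == 2;
# each temporal slice becomes its own Conv2D weight in the GGUF file
kernel = np.random.rand(8, 3, 2, 14, 14).astype(np.float32)
slice0 = kernel[:, :, 0, ...]  # stored as "v.patch_embd.weight"
slice1 = kernel[:, :, 1, ...]  # stored as "v.patch_embd.weight.1"
assert slice0.shape == slice1.shape == (8, 3, 14, 14)

With the new --model_type flag, converting a Qwen2.5-VL checkpoint would look something like `python qwen2_vl_surgery.py Qwen/Qwen2.5-VL-3B-Instruct --model_type qwen2.5vl --data_type fp16` (the model name is shown only as an example).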

koboldcpp.py (26 additions, 3 deletions)
@@ -2018,8 +2018,8 @@ def transform_genparams(genparams, api_format):
         #if auto mode, determine whether a tool is needed
         tools_string = json.dumps(tools_array, indent=0)
         should_use_tools = True
-        user_start = adapter_obj.get("user_start", "### Instruction:\n\n")
-        user_end = adapter_obj.get("user_end", "\n\n### Response:\n\n")
+        user_start = user_message_start
+        user_end = assistant_message_start
         if chosen_tool=="auto":
             temp_poll = {
                 "prompt": f"{user_start}User query:\n\n{messages_string}\n\nTool Code:\n{tools_string}Determine from the provided tool code if the user query would be best answered by a listed tool (One word: yes / no):{user_end}",
@@ -2030,7 +2030,7 @@ def transform_genparams(genparams, api_format):
                 "ban_eos_token":False
             }
             temp_poll_result = generate(genparams=temp_poll)
-            if temp_poll_result and not "yes" in temp_poll_result['text'].lower():
+            if temp_poll_result and "yes" not in temp_poll_result['text'].lower():
                 should_use_tools = False
             if not args.quiet:
                 print(f"\nRelevant tool is listed: {temp_poll_result['text']} ({should_use_tools})")
@@ -2301,6 +2301,10 @@ async def send_kai_sse_event(self, data):
 
     async def handle_sse_stream(self, genparams, api_format):
         global friendlymodelname, currfinishreason
+        # if tools, do not send anything - OAI tool calls will be handled with fakestreaming!
+        using_openai_tools = genparams.get('using_openai_tools', False)
+        if api_format == 4 and using_openai_tools:
+            return
         self.send_response(200)
         self.send_header("X-Accel-Buffering", "no")
         self.send_header("cache-control", "no-cache")
@@ -2311,6 +2315,7 @@ async def handle_sse_stream(self, genparams, api_format):
         incomplete_token_buffer = bytearray()
         async_sleep_short = 0.02
         await asyncio.sleep(0.35) #anti race condition, prevent check from overtaking generate
+
         try:
             tokenReserve = "" #keeps fully formed tokens that we cannot send out yet
             while True:
@@ -3188,6 +3193,24 @@ def do_POST(self):
                    self.send_header('content-length', str(len(genresp)))
                    self.end_headers(content_type='application/json')
                    self.wfile.write(genresp)
+                elif api_format == 4 and genparams.get('using_openai_tools', False): #special case, fake streaming for openai tool calls
+                    self.send_response(200)
+                    self.send_header("X-Accel-Buffering", "no")
+                    self.send_header("cache-control", "no-cache")
+                    self.send_header("connection", "keep-alive")
+                    self.end_headers(content_type='text/event-stream')
+                    toolsdata_res = []
+                    try:
+                        toolsdata_res = gen['choices'][0]['message']['tool_calls']
+                    except Exception:
+                        toolsdata_res = []
+                    toolsdata_p1 = json.dumps({"id":"koboldcpp","object":"chat.completion.chunk","created":int(time.time()),"model":friendlymodelname,"choices":[{"index":0,"finish_reason":None,"delta":{'role':'assistant','content':None, "tool_calls":toolsdata_res}}]})
+                    toolsdata_p2 = json.dumps({"id":"koboldcpp","object":"chat.completion.chunk","created":int(time.time()),"model":friendlymodelname,"choices":[{"index":0,"finish_reason":"tool_calls","delta":{}}]})
+                    self.wfile.write(f'data: {toolsdata_p1}\n\n'.encode())
+                    self.wfile.write(f'data: {toolsdata_p2}\n\n'.encode())
+                    self.wfile.write('data: [DONE]'.encode())
+                    self.wfile.flush()
+                    self.close_connection = True
            except Exception as ex:
                utfprint(ex,1)
                print("Generate: The response could not be sent, maybe connection was terminated?")
