Skip to content

Commit 23bcb14

Browse files
committed
Merge branch 'master' into ci-build-cross-fix-sync
2 parents 230ef3c + a19b5ce commit 23bcb14

33 files changed

+762
-82
lines changed

common/arg.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -163,6 +163,8 @@ struct common_hf_file_res {
163163
# if !defined(PATH_MAX)
164164
# define PATH_MAX MAX_PATH
165165
# endif
166+
#elif defined(_AIX)
167+
#include <sys/limits.h>
166168
#else
167169
#include <sys/syslimits.h>
168170
#endif

convert_hf_to_gguf.py

Lines changed: 64 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -714,6 +714,9 @@ def get_vocab_base_pre(self, tokenizer) -> str:
714714
if chkhsh == "96a5f08be6259352137b512d4157e333e21df7edd3fcd152990608735a65b224":
715715
# ref: https://huggingface.co/inclusionAI/Ling-lite
716716
res = "bailingmoe"
717+
if chkhsh == "d353350c764d8c3b39c763113960e4fb4919bea5fbf208a0e3b22e8469dc7406":
718+
# ref: https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct
719+
res = "llama4"
717720

718721
if res is None:
719722
logger.warning("\n")
@@ -1608,6 +1611,7 @@ def prepare_tensors(self):
16081611
@Model.register("LLaMAForCausalLM", "LlamaForCausalLM", "MistralForCausalLM", "MixtralForCausalLM")
16091612
class LlamaModel(Model):
16101613
model_arch = gguf.MODEL_ARCH.LLAMA
1614+
undo_permute = True
16111615

16121616
def set_vocab(self):
16131617
try:
@@ -1672,10 +1676,11 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
16721676
n_head = self.hparams["num_attention_heads"]
16731677
n_kv_head = self.hparams.get("num_key_value_heads")
16741678

1675-
if name.endswith(("q_proj.weight", "q_proj.bias")):
1676-
data_torch = LlamaModel.permute(data_torch, n_head, n_head)
1677-
if name.endswith(("k_proj.weight", "k_proj.bias")):
1678-
data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head)
1679+
if self.undo_permute:
1680+
if name.endswith(("q_proj.weight", "q_proj.bias")):
1681+
data_torch = LlamaModel.permute(data_torch, n_head, n_head)
1682+
if name.endswith(("k_proj.weight", "k_proj.bias")):
1683+
data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head)
16791684

16801685
# process the experts separately
16811686
if name.find("block_sparse_moe.experts") != -1:
@@ -1752,6 +1757,61 @@ def prepare_tensors(self):
17521757
raise ValueError(f"Unprocessed experts: {experts}")
17531758

17541759

1760+
@Model.register("Llama4ForConditionalGeneration")
class Llama4Model(LlamaModel):
    """Converter for ``Llama4ForConditionalGeneration`` checkpoints.

    Only the language model is converted: tensors whose name contains
    ``multi_modal_projector`` or ``vision_model`` produce no output.
    """

    model_arch = gguf.MODEL_ARCH.LLAMA4
    # set to True in __init__ when the config carries a "vision_config" entry
    has_vision: bool = False
    # LlamaModel.modify_tensors only permutes q_proj/k_proj when this is True
    undo_permute = False

    # TODO @ngxson : avoid duplicating this code everywhere by at least supporting "text_config"
    # same as llama, but we need to merge the text_config into the root level of hparams
    def __init__(self, *args, **kwargs):
        """Flatten ``text_config`` into the root hparams, then initialise the base class.

        The hparams come from the ``hparams`` keyword argument when given,
        otherwise they are loaded via ``Model.load_hparams(args[0])``.
        """
        hparams = kwargs["hparams"] if "hparams" in kwargs else Model.load_hparams(args[0])
        if "text_config" in hparams:
            # text_config entries override root-level entries with the same key
            hparams = {**hparams, **hparams["text_config"]}
            kwargs["hparams"] = hparams
        super().__init__(*args, **kwargs)
        if "vision_config" in hparams:
            logger.info("Has vision encoder, but it will be ignored")
            self.has_vision = True
        # IMPORTANT: the normal "intermediate_size" is renamed to "intermediate_size_mlp", we need to undo this
        self.hparams["intermediate_size_moe"] = self.hparams["intermediate_size"]
        self.hparams["intermediate_size"] = self.hparams["intermediate_size_mlp"]

    def set_vocab(self):
        """Write the GPT-2 style vocabulary and enable the add-BOS-token flag."""
        self._set_vocab_gpt2()
        self.gguf_writer.add_add_bos_token(True)

    def set_gguf_parameters(self):
        """Write the base parameters plus the MoE interleave step and expert FFN length."""
        super().set_gguf_parameters()
        self.gguf_writer.add_interleave_moe_layer_step(self.hparams["interleave_moe_layer_step"])
        self.gguf_writer.add_expert_feed_forward_length(self.hparams["intermediate_size_moe"])

    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None):
        """Rename and reshape one checkpoint tensor for conversion.

        Returns a list of ``(mapped_name, tensor)`` pairs for a fused
        ``gate_up_proj`` tensor, an empty list for vision/projector tensors,
        and otherwise whatever ``LlamaModel.modify_tensors`` returns.
        """
        name = name.replace("language_model.", "")
        name = name.replace("feed_forward.", "mlp.")  # a bit hacky for now
        name = name.replace(".router.weight", ".gate.weight")  # a bit hacky for now

        # Vision tensors are ignored; filter them before any rewrite below so
        # they can never be split, transposed or sent to map_tensor_name.
        if "multi_modal_projector" in name or "vision_model" in name:
            return []

        # split the gate_up into gate and up
        if "gate_up_proj" in name:
            name_up = name.replace("gate_up_proj", "up_proj.weight")
            name_gate = name.replace("gate_up_proj", "gate_proj.weight")
            # half of the last dimension, which becomes dim -2 after the transpose
            dim_half = data_torch.shape[-1] // 2
            gate_proj_weight, up_proj_weight = data_torch.transpose(-1, -2).split(dim_half, dim=-2)
            return [
                (self.map_tensor_name(name_gate), gate_proj_weight),
                (self.map_tensor_name(name_up), up_proj_weight)
            ]

        # names ending in a bare "down_proj" get the ".weight" suffix and a transpose
        if name.endswith("down_proj"):
            name += ".weight"
            data_torch = data_torch.transpose(-1, -2)

        return super().modify_tensors(data_torch, name, bid)
1813+
1814+
17551815
@Model.register("Mistral3ForConditionalGeneration")
17561816
class Mistral3Model(LlamaModel):
17571817
model_arch = gguf.MODEL_ARCH.LLAMA

convert_hf_to_gguf_update.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -113,6 +113,7 @@ class TOKENIZER_TYPE(IntEnum):
113113
{"name": "superbpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/UW/OLMo2-8B-SuperBPE-t180k", },
114114
{"name": "trillion", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/trillionlabs/Trillion-7B-preview", },
115115
{"name": "bailingmoe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/inclusionAI/Ling-lite", },
116+
{"name": "llama4", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct", },
116117
]
117118

118119

examples/llava/clip.cpp

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2840,10 +2840,19 @@ int clip_is_minicpmv(const struct clip_ctx * ctx) {
28402840
bool clip_is_glm(const struct clip_ctx * ctx) {
28412841
return ctx->has_glm_projector;
28422842
}
2843+
28432844
bool clip_is_qwen2vl(const struct clip_ctx * ctx) {
28442845
return ctx->has_qwen2vl_merger;
28452846
}
28462847

2848+
// Returns the has_llava_projector flag stored in the given CLIP context.
bool clip_is_llava(const struct clip_ctx * ctx) {
2849+
return ctx->has_llava_projector;
2850+
}
2851+
2852+
// Returns true when the context's projector type is PROJECTOR_TYPE_GEMMA3.
bool clip_is_gemma3(const struct clip_ctx * ctx) {
2853+
return ctx->proj_type == PROJECTOR_TYPE_GEMMA3;
2854+
}
2855+
28472856
// Determine the number of encoder layers to iterate over
28482857
int get_deepest_feature_layer(const struct clip_ctx * ctx) {
28492858
// Get the index of the second to last layer; this is the

examples/llava/clip.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -106,6 +106,8 @@ CLIP_API bool clip_model_quantize(const char * fname_inp, const char * fname_out
106106
CLIP_API int clip_is_minicpmv(const struct clip_ctx * ctx);
107107
CLIP_API bool clip_is_glm(const struct clip_ctx * ctx);
108108
CLIP_API bool clip_is_qwen2vl(const struct clip_ctx * ctx);
109+
CLIP_API bool clip_is_llava(const struct clip_ctx * ctx);
110+
CLIP_API bool clip_is_gemma3(const struct clip_ctx * ctx);
109111

110112
CLIP_API int get_deepest_feature_layer(const struct clip_ctx * ctx);
111113

252 Bytes
Binary file not shown.

examples/server/server.cpp

Lines changed: 20 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1705,6 +1705,8 @@ struct server_queue {
17051705
};
17061706

17071707
struct server_response {
1708+
bool running = true;
1709+
17081710
// for keeping track of all tasks waiting for the result
17091711
std::unordered_set<int> waiting_task_ids;
17101712

@@ -1759,6 +1761,10 @@ struct server_response {
17591761
while (true) {
17601762
std::unique_lock<std::mutex> lock(mutex_results);
17611763
condition_results.wait(lock, [&]{
1764+
if (!running) {
1765+
SRV_DBG("%s : queue result stop\n", __func__);
1766+
std::terminate(); // we cannot return here since the caller is HTTP code
1767+
}
17621768
return !queue_results.empty();
17631769
});
17641770

@@ -1789,6 +1795,10 @@ struct server_response {
17891795
}
17901796

17911797
std::cv_status cr_res = condition_results.wait_for(lock, std::chrono::seconds(timeout));
1798+
if (!running) {
1799+
SRV_DBG("%s : queue result stop\n", __func__);
1800+
std::terminate(); // we cannot return here since the caller is HTTP code
1801+
}
17921802
if (cr_res == std::cv_status::timeout) {
17931803
return nullptr;
17941804
}
@@ -1818,6 +1828,12 @@ struct server_response {
18181828
}
18191829
}
18201830
}
1831+
1832+
// terminate the waiting loop
1833+
void terminate() {
1834+
running = false;
1835+
condition_results.notify_all();
1836+
}
18211837
};
18221838

18231839
struct server_context {
@@ -4491,9 +4507,10 @@ int main(int argc, char ** argv) {
44914507
svr->new_task_queue = [&params] { return new httplib::ThreadPool(params.n_threads_http); };
44924508

44934509
// clean up function, to be called before exit
4494-
auto clean_up = [&svr]() {
4510+
auto clean_up = [&svr, &ctx_server]() {
44954511
SRV_INF("%s: cleaning up before exit...\n", __func__);
44964512
svr->stop();
4513+
ctx_server.queue_results.terminate();
44974514
llama_backend_free();
44984515
};
44994516

@@ -4534,7 +4551,7 @@ int main(int argc, char ** argv) {
45344551

45354552
if (!ctx_server.load_model(params)) {
45364553
clean_up();
4537-
// t.join(); // FIXME: see below
4554+
t.join();
45384555
LOG_ERR("%s: exiting due to model loading error\n", __func__);
45394556
return 1;
45404557
}
@@ -4582,7 +4599,7 @@ int main(int argc, char ** argv) {
45824599
ctx_server.queue_tasks.start_loop();
45834600

45844601
clean_up();
4585-
// t.join(); // FIXME: http thread may stuck if there is an on-going request. we don't need to care about this for now as the HTTP connection will already be closed at this point, but it's better to fix this
4602+
t.join();
45864603

45874604
return 0;
45884605
}

examples/server/tests/unit/test_embedding.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,26 @@ def test_embedding_multiple():
4949
assert len(d['embedding']) > 1
5050

5151

52+
def test_embedding_multiple_with_fa():
    """Request four embeddings from the flash-attention preset and check each result."""
    server = ServerPreset.bert_bge_small_with_fa()
    server.pooling = 'last'
    server.start()

    # one of these should trigger the FA branch (i.e. context size % 256 == 0)
    prompts = [
        "a "*253,
        "b "*254,
        "c "*255,
        "d "*256,
    ]
    response = server.make_request("POST", "/v1/embeddings", data={"input": prompts})

    assert response.status_code == 200
    entries = response.body['data']
    assert len(entries) == 4
    for entry in entries:
        assert 'embedding' in entry
        assert len(entry['embedding']) > 1
70+
71+
5272
@pytest.mark.parametrize(
5373
"input,is_multi_prompt",
5474
[

examples/server/tests/utils.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -323,6 +323,21 @@ def bert_bge_small() -> ServerProcess:
323323
server.server_embeddings = True
324324
return server
325325

326+
@staticmethod
def bert_bge_small_with_fa() -> ServerProcess:
    """Return a ServerProcess preset for bert-bge-small with `fa` and embeddings enabled."""
    preset = ServerProcess()

    # model selection
    preset.model_hf_repo = "ggml-org/models"
    preset.model_hf_file = "bert-bge-small/ggml-model-f16.gguf"
    preset.model_alias = "bert-bge-small"

    # context, batching and slots
    preset.n_ctx = 1024
    preset.n_batch = 300
    preset.n_ubatch = 300
    preset.n_slots = 2

    # runtime options
    preset.fa = True
    preset.seed = 42
    preset.server_embeddings = True

    return preset
340+
326341
@staticmethod
327342
def tinyllama_infill() -> ServerProcess:
328343
server = ServerProcess()

examples/server/webui/src/components/ChatScreen.tsx

Lines changed: 13 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,12 @@
1-
import { useEffect, useMemo, useRef, useState } from 'react';
1+
import { useEffect, useMemo, useState } from 'react';
22
import { CallbackGeneratedChunk, useAppContext } from '../utils/app.context';
33
import ChatMessage from './ChatMessage';
44
import { CanvasType, Message, PendingMessage } from '../utils/types';
55
import { classNames, cleanCurrentUrl, throttle } from '../utils/misc';
66
import CanvasPyInterpreter from './CanvasPyInterpreter';
77
import StorageUtils from '../utils/storage';
88
import { useVSCodeContext } from '../utils/llama-vscode';
9+
import { useChatTextarea, ChatTextareaApi } from './useChatTextarea.ts';
910

1011
/**
1112
* A message display is a message node with additional information for rendering.
@@ -99,7 +100,8 @@ export default function ChatScreen() {
99100
canvasData,
100101
replaceMessageAndGenerate,
101102
} = useAppContext();
102-
const textarea = useOptimizedTextarea(prefilledMsg.content());
103+
104+
const textarea: ChatTextareaApi = useChatTextarea(prefilledMsg.content());
103105

104106
const { extraContext, clearExtraContext } = useVSCodeContext(textarea);
105107
// TODO: improve this when we have "upload file" feature
@@ -248,22 +250,28 @@ export default function ChatScreen() {
248250
</div>
249251

250252
{/* chat input */}
251-
<div className="flex flex-row items-center pt-8 pb-6 sticky bottom-0 bg-base-100">
253+
<div className="flex flex-row items-end pt-8 pb-6 sticky bottom-0 bg-base-100">
252254
<textarea
253-
className="textarea textarea-bordered w-full"
255+
// Default (mobile): Enable vertical resize, overflow auto for scrolling if needed
256+
// Large screens (lg:): Disable manual resize, apply max-height for autosize limit
257+
className="textarea textarea-bordered w-full resize-vertical lg:resize-none lg:max-h-48 lg:overflow-y-auto" // Adjust lg:max-h-48 as needed (e.g., lg:max-h-60)
254258
placeholder="Type a message (Shift+Enter to add a new line)"
255259
ref={textarea.ref}
260+
onInput={textarea.onInput} // Hook's input handler (will only resize height on lg+ screens)
256261
onKeyDown={(e) => {
257262
if (e.nativeEvent.isComposing || e.keyCode === 229) return;
258-
if (e.key === 'Enter' && e.shiftKey) return;
259263
if (e.key === 'Enter' && !e.shiftKey) {
260264
e.preventDefault();
261265
sendNewMessage();
262266
}
263267
}}
264268
id="msg-input"
265269
dir="auto"
270+
// Set a base height of 2 rows for mobile views
271+
// On lg+ screens, the hook will calculate and set the initial height anyway
272+
rows={2}
266273
></textarea>
274+
267275
{isGenerating(currConvId ?? '') ? (
268276
<button
269277
className="btn btn-neutral ml-2"
@@ -286,43 +294,3 @@ export default function ChatScreen() {
286294
</div>
287295
);
288296
}
289-
290-
export interface OptimizedTextareaValue {
291-
value: () => string;
292-
setValue: (value: string) => void;
293-
focus: () => void;
294-
ref: React.RefObject<HTMLTextAreaElement>;
295-
}
296-
297-
// This is a workaround to prevent the textarea from re-rendering when the inner content changes
298-
// See https://github.com/ggml-org/llama.cpp/pull/12299
299-
function useOptimizedTextarea(initValue: string): OptimizedTextareaValue {
300-
const [savedInitValue, setSavedInitValue] = useState<string>(initValue);
301-
const textareaRef = useRef<HTMLTextAreaElement>(null);
302-
303-
useEffect(() => {
304-
if (textareaRef.current && savedInitValue) {
305-
textareaRef.current.value = savedInitValue;
306-
setSavedInitValue('');
307-
}
308-
}, [textareaRef, savedInitValue, setSavedInitValue]);
309-
310-
return {
311-
value: () => {
312-
return textareaRef.current?.value ?? savedInitValue;
313-
},
314-
setValue: (value: string) => {
315-
if (textareaRef.current) {
316-
textareaRef.current.value = value;
317-
}
318-
},
319-
focus: () => {
320-
if (textareaRef.current) {
321-
// focus and move the cursor to the end
322-
textareaRef.current.focus();
323-
textareaRef.current.selectionStart = textareaRef.current.value.length;
324-
}
325-
},
326-
ref: textareaRef,
327-
};
328-
}

0 commit comments

Comments
 (0)