
Commit 8152481

Merge branch 'ggml-org:master' into master
2 parents: cc2b864 + aa3ee0e

31 files changed, +753 -483 lines

CONTRIBUTING.md

Lines changed: 1 addition & 1 deletion
@@ -25,7 +25,7 @@ The project differentiates between 3 levels of contributors:
   - Squash-merge PRs
   - Use the following format for the squashed commit title: `<module> : <commit title> (#<issue_number>)`. For example: `utils : fix typo in utils.py (#1234)`
   - Optionally pick a `<module>` from here: https://github.com/ggml-org/llama.cpp/wiki/Modules
-  - Let other maintainers, merge their own PRs
+  - Let other maintainers merge their own PRs
   - When merging a PR, make sure you have a good understanding of the changes
   - Be mindful of maintenance: most of the work going into a feature happens after the PR is merged. If the PR author is not committed to contribute long-term, someone else needs to take responsibility (you)
 

common/common.cpp

Lines changed: 3 additions & 5 deletions
@@ -961,15 +961,13 @@ struct common_init_result common_init_from_params(common_params & params) {
 
         bool has_eos = llama_vocab_eos(vocab) != LLAMA_TOKEN_NULL;
         bool has_sep = llama_vocab_sep(vocab) != LLAMA_TOKEN_NULL;
+        bool has_rerank_prompt = llama_model_chat_template(model, "rerank") != NULL;
 
-        if (!has_eos && !has_sep) {
-            LOG_WRN("%s: warning: vocab does not have an EOS token or SEP token, reranking will not work\n", __func__);
+        if (!has_eos && !has_sep && !has_rerank_prompt) {
+            LOG_WRN("%s: warning: vocab does not have an EOS token, SEP token, or rerank prompt. Reranking will not work\n", __func__);
             ok = false;
         } else if (!has_eos) {
             LOG_WRN("%s: warning: vocab does not have an EOS token, using SEP token as fallback\n", __func__);
-        } else if (!has_sep) {
-            LOG_WRN("%s: warning: vocab does not have a SEP token, reranking will not work\n", __func__);
-            ok = false;
         }
 
         if (!ok) {

convert_hf_to_gguf.py

Lines changed: 65 additions & 0 deletions
@@ -3717,11 +3717,29 @@ def prepare_tensors(self):
 class Qwen3Model(Qwen2Model):
     model_arch = gguf.MODEL_ARCH.QWEN3
 
+    # extra logic for rerank models
+    is_rerank: bool = False
+    is_tied_embeddings: bool = False
+    token_false_id: int | None = None
+    token_true_id: int | None = None
+
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
+
+        # track for intern-s1-mini
         hparams = ModelBase.load_hparams(self.dir_model, is_mistral_format=False)
         self.origin_hf_arch = hparams.get('architectures', [None])[0]
 
+        # a bit hacky, but currently the only way to detect if this is a rerank model
+        # ref: https://huggingface.co/Qwen/Qwen3-Reranker-0.6B
+        readme_path = self.dir_model / "README.md"
+        readme_text = ""
+        if readme_path.exists():
+            with readme_path.open("r", encoding="utf-8") as f:
+                readme_text = f.read()
+        if "# Qwen3-Reranker" in readme_text:
+            self._find_rerank_config()
+
     def set_vocab(self):
         # deal with intern-s1-mini
         if self.origin_hf_arch == 'InternS1ForConditionalGeneration':
@@ -3730,6 +3748,53 @@ def set_vocab(self):
 
         super().set_vocab()
 
+    def _find_rerank_config(self):
+        from transformers import AutoTokenizer
+        tokenizer = AutoTokenizer.from_pretrained(self.dir_model)
+
+        self.is_rerank = True
+        self.is_tied_embeddings = self.hparams.get("tie_word_embeddings", False)
+        self.token_false_id = tokenizer.convert_tokens_to_ids("no")
+        self.token_true_id = tokenizer.convert_tokens_to_ids("yes")
+        self.sep_token_id = tokenizer.convert_tokens_to_ids("|")
+
+        assert self.token_false_id is not None and self.token_true_id is not None
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        if self.is_rerank:
+            self.gguf_writer.add_pooling_type(gguf.PoolingType.RANK)
+            self.gguf_writer.add_classifier_output_labels(["yes", "no"])
+            self.gguf_writer.add_chat_template([{
+                "name": "rerank",
+                "template": "<|im_start|>system\nJudge whether the Document meets the requirements based on the Query and the Instruct provided. Note that the answer can only be \"yes\" or \"no\".<|im_end|>\n"
+                    "<|im_start|>user\n<Instruct>: Given a web search query, retrieve relevant passages that answer the query\n<Query>: {query}\n<Document>: {document}<|im_end|>\n"
+                    "<|im_start|>assistant\n<think>\n\n</think>\n\n"
+            }])
+
+    def _get_cls_out_tensor(self, data_torch: Tensor) -> Tensor:
+        # extract "yes" and "no" tokens from the output lm_head tensor
+        false_row = data_torch[self.token_false_id]
+        true_row = data_torch[self.token_true_id]
+        return torch.stack([true_row, false_row], dim=0)
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        if self.is_rerank:
+            is_tied_head = self.is_tied_embeddings and "embed_tokens" in name
+            is_real_head = not self.is_tied_embeddings and "lm_head" in name
+            if is_tied_head or is_real_head:
+                cls_out_head = (
+                    gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.CLS_OUT] + ".weight",
+                    self._get_cls_out_tensor(data_torch),
+                )
+                if is_tied_head:
+                    embed = (self.map_tensor_name(name), data_torch)
+                    return [cls_out_head, embed]
+                if is_real_head:
+                    return [cls_out_head]
+
+        return super().modify_tensors(data_torch, name, bid)
+
 
 @ModelBase.register("Qwen3MoeForCausalLM")
 class Qwen3MoeModel(Qwen2MoeModel):
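
The cls.output head produced here is simply the two lm_head rows for "yes" and "no" stacked into a 2 x n_embd matrix, so the pooled hidden state of a rendered rerank prompt yields two logits that can be turned into a relevance score. Below is a minimal PyTorch sketch of that idea, mirroring _get_cls_out_tensor(); the softmax-based scoring and the toy shapes are illustrative assumptions, not the llama.cpp runtime code.

```python
# Illustrative sketch only (assumed scoring convention), not llama.cpp runtime code.
import torch

def build_cls_out(lm_head: torch.Tensor, token_true_id: int, token_false_id: int) -> torch.Tensor:
    # lm_head: [n_vocab, n_embd] -> cls.output: [2, n_embd], row 0 = "yes", row 1 = "no"
    return torch.stack([lm_head[token_true_id], lm_head[token_false_id]], dim=0)

def rerank_score(cls_out: torch.Tensor, pooled: torch.Tensor) -> float:
    # pooled: [n_embd] hidden state at the last position of the rerank prompt
    logits = cls_out @ pooled               # [2]: ("yes" logit, "no" logit)
    probs  = torch.softmax(logits, dim=-1)  # assumed: P("yes") used as the relevance score
    return probs[0].item()

# toy usage with random weights
lm_head = torch.randn(151, 8)
cls_out = build_cls_out(lm_head, token_true_id=3, token_false_id=7)
print(rerank_score(cls_out, pooled=torch.randn(8)))
```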

examples/embedding/embedding.cpp

Lines changed: 28 additions & 15 deletions
@@ -95,8 +95,13 @@ int main(int argc, char ** argv) {
         params.n_batch = params.n_ctx;
     }
 
-    // For non-causal models, batch size must be equal to ubatch size
-    params.n_ubatch = params.n_batch;
+    // for non-causal models, batch size must be equal to ubatch size
+    if (params.attention_type != LLAMA_ATTENTION_TYPE_CAUSAL) {
+        params.n_ubatch = params.n_batch;
+    }
+
+    // get max number of sequences per batch
+    const int n_seq_max = llama_max_parallel_sequences();
 
     llama_backend_init();
     llama_numa_init(params.numa);
@@ -144,6 +149,7 @@ int main(int argc, char ** argv) {
     // get added sep and eos token, if any
     const std::string added_sep_token = llama_vocab_get_add_sep(vocab) ? llama_vocab_get_text(vocab, llama_vocab_sep(vocab)) : "";
     const std::string added_eos_token = llama_vocab_get_add_eos(vocab) ? llama_vocab_get_text(vocab, llama_vocab_eos(vocab)) : "";
+    const char * rerank_prompt = llama_model_chat_template(model, "rerank");
 
     // tokenize the prompts and trim
     std::vector<std::vector<int32_t>> inputs;
@@ -153,21 +159,28 @@ int main(int argc, char ** argv) {
         // split classification pairs and insert expected separator tokens
         if (pooling_type == LLAMA_POOLING_TYPE_RANK && prompt.find(params.cls_sep) != std::string::npos) {
             std::vector<std::string> pairs = split_lines(prompt, params.cls_sep);
-            std::string final_prompt;
-
-            for (size_t i = 0; i < pairs.size(); i++) {
-                final_prompt += pairs[i];
-                if (i != pairs.size() - 1) {
-                    if (!added_eos_token.empty()) {
-                        final_prompt += added_eos_token;
-                    }
-                    if (!added_sep_token.empty()) {
-                        final_prompt += added_sep_token;
+            if (rerank_prompt != nullptr) {
+                const std::string query = pairs[0];
+                const std::string doc   = pairs[1];
+                std::string final_prompt = rerank_prompt;
+                string_replace_all(final_prompt, "{query}"   , query);
+                string_replace_all(final_prompt, "{document}", doc  );
+                inp = common_tokenize(vocab, final_prompt, true, true);
+            } else {
+                std::string final_prompt;
+                for (size_t i = 0; i < pairs.size(); i++) {
+                    final_prompt += pairs[i];
+                    if (i != pairs.size() - 1) {
+                        if (!added_eos_token.empty()) {
+                            final_prompt += added_eos_token;
+                        }
+                        if (!added_sep_token.empty()) {
+                            final_prompt += added_sep_token;
+                        }
                     }
                 }
+                inp = common_tokenize(ctx, final_prompt, true, true);
             }
-
-            inp = common_tokenize(ctx, final_prompt, true, true);
         } else {
             inp = common_tokenize(ctx, prompt, true, true);
         }
@@ -229,7 +242,7 @@ int main(int argc, char ** argv) {
         const uint64_t n_toks = inp.size();
 
         // encode if at capacity
-        if (batch.n_tokens + n_toks > n_batch) {
+        if (batch.n_tokens + n_toks > n_batch || s >= n_seq_max) {
             float * out = emb + e * n_embd;
             batch_decode(ctx, batch, out, s, n_embd, params.embd_normalize);
             e += pooling_type == LLAMA_POOLING_TYPE_NONE ? batch.n_tokens : s;
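
When the GGUF carries a "rerank" chat template (written by the converter above), each query/document pair is rendered by literal placeholder substitution instead of being joined with EOS/SEP tokens. A rough Python equivalent of the two string_replace_all calls follows; the template string is abbreviated for illustration, the full one is stored in the model.

```python
# Simplified sketch of the placeholder substitution done in embedding.cpp;
# the abbreviated template below is an illustration, not the stored one.
def render_rerank_prompt(template: str, query: str, document: str) -> str:
    return template.replace("{query}", query).replace("{document}", document)

template = (
    "<|im_start|>user\n"
    "<Query>: {query}\n<Document>: {document}<|im_end|>\n"
    "<|im_start|>assistant\n"
)
print(render_rerank_prompt(template,
                           "what is panda?",
                           "The giant panda is a bear species endemic to China."))
```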

examples/model-conversion/Makefile

Lines changed: 9 additions & 4 deletions
@@ -118,13 +118,17 @@ embedding-convert-model:
 
 embedding-run-original-model:
 	$(call validate_embedding_model_path,embedding-run-original-model)
-	@EMBEDDING_MODEL_PATH="$(EMBEDDING_MODEL_PATH)" ./scripts/embedding/run-original-model.py
+	@EMBEDDING_MODEL_PATH="$(EMBEDDING_MODEL_PATH)" \
+		./scripts/embedding/run-original-model.py \
+		$(if $(PROMPTS_FILE),--prompts-file "$(PROMPTS_FILE)")
 
 embedding-run-converted-model:
-	@CONVERTED_EMBEDDING_MODEL="$(CONVERTED_EMBEDDING_MODEL)" ./scripts/embedding/run-converted-model.sh ${CONVERTED_EMBEDDING_MODEL}
+	@./scripts/embedding/run-converted-model.sh $(CONVERTED_EMBEDDING_MODEL) \
+		$(if $(PROMPTS_FILE),--prompts-file "$(PROMPTS_FILE)")
 
 embedding-verify-logits: embedding-run-original-model embedding-run-converted-model
-	@./scripts/embedding/compare-embeddings-logits.sh
+	@./scripts/embedding/compare-embeddings-logits.sh \
+		$(if $(PROMPTS_FILE),--prompts-file "$(PROMPTS_FILE)")
 
 embedding-inspect-original-model:
 	$(call validate_embedding_model_path,embedding-inspect-original-model)
@@ -156,7 +160,8 @@ embedding-quantize-model:
 	$(call quantize_model,$(CONVERTED_EMBEDDING_MODEL),QUANTIZED_EMBEDDING_MODEL)
 
 embedding-run-quantized-model:
-	@./scripts/embedding/run-converted-model.sh ${QUANTIZED_EMBEDDING_MODEL}
+	@./scripts/embedding/run-converted-model.sh $(QUANTIZED_EMBEDDING_MODEL) \
+		$(if $(PROMPTS_FILE),--prompts-file "$(PROMPTS_FILE)")
 
 ###
 ### Perplexity targets/recipes
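
These targets take an optional PROMPTS_FILE make variable, for example `make embedding-verify-logits PROMPTS_FILE=prompts.txt` (the file name here is only an example). When it is unset, the `$(if ...)` guard drops the `--prompts-file` flag and the scripts fall back to their defaults.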

examples/model-conversion/logits.cpp

Lines changed: 41 additions & 11 deletions
@@ -151,6 +151,35 @@ int main(int argc, char ** argv) {
         logits = llama_get_embeddings(ctx);
         n_logits = llama_model_n_embd(model) * batch.n_tokens;
         type = "-embeddings";
+
+        const int n_embd = llama_model_n_embd(model);
+        const int n_embd_count = batch.n_tokens;
+
+        printf("Embedding dimension: %d\n", n_embd);
+        printf("\n");
+
+        // Print embeddings in the specified format
+        for (int j = 0; j < n_embd_count; j++) {
+            printf("embedding %d: ", j);
+
+            // Print first 3 values
+            for (int i = 0; i < 3 && i < n_embd; i++) {
+                printf("%9.6f ", logits[j * n_embd + i]);
+            }
+
+            printf(" ... ");
+
+            // Print last 3 values
+            for (int i = n_embd - 3; i < n_embd; i++) {
+                if (i >= 0) {
+                    printf("%9.6f ", logits[j * n_embd + i]);
+                }
+            }
+
+            printf("\n");
+        }
+        printf("\n");
+
         printf("Embeddings size: %d\n", n_logits);
     } else {
         logits = llama_get_logits_ith(ctx, batch.n_tokens - 1);
@@ -183,22 +212,23 @@
         return 1;
     }
     for (int i = 0; i < n_logits; i++) {
-        fprintf(f, "%d: %.6f\n", i, logits[i]); // Added index and changed format
+        fprintf(f, "%d: %.6f\n", i, logits[i]);
     }
     fclose(f);
 
-    // Print first and last 10 logits for quick verification
-    printf("First 10 logits: ");
-    for (int i = 0; i < 10 && i < n_logits; i++) {
-        printf("%.6f ", logits[i]);
-    }
-    printf("\n");
+    if (!embedding_mode) {
+        printf("First 10 logits: ");
+        for (int i = 0; i < 10 && i < n_logits; i++) {
+            printf("%.6f ", logits[i]);
+        }
+        printf("\n");
 
-    printf("Last 10 logits: ");
-    for (int i = n_logits - 10; i < n_logits; i++) {
-        if (i >= 0) printf("%.6f ", logits[i]);
+        printf("Last 10 logits: ");
+        for (int i = n_logits - 10; i < n_logits; i++) {
+            if (i >= 0) printf("%.6f ", logits[i]);
+        }
+        printf("\n\n");
     }
-    printf("\n\n");
 
     printf("Logits saved to %s\n", bin_filename);
     printf("Logits saved to %s\n", txt_filename);

examples/model-conversion/scripts/embedding/compare-embeddings-logits.sh

Lines changed: 44 additions & 5 deletions
@@ -2,8 +2,37 @@
 
 set -e
 
-MODEL_PATH="${1:-"$EMBEDDING_MODEL_PATH"}"
-MODEL_NAME="${2:-$(basename "$MODEL_PATH")}"
+# Parse command line arguments
+MODEL_PATH=""
+MODEL_NAME=""
+PROMPTS_FILE=""
+
+# First argument is always model path
+if [ $# -gt 0 ] && [[ "$1" != --* ]]; then
+    MODEL_PATH="$1"
+    shift
+fi
+
+# Parse remaining arguments
+while [[ $# -gt 0 ]]; do
+    case $1 in
+        --prompts-file|-pf)
+            PROMPTS_FILE="$2"
+            shift 2
+            ;;
+        *)
+            # If MODEL_NAME not set and this isn't a flag, use as model name
+            if [ -z "$MODEL_NAME" ] && [[ "$1" != --* ]]; then
+                MODEL_NAME="$1"
+            fi
+            shift
+            ;;
+    esac
+done
+
+# Set defaults
+MODEL_PATH="${MODEL_PATH:-"$EMBEDDING_MODEL_PATH"}"
+MODEL_NAME="${MODEL_NAME:-$(basename "$MODEL_PATH")}"
 
 if [ -t 0 ]; then
     CPP_EMBEDDINGS="data/llamacpp-${MODEL_NAME}-embeddings.bin"
@@ -35,8 +64,18 @@ with open('$TEMP_FILE', 'wb') as f:
     trap "rm -f $TEMP_FILE" EXIT
 fi
 
-python scripts/utils/semantic_check.py --model-path $MODEL_PATH \
+# Build the semantic_check.py command
+SEMANTIC_CMD="python scripts/utils/semantic_check.py --model-path $MODEL_PATH \
     --python-embeddings data/pytorch-${MODEL_NAME}-embeddings.bin \
-    --cpp-embeddings $CPP_EMBEDDINGS \
-    --prompt "Hello world today"
+    --cpp-embeddings $CPP_EMBEDDINGS"
+
+# Add prompts file if specified, otherwise use default prompt
+if [ -n "$PROMPTS_FILE" ]; then
+    SEMANTIC_CMD="$SEMANTIC_CMD --prompts-file \"$PROMPTS_FILE\""
+else
+    SEMANTIC_CMD="$SEMANTIC_CMD --prompt \"Hello world today\""
+fi
+
+# Execute the command
+eval $SEMANTIC_CMD
 