
Commit 91ecc29

Merge branch 'ggml-org:master' into mradermacher
2 parents 55eac0f + 5787b5d

45 files changed: +604 additions, −163 deletions


.github/workflows/build-linux-cross.yml

Lines changed: 113 additions & 0 deletions
@@ -231,3 +231,116 @@ jobs:
                          -DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH

           cmake --build build --config Release -j $(nproc)
+
+  debian-13-loongarch64-cpu-cross:
+    runs-on: ubuntu-24.04
+    container: debian@sha256:653dfb9f86c3782e8369d5f7d29bb8faba1f4bff9025db46e807fa4c22903671
+
+    steps:
+      - uses: actions/checkout@v4
+      - name: Setup LoongArch
+        run: |
+          rm -f /etc/apt/sources.list.d/*
+          cat << EOF | tee /etc/apt/sources.list.d/debian-ports.list
+          deb http://snapshot.debian.org/archive/debian/20250515T202920Z/ trixie main
+          EOF
+          ( echo 'quiet "true";'; \
+            echo 'APT::Get::Assume-Yes "true";'; \
+            echo 'APT::Install-Recommends "false";'; \
+            echo 'Acquire::Check-Valid-Until "false";'; \
+            echo 'Acquire::Retries "5";'; \
+          ) > /etc/apt/apt.conf.d/99snapshot-repos
+
+          apt-get update
+          apt-get install -y ca-certificates debian-ports-archive-keyring cmake git zip
+          dpkg --add-architecture loong64
+
+          # Add arch-specific repositories for non-amd64 architectures
+          cat << EOF | tee /etc/apt/sources.list.d/loong64-ports.list
+          deb [arch=loong64] http://snapshot.debian.org/archive/debian-ports/20250515T194251Z/ sid main
+          EOF
+
+          apt-get update || true ;# Prevent failure due to missing URLs.
+
+          apt-get install -y --no-install-recommends \
+                  build-essential \
+                  gcc-14-loongarch64-linux-gnu \
+                  g++-14-loongarch64-linux-gnu
+
+      - name: Build
+        run: |
+          cmake -B build -DLLAMA_CURL=OFF \
+                         -DCMAKE_BUILD_TYPE=Release \
+                         -DGGML_OPENMP=OFF \
+                         -DLLAMA_BUILD_EXAMPLES=ON \
+                         -DLLAMA_BUILD_TOOLS=ON \
+                         -DLLAMA_BUILD_TESTS=OFF \
+                         -DCMAKE_SYSTEM_NAME=Linux \
+                         -DCMAKE_SYSTEM_PROCESSOR=loongarch64 \
+                         -DCMAKE_C_COMPILER=loongarch64-linux-gnu-gcc-14 \
+                         -DCMAKE_CXX_COMPILER=loongarch64-linux-gnu-g++-14 \
+                         -DCMAKE_POSITION_INDEPENDENT_CODE=ON \
+                         -DCMAKE_FIND_ROOT_PATH=/usr/lib/loongarch64-linux-gnu \
+                         -DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER \
+                         -DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=ONLY \
+                         -DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH
+
+          cmake --build build --config Release -j $(nproc)
+
+  debian-13-loongarch64-vulkan-cross:
+    runs-on: ubuntu-24.04
+    container: debian@sha256:653dfb9f86c3782e8369d5f7d29bb8faba1f4bff9025db46e807fa4c22903671
+
+    steps:
+      - uses: actions/checkout@v4
+      - name: Setup LoongArch
+        run: |
+          rm -f /etc/apt/sources.list.d/*
+          cat << EOF | tee /etc/apt/sources.list.d/debian-ports.list
+          deb http://snapshot.debian.org/archive/debian/20250515T202920Z/ trixie main
+          EOF
+          ( echo 'quiet "true";'; \
+            echo 'APT::Get::Assume-Yes "true";'; \
+            echo 'APT::Install-Recommends "false";'; \
+            echo 'Acquire::Check-Valid-Until "false";'; \
+            echo 'Acquire::Retries "5";'; \
+          ) > /etc/apt/apt.conf.d/99snapshot-repos
+
+          apt-get update
+          apt-get install -y ca-certificates debian-ports-archive-keyring cmake git zip
+          dpkg --add-architecture loong64
+
+          # Add arch-specific repositories for non-amd64 architectures
+          cat << EOF | tee /etc/apt/sources.list.d/loong64-ports.list
+          deb [arch=loong64] http://snapshot.debian.org/archive/debian-ports/20250515T194251Z/ sid main
+          EOF
+
+          apt-get update || true ;# Prevent failure due to missing URLs.
+
+          apt-get install -y --no-install-recommends \
+                  build-essential \
+                  glslc \
+                  gcc-14-loongarch64-linux-gnu \
+                  g++-14-loongarch64-linux-gnu \
+                  libvulkan-dev:loong64
+
+      - name: Build
+        run: |
+          cmake -B build -DLLAMA_CURL=OFF \
+                         -DCMAKE_BUILD_TYPE=Release \
+                         -DGGML_VULKAN=ON \
+                         -DGGML_OPENMP=OFF \
+                         -DLLAMA_BUILD_EXAMPLES=ON \
+                         -DLLAMA_BUILD_TOOLS=ON \
+                         -DLLAMA_BUILD_TESTS=OFF \
+                         -DCMAKE_SYSTEM_NAME=Linux \
+                         -DCMAKE_SYSTEM_PROCESSOR=loongarch64 \
+                         -DCMAKE_C_COMPILER=loongarch64-linux-gnu-gcc-14 \
+                         -DCMAKE_CXX_COMPILER=loongarch64-linux-gnu-g++-14 \
+                         -DCMAKE_POSITION_INDEPENDENT_CODE=ON \
+                         -DCMAKE_FIND_ROOT_PATH=/usr/lib/loongarch64-linux-gnu \
+                         -DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER \
+                         -DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=ONLY \
+                         -DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH
+
+          cmake --build build --config Release -j $(nproc)

common/common.cpp

Lines changed: 2 additions & 2 deletions
@@ -938,7 +938,7 @@ struct common_init_result common_init_from_params(common_params & params) {
         return iparams;
     }

-    if (params.ctx_shift && !llama_kv_self_can_shift(lctx)) {
+    if (params.ctx_shift && !llama_memory_can_shift(llama_get_memory(lctx))) {
         LOG_WRN("%s: KV cache shifting is not supported for this context, disabling KV cache shifting\n", __func__);
         params.ctx_shift = false;
     }
@@ -1045,7 +1045,7 @@ struct common_init_result common_init_from_params(common_params & params) {
     if (llama_model_has_decoder(model)) {
         llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch)));
     }
-    llama_kv_self_clear(lctx);
+    llama_memory_clear(llama_get_memory(lctx), true);
     llama_synchronize(lctx);
     llama_perf_context_reset(lctx);
     llama_set_warmup(lctx, false);

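The hunk above shows the core pattern of this commit: the deprecated llama_kv_self_* calls on the context are replaced by the llama_memory_* family, which operates on the handle returned by llama_get_memory() instead of on the context directly. A minimal sketch of the new usage, assuming only a valid llama_context * ctx; the comment on the clear flag reflects the convention visible across this diff (warmup and embedding paths pass true, benchmark paths pass false):

    // sketch only: the memory API as used by the calls in this commit
    llama_memory_t mem = llama_get_memory(ctx); // handle owned by the context, do not free

    if (!llama_memory_can_shift(mem)) {
        // this memory type cannot shift positions -> disable context shifting
    }

    // clear the cache; data == true also clears the data buffers,
    // data == false resets only the metadata (cheaper)
    llama_memory_clear(mem, /*data=*/true);
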
common/speculative.cpp

Lines changed: 6 additions & 4 deletions
@@ -144,6 +144,8 @@ llama_tokens common_speculative_gen_draft(
     auto & smpl   = spec->smpl;
     auto & prompt = spec->prompt;

+    auto * mem = llama_get_memory(ctx);
+
     int reuse_i = 0;
     int reuse_n = 0;

@@ -173,7 +175,7 @@ llama_tokens common_speculative_gen_draft(
     result.reserve(params.n_draft);

     if (reuse_n == 0) {
-        llama_kv_self_clear(ctx);
+        llama_memory_clear(mem, false);

         prompt.clear();
     } else {
@@ -192,14 +194,14 @@ llama_tokens common_speculative_gen_draft(
     }

     if (reuse_i > 0) {
-        llama_kv_self_seq_rm (ctx, 0, 0, reuse_i);
-        llama_kv_self_seq_add(ctx, 0, reuse_i, -1, -reuse_i);
+        llama_memory_seq_rm (mem, 0, 0, reuse_i);
+        llama_memory_seq_add(mem, 0, reuse_i, -1, -reuse_i);

         prompt.erase(prompt.begin(), prompt.begin() + reuse_i);
     }

     if (reuse_n < (int) prompt.size()) {
-        llama_kv_self_seq_rm (ctx, 0, reuse_n, -1);
+        llama_memory_seq_rm (mem, 0, reuse_n, -1);

         prompt.erase(prompt.begin() + reuse_n, prompt.end());
     }

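The seq_rm/seq_add pair above implements cache reuse for the draft context: positions are half-open ranges, and a p1 of -1 means "to the end of the sequence". A hedged restatement of just that step, with the range semantics spelled out (ctx and reuse_i as in the function above):

    auto * mem = llama_get_memory(ctx);

    // drop the stale cells at positions [0, reuse_i) of sequence 0 ...
    llama_memory_seq_rm (mem, 0, 0, reuse_i);
    // ... then shift the surviving cells [reuse_i, end) left by reuse_i,
    // so the reused suffix starts at position 0 again
    llama_memory_seq_add(mem, 0, reuse_i, -1, -reuse_i);
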
convert_hf_to_gguf.py

Lines changed: 1 addition & 2 deletions
@@ -3709,8 +3709,7 @@ def set_gguf_parameters(self):
         self._try_set_pooling_type()

         if self.cls_out_labels:
-            key_name = gguf.Keys.Classifier.OUTPUT_LABELS.format(arch = gguf.MODEL_ARCH_NAMES[self.model_arch])
-            self.gguf_writer.add_array(key_name, [v for k, v in sorted(self.cls_out_labels.items())])
+            self.gguf_writer.add_classifier_output_labels([v for k, v in sorted(self.cls_out_labels.items())])

     def set_vocab(self):
         tokens, toktypes, tokpre = self.get_vocab_base()

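add_classifier_output_labels writes the label strings into the GGUF metadata; on the C++ side they come back out through the accessors this commit uses in examples/embedding. A short sketch, assuming a loaded llama_model * model:

    // sketch: enumerate the classifier output labels stored in the model metadata
    const uint32_t n_cls_out = llama_model_n_cls_out(model);

    for (uint32_t i = 0; i < n_cls_out; i++) {
        const char * label = llama_model_cls_label(model, i); // may be nullptr
        printf("class %u: %s\n", i, label ? label : "(unnamed)");
    }
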
examples/batched.swift/Sources/main.swift

Lines changed: 1 addition & 1 deletion
@@ -116,7 +116,7 @@ if llama_decode(context, batch) != 0 {
 }

 for i in 1 ..< n_parallel {
-    llama_kv_self_seq_cp(context, 0, Int32(i), 0, batch.n_tokens)
+    llama_memory_seq_cp(llama_get_memory(context), 0, Int32(i), 0, batch.n_tokens)
 }

 if n_parallel > 1 {

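llama_memory_seq_cp makes the cached cells of one sequence visible to another over a position range, which is how a prompt decoded once as sequence 0 is fanned out to the parallel slots. The equivalent of the Swift loop above in C++, as a hedged sketch (n_parallel and n_tokens assumed from the surrounding example):

    auto * mem = llama_get_memory(ctx);

    // share the prompt cells [0, n_tokens) of sequence 0 with every other slot
    for (llama_seq_id s = 1; s < n_parallel; ++s) {
        llama_memory_seq_cp(mem, /*src=*/0, /*dst=*/s, /*p0=*/0, /*p1=*/n_tokens);
    }
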
examples/embedding/embedding.cpp

Lines changed: 18 additions & 3 deletions
@@ -37,7 +37,7 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu
     const enum llama_pooling_type pooling_type = llama_pooling_type(ctx);

     // clear previous kv_cache values (irrelevant for embeddings)
-    llama_kv_self_clear(ctx);
+    llama_memory_clear(llama_get_memory(ctx), true);

     // run model
     LOG_INF("%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq);
@@ -236,9 +236,24 @@ int main(int argc, char ** argv) {
                 LOG("\n");
             }
         } else if (pooling_type == LLAMA_POOLING_TYPE_RANK) {
+            const uint32_t n_cls_out = llama_model_n_cls_out(model);
+            std::vector<std::string> cls_out_labels;
+
+            for (uint32_t i = 0; i < n_cls_out; i++) {
+                const char * label = llama_model_cls_label(model, i);
+                const std::string label_i(label == nullptr ? "" : label);
+                cls_out_labels.emplace_back(label_i.empty() ? std::to_string(i) : label_i);
+            }
+
             for (int j = 0; j < n_embd_count; j++) {
-                // NOTE: if you change this log - update the tests in ci/run.sh
-                LOG("rerank score %d: %8.3f\n", j, emb[j * n_embd]);
+                for (uint32_t i = 0; i < n_cls_out; i++) {
+                    // NOTE: if you change this log - update the tests in ci/run.sh
+                    if (n_cls_out == 1) {
+                        LOG("rerank score %d: %8.3f\n", j, emb[j * n_embd]);
+                    } else {
+                        LOG("rerank score %d: %8.3f [%s]\n", j, emb[j * n_embd + i], cls_out_labels[i].c_str());
+                    }
+                }
             }
         } else {
             // print the first part of the embeddings or for a single prompt, the full embedding

examples/gritlm/gritlm.cpp

Lines changed: 2 additions & 2 deletions
@@ -45,7 +45,7 @@ static std::vector<std::vector<float>> encode(llama_context * ctx, const std::ve
     }

     // clear previous kv_cache values (irrelevant for embeddings)
-    llama_kv_self_clear(ctx);
+    llama_memory_clear(llama_get_memory(ctx), true);
     llama_set_embeddings(ctx, true);
     llama_set_causal_attn(ctx, false);

@@ -102,7 +102,7 @@ static std::string generate(llama_context * ctx, llama_sampler * smpl, const std

     llama_token eos_token = llama_vocab_eos(vocab);

-    llama_kv_self_clear(ctx);
+    llama_memory_clear(llama_get_memory(ctx), true);
     llama_set_embeddings(ctx, false);
     llama_set_causal_attn(ctx, true);

examples/llama.android/llama/src/main/cpp/llama-android.cpp

Lines changed: 4 additions & 4 deletions
@@ -194,7 +194,7 @@ Java_android_llama_cpp_LLamaAndroid_bench_1model(
     }

     batch->logits[batch->n_tokens - 1] = true;
-    llama_kv_self_clear(context);
+    llama_memory_clear(llama_get_memory(context), false);

     const auto t_pp_start = ggml_time_us();
     if (llama_decode(context, *batch) != 0) {
@@ -206,7 +206,7 @@ Java_android_llama_cpp_LLamaAndroid_bench_1model(

     LOGi("Benchmark text generation (tg)");

-    llama_kv_self_clear(context);
+    llama_memory_clear(llama_get_memory(context), false);
     const auto t_tg_start = ggml_time_us();
     for (i = 0; i < tg; i++) {

@@ -223,7 +223,7 @@ Java_android_llama_cpp_LLamaAndroid_bench_1model(

     const auto t_tg_end = ggml_time_us();

-    llama_kv_self_clear(context);
+    llama_memory_clear(llama_get_memory(context), false);

     const auto t_pp = double(t_pp_end - t_pp_start) / 1000000.0;
     const auto t_tg = double(t_tg_end - t_tg_start) / 1000000.0;
@@ -448,5 +448,5 @@ Java_android_llama_cpp_LLamaAndroid_completion_1loop(
 extern "C"
 JNIEXPORT void JNICALL
 Java_android_llama_cpp_LLamaAndroid_kv_1cache_1clear(JNIEnv *, jobject, jlong context) {
-    llama_kv_self_clear(reinterpret_cast<llama_context *>(context));
+    llama_memory_clear(llama_get_memory(reinterpret_cast<llama_context *>(context)), true);
 }

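Note that all three benchmark resets above pass false as the second argument: between the pp and tg phases only the cache metadata needs to be reset, and skipping the data-buffer clear keeps the reset cheap, while the user-facing kv_cache_clear entry point passes true for a full wipe. A tiny helper sketch of that convention, assuming a valid context:

    // sketch: the two reset conventions used in the benchmarking code above
    static void bench_reset(llama_context * ctx) {
        llama_memory_clear(llama_get_memory(ctx), /*data=*/false); // metadata only, fast
    }

    static void full_clear(llama_context * ctx) {
        llama_memory_clear(llama_get_memory(ctx), /*data=*/true);  // also wipe data buffers
    }
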
examples/llama.swiftui/llama.cpp.swift/LibLlama.swift

Lines changed: 4 additions & 4 deletions
@@ -210,7 +210,7 @@ actor LlamaContext {
         }
         batch.logits[Int(batch.n_tokens) - 1] = 1 // true

-        llama_kv_self_clear(context)
+        llama_memory_clear(llama_get_memory(context), false)

         let t_pp_start = DispatchTime.now().uptimeNanoseconds / 1000;

@@ -223,7 +223,7 @@ actor LlamaContext {

         // bench text generation

-        llama_kv_self_clear(context)
+        llama_memory_clear(llama_get_memory(context), false)

         let t_tg_start = DispatchTime.now().uptimeNanoseconds / 1000;

@@ -242,7 +242,7 @@ actor LlamaContext {

         let t_tg_end = DispatchTime.now().uptimeNanoseconds / 1000;

-        llama_kv_self_clear(context)
+        llama_memory_clear(llama_get_memory(context), false)

         let t_pp = Double(t_pp_end - t_pp_start) / 1000000.0
         let t_tg = Double(t_tg_end - t_tg_start) / 1000000.0
@@ -292,7 +292,7 @@ actor LlamaContext {
     func clear() {
         tokens_list.removeAll()
         temporary_invalid_cchars.removeAll()
-        llama_kv_self_clear(context)
+        llama_memory_clear(llama_get_memory(context), true)
     }

     private func tokenize(text: String, add_bos: Bool) -> [llama_token] {

examples/lookahead/lookahead.cpp

Lines changed: 8 additions & 6 deletions
@@ -60,6 +60,8 @@ int main(int argc, char ** argv) {
     llama_model * model = llama_init.model.get();
     llama_context * ctx = llama_init.context.get();

+    auto * mem = llama_get_memory(ctx);
+
     const llama_vocab * vocab = llama_model_get_vocab(model);

     // Tokenize the prompt
@@ -94,7 +96,7 @@ int main(int argc, char ** argv) {
     llama_decode(ctx, llama_batch_get_one(&inp.back(), 1));

     for (int s = 1; s < W + G + 1; ++s) {
-        llama_kv_self_seq_cp(ctx, 0, s, -1, -1);
+        llama_memory_seq_cp(mem, 0, s, -1, -1);
     }

     const auto t_enc_end = ggml_time_us();
@@ -427,17 +429,17 @@ int main(int argc, char ** argv) {

             // KV cache management
             // if no verification token matched, we simply remove all cells from this batch -> no fragmentation
-            llama_kv_self_seq_rm(ctx, -1, n_past, -1);
+            llama_memory_seq_rm(mem, -1, n_past, -1);

             if (seq_id_best != 0) {
                 // if a verification token matched, we keep the best sequence and remove the rest
                 // this leads to some KV cache fragmentation
-                llama_kv_self_seq_keep(ctx, seq_id_best);
-                llama_kv_self_seq_cp (ctx, seq_id_best, 0, -1, -1);
-                llama_kv_self_seq_rm (ctx, seq_id_best, -1, -1);
+                llama_memory_seq_keep(mem, seq_id_best);
+                llama_memory_seq_cp (mem, seq_id_best, 0, -1, -1);
+                llama_memory_seq_rm (mem, seq_id_best, -1, -1);

                 for (int s = 1; s < W + G + 1; ++s) {
-                    llama_kv_self_seq_cp(ctx, 0, s, -1, -1);
+                    llama_memory_seq_cp(mem, 0, s, -1, -1);
                 }
             }

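The sequence bookkeeping in that last hunk is the densest use of the new API: a position range of (-1, -1) means "the whole sequence", and the three calls collapse the winning draft branch back into sequence 0 before the lookahead window is re-shared. A hedged annotation of that step (mem, seq_id_best, W and G as in the example above):

    // keep only the cells that belong to the winning sequence
    llama_memory_seq_keep(mem, seq_id_best);
    // copy its whole range (-1, -1) onto sequence 0 ...
    llama_memory_seq_cp (mem, seq_id_best, 0, -1, -1);
    // ... and retire the original id
    llama_memory_seq_rm (mem, seq_id_best, -1, -1);

    // re-share sequence 0 with all W + G lookahead slots
    for (int s = 1; s < W + G + 1; ++s) {
        llama_memory_seq_cp(mem, 0, s, -1, -1);
    }
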