
Commit 8095657

author: prima (committed)
Merge remote-tracking branch 'origin/concedo_experimental' into remoteManagement
2 parents 92c5530 + 4403503, commit 8095657


51 files changed: +2378 / -1412 lines (only a subset of the changed files is shown below)

.github/workflows/release.yml

Lines changed: 1 addition & 1 deletion
@@ -448,7 +448,7 @@ jobs:
         shell: bash

     env:
-      WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/b380d914-366b-4b77-a74a-05e3c38b3514/intel-oneapi-base-toolkit-2025.0.0.882_offline.exe
+      WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/7cd9bba0-7aab-4e30-b3ae-2221006a4a05/intel-oneapi-base-toolkit-2025.1.1.34_offline.exe
       WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel:intel.oneapi.win.dnnl:intel.oneapi.win.tbb.devel
       ONEAPI_ROOT: "C:/Program Files (x86)/Intel/oneAPI"
     steps:

common/arg.cpp

Lines changed: 8 additions & 7 deletions
@@ -1446,6 +1446,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.n_keep = value;
         }
     ));
+    add_opt(common_arg(
+        {"--swa-full"},
+        string_format("use full-size SWA cache (default: %s)\n"
+            "[(more info)](https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)", params.swa_full ? "true" : "false"),
+        [](common_params & params) {
+            params.swa_full = true;
+        }
+    ).set_env("LLAMA_ARG_SWA_FULL"));
     add_opt(common_arg(
         {"--no-context-shift"},
         string_format("disables context shift on infinite text generation (default: %s)", params.ctx_shift ? "disabled" : "enabled"),
@@ -2058,13 +2066,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.grp_attn_w = value;
         }
     ).set_env("LLAMA_ARG_GRP_ATTN_W").set_examples({LLAMA_EXAMPLE_MAIN}));
-    add_opt(common_arg(
-        {"-dkvc", "--dump-kv-cache"},
-        "verbose print of the KV cache",
-        [](common_params & params) {
-            params.dump_kv_cache = true;
-        }
-    ));
     add_opt(common_arg(
         {"-nkvo", "--no-kv-offload"},
         "disable KV offload",

common/common.cpp

Lines changed: 4 additions & 75 deletions
@@ -1110,6 +1110,9 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
         mparams.tensor_buft_overrides = params.tensor_buft_overrides.data();
     }

+    mparams.progress_callback = params.load_progress_callback;
+    mparams.progress_callback_user_data = params.load_progress_callback_user_data;
+
     return mparams;
 }

@@ -1141,6 +1144,7 @@ struct llama_context_params common_context_params_to_llama(const common_params &
     cparams.flash_attn = params.flash_attn;
     cparams.no_perf = params.no_perf;
     cparams.op_offload = !params.no_op_offload;
+    cparams.swa_full = params.swa_full;

     if (params.reranking) {
         cparams.embeddings = true;
@@ -1333,81 +1337,6 @@ std::string common_detokenize(const struct llama_vocab * vocab, const std::vecto
     return text;
 }

-//
-// KV cache utils
-//
-
-void common_kv_cache_dump_view(const llama_kv_cache_view & view, int row_size) {
-    static const char slot_chars[] = ".123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz+";
-
-    printf("=== Dumping KV cache. total cells %d, max sequences per cell %d, populated cells %d, total tokens in cache %d, largest empty slot=%d @ %d",
-        view.n_cells, view.n_seq_max, view.used_cells, view.token_count, view.max_contiguous, view.max_contiguous_idx);
-
-    llama_kv_cache_view_cell * c_curr = view.cells;
-    llama_seq_id * cs_curr = view.cells_sequences;
-
-    for (int i = 0; i < view.n_cells; i++, c_curr++, cs_curr += view.n_seq_max) {
-        if (i % row_size == 0) {
-            printf("\n%5d: ", i);
-        }
-        int seq_count = 0;
-        for (int j = 0; j < view.n_seq_max; j++) {
-            if (cs_curr[j] >= 0) { seq_count++; }
-        }
-        putchar(slot_chars[std::min(sizeof(slot_chars) - 2, size_t(seq_count))]);
-    }
-
-    printf("\n=== Done dumping\n");
-}
-
-void common_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_size) {
-    static const char slot_chars[] = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";
-
-    printf("=== Dumping KV cache. total cells %d, max sequences per cell %d, populated cells %d, total tokens in cache %d, largest empty slot=%d @ %d\n",
-        view.n_cells, view.n_seq_max, view.used_cells, view.token_count, view.max_contiguous, view.max_contiguous_idx);
-
-    std::unordered_map<llama_seq_id, size_t> seqs;
-    llama_kv_cache_view_cell * c_curr = view.cells;
-    llama_seq_id * cs_curr = view.cells_sequences;
-
-    for (int i = 0; i < view.n_cells; i++, c_curr++, cs_curr += view.n_seq_max) {
-        for (int j = 0; j < view.n_seq_max; j++) {
-            if (cs_curr[j] < 0) { continue; }
-            if (seqs.find(cs_curr[j]) == seqs.end()) {
-                if (seqs.size() + 1 >= sizeof(slot_chars)) { break; }
-                const size_t sz = seqs.size();
-                seqs[cs_curr[j]] = sz;
-            }
-        }
-        if (seqs.size() + 1 >= sizeof(slot_chars)) { break; }
-    }
-
-    printf("=== Sequence legend: ");
-    for (const auto & it : seqs) {
-        printf("%zu=%d, ", it.second, it.first);
-    }
-    printf("'+'=other sequence ids");
-
-    c_curr = view.cells;
-    cs_curr = view.cells_sequences;
-    for (int i = 0; i < view.n_cells; i++, c_curr++, cs_curr += view.n_seq_max) {
-        if (i % row_size == 0) {
-            printf("\n%5d: ", i);
-        }
-        for (int j = 0; j < view.n_seq_max; j++) {
-            if (cs_curr[j] >= 0) {
-                const auto & it = seqs.find(cs_curr[j]);
-                putchar(it != seqs.end() ? int(slot_chars[it->second]) : '+');
-            } else {
-                putchar('.');
-            }
-        }
-        putchar(' ');
-    }
-
-    printf("\n=== Done dumping\n");
-}
-
 //
 // Embedding utils
 //

common/common.h

Lines changed: 6 additions & 11 deletions
@@ -319,13 +319,13 @@ struct common_params {
     bool flash_attn = false; // flash attention
     bool no_perf = false; // disable performance metrics
     bool ctx_shift = true; // context shift on inifinite text generation
+    bool swa_full = false; // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)

     bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
     bool use_mmap = true; // use mmap for faster loads
     bool use_mlock = false; // use mlock to keep model in memory
     bool verbose_prompt = false; // print prompt tokens before generation
     bool display_prompt = true; // print prompt before generation
-    bool dump_kv_cache = false; // dump the KV cache contents for debugging purposes
     bool no_kv_offload = false; // disable KV offloading
     bool warmup = true; // warmup run
     bool check_tensors = false; // validate tensor data
@@ -424,6 +424,11 @@ struct common_params {

     // common params
     std::string out_file; // output filename for all example programs
+    // optional callback for model loading progress and cancellation:
+    // called with a progress value between 0.0 and 1.0.
+    // return false from callback to abort model loading or true to continue
+    llama_progress_callback load_progress_callback = NULL;
+    void * load_progress_callback_user_data = NULL;
 };

 // call once at the start of a program if it uses libcommon
@@ -612,16 +617,6 @@ std::string common_detokenize(
         const std::vector<llama_token> & tokens,
         bool special = true);

-//
-// KV cache utils
-//
-
-// Dump the KV cache view with the number of sequences per cell.
-void common_kv_cache_dump_view(const llama_kv_cache_view & view, int row_size = 80);
-
-// Dump the KV cache view showing individual sequences in each cell (long output).
-void common_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_size = 40);
-
 //
 // Embedding utils
 //
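
The new load-progress hook reuses the `llama_progress_callback` signature from llama.h: the loader reports a value between 0.0 and 1.0 and stops early if the callback returns false. A minimal usage sketch for a program built on libcommon follows; it is illustrative only and not part of the commit (the printing callback and the `main()` wrapper are assumptions).

```cpp
// Hypothetical usage sketch for the new common_params fields added above.
#include "common.h"

#include <cstdio>

// Matches the llama_progress_callback signature: progress is in [0.0, 1.0];
// returning false aborts the model load, returning true continues it.
static bool print_load_progress(float progress, void * user_data) {
    (void) user_data;
    fprintf(stderr, "\rloading model: %3d%%", (int) (progress * 100));
    return true;
}

int main() {
    common_params params;
    params.load_progress_callback           = print_load_progress;
    params.load_progress_callback_user_data = nullptr;
    // common_model_params_to_llama(params) now forwards these two fields to
    // llama_model_params.progress_callback / progress_callback_user_data,
    // so a subsequent common_init_from_params()-style load reports here.
    return 0;
}
```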

convert_hf_to_gguf.py

Lines changed: 21 additions & 0 deletions
@@ -308,6 +308,7 @@ def prepare_tensors(self):
                        gguf.MODEL_TENSOR.TIME_MIX_LERP_FUSED,
                        gguf.MODEL_TENSOR.POSNET_NORM1,
                        gguf.MODEL_TENSOR.POSNET_NORM2,
+                        gguf.MODEL_TENSOR.V_ENC_EMBD_POS,
                    )
                )
                or not new_name.endswith(".weight")
@@ -2092,6 +2093,26 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None):
         return super().modify_tensors(data_torch, name, bid)


+@ModelBase.register("Llama4ForConditionalGeneration")
+class Llama4VisionModel(VisionModel):
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        self.gguf_writer.add_vision_projector_type(gguf.VisionProjectorType.LLAMA4)
+        self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams["norm_eps"])
+        self.gguf_writer.add_vision_projector_scale_factor(int(1.0 / self.hparams["pixel_shuffle_ratio"]))
+        assert self.hparams["hidden_act"] == "gelu"
+        self.gguf_writer.add_vision_use_gelu(True)
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        del bid  # unused
+        if "multi_modal_projector" in name or "vision_model" in name:
+            # process vision tensors
+            if "positional_embedding_vlm" in name and ".weight" not in name:
+                name += ".weight"
+            return [(self.map_tensor_name(name), data_torch)]
+        return []
+
+
 @ModelBase.register("Mistral3ForConditionalGeneration")
 class Mistral3Model(LlamaModel):
     model_arch = gguf.MODEL_ARCH.LLAMA

docs/multimodal.md

Lines changed: 3 additions & 0 deletions
@@ -74,4 +74,7 @@ NOTE: some models may require large context window, for example: `-c 8192`
 (tool_name) -hf ggml-org/InternVL3-2B-Instruct-GGUF
 (tool_name) -hf ggml-org/InternVL3-8B-Instruct-GGUF
 (tool_name) -hf ggml-org/InternVL3-14B-Instruct-GGUF
+
+# Llama 4 Scout
+(tool_name) -hf ggml-org/Llama-4-Scout-17B-16E-Instruct-GGUF
 ```

examples/sycl/run-llama3.sh

Lines changed: 28 additions & 0 deletions
@@ -0,0 +1,28 @@
+#!/bin/bash
+
+# MIT license
+# Copyright (C) 2025 Intel Corporation
+# SPDX-License-Identifier: MIT
+
+# If you want more control, DPC++ Allows selecting a specific device through the
+# following environment variable
+#export ONEAPI_DEVICE_SELECTOR="level_zero:0"
+source /opt/intel/oneapi/setvars.sh
+
+#export GGML_SYCL_DEBUG=1
+
+#ZES_ENABLE_SYSMAN=1, Support to get free memory of GPU by sycl::aspect::ext_intel_free_memory. Recommended to use when --split-mode = layer.
+
+INPUT_PROMPT="Building a website can be done in 10 simple steps:\nStep 1:"
+MODEL_FILE=models/Meta-Llama-3.1-8B-Instruct-Q4_K_M.gguf
+NGL=99 # Layers offloaded to the GPU. If the device runs out of memory, reduce this value according to the model you are using.
+CONTEXT=4096
+
+if [ $# -gt 0 ]; then
+    GGML_SYCL_DEVICE=$1
+    echo "Using $GGML_SYCL_DEVICE as the main GPU"
+    ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m ${MODEL_FILE} -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -c ${CONTEXT} -mg $GGML_SYCL_DEVICE -sm none
+else
+    #use multiple GPUs with same max compute units
+    ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m ${MODEL_FILE} -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -c ${CONTEXT}
+fi

examples/sycl/win-run-llama3.bat

Lines changed: 9 additions & 0 deletions
@@ -0,0 +1,9 @@
+:: MIT license
+:: Copyright (C) 2024 Intel Corporation
+:: SPDX-License-Identifier: MIT
+
+set INPUT2="Building a website can be done in 10 simple steps:\nStep 1:"
+@call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 --force
+
+
+.\build\bin\llama-cli.exe -m models\Meta-Llama-3.1-8B-Instruct-Q4_K_M.gguf -p %INPUT2% -n 400 -e -ngl 99

ggml/include/ggml-opt.h

Lines changed: 2 additions & 0 deletions
@@ -128,6 +128,8 @@ extern "C" {
     // set gradients to zero, initilize loss, and optionally reset the optimizer
     GGML_API void ggml_opt_reset(ggml_opt_context_t opt_ctx, bool optimizer);

+    GGML_API bool ggml_opt_static_graphs(ggml_opt_context_t opt_ctx); // whether the graphs are allocated_statically
+
     // get underlying tensors that store data
    // if not using static graphs these pointers become invalid with the next call to ggml_opt_alloc
     GGML_API struct ggml_tensor * ggml_opt_inputs( ggml_opt_context_t opt_ctx); // forward graph input tensor
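
A small caller-side sketch for the new query (not from the commit; the caching helper and variable names are invented for illustration). It follows the warning in the surrounding comments: tensor pointers from `ggml_opt_inputs()` should only be cached when the graphs are static.

```cpp
// Hypothetical caller-side check built on the new ggml_opt_static_graphs().
#include "ggml-opt.h"

static struct ggml_tensor * cached_inputs = NULL;

void refresh_input_tensor(ggml_opt_context_t opt_ctx) {
    if (ggml_opt_static_graphs(opt_ctx)) {
        // static graphs: the pointer stays valid across optimization steps
        cached_inputs = ggml_opt_inputs(opt_ctx);
    } else {
        // non-static graphs: re-query after every ggml_opt_alloc() instead
        cached_inputs = NULL;
    }
}
```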

ggml/src/ggml-cuda/cpy.cu

Lines changed: 11 additions & 1 deletion
@@ -1,5 +1,8 @@
 #include "cpy.cuh"
 #include "dequantize.cuh"
+#ifdef GGML_USE_MUSA
+#include "ggml-musa/mudnn.cuh"
+#endif // GGML_USE_MUSA

 typedef void (*cpy_kernel_t)(const char * cx, char * cdst);

@@ -597,7 +600,14 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
 #endif
     if (src0->type == src1->type && ggml_is_contiguous(src0) && ggml_is_contiguous(src1)) {
         GGML_ASSERT(ggml_nbytes(src0) == ggml_nbytes(src1));
-        CUDA_CHECK(cudaMemcpyAsync(src1_ddc, src0_ddc, ggml_nbytes(src0), cudaMemcpyDeviceToDevice, main_stream));
+#ifdef GGML_USE_MUSA
+        if (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16) {
+            CUDA_CHECK(mudnnMemcpyAsync(ctx, src1, src0));
+        } else
+#endif // GGML_USE_MUSA
+        {
+            CUDA_CHECK(cudaMemcpyAsync(src1_ddc, src0_ddc, ggml_nbytes(src0), cudaMemcpyDeviceToDevice, main_stream));
+        }
     } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) {
         ggml_cpy_f32_f32_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
     } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_BF16) {
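
The preprocessor layout in the hunk above is deliberate: the `} else` sits before the `#endif`, so MUSA builds get an F32/F16 fast path through `mudnnMemcpyAsync` while every other case, and every non-MUSA build, falls through to the original `cudaMemcpyAsync` block. A stripped-down illustration of the same pattern, with generic names rather than the actual kernel code:

```cpp
#include <cstdio>

// The fallback block is shared: with GGML_USE_MUSA defined it becomes the
// else-branch of the fast-path check; without it, it is the only branch.
void copy_dispatch(bool fast_path_ok) {
#ifdef GGML_USE_MUSA
    if (fast_path_ok) {
        std::puts("muDNN copy path");
    } else
#endif // GGML_USE_MUSA
    {
        (void) fast_path_ok; // unused in non-MUSA builds
        std::puts("generic cudaMemcpyAsync path");
    }
}
```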
