
Commit b0f4749

Merge branch 'remoteManagement' into remoteman_stripped

2 parents 68841dc + cc7e627

38 files changed (+899 −625 lines)

.github/workflows/kcpp-build-release-arm64.yaml

Lines changed: 2 additions & 0 deletions
@@ -78,6 +78,8 @@ jobs:
           --add-data './kcpp_adapters:./kcpp_adapters' \
           --add-data './koboldcpp.py:.' \
           --add-data './json_to_gbnf.py:.' \
+          --add-data "./LICENSE.md:." \
+          --add-data "./MIT_LICENSE_GGML_SDCPP_LLAMACPP_ONLY.md:." \
           --add-data './klite.embd:.' \
           --add-data './kcpp_docs.embd:.' \
           --add-data './kcpp_sdui.embd:.' \
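This change (and the matching macOS one below) bundles the two license files into the PyInstaller onefile builds. At runtime, files added with `--add-data` are unpacked to a temporary directory that the PyInstaller bootloader exposes as `sys._MEIPASS`. A minimal sketch of how such a bundled file can be located; the helper name here is illustrative, not taken from koboldcpp.py:

```python
# Sketch: locating a file bundled via PyInstaller's --add-data './LICENSE.md:.'
# sys._MEIPASS is set by the PyInstaller bootloader in onefile builds;
# outside a bundle, fall back to the script's own directory.
import os
import sys

def bundled_path(name: str) -> str:  # hypothetical helper, for illustration
    base = getattr(sys, "_MEIPASS", os.path.dirname(os.path.abspath(__file__)))
    return os.path.join(base, name)

print(bundled_path("LICENSE.md"))
```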

.github/workflows/kcpp-build-release-osx.yaml

Lines changed: 1 addition & 1 deletion
@@ -36,7 +36,7 @@ jobs:
         make LLAMA_METAL=1 LLAMA_PORTABLE=1
         chmod +x './create_ver_file.sh'
         . create_ver_file.sh
-        pyinstaller --noconfirm --onefile --collect-all customtkinter --collect-all psutil --collect-all pdfplumber --add-data './koboldcpp_default.so:.' --add-data './ggml-metal-merged.metal:.' --add-data './kcpp_adapters:./kcpp_adapters' --add-data './koboldcpp.py:.' --add-data './json_to_gbnf.py:.' --add-data './klite.embd:.' --add-data './kcpp_docs.embd:.' --add-data './kcpp_sdui.embd:.' --add-data './taesd.embd:.' --add-data './taesd_xl.embd:.' --add-data './taesd_f.embd:.' --add-data './taesd_3.embd:.' --add-data './rwkv_vocab.embd:.' --add-data './rwkv_world_vocab.embd:.' --version-file './version.txt' --clean --console koboldcpp.py -n "koboldcpp-mac-arm64"
+        pyinstaller --noconfirm --onefile --collect-all customtkinter --collect-all psutil --collect-all pdfplumber --add-data './koboldcpp_default.so:.' --add-data './ggml-metal-merged.metal:.' --add-data './kcpp_adapters:./kcpp_adapters' --add-data './koboldcpp.py:.' --add-data './json_to_gbnf.py:.' --add-data './LICENSE.md:.' --add-data './MIT_LICENSE_GGML_SDCPP_LLAMACPP_ONLY.md:.' --add-data './klite.embd:.' --add-data './kcpp_docs.embd:.' --add-data './kcpp_sdui.embd:.' --add-data './taesd.embd:.' --add-data './taesd_xl.embd:.' --add-data './taesd_f.embd:.' --add-data './taesd_3.embd:.' --add-data './rwkv_vocab.embd:.' --add-data './rwkv_world_vocab.embd:.' --version-file './version.txt' --clean --console koboldcpp.py -n "koboldcpp-mac-arm64"

     - name: Test
       id: test

.github/workflows/release.yml

Lines changed: 8 additions & 1 deletion
@@ -238,14 +238,19 @@ jobs:
       matrix:
         include:
           - build: 'cpu-x64'
+            arch: 'x64'
             defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/x64-windows-llvm.cmake -DGGML_NATIVE=OFF -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_OPENMP=OFF'
           #- build: 'openblas-x64'
+          #  arch: 'x64'
           #  defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/x64-windows-llvm.cmake -DGGML_NATIVE=OFF -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_OPENMP=OFF -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"'
           - build: 'vulkan-x64'
+            arch: 'x64'
             defines: '-DGGML_NATIVE=OFF -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_VULKAN=ON'
           - build: 'cpu-arm64'
+            arch: 'arm64'
             defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DGGML_NATIVE=OFF'
           - build: 'opencl-adreno-arm64'
+            arch: 'arm64'
             defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/opencl-arm64-release" -DGGML_OPENCL=ON -DGGML_OPENCL_USE_ADRENO_KERNELS=ON'

       steps:

@@ -312,6 +317,8 @@ jobs:
       - name: libCURL
         id: get_libcurl
         uses: ./.github/actions/windows-setup-curl
+        with:
+          architecture: ${{ matrix.arch == 'x64' && 'win64' || 'win64a' }}

       - name: Build
         id: cmake_build

@@ -339,7 +346,7 @@ jobs:
         env:
           CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
         run: |
-          Copy-Item $env:CURL_PATH\bin\libcurl-x64.dll .\build\bin\Release\libcurl-x64.dll
+          Copy-Item $env:CURL_PATH\bin\libcurl-${{ matrix.arch }}.dll .\build\bin\Release\
           7z a llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}.zip .\build\bin\Release\*

       - name: Upload artifacts
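The new `arch` matrix key is threaded through two places: the curl setup action receives an architecture-specific package flavor via the GitHub Actions `&&`/`||` ternary idiom, and the packaging step copies the matching DLL by name. A small sketch of the equivalent selection logic; the function names are ours, for illustration only:

```python
# Sketch of the arch plumbing above (names are illustrative, not from the repo).
def curl_flavor(arch: str) -> str:
    # mirrors ${{ matrix.arch == 'x64' && 'win64' || 'win64a' }}
    return "win64" if arch == "x64" else "win64a"

def curl_dll(arch: str) -> str:
    # mirrors Copy-Item $env:CURL_PATH\bin\libcurl-<arch>.dll
    return f"libcurl-{arch}.dll"

assert curl_flavor("arm64") == "win64a"
assert curl_dll("x64") == "libcurl-x64.dll"
```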

MIT_LICENSE_GGML_LLAMACPP_ONLY renamed to MIT_LICENSE_GGML_SDCPP_LLAMACPP_ONLY.md

Lines changed: 10 additions & 2 deletions
@@ -22,5 +22,13 @@ SOFTWARE.

 ===================================

-Note that the above license applies ONLY to the GGML library and llama.cpp by ggerganov which are licensed under the MIT License
-KoboldAI Lite by Concedo and the provided python ctypes bindings in koboldcpp dlls are licensed under the AGPL v3.0 License
+Note that the above MIT license applies ONLY to the GGML library, the UNMODIFIED stable-diffusion.cpp portions and llama.cpp portions which are licensed under the MIT License
+
+KoboldAI Lite and the rest of KoboldCpp python and C++ code are licensed under the AGPL v3.0 License
+
+- Llama.cpp source repo is at https://github.com/ggml-org/llama.cpp (MIT)
+- Stable-diffusion.cpp source repo is at https://github.com/leejet/stable-diffusion.cpp (MIT)
+- KoboldCpp source repo is at https://github.com/LostRuins/koboldcpp (AGPL)
+- KoboldAI Lite source repo is at https://github.com/LostRuins/lite.koboldai.net (AGPL)
+
+For any further enquiries, contact @concedo on discord, or LostRuins on github.

README.md

Lines changed: 6 additions & 1 deletion
@@ -293,9 +293,14 @@ and it will install everything required. Alternatively, you can download the abo
 - Since v1.75, openblas has been deprecated and removed in favor of the native CPU implementation.

 ## License
-- The original GGML library and llama.cpp by ggerganov are licensed under the MIT License
+- The original GGML library, stable-diffusion.cpp and llama.cpp by ggerganov are licensed under the MIT License
 - However, KoboldAI Lite is licensed under the AGPL v3.0 License
 - KoboldCpp code and other files are also under the AGPL v3.0 License unless otherwise stated
+- Llama.cpp source repo is at https://github.com/ggml-org/llama.cpp (MIT)
+- Stable-diffusion.cpp source repo is at https://github.com/leejet/stable-diffusion.cpp (MIT)
+- KoboldCpp source repo is at https://github.com/LostRuins/koboldcpp (AGPL)
+- KoboldAI Lite source repo is at https://github.com/LostRuins/lite.koboldai.net (AGPL)
+- For any further enquiries, contact @concedo on discord, or LostRuins on github.

 ## Notes
 - If you wish, after building the koboldcpp libraries with `make`, you can rebuild the exe yourself with pyinstaller by using `make_pyinstaller.bat`

common/arg.cpp

Lines changed: 12 additions & 2 deletions
@@ -2586,7 +2586,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params, int value) {
             params.n_junk = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_PASSKEY}));
+    ).set_examples({LLAMA_EXAMPLE_PASSKEY, LLAMA_EXAMPLE_PARALLEL}));
     add_opt(common_arg(
         {"--pos"}, "N",
         string_format("position of the passkey in the junk text (default: %d)", params.i_pos),

@@ -2649,7 +2649,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params) {
             params.is_pp_shared = true;
         }
-    ).set_examples({LLAMA_EXAMPLE_BENCH}));
+    ).set_examples({LLAMA_EXAMPLE_BENCH, LLAMA_EXAMPLE_PARALLEL}));
     add_opt(common_arg(
         {"-npp"}, "n0,n1,...",
         "number of prompt tokens",

@@ -2881,6 +2881,16 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.chat_template = read_file(value);
         }
     ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CHAT_TEMPLATE_FILE"));
+    add_opt(common_arg(
+        {"--no-prefill-assistant"},
+        string_format(
+            "whether to prefill the assistant's response if the last message is an assistant message (default: prefill enabled)\n"
+            "when this flag is set, if the last message is an assistant message then it will be treated as a full message and not prefilled\n"
+        ),
+        [](common_params & params) {
+            params.prefill_assistant = false;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_PREFILL_ASSISTANT"));
     add_opt(common_arg(
         {"-sps", "--slot-prompt-similarity"}, "SIMILARITY",
         string_format("how much the prompt of a request must match the prompt of a slot in order to use that slot (default: %.2f, 0.0 = disabled)\n", params.slot_prompt_similarity),

common/common.h

Lines changed: 1 addition & 0 deletions
@@ -364,6 +364,7 @@ struct common_params {
     bool use_jinja = false; // NOLINT
     bool enable_chat_template = true;
     common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
+    bool prefill_assistant = true; // if true, any trailing assistant message will be prefilled into the response

     std::vector<std::string> api_keys;
ggml/src/ggml-metal/ggml-metal.m

Lines changed: 33 additions & 1 deletion
@@ -415,6 +415,13 @@ static void ggml_backend_metal_device_rel(struct ggml_backend_metal_device_conte
     GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_HK192_HV128,
     GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_H256,
     GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_HK576_HV512,
+    GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H64,
+    GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_BF16_H64,
+    GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_0_H64,
+    GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_1_H64,
+    GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_0_H64,
+    GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_1_H64,
+    GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q8_0_H64,
     GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H96,
     GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_BF16_H96,
     GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_0_H96,

@@ -1362,6 +1369,13 @@ @implementation GGMLMetalClass
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_HK192_HV128, flash_attn_ext_q8_0_hk192_hv128, has_simdgroup_mm);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_H256, flash_attn_ext_q8_0_h256, has_simdgroup_mm);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_HK576_HV512, flash_attn_ext_q8_0_hk576_hv512, has_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H64, flash_attn_ext_vec_f16_h64, has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_BF16_H64, flash_attn_ext_vec_bf16_h64, has_simdgroup_reduction && use_bfloat);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_0_H64, flash_attn_ext_vec_q4_0_h64, has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_1_H64, flash_attn_ext_vec_q4_1_h64, has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_0_H64, flash_attn_ext_vec_q5_0_h64, has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_1_H64, flash_attn_ext_vec_q5_1_h64, has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q8_0_H64, flash_attn_ext_vec_q8_0_h64, has_simdgroup_reduction);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H96, flash_attn_ext_vec_f16_h96, has_simdgroup_reduction);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_BF16_H96, flash_attn_ext_vec_bf16_h96, has_simdgroup_reduction && use_bfloat);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_0_H96, flash_attn_ext_vec_q4_0_h96, has_simdgroup_reduction);

@@ -4358,7 +4372,7 @@ static bool ggml_metal_encode_node(
             // TODO: add vec kernels for (ne00%64 == 0) and maybe also for (ne00%32 == 0)
             //       for now avoiding mainly to keep the number of templates/kernels a bit lower
             //       these are now trivial to add after: https://github.com/ggml-org/llama.cpp/pull/12612
-            if (ne01 >= 20 || (ne00%128 != 0 && ne00 != 96 && ne00 != 192 && ne00 != 576)) {
+            if (ne01 >= 20 || (ne00%128 != 0 && ne00 != 64 && ne00 != 96 && ne00 != 192 && ne00 != 576)) {
                 switch (src1->type) {
                     case GGML_TYPE_F16:
                         {

@@ -4539,6 +4553,24 @@ static bool ggml_metal_encode_node(
             use_vec_kernel = true;

             switch (ne00) {
+                case 64:
+                    {
+                        switch (src1->type) {
+                            case GGML_TYPE_F16:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H64].pipeline;  break;
+                            case GGML_TYPE_BF16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_BF16_H64].pipeline; break;
+                            case GGML_TYPE_Q4_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_0_H64].pipeline; break;
+                            case GGML_TYPE_Q4_1: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_1_H64].pipeline; break;
+                            case GGML_TYPE_Q5_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_0_H64].pipeline; break;
+                            case GGML_TYPE_Q5_1: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_1_H64].pipeline; break;
+                            case GGML_TYPE_Q8_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q8_0_H64].pipeline; break;
+                            default:
+                                {
+                                    GGML_LOG_ERROR("unsupported type: %d\n", src1->type);
+                                    GGML_LOG_ERROR("add template specialization for this type\n");
+                                    GGML_ABORT("add template specialization for this type");
+                                }
+                        }
+                    } break;
                 case 96:
                     {
                         switch (src1->type) {
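The dispatch change is easiest to read inverted: the vec flash-attention kernels are now chosen for small batches whenever the head size is a multiple of 128 or one of 64, 96, 192, 576. A paraphrase of the check as a sketch, with variable meanings taken from the surrounding code (ne01 = number of query rows, ne00 = head size):

```python
# Paraphrase of the updated vec-kernel dispatch check (sketch, not ggml code).
def use_vec_kernel(ne01: int, ne00: int) -> bool:
    supported_head = (ne00 % 128 == 0) or ne00 in (64, 96, 192, 576)
    return ne01 < 20 and supported_head

assert use_vec_kernel(1, 64)        # single-token decode, head size 64: newly covered
assert not use_vec_kernel(512, 64)  # large batch: simdgroup-matrix kernels instead
assert not use_vec_kernel(1, 80)    # unsupported head size: fallback path
```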

ggml/src/ggml-metal/ggml-metal.metal

Lines changed: 10 additions & 0 deletions
@@ -4124,6 +4124,16 @@ kernel void kernel_flash_attn_ext_vec(

 typedef decltype(kernel_flash_attn_ext_vec<FA_TYPES, half4, 1, dequantize_f16_t4, half4, 1, dequantize_f16_t4, 128, 128, 4>) flash_attn_ext_vec_t;

+template [[host_name("kernel_flash_attn_ext_vec_f16_h64")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, half4, 1, dequantize_f16_t4, half4, 1, dequantize_f16_t4, 64, 64, 8>;
+#if defined(GGML_METAL_USE_BF16)
+template [[host_name("kernel_flash_attn_ext_vec_bf16_h64")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, bfloat4, 1, dequantize_bf16_t4, bfloat4, 1, dequantize_bf16_t4, 64, 64, 8>;
+#endif
+template [[host_name("kernel_flash_attn_ext_vec_q4_0_h64")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, block_q4_0, 8, dequantize_q4_0_t4, block_q4_0, 8, dequantize_q4_0_t4, 64, 64, 8>;
+template [[host_name("kernel_flash_attn_ext_vec_q4_1_h64")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, block_q4_1, 8, dequantize_q4_1_t4, block_q4_1, 8, dequantize_q4_1_t4, 64, 64, 8>;
+template [[host_name("kernel_flash_attn_ext_vec_q5_0_h64")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, block_q5_0, 8, dequantize_q5_0_t4, block_q5_0, 8, dequantize_q5_0_t4, 64, 64, 8>;
+template [[host_name("kernel_flash_attn_ext_vec_q5_1_h64")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, block_q5_1, 8, dequantize_q5_1_t4, block_q5_1, 8, dequantize_q5_1_t4, 64, 64, 8>;
+template [[host_name("kernel_flash_attn_ext_vec_q8_0_h64")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, block_q8_0, 8, dequantize_q8_0_t4, block_q8_0, 8, dequantize_q8_0_t4, 64, 64, 8>;
+
 template [[host_name("kernel_flash_attn_ext_vec_f16_h96")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, half4, 1, dequantize_f16_t4, half4, 1, dequantize_f16_t4, 96, 96, 4>;
 #if defined(GGML_METAL_USE_BF16)
 template [[host_name("kernel_flash_attn_ext_vec_bf16_h96")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, bfloat4, 1, dequantize_bf16_t4, bfloat4, 1, dequantize_bf16_t4, 96, 96, 4>;

ggml/src/ggml-vulkan/ggml-vulkan.cpp

Lines changed: 7 additions & 0 deletions
@@ -5896,10 +5896,17 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx
     vk_pipeline *pipelines;
     bool small_rows = N <= get_fa_num_small_rows(path);

+    // coopmat1 does not actually support "small rows" (it needs 16 rows).
+    // So use scalar instead.
     if (small_rows && path == FA_COOPMAT1) {
         path = FA_SCALAR;
     }

+    // scalar is faster than coopmat2 when N==1
+    if (N == 1 && path == FA_COOPMAT2) {
+        path = FA_SCALAR;
+    }
+
     bool f32acc = path == FA_SCALAR || dst->op_params[3] == GGML_PREC_F32;

     switch (path) {
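Condensed, the Vulkan flash-attention backend now demotes two cases to the scalar path: coopmat1 when the workload has too few rows, and coopmat2 for single-row (N == 1, token-by-token decode) workloads. A minimal sketch of the selection ladder, mirroring the diff above rather than the full decision logic:

```python
# Sketch of the path demotion logic added above (mirrors the diff only).
FA_SCALAR, FA_COOPMAT1, FA_COOPMAT2 = "scalar", "coopmat1", "coopmat2"

def pick_fa_path(path, N, small_rows):
    if small_rows and path == FA_COOPMAT1:
        path = FA_SCALAR  # coopmat1 has no small-rows variant (needs 16 rows)
    if N == 1 and path == FA_COOPMAT2:
        path = FA_SCALAR  # scalar outperforms coopmat2 for single-row decode
    return path

assert pick_fa_path(FA_COOPMAT2, N=1, small_rows=True) == FA_SCALAR
```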
