
Commit cb6d4b3

Merge branch 'master' into master_fix
2 parents 7c1a685 + 09186fa commit cb6d4b3

156 files changed: +23032 additions, -21130 deletions


.github/workflows/build.yml

Lines changed: 13 additions & 15 deletions
@@ -60,8 +60,7 @@ jobs:
     -DLLAMA_CURL=ON \
     -DGGML_METAL_USE_BF16=ON \
     -DGGML_METAL_EMBED_LIBRARY=ON \
-    -DGGML_RPC=ON \
-    -DBUILD_SHARED_LIBS=OFF
+    -DGGML_RPC=ON
   cmake --build . --config Release -j $(sysctl -n hw.logicalcpu)
 
 - name: Test
@@ -123,8 +122,7 @@ jobs:
     -DLLAMA_FATAL_WARNINGS=ON \
     -DLLAMA_CURL=ON \
     -DGGML_METAL=OFF \
-    -DGGML_RPC=ON \
-    -DBUILD_SHARED_LIBS=OFF
+    -DGGML_RPC=ON
   cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)
 
 - name: Test
@@ -181,7 +179,7 @@ jobs:
 run: |
   mkdir build
   cd build
-  cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_CURL=ON -DGGML_RPC=ON -DBUILD_SHARED_LIBS=OFF
+  cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_CURL=ON -DGGML_RPC=ON
   cmake --build . --config Release -j $(nproc)
 
 - name: Test
@@ -651,23 +649,23 @@ jobs:
 matrix:
   include:
     - build: 'noavx-x64'
-      defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DBUILD_SHARED_LIBS=ON'
+      defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF'
     - build: 'avx2-x64'
-      defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DBUILD_SHARED_LIBS=ON'
+      defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON'
     - build: 'avx-x64'
-      defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_AVX2=OFF -DBUILD_SHARED_LIBS=ON'
+      defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_AVX2=OFF'
     - build: 'avx512-x64'
-      defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_AVX512=ON -DBUILD_SHARED_LIBS=ON'
+      defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_AVX512=ON'
     - build: 'openblas-x64'
-      defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_BLAS=ON -DBUILD_SHARED_LIBS=ON -DGGML_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"'
+      defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"'
     - build: 'kompute-x64'
-      defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_KOMPUTE=ON -DKOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK=ON -DBUILD_SHARED_LIBS=ON'
+      defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_KOMPUTE=ON -DKOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK=ON'
     - build: 'vulkan-x64'
-      defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_VULKAN=ON -DBUILD_SHARED_LIBS=ON'
+      defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_VULKAN=ON'
     - build: 'llvm-arm64'
-      defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'
+      defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON'
     - build: 'msvc-arm64'
-      defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-msvc.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'
+      defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-msvc.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON'
     - build: 'llvm-arm64-opencl-adreno'
       defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/opencl-arm64-release" -DGGML_OPENCL=ON -DGGML_OPENCL_USE_ADRENO_KERNELS=ON'
 
@@ -914,7 +912,7 @@ jobs:
 shell: cmd
 run: |
   call "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\VC\Auxiliary\Build\vcvars64.bat"
-  cmake -S . -B build -G "Ninja Multi-Config" -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=ON -DGGML_RPC=ON
+  cmake -S . -B build -G "Ninja Multi-Config" -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DGGML_RPC=ON
   set /A NINJA_JOBS=%NUMBER_OF_PROCESSORS%-1
   cmake --build build --config Release -j %NINJA_JOBS% -t ggml
   cmake --build build --config Release

README.md

Lines changed: 1 addition & 0 deletions
@@ -201,6 +201,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
 - [Paddler](https://github.com/distantmagic/paddler) - Stateful load balancer custom-tailored for llama.cpp
 - [GPUStack](https://github.com/gpustack/gpustack) - Manage GPU clusters for running LLMs
 - [llama_cpp_canister](https://github.com/onicai/llama_cpp_canister) - llama.cpp as a smart contract on the Internet Computer, using WebAssembly
+- [llama-swap](https://github.com/mostlygeek/llama-swap) - transparent proxy that adds automatic model switching with llama-server
 
 </details>
 

common/arg.cpp

Lines changed: 2 additions & 2 deletions
@@ -1512,15 +1512,15 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     {"--lora"}, "FNAME",
     "path to LoRA adapter (can be repeated to use multiple adapters)",
     [](common_params & params, const std::string & value) {
-        params.lora_adapters.push_back({ std::string(value), 1.0 });
+        params.lora_adapters.push_back({ std::string(value), 1.0, nullptr });
     }
     // we define this arg on both COMMON and EXPORT_LORA, so when showing help message of export-lora, it will be categorized as "example-specific" arg
 ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA}));
 add_opt(common_arg(
     {"--lora-scaled"}, "FNAME", "SCALE",
     "path to LoRA adapter with user defined scaling (can be repeated to use multiple adapters)",
     [](common_params & params, const std::string & fname, const std::string & scale) {
-        params.lora_adapters.push_back({ fname, std::stof(scale) });
+        params.lora_adapters.push_back({ fname, std::stof(scale), nullptr });
     }
     // we define this arg on both COMMON and EXPORT_LORA, so when showing help message of export-lora, it will be categorized as "example-specific" arg
 ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA}));
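Context for the two changed push_back calls: common_lora_adapter_info (see the common/common.h change below) now carries a raw adapter handle as a third member, so the brace-initializers pass an explicit nullptr for it. A minimal standalone sketch of that initialization, using a mock struct and hypothetical adapter paths rather than the real llama.cpp headers:

    #include <cstdio>
    #include <string>
    #include <vector>

    struct llama_lora_adapter;               // opaque stand-in for the real type

    struct common_lora_adapter_info {        // mirrors the updated layout in common.h
        std::string path;
        float scale;
        struct llama_lora_adapter * ptr;     // stays null until the adapter is actually loaded
    };

    int main() {
        std::vector<common_lora_adapter_info> lora_adapters;
        // what the --lora and --lora-scaled callbacks now do:
        lora_adapters.push_back({ "adapter-a.gguf", 1.0f, nullptr });
        lora_adapters.push_back({ "adapter-b.gguf", 0.5f, nullptr });
        for (const auto & la : lora_adapters) {
            std::printf("%s scale=%.2f loaded=%s\n", la.path.c_str(), la.scale, la.ptr ? "yes" : "no");
        }
        return 0;
    }

The pointer is filled in later by common_init_from_params (common/common.cpp below), which keeps ownership of the loaded adapters separately; the CLI layer only records the request.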

common/common.cpp

Lines changed: 40 additions & 26 deletions
@@ -18,6 +18,7 @@
 #include <cstdarg>
 #include <cstring>
 #include <ctime>
+#include <filesystem>
 #include <fstream>
 #include <iostream>
 #include <iterator>
@@ -62,7 +63,9 @@
 #ifdef __linux__
 #include <linux/limits.h>
 #elif defined(_WIN32)
-#define PATH_MAX MAX_PATH
+#    if !defined(PATH_MAX)
+#        define PATH_MAX MAX_PATH
+#    endif
 #else
 #include <sys/syslimits.h>
 #endif
@@ -843,7 +846,7 @@ struct common_init_result common_init_from_params(common_params & params) {
     } else if (!params.model_url.empty()) {
         model = common_load_model_from_url(params.model_url, params.model, params.hf_token, mparams);
     } else {
-        model = llama_load_model_from_file(params.model.c_str(), mparams);
+        model = llama_model_load_from_file(params.model.c_str(), mparams);
     }
 
     if (model == NULL) {
@@ -870,7 +873,7 @@ struct common_init_result common_init_from_params(common_params & params) {
     }
 
     if (!ok) {
-        llama_free_model(model);
+        llama_model_free(model);
 
         return iparams;
     }
@@ -881,14 +884,13 @@ struct common_init_result common_init_from_params(common_params & params) {
     llama_context * lctx = llama_new_context_with_model(model, cparams);
     if (lctx == NULL) {
         LOG_ERR("%s: failed to create context with model '%s'\n", __func__, params.model.c_str());
-        llama_free_model(model);
+        llama_model_free(model);
         return iparams;
     }
 
     if (params.ctx_shift && !llama_kv_cache_can_shift(lctx)) {
-        LOG_ERR("%s: KV cache shifting is not supported for this model (--no-context-shift to disable)'\n", __func__);
-        llama_free_model(model);
-        return iparams;
+        LOG_WRN("%s: KV cache shifting is not supported for this model, disabling KV cache shifting\n", __func__);
+        params.ctx_shift = false;
     }
 
     if (!params.control_vectors.empty()) {
@@ -898,7 +900,7 @@ struct common_init_result common_init_from_params(common_params & params) {
     const auto cvec = common_control_vector_load(params.control_vectors);
     if (cvec.n_embd == -1) {
         llama_free(lctx);
-        llama_free_model(model);
+        llama_model_free(model);
 
         return iparams;
     }
@@ -911,28 +913,29 @@ struct common_init_result common_init_from_params(common_params & params) {
         params.control_vector_layer_end);
     if (err) {
         llama_free(lctx);
-        llama_free_model(model);
+        llama_model_free(model);
 
         return iparams;
     }
 }
 
     // load and optionally apply lora adapters
     for (auto & la : params.lora_adapters) {
-        common_lora_adapter_container loaded_la;
-        loaded_la.path = la.path;
-        loaded_la.scale = la.scale;
-        loaded_la.adapter = llama_lora_adapter_init(model, la.path.c_str());
-        if (loaded_la.adapter == nullptr) {
+        llama_lora_adapter_ptr lora;
+        lora.reset(llama_lora_adapter_init(model, la.path.c_str()));
+        if (lora == nullptr) {
            LOG_ERR("%s: failed to apply lora adapter '%s'\n", __func__, la.path.c_str());
            llama_free(lctx);
-           llama_free_model(model);
+           llama_model_free(model);
            return iparams;
         }
-        iparams.lora_adapters.push_back(loaded_la); // copy to list of loaded adapters
+
+        la.ptr = lora.get();
+        iparams.lora.emplace_back(std::move(lora)); // copy to list of loaded adapters
     }
+
     if (!params.lora_init_without_apply) {
-        common_lora_adapters_apply(lctx, iparams.lora_adapters);
+        common_lora_adapters_apply(lctx, params.lora_adapters);
     }
 
     if (params.sampling.ignore_eos && llama_token_eos(model) == LLAMA_TOKEN_NULL) {
@@ -979,7 +982,7 @@ struct common_init_result common_init_from_params(common_params & params) {
     if (llama_model_has_encoder(model)) {
         llama_encode(lctx, llama_batch_get_one(tmp.data(), tmp.size()));
         llama_token decoder_start_token_id = llama_model_decoder_start_token(model);
-        if (decoder_start_token_id == -1) {
+        if (decoder_start_token_id == LLAMA_TOKEN_NULL) {
             decoder_start_token_id = bos;
         }
         tmp.clear();
@@ -993,17 +996,17 @@ struct common_init_result common_init_from_params(common_params & params) {
         llama_perf_context_reset(lctx);
     }
 
-    iparams.model = model;
-    iparams.context = lctx;
+    iparams.model.reset(model);
+    iparams.context.reset(lctx);
 
     return iparams;
 }
 
-void common_lora_adapters_apply(struct llama_context * ctx, std::vector<common_lora_adapter_container> & lora_adapters) {
+void common_lora_adapters_apply(struct llama_context * ctx, std::vector<common_lora_adapter_info> & lora) {
     llama_lora_adapter_clear(ctx);
-    for (auto & la : lora_adapters) {
+    for (auto & la : lora) {
         if (la.scale != 0.0f) {
-            llama_lora_adapter_set(ctx, la.adapter, la.scale);
+            llama_lora_adapter_set(ctx, la.ptr, la.scale);
         }
     }
 }
@@ -1148,8 +1151,7 @@ static bool common_download_file(const std::string & url, const std::string & pa
 #endif
 
     // Check if the file already exists locally
-    struct stat model_file_info;
-    auto file_exists = (stat(path.c_str(), &model_file_info) == 0);
+    auto file_exists = std::filesystem::exists(path);
 
     // If the file exists, check its JSON metadata companion file.
     std::string metadata_path = path + ".json";
@@ -1409,7 +1411,7 @@ struct llama_model * common_load_model_from_url(
         }
     }
 
-    return llama_load_model_from_file(local_path.c_str(), params);
+    return llama_model_load_from_file(local_path.c_str(), params);
 }
 
 struct llama_model * common_load_model_from_hf(
@@ -1612,6 +1614,18 @@ std::string common_detokenize(llama_context * ctx, const std::vector<llama_token
 // Chat template utils
 //
 
+std::string common_get_builtin_chat_template(const struct llama_model * model) {
+    static const char * template_key = "tokenizer.chat_template";
+    // call with NULL buffer to get the total size of the string
+    int32_t res = llama_model_meta_val_str(model, template_key, NULL, 0);
+    if (res > 0) {
+        std::vector<char> model_template(res + 1, 0);
+        llama_model_meta_val_str(model, template_key, model_template.data(), model_template.size());
+        return std::string(model_template.data(), model_template.size() - 1);
+    }
+    return "";
+}
+
 bool common_chat_verify_template(const std::string & tmpl) {
     llama_chat_message chat[] = {{"user", "test"}};
     int res = llama_chat_apply_template(nullptr, tmpl.c_str(), chat, 1, true, nullptr, 0);
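The new common_get_builtin_chat_template above uses the usual two-pass pattern for reading string-valued GGUF metadata: a first call with a NULL buffer to learn the length, then a second call to fill a buffer of that size. A hedged sketch of the same pattern generalized to an arbitrary key (get_meta_string is a hypothetical helper name, not part of the llama.cpp API; it assumes a model already loaded, e.g. via llama_model_load_from_file):

    #include "llama.h"
    #include <string>
    #include <vector>

    static std::string get_meta_string(const struct llama_model * model, const char * key) {
        // first pass: NULL buffer, only the required length is reported
        // (the function above treats a non-positive result as "no value present")
        int32_t res = llama_model_meta_val_str(model, key, NULL, 0);
        if (res <= 0) {
            return "";
        }
        // second pass: fill a buffer sized from the first call (+1 for the terminating NUL)
        std::vector<char> buf(res + 1, 0);
        llama_model_meta_val_str(model, key, buf.data(), buf.size());
        return std::string(buf.data(), buf.size() - 1);
    }

    // usage sketch:
    //   std::string tmpl = get_meta_string(model, "tokenizer.chat_template");
    //   if (tmpl.empty()) { /* fall back to a user-supplied --chat-template */ }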

common/common.h

Lines changed: 18 additions & 11 deletions
@@ -2,7 +2,7 @@
 
 #pragma once
 
-#include "llama.h"
+#include "llama-cpp.h"
 
 #include <string>
 #include <vector>
@@ -27,10 +27,8 @@
 struct common_lora_adapter_info {
     std::string path;
     float scale;
-};
 
-struct common_lora_adapter_container : common_lora_adapter_info {
-    struct llama_lora_adapter * adapter;
+    struct llama_lora_adapter * ptr;
 };
 
 using llama_tokens = std::vector<llama_token>;
@@ -478,10 +476,12 @@ std::string fs_get_cache_file(const std::string & filename);
 // Model utils
 //
 
+// note: defines object's lifetime
 struct common_init_result {
-    struct llama_model * model = nullptr;
-    struct llama_context * context = nullptr;
-    std::vector<common_lora_adapter_container> lora_adapters;
+    llama_model_ptr model;
+    llama_context_ptr context;
+
+    std::vector<llama_lora_adapter_ptr> lora;
 };
 
 struct common_init_result common_init_from_params(common_params & params);
@@ -503,7 +503,7 @@ struct llama_model * common_load_model_from_hf(
     const struct llama_model_params & params);
 
 // clear LoRA adapters from context, then apply new list of adapters
-void common_lora_adapters_apply(struct llama_context * ctx, std::vector<common_lora_adapter_container> & lora_adapters);
+void common_lora_adapters_apply(struct llama_context * ctx, std::vector<common_lora_adapter_info> & lora);
 
 //
 // Batch utils
@@ -571,6 +571,9 @@ struct common_chat_msg {
     std::string content;
 };
 
+// Get the built-in chat template for the model. Return empty string if not present.
+std::string common_get_builtin_chat_template(const struct llama_model * model);
+
 // Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid
 bool common_chat_verify_template(const std::string & tmpl);
 
@@ -637,6 +640,10 @@ common_control_vector_data common_control_vector_load(const std::vector<common_c
 // Split utils
 //
 
-static const char * const LLM_KV_SPLIT_NO = "split.no";
-static const char * const LLM_KV_SPLIT_COUNT = "split.count";
-static const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";
+namespace {
+
+const char * const LLM_KV_SPLIT_NO = "split.no";
+const char * const LLM_KV_SPLIT_COUNT = "split.count";
+const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";
+
+}
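Since common_init_result now owns the model, the context, and the loaded LoRA adapters through smart pointers, callers no longer pair common_init_from_params with manual llama_free / llama_model_free calls. A caller-side sketch, under the assumption that llama_model_ptr, llama_context_ptr and llama_lora_adapter_ptr are unique_ptr-style wrappers supplied by the new llama-cpp.h include (the function below and its body are illustrative, not code from this commit):

    #include "common.h"

    static void run_once(common_params & params) {
        common_init_result llama_init = common_init_from_params(params);

        // raw, non-owning views for use with the C API
        llama_model   * model = llama_init.model.get();
        llama_context * ctx   = llama_init.context.get();
        if (model == nullptr || ctx == nullptr) {
            return;   // init failed; nothing to release by hand
        }

        // ... tokenize / decode with model and ctx ...

        // no llama_free(ctx) or llama_model_free(model) here: the context, the model and
        // every adapter in llama_init.lora are released when llama_init goes out of scope
    }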
