Skip to content

Commit 7d16e1b

Browse files
committed
Merge branch 'master' into compilade/mamba2
2 parents 805512a + d865d14 commit 7d16e1b

File tree

101 files changed

+15442
-8234
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

101 files changed

+15442
-8234
lines changed

CMakeLists.txt

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -88,6 +88,10 @@ if (NOT DEFINED GGML_LLAMAFILE)
8888
set(GGML_LLAMAFILE_DEFAULT ON)
8989
endif()
9090

91+
if (NOT DEFINED GGML_AMX)
92+
set(GGML_AMX ON)
93+
endif()
94+
9195
if (NOT DEFINED GGML_CUDA_GRAPHS)
9296
set(GGML_CUDA_GRAPHS_DEFAULT ON)
9397
endif()

Makefile

Lines changed: 19 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -93,11 +93,6 @@ GGML_METAL := 1
9393
DEPRECATE_WARNING := 1
9494
endif
9595

96-
ifdef LLAMA_OPENMP
97-
GGML_OPENMP := 1
98-
DEPRECATE_WARNING := 1
99-
endif
100-
10196
ifdef LLAMA_RPC
10297
GGML_RPC := 1
10398
DEPRECATE_WARNING := 1
@@ -584,6 +579,11 @@ ifndef GGML_NO_LLAMAFILE
584579
OBJ_GGML += ggml/src/llamafile/sgemm.o
585580
endif
586581

582+
ifndef GGML_NO_AMX
583+
MK_CPPFLAGS += -DGGML_USE_AMX
584+
OBJ_GGML += ggml/src/ggml-amx.o ggml/src/ggml-amx/mmq.o
585+
endif
586+
587587
ifdef GGML_RPC
588588
MK_CPPFLAGS += -DGGML_USE_RPC
589589
OBJ_GGML += ggml/src/ggml-rpc.o
@@ -1087,6 +1087,19 @@ ggml/src/llamafile/sgemm.o: \
10871087
$(CXX) $(CXXFLAGS) -c $< -o $@
10881088
endif # GGML_NO_LLAMAFILE
10891089

1090+
ifndef GGML_NO_AMX
1091+
ggml/src/ggml-amx.o: \
1092+
ggml/src/ggml-amx.cpp \
1093+
ggml/include/ggml-amx.h
1094+
$(CXX) $(CXXFLAGS) -c $< -o $@
1095+
1096+
ggml/src/ggml-amx/mmq.o: \
1097+
ggml/src/ggml-amx/mmq.cpp \
1098+
ggml/src/ggml-amx/mmq.h \
1099+
ggml/include/ggml.h
1100+
$(CXX) $(CXXFLAGS) -c $< -o $@
1101+
endif
1102+
10901103
ifdef GGML_RPC
10911104
ggml/src/ggml-rpc.o: \
10921105
ggml/src/ggml-rpc.cpp \
@@ -1238,6 +1251,7 @@ clean:
12381251
rm -vrf ggml/src/ggml-metal-embed.metal
12391252
rm -vrf ggml/src/ggml-cuda/*.o
12401253
rm -vrf ggml/src/ggml-cuda/template-instances/*.o
1254+
rm -vrf ggml/src/ggml-amx/*.o
12411255
rm -rvf $(BUILD_TARGETS)
12421256
rm -rvf $(TEST_TARGETS)
12431257
rm -f vulkan-shaders-gen ggml/src/ggml-vulkan-shaders.hpp ggml/src/ggml-vulkan-shaders.cpp

README.md

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ variety of hardware - locally and in the cloud.
2929

3030
- Plain C/C++ implementation without any dependencies
3131
- Apple silicon is a first-class citizen - optimized via ARM NEON, Accelerate and Metal frameworks
32-
- AVX, AVX2 and AVX512 support for x86 architectures
32+
- AVX, AVX2, AVX512 and AMX support for x86 architectures
3333
- 1.5-bit, 2-bit, 3-bit, 4-bit, 5-bit, 6-bit, and 8-bit integer quantization for faster inference and reduced memory use
3434
- Custom CUDA kernels for running LLMs on NVIDIA GPUs (support for AMD GPUs via HIP and Moore Threads MTT GPUs via MUSA)
3535
- Vulkan and SYCL backend support
@@ -93,6 +93,7 @@ Typically finetunes of the base models below are supported as well.
9393
- [x] [FalconMamba Models](https://huggingface.co/collections/tiiuae/falconmamba-7b-66b9a580324dd1598b0f6d4a)
9494
- [x] [Jais](https://huggingface.co/inceptionai/jais-13b-chat)
9595
- [x] [Bielik-11B-v2.3](https://huggingface.co/collections/speakleash/bielik-11b-v23-66ee813238d9b526a072408a)
96+
- [x] [RWKV-6](https://github.com/BlinkDL/RWKV-LM)
9697

9798
(instructions for supporting more models: [HOWTO-add-model.md](./docs/development/HOWTO-add-model.md))
9899

@@ -122,6 +123,7 @@ Typically finetunes of the base models below are supported as well.
122123
- Rust (nicer API): [mdrokz/rust-llama.cpp](https://github.com/mdrokz/rust-llama.cpp)
123124
- Rust (more direct bindings): [utilityai/llama-cpp-rs](https://github.com/utilityai/llama-cpp-rs)
124125
- C#/.NET: [SciSharp/LLamaSharp](https://github.com/SciSharp/LLamaSharp)
126+
- C#/VB.NET (more features - community license): [LM-Kit.NET](https://docs.lm-kit.com/lm-kit-net/index.html)
125127
- Scala 3: [donderom/llm4s](https://github.com/donderom/llm4s)
126128
- Clojure: [phronmophobic/llama.clj](https://github.com/phronmophobic/llama.clj)
127129
- React Native: [mybigday/llama.rn](https://github.com/mybigday/llama.rn)
@@ -130,6 +132,8 @@ Typically finetunes of the base models below are supported as well.
130132
- Flutter/Dart: [netdur/llama_cpp_dart](https://github.com/netdur/llama_cpp_dart)
131133
- PHP (API bindings and features built on top of llama.cpp): [distantmagic/resonance](https://github.com/distantmagic/resonance) [(more info)](https://github.com/ggerganov/llama.cpp/pull/6326)
132134
- Guile Scheme: [guile_llama_cpp](https://savannah.nongnu.org/projects/guile-llama-cpp)
135+
- Swift [srgtuszy/llama-cpp-swift](https://github.com/srgtuszy/llama-cpp-swift)
136+
- Swift [ShenghaiWang/SwiftLlama](https://github.com/ShenghaiWang/SwiftLlama)
133137

134138
**UI:**
135139

@@ -170,6 +174,7 @@ Unless otherwise noted these projects are open-source with permissive licensing:
170174
- [LARS - The LLM & Advanced Referencing Solution](https://github.com/abgulati/LARS) (AGPL)
171175
- [LLMUnity](https://github.com/undreamai/LLMUnity) (MIT)
172176
- [Llama Assistant](https://github.com/vietanhdev/llama-assistant) (GPL)
177+
- [PocketPal AI - An iOS and Android App](https://github.com/a-ghorbani/pocketpal-ai) (MIT)
173178

174179
*(to have a project listed here, it should clearly state that it depends on `llama.cpp`)*
175180

@@ -185,6 +190,7 @@ Unless otherwise noted these projects are open-source with permissive licensing:
185190

186191
- [Paddler](https://github.com/distantmagic/paddler) - Stateful load balancer custom-tailored for llama.cpp
187192
- [GPUStack](https://github.com/gpustack/gpustack) - Manage GPU clusters for running LLMs
193+
- [llama_cpp_canister](https://github.com/onicai/llama_cpp_canister) - llama.cpp as a smart contract on the Internet Computer, using WebAssembly
188194

189195
**Games:**
190196
- [Lucy's Labyrinth](https://github.com/MorganRO8/Lucys_Labyrinth) - A simple maze game where agents controlled by an AI model will try to trick you.

ci/run.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,7 @@ if [ ! -z ${GG_BUILD_SYCL} ]; then
5353
exit 1
5454
fi
5555

56-
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_SYCL=1 DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON"
56+
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_SYCL=1 -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON"
5757
fi
5858

5959
if [ ! -z ${GG_BUILD_VULKAN} ]; then

common/arg.cpp

Lines changed: 87 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -128,13 +128,13 @@ static void common_params_handle_model_default(common_params & params) {
128128
}
129129
params.hf_file = params.model;
130130
} else if (params.model.empty()) {
131-
params.model = fs_get_cache_file(string_split(params.hf_file, '/').back());
131+
params.model = fs_get_cache_file(string_split<std::string>(params.hf_file, '/').back());
132132
}
133133
} else if (!params.model_url.empty()) {
134134
if (params.model.empty()) {
135-
auto f = string_split(params.model_url, '#').front();
136-
f = string_split(f, '?').front();
137-
params.model = fs_get_cache_file(string_split(f, '/').back());
135+
auto f = string_split<std::string>(params.model_url, '#').front();
136+
f = string_split<std::string>(f, '?').front();
137+
params.model = fs_get_cache_file(string_split<std::string>(f, '/').back());
138138
}
139139
} else if (params.model.empty()) {
140140
params.model = DEFAULT_MODEL_PATH;
@@ -251,6 +251,9 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
251251
for (auto & antiprompt : params.antiprompt) {
252252
string_process_escapes(antiprompt);
253253
}
254+
for (auto & seq_breaker : params.sparams.dry_sequence_breakers) {
255+
string_process_escapes(seq_breaker);
256+
}
254257
}
255258

256259
if (!params.kv_overrides.empty()) {
@@ -879,7 +882,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
879882
{"--samplers"}, "SAMPLERS",
880883
string_format("samplers that will be used for generation in the order, separated by \';\'\n(default: %s)", sampler_type_names.c_str()),
881884
[](common_params & params, const std::string & value) {
882-
const auto sampler_names = string_split(value, ';');
885+
const auto sampler_names = string_split<std::string>(value, ';');
883886
params.sparams.samplers = common_sampler_types_from_names(sampler_names, true);
884887
}
885888
).set_sparam());
@@ -941,10 +944,17 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
941944
}
942945
).set_sparam());
943946
add_opt(common_arg(
944-
{"--tfs"}, "N",
945-
string_format("tail free sampling, parameter z (default: %.1f, 1.0 = disabled)", (double)params.sparams.tfs_z),
947+
{"--xtc-probability"}, "N",
948+
string_format("xtc probability (default: %.1f, 0.0 = disabled)", (double)params.sparams.xtc_probability),
949+
[](common_params & params, const std::string & value) {
950+
params.sparams.xtc_probability = std::stof(value);
951+
}
952+
).set_sparam());
953+
add_opt(common_arg(
954+
{"--xtc-threshold"}, "N",
955+
string_format("xtc threshold (default: %.1f, 1.0 = disabled)", (double)params.sparams.xtc_threshold),
946956
[](common_params & params, const std::string & value) {
947-
params.sparams.tfs_z = std::stof(value);
957+
params.sparams.xtc_threshold = std::stof(value);
948958
}
949959
).set_sparam());
950960
add_opt(common_arg(
@@ -983,6 +993,64 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
983993
params.sparams.penalty_freq = std::stof(value);
984994
}
985995
).set_sparam());
996+
add_opt(common_arg(
997+
{"--dry-multiplier"}, "N",
998+
string_format("set DRY sampling multiplier (default: %.1f, 0.0 = disabled)", (double)params.sparams.dry_multiplier),
999+
[](common_params & params, const std::string & value) {
1000+
params.sparams.dry_multiplier = std::stof(value);
1001+
}
1002+
).set_sparam());
1003+
add_opt(common_arg(
1004+
{"--dry-base"}, "N",
1005+
string_format("set DRY sampling base value (default: %.2f)", (double)params.sparams.dry_base),
1006+
[](common_params & params, const std::string & value) {
1007+
float potential_base = std::stof(value);
1008+
if (potential_base >= 1.0f)
1009+
{
1010+
params.sparams.dry_base = potential_base;
1011+
}
1012+
}
1013+
).set_sparam());
1014+
add_opt(common_arg(
1015+
{"--dry-allowed-length"}, "N",
1016+
string_format("set allowed length for DRY sampling (default: %d)", params.sparams.dry_allowed_length),
1017+
[](common_params & params, int value) {
1018+
params.sparams.dry_allowed_length = value;
1019+
}
1020+
).set_sparam());
1021+
add_opt(common_arg(
1022+
{"--dry-penalty-last-n"}, "N",
1023+
string_format("set DRY penalty for the last n tokens (default: %d, 0 = disable, -1 = context size)", params.sparams.dry_penalty_last_n),
1024+
[](common_params & params, int value) {
1025+
params.sparams.dry_penalty_last_n = value;
1026+
}
1027+
).set_sparam());
1028+
add_opt(common_arg(
1029+
{"--dry-sequence-breaker"}, "STRING",
1030+
string_format("add sequence breaker for DRY sampling, clearing out default breakers (%s) in the process; use \"none\" to not use any sequence breakers\n",
1031+
params.sparams.dry_sequence_breakers.empty() ? "none" :
1032+
std::accumulate(std::next(params.sparams.dry_sequence_breakers.begin()),
1033+
params.sparams.dry_sequence_breakers.end(),
1034+
std::string("'") + (params.sparams.dry_sequence_breakers[0] == "\n" ? "\\n" : params.sparams.dry_sequence_breakers[0]) + "'",
1035+
[](const std::string& a, const std::string& b) {
1036+
std::string formatted_b = (b == "\n") ? "\\n" : b;
1037+
return a + ", '" + formatted_b + "'";
1038+
}).c_str()),
1039+
[](common_params & params, const std::string & value) {
1040+
static bool defaults_cleared = false;
1041+
1042+
if (!defaults_cleared) {
1043+
params.sparams.dry_sequence_breakers.clear();
1044+
defaults_cleared = true;
1045+
}
1046+
1047+
if (value == "none") {
1048+
params.sparams.dry_sequence_breakers.clear();
1049+
} else {
1050+
params.sparams.dry_sequence_breakers.emplace_back(value);
1051+
}
1052+
}
1053+
).set_sparam());
9861054
add_opt(common_arg(
9871055
{"--dynatemp-range"}, "N",
9881056
string_format("dynamic temperature range (default: %.1f, 0.0 = disabled)", (double)params.sparams.dynatemp_range),
@@ -999,7 +1067,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
9991067
).set_sparam());
10001068
add_opt(common_arg(
10011069
{"--mirostat"}, "N",
1002-
string_format("use Mirostat sampling.\nTop K, Nucleus, Tail Free and Locally Typical samplers are ignored if used.\n"
1070+
string_format("use Mirostat sampling.\nTop K, Nucleus and Locally Typical samplers are ignored if used.\n"
10031071
"(default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)", params.sparams.mirostat),
10041072
[](common_params & params, int value) {
10051073
params.sparams.mirostat = value;
@@ -1083,7 +1151,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
10831151
}
10841152
).set_examples({LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_RETRIEVAL, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_POOLING"));
10851153
add_opt(common_arg(
1086-
{"--attention"}, "{causal,non,causal}",
1154+
{"--attention"}, "{causal,non-causal}",
10871155
"attention type for embeddings, use model default if unspecified",
10881156
[](common_params & params, const std::string & value) {
10891157
/**/ if (value == "causal") { params.attention_type = LLAMA_ATTENTION_TYPE_CAUSAL; }
@@ -1681,7 +1749,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
16811749
).set_examples({LLAMA_EXAMPLE_BENCH}));
16821750
add_opt(common_arg(
16831751
{"--embd-normalize"}, "N",
1684-
string_format("normalisation for embendings (default: %d) (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)", params.embd_normalize),
1752+
string_format("normalisation for embeddings (default: %d) (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)", params.embd_normalize),
16851753
[](common_params & params, int value) {
16861754
params.embd_normalize = value;
16871755
}
@@ -1695,7 +1763,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
16951763
).set_examples({LLAMA_EXAMPLE_EMBEDDING}));
16961764
add_opt(common_arg(
16971765
{"--embd-separator"}, "STRING",
1698-
"separator of embendings (default \\n) for example \"<#sep#>\"",
1766+
"separator of embeddings (default \\n) for example \"<#sep#>\"",
16991767
[](common_params & params, const std::string & value) {
17001768
params.embd_sep = value;
17011769
}
@@ -1788,6 +1856,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
17881856
params.n_threads_http = value;
17891857
}
17901858
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_THREADS_HTTP"));
1859+
add_opt(common_arg(
1860+
{"--cache-reuse"}, "N",
1861+
string_format("min chunk size to attempt reusing from the cache via KV shifting (default: %d)", params.n_cache_reuse),
1862+
[](common_params & params, int value) {
1863+
params.n_cache_reuse = value;
1864+
}
1865+
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CACHE_REUSE"));
17911866
add_opt(common_arg(
17921867
{"--metrics"},
17931868
string_format("enable prometheus compatible metrics endpoint (default: %s)", params.endpoint_metrics ? "enabled" : "disabled"),

common/common.cpp

Lines changed: 10 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -416,19 +416,6 @@ std::string string_format(const char * fmt, ...) {
416416
return std::string(buf.data(), size);
417417
}
418418

419-
std::vector<std::string> string_split(std::string input, char separator) {
420-
std::vector<std::string> parts;
421-
size_t separator_pos = input.find(separator);
422-
while (separator_pos != std::string::npos) {
423-
std::string part = input.substr(0, separator_pos);
424-
parts.emplace_back(part);
425-
input = input.substr(separator_pos + 1);
426-
separator_pos = input.find(separator);
427-
}
428-
parts.emplace_back(input);
429-
return parts;
430-
}
431-
432419
std::string string_strip(const std::string & str) {
433420
size_t start = 0;
434421
size_t end = str.size();
@@ -955,7 +942,7 @@ struct common_init_result common_init_from_params(common_params & params) {
955942
}
956943

957944
if (llama_model_has_encoder(model)) {
958-
llama_encode(lctx, llama_batch_get_one(tmp.data(), tmp.size(), 0, 0));
945+
llama_encode(lctx, llama_batch_get_one(tmp.data(), tmp.size()));
959946
llama_token decoder_start_token_id = llama_model_decoder_start_token(model);
960947
if (decoder_start_token_id == -1) {
961948
decoder_start_token_id = bos;
@@ -964,7 +951,7 @@ struct common_init_result common_init_from_params(common_params & params) {
964951
tmp.push_back(decoder_start_token_id);
965952
}
966953
if (llama_model_has_decoder(model)) {
967-
llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch), 0, 0));
954+
llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch)));
968955
}
969956
llama_kv_cache_clear(lctx);
970957
llama_synchronize(lctx);
@@ -1035,7 +1022,7 @@ static ggml_type kv_cache_type_from_str(const std::string & s) {
10351022
return GGML_TYPE_Q5_1;
10361023
}
10371024

1038-
throw std::runtime_error("Invalid cache type: " + s);
1025+
throw std::runtime_error("Unsupported cache type: " + s);
10391026
}
10401027

10411028
struct llama_context_params common_context_params_to_llama(const common_params & params) {
@@ -1047,7 +1034,7 @@ struct llama_context_params common_context_params_to_llama(const common_params &
10471034
cparams.n_ubatch = params.n_ubatch;
10481035
cparams.n_threads = params.cpuparams.n_threads;
10491036
cparams.n_threads_batch = params.cpuparams_batch.n_threads == -1 ?
1050-
params.cpuparams.n_threads : params.cpuparams_batch.n_threads;
1037+
params.cpuparams.n_threads : params.cpuparams_batch.n_threads;
10511038
cparams.logits_all = params.logits_all;
10521039
cparams.embeddings = params.embedding;
10531040
cparams.rope_scaling_type = params.rope_scaling_type;
@@ -2019,6 +2006,10 @@ void yaml_dump_non_result_info(FILE * stream, const common_params & params, cons
20192006
fprintf(stream, "chunks: %d # default: -1 (unlimited)\n", params.n_chunks);
20202007
fprintf(stream, "color: %s # default: false\n", params.use_color ? "true" : "false");
20212008
fprintf(stream, "ctx_size: %d # default: 512\n", params.n_ctx);
2009+
fprintf(stream, "dry_allowed_length: %d # default: 2\n", sparams.dry_allowed_length);
2010+
fprintf(stream, "dry_base: %.2f # default: 1.75\n", sparams.dry_base);
2011+
fprintf(stream, "dry_multiplier: %.1f # default: 0.0\n", sparams.dry_multiplier);
2012+
fprintf(stream, "dry_penalty_last_n: %d # default: -1 (0 = disable, -1 = context size)\n", sparams.dry_penalty_last_n);
20222013
fprintf(stream, "escape: %s # default: false\n", params.escape ? "true" : "false");
20232014
fprintf(stream, "file: # never logged, see prompt instead. Can still be specified for input.\n");
20242015
fprintf(stream, "frequency_penalty: %f # default: 0.0 \n", sparams.penalty_freq);
@@ -2099,11 +2090,12 @@ void yaml_dump_non_result_info(FILE * stream, const common_params & params, cons
20992090
const std::vector<float> tensor_split_vector(params.tensor_split, params.tensor_split + llama_max_devices());
21002091
yaml_dump_vector_float(stream, "tensor_split", tensor_split_vector);
21012092

2102-
fprintf(stream, "tfs: %f # default: 1.0\n", sparams.tfs_z);
21032093
fprintf(stream, "threads: %d # default: %u\n", params.cpuparams.n_threads, std::thread::hardware_concurrency());
21042094
fprintf(stream, "top_k: %d # default: 40\n", sparams.top_k);
21052095
fprintf(stream, "top_p: %f # default: 0.95\n", sparams.top_p);
21062096
fprintf(stream, "min_p: %f # default: 0.0\n", sparams.min_p);
2097+
fprintf(stream, "xtc_probability: %f # default: 0.0\n", sparams.xtc_probability);
2098+
fprintf(stream, "xtc_threshold: %f # default: 0.1\n", sparams.xtc_threshold);
21072099
fprintf(stream, "typ_p: %f # default: 1.0\n", sparams.typ_p);
21082100
fprintf(stream, "verbose_prompt: %s # default: false\n", params.verbose_prompt ? "true" : "false");
21092101
fprintf(stream, "display_prompt: %s # default: true\n", params.display_prompt ? "true" : "false");

0 commit comments

Comments
 (0)