
Commit 9d08ae3

Merge branch 'ggerganov:master' into server-chat-templates
2 parents: a1ee42d + 66798e4

File tree: 15 files changed, +506 −505 lines

README.md

Lines changed: 1 addition & 0 deletions
@@ -131,6 +131,7 @@ Typically finetunes of the base models below are supported as well.
 - Java: [kherud/java-llama.cpp](https://github.com/kherud/java-llama.cpp)
 - Zig: [deins/llama.cpp.zig](https://github.com/Deins/llama.cpp.zig)
 - Flutter/Dart: [netdur/llama_cpp_dart](https://github.com/netdur/llama_cpp_dart)
+- Flutter: [xuegao-tzx/Fllama](https://github.com/xuegao-tzx/Fllama)
 - PHP (API bindings and features built on top of llama.cpp): [distantmagic/resonance](https://github.com/distantmagic/resonance) [(more info)](https://github.com/ggerganov/llama.cpp/pull/6326)
 - Guile Scheme: [guile_llama_cpp](https://savannah.nongnu.org/projects/guile-llama-cpp)
 - Swift [srgtuszy/llama-cpp-swift](https://github.com/srgtuszy/llama-cpp-swift)

examples/convert_legacy_llama.py

Lines changed: 24 additions & 2 deletions
@@ -840,6 +840,8 @@ def add_meta_model(self, params: Params, metadata: gguf.Metadata | None) -> None
                         self.gguf.add_base_model_version(key, base_model_entry["version"])
                     if "organization" in base_model_entry:
                         self.gguf.add_base_model_organization(key, base_model_entry["organization"])
+                    if "description" in base_model_entry:
+                        self.gguf.add_base_model_description(key, base_model_entry["description"])
                     if "url" in base_model_entry:
                         self.gguf.add_base_model_url(key, base_model_entry["url"])
                     if "doi" in base_model_entry:
@@ -849,12 +851,32 @@ def add_meta_model(self, params: Params, metadata: gguf.Metadata | None) -> None
                     if "repo_url" in base_model_entry:
                         self.gguf.add_base_model_repo_url(key, base_model_entry["repo_url"])
 
+            if metadata.datasets is not None:
+                self.gguf.add_dataset_count(len(metadata.datasets))
+                for key, dataset_entry in enumerate(metadata.datasets):
+                    if "name" in dataset_entry:
+                        self.gguf.add_dataset_name(key, dataset_entry["name"])
+                    if "author" in dataset_entry:
+                        self.gguf.add_dataset_author(key, dataset_entry["author"])
+                    if "version" in dataset_entry:
+                        self.gguf.add_dataset_version(key, dataset_entry["version"])
+                    if "organization" in dataset_entry:
+                        self.gguf.add_dataset_organization(key, dataset_entry["organization"])
+                    if "description" in dataset_entry:
+                        self.gguf.add_dataset_description(key, dataset_entry["description"])
+                    if "url" in dataset_entry:
+                        self.gguf.add_dataset_url(key, dataset_entry["url"])
+                    if "doi" in dataset_entry:
+                        self.gguf.add_dataset_doi(key, dataset_entry["doi"])
+                    if "uuid" in dataset_entry:
+                        self.gguf.add_dataset_uuid(key, dataset_entry["uuid"])
+                    if "repo_url" in dataset_entry:
+                        self.gguf.add_dataset_repo_url(key, dataset_entry["repo_url"])
+
             if metadata.tags is not None:
                 self.gguf.add_tags(metadata.tags)
             if metadata.languages is not None:
                 self.gguf.add_languages(metadata.languages)
-            if metadata.datasets is not None:
-                self.gguf.add_datasets(metadata.datasets)
 
     def add_meta_arch(self, params: Params) -> None:
         # Metadata About The Neural Architecture Itself
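
For context, here is a rough sketch of what the new per-dataset handling consumes and emits. The `datasets` entry below is hypothetical, and the `general.dataset.N.*` key layout is an assumption mirroring the existing `general.base_model.N.*` convention; only the field names (`name`, `author`, `version`, ...) are taken from the diff above.

```python
# Hypothetical metadata, shaped like the entries the new loop iterates over.
datasets = [
    {
        "name": "example-instruct-corpus",   # hypothetical dataset
        "organization": "Example Org",
        "version": "1.0",
        "url": "https://example.com/dataset",
    },
]

# Assumed key layout, mirroring general.base_model.N.*; in the real converter the
# writer methods (add_dataset_name, add_dataset_url, ...) produce these KV pairs.
kv: dict[str, object] = {"general.dataset.count": len(datasets)}
for idx, entry in enumerate(datasets):
    for field in ("name", "author", "version", "organization", "description",
                  "url", "doi", "uuid", "repo_url"):
        if field in entry:
            kv[f"general.dataset.{idx}.{field}"] = entry[field]

print(kv)
```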

examples/server/README.md

Lines changed: 5 additions & 1 deletion
@@ -383,6 +383,10 @@ node index.js
 
 `dry_sequence_breakers`: Specify an array of sequence breakers for DRY sampling. Only a JSON array of strings is accepted. Default: `['\n', ':', '"', '*']`
 
+`xtc_probability`: Set the chance for token removal via XTC sampler. Default: `0.0`, which is disabled.
+
+`xtc_threshold`: Set a minimum probability threshold for tokens to be removed via XTC sampler. Default: `0.1` (> `0.5` disables XTC)
+
 `mirostat`: Enable Mirostat sampling, controlling perplexity during text generation. Default: `0`, where `0` is disabled, `1` is Mirostat, and `2` is Mirostat 2.0.
 
 `mirostat_tau`: Set the Mirostat target entropy, parameter tau. Default: `5.0`
@@ -411,7 +415,7 @@ node index.js
 
 `cache_prompt`: Re-use KV cache from a previous request if possible. This way the common prefix does not have to be re-processed, only the suffix that differs between the requests. Because (depending on the backend) the logits are **not** guaranteed to be bit-for-bit identical for different batch sizes (prompt processing vs. token generation) enabling this option can cause nondeterministic results. Default: `false`
 
-`samplers`: The order the samplers should be applied in. An array of strings representing sampler type names. If a sampler is not set, it will not be used. If a sampler is specified more than once, it will be applied multiple times. Default: `["top_k", "typical_p", "top_p", "min_p", "temperature"]` - these are all the available values.
+`samplers`: The order the samplers should be applied in. An array of strings representing sampler type names. If a sampler is not set, it will not be used. If a sampler is specified more than once, it will be applied multiple times. Default: `["dry", "top_k", "typ_p", "top_p", "min_p", "xtc", "temperature"]` - these are all the available values.
 
 **Response format**
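
As a usage note, the two new XTC parameters and the extended sampler chain can be exercised with a plain completion request. A minimal sketch, assuming a llama-server instance listening on http://localhost:8080; the prompt and parameter values are illustrative only:

```python
import requests

payload = {
    "prompt": "Write a haiku about autumn.",
    "n_predict": 64,
    "xtc_probability": 0.5,  # chance that XTC token removal is applied
    "xtc_threshold": 0.1,    # minimum probability for a token to become a removal candidate
    # full default chain from the updated docs; reorder or drop entries as needed
    "samplers": ["dry", "top_k", "typ_p", "top_p", "min_p", "xtc", "temperature"],
}

response = requests.post("http://localhost:8080/completion", json=payload, timeout=120)
print(response.json()["content"])
```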

examples/server/server.cpp

Lines changed: 10 additions & 5 deletions
@@ -655,11 +655,16 @@ struct server_context {
     }
 
     bool validate_model_chat_template() const {
-        llama_chat_message chat[] = {{"user", "test"}};
-
-        const int res = llama_chat_apply_template(model, nullptr, chat, 1, true, nullptr, 0);
-
-        return res > 0;
+        std::vector<char> model_template(2048, 0); // longest known template is about 1200 bytes
+        std::string template_key = "tokenizer.chat_template";
+        int32_t res = llama_model_meta_val_str(model, template_key.c_str(), model_template.data(), model_template.size());
+        if (res >= 0) {
+            llama_chat_message chat[] = {{"user", "test"}};
+            std::string tmpl = std::string(model_template.data(), model_template.size());
+            int32_t chat_res = llama_chat_apply_template(model, tmpl.c_str(), chat, 1, true, nullptr, 0);
+            return chat_res > 0;
+        }
+        return false;
     }
 
     void init() {
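
A similar check can be approximated offline: inspect whether a GGUF file carries a `tokenizer.chat_template` key before pointing the server at it. A rough sketch, assuming the gguf-py package's `GGUFReader` API; the string-field decoding detail is an assumption, not taken from this commit:

```python
from gguf import GGUFReader  # assumes the gguf-py package is installed

reader = GGUFReader("model.gguf")  # hypothetical model path
field = reader.fields.get("tokenizer.chat_template")
if field is None:
    print("no built-in chat template; the server would report the template as unsupported")
else:
    # assumed decoding: string fields expose their raw bytes via parts/data indices
    template = field.parts[field.data[-1]].tobytes().decode("utf-8")
    print(f"chat template found ({len(template)} bytes)")
```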

ggml/src/ggml-sycl.cpp

Lines changed: 4 additions & 0 deletions
@@ -4350,6 +4350,10 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
                 if (op->op == GGML_OP_MUL_MAT) {
                     a = op->src[0];
                     b = op->src[1];
+                    if (ggml_is_permuted(a) || ggml_is_permuted(b)) {
+                        // TODO: fix like https://github.com/ggerganov/llama.cpp/pull/10021
+                        return false;
+                    }
                 } else {
                     a = op->src[2];
                     b = op->src[1];
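
For intuition on what the new guard rejects: a "permuted" tensor is a view whose strides are not in canonical order, such as a transpose that shares the original buffer without being contiguous. A loose NumPy analogy of the condition being detected (not the SYCL code path itself):

```python
import numpy as np

a = np.arange(12, dtype=np.float32).reshape(3, 4)
a_t = a.T                         # a view over the same buffer with permuted strides

print(a.strides, a_t.strides)     # (16, 4) vs (4, 16)
print(a_t.flags["C_CONTIGUOUS"])  # False: roughly the situation the permuted check declines
```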

ggml/src/ggml-sycl/norm.cpp

Lines changed: 3 additions & 3 deletions
@@ -8,7 +8,6 @@ static void norm_f32(const float* x, float* dst, const int ncols, const float ep
 
     const int nthreads = item_ct1.get_local_range(2);
     const int nwarps = nthreads / WARP_SIZE;
-    assert(nwarps % WARP_SIZE == 0);
     sycl::float2 mean_var = sycl::float2(0.f, 0.f);
 
     for (int col = tid; col < ncols; col += block_size) {
@@ -55,7 +54,6 @@ static void group_norm_f32(const float* x, float* dst, const int group_size, con
     int end = start + group_size;
     const int nthreads = item_ct1.get_local_range(2);
     const int nwarps = nthreads / WARP_SIZE;
-    assert(nwarps % WARP_SIZE == 0);
     start += item_ct1.get_local_id(2);
     int nreduce = nwarps / WARP_SIZE;
 
@@ -144,7 +142,6 @@ static void rms_norm_f32(const float* x, float* dst, const int ncols, const floa
     const int tid = item_ct1.get_local_id(2);
     const int nthreads = item_ct1.get_local_range(2);
     const int nwarps = nthreads / WARP_SIZE;
-    assert(nwarps % WARP_SIZE == 0);
     float tmp = 0.0f; // partial sum for thread in warp
 
     for (int col = tid; col < ncols; col += block_size) {
@@ -202,6 +199,7 @@ static void norm_f32_sycl(const float* x, float* dst, const int ncols,
     }
     else {
         const int work_group_size = ggml_sycl_info().max_work_group_sizes[device];
+        assert(work_group_size % (WARP_SIZE * WARP_SIZE) == 0);
         const sycl::range<3> block_dims(1, 1, work_group_size);
         /*
         DPCT1049:17: The work-group size passed to the SYCL kernel may exceed
@@ -244,6 +242,7 @@ static void group_norm_f32_sycl(const float* x, float* dst,
     }
     else {
         const int work_group_size = ggml_sycl_info().max_work_group_sizes[device];
+        assert(work_group_size % (WARP_SIZE * WARP_SIZE) == 0);
         const sycl::range<3> block_dims(1, 1, work_group_size);
         /*
         DPCT1049:18: The work-group size passed to the SYCL kernel may exceed
@@ -290,6 +289,7 @@ static void rms_norm_f32_sycl(const float* x, float* dst, const int ncols,
     }
     else {
         const int work_group_size = ggml_sycl_info().max_work_group_sizes[device];
+        assert(work_group_size % (WARP_SIZE * WARP_SIZE) == 0);
         const sycl::range<3> block_dims(1, 1, work_group_size);
         /*
         DPCT1049:19: The work-group size passed to the SYCL kernel may exceed

ggml/src/ggml-sycl/outprod.cpp

Lines changed: 2 additions & 1 deletion
@@ -1,4 +1,5 @@
 #include <sycl/sycl.hpp>
+#include <oneapi/mkl.hpp>
 #include "outprod.hpp"
 
 
@@ -39,7 +40,7 @@ void ggml_sycl_op_out_prod(ggml_backend_sycl_context& ctx, const ggml_tensor* sr
 
     try {
         // Perform matrix multiplication using oneMKL GEMM
-        oneapi::mkl::blas::gemm(*stream,
+        oneapi::mkl::blas::column_major::gemm(*stream,
             oneapi::mkl::transpose::nontrans, src1_op,
             ne0, ne1, ne01,
             alpha,
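
The change pins the GEMM call to oneMKL's column_major namespace. Why the layout name matters: the same flat buffer interpreted as row-major versus column-major yields transposed operands, so a GEMM told the wrong layout effectively multiplies transposed data. A small NumPy illustration of that effect (an analogy, not the SYCL path):

```python
import numpy as np

buf = np.arange(6, dtype=np.float32)        # one flat buffer of six floats
row_major = buf.reshape(2, 3)               # read as a 2x3 row-major matrix
col_major = buf.reshape(2, 3, order="F")    # same bytes read as 2x3 column-major

print(row_major)   # [[0. 1. 2.] [3. 4. 5.]]
print(col_major)   # [[0. 2. 4.] [1. 3. 5.]]
# The two interpretations differ, which is why the storage order passed to the
# BLAS routine has to match how the tensor data is actually laid out.
```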
