
Commit 9640750

Merge branch 'upstream' into concedo_experimental
# Conflicts:
#	README.md
#	examples/llama-bench/llama-bench.cpp
#	examples/llama.android/llama/src/main/cpp/llama-android.cpp
#	examples/llama.android/llama/src/main/java/android/llama/cpp/LLamaAndroid.kt
#	src/llama-vocab.cpp
#	tests/test-backend-ops.cpp
2 parents: e8570de + 3edfa7d


43 files changed: +15438, -439 lines

README.md

Lines changed: 3 additions & 3 deletions

```diff
@@ -48,9 +48,9 @@ KoboldCpp can now also be run on Novita AI, a newer alternative GPU cloud provid
 
 ## Obtaining a GGUF model
 - KoboldCpp uses GGUF models. They are not included with KoboldCpp, but you can download GGUF files from other places such as [TheBloke's Huggingface](https://huggingface.co/TheBloke). Search for "GGUF" on huggingface.co for plenty of compatible models in the `.gguf` format.
-- For beginners, we recommend the models [BookAdventures 8B](https://huggingface.co/KoboldAI/Llama-3.1-8B-BookAdventures-GGUF/resolve/main/Llama-3.1-8B-BookAdventures.Q4_K_S.gguf) or [Tiefighter 13B](https://huggingface.co/KoboldAI/LLaMA2-13B-Tiefighter-GGUF/resolve/main/LLaMA2-13B-Tiefighter.Q4_K_S.gguf) (larger model).
+- For beginners, we recommend the models [Airoboros Mistral 7B](https://huggingface.co/TheBloke/airoboros-mistral2.2-7B-GGUF/resolve/main/airoboros-mistral2.2-7b.Q4_K_S.gguf) (smaller and weaker) or [Tiefighter 13B](https://huggingface.co/KoboldAI/LLaMA2-13B-Tiefighter-GGUF/resolve/main/LLaMA2-13B-Tiefighter.Q4_K_S.gguf) (larger model) or [Beepo 22B](https://huggingface.co/concedo/Beepo-22B-GGUF/resolve/main/Beepo-22B-Q4_K_S.gguf) (largest and most powerful)
 - [Alternatively, you can download the tools to convert models to the GGUF format yourself here](https://kcpptools.concedo.workers.dev). Run `convert-hf-to-gguf.py` to convert them, then `quantize_gguf.exe` to quantize the result.
-- Other models for Whisper (speech recognition), Image Generation or Image Recognition [can be found on the Wiki](https://github.com/LostRuins/koboldcpp/wiki#what-models-does-koboldcpp-support-what-architectures-are-supported)
+- Other models for Whisper (speech recognition), Image Generation, Text to Speech or Image Recognition [can be found on the Wiki](https://github.com/LostRuins/koboldcpp/wiki#what-models-does-koboldcpp-support-what-architectures-are-supported)
 
 ## Improving Performance
 - **GPU Acceleration**: If you're on Windows with an Nvidia GPU you can get CUDA support out of the box using the `--usecublas` flag (Nvidia Only), or `--usevulkan` (Any GPU), make sure you select the correct .exe with CUDA support.
@@ -172,7 +172,7 @@ when you can't use the precompiled binary directly, we provide an automated buil
 
 # Where can I download AI model files?
 - The best place to get GGUF text models is huggingface. For image models, CivitAI has a good selection. Here are some to get started.
-- Text Generation: [BookAdventures 8B](https://huggingface.co/KoboldAI/Llama-3.1-8B-BookAdventures-GGUF/resolve/main/Llama-3.1-8B-BookAdventures.Q4_K_S.gguf) or [Tiefighter 13B](https://huggingface.co/KoboldAI/LLaMA2-13B-Tiefighter-GGUF/resolve/main/LLaMA2-13B-Tiefighter.Q4_K_S.gguf) (larger model).
+- Text Generation: [Airoboros Mistral 7B](https://huggingface.co/TheBloke/airoboros-mistral2.2-7B-GGUF/resolve/main/airoboros-mistral2.2-7b.Q4_K_S.gguf) (smaller and weaker) or [Tiefighter 13B](https://huggingface.co/KoboldAI/LLaMA2-13B-Tiefighter-GGUF/resolve/main/LLaMA2-13B-Tiefighter.Q4_K_S.gguf) (larger model) or [Beepo 22B](https://huggingface.co/concedo/Beepo-22B-GGUF/resolve/main/Beepo-22B-Q4_K_S.gguf) (largest and most powerful)
 - Image Generation: [Anything v3](https://huggingface.co/admruul/anything-v3.0/resolve/main/Anything-V3.0-pruned-fp16.safetensors) or [Deliberate V2](https://huggingface.co/Yntec/Deliberate2/resolve/main/Deliberate_v2.safetensors) or [Dreamshaper SDXL](https://huggingface.co/Lykon/dreamshaper-xl-v2-turbo/resolve/main/DreamShaperXL_Turbo_v2_1.safetensors)
 - Image Recognition MMproj: [Pick the correct one for your model architecture here](https://huggingface.co/koboldcpp/mmproj/tree/main)
 - Speech Recognition: [Whisper models for Speech-To-Text](https://huggingface.co/koboldcpp/whisper/tree/main)
```

common/arg.cpp

Lines changed: 26 additions & 1 deletion

```diff
@@ -377,6 +377,30 @@ static std::vector<ggml_backend_dev_t> parse_device_list(const std::string & val
     return devices;
 }
 
+static void add_rpc_devices(std::string servers) {
+    auto rpc_servers = string_split<std::string>(servers, ',');
+    if (rpc_servers.empty()) {
+        throw std::invalid_argument("no RPC servers specified");
+    }
+    ggml_backend_reg_t rpc_reg = ggml_backend_reg_by_name("RPC");
+    if (!rpc_reg) {
+        throw std::invalid_argument("failed to find RPC backend");
+    }
+    typedef ggml_backend_dev_t (*ggml_backend_rpc_add_device_t)(const char * endpoint);
+    ggml_backend_rpc_add_device_t ggml_backend_rpc_add_device_fn = (ggml_backend_rpc_add_device_t) ggml_backend_reg_get_proc_address(rpc_reg, "ggml_backend_rpc_add_device");
+    if (!ggml_backend_rpc_add_device_fn) {
+        throw std::invalid_argument("failed to find RPC device add function");
+    }
+    for (const auto & server : rpc_servers) {
+        ggml_backend_dev_t dev = ggml_backend_rpc_add_device_fn(server.c_str());
+        if (dev) {
+            ggml_backend_device_register(dev);
+        } else {
+            throw std::invalid_argument("failed to register RPC device");
+        }
+    }
+}
+
 bool common_params_parse(int argc, char ** argv, common_params & params, llama_example ex, void(*print_usage)(int, char **)) {
     auto ctx_arg = common_params_parser_init(params, ex, print_usage);
     const common_params params_org = ctx_arg.params; // the example can modify the default params
@@ -1386,7 +1410,8 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         {"--rpc"}, "SERVERS",
         "comma separated list of RPC servers",
         [](common_params & params, const std::string & value) {
-            params.rpc_servers = value;
+            add_rpc_devices(value);
+            GGML_UNUSED(params);
         }
     ).set_env("LLAMA_ARG_RPC"));
     }
```
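In practical terms, `--rpc` endpoints are now registered as regular ggml backend devices instead of being carried around as a string in `common_params`. As a rough illustration (not part of the commit), the sketch below repeats the same proc-address lookup that `add_rpc_devices` uses and then enumerates the device registry; it assumes a build with the RPC backend compiled in and uses a hypothetical endpoint address.

```cpp
// Illustrative sketch only: register one RPC device, then list everything the
// ggml device registry knows about. The endpoint below is hypothetical, and
// ggml_backend_reg_by_name("RPC") only succeeds if the RPC backend is built in.
#include "ggml-backend.h"
#include <cstdio>

int main() {
    ggml_backend_reg_t rpc_reg = ggml_backend_reg_by_name("RPC");
    if (rpc_reg) {
        typedef ggml_backend_dev_t (*rpc_add_device_t)(const char * endpoint);
        auto add_device = (rpc_add_device_t) ggml_backend_reg_get_proc_address(rpc_reg, "ggml_backend_rpc_add_device");
        if (add_device) {
            ggml_backend_dev_t dev = add_device("127.0.0.1:50052"); // hypothetical rpc-server endpoint
            if (dev) {
                ggml_backend_device_register(dev); // now part of the public ggml-backend.h API (see below)
            }
        }
    }

    // RPC endpoints show up here alongside CPU/GPU devices once registered.
    for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
        ggml_backend_dev_t dev = ggml_backend_dev_get(i);
        printf("device %zu: %s - %s\n", i, ggml_backend_dev_name(dev), ggml_backend_dev_description(dev));
    }
    return 0;
}
```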

common/common.cpp

Lines changed: 0 additions & 1 deletion

```diff
@@ -1045,7 +1045,6 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
     if (params.n_gpu_layers != -1) {
         mparams.n_gpu_layers = params.n_gpu_layers;
     }
-    mparams.rpc_servers = params.rpc_servers.c_str();
     mparams.main_gpu = params.main_gpu;
     mparams.split_mode = params.split_mode;
     mparams.tensor_split = params.tensor_split;
```

common/common.h

Lines changed: 0 additions & 1 deletion

```diff
@@ -244,7 +244,6 @@ struct common_params {
     std::string lookup_cache_static = ""; // path of static ngram cache file for lookup decoding // NOLINT
     std::string lookup_cache_dynamic = ""; // path of dynamic ngram cache file for lookup decoding // NOLINT
     std::string logits_file = ""; // file for saving *all* logits // NOLINT
-    std::string rpc_servers = ""; // comma separated list of RPC servers // NOLINT
 
     std::vector<std::string> in_files; // all input files
     std::vector<std::string> antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts)
```

convert_hf_to_gguf.py

Lines changed: 60 additions & 0 deletions

```diff
@@ -2882,6 +2882,66 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
         return [(self.map_tensor_name(name), data_torch)]
 
 
+@Model.register("InternLM3ForCausalLM")
+class InternLM3Model(Model):
+    model_arch = gguf.MODEL_ARCH.LLAMA
+
+    def set_vocab(self):
+        tokens, scores, toktypes = self._create_vocab_sentencepiece()
+
+        self.gguf_writer.add_tokenizer_model("llama")
+        self.gguf_writer.add_tokenizer_pre("default")
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_scores(scores)
+        self.gguf_writer.add_token_types(toktypes)
+
+        special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
+
+        tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
+        if tokenizer_config_file.is_file():
+            with open(tokenizer_config_file, "r", encoding="utf-8") as f:
+                tokenizer_config_json = json.load(f)
+                if "add_prefix_space" in tokenizer_config_json:
+                    self.gguf_writer.add_add_space_prefix(tokenizer_config_json["add_prefix_space"])
+
+                if "added_tokens_decoder" in tokenizer_config_json:
+                    for token_id, token_data in tokenizer_config_json["added_tokens_decoder"].items():
+                        if token_data.get("special"):
+                            token_id = int(token_id)
+                            token = token_data["content"]
+                            special_vocab._set_special_token(token, token_id)
+                            # update eos token
+                            if token == '<|im_end|>' and "eos" in special_vocab.special_token_ids:
+                                special_vocab.special_token_ids["eos"] = token_id
+
+        special_vocab.add_to_gguf(self.gguf_writer)
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        hparams = self.hparams
+        self.gguf_writer.add_vocab_size(hparams["vocab_size"])
+
+        if "head_dim" in hparams:
+            rope_dim = hparams["head_dim"]
+        else:
+            rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
+        self.gguf_writer.add_rope_dimension_count(rope_dim)
+
+        if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
+            if self.hparams["rope_scaling"].get("type") == "linear" or self.hparams["rope_scaling"].get("rope_type") == "linear":
+                self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
+                self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        n_head = self.hparams["num_attention_heads"]
+        n_kv_head = self.hparams.get("num_key_value_heads")
+        if name.endswith(("q_proj.weight", "q_proj.bias")):
+            data_torch = LlamaModel.permute(data_torch, n_head, n_head)
+        if name.endswith(("k_proj.weight", "k_proj.bias")):
+            data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head)
+        return [(self.map_tensor_name(name), data_torch)]
+
+
 @Model.register("BertModel", "BertForMaskedLM", "CamembertModel")
 class BertModel(Model):
     model_arch = gguf.MODEL_ARCH.BERT
```

ggml/include/ggml-backend.h

Lines changed: 2 additions & 0 deletions

```diff
@@ -203,6 +203,8 @@ extern "C" {
     // Backend registry
     //
 
+    GGML_API void ggml_backend_device_register(ggml_backend_dev_t device);
+
     // Backend (reg) enumeration
     GGML_API size_t ggml_backend_reg_count(void);
     GGML_API ggml_backend_reg_t ggml_backend_reg_get(size_t index);
```

ggml/include/ggml.h

Lines changed: 8 additions & 4 deletions

```diff
@@ -1397,16 +1397,20 @@ extern "C" {
             float                 scale,
             float                 max_bias);
 
-    GGML_API struct ggml_tensor * ggml_soft_max_back(
+    GGML_API struct ggml_tensor * ggml_soft_max_ext_back(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
-            struct ggml_tensor  * b);
+            struct ggml_tensor  * b,
+            float                 scale,
+            float                 max_bias);
 
     // in-place, returns view(a)
-    GGML_API struct ggml_tensor * ggml_soft_max_back_inplace(
+    GGML_API struct ggml_tensor * ggml_soft_max_ext_back_inplace(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
-            struct ggml_tensor  * b);
+            struct ggml_tensor  * b,
+            float                 scale,
+            float                 max_bias);
 
     // rotary position embedding
     // if (mode & 1) - skip n_past elements (NOT SUPPORTED)
```
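The backward softmax op now carries the same `scale` and `max_bias` arguments as the forward `ggml_soft_max_ext`. Below is a minimal construction sketch (not from the commit); it assumes the argument convention of the old `ggml_soft_max_back`, i.e. `a` is the upstream gradient and `b` is the forward softmax output, and it only builds the ops without running a graph.

```cpp
// Sketch: pairing ggml_soft_max_ext with the renamed ggml_soft_max_ext_back.
// Assumes `a` = upstream gradient and `b` = forward output, matching the old op.
#include "ggml.h"

int main() {
    struct ggml_init_params ip = { /*mem_size*/ 16*1024*1024, /*mem_buffer*/ NULL, /*no_alloc*/ false };
    struct ggml_context * ctx = ggml_init(ip);

    const float scale    = 0.125f; // e.g. 1/sqrt(head_dim) in attention
    const float max_bias = 0.0f;   // 0 disables the ALiBi bias

    struct ggml_tensor * logits = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 64, 4);
    struct ggml_tensor * probs  = ggml_soft_max_ext(ctx, logits, /*mask*/ NULL, scale, max_bias);

    struct ggml_tensor * dprobs  = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 64, 4); // upstream gradient
    struct ggml_tensor * dlogits = ggml_soft_max_ext_back(ctx, dprobs, probs, scale, max_bias);

    (void) dlogits; // graph building and execution are omitted in this sketch
    ggml_free(ctx);
    return 0;
}
```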

ggml/src/ggml-alloc.c

Lines changed: 5 additions & 0 deletions

```diff
@@ -37,6 +37,7 @@ static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml
     return true;
 }
 
+// ops that return true for this function must not use restrict pointers for their backend implementations
 static bool ggml_op_can_inplace(enum ggml_op op) {
     switch (op) {
         case GGML_OP_SCALE:
@@ -52,8 +53,12 @@ static bool ggml_op_can_inplace(enum ggml_op op) {
         case GGML_OP_LOG:
         case GGML_OP_UNARY:
         case GGML_OP_ROPE:
+        case GGML_OP_ROPE_BACK:
+        case GGML_OP_SILU_BACK:
         case GGML_OP_RMS_NORM:
+        case GGML_OP_RMS_NORM_BACK:
         case GGML_OP_SOFT_MAX:
+        case GGML_OP_SOFT_MAX_BACK:
             return true;
 
         default:
```
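The new comment above `ggml_op_can_inplace` encodes a real constraint: once an op is allowed to run in place, its destination buffer may alias its source, which contradicts any `restrict` qualifier in the kernel. A hypothetical pair of kernels (not from the commit) illustrates the difference:

```cpp
// Hypothetical kernels, for illustration only.
#include <cstddef>

// Written with restrict: the compiler may assume dst and src never overlap,
// so calling this with dst == src (in place) is undefined behaviour.
static void scale_f32_noalias(float * __restrict dst, const float * __restrict src, size_t n, float s) {
    for (size_t i = 0; i < n; ++i) {
        dst[i] = src[i] * s;
    }
}

// Without restrict: safe to call with dst == src, so the allocator may reuse
// the input buffer for the output, which is what ggml_op_can_inplace() permits.
static void scale_f32_inplace_ok(float * dst, const float * src, size_t n, float s) {
    for (size_t i = 0; i < n; ++i) {
        dst[i] = src[i] * s;
    }
}
```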

ggml/src/ggml-backend-impl.h

Lines changed: 0 additions & 1 deletion

```diff
@@ -208,7 +208,6 @@ extern "C" {
 
     // Internal backend registry API
     GGML_API void ggml_backend_register(ggml_backend_reg_t reg);
-    GGML_API void ggml_backend_device_register(ggml_backend_dev_t device);
 
     // Add backend dynamic loading support to the backend
 
```
ggml/src/ggml-cpu/ggml-cpu-quants.c

Lines changed: 82 additions & 1 deletion

```diff
@@ -5574,7 +5574,88 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * r
 
     uint32_t utmp[4];
 
-#ifdef __ARM_NEON
+#ifdef __ARM_FEATURE_SVE
+    float sumf = 0;
+    for (int i = 0; i < nb; ++i) {
+
+        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
+        const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
+
+        const int16x8_t q8sums = vpaddq_s16(vld1q_s16(y[i].bsums), vld1q_s16(y[i].bsums + 8));
+
+        memcpy(utmp, x[i].scales, K_SCALE_SIZE);
+
+        uint32x2_t mins8 = { 0 };
+        mins8 = vset_lane_u32(utmp[1] & kmask1, mins8, 0);
+        mins8 = vset_lane_u32(((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4), mins8, 1);
+
+        utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
+        utmp[0] &= kmask1;
+
+        const int16x8_t mins = vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(mins8)));
+        const int32x4_t prod = vaddq_s32(vmull_s16(vget_low_s16 (q8sums), vget_low_s16 (mins)),
+                                         vmull_s16(vget_high_s16(q8sums), vget_high_s16(mins)));
+        sumf -= dmin * vaddvq_s32(prod);
+
+        const uint8_t * scales = (const uint8_t *)utmp;
+
+        const uint8_t * restrict q4 = x[i].qs;
+        const int8_t * restrict q8 = y[i].qs;
+
+        const int vector_length = ggml_cpu_get_sve_cnt()*8;
+        const svuint8_t m4b = svdup_n_u8(0xf);
+        const svint32_t mzero = svdup_n_s32(0);
+        svint32_t sumi1 = svdup_n_s32(0);
+        svint32_t sumi1_1 = svdup_n_s32(0);
+        svint32_t sumi1_2 = svdup_n_s32(0);
+        svint32_t sumi2 = svdup_n_s32(0);
+        svint32_t sumi2_1 = svdup_n_s32(0);
+        svint32_t sumi2_2 = svdup_n_s32(0);
+        switch (vector_length) {
+            case 128:
+                {
+                    for (int j = 0; j < QK_K/64; ++j) {
+                        svint8_t q4bytes = svreinterpret_s8_u8(svand_u8_x(svptrue_b8(), svld1_u8(svptrue_b8(), q4), m4b));
+                        svint8_t q8bytes = svld1_s8(svptrue_b8(), q8); q8 += 16;
+                        sumi1_1 = svmla_n_s32_x(svptrue_b32(), sumi1_1, svdot_s32(mzero, q4bytes, q8bytes), scales[2*j+0]);
+                        q4bytes = svreinterpret_s8_u8(svand_u8_x(svptrue_b8(), svld1_u8(svptrue_b8(), q4+16), m4b));
+                        q8bytes = svld1_s8(svptrue_b8(), q8); q8 += 16;
+                        sumi1_2 = svmla_n_s32_x(svptrue_b32(), sumi1_2, svdot_s32(mzero, q4bytes, q8bytes), scales[2*j+0]);
+
+                        q4bytes = svreinterpret_s8_u8(svlsr_n_u8_x(svptrue_b8(), svld1_u8(svptrue_b8(), q4), 4));
+                        q8bytes = svld1_s8(svptrue_b8(), q8); q8 += 16;
+                        sumi2_1 = svmla_n_s32_x(svptrue_b32(), sumi2_1, svdot_s32(mzero, q4bytes, q8bytes), scales[2*j+1]);
+                        q4bytes = svreinterpret_s8_u8(svlsr_n_u8_x(svptrue_b8(), svld1_u8(svptrue_b8(), q4+16), 4));
+                        q8bytes = svld1_s8(svptrue_b8(), q8); q8 += 16;
+                        sumi2_2 = svmla_n_s32_x(svptrue_b32(), sumi2_2, svdot_s32(mzero, q4bytes, q8bytes), scales[2*j+1]);
+                        q4 += 32;
+                    }
+                    sumi1 = svadd_s32_x(svptrue_b32(), sumi1_1, sumi1_2);
+                    sumi2 = svadd_s32_x(svptrue_b32(), sumi2_1, sumi2_2);
+                    sumf += d * (svaddv_s32(svptrue_b32(), svadd_s32_x(svptrue_b32(), sumi1, sumi2)));
+                } break;
+            case 256:
+            case 512:
+                {
+                    for (int j = 0; j < QK_K/64; ++j) {
+                        const svuint8_t q4bits = svld1_u8(svptrue_pat_b8(SV_VL32), q4); q4 += 32;
+                        svint8_t q4bytes = svreinterpret_s8_u8(svand_u8_x(svptrue_pat_b8(SV_VL32), q4bits, m4b));
+                        svint8_t q8bytes = svld1_s8(svptrue_pat_b8(SV_VL32), q8); q8 += 32;
+                        sumi1 = svmla_n_s32_x(svptrue_pat_b32(SV_VL8), sumi1, svdot_s32(mzero, q4bytes, q8bytes), scales[2*j+0]);
+
+                        q4bytes = svreinterpret_s8_u8(svlsr_n_u8_x(svptrue_pat_b8(SV_VL32), q4bits, 4));
+                        q8bytes = svld1_s8(svptrue_pat_b8(SV_VL32), q8); q8 += 32;
+                        sumi2 = svmla_n_s32_x(svptrue_pat_b32(SV_VL8), sumi2, svdot_s32(mzero, q4bytes, q8bytes), scales[2*j+1]);
+                    }
+                    sumf += d * (svaddv_s32(svptrue_pat_b32(SV_VL8), svadd_s32_x(svptrue_pat_b32(SV_VL8), sumi1, sumi2)));
+                } break;
+            default:
+                assert(false && "Unsupported vector length");
+                break;
+        }
+    }
+    *s = sumf;
+#elif __ARM_NEON
     const uint8x16_t m4b = vdupq_n_u8(0xf);
     const int32x4_t mzero = vdupq_n_s32(0);
 
```
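The new SVE path dispatches on the runtime vector width (`ggml_cpu_get_sve_cnt()*8`): a 128-bit variant using 16-byte loads and split accumulators, and a predicated 32-byte-per-load variant shared by 256-bit and 512-bit machines. As a standalone illustration (not from the commit, and assuming `ggml_cpu_get_sve_cnt()` reports the same byte count as `svcntb()`), the probe below shows which branch a given CPU would take; build with SVE enabled, e.g. `-march=armv8-a+sve`.

```cpp
// Standalone probe of the SVE register width used for the dispatch above.
#include <arm_sve.h>
#include <cstdio>

int main() {
    const int vector_length = (int) svcntb() * 8; // SVE register width in bits
    switch (vector_length) {
        case 128:
            printf("128-bit SVE: 16-byte loads, split accumulators (sumi1_1/sumi1_2, sumi2_1/sumi2_2)\n");
            break;
        case 256:
        case 512:
            printf("%d-bit SVE: 32-byte predicated loads (SV_VL32 / SV_VL8)\n", vector_length);
            break;
        default:
            printf("vector length %d bits: not handled by the Q4_K x Q8_K SVE path\n", vector_length);
            break;
    }
    return 0;
}
```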
