
Commit 4f2fcaa

Merge branch 'upstream' into concedo_experimental
# Conflicts:
#	ci/run.sh
#	ggml/src/CMakeLists.txt
#	ggml/src/ggml-cpu/CMakeLists.txt
#	ggml/src/ggml-cpu/repack.cpp
#	ggml/src/ggml-sycl/binbcast.cpp
#	ggml/src/ggml-sycl/concat.cpp
#	ggml/src/ggml-sycl/conv.cpp
#	ggml/src/ggml-sycl/convert.cpp
#	ggml/src/ggml-sycl/cpy.cpp
#	ggml/src/ggml-sycl/dmmv.cpp
#	ggml/src/ggml-sycl/dpct/helper.hpp
#	ggml/src/ggml-sycl/element_wise.cpp
#	ggml/src/ggml-sycl/getrows.cpp
#	ggml/src/ggml-sycl/ggml-sycl.cpp
#	ggml/src/ggml-sycl/gla.cpp
#	ggml/src/ggml-sycl/im2col.cpp
#	ggml/src/ggml-sycl/mmq.cpp
#	ggml/src/ggml-sycl/mmvq.cpp
#	ggml/src/ggml-sycl/norm.cpp
#	ggml/src/ggml-sycl/rope.cpp
#	ggml/src/ggml-sycl/softmax.cpp
#	ggml/src/ggml-sycl/tsembd.cpp
#	ggml/src/ggml-sycl/wkv.cpp
#	tests/test-backend-ops.cpp
2 parents: c16d672 + c959f46

File tree: 21 files changed (+443, -81 lines)


common/arg.cpp

Lines changed: 7 additions & 0 deletions
@@ -2708,6 +2708,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.embd_sep = value;
         }
     ).set_examples({LLAMA_EXAMPLE_EMBEDDING}));
+    add_opt(common_arg(
+        {"--cls-separator"}, "STRING",
+        "separator of classification sequences (default \\t) for example \"<#seq#>\"",
+        [](common_params & params, const std::string & value) {
+            params.cls_sep = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_EMBEDDING}));
     add_opt(common_arg(
         {"--host"}, "HOST",
         string_format("ip address to listen, or bind to an UNIX socket if the address ends with .sock (default: %s)", params.hostname.c_str()),
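For context, the new option follows the same common_arg pattern as the embedding separator handled in the context lines above: a flag list, a value placeholder, a help string, and a setter lambda that writes the parsed value into common_params. A minimal standalone sketch of that pattern; params_t and opt_t are illustrative stand-ins, not the actual llama.cpp types:

#include <functional>
#include <string>
#include <vector>

struct params_t { std::string cls_sep = "\t"; }; // mirrors common_params::cls_sep

struct opt_t {
    std::vector<std::string> flags; // e.g. {"--cls-separator"}
    std::string help;
    std::function<void(params_t &, const std::string &)> handler; // setter lambda
};

int main() {
    params_t params;
    opt_t cls_sep_opt = {
        {"--cls-separator"},
        "separator of classification sequences (default \\t)",
        [](params_t & p, const std::string & v) { p.cls_sep = v; },
    };
    cls_sep_opt.handler(params, "<#seq#>"); // simulate parsing --cls-separator "<#seq#>"
    return params.cls_sep == "<#seq#>" ? 0 : 1;
}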

common/common.cpp

Lines changed: 3 additions & 0 deletions
@@ -1298,6 +1298,9 @@ std::vector<llama_token> common_tokenize(
     int n_tokens = text.length() + 2 * add_special;
     std::vector<llama_token> result(n_tokens);
     n_tokens = llama_tokenize(vocab, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
+    if (n_tokens == std::numeric_limits<int32_t>::min()) {
+        throw std::runtime_error("Tokenization failed: input text too large, tokenization result exceeds int32_t limit");
+    }
     if (n_tokens < 0) {
         result.resize(-n_tokens);
         int check = llama_tokenize(vocab, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
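The new guard matters because llama_tokenize reports a too-small buffer by returning the negated required token count; a count that itself overflows int32_t can only be signaled with INT32_MIN, which the negation-and-resize path would mishandle. A minimal sketch of the two-pass idiom this protects, assuming a loaded llama_vocab:

#include <climits>
#include <stdexcept>
#include <string>
#include <vector>
#include "llama.h"

static std::vector<llama_token> tokenize_checked(const llama_vocab * vocab, const std::string & text) {
    // first pass: guess a capacity; a negative return is the required count
    std::vector<llama_token> result(text.length() + 2);
    int32_t n = llama_tokenize(vocab, text.data(), text.length(),
                               result.data(), result.size(),
                               /*add_special=*/true, /*parse_special=*/true);
    if (n == INT32_MIN) {
        throw std::runtime_error("tokenization failed: token count exceeds int32_t");
    }
    if (n < 0) {
        result.resize(-n); // second pass with the exact required size
        n = llama_tokenize(vocab, text.data(), text.length(),
                           result.data(), result.size(), true, true);
    }
    result.resize(n);
    return result;
}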

common/common.h

Lines changed: 1 addition & 0 deletions
@@ -354,6 +354,7 @@ struct common_params {
     int32_t embd_normalize = 2; // normalisation for embeddings (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)
     std::string embd_out = ""; // empty = default, "array" = [[],[]...], "json" = openai style, "json+" = same "json" + cosine similarity matrix
     std::string embd_sep = "\n"; // separator of embeddings
+    std::string cls_sep = "\t"; // separator of classification sequences
 
     // server params
     int32_t port = 8080; // server listens on this network port

convert_hf_to_gguf.py

Lines changed: 0 additions & 14 deletions
@@ -2145,7 +2145,6 @@ def __init__(self, *args, **kwargs):
 
     def set_vocab(self):
         self._set_vocab_gpt2()
-        self.gguf_writer.add_add_bos_token(True)
 
     def set_gguf_parameters(self):
         super().set_gguf_parameters()
@@ -3918,9 +3917,6 @@ def _xlmroberta_set_vocab(self) -> None:
         special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
         special_vocab.add_to_gguf(self.gguf_writer)
 
-        self.gguf_writer.add_add_bos_token(True)
-        self.gguf_writer.add_add_eos_token(True)
-
 
 @ModelBase.register("DistilBertModel", "DistilBertForMaskedLM", "DistilBertForSequenceClassification")
 class DistilBertModel(BertModel):
@@ -3962,8 +3958,6 @@ def set_vocab(self):
         bpe_tok_path = self.dir_model / "tokenizer.json"
         if bpe_tok_path.exists():
             self._set_vocab_gpt2()
-            self.gguf_writer.add_add_bos_token(True)
-            self.gguf_writer.add_add_eos_token(True)
 
         # we need this to validate the size of the token_type embeddings
         # though currently we are passing all zeros to the token_type embeddings
@@ -4848,8 +4842,6 @@ def set_vocab(self):
             self.gguf_writer.add_token_type_count(2)
         else:
             raise NotImplementedError(f'Tokenizer {tokenizer_class} is not supported for JinaBertModel')
-        self.gguf_writer.add_add_bos_token(True)
-        self.gguf_writer.add_add_eos_token(True)
 
 
 @ModelBase.register("OpenELMForCausalLM")
@@ -5451,9 +5443,6 @@ def set_vocab(self):
         special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
         special_vocab.add_to_gguf(self.gguf_writer)
 
-        self.gguf_writer.add_add_bos_token(False)
-        self.gguf_writer.add_add_eos_token(True)
-
     def set_gguf_parameters(self):
         if (n_ctx := self.find_hparam(["n_positions"], optional=True)) is None:
             logger.warning("Couldn't find context length in config.json, assuming default value of 512")
@@ -5591,9 +5580,6 @@ def set_vocab(self):
         special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
         special_vocab.add_to_gguf(self.gguf_writer)
 
-        self.gguf_writer.add_add_bos_token(False)
-        self.gguf_writer.add_add_eos_token(True)
-
     def set_gguf_parameters(self):
         if (n_ctx := self.find_hparam(["n_positions"], optional=True)) is None:
             logger.warning("Couldn't find context length in config.json, assuming default value of 512")

examples/embedding/embedding.cpp

Lines changed: 30 additions & 4 deletions
@@ -133,10 +133,36 @@ int main(int argc, char ** argv) {
     // max batch size
     const uint64_t n_batch = params.n_batch;
 
+    // get added sep and eos token, if any
+    const std::string added_sep_token = llama_vocab_get_add_sep(vocab) ? llama_vocab_get_text(vocab, llama_vocab_sep(vocab)) : "";
+    const std::string added_eos_token = llama_vocab_get_add_eos(vocab) ? llama_vocab_get_text(vocab, llama_vocab_eos(vocab)) : "";
+
     // tokenize the prompts and trim
     std::vector<std::vector<int32_t>> inputs;
     for (const auto & prompt : prompts) {
-        auto inp = common_tokenize(ctx, prompt, true, true);
+        std::vector<llama_token> inp;
+
+        // split classification pairs and insert expected separator tokens
+        if (pooling_type == LLAMA_POOLING_TYPE_RANK && prompt.find(params.cls_sep) != std::string::npos) {
+            std::vector<std::string> pairs = split_lines(prompt, params.cls_sep);
+            std::string final_prompt;
+
+            for (size_t i = 0; i < pairs.size(); i++) {
+                final_prompt += pairs[i];
+                if (i != pairs.size() - 1) {
+                    if (!added_eos_token.empty()) {
+                        final_prompt += added_eos_token;
+                    }
+                    if (!added_sep_token.empty()) {
+                        final_prompt += added_sep_token;
+                    }
+                }
+            }
+
+            inp = common_tokenize(ctx, final_prompt, true, true);
+        } else {
+            inp = common_tokenize(ctx, prompt, true, true);
+        }
         if (inp.size() > n_batch) {
             LOG_ERR("%s: number of tokens in input line (%lld) exceeds batch size (%lld), increase batch size and re-run\n",
                     __func__, (long long int) inp.size(), (long long int) n_batch);
@@ -145,11 +171,11 @@
         inputs.push_back(inp);
     }
 
-    // check if the last token is SEP
+    // check if the last token is SEP/EOS
     // it should be automatically added by the tokenizer when 'tokenizer.ggml.add_eos_token' is set to 'true'
     for (auto & inp : inputs) {
-        if (inp.empty() || inp.back() != llama_vocab_sep(vocab)) {
-            LOG_WRN("%s: last token in the prompt is not SEP\n", __func__);
+        if (inp.empty() || (inp.back() != llama_vocab_sep(vocab) && inp.back() != llama_vocab_eos(vocab))) {
+            LOG_WRN("%s: last token in the prompt is not SEP or EOS\n", __func__);
             LOG_WRN("%s: 'tokenizer.ggml.add_eos_token' should be set to 'true' in the GGUF header\n", __func__);
         }
     }
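To make the joining concrete: with rank pooling and the default "\t" separator, a prompt like "what is panda?\tThe giant panda is a bear." is flattened to "what is panda?</s><s>The giant panda is a bear." before tokenization, assuming a vocab that adds EOS "</s>" and SEP "<s>" (token spellings vary by model; these are illustrative). A condensed sketch of the join loop:

#include <string>
#include <vector>

// eos/sep are whatever the model vocab reports; empty when the vocab adds none
static std::string join_pairs(const std::vector<std::string> & parts,
                              const std::string & eos, const std::string & sep) {
    std::string out;
    for (size_t i = 0; i < parts.size(); i++) {
        out += parts[i];
        if (i + 1 < parts.size()) { // separators go between parts only
            out += eos;
            out += sep;
        }
    }
    return out;
}

// join_pairs({"what is panda?", "The giant panda is a bear."}, "</s>", "<s>")
//   -> "what is panda?</s><s>The giant panda is a bear."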
(new file: PowerPC CPU feature scoring for the ggml CPU backend; the filename is not shown on this page)

Lines changed: 82 additions & 0 deletions
@@ -0,0 +1,82 @@
+#include "ggml-backend-impl.h"
+
+#if defined(__powerpc64__) || defined(__ppc64__) || defined(__PPC64__)
+
+#if defined(__linux__)
+#include <sys/auxv.h>
+#endif
+
+#include <string>
+
+struct powerpc_features {
+    std::string platform = "";
+    int power_version = -1;
+
+    bool has_vsx = false;
+
+    powerpc_features() {
+#if defined(__linux__)
+        unsigned long auxval = getauxval(AT_PLATFORM);
+        if (auxval) {
+            platform = std::string(reinterpret_cast<const char*>(auxval));
+            // TBD: Do systems exist that return this in uppercase?
+            if (platform.substr(0, 5) == "power") {
+                // Extract a numeric suffix, if one exists
+                int vpos = -1;
+                for (int i = platform.length() - 1; i >= 0; i--) {
+                    if (std::isdigit(platform[i])) {
+                        vpos = i;
+                    } else {
+                        break;
+                    }
+                }
+                if (vpos > -1) {
+                    power_version = std::stoi(platform.substr(vpos));
+                }
+            }
+        }
+#endif
+        if (power_version >= 9) {
+            has_vsx = true;
+        }
+    }
+};
+
+static int ggml_backend_cpu_powerpc_score() {
+    int score = 1;
+    powerpc_features pf;
+
+    // Platform scores
+#if defined(GGML_USE_POWER7)
+    if (pf.power_version < 7) { return 0; }
+    score += 1<<1;
+#endif
+#if defined(GGML_USE_POWER8)
+    if (pf.power_version < 8) { return 0; }
+    score += 1<<2;
+#endif
+#if defined(GGML_USE_POWER9)
+    if (pf.power_version < 9) { return 0; }
+    score += 1<<3;
+#endif
+#if defined(GGML_USE_POWER10)
+    if (pf.power_version < 10) { return 0; }
+    score += 1<<4;
+#endif
+#if defined(GGML_USE_POWER11)
+    if (pf.power_version < 11) { return 0; }
+    score += 1<<5;
+#endif
+
+    // Feature scores
+#if defined(GGML_USE_VSX)
+    if (!pf.has_vsx) { return 0; }
+    score += 1<<6;
+#endif
+
+    return score;
+}
+
+GGML_BACKEND_DL_SCORE_IMPL(ggml_backend_cpu_powerpc_score)
+
+#endif // defined(__powerpc64__) || defined(__ppc64__) || defined(__PPC64__)
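On Linux, getauxval(AT_PLATFORM) yields a platform string such as "power9" or "power10"; the constructor scans digits from the right to recover the generation, and the score function then awards one bit per compiled-in GGML_USE_POWERn level, so the most specific backend variant that can still run on the host wins. A standalone sketch of the suffix parse; the sample inputs are assumptions, not from kernel documentation:

#include <cctype>
#include <cstdio>
#include <string>

// returns the POWER generation encoded in an AT_PLATFORM string, or -1
static int power_generation(const std::string & platform) {
    if (platform.substr(0, 5) != "power") {
        return -1; // e.g. "ppc64le" carries no generation suffix
    }
    int vpos = -1;
    for (int i = (int) platform.length() - 1; i >= 0; i--) {
        if (std::isdigit((unsigned char) platform[i])) {
            vpos = i; // extend the numeric suffix leftwards
        } else {
            break;
        }
    }
    return vpos > -1 ? std::stoi(platform.substr(vpos)) : -1;
}

int main() {
    printf("%d %d %d\n",
           power_generation("power10"), // 10
           power_generation("power9"),  // 9
           power_generation("ppc64le"));// -1
    return 0;
}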

ggml/src/ggml-cpu/repack.cpp

Lines changed: 49 additions & 34 deletions
@@ -1180,13 +1180,24 @@ template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PAR
         // not really a GGML_TYPE_Q8_0 but same size.
         switch (op->op) {
             case GGML_OP_MUL_MAT:
-                size = ggml_row_size(PARAM_TYPE, ggml_nelements(op->src[1]));
-                return true;
+                {
+                    size = ggml_row_size(PARAM_TYPE, ggml_nelements(op->src[1]));
+                    return true;
+                }
             case GGML_OP_MUL_MAT_ID:
-                size = ggml_row_size(PARAM_TYPE, ggml_nelements(op->src[1]));
-                size = GGML_PAD(size, sizeof(int64_t)); // + padding for next block.
-                size += sizeof(int64_t) * (1+op->src[0]->ne[2]) * op->src[1]->ne[2];
-                return true;
+                {
+                    size = ggml_row_size(PARAM_TYPE, ggml_nelements(op->src[1]));
+                    size = GGML_PAD(size, sizeof(int64_t)); // + padding for next block.
+
+                    const int64_t ne02 = op->src[0]->ne[2]; // n_as, n_expert
+                    const int64_t ne12 = op->src[1]->ne[2]; // n_tokens
+
+                    const size_t sizeof_mmid_row_mapping = sizeof(int64_t);
+
+                    size += sizeof_mmid_row_mapping*ne02*(ne12 + 1);
+
+                    return true;
+                }
             default:
                 // GGML_ABORT("fatal error");
                 break;
@@ -1322,14 +1333,17 @@ template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PAR
            int32_t i2;
        };
 
-       GGML_ASSERT(params->wsize >= (GGML_PAD(nbw3, sizeof(int64_t)) + n_as * sizeof(int64_t) +
-                                     n_as * ne12 * sizeof(mmid_row_mapping)));
+       GGML_ASSERT(params->wsize >=
+               (GGML_PAD(nbw3, sizeof(int64_t)) +
+                n_as*(ne12 + 1)*sizeof(mmid_row_mapping))
+               );
 
-       auto * wdata = (char *) params->wdata;
-       auto * wdata_src1_end = (char *) wdata + GGML_PAD(nbw3, sizeof(int64_t));
-       auto * matrix_row_counts = (int64_t *) (wdata_src1_end); // [n_as]
+       auto * wdata = (char *)params->wdata;
+       auto * wdata_src1_end = (char *)wdata + GGML_PAD(nbw3, sizeof(int64_t));
 
-       struct mmid_row_mapping * matrix_rows = (struct mmid_row_mapping *) (matrix_row_counts + n_as); // [n_as][ne12]
+       // total of [n_as][ne12 + 1] elements of type mmid_row_mapping (2*int32_t = int64_t)
+       auto * matrix_row_counts = (int64_t *) (wdata_src1_end); // [n_as]
+       struct mmid_row_mapping * matrix_rows = (struct mmid_row_mapping *) (matrix_row_counts + n_as); // [n_as][ne12]
 
        // src1: float32 => param type
        for (int64_t i12 = 0; i12 < ne12; ++i12) {
@@ -1414,15 +1428,6 @@ template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PAR
     }
 };
 
-// instance for Q4
-static const tensor_traits<block_q4_0, 4, 4, GGML_TYPE_Q8_0> q4_0_4x4_q8_0;
-static const tensor_traits<block_q4_0, 8, 4, GGML_TYPE_Q8_0> q4_0_4x8_q8_0;
-static const tensor_traits<block_q4_0, 8, 8, GGML_TYPE_Q8_0> q4_0_8x8_q8_0;
-static const tensor_traits<block_q4_K, 8, 8, GGML_TYPE_Q8_K> q4_K_8x8_q8_K;
-
-// instance for IQ4
-static const tensor_traits<block_iq4_nl, 4, 4, GGML_TYPE_Q8_0> iq4_nl_4x4_q8_0;
-
 } // namespace ggml::cpu::repack
 
 static void flag_aarch_prepacked_quant(int type)
@@ -1435,55 +1440,65 @@
 }
 
 static const ggml::cpu::tensor_traits * ggml_repack_get_optimal_repack_type(const struct ggml_tensor * cur) {
+
+    // instance for Q4
+    static const ggml::cpu::repack::tensor_traits<block_q4_0, 4, 4, GGML_TYPE_Q8_0> q4_0_4x4_q8_0;
+    static const ggml::cpu::repack::tensor_traits<block_q4_0, 8, 4, GGML_TYPE_Q8_0> q4_0_4x8_q8_0;
+    static const ggml::cpu::repack::tensor_traits<block_q4_0, 8, 8, GGML_TYPE_Q8_0> q4_0_8x8_q8_0;
+    static const ggml::cpu::repack::tensor_traits<block_q4_K, 8, 8, GGML_TYPE_Q8_K> q4_K_8x8_q8_K;
+
+    // instance for IQ4
+    static const ggml::cpu::repack::tensor_traits<block_iq4_nl, 4, 4, GGML_TYPE_Q8_0> iq4_nl_4x4_q8_0;
+
     if (cur->type == GGML_TYPE_Q4_0) {
         //we shall just use the regular avx2 handling, no repacking
         if (/*ggml_cpu_has_avx2() ||*/ (ggml_cpu_has_sve() && ggml_cpu_has_matmul_int8() && ggml_cpu_get_sve_cnt() == QK8_0)) {
             if (cur->ne[1] % 8 == 0) {
-                return &ggml::cpu::repack::q4_0_8x8_q8_0;
+                return &q4_0_8x8_q8_0;
             }
         }
         if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) {
             if (cur->ne[1] % 4 == 0) {
-                return &ggml::cpu::repack::q4_0_4x8_q8_0;
+                return &q4_0_4x8_q8_0;
             }
         }
         if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) {
             if (cur->ne[1] % 4 == 0) {
-                return &ggml::cpu::repack::q4_0_4x4_q8_0;
+                return &q4_0_4x4_q8_0;
            }
        }
    } else if (cur->type == GGML_TYPE_Q4_K) {
-        // if (ggml_cpu_has_avx2()) { //we shall just use the regular avx2 handling, no repacking otherwise massive slowdown with gpu
-        //     if (cur->ne[1] % 8 == 0) {
-        //         return &ggml::cpu::aarch64::q4_K_8x8_q8_K;
-        //     }
-        // }
+        // if (ggml_cpu_has_avx2()) {
+        //     if (cur->ne[1] % 8 == 0) {
+        //         return &q4_K_8x8_q8_K;
+        //     }
+        // }
    } else if (cur->type == GGML_TYPE_IQ4_NL) {
        if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) {
            if (cur->ne[1] % 4 == 0) {
-                return &ggml::cpu::repack::iq4_nl_4x4_q8_0;
+                return &iq4_nl_4x4_q8_0;
            }
        }
    }
    else if (cur->type == GGML_TYPE_Q4_0_4_4) //kcpp backport old quant support
    {
        flag_aarch_prepacked_quant(cur->type);
-        return &ggml::cpu::repack::q4_0_4x4_q8_0;
+        return &q4_0_4x4_q8_0;
    }
    else if (cur->type == GGML_TYPE_Q4_0_4_8)
    {
        flag_aarch_prepacked_quant(cur->type);
-        return &ggml::cpu::repack::q4_0_4x8_q8_0;
+        return &q4_0_4x8_q8_0;
    }
    else if (cur->type == GGML_TYPE_Q4_0_8_8)
    {
        flag_aarch_prepacked_quant(cur->type);
-        return &ggml::cpu::repack::q4_0_8x8_q8_0;
+        return &q4_0_8x8_q8_0;
    }
    else if (cur->type == GGML_TYPE_IQ4_NL)
    {
        flag_aarch_prepacked_quant(cur->type);
-        return &ggml::cpu::repack::iq4_nl_4x4_q8_0;
+        return &iq4_nl_4x4_q8_0;
    }
 
    return nullptr;
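A worked check of the new MUL_MAT_ID workspace math: after padding the quantized src1 data to an int64_t boundary, the code reserves ne02*(ne12 + 1) mapping-sized entries, one counts slot per expert plus ne12 row mappings each, which is exactly what the updated GGML_ASSERT demands. A small sketch with illustrative shapes (8 experts, 32 tokens; row_bytes stands in for ggml_row_size):

#include <cstdint>
#include <cstdio>

#define PAD(x, n) (((x) + (n) - 1) & ~((n) - 1)) // same rounding as GGML_PAD

int main() {
    const size_t  row_bytes = 4096;            // quantized src1, from ggml_row_size(...)
    const int64_t ne02      = 8;               // n_as, number of experts
    const int64_t ne12      = 32;              // n_tokens
    const size_t  mapping   = sizeof(int64_t); // mmid_row_mapping = 2 * int32_t

    size_t size = PAD(row_bytes, sizeof(int64_t)); // + padding for the next block
    size += mapping * ne02 * (ne12 + 1);           // counts row + per-token mappings
    printf("wsize >= %zu bytes\n", size);          // 4096 + 8*33*8 = 6208
    return 0;
}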
