
Commit 5f9e96e

Merge branch 'upstream' into concedo_experimental
# Conflicts:
#	.devops/intel.Dockerfile
#	CMakeLists.txt
#	README.md
#	common/CMakeLists.txt
#	docs/multimodal.md
#	ggml/src/CMakeLists.txt
#	ggml/src/ggml-cpu/CMakeLists.txt
#	ggml/src/ggml-metal/CMakeLists.txt
#	ggml/src/ggml-sycl/CMakeLists.txt
#	ggml/src/ggml-sycl/common.hpp
#	ggml/src/ggml-sycl/cpy.cpp
#	ggml/src/ggml-sycl/gemm.hpp
#	ggml/src/ggml-sycl/ggml-sycl.cpp
#	src/llama-context.cpp
2 parents: 69e4a32 + fb85a28

18 files changed: +501 -243 lines

common/build-info.cpp.in

Lines changed: 2 additions & 2 deletions

@@ -1,4 +1,4 @@
-int LLAMA_BUILD_NUMBER = @BUILD_NUMBER@;
-char const *LLAMA_COMMIT = "@BUILD_COMMIT@";
+int LLAMA_BUILD_NUMBER = @LLAMA_BUILD_NUMBER@;
+char const *LLAMA_COMMIT = "@LLAMA_BUILD_COMMIT@";
 char const *LLAMA_COMPILER = "@BUILD_COMPILER@";
 char const *LLAMA_BUILD_TARGET = "@BUILD_TARGET@";
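For illustration only, this is roughly what the configured common/build-info.cpp could look like after CMake substitutes the renamed @LLAMA_BUILD_NUMBER@ / @LLAMA_BUILD_COMMIT@ variables; the concrete values below are invented placeholders, not taken from this commit.

// hypothetical configure_file() output with made-up values
int LLAMA_BUILD_NUMBER = 5640;
char const *LLAMA_COMMIT = "fb85a28";
char const *LLAMA_COMPILER = "clang version 17.0.6";
char const *LLAMA_BUILD_TARGET = "aarch64-apple-darwin";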

common/common.cpp

Lines changed: 1 addition & 1 deletion

@@ -474,7 +474,7 @@ size_t string_find_partial_stop(const std::string_view & str, const std::string_

 std::string regex_escape(const std::string & s) {
     static const std::regex special_chars("[.^$|()*+?\\[\\]{}\\\\]");
-    return std::regex_replace(s, special_chars, "\\$0");
+    return std::regex_replace(s, special_chars, "\\$&");
 }

 std::string string_join(const std::vector<std::string> & values, const std::string & separator) {
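A minimal standalone sketch of why this one-character change matters: in the ECMAScript replacement syntax that std::regex_replace uses by default, "$&" stands for the entire match, while "$0" is not a defined group reference (its handling can vary by implementation), so the old format string tended to emit a literal "$0" instead of the escaped character. The demo below assumes nothing beyond the standard library.

#include <iostream>
#include <regex>
#include <string>

static std::string regex_escape_demo(const std::string & s) {
    static const std::regex special_chars("[.^$|()*+?\\[\\]{}\\\\]");
    // "\\$&" = literal backslash followed by the whole matched character
    return std::regex_replace(s, special_chars, "\\$&");
}

int main() {
    std::cout << regex_escape_demo("a.b*(c)") << "\n"; // prints: a\.b\*\(c\)
}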
Lines changed: 94 additions & 0 deletions

@@ -0,0 +1,94 @@
+#include "ggml-backend-impl.h"
+
+#if defined(__aarch64__)
+
+#if defined(__linux__)
+#include <sys/auxv.h>
+#elif defined(__APPLE__)
+#include <sys/sysctl.h>
+#endif
+
+#if !defined(HWCAP2_I8MM)
+#define HWCAP2_I8MM (1 << 13)
+#endif
+
+#if !defined(HWCAP2_SME)
+#define HWCAP2_SME (1 << 23)
+#endif
+
+struct aarch64_features {
+    // has_neon not needed, aarch64 has NEON guaranteed
+    bool has_dotprod = false;
+    bool has_fp16_va = false;
+    bool has_sve = false;
+    bool has_sve2 = false;
+    bool has_i8mm = false;
+    bool has_sme = false;
+
+    aarch64_features() {
+#if defined(__linux__)
+        uint32_t hwcap = getauxval(AT_HWCAP);
+        uint32_t hwcap2 = getauxval(AT_HWCAP2);
+
+        has_dotprod = !!(hwcap & HWCAP_ASIMDDP);
+        has_fp16_va = !!(hwcap & HWCAP_FPHP);
+        has_sve = !!(hwcap & HWCAP_SVE);
+        has_sve2 = !!(hwcap2 & HWCAP2_SVE2);
+        has_i8mm = !!(hwcap2 & HWCAP2_I8MM);
+        has_sme = !!(hwcap2 & HWCAP2_SME);
+#elif defined(__APPLE__)
+        int oldp = 0;
+        size_t size = sizeof(oldp);
+
+        if (sysctlbyname("hw.optional.arm.FEAT_DotProd", &oldp, &size, NULL, 0) == 0) {
+            has_dotprod = static_cast<bool>(oldp);
+        }
+
+        if (sysctlbyname("hw.optional.arm.FEAT_I8MM", &oldp, &size, NULL, 0) == 0) {
+            has_i8mm = static_cast<bool>(oldp);
+        }
+
+        if (sysctlbyname("hw.optional.arm.FEAT_SME", &oldp, &size, NULL, 0) == 0) {
+            has_sme = static_cast<bool>(oldp);
+        }
+
+        // Apple apparently does not implement SVE yet
+#endif
+    }
+};
+
+static int ggml_backend_cpu_aarch64_score() {
+    int score = 1;
+    aarch64_features af;
+
+#ifdef GGML_USE_DOTPROD
+    if (!af.has_dotprod) { return 0; }
+    score += 1<<1;
+#endif
+#ifdef GGML_USE_FP16_VECTOR_ARITHMETIC
+    if (!af.has_fp16_va) { return 0; }
+    score += 1<<2;
+#endif
+#ifdef GGML_USE_SVE
+    if (!af.has_sve) { return 0; }
+    score += 1<<3;
+#endif
+#ifdef GGML_USE_MATMUL_INT8
+    if (!af.has_i8mm) { return 0; }
+    score += 1<<4;
+#endif
+#ifdef GGML_USE_SVE2
+    if (!af.has_sve2) { return 0; }
+    score += 1<<5;
+#endif
+#ifdef GGML_USE_SME
+    if (!af.has_sme) { return 0; }
+    score += 1<<6;
+#endif
+
+    return score;
+}
+
+GGML_BACKEND_DL_SCORE_IMPL(ggml_backend_cpu_aarch64_score)
+
+#endif // defined(__aarch64__)
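A small standalone sketch of the scoring idea above, with assumed, simplified names rather than the ggml API: every feature a CPU-backend variant was compiled with is mandatory at runtime (missing means score 0) and contributes a distinct power of two, so among the usable variants the one built with the most features presumably wins when the backend loader compares scores.

#include <cstdio>

struct cpu_features { bool dotprod; bool i8mm; };

static int score_variant(bool built_with_dotprod, bool built_with_i8mm, const cpu_features & cpu) {
    int score = 1; // a plain aarch64 build is always usable
    if (built_with_dotprod) {
        if (!cpu.dotprod) { return 0; } // compiled-in feature missing at runtime
        score += 1 << 1;
    }
    if (built_with_i8mm) {
        if (!cpu.i8mm) { return 0; }
        score += 1 << 4;
    }
    return score;
}

int main() {
    const cpu_features cpu = { /*dotprod=*/true, /*i8mm=*/true };
    std::printf("generic: %d, dotprod: %d, dotprod+i8mm: %d\n",
                score_variant(false, false, cpu),   // 1
                score_variant(true,  false, cpu),   // 3
                score_variant(true,  true,  cpu));  // 19
}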

src/llama-batch.cpp

Lines changed: 123 additions & 13 deletions

@@ -1,8 +1,13 @@
 #include "llama-batch.h"

+#include "llama-impl.h"
+#include "llama-cparams.h"
+#include "llama-vocab.h"
+
 #include <cassert>
 #include <cstring>
 #include <algorithm>
+#include <sstream>

 llama_ubatch llama_sbatch::reserve_ubatch(size_t n_ubatch, bool has_embd) {
     // clear empty sequences

@@ -105,12 +110,7 @@ void llama_sbatch::add_seq_to_ubatch(llama_ubatch & ubatch, llama_sbatch_seq & s
             ubatch.seq_id = batch->seq_id + seq.offset;
         }
     }
-    if (logits_all) {
-        for (size_t i = 0; i < length; ++i) {
-            ubatch.output[ubatch.n_tokens + i] = 1;
-            out_ids.push_back(ids[seq.offset + i]);
-        }
-    } else if (batch->logits) {
+    if (batch->logits) {
         if (ubatch.equal_seqs) {
             for (size_t i = 0; i < length; ++i) {
                 size_t id = ids[seq.offset + i];

@@ -197,11 +197,10 @@ llama_ubatch llama_sbatch::split_seq(size_t n_ubatch) {
     return ubatch;
 }

-llama_sbatch::llama_sbatch(const llama_batch & batch, size_t n_embd, bool simple_split, bool logits_all) {
+llama_sbatch::llama_sbatch(const llama_batch & batch, size_t n_embd, bool simple_split) {
     GGML_ASSERT(batch.n_tokens >= 0);
     this->batch = &batch;
     this->n_embd = n_embd;
-    this->logits_all = logits_all;

     n_tokens = batch.n_tokens;
     ids.resize(n_tokens);

@@ -285,9 +284,45 @@ llama_sbatch::llama_sbatch(const llama_batch & batch, size_t n_embd, bool simple
     );
 }

-llama_batch_allocr::llama_batch_allocr(struct llama_batch in_batch, llama_pos p0) {
-    batch = in_batch;
+llama_batch_allocr::llama_batch_allocr() {
+    const char * LLAMA_BATCH_DEBUG = getenv("LLAMA_BATCH_DEBUG");
+    debug = LLAMA_BATCH_DEBUG ? atoi(LLAMA_BATCH_DEBUG) : 0;
+}
+
+bool llama_batch_allocr::init(const llama_batch & batch_inp, const llama_vocab & vocab, llama_pos p0) {
+    clear();
+
+    batch = batch_inp;
+
     GGML_ASSERT(batch.n_tokens > 0);
+
+    if (!batch.pos) {
+        if (batch.seq_id) {
+            LLAMA_LOG_ERROR("%s: pos == NULL, but seq_id != NULL\n", __func__);
+            return false;
+        }
+    }
+
+    if (batch.token) {
+        for (int32_t i = 0; i < batch.n_tokens; ++i) {
+            if (batch.token[i] < 0 || (uint32_t) batch.token[i] >= vocab.n_tokens()) {
+                LLAMA_LOG_ERROR("%s: invalid token[%d] = %d\n", __func__, i, batch.token[i]);
+                return false;
+            }
+        }
+    }
+
+    if (batch.seq_id) {
+        for (int32_t i = 0; i < batch.n_tokens; ++i) {
+            for (int32_t s = 0; s < batch.n_seq_id[i]; ++s) {
+                if (batch.seq_id && (batch.seq_id[i][s] < 0 || batch.seq_id[i][s] >= LLAMA_MAX_PARALLEL_SEQUENCES)) {
+                    LLAMA_LOG_ERROR("%s: invalid seq_id[%d][%d] = %d > %d\n", __func__, i, s, batch.seq_id[i][s], LLAMA_MAX_PARALLEL_SEQUENCES);
+                    return false;
+                }
+            }
+        }
+    }
+
     if (!batch.pos) {
         assert(p0 >= 0);
         pos.resize(batch.n_tokens);

@@ -296,13 +331,15 @@ llama_batch_allocr::llama_batch_allocr(struct llama_batch in_batch, llama_pos p0
         }
         batch.pos = pos.data();
     }
+
     if (!batch.n_seq_id) {
         n_seq_id.resize(batch.n_tokens);
         for (int32_t i = 0; i < batch.n_tokens; i++) {
             n_seq_id[i] = seq_id_0.size();
         }
         batch.n_seq_id = n_seq_id.data();
     }
+
     if (!batch.seq_id) {
         seq_id.resize(batch.n_tokens + 1);
         seq_id[batch.n_tokens] = NULL;

@@ -311,11 +348,84 @@ llama_batch_allocr::llama_batch_allocr(struct llama_batch in_batch, llama_pos p0
         }
         batch.seq_id = seq_id.data();
     }
+
     if (!batch.logits) {
-        logits.resize(batch.n_tokens);
-        logits[logits.size() - 1] = true;
-        batch.logits = logits.data();
+        // by default return the output only for the last token
+        output.resize(batch.n_tokens);
+        output[output.size() - 1] = true;
+        batch.logits = output.data();
     }
+
+    for (int32_t i = 0; i < batch.n_tokens; ++i) {
+        n_outputs += batch.logits[i] != 0;
+    }
+
+    if (debug > 0) {
+        LLAMA_LOG_DEBUG("%s: input batch info (p0 = %d):\n", __func__, p0);
+        LLAMA_LOG_DEBUG("%s: n_tokens = %d\n", __func__, batch.n_tokens);
+        LLAMA_LOG_DEBUG("%s: token = %p\n", __func__, (void *) batch.token);
+        LLAMA_LOG_DEBUG("%s: embd = %p\n", __func__, (void *) batch.embd);
+        LLAMA_LOG_DEBUG("%s: pos = %p\n", __func__, (void *) batch.pos);
+        LLAMA_LOG_DEBUG("%s: n_seq_id = %p\n", __func__, (void *) batch.n_seq_id);
+        LLAMA_LOG_DEBUG("%s: seq_id = %p\n", __func__, (void *) batch.seq_id);
+        LLAMA_LOG_DEBUG("%s: logits = %p\n", __func__, (void *) batch.logits);
+        LLAMA_LOG_DEBUG("%s: n_outputs = %d\n", __func__, n_outputs);
+
+        if (debug > 1) {
+            int seq_id_max = 0;
+            for (int32_t i = 0; i < batch.n_tokens; ++i) {
+                for (int s = 0; s < batch.n_seq_id[i]; ++s) {
+                    for (int s = 0; s < batch.n_seq_id[i]; ++s) {
+                        seq_id_max = std::max(seq_id_max, batch.seq_id[i][s]);
+                    }
+                }
+            }
+            ++seq_id_max;
+
+            LLAMA_LOG_DEBUG("%s: token = [\n", __func__);
+            for (int32_t i = 0; i < batch.n_tokens; ++i) {
+                std::vector<int8_t> seq_id(seq_id_max);
+
+                for (int s = 0; s < batch.n_seq_id[i]; ++s) {
+                    seq_id[batch.seq_id[i][s]] = 1;
+                }
+
+                std::stringstream ss;
+                for (int s = 0; s < seq_id_max; ++s) {
+                    if (seq_id[s]) {
+                        ss << s%10;
+                    } else {
+                        ss << ".";
+                    }
+                }
+
+                LLAMA_LOG_DEBUG("%s: %4d: id = %6d (%16s), pos = %4d, n_seq_id = %2d, seq_id = [%s], output = %d\n",
+                        __func__, i, batch.token[i], vocab.token_to_piece(batch.token[i]).c_str(),
+                        batch.pos[i], batch.n_seq_id[i], ss.str().c_str(), batch.logits[i]);
+            }
+            LLAMA_LOG_DEBUG("%s: ]\n", __func__);
+        }
+    }
+
+    return true;
+}
+
+const llama_batch & llama_batch_allocr::get_batch() const {
+    return batch;
+}
+
+uint32_t llama_batch_allocr::get_n_outputs() const {
+    return n_outputs;
+}
+
+void llama_batch_allocr::clear() {
+    n_outputs = 0;
+
+    batch = {};
+    pos.clear();
+    n_seq_id.clear();
+    seq_id.clear();
+    output.clear();
 }

 //
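A minimal standalone sketch (assumed names, not the llama.cpp API) of the default-output rule added above: when the caller supplies no logits array, only the last token of the batch is marked for output, so n_outputs ends up as 1.

#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
    const int32_t n_tokens = 4;
    const int8_t * logits_in = nullptr;     // caller did not mark any outputs

    std::vector<int8_t> output;
    const int8_t * logits = logits_in;
    if (!logits) {
        output.resize(n_tokens);            // zero-initialized
        output[output.size() - 1] = 1;      // by default, output only the last token
        logits = output.data();
    }

    uint32_t n_outputs = 0;
    for (int32_t i = 0; i < n_tokens; ++i) {
        n_outputs += logits[i] != 0;
    }

    std::printf("n_outputs = %u\n", n_outputs); // prints: n_outputs = 1
}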

src/llama-batch.h

Lines changed: 23 additions & 10 deletions

@@ -18,8 +18,8 @@ struct llama_ubatch {
     llama_token * token;     // [n_tokens]
     float * embd;            // [n_embd, n_tokens]
     llama_pos * pos;         // [n_tokens]
-    int32_t * n_seq_id;      // [n_seqs] // TODO: remove, should belong to only 1 sequence
-    llama_seq_id ** seq_id;  // [n_seqs] // TODO: become llama_seq_id * seq_id;
+    int32_t * n_seq_id;      // [n_seqs]
+    llama_seq_id ** seq_id;  // [n_seqs]
     int8_t * output;         // [n_tokens]
 };

@@ -39,8 +39,6 @@ struct llama_sbatch {

     size_t n_embd;

-    bool logits_all; // TODO: remove once lctx.logits_all is removed too
-
     // sorted indices into the batch
     std::vector<int64_t> ids;
     // batch indices of the output

@@ -76,19 +74,34 @@ struct llama_sbatch {
     llama_ubatch split_seq(size_t n_ubatch);

     llama_sbatch() = default;
-    llama_sbatch(const llama_batch & batch, size_t n_embd, bool simple_split = false, bool logits_all = false);
+    llama_sbatch(const llama_batch & batch, size_t n_embd, bool simple_split = false);
 };

 // temporary allocate memory for the input batch if needed
-struct llama_batch_allocr {
-    struct llama_batch batch;
+class llama_batch_allocr {
+public:
+    llama_batch_allocr();
+
+    // optionally fulfill the batch returned by llama_batch_get_one
+    bool init(const llama_batch & batch_inp, const llama_vocab & vocab, llama_pos p0);
+
+    const llama_batch & get_batch() const;
+
+    uint32_t get_n_outputs() const;
+
+private:
+    void clear();
+
+    llama_batch batch;
+
+    uint32_t n_outputs;

     std::array<llama_seq_id, 1> seq_id_0 = { 0 }; // default sequence id
+
     std::vector<llama_pos> pos;
     std::vector<int32_t> n_seq_id;
     std::vector<llama_seq_id *> seq_id;
-    std::vector<int8_t> logits;
+    std::vector<int8_t> output;

-    // optionally fulfill the batch returned by llama_batch_get_one
-    llama_batch_allocr(struct llama_batch in_batch, llama_pos p0);
+    int debug;
 };