
Commit 2ef3cc9

Merge remote-tracking branch 'upstream/master' into dry-sampling-post-refactor
2 parents 875ff55 + cda0e4b commit 2ef3cc9

43 files changed: +8096 additions, -4522 deletions


CMakeLists.txt

Lines changed: 4 additions & 0 deletions

@@ -88,6 +88,10 @@ if (NOT DEFINED GGML_LLAMAFILE)
     set(GGML_LLAMAFILE_DEFAULT ON)
 endif()
 
+if (NOT DEFINED GGML_AMX)
+    set(GGML_AMX ON)
+endif()
+
 if (NOT DEFINED GGML_CUDA_GRAPHS)
     set(GGML_CUDA_GRAPHS_DEFAULT ON)
 endif()

Makefile

Lines changed: 19 additions & 5 deletions

@@ -93,11 +93,6 @@ GGML_METAL := 1
 DEPRECATE_WARNING := 1
 endif
 
-ifdef LLAMA_OPENMP
-GGML_OPENMP := 1
-DEPRECATE_WARNING := 1
-endif
-
 ifdef LLAMA_RPC
 GGML_RPC := 1
 DEPRECATE_WARNING := 1

@@ -584,6 +579,11 @@ ifndef GGML_NO_LLAMAFILE
 	OBJ_GGML += ggml/src/llamafile/sgemm.o
 endif
 
+ifndef GGML_NO_AMX
+	MK_CPPFLAGS += -DGGML_USE_AMX
+	OBJ_GGML += ggml/src/ggml-amx.o ggml/src/ggml-amx/mmq.o
+endif
+
 ifdef GGML_RPC
 	MK_CPPFLAGS += -DGGML_USE_RPC
 	OBJ_GGML += ggml/src/ggml-rpc.o

@@ -1087,6 +1087,19 @@ ggml/src/llamafile/sgemm.o: \
 	$(CXX) $(CXXFLAGS) -c $< -o $@
 endif # GGML_NO_LLAMAFILE
 
+ifndef GGML_NO_AMX
+ggml/src/ggml-amx.o: \
+	ggml/src/ggml-amx.cpp \
+	ggml/include/ggml-amx.h
+	$(CXX) $(CXXFLAGS) -c $< -o $@
+
+ggml/src/ggml-amx/mmq.o: \
+	ggml/src/ggml-amx/mmq.cpp \
+	ggml/src/ggml-amx/mmq.h \
+	ggml/include/ggml.h
+	$(CXX) $(CXXFLAGS) -c $< -o $@
+endif
+
 ifdef GGML_RPC
 ggml/src/ggml-rpc.o: \
 	ggml/src/ggml-rpc.cpp \

@@ -1238,6 +1251,7 @@ clean:
 	rm -vrf ggml/src/ggml-metal-embed.metal
 	rm -vrf ggml/src/ggml-cuda/*.o
 	rm -vrf ggml/src/ggml-cuda/template-instances/*.o
+	rm -vrf ggml/src/ggml-amx/*.o
 	rm -rvf $(BUILD_TARGETS)
 	rm -rvf $(TEST_TARGETS)
 	rm -f vulkan-shaders-gen ggml/src/ggml-vulkan-shaders.hpp ggml/src/ggml-vulkan-shaders.cpp

README.md

Lines changed: 1 addition & 1 deletion

@@ -29,7 +29,7 @@ variety of hardware - locally and in the cloud.
 
 - Plain C/C++ implementation without any dependencies
 - Apple silicon is a first-class citizen - optimized via ARM NEON, Accelerate and Metal frameworks
-- AVX, AVX2 and AVX512 support for x86 architectures
+- AVX, AVX2, AVX512 and AMX support for x86 architectures
 - 1.5-bit, 2-bit, 3-bit, 4-bit, 5-bit, 6-bit, and 8-bit integer quantization for faster inference and reduced memory use
 - Custom CUDA kernels for running LLMs on NVIDIA GPUs (support for AMD GPUs via HIP and Moore Threads MTT GPUs via MUSA)
 - Vulkan and SYCL backend support

common/common.cpp

Lines changed: 2 additions & 2 deletions

@@ -955,7 +955,7 @@ struct common_init_result common_init_from_params(common_params & params) {
         }
 
         if (llama_model_has_encoder(model)) {
-            llama_encode(lctx, llama_batch_get_one(tmp.data(), tmp.size(), 0, 0));
+            llama_encode(lctx, llama_batch_get_one(tmp.data(), tmp.size()));
             llama_token decoder_start_token_id = llama_model_decoder_start_token(model);
             if (decoder_start_token_id == -1) {
                 decoder_start_token_id = bos;
@@ -964,7 +964,7 @@ struct common_init_result common_init_from_params(common_params & params) {
             tmp.push_back(decoder_start_token_id);
         }
         if (llama_model_has_decoder(model)) {
-            llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch), 0, 0));
+            llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch)));
         }
         llama_kv_cache_clear(lctx);
         llama_synchronize(lctx);
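
Across these hunks the refactored llama_batch_get_one drops the explicit starting position and sequence id: the batch's positions now simply continue from the context's current KV cache state on sequence 0. A minimal, self-contained sketch of a prompt evaluation against the post-refactor llama.h API (the model path and prompt are placeholders, error handling is trimmed):

// Minimal sketch: evaluate a short prompt with the two-argument
// llama_batch_get_one(tokens, n_tokens). Positions 0..n-1 on sequence 0
// are implied by the (empty) KV cache, so no n_past/seq_id is passed.
#include "llama.h"
#include <cstdio>
#include <cstring>
#include <vector>

int main() {
    llama_backend_init();

    llama_model *   model = llama_load_model_from_file("model.gguf", llama_model_default_params()); // placeholder path
    llama_context * ctx   = llama_new_context_with_model(model, llama_context_default_params());

    // tokenize a placeholder prompt (assumed to fit in a single batch)
    const char * prompt = "Hello";
    std::vector<llama_token> tokens(64);
    const int n = llama_tokenize(model, prompt, (int32_t) std::strlen(prompt), tokens.data(), (int32_t) tokens.size(),
                                 /*add_special*/ true, /*parse_special*/ false);
    tokens.resize(n > 0 ? n : 0);

    if (llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size()))) {
        fprintf(stderr, "failed to eval\n");
    }

    llama_free(ctx);
    llama_free_model(model);
    llama_backend_free();
    return 0;
}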

examples/batched-bench/batched-bench.cpp

Lines changed: 0 additions & 1 deletion

@@ -74,7 +74,6 @@ int main(int argc, char ** argv) {
                 batch.n_seq_id + i,
                 batch.seq_id + i,
                 batch.logits + i,
-                0, 0, 0, // unused
             };
 
             const int ret = llama_decode(ctx, batch_view);
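
The dropped "0, 0, 0, // unused" initializers reflect that llama_batch no longer carries the all_pos_0/all_pos_1/all_seq_id members, so a view over part of a batch is just the seven remaining fields. A sketch of that slicing pattern, assuming the post-refactor member order from llama.h (n_tokens, token, embd, pos, n_seq_id, seq_id, logits); make_view is a hypothetical helper name:

// Sketch: build a view of n_tokens tokens starting at offset i into an
// existing llama_batch (filled elsewhere via llama_batch_init/common_batch_add).
// The trailing all_pos_0 / all_pos_1 / all_seq_id fields no longer exist.
static llama_batch make_view(const llama_batch & batch, int32_t i, int32_t n_tokens) {
    llama_batch view = {
        n_tokens,
        batch.token    + i,
        nullptr,            // embd is unused for token batches
        batch.pos      + i,
        batch.n_seq_id + i,
        batch.seq_id   + i,
        batch.logits   + i,
    };
    return view;
}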

examples/cvector-generator/cvector-generator.cpp

Lines changed: 1 addition & 1 deletion

@@ -339,7 +339,7 @@ static bool cb_eval(struct ggml_tensor * t, bool ask, void * user_data) {
 
 static bool get_hidden_layers(llama_context * ctx, std::vector<llama_token> & tokens) {
     llama_kv_cache_clear(ctx);
-    if (llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size(), 0, 0))) {
+    if (llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size()))) {
        fprintf(stderr, "%s : failed to eval\n", __func__);
        return false;
    }

examples/eval-callback/eval-callback.cpp

Lines changed: 1 addition & 1 deletion

@@ -131,7 +131,7 @@ static bool run(llama_context * ctx, const common_params & params) {
 
     std::vector<llama_token> tokens = common_tokenize(ctx, params.prompt, add_bos);
 
-    if (llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size(), 0, 0))) {
+    if (llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size()))) {
         LOG_ERR("%s : failed to eval\n", __func__);
         return false;
     }

examples/imatrix/imatrix.cpp

Lines changed: 11 additions & 2 deletions

@@ -496,6 +496,8 @@ static bool compute_imatrix(llama_context * ctx, const common_params & params) {
         // clear the KV cache
         llama_kv_cache_clear(ctx);
 
+        llama_batch batch = llama_batch_init(n_batch, 0, 1);
+
         for (int j = 0; j < num_batches; ++j) {
             const int batch_start = start + j * n_batch;
             const int batch_size = std::min(end - batch_start, n_batch);
@@ -508,9 +510,14 @@ static bool compute_imatrix(llama_context * ctx, const common_params & params) {
                 tokens[batch_start] = llama_token_bos(llama_get_model(ctx));
             }
 
-            // TODO: use batch.logits to save computations instead of relying on logits_all == true
-            if (llama_decode(ctx, llama_batch_get_one(tokens.data() + batch_start, batch_size, j * n_batch, 0))) {
+            common_batch_clear(batch);
+            for (int i = 0; i < batch_size; i++) {
+                common_batch_add(batch, tokens[batch_start + i], j*n_batch + i, {0}, true);
+            }
+
+            if (llama_decode(ctx, batch)) {
                 LOG_ERR("%s : failed to eval\n", __func__);
+                llama_batch_free(batch);
                 return false;
             }
 
@@ -523,6 +530,8 @@ static bool compute_imatrix(llama_context * ctx, const common_params & params) {
             }
         }
 
+        llama_batch_free(batch);
+
         const auto t_end = std::chrono::high_resolution_clock::now();
 
         if (i == 0) {
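
The imatrix change replaces llama_batch_get_one (and the old reliance on logits_all) with an explicit, reusable batch whose tokens all request logits. A condensed sketch of that pattern using the common_batch_* helpers shown above; eval_chunk is a hypothetical wrapper, and the caller is assumed to have cleared the KV cache first, as the surrounding code does:

// Sketch: evaluate `tokens` in slices of n_batch using one reusable llama_batch,
// with an explicit position and logits=true for every token (no logits_all).
#include "common.h"
#include "llama.h"
#include <algorithm>
#include <vector>

static bool eval_chunk(llama_context * ctx, const std::vector<llama_token> & tokens, int n_batch) {
    llama_batch batch = llama_batch_init(n_batch, 0, 1); // token batch, at most 1 seq id per token

    for (int start = 0; start < (int) tokens.size(); start += n_batch) {
        const int batch_size = std::min((int) tokens.size() - start, n_batch);

        common_batch_clear(batch);
        for (int i = 0; i < batch_size; i++) {
            // absolute position in the chunk, sequence 0, request logits
            common_batch_add(batch, tokens[start + i], start + i, { 0 }, true);
        }

        if (llama_decode(ctx, batch)) {
            llama_batch_free(batch);
            return false;
        }
    }

    llama_batch_free(batch);
    return true;
}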

examples/infill/infill.cpp

Lines changed: 1 addition & 1 deletion

@@ -396,7 +396,7 @@ int main(int argc, char ** argv) {
 
             LOG_DBG("eval: %s\n", string_from(ctx, embd).c_str());
 
-            if (llama_decode(ctx, llama_batch_get_one(&embd[i], n_eval, n_past, 0))) {
+            if (llama_decode(ctx, llama_batch_get_one(&embd[i], n_eval))) {
                 LOG_ERR("%s : failed to eval\n", __func__);
                 return 1;
             }

examples/llama-bench/llama-bench.cpp

Lines changed: 9 additions & 9 deletions

@@ -151,7 +151,7 @@ static std::string get_gpu_info() {
        int count = ggml_backend_sycl_get_device_count();
        for (int i = 0; i < count; i++) {
            char buf[128];
-           ggml_sycl_get_device_description(i, buf, sizeof(buf));
+           ggml_backend_sycl_get_device_description(i, buf, sizeof(buf));
            id += buf;
            if (i < count - 1) {
                id += "/";
@@ -1428,7 +1428,7 @@ struct sql_printer : public printer {
     }
 };
 
-static void test_prompt(llama_context * ctx, int n_prompt, int n_past, int n_batch, int n_threads) {
+static void test_prompt(llama_context * ctx, int n_prompt, int n_batch, int n_threads) {
     llama_set_n_threads(ctx, n_threads, n_threads);
 
     const llama_model * model = llama_get_model(ctx);
@@ -1444,14 +1444,14 @@ static void test_prompt(llama_context * ctx, int n_prompt, int n_past, int n_bat
         for (int i = 1; i < n_tokens; i++) {
             tokens[i] = std::rand() % n_vocab;
         }
-        llama_decode(ctx, llama_batch_get_one(tokens.data(), n_tokens, n_past + n_processed, 0));
+        llama_decode(ctx, llama_batch_get_one(tokens.data(), n_tokens));
         n_processed += n_tokens;
     }
 
     llama_synchronize(ctx);
 }
 
-static void test_gen(llama_context * ctx, int n_gen, int n_past, int n_threads) {
+static void test_gen(llama_context * ctx, int n_gen, int n_threads) {
     llama_set_n_threads(ctx, n_threads, n_threads);
 
     const llama_model * model = llama_get_model(ctx);
@@ -1460,7 +1460,7 @@ static void test_gen(llama_context * ctx, int n_gen, int n_past, int n_threads)
     llama_token token = llama_add_bos_token(model) ? llama_token_bos(model) : std::rand() % n_vocab;
 
     for (int i = 0; i < n_gen; i++) {
-        llama_decode(ctx, llama_batch_get_one(&token, 1, n_past + i, 0));
+        llama_decode(ctx, llama_batch_get_one(&token, 1));
         llama_synchronize(ctx);
         token = std::rand() % n_vocab;
     }
@@ -1596,13 +1596,13 @@ int main(int argc, char ** argv) {
                 fprintf(stderr, "llama-bench: benchmark %d/%ld: warmup prompt run\n", params_idx, params_count);
             }
             //test_prompt(ctx, std::min(t.n_batch, std::min(t.n_prompt, 32)), 0, t.n_batch, t.n_threads);
-            test_prompt(ctx, t.n_prompt, 0, t.n_batch, t.n_threads);
+            test_prompt(ctx, t.n_prompt, t.n_batch, t.n_threads);
         }
         if (t.n_gen > 0) {
             if (params.progress) {
                 fprintf(stderr, "llama-bench: benchmark %d/%ld: warmup generation run\n", params_idx, params_count);
             }
-            test_gen(ctx, 1, 0, t.n_threads);
+            test_gen(ctx, 1, t.n_threads);
         }
 
         for (int i = 0; i < params.reps; i++) {
@@ -1614,13 +1614,13 @@ int main(int argc, char ** argv) {
             if (params.progress) {
                 fprintf(stderr, "llama-bench: benchmark %d/%ld: prompt run %d/%d\n", params_idx, params_count, i + 1, params.reps);
             }
-            test_prompt(ctx, t.n_prompt, 0, t.n_batch, t.n_threads);
+            test_prompt(ctx, t.n_prompt, t.n_batch, t.n_threads);
         }
         if (t.n_gen > 0) {
             if (params.progress) {
                 fprintf(stderr, "llama-bench: benchmark %d/%ld: generation run %d/%d\n", params_idx, params_count, i + 1, params.reps);
             }
-            test_gen(ctx, t.n_gen, t.n_prompt, t.n_threads);
+            test_gen(ctx, t.n_gen, t.n_threads);
         }
 
         uint64_t t_ns = get_time_ns() - t_start;
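
With n_past removed from test_prompt and test_gen, each one-token decode simply continues from wherever the KV cache currently ends. A stripped-down generation-loop sketch in the same spirit; gen_tokens is a hypothetical name, and the random "sampling" mirrors the benchmark's warmup behaviour rather than a real sampler:

// Sketch: decode n_gen tokens one at a time; llama_batch_get_one(&token, 1)
// needs no n_past because the position follows the KV cache contents.
#include "llama.h"
#include <cstdlib>

static void gen_tokens(llama_context * ctx, int n_gen) {
    const llama_model * model   = llama_get_model(ctx);
    const int32_t       n_vocab = llama_n_vocab(model);

    llama_token token = llama_add_bos_token(model) ? llama_token_bos(model) : std::rand() % n_vocab;

    for (int i = 0; i < n_gen; i++) {
        llama_decode(ctx, llama_batch_get_one(&token, 1));
        llama_synchronize(ctx);
        token = std::rand() % n_vocab; // placeholder for real sampling
    }
}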
