
Commit 2ef3cc9

Merge remote-tracking branch 'upstream/master' into dry-sampling-post-refactor
2 parents 875ff55 + cda0e4b commit 2ef3cc9

43 files changed: +8096 additions, -4522 deletions


CMakeLists.txt

Lines changed: 4 additions & 0 deletions

@@ -88,6 +88,10 @@ if (NOT DEFINED GGML_LLAMAFILE)
     set(GGML_LLAMAFILE_DEFAULT ON)
 endif()
 
+if (NOT DEFINED GGML_AMX)
+    set(GGML_AMX ON)
+endif()
+
 if (NOT DEFINED GGML_CUDA_GRAPHS)
     set(GGML_CUDA_GRAPHS_DEFAULT ON)
 endif()

Makefile

Lines changed: 19 additions & 5 deletions

@@ -93,11 +93,6 @@ GGML_METAL := 1
 DEPRECATE_WARNING := 1
 endif
 
-ifdef LLAMA_OPENMP
-GGML_OPENMP := 1
-DEPRECATE_WARNING := 1
-endif
-
 ifdef LLAMA_RPC
 GGML_RPC := 1
 DEPRECATE_WARNING := 1

@@ -584,6 +579,11 @@ ifndef GGML_NO_LLAMAFILE
 	OBJ_GGML += ggml/src/llamafile/sgemm.o
 endif
 
+ifndef GGML_NO_AMX
+	MK_CPPFLAGS += -DGGML_USE_AMX
+	OBJ_GGML += ggml/src/ggml-amx.o ggml/src/ggml-amx/mmq.o
+endif
+
 ifdef GGML_RPC
 	MK_CPPFLAGS += -DGGML_USE_RPC
 	OBJ_GGML += ggml/src/ggml-rpc.o

@@ -1087,6 +1087,19 @@ ggml/src/llamafile/sgemm.o: \
 	$(CXX) $(CXXFLAGS) -c $< -o $@
 endif # GGML_NO_LLAMAFILE
 
+ifndef GGML_NO_AMX
+ggml/src/ggml-amx.o: \
+	ggml/src/ggml-amx.cpp \
+	ggml/include/ggml-amx.h
+	$(CXX) $(CXXFLAGS) -c $< -o $@
+
+ggml/src/ggml-amx/mmq.o: \
+	ggml/src/ggml-amx/mmq.cpp \
+	ggml/src/ggml-amx/mmq.h \
+	ggml/include/ggml.h
+	$(CXX) $(CXXFLAGS) -c $< -o $@
+endif
+
 ifdef GGML_RPC
 ggml/src/ggml-rpc.o: \
 	ggml/src/ggml-rpc.cpp \

@@ -1238,6 +1251,7 @@ clean:
 	rm -vrf ggml/src/ggml-metal-embed.metal
 	rm -vrf ggml/src/ggml-cuda/*.o
 	rm -vrf ggml/src/ggml-cuda/template-instances/*.o
+	rm -vrf ggml/src/ggml-amx/*.o
 	rm -rvf $(BUILD_TARGETS)
 	rm -rvf $(TEST_TARGETS)
 	rm -f vulkan-shaders-gen ggml/src/ggml-vulkan-shaders.hpp ggml/src/ggml-vulkan-shaders.cpp

README.md

Lines changed: 1 addition & 1 deletion

@@ -29,7 +29,7 @@ variety of hardware - locally and in the cloud.
 
 - Plain C/C++ implementation without any dependencies
 - Apple silicon is a first-class citizen - optimized via ARM NEON, Accelerate and Metal frameworks
-- AVX, AVX2 and AVX512 support for x86 architectures
+- AVX, AVX2, AVX512 and AMX support for x86 architectures
 - 1.5-bit, 2-bit, 3-bit, 4-bit, 5-bit, 6-bit, and 8-bit integer quantization for faster inference and reduced memory use
 - Custom CUDA kernels for running LLMs on NVIDIA GPUs (support for AMD GPUs via HIP and Moore Threads MTT GPUs via MUSA)
 - Vulkan and SYCL backend support

common/common.cpp

Lines changed: 2 additions & 2 deletions

@@ -955,7 +955,7 @@ struct common_init_result common_init_from_params(common_params & params) {
         }
 
         if (llama_model_has_encoder(model)) {
-            llama_encode(lctx, llama_batch_get_one(tmp.data(), tmp.size(), 0, 0));
+            llama_encode(lctx, llama_batch_get_one(tmp.data(), tmp.size()));
             llama_token decoder_start_token_id = llama_model_decoder_start_token(model);
             if (decoder_start_token_id == -1) {
                 decoder_start_token_id = bos;
@@ -964,7 +964,7 @@ struct common_init_result common_init_from_params(common_params & params) {
             tmp.push_back(decoder_start_token_id);
         }
         if (llama_model_has_decoder(model)) {
-            llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch), 0, 0));
+            llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch)));
         }
         llama_kv_cache_clear(lctx);
         llama_synchronize(lctx);
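
Across these hunks the refactored llama_batch_get_one drops the explicit starting position and sequence id: the batch's positions now simply continue from the context's current KV cache state on sequence 0. A minimal, self-contained sketch of a prompt evaluation against the post-refactor llama.h API (the model path and prompt are placeholders, error handling is trimmed):

// Minimal sketch: evaluate a short prompt with the two-argument
// llama_batch_get_one(tokens, n_tokens). Positions 0..n-1 on sequence 0
// are implied by the (empty) KV cache, so no n_past/seq_id is passed.
#include "llama.h"
#include <cstdio>
#include <cstring>
#include <vector>

int main() {
    llama_backend_init();

    llama_model *   model = llama_load_model_from_file("model.gguf", llama_model_default_params()); // placeholder path
    llama_context * ctx   = llama_new_context_with_model(model, llama_context_default_params());

    // tokenize a placeholder prompt (assumed to fit in a single batch)
    const char * prompt = "Hello";
    std::vector<llama_token> tokens(64);
    const int n = llama_tokenize(model, prompt, (int32_t) std::strlen(prompt), tokens.data(), (int32_t) tokens.size(),
                                 /*add_special*/ true, /*parse_special*/ false);
    tokens.resize(n > 0 ? n : 0);

    if (llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size()))) {
        fprintf(stderr, "failed to eval\n");
    }

    llama_free(ctx);
    llama_free_model(model);
    llama_backend_free();
    return 0;
}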

examples/batched-bench/batched-bench.cpp

Lines changed: 0 additions & 1 deletion

@@ -74,7 +74,6 @@ int main(int argc, char ** argv) {
                 batch.n_seq_id + i,
                 batch.seq_id + i,
                 batch.logits + i,
-                0, 0, 0, // unused
             };
 
             const int ret = llama_decode(ctx, batch_view);
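
The dropped "0, 0, 0, // unused" initializers reflect that llama_batch no longer carries the all_pos_0/all_pos_1/all_seq_id members, so a view over part of a batch is just the seven remaining fields. A sketch of that slicing pattern, assuming the post-refactor member order from llama.h (n_tokens, token, embd, pos, n_seq_id, seq_id, logits); make_view is a hypothetical helper name:

// Sketch: build a view of n_tokens tokens starting at offset i into an
// existing llama_batch (filled elsewhere via llama_batch_init/common_batch_add).
// The trailing all_pos_0 / all_pos_1 / all_seq_id fields no longer exist.
static llama_batch make_view(const llama_batch & batch, int32_t i, int32_t n_tokens) {
    llama_batch view = {
        n_tokens,
        batch.token    + i,
        nullptr,            // embd is unused for token batches
        batch.pos      + i,
        batch.n_seq_id + i,
        batch.seq_id   + i,
        batch.logits   + i,
    };
    return view;
}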

examples/cvector-generator/cvector-generator.cpp

Lines changed: 1 addition & 1 deletion

@@ -339,7 +339,7 @@ static bool cb_eval(struct ggml_tensor * t, bool ask, void * user_data) {
 
 static bool get_hidden_layers(llama_context * ctx, std::vector<llama_token> & tokens) {
     llama_kv_cache_clear(ctx);
-    if (llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size(), 0, 0))) {
+    if (llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size()))) {
        fprintf(stderr, "%s : failed to eval\n", __func__);
        return false;
    }

examples/eval-callback/eval-callback.cpp

Lines changed: 1 addition & 1 deletion

@@ -131,7 +131,7 @@ static bool run(llama_context * ctx, const common_params & params) {
 
     std::vector<llama_token> tokens = common_tokenize(ctx, params.prompt, add_bos);
 
-    if (llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size(), 0, 0))) {
+    if (llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size()))) {
         LOG_ERR("%s : failed to eval\n", __func__);
         return false;
     }

examples/imatrix/imatrix.cpp

Lines changed: 11 additions & 2 deletions

@@ -496,6 +496,8 @@ static bool compute_imatrix(llama_context * ctx, const common_params & params) {
         // clear the KV cache
         llama_kv_cache_clear(ctx);
 
+        llama_batch batch = llama_batch_init(n_batch, 0, 1);
+
         for (int j = 0; j < num_batches; ++j) {
             const int batch_start = start + j * n_batch;
             const int batch_size = std::min(end - batch_start, n_batch);
@@ -508,9 +510,14 @@ static bool compute_imatrix(llama_context * ctx, const common_params & params) {
                 tokens[batch_start] = llama_token_bos(llama_get_model(ctx));
             }
 
-            // TODO: use batch.logits to save computations instead of relying on logits_all == true
-            if (llama_decode(ctx, llama_batch_get_one(tokens.data() + batch_start, batch_size, j * n_batch, 0))) {
+            common_batch_clear(batch);
+            for (int i = 0; i < batch_size; i++) {
+                common_batch_add(batch, tokens[batch_start + i], j*n_batch + i, {0}, true);
+            }
+
+            if (llama_decode(ctx, batch)) {
                 LOG_ERR("%s : failed to eval\n", __func__);
+                llama_batch_free(batch);
                 return false;
             }
 
@@ -523,6 +530,8 @@ static bool compute_imatrix(llama_context * ctx, const common_params & params) {
             }
         }
 
+        llama_batch_free(batch);
+
         const auto t_end = std::chrono::high_resolution_clock::now();
 
         if (i == 0) {
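
The imatrix change replaces llama_batch_get_one (and the old reliance on logits_all) with an explicit, reusable batch whose tokens all request logits. A condensed sketch of that pattern using the common_batch_* helpers shown above; eval_chunk is a hypothetical wrapper, and the caller is assumed to have cleared the KV cache first, as the surrounding code does:

// Sketch: evaluate `tokens` in slices of n_batch using one reusable llama_batch,
// with an explicit position and logits=true for every token (no logits_all).
#include "common.h"
#include "llama.h"
#include <algorithm>
#include <vector>

static bool eval_chunk(llama_context * ctx, const std::vector<llama_token> & tokens, int n_batch) {
    llama_batch batch = llama_batch_init(n_batch, 0, 1); // token batch, at most 1 seq id per token

    for (int start = 0; start < (int) tokens.size(); start += n_batch) {
        const int batch_size = std::min((int) tokens.size() - start, n_batch);

        common_batch_clear(batch);
        for (int i = 0; i < batch_size; i++) {
            // absolute position in the chunk, sequence 0, request logits
            common_batch_add(batch, tokens[start + i], start + i, { 0 }, true);
        }

        if (llama_decode(ctx, batch)) {
            llama_batch_free(batch);
            return false;
        }
    }

    llama_batch_free(batch);
    return true;
}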

examples/infill/infill.cpp

Lines changed: 1 addition & 1 deletion

@@ -396,7 +396,7 @@ int main(int argc, char ** argv) {
 
             LOG_DBG("eval: %s\n", string_from(ctx, embd).c_str());
 
-            if (llama_decode(ctx, llama_batch_get_one(&embd[i], n_eval, n_past, 0))) {
+            if (llama_decode(ctx, llama_batch_get_one(&embd[i], n_eval))) {
                 LOG_ERR("%s : failed to eval\n", __func__);
                 return 1;
             }

examples/llama-bench/llama-bench.cpp

Lines changed: 9 additions & 9 deletions

@@ -151,7 +151,7 @@ static std::string get_gpu_info() {
        int count = ggml_backend_sycl_get_device_count();
        for (int i = 0; i < count; i++) {
            char buf[128];
-           ggml_sycl_get_device_description(i, buf, sizeof(buf));
+           ggml_backend_sycl_get_device_description(i, buf, sizeof(buf));
            id += buf;
            if (i < count - 1) {
                id += "/";
@@ -1428,7 +1428,7 @@ struct sql_printer : public printer {
     }
 };
 
-static void test_prompt(llama_context * ctx, int n_prompt, int n_past, int n_batch, int n_threads) {
+static void test_prompt(llama_context * ctx, int n_prompt, int n_batch, int n_threads) {
     llama_set_n_threads(ctx, n_threads, n_threads);
 
     const llama_model * model = llama_get_model(ctx);
@@ -1444,14 +1444,14 @@ static void test_prompt(llama_context * ctx, int n_prompt, int n_past, int n_bat
         for (int i = 1; i < n_tokens; i++) {
             tokens[i] = std::rand() % n_vocab;
         }
-        llama_decode(ctx, llama_batch_get_one(tokens.data(), n_tokens, n_past + n_processed, 0));
+        llama_decode(ctx, llama_batch_get_one(tokens.data(), n_tokens));
         n_processed += n_tokens;
     }
 
     llama_synchronize(ctx);
 }
 
-static void test_gen(llama_context * ctx, int n_gen, int n_past, int n_threads) {
+static void test_gen(llama_context * ctx, int n_gen, int n_threads) {
     llama_set_n_threads(ctx, n_threads, n_threads);
 
     const llama_model * model = llama_get_model(ctx);
@@ -1460,7 +1460,7 @@ static void test_gen(llama_context * ctx, int n_gen, int n_past, int n_threads)
     llama_token token = llama_add_bos_token(model) ? llama_token_bos(model) : std::rand() % n_vocab;
 
     for (int i = 0; i < n_gen; i++) {
-        llama_decode(ctx, llama_batch_get_one(&token, 1, n_past + i, 0));
+        llama_decode(ctx, llama_batch_get_one(&token, 1));
         llama_synchronize(ctx);
         token = std::rand() % n_vocab;
     }
@@ -1596,13 +1596,13 @@ int main(int argc, char ** argv) {
                 fprintf(stderr, "llama-bench: benchmark %d/%ld: warmup prompt run\n", params_idx, params_count);
             }
             //test_prompt(ctx, std::min(t.n_batch, std::min(t.n_prompt, 32)), 0, t.n_batch, t.n_threads);
-            test_prompt(ctx, t.n_prompt, 0, t.n_batch, t.n_threads);
+            test_prompt(ctx, t.n_prompt, t.n_batch, t.n_threads);
         }
         if (t.n_gen > 0) {
             if (params.progress) {
                 fprintf(stderr, "llama-bench: benchmark %d/%ld: warmup generation run\n", params_idx, params_count);
             }
-            test_gen(ctx, 1, 0, t.n_threads);
+            test_gen(ctx, 1, t.n_threads);
         }
 
         for (int i = 0; i < params.reps; i++) {
@@ -1614,13 +1614,13 @@ int main(int argc, char ** argv) {
             if (params.progress) {
                 fprintf(stderr, "llama-bench: benchmark %d/%ld: prompt run %d/%d\n", params_idx, params_count, i + 1, params.reps);
             }
-            test_prompt(ctx, t.n_prompt, 0, t.n_batch, t.n_threads);
+            test_prompt(ctx, t.n_prompt, t.n_batch, t.n_threads);
         }
         if (t.n_gen > 0) {
             if (params.progress) {
                 fprintf(stderr, "llama-bench: benchmark %d/%ld: generation run %d/%d\n", params_idx, params_count, i + 1, params.reps);
             }
-            test_gen(ctx, t.n_gen, t.n_prompt, t.n_threads);
+            test_gen(ctx, t.n_gen, t.n_threads);
         }
 
         uint64_t t_ns = get_time_ns() - t_start;
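
With n_past removed from test_prompt and test_gen, each one-token decode simply continues from wherever the KV cache currently ends. A stripped-down generation-loop sketch in the same spirit; gen_tokens is a hypothetical name, and the random "sampling" mirrors the benchmark's warmup behaviour rather than a real sampler:

// Sketch: decode n_gen tokens one at a time; llama_batch_get_one(&token, 1)
// needs no n_past because the position follows the KV cache contents.
#include "llama.h"
#include <cstdlib>

static void gen_tokens(llama_context * ctx, int n_gen) {
    const llama_model * model   = llama_get_model(ctx);
    const int32_t       n_vocab = llama_n_vocab(model);

    llama_token token = llama_add_bos_token(model) ? llama_token_bos(model) : std::rand() % n_vocab;

    for (int i = 0; i < n_gen; i++) {
        llama_decode(ctx, llama_batch_get_one(&token, 1));
        llama_synchronize(ctx);
        token = std::rand() % n_vocab; // placeholder for real sampling
    }
}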
