Skip to content

Commit e30bfc6

Browse files
authored
Merge branch 'ggerganov:master' into master
2 parents 6add36a + 87421a2 commit e30bfc6

File tree

19 files changed

+7532
-4082
lines changed

19 files changed

+7532
-4082
lines changed

CMakeLists.txt

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -103,6 +103,10 @@ if (NOT DEFINED GGML_LLAMAFILE)
103103
set(GGML_LLAMAFILE_DEFAULT ON)
104104
endif()
105105

106+
if (NOT DEFINED GGML_AMX)
107+
set(GGML_AMX ON)
108+
endif()
109+
106110
if (NOT DEFINED GGML_CUDA_GRAPHS)
107111
set(GGML_CUDA_GRAPHS_DEFAULT ON)
108112
endif()

Makefile

Lines changed: 19 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -93,11 +93,6 @@ GGML_METAL := 1
9393
DEPRECATE_WARNING := 1
9494
endif
9595

96-
ifdef LLAMA_OPENMP
97-
GGML_OPENMP := 1
98-
DEPRECATE_WARNING := 1
99-
endif
100-
10196
ifdef LLAMA_RPC
10297
GGML_RPC := 1
10398
DEPRECATE_WARNING := 1
@@ -584,6 +579,11 @@ ifndef GGML_NO_LLAMAFILE
584579
OBJ_GGML += ggml/src/llamafile/sgemm.o
585580
endif
586581

582+
ifndef GGML_NO_AMX
583+
MK_CPPFLAGS += -DGGML_USE_AMX
584+
OBJ_GGML += ggml/src/ggml-amx.o ggml/src/ggml-amx/mmq.o
585+
endif
586+
587587
ifdef GGML_RPC
588588
MK_CPPFLAGS += -DGGML_USE_RPC
589589
OBJ_GGML += ggml/src/ggml-rpc.o
@@ -1087,6 +1087,19 @@ ggml/src/llamafile/sgemm.o: \
10871087
$(CXX) $(CXXFLAGS) -c $< -o $@
10881088
endif # GGML_NO_LLAMAFILE
10891089

1090+
ifndef GGML_NO_AMX
1091+
ggml/src/ggml-amx.o: \
1092+
ggml/src/ggml-amx.cpp \
1093+
ggml/include/ggml-amx.h
1094+
$(CXX) $(CXXFLAGS) -c $< -o $@
1095+
1096+
ggml/src/ggml-amx/mmq.o: \
1097+
ggml/src/ggml-amx/mmq.cpp \
1098+
ggml/src/ggml-amx/mmq.h \
1099+
ggml/include/ggml.h
1100+
$(CXX) $(CXXFLAGS) -c $< -o $@
1101+
endif
1102+
10901103
ifdef GGML_RPC
10911104
ggml/src/ggml-rpc.o: \
10921105
ggml/src/ggml-rpc.cpp \
@@ -1238,6 +1251,7 @@ clean:
12381251
rm -vrf ggml/src/ggml-metal-embed.metal
12391252
rm -vrf ggml/src/ggml-cuda/*.o
12401253
rm -vrf ggml/src/ggml-cuda/template-instances/*.o
1254+
rm -vrf ggml/src/ggml-amx/*.o
12411255
rm -rvf $(BUILD_TARGETS)
12421256
rm -rvf $(TEST_TARGETS)
12431257
rm -f vulkan-shaders-gen ggml/src/ggml-vulkan-shaders.hpp ggml/src/ggml-vulkan-shaders.cpp

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ variety of hardware - locally and in the cloud.
2929

3030
- Plain C/C++ implementation without any dependencies
3131
- Apple silicon is a first-class citizen - optimized via ARM NEON, Accelerate and Metal frameworks
32-
- AVX, AVX2 and AVX512 support for x86 architectures
32+
- AVX, AVX2, AVX512 and AMX support for x86 architectures
3333
- 1.5-bit, 2-bit, 3-bit, 4-bit, 5-bit, 6-bit, and 8-bit integer quantization for faster inference and reduced memory use
3434
- Custom CUDA kernels for running LLMs on NVIDIA GPUs (support for AMD GPUs via HIP and Moore Threads MTT GPUs via MUSA)
3535
- Vulkan and SYCL backend support

examples/llama-bench/llama-bench.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -151,7 +151,7 @@ static std::string get_gpu_info() {
151151
int count = ggml_backend_sycl_get_device_count();
152152
for (int i = 0; i < count; i++) {
153153
char buf[128];
154-
ggml_sycl_get_device_description(i, buf, sizeof(buf));
154+
ggml_backend_sycl_get_device_description(i, buf, sizeof(buf));
155155
id += buf;
156156
if (i < count - 1) {
157157
id += "/";

examples/server/README.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -333,6 +333,8 @@ node index.js
333333

334334
`n_predict`: Set the maximum number of tokens to predict when generating text. **Note:** May exceed the set limit slightly if the last token is a partial multibyte character. When 0, no tokens will be generated but the prompt is evaluated into the cache. Default: `-1`, where `-1` is infinity.
335335

336+
`n_indent`: Specify the minimum line indentation for the generated text in number of whitespace characters. Useful for code completion tasks. Default: `0`
337+
336338
`n_keep`: Specify the number of tokens from the prompt to retain when the context size is exceeded and tokens need to be discarded. The number excludes the BOS token.
337339
By default, this value is set to `0`, meaning no tokens are kept. Use `-1` to retain all tokens from the prompt.
338340

examples/server/server.cpp

Lines changed: 47 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -131,6 +131,7 @@ struct slot_params {
131131
int32_t n_keep = 0; // number of tokens to keep from initial prompt
132132
int32_t n_discard = 0; // number of tokens after n_keep that may be discarded when shifting context, 0 defaults to half
133133
int32_t n_predict = -1; // new tokens to predict
134+
int32_t n_indent = 0; // minimum line indentation for the generated text in number of whitespace characters
134135

135136
int64_t t_max_prompt_ms = -1; // TODO: implement
136137
int64_t t_max_predict_ms = -1; // if positive, limit the generation phase to this time limit
@@ -173,6 +174,8 @@ struct server_slot {
173174
std::vector<llama_token> prompt_tokens;
174175
std::vector<llama_token> extra_tokens;
175176

177+
size_t last_nl_pos = 0;
178+
176179
std::string generated_text;
177180
std::vector<llama_token> cache_tokens;
178181
std::vector<completion_token_output> generated_token_probs;
@@ -215,6 +218,7 @@ struct server_slot {
215218
SLT_DBG(*this, "%s", "\n");
216219

217220
n_prompt_tokens = 0;
221+
last_nl_pos = 0;
218222
generated_text = "";
219223
has_new_line = false;
220224
truncated = false;
@@ -860,6 +864,7 @@ struct server_context {
860864
slot.params.stream = json_value(data, "stream", false);
861865
slot.params.cache_prompt = json_value(data, "cache_prompt", false);
862866
slot.params.n_predict = json_value(data, "n_predict", json_value(data, "max_tokens", default_params.n_predict));
867+
slot.params.n_indent = json_value(data, "n_indent", default_params.n_indent);
863868
slot.sparams.top_k = json_value(data, "top_k", default_sparams.top_k);
864869
slot.sparams.top_p = json_value(data, "top_p", default_sparams.top_p);
865870
slot.sparams.min_p = json_value(data, "min_p", default_sparams.min_p);
@@ -878,7 +883,7 @@ struct server_context {
878883
slot.sparams.mirostat_tau = json_value(data, "mirostat_tau", default_sparams.mirostat_tau);
879884
slot.sparams.mirostat_eta = json_value(data, "mirostat_eta", default_sparams.mirostat_eta);
880885
slot.sparams.penalize_nl = json_value(data, "penalize_nl", default_sparams.penalize_nl);
881-
slot.params.n_keep = json_value(data, "n_keep", slot.params.n_keep);
886+
slot.params.n_keep = json_value(data, "n_keep", default_params.n_keep);
882887
slot.params.n_discard = json_value(data, "n_discard", default_params.n_discard);
883888
slot.sparams.seed = json_value(data, "seed", default_sparams.seed);
884889
slot.sparams.n_probs = json_value(data, "n_probs", default_sparams.n_probs);
@@ -1129,13 +1134,48 @@ struct server_context {
11291134
SLT_DBG(slot, "stopped by limit, n_decoded = %d, n_predict = %d\n", slot.n_decoded, slot.params.n_predict);
11301135
}
11311136

1132-
// if we have already seen a new line, we stop after a certain time limit
1133-
if (slot.has_new_line && slot.params.t_max_predict_ms > 0 &&
1134-
(ggml_time_us() - slot.t_start_generation > 1000.0f*slot.params.t_max_predict_ms)) {
1135-
slot.stopped_limit = true;
1136-
slot.has_next_token = false;
1137+
if (slot.has_new_line) {
1138+
// if we have already seen a new line, we stop after a certain time limit
1139+
if (slot.params.t_max_predict_ms > 0 && (ggml_time_us() - slot.t_start_generation > 1000.0f*slot.params.t_max_predict_ms)) {
1140+
slot.stopped_limit = true;
1141+
slot.has_next_token = false;
1142+
1143+
SLT_DBG(slot, "stopped by time limit, n_decoded = %d, t_max_predict_ms = %d ms\n", slot.n_decoded, (int) slot.params.t_max_predict_ms);
1144+
}
1145+
1146+
// require that each new line has a whitespace prefix (i.e. indentation) of at least slot.params.n_indent
1147+
if (slot.params.n_indent > 0) {
1148+
// check the current indentation
1149+
// TODO: improve by not doing it more than once for each new line
1150+
if (slot.last_nl_pos > 0) {
1151+
size_t pos = slot.last_nl_pos;
1152+
1153+
int n_indent = 0;
1154+
while (pos < slot.generated_text.size() && (slot.generated_text[pos] == ' ' || slot.generated_text[pos] == '\t')) {
1155+
n_indent++;
1156+
pos++;
1157+
}
1158+
1159+
if (pos < slot.generated_text.size() && n_indent < slot.params.n_indent) {
1160+
slot.stopped_limit = true;
1161+
slot.has_next_token = false;
1162+
1163+
// cut the last line
1164+
slot.generated_text.erase(pos, std::string::npos);
11371165

1138-
SLT_DBG(slot, "stopped by time limit, n_decoded = %d, t_max_predict_ms = %d ms\n", slot.n_decoded, (int) slot.params.t_max_predict_ms);
1166+
SLT_DBG(slot, "stopped by indentation limit, n_decoded = %d, n_indent = %d\n", slot.n_decoded, n_indent);
1167+
}
1168+
}
1169+
1170+
// find the next new line
1171+
{
1172+
const size_t pos = slot.generated_text.find('\n', slot.last_nl_pos);
1173+
1174+
if (pos != std::string::npos) {
1175+
slot.last_nl_pos = pos + 1;
1176+
}
1177+
}
1178+
}
11391179
}
11401180

11411181
// check if there is a new line in the generated text

ggml/CMakeLists.txt

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -109,6 +109,9 @@ option(GGML_AVX512 "ggml: enable AVX512" OFF)
109109
option(GGML_AVX512_VBMI "ggml: enable AVX512-VBMI" OFF)
110110
option(GGML_AVX512_VNNI "ggml: enable AVX512-VNNI" OFF)
111111
option(GGML_AVX512_BF16 "ggml: enable AVX512-BF16" OFF)
112+
option(GGML_AMX_TILE "ggml: enable AMX-TILE" OFF)
113+
option(GGML_AMX_INT8 "ggml: enable AMX-INT8" OFF)
114+
option(GGML_AMX_BF16 "ggml: enable AMX-BF16" OFF)
112115
option(GGML_FMA "ggml: enable FMA" ${INS_ENB})
113116
if (NOT MSVC)
114117
option(GGML_F16C "ggml: enable F16C" ${INS_ENB}) # in MSVC F16C is implied with AVX2/AVX512
@@ -168,6 +171,7 @@ set (GGML_METAL_MACOSX_VERSION_MIN "" CACHE STRING
168171
set (GGML_METAL_STD "" CACHE STRING "ggml: metal standard version (-std flag)")
169172
option(GGML_OPENMP "ggml: use OpenMP" ON)
170173
option(GGML_RPC "ggml: use RPC" OFF)
174+
option(GGML_AMX "ggml: use AMX" OFF)
171175
option(GGML_SYCL "ggml: use SYCL" OFF)
172176
option(GGML_SYCL_F16 "ggml: use 16 bit floats for sycl calculations" OFF)
173177
set (GGML_SYCL_TARGET "INTEL" CACHE STRING

ggml/include/ggml-amx.h

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
#pragma once
2+
3+
#include "ggml.h"
4+
#include "ggml-backend.h"
5+
6+
7+
#ifdef __cplusplus
8+
extern "C" {
9+
#endif
10+
11+
// buffer_type API
12+
GGML_API ggml_backend_buffer_type_t ggml_backend_amx_buffer_type(void);
13+
14+
GGML_API bool ggml_backend_is_amx(ggml_backend_t backend);
15+
16+
// backend API
17+
GGML_API ggml_backend_t ggml_backend_amx_init(void);
18+
19+
GGML_API void ggml_backend_amx_set_n_threads(ggml_backend_t backend_amx, int n_threads);
20+
21+
GGML_API ggml_backend_reg_t ggml_backend_amx_reg(void);
22+
23+
#ifdef __cplusplus
24+
}
25+
#endif

ggml/include/ggml-sycl.h

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,8 @@ extern "C" {
1919
// backend API
2020
GGML_API ggml_backend_t ggml_backend_sycl_init(int device);
2121

22+
GGML_API bool ggml_backend_is_sycl(ggml_backend_t backend);
23+
2224
// device buffer
2325
GGML_API ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device);
2426

@@ -29,14 +31,19 @@ GGML_API ggml_backend_buffer_type_t ggml_backend_sycl_split_buffer_type(const fl
2931
GGML_API ggml_backend_buffer_type_t ggml_backend_sycl_host_buffer_type(void);
3032

3133
GGML_API void ggml_backend_sycl_print_sycl_devices(void);
32-
GGML_API void ggml_sycl_get_gpu_list(int *id_list, int max_len);
33-
GGML_API void ggml_sycl_get_device_description(int device, char *description, size_t description_size);
34+
GGML_API void ggml_backend_sycl_get_gpu_list(int *id_list, int max_len);
35+
GGML_API void ggml_backend_sycl_get_device_description(int device,
36+
char *description,
37+
size_t description_size);
3438
GGML_API int ggml_backend_sycl_get_device_count();
3539
GGML_API void ggml_backend_sycl_get_device_memory(int device, size_t *free, size_t *total);
3640

3741
// SYCL doesn't support registering host memory, keep here for reference
3842
// GGML_API bool ggml_backend_sycl_register_host_buffer(void * buffer, size_t size);
3943
// GGML_API void ggml_backend_sycl_unregister_host_buffer(void * buffer);
44+
45+
GGML_API ggml_backend_reg_t ggml_backend_sycl_reg(void);
46+
4047
#ifdef __cplusplus
4148
}
4249
#endif

ggml/include/ggml.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2488,6 +2488,7 @@ extern "C" {
24882488
GGML_API int ggml_cpu_has_avx512_vbmi(void);
24892489
GGML_API int ggml_cpu_has_avx512_vnni(void);
24902490
GGML_API int ggml_cpu_has_avx512_bf16(void);
2491+
GGML_API int ggml_cpu_has_amx_int8 (void);
24912492
GGML_API int ggml_cpu_has_fma (void);
24922493
GGML_API int ggml_cpu_has_neon (void);
24932494
GGML_API int ggml_cpu_has_sve (void);

0 commit comments

Comments
 (0)