Commit 6bf6e30

Merge branch 'master' into xsn/refactor_server_struct_input

2 parents 9bb1ae6 + d9c3ba2

Showing 48 changed files with 2277 additions and 1562 deletions.

Makefile

Lines changed: 10 additions & 12 deletions

@@ -445,6 +445,10 @@ ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686 amd64))
     MK_CFLAGS     += -march=native -mtune=native
     HOST_CXXFLAGS += -march=native -mtune=native

+    # Usage AMX build test
+    #MK_CFLAGS   += -march=graniterapids -mtune=graniterapids
+    #HOST_CXXFLAGS += -march=graniterapids -mtune=graniterapids
+
     # Usage AVX-only
     #MK_CFLAGS   += -mfma -mf16c -mavx
     #MK_CXXFLAGS += -mfma -mf16c -mavx
@@ -948,17 +952,18 @@ DIR_COMMON = common

 OBJ_GGML = \
     $(DIR_GGML)/src/ggml.o \
-    $(DIR_GGML)/src/ggml-aarch64.o \
     $(DIR_GGML)/src/ggml-alloc.o \
     $(DIR_GGML)/src/ggml-backend.o \
     $(DIR_GGML)/src/ggml-backend-reg.o \
     $(DIR_GGML)/src/ggml-opt.o \
     $(DIR_GGML)/src/ggml-quants.o \
     $(DIR_GGML)/src/ggml-threading.o \
     $(DIR_GGML)/src/ggml-cpu/ggml-cpu.o \
-    $(DIR_GGML)/src/ggml-cpu/ggml-cpu-cpp.o \
+    $(DIR_GGML)/src/ggml-cpu/ggml-cpu_cpp.o \
     $(DIR_GGML)/src/ggml-cpu/ggml-cpu-aarch64.o \
+    $(DIR_GGML)/src/ggml-cpu/ggml-cpu-hbm.o \
     $(DIR_GGML)/src/ggml-cpu/ggml-cpu-quants.o \
+    $(DIR_GGML)/src/ggml-cpu/ggml-cpu-traits.o \
     $(OBJ_GGML_EXT)

 OBJ_LLAMA = \
@@ -1098,17 +1103,10 @@ DEP_FILES = $(OBJ_GGML:.o=.d) $(OBJ_LLAMA:.o=.d) $(OBJ_COMMON:.o=.d)
 # Default target
 all: $(BUILD_TARGETS)

+# force c++ build for source file that have same name as c file
 # Note: need this exception because `ggml-cpu.c` and `ggml-cpu.cpp` both produce the same obj/dep files
-# g++ -M -I ./ggml/include/ -I ./ggml/src ggml/src/ggml-cpu/ggml-cpu.cpp | grep ggml
-$(DIR_GGML)/src/ggml-cpu/ggml-cpu-cpp.o: \
-    ggml/src/ggml-cpu/ggml-cpu.cpp \
-    ggml/include/ggml-backend.h \
-    ggml/include/ggml.h \
-    ggml/include/ggml-alloc.h \
-    ggml/src/ggml-backend-impl.h \
-    ggml/include/ggml-cpu.h \
-    ggml/src/ggml-impl.h
-    $(CXX) $(CXXFLAGS) -c $< -o $@
+$(DIR_GGML)/%_cpp.o: $(DIR_GGML)/%.cpp
+    $(CXX) $(CXXFLAGS) -MMD -c $< -o $@

 # Rules for building object files
 $(DIR_GGML)/%.o: $(DIR_GGML)/%.c
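
Why the `_cpp` suffix: `ggml-cpu.c` and `ggml-cpu.cpp` would otherwise both compile to `ggml-cpu.o` and emit the same `.d` dependency file, so the C++ object is renamed and the hand-maintained prerequisite list is replaced by a generic `%_cpp.o: %.cpp` pattern rule (with `-MMD` regenerating the dependencies automatically). A small Python sketch of the naming scheme, illustrative only and not part of the commit:

    from pathlib import Path

    sources = [
        "ggml/src/ggml-cpu/ggml-cpu.c",
        "ggml/src/ggml-cpu/ggml-cpu.cpp",
    ]

    def obj_before(src: str) -> str:
        # Old scheme: swap the extension for .o -- both sources
        # collapse onto the same object file path.
        return str(Path(src).with_suffix(".o"))

    def obj_after(src: str) -> str:
        # New scheme: C++ sources get a `_cpp` suffix, matching the
        # `$(DIR_GGML)/%_cpp.o: $(DIR_GGML)/%.cpp` pattern rule.
        p = Path(src)
        if p.suffix == ".cpp":
            return str(p.with_name(p.stem + "_cpp.o"))
        return str(p.with_suffix(".o"))

    print([obj_before(s) for s in sources])  # both end in ggml-cpu.o
    print([obj_after(s) for s in sources])   # ggml-cpu.o vs ggml-cpu_cpp.o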

Package.swift

Lines changed: 3 additions & 2 deletions

@@ -10,14 +10,15 @@ var sources = [
     "src/unicode.cpp",
     "src/unicode-data.cpp",
     "ggml/src/ggml.c",
-    "ggml/src/ggml-aarch64.c",
     "ggml/src/ggml-alloc.c",
     "ggml/src/ggml-backend.cpp",
     "ggml/src/ggml-backend-reg.cpp",
     "ggml/src/ggml-cpu/ggml-cpu.c",
     "ggml/src/ggml-cpu/ggml-cpu.cpp",
-    "ggml/src/ggml-cpu/ggml-cpu-aarch64.c",
+    "ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp",
+    "ggml/src/ggml-cpu/ggml-cpu-hbm.cpp",
     "ggml/src/ggml-cpu/ggml-cpu-quants.c",
+    "ggml/src/ggml-cpu/ggml-cpu-traits.cpp",
     "ggml/src/ggml-threading.cpp",
     "ggml/src/ggml-quants.c",
 ]

common/arg.cpp

Lines changed: 1 addition & 1 deletion

@@ -786,7 +786,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params) {
             params.warmup = false;
         }
-    ).set_examples({LLAMA_EXAMPLE_MAIN}));
+    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}));
     add_opt(common_arg(
         {"--spm-infill"},
         string_format(

common/speculative.cpp

Lines changed: 4 additions & 0 deletions

@@ -62,6 +62,10 @@ struct common_speculative * common_speculative_init(
 }

 void common_speculative_free(struct common_speculative * spec) {
+    if (spec == nullptr) {
+        return;
+    }
+
     common_sampler_free(spec->smpl);

     llama_batch_free(spec->batch);

convert_hf_to_gguf.py

Lines changed: 6 additions & 2 deletions

@@ -661,6 +661,9 @@ def get_vocab_base_pre(self, tokenizer) -> str:
         if chkhsh == "1431a23e583c97432bc230bff598d103ddb5a1f89960c8f1d1051aaa944d0b35":
             # ref: https://huggingface.co/sapienzanlp/Minerva-7B-base-v1.0
             res = "minerva-7b"
+        if chkhsh == "8b5a93ed704057481f240da0be7e7dca721d7f8f4755263b6807227a2cbeae65":
+            # ref: https://huggingface.co/sentence-transformers/stsb-roberta-base
+            res = "roberta-bpe"

         if res is None:
             logger.warning("\n")
@@ -2533,7 +2536,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
         return [(self.map_tensor_name(name), data_torch)]


-@Model.register("BertModel", "CamembertModel")
+@Model.register("BertModel", "CamembertModel", "RobertaModel")
 class BertModel(Model):
     model_arch = gguf.MODEL_ARCH.BERT

@@ -2574,7 +2577,8 @@ def set_vocab(self):

         # we need this to validate the size of the token_type embeddings
         # though currently we are passing all zeros to the token_type embeddings
-        self.gguf_writer.add_token_type_count(2) # "Sequence A" or "Sequence B"
+        # "Sequence A" or "Sequence B"
+        self.gguf_writer.add_token_type_count(self.hparams.get("type_vocab_size", 1))

         # convert to phantom space vocab
         def phantom(tok):
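
The token-type count is now read from the checkpoint's hyperparameters instead of being hard-coded to 2, which is what lets RoBERTa checkpoints (whose `config.json` typically sets `type_vocab_size` to 1, or omits it) flow through the BERT conversion path. A minimal sketch of the lookup, with hypothetical `config.json` contents standing in for `self.hparams`:

    # Hypothetical configs for illustration only.
    bert_hparams    = {"type_vocab_size": 2}  # classic BERT: "Sequence A" / "Sequence B"
    roberta_hparams = {}                      # RoBERTa-style: key absent

    def token_type_count(hparams: dict) -> int:
        # Fall back to 1 when the key is missing, mirroring
        # self.hparams.get("type_vocab_size", 1) in the diff above.
        return hparams.get("type_vocab_size", 1)

    print(token_type_count(bert_hparams))     # 2
    print(token_type_count(roberta_hparams))  # 1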

convert_hf_to_gguf_update.py

Lines changed: 1 addition & 0 deletions

@@ -103,6 +103,7 @@ class TOKENIZER_TYPE(IntEnum):
     {"name": "phi-2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/microsoft/phi-2", },
     {"name": "chameleon", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/facebook/chameleon-7b", },
     {"name": "minerva-7b", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/sapienzanlp/Minerva-7B-base-v1.0", },
+    {"name": "roberta-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/sentence-transformers/stsb-roberta-base"},
 ]
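
For context, the `chkhsh` values matched in `get_vocab_base_pre()` come from this update script: it tokenizes a fixed probe string with each listed tokenizer and hashes the resulting token IDs, so registering `roberta-bpe` here is what produces the new fingerprint checked in convert_hf_to_gguf.py. A rough sketch of the idea, with a shortened probe string (the real script uses a much longer one):

    from hashlib import sha256

    def tokenizer_fingerprint(tokenizer, probe: str) -> str:
        # Hash the token IDs, not the raw text: tokenizers that
        # pre-tokenize the probe identically share a fingerprint.
        ids = tokenizer.encode(probe)
        return sha256(str(ids).encode()).hexdigest()

    # Usage sketch, assuming `transformers` is installed:
    #   from transformers import AutoTokenizer
    #   tok = AutoTokenizer.from_pretrained("sentence-transformers/stsb-roberta-base")
    #   print(tokenizer_fingerprint(tok, "Hello world 123"))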

docs/build.md

Lines changed: 1 addition & 1 deletion

@@ -55,7 +55,7 @@ cmake --build build --config Release
 cmake --preset arm64-windows-llvm-release -D GGML_OPENMP=OFF
 cmake --build build-arm64-windows-llvm-release
 ```
-Building for arm64 can also be done with the MSVC compiler with the build-arm64-windows-MSVC preset, or the standard CMake build instructions. However, note that the MSVC compiler does not support inline ARM assembly code, used e.g. for the accelerated Q4_0_4_8 CPU kernels.
+Building for arm64 can also be done with the MSVC compiler with the build-arm64-windows-MSVC preset, or the standard CMake build instructions. However, note that the MSVC compiler does not support inline ARM assembly code, used e.g. for the accelerated Q4_0_N_M CPU kernels.

 ## BLAS Build

examples/quantize/README.md

Lines changed: 0 additions & 2 deletions

@@ -54,8 +54,6 @@ As the models are currently fully loaded into memory, you will need adequate dis

 Several quantization methods are supported. They differ in the resulting model disk size and inference speed.

-The quantization formats `Q4_0_4_4`, `Q4_0_4_8` and `Q4_0_8_8` are block interleaved variants of the `Q4_0` format, providing a data layout that is better suited for specific implementations of optimized mulmat kernels. Since these formats differ only in data layout, they have the same quantized size as the `Q4_0` format.
-
 *(outdated)*

 | Model | Measure | F16 | Q4_0 | Q4_1 | Q5_0 | Q5_1 | Q8_0 |

examples/quantize/quantize.cpp

Lines changed: 0 additions & 3 deletions

@@ -48,9 +48,6 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
     { "Q5_K_M",   LLAMA_FTYPE_MOSTLY_Q5_K_M,   " 5.33G, +0.0569 ppl @ Llama-3-8B", },
     { "Q6_K",     LLAMA_FTYPE_MOSTLY_Q6_K,     " 6.14G, +0.0217 ppl @ Llama-3-8B", },
     { "Q8_0",     LLAMA_FTYPE_MOSTLY_Q8_0,     " 7.96G, +0.0026 ppl @ Llama-3-8B", },
-    { "Q4_0_4_4", LLAMA_FTYPE_MOSTLY_Q4_0_4_4, " 4.34G, +0.4685 ppl @ Llama-3-8B", },
-    { "Q4_0_4_8", LLAMA_FTYPE_MOSTLY_Q4_0_4_8, " 4.34G, +0.4685 ppl @ Llama-3-8B", },
-    { "Q4_0_8_8", LLAMA_FTYPE_MOSTLY_Q4_0_8_8, " 4.34G, +0.4685 ppl @ Llama-3-8B", },
     { "F16",      LLAMA_FTYPE_MOSTLY_F16,      "14.00G, +0.0020 ppl @ Mistral-7B", },
     { "BF16",     LLAMA_FTYPE_MOSTLY_BF16,     "14.00G, -0.0050 ppl @ Mistral-7B", },
     { "F32",      LLAMA_FTYPE_ALL_F32,         "26.00G @ 7B", },

examples/server/CMakeLists.txt

Lines changed: 0 additions & 8 deletions

@@ -34,14 +34,6 @@ endforeach()
 add_executable(${TARGET} ${TARGET_SRCS})
 install(TARGETS ${TARGET} RUNTIME)

-# clean up generated files in pre-build step
-foreach(asset ${PUBLIC_ASSETS})
-    set(output "${CMAKE_CURRENT_BINARY_DIR}/${asset}.hpp")
-    add_custom_command(TARGET ${TARGET} PRE_BUILD
-        COMMAND "${CMAKE_COMMAND}" -E remove -f "${output}"
-    )
-endforeach()
-
 target_link_libraries(${TARGET} PRIVATE common ${CMAKE_THREAD_LIBS_INIT})

 if (LLAMA_SERVER_SSL)
