Commit 6bf6e30

Merge branch 'master' into xsn/refactor_server_struct_input

2 parents 9bb1ae6 + d9c3ba2

Showing 48 changed files with 2277 additions and 1562 deletions.

Makefile

Lines changed: 10 additions & 12 deletions

@@ -445,6 +445,10 @@ ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686 amd64))
     MK_CFLAGS     += -march=native -mtune=native
     HOST_CXXFLAGS += -march=native -mtune=native

+    # Usage AMX build test
+    #MK_CFLAGS   += -march=graniterapids -mtune=graniterapids
+    #HOST_CXXFLAGS += -march=graniterapids -mtune=graniterapids
+
     # Usage AVX-only
     #MK_CFLAGS   += -mfma -mf16c -mavx
     #MK_CXXFLAGS += -mfma -mf16c -mavx
@@ -948,17 +952,18 @@ DIR_COMMON = common

 OBJ_GGML = \
     $(DIR_GGML)/src/ggml.o \
-    $(DIR_GGML)/src/ggml-aarch64.o \
     $(DIR_GGML)/src/ggml-alloc.o \
     $(DIR_GGML)/src/ggml-backend.o \
     $(DIR_GGML)/src/ggml-backend-reg.o \
     $(DIR_GGML)/src/ggml-opt.o \
     $(DIR_GGML)/src/ggml-quants.o \
     $(DIR_GGML)/src/ggml-threading.o \
     $(DIR_GGML)/src/ggml-cpu/ggml-cpu.o \
-    $(DIR_GGML)/src/ggml-cpu/ggml-cpu-cpp.o \
+    $(DIR_GGML)/src/ggml-cpu/ggml-cpu_cpp.o \
     $(DIR_GGML)/src/ggml-cpu/ggml-cpu-aarch64.o \
+    $(DIR_GGML)/src/ggml-cpu/ggml-cpu-hbm.o \
     $(DIR_GGML)/src/ggml-cpu/ggml-cpu-quants.o \
+    $(DIR_GGML)/src/ggml-cpu/ggml-cpu-traits.o \
     $(OBJ_GGML_EXT)

 OBJ_LLAMA = \
@@ -1098,17 +1103,10 @@ DEP_FILES = $(OBJ_GGML:.o=.d) $(OBJ_LLAMA:.o=.d) $(OBJ_COMMON:.o=.d)
 # Default target
 all: $(BUILD_TARGETS)

+# force c++ build for source file that have same name as c file
 # Note: need this exception because `ggml-cpu.c` and `ggml-cpu.cpp` both produce the same obj/dep files
-# g++ -M -I ./ggml/include/ -I ./ggml/src ggml/src/ggml-cpu/ggml-cpu.cpp | grep ggml
-$(DIR_GGML)/src/ggml-cpu/ggml-cpu-cpp.o: \
-    ggml/src/ggml-cpu/ggml-cpu.cpp \
-    ggml/include/ggml-backend.h \
-    ggml/include/ggml.h \
-    ggml/include/ggml-alloc.h \
-    ggml/src/ggml-backend-impl.h \
-    ggml/include/ggml-cpu.h \
-    ggml/src/ggml-impl.h
-    $(CXX) $(CXXFLAGS) -c $< -o $@
+$(DIR_GGML)/%_cpp.o: $(DIR_GGML)/%.cpp
+    $(CXX) $(CXXFLAGS) -MMD -c $< -o $@

 # Rules for building object files
 $(DIR_GGML)/%.o: $(DIR_GGML)/%.c
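
Why the `_cpp` suffix: `ggml-cpu.c` and `ggml-cpu.cpp` would otherwise both compile to `ggml-cpu.o` and emit the same `.d` dependency file, so the C++ object is renamed and the hand-maintained prerequisite list is replaced by a generic `%_cpp.o: %.cpp` pattern rule (with `-MMD` regenerating the dependencies automatically). A small Python sketch of the naming scheme, illustrative only and not part of the commit:

    from pathlib import Path

    sources = [
        "ggml/src/ggml-cpu/ggml-cpu.c",
        "ggml/src/ggml-cpu/ggml-cpu.cpp",
    ]

    def obj_before(src: str) -> str:
        # Old scheme: swap the extension for .o -- both sources
        # collapse onto the same object file path.
        return str(Path(src).with_suffix(".o"))

    def obj_after(src: str) -> str:
        # New scheme: C++ sources get a `_cpp` suffix, matching the
        # `$(DIR_GGML)/%_cpp.o: $(DIR_GGML)/%.cpp` pattern rule.
        p = Path(src)
        if p.suffix == ".cpp":
            return str(p.with_name(p.stem + "_cpp.o"))
        return str(p.with_suffix(".o"))

    print([obj_before(s) for s in sources])  # both end in ggml-cpu.o
    print([obj_after(s) for s in sources])   # ggml-cpu.o vs ggml-cpu_cpp.o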

Package.swift

Lines changed: 3 additions & 2 deletions

@@ -10,14 +10,15 @@ var sources = [
     "src/unicode.cpp",
     "src/unicode-data.cpp",
     "ggml/src/ggml.c",
-    "ggml/src/ggml-aarch64.c",
     "ggml/src/ggml-alloc.c",
     "ggml/src/ggml-backend.cpp",
     "ggml/src/ggml-backend-reg.cpp",
     "ggml/src/ggml-cpu/ggml-cpu.c",
     "ggml/src/ggml-cpu/ggml-cpu.cpp",
-    "ggml/src/ggml-cpu/ggml-cpu-aarch64.c",
+    "ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp",
+    "ggml/src/ggml-cpu/ggml-cpu-hbm.cpp",
     "ggml/src/ggml-cpu/ggml-cpu-quants.c",
+    "ggml/src/ggml-cpu/ggml-cpu-traits.cpp",
     "ggml/src/ggml-threading.cpp",
     "ggml/src/ggml-quants.c",
 ]

common/arg.cpp

Lines changed: 1 addition & 1 deletion

@@ -786,7 +786,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params) {
             params.warmup = false;
         }
-    ).set_examples({LLAMA_EXAMPLE_MAIN}));
+    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}));
     add_opt(common_arg(
         {"--spm-infill"},
         string_format(

common/speculative.cpp

Lines changed: 4 additions & 0 deletions

@@ -62,6 +62,10 @@ struct common_speculative * common_speculative_init(
 }

 void common_speculative_free(struct common_speculative * spec) {
+    if (spec == nullptr) {
+        return;
+    }
+
     common_sampler_free(spec->smpl);

     llama_batch_free(spec->batch);

convert_hf_to_gguf.py

Lines changed: 6 additions & 2 deletions

@@ -661,6 +661,9 @@ def get_vocab_base_pre(self, tokenizer) -> str:
         if chkhsh == "1431a23e583c97432bc230bff598d103ddb5a1f89960c8f1d1051aaa944d0b35":
             # ref: https://huggingface.co/sapienzanlp/Minerva-7B-base-v1.0
             res = "minerva-7b"
+        if chkhsh == "8b5a93ed704057481f240da0be7e7dca721d7f8f4755263b6807227a2cbeae65":
+            # ref: https://huggingface.co/sentence-transformers/stsb-roberta-base
+            res = "roberta-bpe"

         if res is None:
             logger.warning("\n")
@@ -2533,7 +2536,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
         return [(self.map_tensor_name(name), data_torch)]


-@Model.register("BertModel", "CamembertModel")
+@Model.register("BertModel", "CamembertModel", "RobertaModel")
 class BertModel(Model):
     model_arch = gguf.MODEL_ARCH.BERT

@@ -2574,7 +2577,8 @@ def set_vocab(self):

         # we need this to validate the size of the token_type embeddings
         # though currently we are passing all zeros to the token_type embeddings
-        self.gguf_writer.add_token_type_count(2) # "Sequence A" or "Sequence B"
+        # "Sequence A" or "Sequence B"
+        self.gguf_writer.add_token_type_count(self.hparams.get("type_vocab_size", 1))

         # convert to phantom space vocab
         def phantom(tok):
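
The token-type count is now read from the checkpoint's hyperparameters instead of being hard-coded to 2, which is what lets RoBERTa checkpoints (whose `config.json` typically sets `type_vocab_size` to 1, or omits it) flow through the BERT conversion path. A minimal sketch of the lookup, with hypothetical `config.json` contents standing in for `self.hparams`:

    # Hypothetical configs for illustration only.
    bert_hparams    = {"type_vocab_size": 2}  # classic BERT: "Sequence A" / "Sequence B"
    roberta_hparams = {}                      # RoBERTa-style: key absent

    def token_type_count(hparams: dict) -> int:
        # Fall back to 1 when the key is missing, mirroring
        # self.hparams.get("type_vocab_size", 1) in the diff above.
        return hparams.get("type_vocab_size", 1)

    print(token_type_count(bert_hparams))     # 2
    print(token_type_count(roberta_hparams))  # 1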

convert_hf_to_gguf_update.py

Lines changed: 1 addition & 0 deletions

@@ -103,6 +103,7 @@ class TOKENIZER_TYPE(IntEnum):
     {"name": "phi-2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/microsoft/phi-2", },
     {"name": "chameleon", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/facebook/chameleon-7b", },
     {"name": "minerva-7b", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/sapienzanlp/Minerva-7B-base-v1.0", },
+    {"name": "roberta-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/sentence-transformers/stsb-roberta-base"},
 ]
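
For context, the `chkhsh` values matched in `get_vocab_base_pre()` come from this update script: it tokenizes a fixed probe string with each listed tokenizer and hashes the resulting token IDs, so registering `roberta-bpe` here is what produces the new fingerprint checked in convert_hf_to_gguf.py. A rough sketch of the idea, with a shortened probe string (the real script uses a much longer one):

    from hashlib import sha256

    def tokenizer_fingerprint(tokenizer, probe: str) -> str:
        # Hash the token IDs, not the raw text: tokenizers that
        # pre-tokenize the probe identically share a fingerprint.
        ids = tokenizer.encode(probe)
        return sha256(str(ids).encode()).hexdigest()

    # Usage sketch, assuming `transformers` is installed:
    #   from transformers import AutoTokenizer
    #   tok = AutoTokenizer.from_pretrained("sentence-transformers/stsb-roberta-base")
    #   print(tokenizer_fingerprint(tok, "Hello world 123"))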

docs/build.md

Lines changed: 1 addition & 1 deletion

@@ -55,7 +55,7 @@ cmake --build build --config Release
 cmake --preset arm64-windows-llvm-release -D GGML_OPENMP=OFF
 cmake --build build-arm64-windows-llvm-release
 ```
-Building for arm64 can also be done with the MSVC compiler with the build-arm64-windows-MSVC preset, or the standard CMake build instructions. However, note that the MSVC compiler does not support inline ARM assembly code, used e.g. for the accelerated Q4_0_4_8 CPU kernels.
+Building for arm64 can also be done with the MSVC compiler with the build-arm64-windows-MSVC preset, or the standard CMake build instructions. However, note that the MSVC compiler does not support inline ARM assembly code, used e.g. for the accelerated Q4_0_N_M CPU kernels.

 ## BLAS Build

examples/quantize/README.md

Lines changed: 0 additions & 2 deletions

@@ -54,8 +54,6 @@ As the models are currently fully loaded into memory, you will need adequate dis

 Several quantization methods are supported. They differ in the resulting model disk size and inference speed.

-The quantization formats `Q4_0_4_4`, `Q4_0_4_8` and `Q4_0_8_8` are block interleaved variants of the `Q4_0` format, providing a data layout that is better suited for specific implementations of optimized mulmat kernels. Since these formats differ only in data layout, they have the same quantized size as the `Q4_0` format.
-
 *(outdated)*

 | Model | Measure | F16 | Q4_0 | Q4_1 | Q5_0 | Q5_1 | Q8_0 |

examples/quantize/quantize.cpp

Lines changed: 0 additions & 3 deletions

@@ -48,9 +48,6 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
     { "Q5_K_M",   LLAMA_FTYPE_MOSTLY_Q5_K_M,   " 5.33G, +0.0569 ppl @ Llama-3-8B", },
     { "Q6_K",     LLAMA_FTYPE_MOSTLY_Q6_K,     " 6.14G, +0.0217 ppl @ Llama-3-8B", },
     { "Q8_0",     LLAMA_FTYPE_MOSTLY_Q8_0,     " 7.96G, +0.0026 ppl @ Llama-3-8B", },
-    { "Q4_0_4_4", LLAMA_FTYPE_MOSTLY_Q4_0_4_4, " 4.34G, +0.4685 ppl @ Llama-3-8B", },
-    { "Q4_0_4_8", LLAMA_FTYPE_MOSTLY_Q4_0_4_8, " 4.34G, +0.4685 ppl @ Llama-3-8B", },
-    { "Q4_0_8_8", LLAMA_FTYPE_MOSTLY_Q4_0_8_8, " 4.34G, +0.4685 ppl @ Llama-3-8B", },
     { "F16",      LLAMA_FTYPE_MOSTLY_F16,      "14.00G, +0.0020 ppl @ Mistral-7B", },
     { "BF16",     LLAMA_FTYPE_MOSTLY_BF16,     "14.00G, -0.0050 ppl @ Mistral-7B", },
     { "F32",      LLAMA_FTYPE_ALL_F32,         "26.00G @ 7B", },

examples/server/CMakeLists.txt

Lines changed: 0 additions & 8 deletions

@@ -34,14 +34,6 @@ endforeach()
 add_executable(${TARGET} ${TARGET_SRCS})
 install(TARGETS ${TARGET} RUNTIME)

-# clean up generated files in pre-build step
-foreach(asset ${PUBLIC_ASSETS})
-    set(output "${CMAKE_CURRENT_BINARY_DIR}/${asset}.hpp")
-    add_custom_command(TARGET ${TARGET} PRE_BUILD
-        COMMAND "${CMAKE_COMMAND}" -E remove -f "${output}"
-    )
-endforeach()
-
 target_link_libraries(${TARGET} PRIVATE common ${CMAKE_THREAD_LIBS_INIT})

 if (LLAMA_SERVER_SSL)
