Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 26 additions & 0 deletions .github/workflows/build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -1515,3 +1515,29 @@ jobs:
run: |
vulkaninfo --summary
GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp

ggml-ci-arm64-cpu-kleidiai:
runs-on: ubuntu-22.04-arm

steps:
- name: Clone
id: checkout
uses: actions/checkout@v4

- name: ccache
uses: ggml-org/[email protected]
with:
key: ggml-ci-arm64-cpu-kleidiai
evict-old-files: 1d

- name: Dependencies
id: depends
run: |
sudo apt-get update
sudo apt-get install -y build-essential libcurl4-openssl-dev

- name: Test
id: ggml-ci
run: |
GG_BUILD_KLEIDIAI=1 GG_BUILD_EXTRA_TESTS_0=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt

31 changes: 31 additions & 0 deletions ci/run.sh
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,9 @@
# # with MUSA support
# GG_BUILD_MUSA=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
#
# # with KLEIDIAI support
# GG_BUILD_KLEIDIAI=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
#

if [ -z "$2" ]; then
echo "usage: $0 <output-dir> <mnt-dir>"
Expand Down Expand Up @@ -115,6 +118,34 @@ if [ ! -z ${GG_BUILD_NO_SVE} ]; then
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_NATIVE=OFF -DGGML_CPU_ARM_ARCH=armv8.5-a+fp16+i8mm"
fi

if [ -n "${GG_BUILD_KLEIDIAI}" ]; then
echo ">>===== Enabling KleidiAI support"

CANDIDATES=("armv9-a+dotprod+i8mm" "armv8.6-a+dotprod+i8mm" "armv8.2-a+dotprod")
CPU=""

for cpu in "${CANDIDATES[@]}"; do
if echo 'int main(){}' | ${CXX:-c++} -march="$cpu" -x c++ - -c -o /dev/null >/dev/null 2>&1; then
CPU="$cpu"
break
fi
done

if [ -z "$CPU" ]; then
echo "ERROR: None of the required ARM baselines (armv9/armv8.6/armv8.2 + dotprod) are supported by this compiler."
exit 1
fi

echo ">>===== Using ARM baseline: ${CPU}"

CMAKE_EXTRA="${CMAKE_EXTRA:+$CMAKE_EXTRA } \
-DGGML_NATIVE=OFF \
-DGGML_CPU_KLEIDIAI=ON \
-DGGML_CPU_AARCH64=ON \
-DGGML_CPU_ARM_ARCH=${CPU} \
-DBUILD_SHARED_LIBS=OFF"
fi

## helpers

# download a file if it does not exist or if it is outdated
Expand Down
8 changes: 8 additions & 0 deletions common/arg.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1935,6 +1935,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.n_ctx_checkpoints = value;
}
).set_env("LLAMA_ARG_CTX_CHECKPOINTS").set_examples({LLAMA_EXAMPLE_SERVER}));
add_opt(common_arg(
{"--cache-ram", "-cram"}, "N",
string_format("set the maximum cache size in MiB (default: %d, -1 - no limit, 0 - disable)\n"
"[(more info)](https://github.com/ggml-org/llama.cpp/pull/16391)", params.cache_ram_mib),
[](common_params & params, int value) {
params.cache_ram_mib = value;
}
).set_env("LLAMA_ARG_CACHE_RAM").set_examples({LLAMA_EXAMPLE_SERVER}));
add_opt(common_arg(
{"--kv-unified", "-kvu"},
string_format("use single unified KV buffer for the KV cache of all sequences (default: %s)\n"
Expand Down
6 changes: 3 additions & 3 deletions common/chat.h
Original file line number Diff line number Diff line change
Expand Up @@ -33,8 +33,8 @@ struct common_chat_msg_content_part {
struct common_chat_msg {
std::string role;
std::string content;
std::vector<common_chat_msg_content_part> content_parts = {};
std::vector<common_chat_tool_call> tool_calls = {};
std::vector<common_chat_msg_content_part> content_parts;
std::vector<common_chat_tool_call> tool_calls;
std::string reasoning_content;
std::string tool_name;
std::string tool_call_id;
Expand All @@ -44,7 +44,7 @@ struct common_chat_msg {
bool empty() const {
return content.empty() && content_parts.empty() && tool_calls.empty() && reasoning_content.empty() && tool_name.empty() && tool_call_id.empty();
}
void ensure_tool_call_ids_set(std::vector<std::string> & ids_cache, const std::function<std::string()> & gen_tool_call_id) {
void set_tool_call_ids(std::vector<std::string> & ids_cache, const std::function<std::string()> & gen_tool_call_id) {
for (auto i = 0u; i < tool_calls.size(); i++) {
if (ids_cache.size() <= i) {
auto id = tool_calls[i].id;
Expand Down
5 changes: 3 additions & 2 deletions common/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -378,7 +378,7 @@ struct common_params {
bool simple_io = false; // improves compatibility with subprocesses and limited consoles
bool cont_batching = true; // insert new sequences for decoding on-the-fly
bool no_perf = false; // disable performance metrics
bool ctx_shift = false; // context shift on infinite text generation
bool ctx_shift = false; // context shift on infinite text generation
bool swa_full = false; // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
bool kv_unified = false; // enable unified KV cache

Expand Down Expand Up @@ -425,7 +425,8 @@ struct common_params {
int32_t timeout_write = timeout_read; // http write timeout in seconds
int32_t n_threads_http = -1; // number of threads to process HTTP requests (TODO: support threadpool)
int32_t n_cache_reuse = 0; // min chunk size to reuse from the cache via KV shifting
int32_t n_ctx_checkpoints = 3; // max number of context checkpoints per slot
int32_t n_ctx_checkpoints = 8; // max number of context checkpoints per slot
int32_t cache_ram_mib = 8192; // 0 = no limit, 1 = 1 MiB, etc.

std::string hostname = "127.0.0.1";
std::string public_path = ""; // NOLINT
Expand Down
72 changes: 69 additions & 3 deletions convert_hf_to_gguf.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,13 +93,15 @@ class ModelBase:
# Mistral format specifics
is_mistral_format: bool = False
disable_mistral_community_chat_template: bool = False
sentence_transformers_dense_modules: bool = False

def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, *, is_big_endian: bool = False,
use_temp_file: bool = False, eager: bool = False,
metadata_override: Path | None = None, model_name: str | None = None,
split_max_tensors: int = 0, split_max_size: int = 0, dry_run: bool = False,
small_first_shard: bool = False, hparams: dict[str, Any] | None = None, remote_hf_model_id: str | None = None,
disable_mistral_community_chat_template: bool = False):
disable_mistral_community_chat_template: bool = False,
sentence_transformers_dense_modules: bool = False):
if type(self) is ModelBase or \
type(self) is TextModel or \
type(self) is MmprojModel:
Expand All @@ -114,6 +116,7 @@ def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path,
self.lazy = not eager or (remote_hf_model_id is not None)
self.dry_run = dry_run
self.remote_hf_model_id = remote_hf_model_id
self.sentence_transformers_dense_modules = sentence_transformers_dense_modules
if remote_hf_model_id is not None:
self.is_safetensors = True

Expand Down Expand Up @@ -5269,6 +5272,53 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
@ModelBase.register("Gemma3TextModel")
class EmbeddingGemma(Gemma3Model):
model_arch = gguf.MODEL_ARCH.GEMMA_EMBEDDING
module_paths = []
dense_features_dims = {}

def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
if self.sentence_transformers_dense_modules:
# read modules.json to determine if model has Dense layers
modules_file = self.dir_model / "modules.json"
if modules_file.is_file():
with open(modules_file, encoding="utf-8") as modules_json_file:
mods = json.load(modules_json_file)
for mod in mods:
if mod["type"] == "sentence_transformers.models.Dense":
mod_path = mod["path"]
# check if model.safetensors file for Dense layer exists
model_tensors_file = self.dir_model / mod_path / "model.safetensors"
if model_tensors_file.is_file():
self.module_paths.append(mod_path)
# read config.json of the Dense layer to get in/out features
mod_conf_file = self.dir_model / mod_path / "config.json"
if mod_conf_file.is_file():
with open(mod_conf_file, encoding="utf-8") as mod_conf_json_file:
mod_conf = json.load(mod_conf_json_file)
# hparams dense_2_feat_out and dense_3_feat_in are required when loading model's dense weights
prefix = self._get_dense_prefix(mod_path)
if mod_conf["in_features"] is not None and mod_conf["out_features"] is not None:
self.dense_features_dims[prefix] = (mod_conf["in_features"], mod_conf["out_features"])

def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
from safetensors.torch import load_file
module_paths = list(self.module_paths)
for i, module_path in enumerate(module_paths):
tensors_file = self.dir_model / module_path / "model.safetensors"
local_tensors = load_file(tensors_file)
tensor_name = self._get_dense_prefix(module_path)
for name, local_tensor in local_tensors.items():
if not name.endswith(".weight"):
continue
orig_name = name.replace("linear", tensor_name)
name = self.map_tensor_name(orig_name)
yield name, local_tensor.clone()

@staticmethod
def _get_dense_prefix(module_path) -> str:
"""Get the tensor name prefix for the Dense layer from module path."""
tensor_name = "dense_2" if module_path == "2_Dense" else "dense_3"
return tensor_name

def set_gguf_parameters(self):
super().set_gguf_parameters()
Expand All @@ -5285,6 +5335,10 @@ def set_gguf_parameters(self):
logger.info(f"Using original sliding_window from config: {orig_sliding_window} "
f"instead of {self.hparams['sliding_window']}")
self.gguf_writer.add_sliding_window(orig_sliding_window)
if self.sentence_transformers_dense_modules:
for dense, dims in self.dense_features_dims.items():
logger.info(f"Setting dense layer {dense} in/out features to {dims}")
self.gguf_writer.add_dense_features_dims(dense, dims[0], dims[1])

self._try_set_pooling_type()

Expand Down Expand Up @@ -9335,6 +9389,13 @@ def parse_args() -> argparse.Namespace:
)
)

parser.add_argument(
"--sentence-transformers-dense-modules", action="store_true",
help=("Whether to include sentence-transformers dense modules."
"It can be used for sentence-transformers models, like google/embeddinggemma-300m"
"Default these modules are not included.")
)

args = parser.parse_args()
if not args.print_supported_models and args.model is None:
parser.error("the following arguments are required: model")
Expand Down Expand Up @@ -9397,9 +9458,13 @@ def main() -> None:
if args.remote:
hf_repo_id = args.model
from huggingface_hub import snapshot_download
allowed_patterns = ["LICENSE", "*.json", "*.md", "*.txt", "tokenizer.model"]
if args.sentence_transformers_dense_modules:
# include sentence-transformers dense modules safetensors files
allowed_patterns.append("*.safetensors")
local_dir = snapshot_download(
repo_id=hf_repo_id,
allow_patterns=["LICENSE", "*.json", "*.md", "*.txt", "tokenizer.model"])
allow_patterns=allowed_patterns)
dir_model = Path(local_dir)
logger.info(f"Downloaded config and tokenizer to {local_dir}")
else:
Expand Down Expand Up @@ -9467,7 +9532,8 @@ def main() -> None:
split_max_tensors=args.split_max_tensors,
split_max_size=split_str_to_n_bytes(args.split_max_size), dry_run=args.dry_run,
small_first_shard=args.no_tensor_first_split,
remote_hf_model_id=hf_repo_id, disable_mistral_community_chat_template=disable_mistral_community_chat_template
remote_hf_model_id=hf_repo_id, disable_mistral_community_chat_template=disable_mistral_community_chat_template,
sentence_transformers_dense_modules=args.sentence_transformers_dense_modules
)

if args.vocab_only:
Expand Down
23 changes: 21 additions & 2 deletions examples/model-conversion/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -116,20 +116,39 @@ embedding-convert-model:
METADATA_OVERRIDE="$(METADATA_OVERRIDE)" \
./scripts/embedding/convert-model.sh

embedding-convert-model-st:
$(call validate_embedding_model_path,embedding-convert-model-st)
@MODEL_NAME="$(MODEL_NAME)" OUTTYPE="$(OUTTYPE)" MODEL_PATH="$(EMBEDDING_MODEL_PATH)" \
METADATA_OVERRIDE="$(METADATA_OVERRIDE)" \
./scripts/embedding/convert-model.sh -st

embedding-run-original-model:
$(call validate_embedding_model_path,embedding-run-original-model)
@EMBEDDING_MODEL_PATH="$(EMBEDDING_MODEL_PATH)" \
USE_SENTENCE_TRANSFORMERS="$(USE_SENTENCE_TRANSFORMERS)" \
./scripts/embedding/run-original-model.py \
$(if $(PROMPTS_FILE),--prompts-file "$(PROMPTS_FILE)")
$(if $(PROMPTS_FILE),--prompts-file "$(PROMPTS_FILE)") \
$(if $(USE_SENTENCE_TRANSFORMERS),--use-sentence-transformers)

embedding-run-original-model-st: USE_SENTENCE_TRANSFORMERS=1
embedding-run-original-model-st: embedding-run-original-model

embedding-run-converted-model:
@./scripts/embedding/run-converted-model.sh $(CONVERTED_EMBEDDING_MODEL) \
$(if $(PROMPTS_FILE),--prompts-file "$(PROMPTS_FILE)")
$(if $(PROMPTS_FILE),--prompts-file "$(PROMPTS_FILE)") \
$(if $(USE_POOLING),--pooling)

embedding-run-converted-model-st: USE_POOLING=1
embedding-run-converted-model-st: embedding-run-converted-model

embedding-verify-logits: embedding-run-original-model embedding-run-converted-model
@./scripts/embedding/compare-embeddings-logits.sh \
$(if $(PROMPTS_FILE),--prompts-file "$(PROMPTS_FILE)")

embedding-verify-logits-st: embedding-run-original-model-st embedding-run-converted-model-st
@./scripts/embedding/compare-embeddings-logits.sh \
$(if $(PROMPTS_FILE),--prompts-file "$(PROMPTS_FILE)")

embedding-inspect-original-model:
$(call validate_embedding_model_path,embedding-inspect-original-model)
@EMBEDDING_MODEL_PATH="$(EMBEDDING_MODEL_PATH)" ./scripts/utils/inspect-org-model.py -m ${EMBEDDING_MODEL_PATH}
Expand Down
24 changes: 24 additions & 0 deletions examples/model-conversion/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -189,6 +189,23 @@ This command will save two files to the `data` directory, one is a binary
file containing logits which will be used for comparison with the converted
model, and the other is a text file which allows for manual visual inspection.

#### Using SentenceTransformer with numbered layers
For models that have numbered SentenceTransformer layers (01_Pooling, 02_Dense,
03_Dense, 04_Normalize), use the `-st` targets to apply all these layers:

```console
# Run original model with SentenceTransformer (applies all numbered layers)
(venv) $ make embedding-run-original-model-st

# Run converted model with pooling enabled
(venv) $ make embedding-run-converted-model-st
```

This will use the SentenceTransformer library to load and run the model, which
automatically applies all the numbered layers in the correct order. This is
particularly useful when comparing with models that should include these
additional transformation layers beyond just the base model output.

### Model conversion
After updates have been made to [gguf-py](../../gguf-py) to add support for the
new model the model can be converted to GGUF format using the following command:
Expand All @@ -208,6 +225,13 @@ was done manually in the previous steps) and compare the logits:
(venv) $ make embedding-verify-logits
```

For models with SentenceTransformer layers, use the `-st` verification target:
```console
(venv) $ make embedding-verify-logits-st
```
This convenience target automatically runs both the original model with SentenceTransformer
and the converted model with pooling enabled, then compares the results.

### llama-server verification
To verify that the converted model works with llama-server, the following
command can be used:
Expand Down
Loading