12 changes: 6 additions & 6 deletions .github/workflows/docker.yml
@@ -36,13 +36,13 @@ jobs:
matrix:
config:
# Multi-stage build
- { tag: "cpu", dockerfile: ".devops/cpu.Dockerfile", platforms: "linux/amd64,linux/arm64", full: true, light: true, server: true, freediskspace: false}
- { tag: "cuda", dockerfile: ".devops/cuda.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, freediskspace: false}
- { tag: "musa", dockerfile: ".devops/musa.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, freediskspace: true}
- { tag: "intel", dockerfile: ".devops/intel.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, freediskspace: false}
- { tag: "vulkan", dockerfile: ".devops/vulkan.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, freediskspace: false}
- { tag: "cpu", dockerfile: ".devops/cpu.Dockerfile", platforms: "linux/amd64,linux/arm64", full: true, light: true, server: true, free_disk_space: false }
- { tag: "cuda", dockerfile: ".devops/cuda.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: false }
- { tag: "musa", dockerfile: ".devops/musa.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: true }
- { tag: "intel", dockerfile: ".devops/intel.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: false }
- { tag: "vulkan", dockerfile: ".devops/vulkan.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: false }
# Note: the rocm images are failing due to a compiler error and are disabled until this is fixed to allow the workflow to complete
#- {tag: "rocm", dockerfile: ".devops/rocm.Dockerfile", platforms: "linux/amd64,linux/arm64", full: true, light: true, server: true, freediskspace: true }
#- {tag: "rocm", dockerfile: ".devops/rocm.Dockerfile", platforms: "linux/amd64,linux/arm64", full: true, light: true, server: true, free_disk_space: true }
steps:
- name: Check out the repo
uses: actions/checkout@v4
5 changes: 4 additions & 1 deletion README.md
@@ -97,6 +97,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
- [x] [Flan T5](https://huggingface.co/models?search=flan-t5)
- [x] [Open Elm models](https://huggingface.co/collections/apple/openelm-instruct-models-6619ad295d7ae9f868b759ca)
- [x] [ChatGLM3-6b](https://huggingface.co/THUDM/chatglm3-6b) + [ChatGLM4-9b](https://huggingface.co/THUDM/glm-4-9b) + [GLMEdge-1.5b](https://huggingface.co/THUDM/glm-edge-1.5b-chat) + [GLMEdge-4b](https://huggingface.co/THUDM/glm-edge-4b-chat)
- [x] [GLM-4-0414](https://huggingface.co/collections/THUDM/glm-4-0414-67f3cbcb34dd9d252707cb2e)
- [x] [SmolLM](https://huggingface.co/collections/HuggingFaceTB/smollm-6695016cad7167254ce15966)
- [x] [EXAONE-3.0-7.8B-Instruct](https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct)
- [x] [FalconMamba Models](https://huggingface.co/collections/tiiuae/falconmamba-7b-66b9a580324dd1598b0f6d4a)
@@ -259,7 +260,9 @@ The [Hugging Face](https://huggingface.co) platform hosts a [number of LLMs](htt
- [Trending](https://huggingface.co/models?library=gguf&sort=trending)
- [LLaMA](https://huggingface.co/models?sort=trending&search=llama+gguf)

You can either manually download the GGUF file or directly use any `llama.cpp`-compatible models from Hugging Face by using this CLI argument: `-hf <user>/<model>[:quant]`
You can either manually download the GGUF file or directly use any `llama.cpp`-compatible models from [Hugging Face](https://huggingface.co/) or other model hosting sites, such as [ModelScope](https://modelscope.cn/), by using this CLI argument: `-hf <user>/<model>[:quant]`.

By default, the CLI downloads from Hugging Face; you can switch to another endpoint with the `MODEL_ENDPOINT` environment variable. For example, to pull model checkpoints from ModelScope or another model-sharing community, set `MODEL_ENDPOINT=https://www.modelscope.cn/`.

After downloading a model, use the CLI tools to run it locally - see below.
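For instance, assuming the standard `llama-cli` binary, downloading and running a model from ModelScope in one step could look like `MODEL_ENDPOINT=https://www.modelscope.cn/ llama-cli -hf <user>/<model>[:quant]` (the repo name is a placeholder).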

20 changes: 16 additions & 4 deletions build-xcframework.sh
@@ -41,6 +41,11 @@ COMMON_CMAKE_ARGS=(
-DGGML_OPENMP=${GGML_OPENMP}
)

XCODE_VERSION=$(xcodebuild -version 2>/dev/null | head -n1 | awk '{ print $2 }')
MAJOR_VERSION=$(echo $XCODE_VERSION | cut -d. -f1)
MINOR_VERSION=$(echo $XCODE_VERSION | cut -d. -f2)
echo "Detected Xcode version: $XCODE_VERSION"

check_required_tool() {
local tool=$1
local install_message=$2
@@ -325,21 +330,28 @@ combine_static_libraries() {

# Platform-specific post-processing for device builds
if [[ "$is_simulator" == "false" ]]; then
if command -v vtool &>/dev/null; then
if command -v xcrun vtool &>/dev/null; then
case "$platform" in
"ios")
echo "Marking binary as a framework binary for iOS..."
vtool -set-build-version ios ${IOS_MIN_OS_VERSION} ${IOS_MIN_OS_VERSION} -replace \
xcrun vtool -set-build-version ios ${IOS_MIN_OS_VERSION} ${IOS_MIN_OS_VERSION} -replace \
-output "${base_dir}/${output_lib}" "${base_dir}/${output_lib}"
;;
"visionos")
echo "Marking binary as a framework binary for visionOS..."
vtool -set-build-version xros ${VISIONOS_MIN_OS_VERSION} ${VISIONOS_MIN_OS_VERSION} -replace \
if [[ "$MAJOR_VERSION" -gt 16 ]] || [[ "$MAJOR_VERSION" -eq 16 && "$MINOR_VERSION" -gt 2 ]]; then
echo "Xcode version greater than 16.2, using visionOS."
VISION_OS_BUILD_VERSION="visionos"
else
echo "Xcode version less than or equal to 16.2, using xros."
VISION_OS_BUILD_VERSION="xros"
fi
xcrun vtool -set-build-version ${VISION_OS_BUILD_VERSION} ${VISIONOS_MIN_OS_VERSION} ${VISIONOS_MIN_OS_VERSION} -replace \
-output "${base_dir}/${output_lib}" "${base_dir}/${output_lib}"
;;
"tvos")
echo "Marking binary as a framework binary for tvOS..."
vtool -set-build-version tvos ${TVOS_MIN_OS_VERSION} ${TVOS_MIN_OS_VERSION} -replace \
xcrun vtool -set-build-version tvos ${TVOS_MIN_OS_VERSION} ${TVOS_MIN_OS_VERSION} -replace \
-output "${base_dir}/${output_lib}" "${base_dir}/${output_lib}"
;;
esac
17 changes: 8 additions & 9 deletions common/arg.cpp
@@ -228,12 +228,13 @@ static bool common_download_file_single(const std::string & url, const std::stri
curl_easy_setopt(curl.get(), CURLOPT_URL, url.c_str());
curl_easy_setopt(curl.get(), CURLOPT_FOLLOWLOCATION, 1L);

http_headers.ptr = curl_slist_append(http_headers.ptr, "User-Agent: llama-cpp");
// Check if hf-token or bearer-token was specified
if (!bearer_token.empty()) {
std::string auth_header = "Authorization: Bearer " + bearer_token;
http_headers.ptr = curl_slist_append(http_headers.ptr, auth_header.c_str());
curl_easy_setopt(curl.get(), CURLOPT_HTTPHEADER, http_headers.ptr);
}
curl_easy_setopt(curl.get(), CURLOPT_HTTPHEADER, http_headers.ptr);

#if defined(_WIN32)
// CURLSSLOPT_NATIVE_CA tells libcurl to use standard certificate store of
@@ -544,7 +545,10 @@ static struct common_hf_file_res common_get_hf_file(const std::string & hf_repo_
curl_ptr curl(curl_easy_init(), &curl_easy_cleanup);
curl_slist_ptr http_headers;
std::string res_str;
std::string url = "https://huggingface.co/v2/" + hf_repo + "/manifests/" + tag;

std::string model_endpoint = get_model_endpoint();

std::string url = model_endpoint + "v2/" + hf_repo + "/manifests/" + tag;
curl_easy_setopt(curl.get(), CURLOPT_URL, url.c_str());
curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 1L);
typedef size_t(*CURLOPT_WRITEFUNCTION_PTR)(void * ptr, size_t size, size_t nmemb, void * data);
@@ -659,13 +663,8 @@ static void common_params_handle_model(
}
}

std::string hf_endpoint = "https://huggingface.co/";
const char * hf_endpoint_env = getenv("HF_ENDPOINT");
if (hf_endpoint_env) {
hf_endpoint = hf_endpoint_env;
if (hf_endpoint.back() != '/') hf_endpoint += '/';
}
model.url = hf_endpoint + model.hf_repo + "/resolve/main/" + model.hf_file;
std::string model_endpoint = get_model_endpoint();
model.url = model_endpoint + model.hf_repo + "/resolve/main/" + model.hf_file;
// make sure model path is present (for caching purposes)
if (model.path.empty()) {
// this is to avoid different repo having same file name, or same file name in different subdirs
2 changes: 1 addition & 1 deletion common/chat.cpp
@@ -1622,7 +1622,7 @@ static common_chat_params common_chat_templates_apply_jinja(
}

// Hermes 2/3 Pro, Qwen 2.5 Instruct (w/ tools)
if (src.find("<tool_call>") != std::string::npos && params.json_schema.is_null()) {
if (src.find("<tool_call>") != std::string::npos && params.json_schema.is_null() && params.tools.is_array()) {
return common_chat_params_init_hermes_2_pro(tmpl, params);
}

19 changes: 17 additions & 2 deletions common/common.cpp
@@ -830,7 +830,7 @@ std::string fs_get_cache_directory() {
if (getenv("LLAMA_CACHE")) {
cache_directory = std::getenv("LLAMA_CACHE");
} else {
#ifdef __linux__
#if defined(__linux__) || defined(__FreeBSD__)
if (std::getenv("XDG_CACHE_HOME")) {
cache_directory = std::getenv("XDG_CACHE_HOME");
} else {
@@ -840,7 +840,9 @@ std::string fs_get_cache_directory() {
cache_directory = std::getenv("HOME") + std::string("/Library/Caches/");
#elif defined(_WIN32)
cache_directory = std::getenv("LOCALAPPDATA");
#endif // __linux__
#else
# error Unknown architecture
#endif
cache_directory = ensure_trailing_slash(cache_directory);
cache_directory += "llama.cpp";
}
@@ -1027,6 +1029,19 @@ struct common_init_result common_init_from_params(common_params & params) {
return iparams;
}

std::string get_model_endpoint() {
const char * model_endpoint_env = getenv("MODEL_ENDPOINT");
// We still respect the use of environment-variable "HF_ENDPOINT" for backward-compatibility.
const char * hf_endpoint_env = getenv("HF_ENDPOINT");
const char * endpoint_env = model_endpoint_env ? model_endpoint_env : hf_endpoint_env;
std::string model_endpoint = "https://huggingface.co/";
if (endpoint_env) {
model_endpoint = endpoint_env;
if (model_endpoint.back() != '/') model_endpoint += '/';
}
return model_endpoint;
}

void common_set_adapter_lora(struct llama_context * ctx, std::vector<common_adapter_lora_info> & lora) {
llama_clear_adapter_lora(ctx);
for (auto & la : lora) {
2 changes: 2 additions & 0 deletions common/common.h
@@ -543,6 +543,8 @@ struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_p
// clear LoRA adapters from context, then apply new list of adapters
void common_set_adapter_lora(struct llama_context * ctx, std::vector<common_adapter_lora_info> & lora);

std::string get_model_endpoint();

//
// Batch utils
//
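A minimal sketch of how the new `get_model_endpoint()` helper composes download URLs, mirroring the call sites shown in common/arg.cpp above (the repo, tag, and file names are placeholders, and error handling is omitted):

#include <iostream>
#include <string>

#include "common.h"  // declares get_model_endpoint()

int main() {
    // MODEL_ENDPOINT takes priority over HF_ENDPOINT; the default is
    // https://huggingface.co/, and the result always ends with a slash.
    const std::string endpoint = get_model_endpoint();

    // Placeholder repo/tag/file, following the URL patterns used by
    // common_get_hf_file() and common_params_handle_model().
    const std::string repo         = "user/model";
    const std::string manifest_url = endpoint + "v2/" + repo + "/manifests/latest";
    const std::string file_url     = endpoint + repo + "/resolve/main/model.gguf";

    std::cout << manifest_url << "\n" << file_url << "\n";
    return 0;
}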
26 changes: 20 additions & 6 deletions convert_hf_to_gguf.py
@@ -735,6 +735,9 @@ def get_vocab_base_pre(self, tokenizer) -> str:
if chkhsh == "d353350c764d8c3b39c763113960e4fb4919bea5fbf208a0e3b22e8469dc7406":
# ref: https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct
res = "llama4"
if chkhsh == "a1336059768a55c99a734006ffb02203cd450fed003e9a71886c88acf24fdbc2":
# ref: https://huggingface.co/THUDM/glm-4-9b-hf
res = "glm4"

if res is None:
logger.warning("\n")
@@ -1750,7 +1753,7 @@ def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:

low_freq_wavelen = old_context_len / low_freq_factor
high_freq_wavelen = old_context_len / high_freq_factor
assert low_freq_wavelen != high_freq_wavelen
# assert low_freq_wavelen != high_freq_wavelen # Errors for Llama4

rope_factors = []
for freq in freqs:
@@ -1806,10 +1809,6 @@ def set_gguf_parameters(self):
self.gguf_writer.add_expert_feed_forward_length(self.hparams["intermediate_size_moe"])

def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None):
name = name.replace("language_model.", "")
name = name.replace("feed_forward.", "mlp.") # a bit hacky for now
name = name.replace(".router.weight", ".gate.weight") # a bit hacky for now

# split the gate_up into gate and up
if "gate_up_proj" in name:
name_up = name.replace("gate_up_proj", "up_proj.weight")
@@ -4901,6 +4900,22 @@ def prepare_tensors(self):
self.gguf_writer.add_max_alibi_bias(self.max_alibi_bias)


@Model.register("Glm4ForCausalLM")
class Glm4Model(Model):
model_arch = gguf.MODEL_ARCH.GLM4

def set_vocab(self):
self._set_vocab_gpt2()

def set_gguf_parameters(self):
super().set_gguf_parameters()
if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
if self.hparams["rope_scaling"].get("type") == "yarn":
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
self.gguf_writer.add_rope_scaling_orig_ctx_len(self.hparams["rope_scaling"]["original_max_position_embeddings"])


@Model.register("GlmForCausalLM", "ChatGLMModel", "ChatGLMForConditionalGeneration")
class ChatGLMModel(Model):
model_arch = gguf.MODEL_ARCH.CHATGLM
@@ -5592,7 +5607,6 @@ def main() -> None:
with torch.inference_mode():
output_type = ftype_map[args.outtype]
model_architecture = hparams["architectures"][0]

try:
model_class = Model.from_model_architecture(model_architecture)
except NotImplementedError:
1 change: 1 addition & 0 deletions convert_hf_to_gguf_update.py
@@ -114,6 +114,7 @@ class TOKENIZER_TYPE(IntEnum):
{"name": "trillion", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/trillionlabs/Trillion-7B-preview", },
{"name": "bailingmoe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/inclusionAI/Ling-lite", },
{"name": "llama4", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct", },
{"name": "glm4", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/THUDM/glm-4-9b-hf", },
]


23 changes: 16 additions & 7 deletions examples/llava/clip-impl.h
@@ -1,5 +1,6 @@
#include "ggml.h"
#include "gguf.h"
#include "clip.h"

#include "clip.h"

@@ -202,23 +203,31 @@ static void clip_log_internal(enum ggml_log_level level, const char * format, ..
// cpp wrappers
//

// wrapper for clip_image_size
struct clip_image_size_deleter {
void operator()(clip_image_size * val) { clip_image_size_free(val); }
};
typedef std::unique_ptr<clip_image_size, clip_image_size_deleter> clip_image_size_ptr;

// wrapper for clip_image_u8
struct clip_image_u8_deleter {
void operator()(clip_image_u8 * val) { clip_image_u8_free(val); }
};
typedef std::unique_ptr<clip_image_u8, clip_image_u8_deleter> clip_image_u8_ptr;

// wrapper for clip_image_f32
struct clip_image_f32_deleter {
void operator()(clip_image_f32 * val) { clip_image_f32_free(val); }
};
typedef std::unique_ptr<clip_image_f32, clip_image_f32_deleter> clip_image_f32_ptr;

struct clip_image_f32_batch_deleter {
void operator()(clip_image_f32_batch * val) { clip_image_f32_batch_free(val); }
struct clip_image_u8_batch {
std::vector<clip_image_u8_ptr> entries;
};

typedef std::unique_ptr<clip_image_u8, clip_image_u8_deleter> clip_image_u8_ptr;
typedef std::unique_ptr<clip_image_f32, clip_image_f32_deleter> clip_image_f32_ptr;
typedef std::unique_ptr<clip_image_f32_batch, clip_image_f32_batch_deleter> clip_image_f32_batch_ptr;

// TODO @ngxson : we're currently having a naming clash between struct clip_image_size and function clip_image_size()
struct clip_image_f32_batch {
std::vector<clip_image_f32_ptr> entries;
};

//
// common utils
Expand Down