Name and Version
build/bin/llama-cli --version
register_backend: registered backend Metal (1 devices)
register_device: registered device Metal (Apple M3)
register_backend: registered backend BLAS (1 devices)
register_device: registered device BLAS (Accelerate)
register_backend: registered backend CPU (1 devices)
register_device: registered device CPU (Apple M3)
version: 6457 (4bf55492)
built with Apple clang version 17.0.0 (clang-1700.0.13.5) for arm64-apple-darwin24.6.0
Operating systems
Mac
GGML backends
Metal
Hardware
MacBook Air (Apple M3, 24 GB RAM)
macOS 15.6.1 (24G90)
Models
google_gemma-3-270m-it-Q8_0.gguf
https://huggingface.co/bartowski/google_gemma-3-270m-it-GGUF/resolve/main/google_gemma-3-270m-it-Q8_0.gguf
Problem description & steps to reproduce
Repeatedly creating and freeing a llama_context while the parent model stays loaded leaks memory: resident set size grows by roughly 35 MB per 1000 create/free cycles (see the output below). The standalone reproducer test.c loads the Gemma 3 270M model once, then loops llama_init_from_model / llama_free 10000 times, printing RSS every 1000 iterations.
test.c
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <pwd.h>
#include <mach/mach.h>

#include "llama.h"

static const char *model_rel =
    "/Downloads/Models/google_gemma-3-270m-it-Q8_0.gguf";
static const char *model_url =
    "https://huggingface.co/bartowski/google_gemma-3-270m-it-GGUF/"
    "resolve/main/google_gemma-3-270m-it-Q8_0.gguf";

// Log callback: forward all llama.cpp log output to stderr, so it can be
// silenced with `2>/dev/null` while the progress prints stay on stdout.
static void llm_log(enum ggml_log_level level, const char *text, void *that) {
    (void)level;
    (void)that;
    fprintf(stderr, "%s", text);
}

// Resident set size of the current process in MiB, via Mach task_info.
static size_t memory_used_mb(void) {
    struct mach_task_basic_info i;
    mach_msg_type_number_t c = MACH_TASK_BASIC_INFO_COUNT;
    kern_return_t k = task_info(mach_task_self(), MACH_TASK_BASIC_INFO,
                                (task_info_t)&i, &c);
    size_t r = 0;
    if (k == KERN_SUCCESS) { r = (size_t)(i.resident_size / (1024 * 1024)); }
    return r;
}

static const char *home_dir(void) {
    const char *h = getenv("HOME");
    if (!h || !*h) {
        struct passwd *pw = getpwuid(getuid());
        h = (pw && pw->pw_dir) ? pw->pw_dir : "/";
    }
    return h;
}

// Download the model with curl if it is not already present at `path`.
static int ensure_model(const char *path) {
    int ok = 0;
    if (access(path, R_OK) == 0) { ok = 1; }
    if (!ok) {
        char cmd[2048] = {0};
        int n = snprintf(cmd, sizeof(cmd),
                         "mkdir -p \"$(dirname '%s')\" && "
                         "curl -L --fail --retry 3 -o \"%s\" \"%s\"",
                         path, path, model_url);
        if (n > 0 && n < (int)sizeof(cmd)) { ok = (system(cmd) == 0); }
        if (ok) { ok = (access(path, R_OK) == 0); }
    }
    return ok;
}

int main(int argc, const char *argv[]) {
    (void)argc;
    (void)argv;

    const char *home = home_dir();
    char path[1024] = {0};
    int n = snprintf(path, sizeof(path), "%s%s", home, model_rel);
    assert(n > 0 && n < (int)sizeof(path));
    printf("model: %s\n", path);

    int ok = ensure_model(path);
    assert(ok);

    int loops = 10000;
    llama_backend_init();
    ggml_backend_load_all();
    ggml_log_set(llm_log, 0);

    // Load the model exactly once; only contexts are created/freed in the loop.
    struct llama_model_params mp = llama_model_default_params();
    struct llama_model *m = llama_model_load_from_file(path, mp);
    assert(m);

    struct llama_context_params cp = llama_context_default_params();
    for (int i = 0; i < loops; i++) {
        struct llama_context *c = llama_init_from_model(m, cp);
        assert(c);
        llama_free(c);
        if ((i % 1000) == 0) {
            printf("[%04d] memory used: %zuMB\n", i, memory_used_mb());
        }
    }

    llama_model_free(m);
    llama_backend_free();
    return 0;
}
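
To help narrow down whether the growth comes from the Metal backend or from the context setup itself, the reproducer can be re-run with GPU offload disabled. This is only a triage sketch using the existing n_gpu_layers field of llama_model_params; I have not claimed the result either way:

/* Triage variant (sketch): in main() of test.c, disable Metal offload so
 * all weights and compute stay on the CPU backend, then re-run the loop
 * and compare the RSS growth against the numbers below. */
struct llama_model_params mp = llama_model_default_params();
mp.n_gpu_layers = 0;  // keep every layer on the CPU backend
struct llama_model *m = llama_model_load_from_file(path, mp);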
llama.cpp $ cmake -B build -DCMAKE_BUILD_TYPE=Debug -DBUILD_SHARED_LIBS=OFF
...
llama.cpp $ cmake --build build --config Debug
....
llama.cpp $
clang -Iinclude -Iggml/include -c test.c -o test.o
clang++ -o test test.o \
build/src/libllama.a \
build/common/libcommon.a \
build/ggml/src/libggml.a \
build/ggml/src/libggml-base.a \
build/ggml/src/libggml-cpu.a \
build/ggml/src/ggml-blas/libggml-blas.a \
build/ggml/src/ggml-metal/libggml-metal.a \
build/tools/mtmd/libmtmd.a \
-pthread \
-framework Accelerate \
-framework Foundation \
-framework Metal \
-framework MetalKit
llama.cpp $ ./c.sh && ./test 2>/dev/null   # c.sh: the compile/link commands above
model: /Users/leo/Downloads/Models/google_gemma-3-270m-it-Q8_0.gguf
[0000] memory used: 371MB
[1000] memory used: 412MB
[2000] memory used: 449MB
[3000] memory used: 485MB
[4000] memory used: 520MB
[5000] memory used: 555MB
[6000] memory used: 591MB
[7000] memory used: 626MB
[8000] memory used: 662MB
[9000] memory used: 697MB
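The growth is close to linear at about 35 MB per 1000 iterations. A quick back-of-the-envelope check from the first and last samples above (a standalone sketch, not part of the reproducer):

#include <stdio.h>

/* Leak-rate estimate from the RSS samples printed above:
 * (697 - 371) MB over 9000 iterations is roughly 37 KB per
 * context create/free cycle. */
int main(void) {
    const double first_mb = 371.0, last_mb = 697.0;
    const int iterations = 9000;
    printf("approx leak per context: %.1f KB\n",
           (last_mb - first_mb) * 1024.0 / iterations);
    return 0;
}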
First Bad Commit
N/A
Relevant log output
register_backend: registered backend Metal (1 devices)
register_device: registered device Metal (Apple M3)
register_backend: registered backend BLAS (1 devices)
register_device: registered device BLAS (Accelerate)
register_backend: registered backend CPU (1 devices)
register_device: registered device CPU (Apple M3)
llama_model_load_from_file_impl: using device Metal (Apple M3) (unknown id) - 16383 MiB free
llama_model_loader: loaded meta data with 43 key-value pairs and 236 tensors from /Users/leo/Downloads/Models/google_gemma-3-270m-it-Q8_0.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv 0: general.architecture str = gemma3
llama_model_loader: - kv 1: general.type str = model
llama_model_loader: - kv 2: general.name str = Gemma 3 270m It
llama_model_loader: - kv 3: general.finetune str = it
llama_model_loader: - kv 4: general.basename str = gemma-3
llama_model_loader: - kv 5: general.size_label str = 270M
llama_model_loader: - kv 6: general.license str = gemma
llama_model_loader: - kv 7: general.base_model.count u32 = 1
llama_model_loader: - kv 8: general.base_model.0.name str = Gemma 3 270m
llama_model_loader: - kv 9: general.base_model.0.organization str = Google
llama_model_loader: - kv 10: general.base_model.0.repo_url str = https://huggingface.co/google/gemma-3...
llama_model_loader: - kv 11: general.tags arr[str,4] = ["gemma3", "gemma", "google", "text-g...
llama_model_loader: - kv 12: gemma3.context_length u32 = 32768
llama_model_loader: - kv 13: gemma3.embedding_length u32 = 640
llama_model_loader: - kv 14: gemma3.block_count u32 = 18
llama_model_loader: - kv 15: gemma3.feed_forward_length u32 = 2048
llama_model_loader: - kv 16: gemma3.attention.head_count u32 = 4
llama_model_loader: - kv 17: gemma3.attention.layer_norm_rms_epsilon f32 = 0.000001
llama_model_loader: - kv 18: gemma3.attention.key_length u32 = 256
llama_model_loader: - kv 19: gemma3.attention.value_length u32 = 256
llama_model_loader: - kv 20: gemma3.rope.freq_base f32 = 1000000.000000
llama_model_loader: - kv 21: gemma3.attention.sliding_window u32 = 512
llama_model_loader: - kv 22: gemma3.attention.head_count_kv u32 = 1
llama_model_loader: - kv 23: tokenizer.ggml.model str = llama
llama_model_loader: - kv 24: tokenizer.ggml.pre str = default
llama_model_loader: - kv 25: tokenizer.ggml.tokens arr[str,262144] = ["<pad>", "<eos>", "<bos>", "<unk>", ...
llama_model_loader: - kv 26: tokenizer.ggml.scores arr[f32,262144] = [-1000.000000, -1000.000000, -1000.00...
llama_model_loader: - kv 27: tokenizer.ggml.token_type arr[i32,262144] = [3, 3, 3, 3, 3, 4, 3, 3, 3, 3, 3, 3, ...
llama_model_loader: - kv 28: tokenizer.ggml.bos_token_id u32 = 2
llama_model_loader: - kv 29: tokenizer.ggml.eos_token_id u32 = 1
llama_model_loader: - kv 30: tokenizer.ggml.unknown_token_id u32 = 3
llama_model_loader: - kv 31: tokenizer.ggml.padding_token_id u32 = 0
llama_model_loader: - kv 32: tokenizer.ggml.add_bos_token bool = true
llama_model_loader: - kv 33: tokenizer.ggml.add_sep_token bool = false
llama_model_loader: - kv 34: tokenizer.ggml.add_eos_token bool = false
llama_model_loader: - kv 35: tokenizer.chat_template str = {{ bos_token }}\n{%- if messages[0]['r...
llama_model_loader: - kv 36: tokenizer.ggml.add_space_prefix bool = false
llama_model_loader: - kv 37: general.quantization_version u32 = 2
llama_model_loader: - kv 38: general.file_type u32 = 7
llama_model_loader: - kv 39: quantize.imatrix.file str = /models_out/gemma-3-270m-it-GGUF/goog...
llama_model_loader: - kv 40: quantize.imatrix.dataset str = /training_dir/calibration_datav5.txt
llama_model_loader: - kv 41: quantize.imatrix.entries_count u32 = 126
llama_model_loader: - kv 42: quantize.imatrix.chunks_count u32 = 821
llama_model_loader: - type f32: 109 tensors
llama_model_loader: - type q8_0: 127 tensors
print_info: file format = GGUF V3 (latest)
print_info: file type = Q8_0
print_info: file size = 271.81 MiB (8.50 BPW)