Eval bug: Potential memory leak in llama_free(ctx) #15954

@leok7v

Description

Name and Version

build/bin/llama-cli --version 
register_backend: registered backend Metal (1 devices)
register_device: registered device Metal (Apple M3)
register_backend: registered backend BLAS (1 devices)
register_device: registered device BLAS (Accelerate)
register_backend: registered backend CPU (1 devices)
register_device: registered device CPU (Apple M3)
version: 6457 (4bf55492)
built with Apple clang version 17.0.0 (clang-1700.0.13.5) for arm64-apple-darwin24.6.0

Operating systems

Mac

GGML backends

Metal

Hardware

MacBook Air (Apple M3, 24 GB RAM)
macOS 15.6.1 (24G90)

Models

google_gemma-3-270m-it-Q8_0.gguf
https://huggingface.co/bartowski/google_gemma-3-270m-it-GGUF/resolve/main/google_gemma-3-270m-it-Q8_0.gguf

llama.log

Problem description & steps to reproduce
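Creating and freeing a llama_context in a tight loop, with no decoding at all, steadily increases the process's resident memory. The standalone repro below (test.c) loads the model once, then repeatedly calls llama_init_from_model() followed immediately by llama_free().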

test.c

#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <pwd.h>
#include <mach/mach.h>
#include "llama.h"

static const char *model_rel =
"/Downloads/Models/google_gemma-3-270m-it-Q8_0.gguf";
static const char *model_url =
"https://huggingface.co/bartowski/google_gemma-3-270m-it-GGUF/"
"resolve/main/google_gemma-3-270m-it-Q8_0.gguf";

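// forward ggml/llama log output to stderr; level and user pointer are unused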
static void llm_log(enum ggml_log_level level, const char *text, void *that) {
    fprintf(stderr, "%s", text);
}

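// current resident set size in MiB, queried via Mach task_info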
static size_t memory_used_mb(void) {
    struct mach_task_basic_info i;
    mach_msg_type_number_t c = MACH_TASK_BASIC_INFO_COUNT;
    kern_return_t k = task_info(mach_task_self(), MACH_TASK_BASIC_INFO,
                                (task_info_t)&i, &c);
    size_t r = 0;
    if (k == KERN_SUCCESS) { r = (size_t)(i.resident_size / (1024 * 1024)); }
    return r;
}

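// $HOME from the environment, with a passwd-database fallback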
static const char *home_dir(void) {
    const char *h = getenv("HOME");
    if (!h || !*h) {
        struct passwd *pw = getpwuid(getuid());
        h = (pw && pw->pw_dir) ? pw->pw_dir : "/";
    }
    return h;
}

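// download the model with curl if it is not already on disk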
static int ensure_model(const char *path) {
    int ok = 0;
    if (access(path, R_OK) == 0) { ok = 1; }
    if (!ok) {
        char cmd[2048] = {0};
        int n = snprintf(cmd, sizeof(cmd),
                         "mkdir -p \"$(dirname '%s')\" && "
                         "curl -L --fail --retry 3 -o \"%s\" \"%s\"",
                         path, path, model_url);
        if (n > 0 && n < (int)sizeof(cmd)) { ok = (system(cmd) == 0); }
        if (ok) { ok = (access(path, R_OK) == 0); }
    }
    return ok;
}

int main(int argc, const char *argv[]) {
    const char *home = home_dir();
    char path[1024] = {0};
    int n = snprintf(path, sizeof(path), "%s%s", home, model_rel);
    assert(n > 0 && n < (int)sizeof(path));
    printf("model: %s\n", path);
    int ok = ensure_model(path);
    assert(ok);
    int loops = 10000;
    llama_backend_init();
    ggml_backend_load_all();
    ggml_log_set(llm_log, 0);
    struct llama_model_params mp = llama_model_default_params();
    struct llama_model *m = llama_model_load_from_file(path, mp);
    assert(m);
    struct llama_context_params cp = llama_context_default_params();
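    // create and free a context repeatedly; if llama_free() released
    // everything, resident memory should stay flat across iterations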
    for (int i = 0; i < loops; i++) {
        struct llama_context *c = llama_init_from_model(m, cp);
        assert(c);
        llama_free(c);
        if ((i % 1000) == 0) {
            printf("[%04d] memory used: %zuMB\n", i, memory_used_mb());
        }
    }
    llama_model_free(m);
    llama_backend_free();
    return 0;
}
llama.cpp $ cmake -B build -DCMAKE_BUILD_TYPE=Debug -DBUILD_SHARED_LIBS=OFF
...
llama.cpp $ cmake --build build --config Debug
...
llama.cpp $ clang -Iinclude -Iggml/include -c test.c -o test.o
llama.cpp $ clang++ -o test test.o \
  build/src/libllama.a \
  build/common/libcommon.a \
  build/ggml/src/libggml.a \
  build/ggml/src/libggml-base.a \
  build/ggml/src/libggml-cpu.a \
  build/ggml/src/ggml-blas/libggml-blas.a \
  build/ggml/src/ggml-metal/libggml-metal.a \
  build/tools/mtmd/libmtmd.a \
  -pthread \
  -framework Accelerate \
  -framework Foundation \
  -framework Metal \
  -framework MetalKit
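(The compile and link commands above are presumably what the c.sh script in the next step runs.)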
llama.cpp $ ./c.sh && ./test 2>/dev/null
model: /Users/leo/Downloads/Models/google_gemma-3-270m-it-Q8_0.gguf
[0000] memory used: 371MB
[1000] memory used: 412MB
[2000] memory used: 449MB
[3000] memory used: 485MB
[4000] memory used: 520MB
[5000] memory used: 555MB
[6000] memory used: 591MB
[7000] memory used: 626MB
[8000] memory used: 662MB
[9000] memory used: 697MB
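
Resident memory grows roughly linearly: from 371 MB at iteration 0 to 697 MB at iteration 9000, i.e. about (697 - 371) MB / 9000 ≈ 37 KB leaked per llama_init_from_model()/llama_free() pair.

One control experiment that might help narrow this down (a sketch only, not verified; the helper name measure() is mine, and it reuses memory_used_mb() from test.c): run the same create/free loop at different n_ctx values and compare the slopes. If the per-iteration growth scales with n_ctx, incomplete teardown of the KV cache or compute buffers is a likely suspect; if it stays constant, it points at fixed per-context allocations (e.g. backend objects).

static void measure(struct llama_model *m, uint32_t n_ctx, int loops) {
    struct llama_context_params cp = llama_context_default_params();
    cp.n_ctx = n_ctx; // per-context buffers scale with the context size
    size_t before = memory_used_mb();
    for (int i = 0; i < loops; i++) {
        struct llama_context *c = llama_init_from_model(m, cp);
        assert(c);
        llama_free(c);
    }
    size_t after = memory_used_mb();
    printf("n_ctx=%u: %+ld MB over %d create/free cycles\n",
           n_ctx, (long)after - (long)before, loops);
}

// e.g. in main(), after the model is loaded:
//   measure(m, 512, 1000);
//   measure(m, 4096, 1000);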

First Bad Commit

N/A

Relevant log output

register_backend: registered backend Metal (1 devices)
register_device: registered device Metal (Apple M3)
register_backend: registered backend BLAS (1 devices)
register_device: registered device BLAS (Accelerate)
register_backend: registered backend CPU (1 devices)
register_device: registered device CPU (Apple M3)
llama_model_load_from_file_impl: using device Metal (Apple M3) (unknown id) - 16383 MiB free
llama_model_loader: loaded meta data with 43 key-value pairs and 236 tensors from /Users/leo/Downloads/Models/google_gemma-3-270m-it-Q8_0.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = gemma3
llama_model_loader: - kv   1:                               general.type str              = model
llama_model_loader: - kv   2:                               general.name str              = Gemma 3 270m It
llama_model_loader: - kv   3:                           general.finetune str              = it
llama_model_loader: - kv   4:                           general.basename str              = gemma-3
llama_model_loader: - kv   5:                         general.size_label str              = 270M
llama_model_loader: - kv   6:                            general.license str              = gemma
llama_model_loader: - kv   7:                   general.base_model.count u32              = 1
llama_model_loader: - kv   8:                  general.base_model.0.name str              = Gemma 3 270m
llama_model_loader: - kv   9:          general.base_model.0.organization str              = Google
llama_model_loader: - kv  10:              general.base_model.0.repo_url str              = https://huggingface.co/google/gemma-3...
llama_model_loader: - kv  11:                               general.tags arr[str,4]       = ["gemma3", "gemma", "google", "text-g...
llama_model_loader: - kv  12:                      gemma3.context_length u32              = 32768
llama_model_loader: - kv  13:                    gemma3.embedding_length u32              = 640
llama_model_loader: - kv  14:                         gemma3.block_count u32              = 18
llama_model_loader: - kv  15:                 gemma3.feed_forward_length u32              = 2048
llama_model_loader: - kv  16:                gemma3.attention.head_count u32              = 4
llama_model_loader: - kv  17:    gemma3.attention.layer_norm_rms_epsilon f32              = 0.000001
llama_model_loader: - kv  18:                gemma3.attention.key_length u32              = 256
llama_model_loader: - kv  19:              gemma3.attention.value_length u32              = 256
llama_model_loader: - kv  20:                      gemma3.rope.freq_base f32              = 1000000.000000
llama_model_loader: - kv  21:            gemma3.attention.sliding_window u32              = 512
llama_model_loader: - kv  22:             gemma3.attention.head_count_kv u32              = 1
llama_model_loader: - kv  23:                       tokenizer.ggml.model str              = llama
llama_model_loader: - kv  24:                         tokenizer.ggml.pre str              = default
llama_model_loader: - kv  25:                      tokenizer.ggml.tokens arr[str,262144]  = ["<pad>", "<eos>", "<bos>", "<unk>", ...
llama_model_loader: - kv  26:                      tokenizer.ggml.scores arr[f32,262144]  = [-1000.000000, -1000.000000, -1000.00...
llama_model_loader: - kv  27:                  tokenizer.ggml.token_type arr[i32,262144]  = [3, 3, 3, 3, 3, 4, 3, 3, 3, 3, 3, 3, ...
llama_model_loader: - kv  28:                tokenizer.ggml.bos_token_id u32              = 2
llama_model_loader: - kv  29:                tokenizer.ggml.eos_token_id u32              = 1
llama_model_loader: - kv  30:            tokenizer.ggml.unknown_token_id u32              = 3
llama_model_loader: - kv  31:            tokenizer.ggml.padding_token_id u32              = 0
llama_model_loader: - kv  32:               tokenizer.ggml.add_bos_token bool             = true
llama_model_loader: - kv  33:               tokenizer.ggml.add_sep_token bool             = false
llama_model_loader: - kv  34:               tokenizer.ggml.add_eos_token bool             = false
llama_model_loader: - kv  35:                    tokenizer.chat_template str              = {{ bos_token }}\n{%- if messages[0]['r...
llama_model_loader: - kv  36:            tokenizer.ggml.add_space_prefix bool             = false
llama_model_loader: - kv  37:               general.quantization_version u32              = 2
llama_model_loader: - kv  38:                          general.file_type u32              = 7
llama_model_loader: - kv  39:                      quantize.imatrix.file str              = /models_out/gemma-3-270m-it-GGUF/goog...
llama_model_loader: - kv  40:                   quantize.imatrix.dataset str              = /training_dir/calibration_datav5.txt
llama_model_loader: - kv  41:             quantize.imatrix.entries_count u32              = 126
llama_model_loader: - kv  42:              quantize.imatrix.chunks_count u32              = 821
llama_model_loader: - type  f32:  109 tensors
llama_model_loader: - type q8_0:  127 tensors
print_info: file format = GGUF V3 (latest)
print_info: file type   = Q8_0
print_info: file size   = 271.81 MiB (8.50 BPW)
