diff --git a/README.md b/README.md
index 17f59e988e3d1..d2fbc760c6a40 100644
--- a/README.md
+++ b/README.md
@@ -542,6 +542,81 @@ To learn more about model quantization, [read this documentation](tools/quantize
 - [Performance troubleshooting](docs/development/token_generation_performance_tips.md)
 - [GGML tips & tricks](https://github.com/ggml-org/llama.cpp/wiki/GGML-Tips-&-Tricks)

+#### Testing
+
+##### Memory Leak Testing
+
+The repository includes memory leak regression tests that exercise the library's main lifecycle scenarios (backend, model, context, sampler, and batch). They complement the existing AddressSanitizer (ASan) integration with a dedicated leak-detection test suite.
+
+**Running with AddressSanitizer:**
+
+The primary memory leak detection mechanism is AddressSanitizer, enabled via a build option:
+
+```bash
+# Build with AddressSanitizer enabled
+cmake -B build -DLLAMA_SANITIZE_ADDRESS=ON -DCMAKE_BUILD_TYPE=Debug
+cmake --build build
+
+# Run the memory leak regression tests
+cd build
+ctest -R test-memory-leaks --output-on-failure
+
+# Or run directly
+./bin/test-memory-leaks
+```
+
+Other available sanitizers:
+- `LLAMA_SANITIZE_THREAD=ON` - Detects data races (note: these builds run without OpenMP)
+- `LLAMA_SANITIZE_UNDEFINED=ON` - Detects undefined behavior
+
+**Running with Valgrind:**
+
+Optional Valgrind integration is available for additional leak checking:
+
+```bash
+# Build the tests (the Valgrind target is configured automatically if valgrind is installed)
+cmake -B build
+cmake --build build
+
+# Run memory leak tests with Valgrind
+cd build
+make test-valgrind
+```
+
+The Valgrind target runs with comprehensive leak-detection flags:
+- `--leak-check=full` - Detailed leak information
+- `--show-leak-kinds=all` - Reports all leak types
+- `--track-origins=yes` - Tracks the origin of uninitialized values
+
+**Test Coverage:**
+
+The `test-memory-leaks.cpp` suite includes 10 tests covering:
+
+1. **Backend initialization cycles** - Repeated `llama_backend_init()` / `llama_backend_free()` cycles
+2. **Model load/unload cycles** - Repeated model loading and cleanup (10 iterations)
+3. **Context lifecycle** - Context creation and destruction patterns (10 iterations)
+4. **Multiple contexts per model** - Creating multiple contexts from the same model (5 contexts)
+5. **Sampler lifecycle** - Sampler creation, chain operations, and cleanup
+6. **Batch operations** - Batch allocation and deallocation patterns
+7. **KV cache clearing** - Memory clearing operations on contexts
+8. **Threaded contexts** - Concurrent model usage from multiple threads
+9. **Model load cancellation** - Cleanup when canceling model loading mid-process
+10. **Error condition cleanup** - Proper cleanup when operations fail (e.g., invalid model path)
+
+All tests follow the proper cleanup order: sampler → context → model → backend.
+
+**Environment Variables:**
+
+- `LLAMACPP_TEST_MODELFILE` - Path to the test model file (required for running the tests)
+
+**Continuous Integration:**
+
+The GitHub Actions CI automatically runs all tests with all three sanitizers (ADDRESS, THREAD, UNDEFINED) on every pull request to catch memory issues before they are merged.
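+
+For example, to reproduce the leak checks locally, point the tests at a GGUF model via `LLAMACPP_TEST_MODELFILE` and invoke `ctest` as above (the model path below is only a placeholder):
+
+```bash
+export LLAMACPP_TEST_MODELFILE=/path/to/model.gguf
+cd build
+ctest -R test-memory-leaks --output-on-failure
+```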
+
+**Known Issues:**
+
+- `test-opt.cpp` is currently disabled with `LLAMA_SANITIZE_ADDRESS` due to a known memory leak in `ggml_opt_alloc()` called within a loop (see `tests/test-opt.cpp:300`)
+
 #### Seminal papers and background on the models

 If your issue is with model generation quality, then please at least scan the following links and papers to understand the limitations of LLaMA models. This is especially important when choosing an appropriate model size and appreciating both the significant and subtle differences between LLaMA models and ChatGPT:
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 91719577564a9..ca754d18cd7d8 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -201,6 +201,7 @@ llama_build_and_test(test-backend-ops.cpp)

 llama_build_and_test(test-model-load-cancel.cpp LABEL "model")
 llama_build_and_test(test-autorelease.cpp LABEL "model")
+llama_build_and_test(test-memory-leaks.cpp LABEL "model")

 if (NOT GGML_BACKEND_DL)
     # these tests use the backends directly and cannot be built with dynamic loading
@@ -219,3 +220,23 @@ target_link_libraries(${LLAMA_TEST_NAME} PRIVATE mtmd)
 get_filename_component(TEST_TARGET test-c.c NAME_WE)
 add_executable(${TEST_TARGET} test-c.c)
 target_link_libraries(${TEST_TARGET} PRIVATE llama)
+
+# Optional Valgrind target for memory leak checking
+find_program(VALGRIND_EXECUTABLE valgrind)
+if(VALGRIND_EXECUTABLE)
+    add_custom_target(test-valgrind
+        COMMAND ${VALGRIND_EXECUTABLE}
+            --leak-check=full
+            --show-leak-kinds=all
+            --track-origins=yes
+            --error-exitcode=1
+            ${CMAKE_CURRENT_BINARY_DIR}/test-memory-leaks
+        DEPENDS test-memory-leaks
+        COMMENT "Running memory leak tests with Valgrind"
+        WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
+    )
+    message(STATUS "Valgrind found: ${VALGRIND_EXECUTABLE}")
+    message(STATUS "Run 'make test-valgrind' to check for memory leaks with Valgrind")
+else()
+    message(STATUS "Valgrind not found - install it for additional leak checking")
+endif()
diff --git a/tests/test-memory-leaks.cpp b/tests/test-memory-leaks.cpp
new file mode 100644
index 0000000000000..247dfebfcee51
--- /dev/null
+++ b/tests/test-memory-leaks.cpp
@@ -0,0 +1,353 @@
+// Memory leak regression tests for the public llama.cpp API
+// (backend, model, context, sampler, batch and KV cache lifecycles)
+
+#include "llama.h"
+#include "get-model.h"
+#include <atomic>
+#include <cstdio>
+#include <cstdlib>
+#include <thread>
+#include <vector>
+
+static void test_model_load_unload_cycles(const char * model_path) {
+    fprintf(stderr, "test_model_load_unload_cycles: ");
+
+    for (int i = 0; i < 10; i++) {
+        llama_backend_init();
+
+        auto params = llama_model_default_params();
+        auto * model = llama_model_load_from_file(model_path, params);
+        if (model == nullptr) {
+            fprintf(stderr, "FAILED (model load failed on iteration %d)\n", i);
+            return;
+        }
+
+        llama_model_free(model);
+        llama_backend_free();
+    }
+
+    fprintf(stderr, "OK\n");
+}
+
+static void test_context_lifecycle(const char * model_path) {
+    fprintf(stderr, "test_context_lifecycle: ");
+
+    llama_backend_init();
+
+    auto model_params = llama_model_default_params();
+    auto * model = llama_model_load_from_file(model_path, model_params);
+    if (model == nullptr) {
+        fprintf(stderr, "FAILED (model load failed)\n");
+        llama_backend_free();
+        return;
+    }
+
+    for (int i = 0; i < 10; i++) {
+        auto ctx_params = llama_context_default_params();
+        ctx_params.n_ctx = 512;
+
+        auto * ctx = llama_init_from_model(model, ctx_params);
+        if (ctx == nullptr) {
+            fprintf(stderr, "FAILED (context creation failed on iteration %d)\n", i);
+            llama_model_free(model);
+            llama_backend_free();
+            return;
+        }
+
+        llama_free(ctx);
+    }
+
+    llama_model_free(model);
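+    // note: the backend is freed only after the model and all of its contexts have been released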
+    llama_backend_free();
+
+    fprintf(stderr, "OK\n");
+}
+
+static void test_multiple_contexts_same_model(const char * model_path) {
+    fprintf(stderr, "test_multiple_contexts_same_model: ");
+
+    llama_backend_init();
+
+    auto model_params = llama_model_default_params();
+    auto * model = llama_model_load_from_file(model_path, model_params);
+    if (model == nullptr) {
+        fprintf(stderr, "FAILED (model load failed)\n");
+        llama_backend_free();
+        return;
+    }
+
+    const int num_contexts = 5;
+    std::vector<llama_context *> contexts(num_contexts);
+
+    auto ctx_params = llama_context_default_params();
+    ctx_params.n_ctx = 512;
+
+    for (int i = 0; i < num_contexts; i++) {
+        contexts[i] = llama_init_from_model(model, ctx_params);
+        if (contexts[i] == nullptr) {
+            fprintf(stderr, "FAILED (context %d creation failed)\n", i);
+            for (int j = 0; j < i; j++) {
+                llama_free(contexts[j]);
+            }
+            llama_model_free(model);
+            llama_backend_free();
+            return;
+        }
+    }
+
+    for (auto * ctx : contexts) {
+        llama_free(ctx);
+    }
+
+    llama_model_free(model);
+    llama_backend_free();
+
+    fprintf(stderr, "OK\n");
+}
+
+static void test_sampler_lifecycle(const char * model_path) {
+    fprintf(stderr, "test_sampler_lifecycle: ");
+
+    llama_backend_init();
+
+    auto model_params = llama_model_default_params();
+    auto * model = llama_model_load_from_file(model_path, model_params);
+    if (model == nullptr) {
+        fprintf(stderr, "FAILED (model load failed)\n");
+        llama_backend_free();
+        return;
+    }
+
+    auto ctx_params = llama_context_default_params();
+    ctx_params.n_ctx = 512;
+    auto * ctx = llama_init_from_model(model, ctx_params);
+    if (ctx == nullptr) {
+        fprintf(stderr, "FAILED (context creation failed)\n");
+        llama_model_free(model);
+        llama_backend_free();
+        return;
+    }
+
+    for (int i = 0; i < 10; i++) {
+        auto sparams = llama_sampler_chain_default_params();
+        auto * smpl = llama_sampler_chain_init(sparams);
+        if (smpl == nullptr) {
+            fprintf(stderr, "FAILED (sampler creation failed on iteration %d)\n", i);
+            llama_free(ctx);
+            llama_model_free(model);
+            llama_backend_free();
+            return;
+        }
+
+        llama_sampler_chain_add(smpl, llama_sampler_init_greedy());
+        llama_sampler_free(smpl);
+    }
+
+    llama_free(ctx);
+    llama_model_free(model);
+    llama_backend_free();
+
+    fprintf(stderr, "OK\n");
+}
+
+static void test_error_condition_cleanup(const char * /* model_path */) {
+    fprintf(stderr, "test_error_condition_cleanup: ");
+
+    llama_backend_init();
+
+    auto params = llama_model_default_params();
+    auto * model = llama_model_load_from_file("/nonexistent/path/to/model.gguf", params);
+    if (model != nullptr) {
+        fprintf(stderr, "FAILED (expected nullptr for nonexistent model)\n");
+        llama_model_free(model);
+        llama_backend_free();
+        return;
+    }
+
+    llama_backend_free();
+
+    fprintf(stderr, "OK\n");
+}
+
+static void test_model_load_cancel(const char * model_path) {
+    fprintf(stderr, "test_model_load_cancel: ");
+
+    llama_backend_init();
+
+    auto params = llama_model_default_params();
+    params.use_mmap = false;
+    params.progress_callback = [](float progress, void * ctx) {
+        (void) ctx;
+        return progress > 0.50f;
+    };
+
+    auto * model = llama_model_load_from_file(model_path, params);
+
+    if (model != nullptr) {
+        llama_model_free(model);
+    }
+
+    llama_backend_free();
+
+    fprintf(stderr, "OK\n");
+}
+
+static void test_batch_operations(const char * model_path) {
+    fprintf(stderr, "test_batch_operations: ");
+
+    llama_backend_init();
+
+    auto model_params = llama_model_default_params();
+    auto * model = llama_model_load_from_file(model_path, model_params);
+    if (model == nullptr) {
+        fprintf(stderr, "FAILED (model load failed)\n");
+        llama_backend_free();
+        return;
+    }
+
+    auto ctx_params = llama_context_default_params();
+    ctx_params.n_ctx = 512;
+    auto * ctx = llama_init_from_model(model, ctx_params);
+    if (ctx == nullptr) {
+        fprintf(stderr, "FAILED (context creation failed)\n");
+        llama_model_free(model);
+        llama_backend_free();
+        return;
+    }
+
+    for (int i = 0; i < 10; i++) {
+        llama_batch batch = llama_batch_init(32, 0, 1);
+
+        llama_batch_free(batch);
+    }
+
+    llama_free(ctx);
+    llama_model_free(model);
+    llama_backend_free();
+
+    fprintf(stderr, "OK\n");
+}
+
+static void test_backend_init_free_cycles() {
+    fprintf(stderr, "test_backend_init_free_cycles: ");
+
+    for (int i = 0; i < 10; i++) {
+        llama_backend_init();
+        llama_backend_free();
+    }
+
+    fprintf(stderr, "OK\n");
+}
+
+static void test_threaded_contexts(const char * model_path) {
+    fprintf(stderr, "test_threaded_contexts: ");
+
+    llama_backend_init();
+
+    auto model_params = llama_model_default_params();
+    auto * model = llama_model_load_from_file(model_path, model_params);
+    if (model == nullptr) {
+        fprintf(stderr, "FAILED (model load failed)\n");
+        llama_backend_free();
+        return;
+    }
+
+    std::atomic<bool> failed = false;
+    std::vector<std::thread> threads;
+    const int num_threads = 3;
+
+    for (int t = 0; t < num_threads; t++) {
+        threads.emplace_back([&, t, model]() {
+            auto ctx_params = llama_context_default_params();
+            ctx_params.n_ctx = 512;
+
+            auto * ctx = llama_init_from_model(model, ctx_params);
+            if (ctx == nullptr) {
+                failed.store(true);
+                return;
+            }
+
+            auto sparams = llama_sampler_chain_default_params();
+            auto * smpl = llama_sampler_chain_init(sparams);
+            if (smpl == nullptr) {
+                llama_free(ctx);
+                failed.store(true);
+                return;
+            }
+
+            llama_sampler_chain_add(smpl, llama_sampler_init_greedy());
+
+            llama_sampler_free(smpl);
+            llama_free(ctx);
+        });
+    }
+
+    for (auto & thread : threads) {
+        thread.join();
+    }
+
+    llama_model_free(model);
+    llama_backend_free();
+
+    if (failed) {
+        fprintf(stderr, "FAILED (thread error)\n");
+    } else {
+        fprintf(stderr, "OK\n");
+    }
+}
+
+static void test_kv_cache_clear_operations(const char * model_path) {
+    fprintf(stderr, "test_kv_cache_clear_operations: ");
+
+    llama_backend_init();
+
+    auto model_params = llama_model_default_params();
+    auto * model = llama_model_load_from_file(model_path, model_params);
+    if (model == nullptr) {
+        fprintf(stderr, "FAILED (model load failed)\n");
+        llama_backend_free();
+        return;
+    }
+
+    auto ctx_params = llama_context_default_params();
+    ctx_params.n_ctx = 512;
+    auto * ctx = llama_init_from_model(model, ctx_params);
+    if (ctx == nullptr) {
+        fprintf(stderr, "FAILED (context creation failed)\n");
+        llama_model_free(model);
+        llama_backend_free();
+        return;
+    }
+
+    for (int i = 0; i < 10; i++) {
+        llama_memory_t mem = llama_get_memory(ctx);
+        llama_memory_clear(mem, false);
+    }
+
+    llama_free(ctx);
+    llama_model_free(model);
+    llama_backend_free();
+
+    fprintf(stderr, "OK\n");
+}
+
+int main(int argc, char ** argv) {
+    auto * model_path = get_model_or_exit(argc, argv);
+
+    fprintf(stderr, "Running memory leak regression tests...\n\n");
+
+    test_backend_init_free_cycles();
+    test_model_load_unload_cycles(model_path);
+    test_context_lifecycle(model_path);
+    test_multiple_contexts_same_model(model_path);
+    test_sampler_lifecycle(model_path);
+    test_batch_operations(model_path);
+    test_kv_cache_clear_operations(model_path);
+    test_threaded_contexts(model_path);
+    test_model_load_cancel(model_path);
+    test_error_condition_cleanup(model_path);
+
+    fprintf(stderr, "\nAll memory leak tests completed successfully!\n");
+
+    return 0;
+}