12 changes: 12 additions & 0 deletions tests/CMakeLists.txt
@@ -187,6 +187,18 @@ llama_build_and_test(test-regex-partial.cpp)

llama_build_and_test(test-thread-safety.cpp ARGS -hf ggml-org/models -hff tinyllamas/stories15M-q4_0.gguf -ngl 99 -p "The meaning of life is" -n 128 -c 256 -ub 32 -np 4 -t 2)

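# concurrent stress tests; LABEL tags them with the "concurrent" ctest label so they can be run on their own (e.g. ctest -L concurrent)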
llama_build_and_test(test-concurrent-stress.cpp
NAME test-concurrent-stress
ARGS -hf ggml-org/models -hff tinyllamas/stories15M-q4_0.gguf
-ngl 99 -p "Test prompt" -n 64 -c 256 -np 4 -t 4
LABEL concurrent)

llama_build_and_test(test-kv-cache-concurrent.cpp
NAME test-kv-cache-concurrent
ARGS -hf ggml-org/models -hff tinyllamas/stories15M-q4_0.gguf
-ngl 99 -p "Test prompt" -n 64 -c 256 -np 8 -t 4
LABEL concurrent)

# this fails on windows (github hosted runner) due to curl DLL not found (exit code 0xc0000135)
if (NOT WIN32)
llama_build_and_test(test-arg-parser.cpp)
300 changes: 300 additions & 0 deletions tests/test-concurrent-stress.cpp
@@ -0,0 +1,300 @@
#include "arg.h"
#include "common.h"
#include "log.h"
#include "llama.h"

#include <atomic>
#include <chrono>
#include <random>
#include <thread>
#include <vector>


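// per-test counters shared across all worker threads; atomics avoid the need for a mutex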
struct test_result {
std::atomic<int> contexts_created{0};
std::atomic<int> contexts_destroyed{0};
std::atomic<int> batches_processed{0};
std::atomic<int> errors{0};
std::atomic<bool> corruption_detected{false};
};

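// test 1 worker: repeatedly create a context from the shared model, hold it briefly, then free it,
// so that per-context allocation and teardown race against the other threads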
static void test_rapid_context_cycles(
llama_model * model,
llama_context_params base_params,
test_result & result,
int thread_id,
int iterations
) {
const int64_t t_start = ggml_time_us();

std::random_device rd;
std::mt19937 gen(rd() + thread_id);
std::uniform_int_distribution<> delay_dist(1, 10);

for (int i = 0; i < iterations; i++) {
llama_context * ctx = llama_init_from_model(model, base_params);

if (!ctx) {
LOG_ERR("thread %d: failed to create context on iteration %d\n", thread_id, i);
result.errors++;
continue;
}

result.contexts_created++;

std::this_thread::sleep_for(std::chrono::milliseconds(delay_dist(gen)));

llama_free(ctx);
result.contexts_destroyed++;
}

const int64_t t_end = ggml_time_us();
LOG_INF("thread %d: completed %d context cycles in %.2f ms\n",
thread_id, iterations, (t_end - t_start) / 1000.0);
}


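// test 3 worker: like test 1, but each iteration uses different n_ctx/n_batch values so the
// backend has to allocate and free buffers of varying sizes under contention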
static void test_backend_resource_stress(
llama_model * model,
llama_context_params base_params,
test_result & result,
int thread_id,
int iterations
) {
std::random_device rd;
std::mt19937 gen(rd() + thread_id);
std::uniform_int_distribution<> delay_dist(1, 8);

for (int i = 0; i < iterations; i++) {
llama_context_params ctx_params = base_params;

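        // vary context and batch sizes per iteration: n_ctx cycles over 128/192/256/320, n_batch over 32/48/64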
ctx_params.n_ctx = 128 + (i % 4) * 64;
ctx_params.n_batch = 32 + (i % 3) * 16;

llama_context * ctx = llama_init_from_model(model, ctx_params);
if (!ctx) {
LOG_ERR("thread %d: failed to create context with varying params on iteration %d\n", thread_id, i);
result.errors++;
continue;
}

result.contexts_created++;

std::this_thread::sleep_for(std::chrono::milliseconds(delay_dist(gen)));

llama_free(ctx);
result.contexts_destroyed++;
}
}

int main(int argc, char ** argv) {
common_params params;

if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) {
return 1;
}

common_init();

llama_backend_init();
llama_numa_init(params.numa);

auto mparams = common_model_params_to_llama(params);
llama_model * model = llama_model_load_from_file(params.model.path.c_str(), mparams);
if (!model) {
LOG_ERR("failed to load model\n");
return 1;
}

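    // keep one long-lived context alive for the entire run, alongside the short-lived contexts created by the tests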
auto cparams = common_context_params_to_llama(params);
llama_context * ctx = llama_init_from_model(model, cparams);
if (!ctx) {
LOG_ERR("failed to create context\n");
llama_model_free(model);
return 1;
}

const int n_threads = params.cpuparams.n_threads;
const int iterations_per_thread = 20;

LOG_INF("Starting concurrent stress tests with %d threads, %d iterations per thread\n",
n_threads, iterations_per_thread);

LOG_INF("\n=== Test 1: Rapid Context Creation/Destruction Cycles ===\n");
{
test_result result;
std::vector<std::thread> threads;

const int64_t t_start = ggml_time_us();

for (int i = 0; i < n_threads; i++) {
threads.emplace_back(test_rapid_context_cycles, model, cparams,
std::ref(result), i, iterations_per_thread);
}

for (auto & t : threads) {
t.join();
}

const int64_t t_end = ggml_time_us();

LOG_INF("Test 1 Results:\n");
LOG_INF(" Contexts created: %d\n", result.contexts_created.load());
LOG_INF(" Contexts destroyed: %d\n", result.contexts_destroyed.load());
LOG_INF(" Errors: %d\n", result.errors.load());
LOG_INF(" Total time: %.2f ms\n", (t_end - t_start) / 1000.0);
LOG_INF(" Avg time per context: %.2f ms\n",
(t_end - t_start) / 1000.0 / result.contexts_created.load());

if (result.contexts_created != result.contexts_destroyed) {
LOG_ERR("FAIL: Context leak detected! Created: %d, Destroyed: %d\n",
result.contexts_created.load(), result.contexts_destroyed.load());
llama_free(ctx);
llama_model_free(model);
return 1;
}

if (result.errors > 0) {
LOG_ERR("FAIL: %d errors occurred during context cycles\n", result.errors.load());
llama_free(ctx);
llama_model_free(model);
return 1;
}

LOG_INF("PASS: No context leaks or errors detected\n");
}

LOG_INF("\n=== Test 2: Parallel Context Operations ===\n");
{
test_result result;
std::vector<std::thread> threads;

const int64_t t_start = ggml_time_us();

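        // test 2 worker: each thread creates its own context, tokenizes and decodes a short prompt in it, then frees it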
auto parallel_context_ops = [&](int thread_id) {
std::random_device rd;
std::mt19937 gen(rd() + thread_id);
std::uniform_int_distribution<> delay_dist(1, 5);

for (int i = 0; i < iterations_per_thread / 4; i++) {
llama_context * thread_ctx = llama_init_from_model(model, cparams);
if (!thread_ctx) {
LOG_ERR("thread %d: failed to create context on iteration %d\n", thread_id, i);
result.errors++;
continue;
}

result.contexts_created++;

std::vector<llama_token> tokens = common_tokenize(thread_ctx, "Test prompt", true, true);
if (!tokens.empty()) {
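                    // build a single-sequence batch at positions 0..n-1; no logits are requested since the test only exercises decode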
llama_batch batch = llama_batch_init(tokens.size(), 0, 1);
for (size_t j = 0; j < tokens.size(); j++) {
common_batch_add(batch, tokens[j], j, {0}, false);
}

if (llama_decode(thread_ctx, batch) == 0) {
result.batches_processed++;
}

llama_batch_free(batch);
}

std::this_thread::sleep_for(std::chrono::milliseconds(delay_dist(gen)));

llama_free(thread_ctx);
result.contexts_destroyed++;
}
};

for (int i = 0; i < n_threads; i++) {
threads.emplace_back(parallel_context_ops, i);
}

for (auto & t : threads) {
t.join();
}

const int64_t t_end = ggml_time_us();

LOG_INF("Test 2 Results:\n");
LOG_INF(" Contexts created: %d\n", result.contexts_created.load());
LOG_INF(" Contexts destroyed: %d\n", result.contexts_destroyed.load());
LOG_INF(" Batches processed: %d\n", result.batches_processed.load());
LOG_INF(" Errors: %d\n", result.errors.load());
LOG_INF(" Total time: %.2f ms\n", (t_end - t_start) / 1000.0);

if (result.contexts_created != result.contexts_destroyed) {
LOG_ERR("FAIL: Context leak detected! Created: %d, Destroyed: %d\n",
result.contexts_created.load(), result.contexts_destroyed.load());
llama_free(ctx);
llama_model_free(model);
return 1;
}

if (result.errors > 0) {
LOG_ERR("FAIL: %d errors occurred during parallel operations\n", result.errors.load());
llama_free(ctx);
llama_model_free(model);
return 1;
}

LOG_INF("PASS: All parallel context operations completed successfully\n");
}

LOG_INF("\n=== Test 3: Backend Resource Allocation Stress ===\n");
{
test_result result;
std::vector<std::thread> threads;

const int64_t t_start = ggml_time_us();

for (int i = 0; i < n_threads; i++) {
threads.emplace_back(test_backend_resource_stress, model, cparams,
std::ref(result), i, iterations_per_thread / 4);
}

for (auto & t : threads) {
t.join();
}

const int64_t t_end = ggml_time_us();

LOG_INF("Test 3 Results:\n");
LOG_INF(" Contexts created: %d\n", result.contexts_created.load());
LOG_INF(" Contexts destroyed: %d\n", result.contexts_destroyed.load());
LOG_INF(" Errors: %d\n", result.errors.load());
LOG_INF(" Total time: %.2f ms\n", (t_end - t_start) / 1000.0);

if (result.contexts_created != result.contexts_destroyed) {
LOG_ERR("FAIL: Resource leak detected! Created: %d, Destroyed: %d\n",
result.contexts_created.load(), result.contexts_destroyed.load());
llama_free(ctx);
llama_model_free(model);
return 1;
}

if (result.errors > 0) {
LOG_ERR("FAIL: %d errors occurred during resource stress test\n", result.errors.load());
llama_free(ctx);
llama_model_free(model);
return 1;
}

LOG_INF("PASS: No resource leaks detected\n");
}

    llama_free(ctx);
    llama_model_free(model);

    llama_backend_free();

LOG_INF("\n=== All Concurrent Stress Tests PASSED ===\n");
return 0;
}