
Commit dad7aa4

feat: Add comprehensive concurrent testing infrastructure for AT-102
- Create test-concurrent-stress.cpp for sustained concurrent load testing
  * Rapid context creation/destruction cycles
  * Parallel context operations with batch processing
  * Backend resource allocation stress testing
  * All tests verify no context leaks or errors
- Create test-kv-cache-concurrent.cpp for dedicated KV cache race detection
  * Concurrent KV cache prepare operations
  * Concurrent KV cache update operations with varying context sizes
  * Concurrent sequence operations (copy, remove)
  * Mixed concurrent operations combining all patterns
  * Each thread creates its own context (proper threading model)
- Enhance test-thread-safety.cpp with race condition detection
  * Add rapid context recreation stress test
  * Use random timing delays to increase race condition exposure
  * Track context creation/destruction with atomic counters
  * Verify no resource leaks under stress
- Extend test_completion.py with high-volume concurrent server tests
  * test_completion_high_volume_concurrent: 8-50 concurrent requests
  * test_completion_parallel_decoding: multiple parallel decode streams
  * test_completion_cache_consistency_concurrent: cache validation under load
- Update CMakeLists.txt with new test targets
  * Add test-concurrent-stress with appropriate test parameters
  * Add test-kv-cache-concurrent with appropriate test parameters
  * Both use the established llama_build_and_test pattern
  * Tests labeled 'concurrent' for easy filtering

Targets critical concurrent areas:
- KV cache prepare() and update() operations
- Context initialization and management under concurrent access
- Server task queue and slot management (Python tests)
- Backend resource allocation under high concurrency

All tests follow the llama.cpp threading model: each thread manages its own context rather than sharing a context across threads.

Tests validated locally:
- test-concurrent-stress: PASSED (80 contexts created/destroyed, 0 errors)
- test-kv-cache-concurrent: PASSED (all 4 test suites, 0 errors)
- test-thread-safety: PASSED (including new stress test)
- Regression tests: 36/36 existing tests passed

ThreadSanitizer integration already exists in CMakeLists.txt via the LLAMA_SANITIZE_THREAD option for automated race detection.

Related to JIRA ticket AT-102

Co-Authored-By: Alex Peng <[email protected]>
1 parent 661ae31 commit dad7aa4
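
Note (not part of the commit): the message above references the existing LLAMA_SANITIZE_THREAD option for automated race detection. A minimal sketch of how the new binaries could be built under ThreadSanitizer, assuming the usual llama.cpp CMake workflow and a build directory named build-tsan (exact flags and paths may differ):

    # configure a ThreadSanitizer build using the existing LLAMA_SANITIZE_THREAD option
    cmake -B build-tsan -DCMAKE_BUILD_TYPE=RelWithDebInfo -DLLAMA_SANITIZE_THREAD=ON
    # build only the two test targets added by this commit (CMake >= 3.15 accepts multiple targets)
    cmake --build build-tsan --target test-concurrent-stress test-kv-cache-concurrent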

5 files changed: 933 additions & 0 deletions


tests/CMakeLists.txt

Lines changed: 12 additions & 0 deletions
@@ -187,6 +187,18 @@ llama_build_and_test(test-regex-partial.cpp)
 
 llama_build_and_test(test-thread-safety.cpp ARGS -hf ggml-org/models -hff tinyllamas/stories15M-q4_0.gguf -ngl 99 -p "The meaning of life is" -n 128 -c 256 -ub 32 -np 4 -t 2)
 
+llama_build_and_test(test-concurrent-stress.cpp
+    NAME test-concurrent-stress
+    ARGS -hf ggml-org/models -hff tinyllamas/stories15M-q4_0.gguf
+         -ngl 99 -p "Test prompt" -n 64 -c 256 -np 4 -t 4
+    LABEL concurrent)
+
+llama_build_and_test(test-kv-cache-concurrent.cpp
+    NAME test-kv-cache-concurrent
+    ARGS -hf ggml-org/models -hff tinyllamas/stories15M-q4_0.gguf
+         -ngl 99 -p "Test prompt" -n 64 -c 256 -np 8 -t 4
+    LABEL concurrent)
+
 # this fails on windows (github hosted runner) due to curl DLL not found (exit code 0xc0000135)
 if (NOT WIN32)
     llama_build_and_test(test-arg-parser.cpp)
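
Because both new targets are registered with LABEL concurrent, they can be selected by label at test time. A hedged example, assuming CTest drives the suite and CMake >= 3.20 for --test-dir (with older CMake, run ctest from inside the build directory instead):

    # run only the tests labeled 'concurrent'
    ctest --test-dir build-tsan -L concurrent --output-on-failure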

tests/test-concurrent-stress.cpp

Lines changed: 300 additions & 0 deletions
@@ -0,0 +1,300 @@
#include "arg.h"
#include "common.h"
#include "log.h"
#include "llama.h"

#include <algorithm>
#include <array>
#include <atomic>
#include <chrono>
#include <cmath>
#include <cstdio>
#include <cstring>
#include <ctime>
#include <random>
#include <sstream>
#include <string>
#include <thread>
#include <vector>

// counters shared by the worker threads of one test phase
struct test_result {
    std::atomic<int> contexts_created{0};
    std::atomic<int> contexts_destroyed{0};
    std::atomic<int> batches_processed{0};
    std::atomic<int> errors{0};
    std::atomic<bool> corruption_detected{false};
};

// repeatedly create and destroy a per-thread context, sleeping a random 1-10 ms
// between creation and destruction to widen the window for races
static void test_rapid_context_cycles(
    llama_model * model,
    llama_context_params base_params,
    test_result & result,
    int thread_id,
    int iterations
) {
    const int64_t t_start = ggml_time_us();

    std::random_device rd;
    std::mt19937 gen(rd() + thread_id);
    std::uniform_int_distribution<> delay_dist(1, 10);

    for (int i = 0; i < iterations; i++) {
        llama_context * ctx = llama_init_from_model(model, base_params);

        if (!ctx) {
            LOG_ERR("thread %d: failed to create context on iteration %d\n", thread_id, i);
            result.errors++;
            continue;
        }

        result.contexts_created++;

        std::this_thread::sleep_for(std::chrono::milliseconds(delay_dist(gen)));

        llama_free(ctx);
        result.contexts_destroyed++;
    }

    const int64_t t_end = ggml_time_us();
    LOG_INF("thread %d: completed %d context cycles in %.2f ms\n",
            thread_id, iterations, (t_end - t_start) / 1000.0);
}

// same create/destroy cycle, but with varying n_ctx/n_batch so that backend
// buffers are repeatedly allocated and released under concurrency
static void test_backend_resource_stress(
    llama_model * model,
    llama_context_params base_params,
    test_result & result,
    int thread_id,
    int iterations
) {
    std::random_device rd;
    std::mt19937 gen(rd() + thread_id);
    std::uniform_int_distribution<> delay_dist(1, 8);

    for (int i = 0; i < iterations; i++) {
        llama_context_params ctx_params = base_params;

        ctx_params.n_ctx = 128 + (i % 4) * 64;
        ctx_params.n_batch = 32 + (i % 3) * 16;

        llama_context * ctx = llama_init_from_model(model, ctx_params);
        if (!ctx) {
            LOG_ERR("thread %d: failed to create context with varying params on iteration %d\n", thread_id, i);
            result.errors++;
            continue;
        }

        result.contexts_created++;

        std::this_thread::sleep_for(std::chrono::milliseconds(delay_dist(gen)));

        llama_free(ctx);
        result.contexts_destroyed++;
    }
}

int main(int argc, char ** argv) {
    common_params params;

    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) {
        return 1;
    }

    common_init();

    llama_backend_init();
    llama_numa_init(params.numa);

    auto mparams = common_model_params_to_llama(params);
    llama_model * model = llama_model_load_from_file(params.model.path.c_str(), mparams);
    if (!model) {
        LOG_ERR("failed to load model\n");
        return 1;
    }

    auto cparams = common_context_params_to_llama(params);
    llama_context * ctx = llama_init_from_model(model, cparams);
    if (!ctx) {
        LOG_ERR("failed to create context\n");
        llama_model_free(model);
        return 1;
    }

    const int n_threads = params.cpuparams.n_threads;
    const int iterations_per_thread = 20;

    LOG_INF("Starting concurrent stress tests with %d threads, %d iterations per thread\n",
            n_threads, iterations_per_thread);

    LOG_INF("\n=== Test 1: Rapid Context Creation/Destruction Cycles ===\n");
    {
        test_result result;
        std::vector<std::thread> threads;

        const int64_t t_start = ggml_time_us();

        for (int i = 0; i < n_threads; i++) {
            threads.emplace_back(test_rapid_context_cycles, model, cparams,
                                 std::ref(result), i, iterations_per_thread);
        }

        for (auto & t : threads) {
            t.join();
        }

        const int64_t t_end = ggml_time_us();

        LOG_INF("Test 1 Results:\n");
        LOG_INF(" Contexts created: %d\n", result.contexts_created.load());
        LOG_INF(" Contexts destroyed: %d\n", result.contexts_destroyed.load());
        LOG_INF(" Errors: %d\n", result.errors.load());
        LOG_INF(" Total time: %.2f ms\n", (t_end - t_start) / 1000.0);
        LOG_INF(" Avg time per context: %.2f ms\n",
                (t_end - t_start) / 1000.0 / result.contexts_created.load());

        if (result.contexts_created != result.contexts_destroyed) {
            LOG_ERR("FAIL: Context leak detected! Created: %d, Destroyed: %d\n",
                    result.contexts_created.load(), result.contexts_destroyed.load());
            llama_free(ctx);
            llama_model_free(model);
            return 1;
        }

        if (result.errors > 0) {
            LOG_ERR("FAIL: %d errors occurred during context cycles\n", result.errors.load());
            llama_free(ctx);
            llama_model_free(model);
            return 1;
        }

        LOG_INF("PASS: No context leaks or errors detected\n");
    }

    LOG_INF("\n=== Test 2: Parallel Context Operations ===\n");
    {
        test_result result;
        std::vector<std::thread> threads;

        const int64_t t_start = ggml_time_us();

        // each thread creates its own context, tokenizes and decodes a small
        // batch, then frees the context; contexts are never shared across threads
        auto parallel_context_ops = [&](int thread_id) {
            std::random_device rd;
            std::mt19937 gen(rd() + thread_id);
            std::uniform_int_distribution<> delay_dist(1, 5);

            for (int i = 0; i < iterations_per_thread / 4; i++) {
                llama_context * thread_ctx = llama_init_from_model(model, cparams);
                if (!thread_ctx) {
                    LOG_ERR("thread %d: failed to create context on iteration %d\n", thread_id, i);
                    result.errors++;
                    continue;
                }

                result.contexts_created++;

                std::vector<llama_token> tokens = common_tokenize(thread_ctx, "Test prompt", true, true);
                if (!tokens.empty()) {
                    llama_batch batch = llama_batch_init(tokens.size(), 0, 1);
                    for (size_t j = 0; j < tokens.size(); j++) {
                        common_batch_add(batch, tokens[j], j, {0}, false);
                    }

                    if (llama_decode(thread_ctx, batch) == 0) {
                        result.batches_processed++;
                    }

                    llama_batch_free(batch);
                }

                std::this_thread::sleep_for(std::chrono::milliseconds(delay_dist(gen)));

                llama_free(thread_ctx);
                result.contexts_destroyed++;
            }
        };

        for (int i = 0; i < n_threads; i++) {
            threads.emplace_back(parallel_context_ops, i);
        }

        for (auto & t : threads) {
            t.join();
        }

        const int64_t t_end = ggml_time_us();

        LOG_INF("Test 2 Results:\n");
        LOG_INF(" Contexts created: %d\n", result.contexts_created.load());
        LOG_INF(" Contexts destroyed: %d\n", result.contexts_destroyed.load());
        LOG_INF(" Batches processed: %d\n", result.batches_processed.load());
        LOG_INF(" Errors: %d\n", result.errors.load());
        LOG_INF(" Total time: %.2f ms\n", (t_end - t_start) / 1000.0);

        if (result.contexts_created != result.contexts_destroyed) {
            LOG_ERR("FAIL: Context leak detected! Created: %d, Destroyed: %d\n",
                    result.contexts_created.load(), result.contexts_destroyed.load());
            llama_free(ctx);
            llama_model_free(model);
            return 1;
        }

        if (result.errors > 0) {
            LOG_ERR("FAIL: %d errors occurred during parallel operations\n", result.errors.load());
            llama_free(ctx);
            llama_model_free(model);
            return 1;
        }

        LOG_INF("PASS: All parallel context operations completed successfully\n");
    }

    LOG_INF("\n=== Test 3: Backend Resource Allocation Stress ===\n");
    {
        test_result result;
        std::vector<std::thread> threads;

        const int64_t t_start = ggml_time_us();

        for (int i = 0; i < n_threads; i++) {
            threads.emplace_back(test_backend_resource_stress, model, cparams,
                                 std::ref(result), i, iterations_per_thread / 4);
        }

        for (auto & t : threads) {
            t.join();
        }

        const int64_t t_end = ggml_time_us();

        LOG_INF("Test 3 Results:\n");
        LOG_INF(" Contexts created: %d\n", result.contexts_created.load());
        LOG_INF(" Contexts destroyed: %d\n", result.contexts_destroyed.load());
        LOG_INF(" Errors: %d\n", result.errors.load());
        LOG_INF(" Total time: %.2f ms\n", (t_end - t_start) / 1000.0);

        if (result.contexts_created != result.contexts_destroyed) {
            LOG_ERR("FAIL: Resource leak detected! Created: %d, Destroyed: %d\n",
                    result.contexts_created.load(), result.contexts_destroyed.load());
            llama_free(ctx);
            llama_model_free(model);
            return 1;
        }

        if (result.errors > 0) {
            LOG_ERR("FAIL: %d errors occurred during resource stress test\n", result.errors.load());
            llama_free(ctx);
            llama_model_free(model);
            return 1;
        }

        LOG_INF("PASS: No resource leaks detected\n");
    }

    llama_free(ctx);
    llama_model_free(model);

    LOG_INF("\n=== All Concurrent Stress Tests PASSED ===\n");
    return 0;
}
