Skip to content

Commit 44871c8

Browse files
slaren authored and ggerganov committed
llama : add thread safety test (llama/14035)
* llama : add thread safety test

* llamafile : remove global state

* llama : better LLAMA_SPLIT_MODE_NONE logic when main_gpu < 0

  GPU devices are not used

---------

Co-authored-by: Georgi Gerganov <[email protected]>
1 parent ad6cd94 commit 44871c8

File tree

3 files changed

+13
-6
lines changed

3 files changed

+13
-6
lines changed

ggml/src/ggml-cpu/ggml-cpu-impl.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -503,6 +503,9 @@ static __m256 __lasx_xvreplfr2vr_s(const float val) {
503503
// TODO: move to ggml-threading
504504
void ggml_barrier(struct ggml_threadpool * tp);
505505

506+
void ggml_threadpool_chunk_set(struct ggml_threadpool * tp, int value);
507+
int ggml_threadpool_chunk_add(struct ggml_threadpool * tp, int value);
508+
506509
#ifdef __cplusplus
507510
}
508511
#endif

ggml/src/ggml-cpu/ggml-cpu.c

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -559,6 +559,14 @@ void ggml_barrier(struct ggml_threadpool * tp) {
559559
#endif
560560
}
561561

562+
void ggml_threadpool_chunk_set(struct ggml_threadpool * tp, int value) {
563+
atomic_store_explicit(&tp->current_chunk, value, memory_order_relaxed);
564+
}
565+
566+
int ggml_threadpool_chunk_add(struct ggml_threadpool * tp, int value) {
567+
return atomic_fetch_add_explicit(&tp->current_chunk, value, memory_order_relaxed);
568+
}
569+
562570
#if defined(__gnu_linux__)
563571
static cpu_set_t ggml_get_numa_affinity(void) {
564572
cpu_set_t cpuset;

ggml/src/ggml-cpu/llamafile/sgemm.cpp

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,6 @@
5353
#include "ggml-cpu-impl.h"
5454
#include "ggml-quants.h"
5555

56-
#include <atomic>
5756
#include <array>
5857
#include <type_traits>
5958

@@ -394,8 +393,6 @@ class tinyBLAS {
394393

395394
template <int RM, int RN, int BM>
396395
NOINLINE void gemm(int64_t m, int64_t n, int64_t BN) {
397-
static std::atomic<int64_t> current_chunk;
398-
399396
GGML_ASSERT(m % (RM * BM) == 0);
400397
const int64_t ytiles = m / (RM * BM);
401398
const int64_t xtiles = (n + RN -1) / RN;
@@ -410,7 +407,7 @@ class tinyBLAS {
410407
if (params->ith == 0) {
411408
GGML_ASSERT( jj_BN * SIZE_BN + (NB_BN - jj_BN) * (SIZE_BN - 1) == xtiles);
412409
// Every thread starts at ith, so the first unprocessed chunk is nth. This save a bit of coordination right at the start.
413-
std::atomic_store_explicit(&current_chunk, (int64_t)params->nth, std::memory_order_relaxed);
410+
ggml_threadpool_chunk_set(params->threadpool, params->nth);
414411
}
415412

416413
ggml_barrier(params->threadpool);
@@ -439,8 +436,7 @@ class tinyBLAS {
439436
GGML_ASSERT(jj == jj2);
440437
}
441438

442-
// next step.
443-
job = std::atomic_fetch_add_explicit(&current_chunk, (int64_t)1, std::memory_order_relaxed);
439+
job = ggml_threadpool_chunk_add(params->threadpool, 1);
444440
}
445441

446442
ggml_barrier(params->threadpool);

0 commit comments

Comments
 (0)