Skip to content

Commit 1f5accb

Browse files
NoahOksuzggerganov
andauthored
Fix garbled output with REPACK at high thread counts (ggml-org#16956)
* Fix garbled output with REPACK at high thread counts Fixed a race condition in the REPACK matrix multiplication code that caused garbled output when using 26+ threads (model-dependent threshold). The issue occurred because with high thread counts, the code forced chunk count to equal thread count, creating many small chunks. After aligning these chunks to NB_COLS boundaries, adjacent chunks could overlap, causing data corruption and race conditions. The fix enforces minimum chunk sizes based on NB_COLS and caps maximum chunk count to prevent creating too many tiny chunks, ensuring proper alignment without overlaps. * Update ggml/src/ggml-cpu/repack.cpp Co-authored-by: Georgi Gerganov <[email protected]> * Update ggml/src/ggml-cpu/repack.cpp Co-authored-by: Georgi Gerganov <[email protected]> --------- Co-authored-by: Georgi Gerganov <[email protected]>
1 parent 2759ccd commit 1f5accb

File tree

1 file changed

+25
-0
lines changed

1 file changed

+25
-0
lines changed

ggml/src/ggml-cpu/repack.cpp

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1678,10 +1678,24 @@ template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PAR
16781678
int64_t chunk_size = (nr + nth_scaled - 1) / nth_scaled;
16791679
int64_t nchunk = (nr + chunk_size - 1) / chunk_size;
16801680

1681+
// Ensure minimum chunk size to avoid alignment issues with high thread counts
1682+
// Minimum chunk size should be at least NB_COLS to prevent overlapping chunks after alignment
1683+
const int64_t min_chunk_size = NB_COLS;
1684+
if (nchunk > 0 && (nr / nchunk) < min_chunk_size && nr >= min_chunk_size) {
1685+
nchunk = (nr + min_chunk_size - 1) / min_chunk_size;
1686+
}
1687+
16811688
if (nth == 1 || nchunk < nth || disable_chunking) {
16821689
nchunk = nth;
16831690
}
16841691

1692+
// Ensure nchunk doesn't exceed the number of rows divided by minimum chunk size
1693+
// This prevents creating too many tiny chunks that could overlap after alignment
1694+
const int64_t max_nchunk = (nr + min_chunk_size - 1) / min_chunk_size;
1695+
if (nchunk > max_nchunk) {
1696+
nchunk = max_nchunk;
1697+
}
1698+
16851699
if (ith == 0) {
16861700
// Every thread starts at ith, so the first unprocessed chunk is nth. This save a bit of coordination right at the start.
16871701
ggml_threadpool_chunk_set(params->threadpool, nth);
@@ -1695,8 +1709,15 @@ template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PAR
16951709
while (current_chunk < nchunk) {
16961710
int64_t src0_start = (current_chunk * ne01) / nchunk;
16971711
int64_t src0_end = ((current_chunk + 1) * ne01) / nchunk;
1712+
1713+
// Align boundaries to NB_COLS - round up to ensure all data is included
1714+
// The chunk size limiting above ensures chunks are large enough to prevent overlaps
16981715
src0_start = (src0_start % NB_COLS) ? src0_start + NB_COLS - (src0_start % NB_COLS) : src0_start;
16991716
src0_end = (src0_end % NB_COLS) ? src0_end + NB_COLS - (src0_end % NB_COLS) : src0_end;
1717+
if (src0_end > ne01) {
1718+
src0_end = ne01;
1719+
}
1720+
17001721
if (src0_start >= src0_end) {
17011722
break;
17021723
}
@@ -1808,8 +1829,12 @@ template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PAR
18081829
int64_t src0_cur_start = (ith * ne01) / nth;
18091830
int64_t src0_cur_end = ((ith + 1) * ne01) / nth;
18101831

1832+
// Align boundaries to NB_COLS - round up to ensure all data is included
18111833
src0_cur_start = (src0_cur_start % NB_COLS) ? src0_cur_start + NB_COLS - (src0_cur_start % NB_COLS) : src0_cur_start;
18121834
src0_cur_end = (src0_cur_end % NB_COLS) ? src0_cur_end + NB_COLS - (src0_cur_end % NB_COLS) : src0_cur_end;
1835+
if (src0_cur_end > ne01) {
1836+
src0_cur_end = ne01;
1837+
}
18131838

18141839
if (src0_cur_start >= src0_cur_end) {
18151840
return;

0 commit comments

Comments
 (0)