Skip to content

Commit 7cb118f

Browse files
committed
x86 correct (Llama-2, Phi-3).
[WIP] Fit llama.cpp build visibility. Runtime error-free. Wrong outputs. [WIP] Remove some deprecated codes. [Fix] ggml_tmac_transform_tensor should use *data as the original data. And gather code logics in ggml_tmac_can_mul_mat. Change tuning profile time back to 5000ms. Hard code bits/groupsize/sym. GPTQ Llama correct. Unify quantization_config loading.
1 parent 526739b commit 7cb118f

31 files changed

+4473
-20
lines changed

common/common.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
#include "llama.h"
1111

1212
#include <algorithm>
13+
#include <chrono>
1314
#include <cinttypes>
1415
#include <climits>
1516
#include <cmath>

convert_hf_to_gguf.py

Lines changed: 271 additions & 16 deletions
Large diffs are not rendered by default.

ggml/CMakeLists.txt

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -208,6 +208,8 @@ set (GGML_OPENCL_TARGET_VERSION "300" CACHE STRING
208208
# toolchain for vulkan-shaders-gen
209209
set (GGML_VULKAN_SHADERS_GEN_TOOLCHAIN "" CACHE FILEPATH "ggml: toolchain file for vulkan-shaders-gen")
210210

211+
option(GGML_TMAC "ggml: use TMAC" OFF)
212+
211213
# extra artifacts
212214
option(GGML_BUILD_TESTS "ggml: build tests" ${GGML_STANDALONE})
213215
option(GGML_BUILD_EXAMPLES "ggml: build examples" ${GGML_STANDALONE})
@@ -217,6 +219,9 @@ option(GGML_BUILD_EXAMPLES "ggml: build examples" ${GGML_STANDALONE})
217219
#
218220

219221
set(CMAKE_C_STANDARD 11)
222+
if (GGML_TMAC)
223+
set(CMAKE_C_STANDARD 17)
224+
endif()
220225
set(CMAKE_C_STANDARD_REQUIRED true)
221226

222227
set(CMAKE_CXX_STANDARD 17)

ggml/include/ggml-cpu.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,8 @@ extern "C" {
5757
GGML_BACKEND_API int ggml_threadpool_get_n_threads (struct ggml_threadpool * threadpool);
5858
GGML_BACKEND_API void ggml_threadpool_pause (struct ggml_threadpool * threadpool);
5959
GGML_BACKEND_API void ggml_threadpool_resume (struct ggml_threadpool * threadpool);
60+
GGML_BACKEND_API void ggml_threadpool_atomic_store_explicit(struct ggml_threadpool * threadpool, int value);
61+
GGML_BACKEND_API int ggml_threadpool_atomic_fetch_add_explicit(struct ggml_threadpool * threadpool, int value);
6062

6163
// ggml_graph_plan() has to be called before ggml_graph_compute()
6264
// when plan.work_size > 0, caller must allocate memory for plan.work_data
@@ -120,6 +122,8 @@ extern "C" {
120122

121123
GGML_BACKEND_API void ggml_cpu_init(void);
122124

125+
GGML_BACKEND_API void ggml_cpu_tmac_init(const char * fname);
126+
123127
//
124128
// CPU backend
125129
//

ggml/include/ggml.h

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -388,7 +388,16 @@ extern "C" {
388388
// GGML_TYPE_IQ4_NL_4_4 = 36,
389389
// GGML_TYPE_IQ4_NL_4_8 = 37,
390390
// GGML_TYPE_IQ4_NL_8_8 = 38,
391-
GGML_TYPE_COUNT = 39,
391+
GGML_TYPE_TMAC_BN_0 = 39,
392+
GGML_TYPE_TMAC_W2G64_0 = 40,
393+
GGML_TYPE_TMAC_W2G64_1 = 41,
394+
GGML_TYPE_TMAC_W2G128_0 = 42,
395+
GGML_TYPE_TMAC_W2G128_1 = 43,
396+
GGML_TYPE_TMAC_W4G64_0 = 44,
397+
GGML_TYPE_TMAC_W4G64_1 = 45,
398+
GGML_TYPE_TMAC_W4G128_0 = 46,
399+
GGML_TYPE_TMAC_W4G128_1 = 47,
400+
GGML_TYPE_COUNT = 48,
392401
};
393402

394403
// precision

ggml/src/CMakeLists.txt

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -196,6 +196,7 @@ add_library(ggml-base
196196
ggml.c
197197
ggml-alloc.c
198198
ggml-backend.cpp
199+
ggml-common.h
199200
ggml-opt.cpp
200201
ggml-threading.cpp
201202
ggml-threading.h
@@ -211,6 +212,29 @@ endif()
211212
add_library(ggml
212213
ggml-backend-reg.cpp)
213214

215+
# if (GGML_TMAC)
216+
# # set(GGML_HEADERS_TMAC
217+
# # ggml-cpu/tmac/lut_ctor.h
218+
# # ggml-cpu/tmac/tbl.h
219+
# # ggml-cpu/tmac/ggml-tmac.h
220+
# # ../../common/log.h
221+
# # )
222+
# set(GGML_SOURCES_TMAC
223+
# ggml-cpu/tmac/lut_ctor.cpp
224+
# ggml-cpu/tmac/tbl.cpp
225+
# ggml-cpu/tmac/ggml-tmac.cpp
226+
# ../../common/log.cpp
227+
# )
228+
# # list (APPEND GGML_CPU_SOURCES ${GGML_SOURCES_TMAC} ${GGML_HEADERS_TMAC})
229+
# target_sources(ggml-base PRIVATE ${GGML_SOURCES_TMAC})
230+
# target_compile_definitions(ggml-base PUBLIC GGML_USE_TMAC)
231+
# target_include_directories(ggml-base PUBLIC ggml-cpu/tmac)
232+
# target_compile_definitions(ggml PUBLIC GGML_USE_TMAC)
233+
# target_include_directories(ggml PUBLIC ggml-cpu/tmac)
234+
# target_compile_options(ggml-base PUBLIC /arch:AVX2)
235+
# target_compile_definitions(ggml-base PUBLIC GGML_AVX2 GGML_FMA GGML_F16C)
236+
# endif()
237+
214238
target_link_libraries(ggml PUBLIC ggml-base)
215239

216240
if (CMAKE_SYSTEM_NAME MATCHES "Linux")

ggml/src/ggml-cpu/CMakeLists.txt

Lines changed: 60 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,14 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
2222
ggml-cpu/amx/amx.h
2323
ggml-cpu/amx/mmq.cpp
2424
ggml-cpu/amx/mmq.h
25+
ggml-cpu/tmac/tmac.cpp
26+
ggml-cpu/tmac/tmac.h
27+
ggml-cpu/tmac/lut_mul_mat.cpp
28+
ggml-cpu/tmac/lut_mul_mat.h
29+
ggml-cpu/tmac/lut_ctor.cpp
30+
ggml-cpu/tmac/lut_ctor.h
31+
ggml-cpu/tmac/tbl.cpp
32+
ggml-cpu/tmac/tbl.h
2533
ggml-cpu/ggml-cpu-impl.h
2634
ggml-cpu/common.h
2735
ggml-cpu/binary-ops.h
@@ -72,6 +80,36 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
7280
ggml-cpu/llamafile/sgemm.h)
7381
endif()
7482

83+
if (GGML_TMAC)
84+
target_compile_definitions(${GGML_CPU_NAME} PUBLIC GGML_USE_TMAC)
85+
target_include_directories(${GGML_CPU_NAME} PUBLIC ggml-cpu/tmac)
86+
get_target_property(cdefs ${GGML_CPU_NAME} COMPILE_DEFINITIONS)
87+
message(STATUS "GGML_CPU_NAME: ${GGML_CPU_NAME} COMPILE_DEFINITIONS: ${cdefs}")
88+
89+
# set(GGML_HEADERS_TMAC
90+
# ggml-cpu/tmac/lut_ctor.h
91+
# ggml-cpu/tmac/tbl.h
92+
# ggml-cpu/tmac/ggml-tmac.h
93+
# ../../common/log.h
94+
# )
95+
# set(GGML_SOURCES_TMAC
96+
# ggml-cpu/tmac/lut_ctor.cpp
97+
# ggml-cpu/tmac/tbl.cpp
98+
# ggml-cpu/tmac/ggml-tmac.cpp
99+
# ../../common/log.cpp
100+
# )
101+
# list (APPEND GGML_CPU_SOURCES ${GGML_SOURCES_TMAC} ${GGML_HEADERS_TMAC})
102+
103+
if ((NOT ${CMAKE_C_COMPILER_ID} MATCHES "Clang") OR
104+
(NOT ${CMAKE_CXX_COMPILER_ID} MATCHES "Clang"))
105+
message(FATAL_ERROR "Clang is required for T-MAC compilation")
106+
endif()
107+
108+
if (GGML_TMAC_RECHUNK)
109+
target_compile_definitions(${GGML_CPU_NAME} PRIVATE TMAC_RECHUNK)
110+
endif()
111+
endif()
112+
75113
if (GGML_CPU_HBM)
76114
find_library(memkind memkind REQUIRED)
77115

@@ -145,6 +183,12 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
145183
list(APPEND ARCH_FLAGS -march=${GGML_CPU_ARM_ARCH})
146184
endif()
147185
endif()
186+
if (GGML_TMAC)
187+
# ARM Windows with LLVM clang GNU interface
188+
# We need fullfp16 for T-MAC
189+
# TODO: check_cxx_source_compiles
190+
list(APPEND ARCH_FLAGS -march=armv8.2a+fp16)
191+
endif()
148192

149193
# show enabled features
150194
if (CMAKE_HOST_SYSTEM_NAME STREQUAL "Windows")
@@ -181,7 +225,9 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
181225
if (GGML_NATIVE)
182226
include(ggml-cpu/cmake/FindSIMD.cmake)
183227
endif ()
184-
if (GGML_AVX512)
228+
# Can't use GGML_AVX512 with T-MAC and Clang for MSVC
229+
# with error: conflicting types for '_m_prefetchw
230+
if (GGML_AVX512 AND (NOT ${CMAKE_C_COMPILER_ID} MATCHES "Clang") AND (NOT ${CMAKE_CXX_COMPILER_ID} MATCHES "Clang"))
185231
list(APPEND ARCH_FLAGS /arch:AVX512)
186232
# /arch:AVX512 includes: __AVX512F__, __AVX512CD__, __AVX512BW__, __AVX512DQ__, and __AVX512VL__
187233
# MSVC has no compile-time flags enabling specific
@@ -323,6 +369,19 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
323369
list(APPEND ARCH_FLAGS -mcpu=${GGML_CPU_POWERPC_CPUTYPE})
324370
endif()
325371
endif()
372+
elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "aarch64" AND GGML_TMAC)
373+
# We need fullfp16 for T-MAC
374+
# TODO: we need to simplify this logic through check_cxx_source_compiles or Presets?
375+
check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int8x16_t _a, _b; int32x4_t _s = vmlaq_f32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_MATMUL_INT8)
376+
if (GGML_COMPILER_SUPPORT_MATMUL_INT8)
377+
# Device with armv8.7a+ cpu, e.g., WSL on Surface Laptop 7
378+
# based on arm64-windows-llvm.cmake
379+
list(APPEND ARCH_FLAGS -march=armv8.7-a+fp16 -fvectorize -ffp-model=fast -fno-finite-math-only)
380+
add_compile_definitions(__ARM_FEATURE_MATMUL_INT8)
381+
else ()
382+
# Jetson AGX Orin, Raspberry Pi 5
383+
list(APPEND ARCH_FLAGS -march=armv8.2a+fp16)
384+
endif ()
326385
elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "loongarch64")
327386
message(STATUS "loongarch64 detected")
328387

ggml/src/ggml-cpu/ggml-cpu.c

Lines changed: 61 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,10 @@
5050
#include "llamafile/sgemm.h"
5151
#endif
5252

53+
#ifdef GGML_USE_TMAC
54+
#include "tmac.h"
55+
#endif
56+
5357
#if defined(_MSC_VER)
5458
// disable "possible loss of data" to avoid hundreds of casts
5559
// we should just be careful :)
@@ -373,7 +377,51 @@ static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = {
373377
.vec_dot_type = GGML_TYPE_Q8_K,
374378
.nrows = 1,
375379
},
376-
};
380+
[GGML_TYPE_TMAC_BN_0] = {
381+
.vec_dot = (ggml_vec_dot_t) ggml_vec_dot_f32,
382+
.vec_dot_type = GGML_TYPE_F32,
383+
.nrows = 1,
384+
},
385+
[GGML_TYPE_TMAC_W2G64_0] = {
386+
.vec_dot = (ggml_vec_dot_t) ggml_vec_dot_f32,
387+
.vec_dot_type = GGML_TYPE_F32,
388+
.nrows = 1,
389+
},
390+
[GGML_TYPE_TMAC_W2G64_1] = {
391+
.vec_dot = (ggml_vec_dot_t) ggml_vec_dot_f32,
392+
.vec_dot_type = GGML_TYPE_F32,
393+
.nrows = 1,
394+
},
395+
[GGML_TYPE_TMAC_W2G128_0] = {
396+
.vec_dot = (ggml_vec_dot_t) ggml_vec_dot_f32,
397+
.vec_dot_type = GGML_TYPE_F32,
398+
.nrows = 1,
399+
},
400+
[GGML_TYPE_TMAC_W2G128_1] = {
401+
.vec_dot = (ggml_vec_dot_t) ggml_vec_dot_f32,
402+
.vec_dot_type = GGML_TYPE_F32,
403+
.nrows = 1,
404+
},
405+
[GGML_TYPE_TMAC_W4G64_0] = {
406+
.vec_dot = (ggml_vec_dot_t) ggml_vec_dot_f32,
407+
.vec_dot_type = GGML_TYPE_F32,
408+
.nrows = 1,
409+
},
410+
[GGML_TYPE_TMAC_W4G64_1] = {
411+
.vec_dot = (ggml_vec_dot_t) ggml_vec_dot_f32,
412+
.vec_dot_type = GGML_TYPE_F32,
413+
.nrows = 1,
414+
},
415+
[GGML_TYPE_TMAC_W4G128_0] = {
416+
.vec_dot = (ggml_vec_dot_t) ggml_vec_dot_f32,
417+
.vec_dot_type = GGML_TYPE_F32,
418+
.nrows = 1,
419+
},
420+
[GGML_TYPE_TMAC_W4G128_1] = {
421+
.vec_dot = (ggml_vec_dot_t) ggml_vec_dot_f32,
422+
.vec_dot_type = GGML_TYPE_F32,
423+
.nrows = 1,
424+
},};
377425

378426
const struct ggml_type_traits_cpu * ggml_get_type_traits_cpu(enum ggml_type type) {
379427
return &type_traits_cpu[type];
@@ -2639,6 +2687,14 @@ void ggml_threadpool_resume(struct ggml_threadpool * threadpool) {
26392687
#endif
26402688
}
26412689

2690+
void ggml_threadpool_atomic_store_explicit(struct ggml_threadpool * threadpool, int value) {
2691+
atomic_store_explicit(&threadpool->current_chunk, value, memory_order_relaxed);
2692+
}
2693+
2694+
int ggml_threadpool_atomic_fetch_add_explicit(struct ggml_threadpool * threadpool, int value) {
2695+
return (int)atomic_fetch_add_explicit(&threadpool->current_chunk, value, memory_order_relaxed);
2696+
}
2697+
26422698
struct ggml_cplan ggml_graph_plan(
26432699
const struct ggml_cgraph * cgraph,
26442700
int n_threads,
@@ -3406,6 +3462,10 @@ void ggml_cpu_init(void) {
34063462
ggml_init_arm_arch_features();
34073463
#endif
34083464

3465+
#ifdef GGML_USE_TMAC
3466+
ggml_tmac_init();
3467+
#endif
3468+
34093469
is_first_call = false;
34103470
}
34113471

ggml/src/ggml-cpu/ggml-cpu.cpp

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
#include "ggml-cpu-traits.h"
66
#include "ggml-impl.h"
77
#include "amx/amx.h"
8+
#include "tmac/tmac.h"
89

910
#include <cctype>
1011
#include <string>
@@ -43,6 +44,12 @@ std::vector<ggml_backend_buffer_type_t>& ggml_backend_cpu_get_extra_buffers_type
4344
}
4445
#endif
4546

47+
#ifdef GGML_USE_TMAC
48+
if (ggml_backend_tmac_buffer_type()) {
49+
bufts.push_back(ggml_backend_tmac_buffer_type());
50+
}
51+
#endif
52+
4653
#ifdef GGML_USE_CPU_KLEIDIAI
4754
if (ggml_backend_cpu_kleidiai_buffer_type()) {
4855
bufts.push_back(ggml_backend_cpu_kleidiai_buffer_type());

ggml/src/ggml-cpu/ops.cpp

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4978,6 +4978,15 @@ void ggml_compute_forward_clamp(
49784978
case GGML_TYPE_I32:
49794979
case GGML_TYPE_I64:
49804980
case GGML_TYPE_F64:
4981+
case GGML_TYPE_TMAC_BN_0:
4982+
case GGML_TYPE_TMAC_W2G64_0:
4983+
case GGML_TYPE_TMAC_W2G64_1:
4984+
case GGML_TYPE_TMAC_W2G128_0:
4985+
case GGML_TYPE_TMAC_W2G128_1:
4986+
case GGML_TYPE_TMAC_W4G64_0:
4987+
case GGML_TYPE_TMAC_W4G64_1:
4988+
case GGML_TYPE_TMAC_W4G128_0:
4989+
case GGML_TYPE_TMAC_W4G128_1:
49814990
case GGML_TYPE_COUNT:
49824991
{
49834992
GGML_ABORT("fatal error");

0 commit comments

Comments
 (0)