Commit ee7b4d3

Latest commits with fixes for MinGW, backend-specific subobjects for json config
* guarded the new CPU core parking functionality for MinGW, as it doesn't compile there
* performance-related settings, such as context/batch/ubatch and others, can now be put under "VK", "BLAS", or "CL" subobjects in the json config; a sketch follows the file stats below
* minor fixes to formatting in the saved text files
1 parent cc01fae commit ee7b4d3

28 files changed (+2269, -934 lines)
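A sketch of the new config layout from the second bullet above; the key names come from the jsonParams.h diff below, the values are hypothetical, and only the subobject matching the compiled backend is read:

    {
        "n_gpu_layers": 33,
        "ctx-size": 2048,
        "VK":   { "ctx-size": 8192, "grp_attn_n": 2 },
        "CL":   { "ctx-size": 4096 },
        "BLAS": { "ctx-size": 2048 }
    }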

base_sampling2/chat_layer.h

Lines changed: 1 addition & 1 deletion
@@ -617,7 +617,7 @@ class chat
     bool logit_bias_check_ending(std::string_view token_str) {
         for (auto word : params.sparams.logit_bias_strings_ending) {
             auto token_str_pos = word.find(token_str);
-            if (token_str_pos == (token_str.length() - 1)) return true;
+            if (token_str_pos == (word.length() - 1)) return true;
         }
 
         return false;
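The fix compares the match position against the listed word rather than the incoming token, so a token is flagged when it sits at the end of a listed ending. A standalone sketch of the corrected check, with a plain vector standing in for params.sparams.logit_bias_strings_ending:

    #include <string>
    #include <string_view>
    #include <vector>

    // true when token_str is found starting at the last character of a listed
    // ending, e.g. token_str "." against the entry "word."
    static bool ends_with_listed(std::string_view token_str,
                                 const std::vector<std::string>& endings) {
        for (const auto& word : endings) {
            auto token_str_pos = word.find(token_str);
            if (token_str_pos == (word.length() - 1)) return true; // fixed comparison
        }
        return false;
    }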

base_sampling2/common.cpp

Lines changed: 2 additions & 0 deletions
@@ -214,6 +214,7 @@ bool set_process_priority(enum ggml_sched_priority prio) {
 
     DWORD p = NORMAL_PRIORITY_CLASS;
     switch (prio) {
+        case GGML_SCHED_PRIO_LOW:    p = BELOW_NORMAL_PRIORITY_CLASS; break;
         case GGML_SCHED_PRIO_NORMAL: p = NORMAL_PRIORITY_CLASS; break;
         case GGML_SCHED_PRIO_MEDIUM: p = ABOVE_NORMAL_PRIORITY_CLASS; break;
         case GGML_SCHED_PRIO_HIGH:   p = HIGH_PRIORITY_CLASS; break;
@@ -239,6 +240,7 @@ bool set_process_priority(enum ggml_sched_priority prio) {
 
     int p = 0;
     switch (prio) {
+        case GGML_SCHED_PRIO_LOW:    p = 5; break;
         case GGML_SCHED_PRIO_NORMAL: p = 0; break;
         case GGML_SCHED_PRIO_MEDIUM: p = -5; break;
         case GGML_SCHED_PRIO_HIGH:   p = -10; break;
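For context, the mapped value is applied elsewhere in common.cpp (the call sites are outside these hunks). A hedged sketch of the typical platform calls, where a positive nice value such as 5 lowers priority on POSIX:

    #ifdef _WIN32
    #include <windows.h>
    static bool apply_priority(DWORD p) {   // p: a *_PRIORITY_CLASS value from the switch above
        return SetPriorityClass(GetCurrentProcess(), p) != 0;
    }
    #else
    #include <sys/resource.h>
    static bool apply_priority(int p) {     // p: a nice value, 5 for GGML_SCHED_PRIO_LOW
        return setpriority(PRIO_PROCESS, 0, p) == 0;
    }
    #endif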

base_sampling2/include/jsonParams.h

Lines changed: 38 additions & 20 deletions
@@ -641,26 +641,6 @@ static void getPerformanceParamsFromJson(nlohmann::json& config, common_params& params) {
         }
     }
 
-    //gpu offload
-    if (checkJNum(config, "n_gpu_layers")) params.n_gpu_layers = config["n_gpu_layers"];
-    if (config["main_gpu"].is_boolean()) params.main_gpu = config["main_gpu"];
-
-    // backend-specific
-#ifdef GGML_USE_VULKAN
-    if (checkJNum(config, "n_gpu_layers_vk")) params.n_gpu_layers = config["n_gpu_layers_vk"];
-    if (checkJNum(config, "n_threads_vk")) params.cpuparams.n_threads = config["n_threads_vk"];
-    if (checkJNum(config, "n_threads_batch_vk")) params.cpuparams_batch.n_threads = config["n_threads_batch_vk"];
-    if (config["use_mmap_vk"].is_boolean()) params.use_mmap = config["use_mmap_vk"];
-    if (config["flash_attn_vk"].is_boolean()) params.flash_attn = config["flash_attn_vk"];
-    if (config["no_kv_offload_vk"].is_boolean()) params.no_kv_offload = config["no_kv_offload_vk"];
-#elif GGML_USE_CLBLAST
-    if (checkJNum(config, "n_gpu_layers_clblast")) params.n_gpu_layers = config["n_gpu_layers_clblast"];
-    if (checkJNum(config, "n_threads_clblast")) params.cpuparams.n_threads = config["n_threads_clblast"];
-    if (checkJNum(config, "n_threads_batch_clblast")) params.cpuparams_batch.n_threads = config["n_threads_batch_clblast"];
-
-    if (checkJNum(config, "clblast_platform_id")) params.clblast_platform_id = config["clblast_platform_id"];
-#endif
-
     // context-related
     if (checkJNum(config, "ctx-size")) params.n_ctx = config["ctx-size"];
     if (checkJNum(config, "grp_attn_n")) params.grp_attn_n = config["grp_attn_n"];
@@ -708,6 +688,43 @@ static void getPerformanceParamsFromJson(nlohmann::json& config, common_params& params) {
             params.control_vectors.push_back({ el.value(), el.key(), });
         }
     }
+
+    //gpu offload
+    if (checkJNum(config, "n_gpu_layers")) params.n_gpu_layers = config["n_gpu_layers"];
+    if (config["main_gpu"].is_boolean()) params.main_gpu = config["main_gpu"];
+}
+
+static void getBackendParamsFromJson(nlohmann::json& config, common_params& params) {
+    // backend-specific
+#ifdef GGML_USE_VULKAN
+    if (checkJNum(config, "n_gpu_layers_vk")) params.n_gpu_layers = config["n_gpu_layers_vk"];
+    if (checkJNum(config, "n_threads_vk")) params.cpuparams.n_threads = config["n_threads_vk"];
+    if (checkJNum(config, "n_threads_batch_vk")) params.cpuparams_batch.n_threads = config["n_threads_batch_vk"];
+    if (config["use_mmap_vk"].is_boolean()) params.use_mmap = config["use_mmap_vk"];
+    if (config["flash_attn_vk"].is_boolean()) params.flash_attn = config["flash_attn_vk"];
+    if (config["no_kv_offload_vk"].is_boolean()) params.no_kv_offload = config["no_kv_offload_vk"];
+
+    if (checkJObj(config, "VK")) {
+        nlohmann::json config_vk = config["VK"];
+        getPerformanceParamsFromJson(config_vk, params);
+    }
+#elif GGML_USE_CLBLAST
+    if (checkJNum(config, "n_gpu_layers_clblast")) params.n_gpu_layers = config["n_gpu_layers_clblast"];
+    if (checkJNum(config, "n_threads_clblast")) params.cpuparams.n_threads = config["n_threads_clblast"];
+    if (checkJNum(config, "n_threads_batch_clblast")) params.cpuparams_batch.n_threads = config["n_threads_batch_clblast"];
+
+    if (checkJNum(config, "clblast_platform_id")) params.clblast_platform_id = config["clblast_platform_id"];
+
+    if (checkJObj(config, "CL")) {
+        nlohmann::json config_cl = config["CL"];
+        getPerformanceParamsFromJson(config_cl, params);
+    }
+#elif GGML_USE_BLAS
+    if (checkJObj(config, "BLAS")) {
+        nlohmann::json config_blas = config["BLAS"];
+        getPerformanceParamsFromJson(config_blas, params);
+    }
+#endif
 }
 
 static void getParamsFromJson(nlohmann::json& config, common_params& params, bool hasFile = false, bool headless = false){
@@ -717,6 +734,7 @@ static void getParamsFromJson(nlohmann::json& config, common_params& params, bool hasFile = false, bool headless = false){
     getPromptingParamsFromJson(config, params, hasFile, headless);
     // performance and misc
     getPerformanceParamsFromJson(config, params);
+    getBackendParamsFromJson(config, params);
     //sampling
     getSamplingParamsFromJson(config, params);
     getTensorOverridePairs(config, params);
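Since getParamsFromJson now calls getBackendParamsFromJson after getPerformanceParamsFromJson (last hunk above), a key inside the active backend's subobject overrides the same key at the top level. A minimal standalone sketch of that two-pass effect, using nlohmann::json directly instead of the repo's helpers:

    #include <iostream>
    #include <nlohmann/json.hpp>

    int main() {
        nlohmann::json config = nlohmann::json::parse(R"({
            "ctx-size": 2048,
            "VK": { "ctx-size": 8192 }
        })");

        int n_ctx = 0;
        // pass 1: generic performance params
        if (config.contains("ctx-size")) n_ctx = config["ctx-size"];
        // pass 2: backend subobject, as if built with GGML_USE_VULKAN
        if (config.contains("VK") && config["VK"].is_object()) {
            nlohmann::json config_vk = config["VK"];
            if (config_vk.contains("ctx-size")) n_ctx = config_vk["ctx-size"];
        }
        std::cout << n_ctx << "\n"; // 8192: the subobject wins
    }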

base_sampling2/master/ggml/include/ggml.h

Lines changed: 1 addition & 0 deletions
@@ -2181,6 +2181,7 @@ extern "C" {
 
     // scheduling priorities
     enum ggml_sched_priority {
+        GGML_SCHED_PRIO_LOW = -1,
         GGML_SCHED_PRIO_NORMAL,
         GGML_SCHED_PRIO_MEDIUM,
         GGML_SCHED_PRIO_HIGH,

base_sampling2/master/ggml/src/ggml-backend.cpp

Lines changed: 10 additions & 5 deletions
@@ -1340,7 +1340,10 @@ static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) {
     // allocate graph
     if (backend_ids_changed || !ggml_gallocr_alloc_graph(sched->galloc, &sched->graph)) {
         // the re-allocation may cause the split inputs to be moved to a different address
-        ggml_backend_sched_synchronize(sched);
+        // synchronize without ggml_backend_sched_synchronize to avoid changing cur_copy
+        for (int i = 0; i < sched->n_backends; i++) {
+            ggml_backend_synchronize(sched->backends[i]);
+        }
 #ifndef NDEBUG
         GGML_LOG_DEBUG("%s: failed to allocate graph, reserving (backend_ids_changed = %d)\n", __func__, backend_ids_changed);
 #endif
@@ -1564,7 +1567,6 @@ bool ggml_backend_sched_alloc_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
 
     ggml_backend_sched_split_graph(sched, graph);
 
-
     if (!ggml_backend_sched_alloc_splits(sched)) {
         return false;
     }
@@ -1598,9 +1600,12 @@ void ggml_backend_sched_synchronize(ggml_backend_sched_t sched) {
     for (int i = 0; i < sched->n_backends; i++) {
         ggml_backend_synchronize(sched->backends[i]);
     }
-    // reset the current copy to 0 so that the graphs will be similar during generation
-    // necessary for CUDA graphs
-    sched->cur_copy = 0;
+    if (!sched->is_alloc) {
+        // if the graph is not already allocated, always use copy 0 after a synchronization
+        // this ensures that during generation the same copy is used every time,
+        // which avoids changes in the graph that could cause CUDA or other graphs to be disabled
+        sched->cur_copy = 0;
+    }
 }
 
 void ggml_backend_sched_set_eval_callback(ggml_backend_sched_t sched, ggml_backend_sched_eval_callback callback, void * user_data) {
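Net effect: alloc_splits now drains the backends without touching cur_copy, and a full synchronize only rewinds cur_copy when no graph is allocated. A hedged sketch of the call pattern this protects, using the public scheduler API (is_alloc is still set right after an async compute):

    // with this change, the sync below no longer resets sched->cur_copy while
    // the graph is still allocated, so consecutive generations reuse the same
    // copy and CUDA (or other) graphs are not invalidated
    ggml_backend_sched_graph_compute_async(sched, graph);
    ggml_backend_sched_synchronize(sched);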

base_sampling2/master/ggml/src/ggml-cpu/CMakeLists.txt

Lines changed: 7 additions & 17 deletions
@@ -82,13 +82,8 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
         target_link_libraries(${GGML_CPU_NAME} PUBLIC memkind)
     endif()
 
-    if (CMAKE_OSX_ARCHITECTURES STREQUAL "arm64" OR
-        CMAKE_GENERATOR_PLATFORM_LWR STREQUAL "arm64" OR
-        (NOT CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_GENERATOR_PLATFORM_LWR AND
-         CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64|arm.*|ARM64)$"))
-
+    if (GGML_SYSTEM_ARCH STREQUAL "ARM")
         message(STATUS "ARM detected")
-
         if (MSVC AND NOT CMAKE_C_COMPILER_ID STREQUAL "Clang")
             message(FATAL_ERROR "MSVC is not supported for ARM, use clang")
         else()
@@ -170,12 +165,8 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
                 endforeach()
             endif()
         endif()
-    elseif (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LWR MATCHES "^(x86_64|i686|amd64|x64|win32)$" OR
-            (NOT CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_GENERATOR_PLATFORM_LWR AND
-             CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86_64|i686|AMD64|amd64)$"))
-
+    elseif (GGML_SYSTEM_ARCH STREQUAL "x86")
         message(STATUS "x86 detected")
-
         if (MSVC)
             # instruction set detection for MSVC only
             if (GGML_NATIVE)
@@ -318,7 +309,7 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
             set_target_properties(${GGML_CPU_FEATS_NAME} PROPERTIES POSITION_INDEPENDENT_CODE ON)
             target_link_libraries(${GGML_CPU_NAME} PRIVATE ${GGML_CPU_FEATS_NAME})
         endif()
-    elseif ("${CMAKE_SYSTEM_PROCESSOR} " STREQUAL "ppc64le " OR "${CMAKE_SYSTEM_PROCESSOR} " STREQUAL "powerpc ")
+    elseif (GGML_SYSTEM_ARCH STREQUAL "PowerPC")
         message(STATUS "PowerPC detected")
         if (GGML_NATIVE)
             if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64")
@@ -344,18 +335,17 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
                 list(APPEND ARCH_FLAGS -mcpu=${GGML_CPU_POWERPC_CPUTYPE})
             endif()
         endif()
-    elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "loongarch64")
+    elseif (GGML_SYSTEM_ARCH STREQUAL "loongarch64")
         message(STATUS "loongarch64 detected")
-
         list(APPEND ARCH_FLAGS -march=loongarch64)
         if (GGML_LASX)
             list(APPEND ARCH_FLAGS -mlasx)
         endif()
        if (GGML_LSX)
            list(APPEND ARCH_FLAGS -mlsx)
        endif()
-    elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "riscv64")
-        message(STATUS "RISC-V detected")
+    elseif (GGML_SYSTEM_ARCH STREQUAL "riscv64")
+        message(STATUS "riscv64 detected")
         if (GGML_RVV)
             if (GGML_XTHEADVECTOR)
                 list(APPEND ARCH_FLAGS -march=rv64gc_xtheadvector -mabi=lp64d)
@@ -365,7 +355,7 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
                 list(APPEND ARCH_FLAGS -march=rv64gcv -mabi=lp64d)
             endif()
         endif()
-    elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "s390x")
+    elseif (GGML_SYSTEM_ARCH STREQUAL "s390x")
         message(STATUS "s390x detected")
         file(READ "/proc/cpuinfo" CPUINFO_CONTENTS)
         string(REGEX REPLACE "machine[ \t\r\n]*=[ \t\r\n]*([0-9]+)" "\\1" S390X_M ${CPUINFO_CONTENTS})

base_sampling2/master/ggml/src/ggml-cpu/ggml-cpu-quants.c

Lines changed: 144 additions & 0 deletions
@@ -6995,7 +6995,11 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
 
 void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
     assert(n % QK_K == 0);
+#ifdef __ARM_FEATURE_MATMUL_INT8
+    assert((nrc == 2) || (nrc == 1));
+#else
     assert(nrc == 1);
+#endif
     UNUSED(nrc);
     UNUSED(bx);
     UNUSED(by);
@@ -7012,6 +7016,146 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
 
     uint32_t utmp[4];
 
+#if defined(__ARM_FEATURE_MATMUL_INT8)
+    if (nrc == 2) {
+        const block_q4_K * GGML_RESTRICT x0 = x;
+        const block_q4_K * GGML_RESTRICT x1 = (const block_q4_K *) ((const uint8_t *)vx + bx);
+        const block_q8_K * GGML_RESTRICT y0 = y;
+        const block_q8_K * GGML_RESTRICT y1 = (const block_q8_K *) ((const uint8_t *)vy + by);
+
+        const uint8x16_t m4b = vdupq_n_u8(0x0f);
+
+        float32x4_t vfsum = vdupq_n_f32(0.0f);
+
+        for (int i = 0; i < nb; ++i, ++x0, ++x1, ++y0, ++y1) {
+            const uint8_t * GGML_RESTRICT qx0 = x0->qs;
+            const uint8_t * GGML_RESTRICT qx1 = x1->qs;
+            const  int8_t * GGML_RESTRICT qy0 = y0->qs;
+            const  int8_t * GGML_RESTRICT qy1 = y1->qs;
+
+            // decode scales and mins
+            int8_t x0_scales[8], x1_scales[8];
+            int16x8_t x0_mins, x1_mins;
+            {
+                uint32_t scales_mins[3];
+                memcpy(scales_mins, x0->scales, 12);
+                const uint32_t mins_0_3 = scales_mins[1] & kmask1;
+                const uint32_t mins_4_7 = ((scales_mins[2] >> 4) & kmask2) | (((scales_mins[1] >> 6) & kmask3) << 4);
+                const uint32x2_t mins = {mins_0_3, mins_4_7};
+                x0_mins = vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(mins)));
+                uint32_t scales[2];
+                scales[0] = scales_mins[0] & kmask1; // scales 0~3
+                scales[1] = (scales_mins[2] & kmask2) | (((scales_mins[0] >> 6) & kmask3) << 4); // scales 4~7
+                memcpy(x0_scales, scales, 8);
+            }
+            {
+                uint32_t scales_mins[3];
+                memcpy(scales_mins, x1->scales, 12);
+                const uint32_t mins_0_3 = scales_mins[1] & kmask1;
+                const uint32_t mins_4_7 = ((scales_mins[2] >> 4) & kmask2) | (((scales_mins[1] >> 6) & kmask3) << 4);
+                const uint32x2_t mins = {mins_0_3, mins_4_7};
+                x1_mins = vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(mins)));
+                uint32_t scales[2];
+                scales[0] = scales_mins[0] & kmask1; // scales 0~3
+                scales[1] = (scales_mins[2] & kmask2) | (((scales_mins[0] >> 6) & kmask3) << 4); // scales 4~7
+                memcpy(x1_scales, scales, 8);
+            }
+
+            int32x4_t visum = {0};
+
+            // process 64 data points per iteration, totally 256 data points
+            for (int j = 0; j < QK_K / 64; ++j, qx0 += 32, qx1 += 32, qy0 += 64, qy1 += 64) {
+                const int8x16x4_t vy0 = vld1q_s8_x4(qy0);
+                const int8x16x4_t vy1 = vld1q_s8_x4(qy1);
+
+                int8x16_t vx0[4], vx1[4];
+                {
+                    const uint8x16x2_t vv = vld1q_u8_x2(qx0);
+                    vx0[0] = vreinterpretq_s8_u8(vandq_u8(vv.val[0], m4b));
+                    vx0[1] = vreinterpretq_s8_u8(vandq_u8(vv.val[1], m4b));
+                    vx0[2] = vreinterpretq_s8_u8(vshrq_n_u8(vv.val[0], 4));
+                    vx0[3] = vreinterpretq_s8_u8(vshrq_n_u8(vv.val[1], 4));
+                }
+                {
+                    const uint8x16x2_t vv = vld1q_u8_x2(qx1);
+                    vx1[0] = vreinterpretq_s8_u8(vandq_u8(vv.val[0], m4b));
+                    vx1[1] = vreinterpretq_s8_u8(vandq_u8(vv.val[1], m4b));
+                    vx1[2] = vreinterpretq_s8_u8(vshrq_n_u8(vv.val[0], 4));
+                    vx1[3] = vreinterpretq_s8_u8(vshrq_n_u8(vv.val[1], 4));
+                }
+
+                // process 32 data points (share same block scale) per iteration
+                for (int k = 0; k < 2; ++k) {
+                    const int blk = j * 2 + k;
+                    const int32x4_t block_scale = {
+                        x0_scales[blk],
+                        x0_scales[blk],
+                        x1_scales[blk],
+                        x1_scales[blk],
+                    };
+
+                    int32x4_t vr = {0};
+                    for (int l = 0; l < 2; ++l) {
+                        const int idx = k * 2 + l;
+                        const int64x2_t vx0_s64 = vreinterpretq_s64_s8(vx0[idx]);
+                        const int64x2_t vx1_s64 = vreinterpretq_s64_s8(vx1[idx]);
+                        const int64x2_t vy0_s64 = vreinterpretq_s64_s8(vy0.val[idx]);
+                        const int64x2_t vy1_s64 = vreinterpretq_s64_s8(vy1.val[idx]);
+                        const int8x16_t vx_l = vreinterpretq_s8_s64(vzip1q_s64(vx0_s64, vx1_s64));
+                        const int8x16_t vx_h = vreinterpretq_s8_s64(vzip2q_s64(vx0_s64, vx1_s64));
+                        const int8x16_t vy_l = vreinterpretq_s8_s64(vzip1q_s64(vy0_s64, vy1_s64));
+                        const int8x16_t vy_h = vreinterpretq_s8_s64(vzip2q_s64(vy0_s64, vy1_s64));
+                        vr = vmmlaq_s32(vr, vx_l, vy_l);
+                        vr = vmmlaq_s32(vr, vx_h, vy_h);
+                    }
+                    // apply block scale, will NOT overflow
+                    // block_scale * sum_256(int4*int8) <= 2^(8+8+4+8) = 28 bits
+                    visum = vmlaq_s32(visum, vr, block_scale);
+                }
+            }
+
+            // adjust bias, apply superblock scale
+            {
+                int32_t bias[4];
+                // no obvious uplift from sve sdot-16, just use neon mul add
+                const int16x8_t y0_sums = vpaddq_s16(vld1q_s16(y0->bsums), vld1q_s16(y0->bsums+8));
+                const int16x8_t y1_sums = vpaddq_s16(vld1q_s16(y1->bsums), vld1q_s16(y1->bsums+8));
+                bias[0] = vaddvq_s32(vaddq_s32(vmull_s16(vget_low_s16(y0_sums), vget_low_s16(x0_mins)),
+                                               vmull_s16(vget_high_s16(y0_sums), vget_high_s16(x0_mins))));
+                bias[1] = vaddvq_s32(vaddq_s32(vmull_s16(vget_low_s16(y1_sums), vget_low_s16(x0_mins)),
+                                               vmull_s16(vget_high_s16(y1_sums), vget_high_s16(x0_mins))));
+                bias[2] = vaddvq_s32(vaddq_s32(vmull_s16(vget_low_s16(y0_sums), vget_low_s16(x1_mins)),
+                                               vmull_s16(vget_high_s16(y0_sums), vget_high_s16(x1_mins))));
+                bias[3] = vaddvq_s32(vaddq_s32(vmull_s16(vget_low_s16(y1_sums), vget_low_s16(x1_mins)),
+                                               vmull_s16(vget_high_s16(y1_sums), vget_high_s16(x1_mins))));
+                const float32x4_t dmins = {
+                    GGML_FP16_TO_FP32(x0->dmin) * y0->d,
+                    GGML_FP16_TO_FP32(x0->dmin) * y1->d,
+                    GGML_FP16_TO_FP32(x1->dmin) * y0->d,
+                    GGML_FP16_TO_FP32(x1->dmin) * y1->d,
+                };
+                vfsum = vmlsq_f32(vfsum, vcvtq_f32_s32(vld1q_s32(bias)), dmins);
+
+                const float32x4_t superblock_scale = {
+                    GGML_FP16_TO_FP32(x0->d) * y0->d,
+                    GGML_FP16_TO_FP32(x0->d) * y1->d,
+                    GGML_FP16_TO_FP32(x1->d) * y0->d,
+                    GGML_FP16_TO_FP32(x1->d) * y1->d,
+                };
+                vfsum = vmlaq_f32(vfsum, vcvtq_f32_s32(visum), superblock_scale);
+            }
+        }
+
+        // vfsum = ABCD -> ACBD
+        // AC -> s, BD -> (s+bs)
+        vfsum = vzip1q_f32(vfsum, vextq_f32(vfsum, vfsum, 2));
+        vst1_f32(s,      vget_low_f32 (vfsum));
+        vst1_f32(s + bs, vget_high_f32(vfsum));
+
+        return;
+    }
+#endif
+
 #ifdef __ARM_FEATURE_SVE
     float sumf = 0;
     for (int i = 0; i < nb; ++i) {
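The nrc == 2 path is built around the i8mm SMMLA instruction, which the zip of 64-bit halves above feeds: each int8x16_t is treated as a 2x8 int8 matrix, and one call accumulates a full 2x2 int32 tile covering both row/column combinations. A minimal sketch of the building block, assuming an ARMv8.6 target with i8mm:

    #include <arm_neon.h>

    // c (2x2 int32, row-major in 4 lanes) += a (2x8 int8) * transpose(b) (8x2 int8)
    static inline int32x4_t mmla_tile(int32x4_t c, int8x16_t a, int8x16_t b) {
        return vmmlaq_s32(c, a, b);
    }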
