
Commit 91b6823

Merge branch 'ggml-org:master' into mradermacher
2 parents 210d1d4 + 9fdfcda commit 91b6823

7 files changed: +237 -96 lines changed

ggml/src/ggml-cpu/ggml-cpu-quants.c

Lines changed: 111 additions & 0 deletions
@@ -6596,7 +6596,118 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
     }
 
     *s = hsum_float_8(acc);
+#elif defined(__VXE__) || defined(__VXE2__)
+    uint32_t aux[3];
+    uint32_t utmp[4];
+
+    const int32x4_t v_z = vec_splat_s32(0);
+    const uint8x16_t v_3m = vec_splat_u8(0x03);
+
+    const uint8x16_t v_0c = vec_splat_u8(1);
+    const uint8x16_t v_1c = vec_sl(v_0c, 1);
+    const uint8x16_t v_2c = vec_sl(v_0c, 2);
+    const uint8x16_t v_3c = vec_sl(v_0c, 3);
+
+    uint8x16_t q3h[4];
+    uint8x16_t q3b[2];
+    int8x16_t q3bytes[4];
+    int8x16_t q8bytes[8];
+    uint8x16_t qhbits[2];
+
+    float sum = 0;
+
+    for (int i = 0; i < nb; ++i) {
+        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
 
+        const uint8_t * restrict x0l = x[i].qs;
+        const uint8_t * restrict x0h = x[i].hmask;
+        const int8_t  * restrict y0  = y[i].qs;
+
+        qhbits[0] = vec_xl(0 , x0h);
+        qhbits[1] = vec_xl(16, x0h);
+
+        int32_t isum = 0;
+
+        memcpy(aux, x[i].scales, 12);
+        utmp[3] = ((aux[1] >> 4) & kmask2) | (((aux[2] >> 6) & kmask1) << 4);
+        utmp[2] = ((aux[0] >> 4) & kmask2) | (((aux[2] >> 4) & kmask1) << 4);
+        utmp[1] = (aux[1] & kmask2) | (((aux[2] >> 2) & kmask1) << 4);
+        utmp[0] = (aux[0] & kmask2) | (((aux[2] >> 0) & kmask1) << 4);
+
+        int8_t * scale = (int8_t *)utmp;
+        for (int j = 0; j < 16; ++j) scale[j] -= 32;
+
+        for (int j = 0; j < QK_K/128; ++j) {
+            int32x4_t isum0, isum1, isum2, isum3;
+
+            q3b[0] = vec_xl(0 , x0l);
+            q3b[1] = vec_xl(16, x0l);
+            x0l += 32;
+
+            q8bytes[0] = vec_xl(0  , y0);
+            q8bytes[1] = vec_xl(16 , y0);
+            q8bytes[2] = vec_xl(32 , y0);
+            q8bytes[3] = vec_xl(48 , y0);
+            q8bytes[4] = vec_xl(64 , y0);
+            q8bytes[5] = vec_xl(80 , y0);
+            q8bytes[6] = vec_xl(96 , y0);
+            q8bytes[7] = vec_xl(112, y0);
+            y0 += 128;
+
+            q3h[0] = vec_sl(vec_andc(v_0c, qhbits[0]), 2);
+            q3h[1] = vec_sl(vec_andc(v_0c, qhbits[1]), 2);
+            q3h[2] = vec_sl(vec_andc(v_1c, qhbits[0]), 1);
+            q3h[3] = vec_sl(vec_andc(v_1c, qhbits[1]), 1);
+
+            q3bytes[0] = vec_sub((int8x16_t)vec_and(q3b[0], v_3m), (int8x16_t)q3h[0]);
+            q3bytes[1] = vec_sub((int8x16_t)vec_and(q3b[1], v_3m), (int8x16_t)q3h[1]);
+            q3bytes[2] = vec_sub((int8x16_t)vec_and(vec_sr(q3b[0], 2), v_3m), (int8x16_t)q3h[2]);
+            q3bytes[3] = vec_sub((int8x16_t)vec_and(vec_sr(q3b[1], 2), v_3m), (int8x16_t)q3h[3]);
+
+            isum0 = ggml_vec_dot(v_z, q3bytes[0], q8bytes[0]);
+            isum1 = ggml_vec_dot(v_z, q3bytes[1], q8bytes[1]);
+            isum2 = ggml_vec_dot(v_z, q3bytes[2], q8bytes[2]);
+            isum3 = ggml_vec_dot(v_z, q3bytes[3], q8bytes[3]);
+
+            isum += (isum0[0] + isum0[1] + isum0[2] + isum0[3]) * scale[0];
+            isum += (isum1[0] + isum1[1] + isum1[2] + isum1[3]) * scale[1];
+            isum += (isum2[0] + isum2[1] + isum2[2] + isum2[3]) * scale[2];
+            isum += (isum3[0] + isum3[1] + isum3[2] + isum3[3]) * scale[3];
+
+            scale += 4;
+
+            q3h[0] = vec_andc(v_2c, qhbits[0]);
+            q3h[1] = vec_andc(v_2c, qhbits[1]);
+            q3h[2] = vec_sr(vec_andc(v_3c, qhbits[0]), 1);
+            q3h[3] = vec_sr(vec_andc(v_3c, qhbits[1]), 1);
+
+            q3bytes[0] = vec_sub((int8x16_t)vec_and(vec_sr(q3b[0], 4), v_3m), (int8x16_t)q3h[0]);
+            q3bytes[1] = vec_sub((int8x16_t)vec_and(vec_sr(q3b[1], 4), v_3m), (int8x16_t)q3h[1]);
+            q3bytes[2] = vec_sub((int8x16_t)vec_and(vec_sr(q3b[0], 6), v_3m), (int8x16_t)q3h[2]);
+            q3bytes[3] = vec_sub((int8x16_t)vec_and(vec_sr(q3b[1], 6), v_3m), (int8x16_t)q3h[3]);
+
+            isum0 = ggml_vec_dot(v_z, q3bytes[0], q8bytes[4]);
+            isum1 = ggml_vec_dot(v_z, q3bytes[1], q8bytes[5]);
+            isum2 = ggml_vec_dot(v_z, q3bytes[2], q8bytes[6]);
+            isum3 = ggml_vec_dot(v_z, q3bytes[3], q8bytes[7]);
+
+            isum += (isum0[0] + isum0[1] + isum0[2] + isum0[3]) * scale[0];
+            isum += (isum1[0] + isum1[1] + isum1[2] + isum1[3]) * scale[1];
+            isum += (isum2[0] + isum2[1] + isum2[2] + isum2[3]) * scale[2];
+            isum += (isum3[0] + isum3[1] + isum3[2] + isum3[3]) * scale[3];
+
+            scale += 4;
+
+            if (j == 0) {
+                qhbits[0] = vec_sr(qhbits[0], 4);
+                qhbits[1] = vec_sr(qhbits[1], 4);
+            }
+        }
+
+        sum += d * isum;
+    }
+
+    *s = sum;
 #else
     // scalar version
     // This function is written like this so the compiler can manage to vectorize most of it
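Note: the new VXE/VXE2 branch mirrors the existing NEON q3_K kernel. Each 3-bit weight is stored as two low bits in qs plus one high bit in hmask; the vec_andc/vec_sl pairs turn a clear high bit into the constant 4 that vec_sub then removes. A scalar sketch of that per-weight decode (names here are illustrative, not from ggml):

#include <cstdint>

// One q3_K weight: two low bits plus a high bit. When the high bit is CLEAR
// the value is shifted down by 4, giving the signed range [-4, 3] -- the same
// effect the vector code gets from vec_andc(mask, qhbits) and vec_sub.
static inline int8_t decode_q3(uint8_t lo2, bool hbit_set) {
    return (int8_t)(lo2 & 0x3) - (hbit_set ? 0 : 4);
}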

ggml/src/ggml-cpu/ggml-cpu.cpp

Lines changed: 29 additions & 16 deletions
@@ -11,24 +11,26 @@
 #include <vector>
 
 #ifdef GGML_USE_CPU_HBM
-#include "ggml-cpu-hbm.h"
+#    include "ggml-cpu-hbm.h"
 #endif
 
 #ifdef GGML_USE_CPU_KLEIDIAI
-#include "kleidiai/kleidiai.h"
-#endif
-
-#if defined(__APPLE__)
-#include <sys/types.h>
-#include <sys/sysctl.h>
+#    include "kleidiai/kleidiai.h"
 #endif
 
 #if defined(_WIN32)
-#define WIN32_LEAN_AND_MEAN
-#ifndef NOMINMAX
-#define NOMINMAX
+#    define WIN32_LEAN_AND_MEAN
+#    ifndef NOMINMAX
+#        define NOMINMAX
+#    endif
+#    include <windows.h>
+#else
+#    include <unistd.h>
 #endif
-#include <windows.h>
+
+#if defined(__APPLE__)
+#    include <sys/sysctl.h>
+#    include <sys/types.h>
 #endif
 
 // ggml-backend interface

@@ -70,8 +72,10 @@ static ggml_backend_buffer_type_t * ggml_backend_cpu_device_get_extra_buffers_ty
 }
 
 static bool ggml_backend_cpu_is_extra_buffer_type(ggml_backend_buffer_type_t buft) {
-    for (auto extra : ggml_backend_cpu_get_extra_buffers_type()) {
-        if (extra && extra == buft) return true;
+    for (auto * extra : ggml_backend_cpu_get_extra_buffers_type()) {
+        if (extra && extra == buft) {
+            return true;
+        }
     }
     return false;
 }

@@ -330,9 +334,18 @@ static const char * ggml_backend_cpu_device_get_description(ggml_backend_dev_t d
 }
 
 static void ggml_backend_cpu_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
-    // TODO
-    *free = 0;
-    *total = 0;
+#ifdef _WIN32
+    MEMORYSTATUSEX status;
+    status.dwLength = sizeof(status);
+    GlobalMemoryStatusEx(&status);
+    *total = status.ullTotalPhys;
+    *free = status.ullAvailPhys;
+#else
+    long pages = sysconf(_SC_PHYS_PAGES);
+    long page_size = sysconf(_SC_PAGE_SIZE);
+    *total = pages * page_size;
+    *free = *total;
+#endif
 
     GGML_UNUSED(dev);
 }
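Note: on the POSIX side the patch reports total physical memory from sysconf and approximates "free" as "total", since sysconf alone has no portable notion of available memory. A minimal standalone version of the same probe, assuming a POSIX target:

#include <cstddef>
#include <cstdio>
#include <unistd.h>

int main() {
    long pages     = sysconf(_SC_PHYS_PAGES);
    long page_size = sysconf(_SC_PAGE_SIZE);
    size_t total   = (size_t) pages * (size_t) page_size;
    // Like the patch, treat free memory as equal to total.
    std::printf("physical memory: %zu MB\n", total / (1024 * 1024));
    return 0;
}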

ggml/src/ggml-rpc/ggml-rpc.cpp

Lines changed: 11 additions & 0 deletions
@@ -1594,6 +1594,14 @@ static void rpc_serve_client(ggml_backend_t backend, const char * cache_dir,
 void ggml_backend_rpc_start_server(ggml_backend_t backend, const char * endpoint,
                                    const char * cache_dir,
                                    size_t free_mem, size_t total_mem) {
+    printf("Starting RPC server v%d.%d.%d\n",
+        RPC_PROTO_MAJOR_VERSION,
+        RPC_PROTO_MINOR_VERSION,
+        RPC_PROTO_PATCH_VERSION);
+    printf("  endpoint       : %s\n", endpoint);
+    printf("  local cache    : %s\n", cache_dir ? cache_dir : "n/a");
+    printf("  backend memory : %zu MB\n", free_mem / (1024 * 1024));
+
     std::string host;
     int port;
     if (!parse_endpoint(endpoint, host, port)) {

@@ -1753,6 +1761,9 @@ static void * ggml_backend_rpc_get_proc_address(ggml_backend_reg_t reg, const ch
     if (std::strcmp(name, "ggml_backend_rpc_add_device") == 0) {
         return (void *)ggml_backend_rpc_add_device;
     }
+    if (std::strcmp(name, "ggml_backend_rpc_start_server") == 0) {
+        return (void *)ggml_backend_rpc_start_server;
+    }
     return NULL;
 
     GGML_UNUSED(reg);
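Note: exporting ggml_backend_rpc_start_server through get_proc_address lets a host that loads the RPC backend as a dynamic module resolve the server entry point at runtime instead of linking it directly. A hypothetical caller-side lookup; the typedef below is an assumption, mirroring the function's declared parameters:

#include "ggml-backend.h"

typedef void (*rpc_start_server_t)(ggml_backend_t backend, const char * endpoint,
                                   const char * cache_dir,
                                   size_t free_mem, size_t total_mem);

static void start_server_via_reg(ggml_backend_reg_t reg, ggml_backend_t backend,
                                 const char * endpoint, size_t free_mem, size_t total_mem) {
    // Resolve the symbol by name; returns NULL if the backend does not export it.
    auto fn = (rpc_start_server_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_rpc_start_server");
    if (fn) {
        fn(backend, endpoint, /*cache_dir=*/nullptr, free_mem, total_mem);
    }
}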

tools/CMakeLists.txt

Lines changed: 4 additions & 4 deletions
@@ -27,13 +27,13 @@ else()
     add_subdirectory(run)
     add_subdirectory(tokenize)
     add_subdirectory(tts)
+    add_subdirectory(llava)
+    if (GGML_RPC)
+        add_subdirectory(rpc)
+    endif()
     if (NOT GGML_BACKEND_DL)
         # these examples use the backends directly and cannot be built with dynamic loading
         add_subdirectory(cvector-generator)
         add_subdirectory(export-lora)
-        add_subdirectory(llava)
-        if (GGML_RPC)
-            add_subdirectory(rpc)
-        endif()
     endif()
 endif()

tools/llava/clip.cpp

Lines changed: 9 additions & 1 deletion
@@ -3382,7 +3382,15 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
         GGML_ABORT("Unknown projector type");
     }
 
-    ggml_backend_cpu_set_n_threads(ctx->backend_cpu, n_threads);
+    // ggml_backend_cpu_set_n_threads(ctx->backend_cpu, n_threads);
+    ggml_backend_dev_t dev = ggml_backend_get_device(ctx->backend_cpu);
+    ggml_backend_reg_t reg = dev ? ggml_backend_dev_backend_reg(dev) : nullptr;
+    if (reg) {
+        auto ggml_backend_set_n_threads_fn = (ggml_backend_set_n_threads_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_n_threads");
+        if (ggml_backend_set_n_threads_fn) {
+            ggml_backend_set_n_threads_fn(ctx->backend_cpu, n_threads);
+        }
+    }
 
     auto status = ggml_backend_sched_graph_compute(ctx->sched.get(), gf);
     if (status != GGML_STATUS_SUCCESS) {
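Note: replacing the direct ggml_backend_cpu_set_n_threads call with a proc-address lookup removes this hard dependency on the CPU backend API, in line with the CMake change above that moves llava out of the NOT GGML_BACKEND_DL guard. The same pattern as a reusable helper, assuming the ggml_backend_set_n_threads_t typedef from ggml-backend.h:

#include "ggml-backend.h"

static void set_backend_n_threads(ggml_backend_t backend, int n_threads) {
    ggml_backend_dev_t dev = ggml_backend_get_device(backend);
    ggml_backend_reg_t reg = dev ? ggml_backend_dev_backend_reg(dev) : nullptr;
    if (!reg) {
        return; // backend has no registry entry to query
    }
    auto fn = (ggml_backend_set_n_threads_t)
        ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_n_threads");
    if (fn) {
        fn(backend, n_threads); // silently a no-op for backends without this knob
    }
}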

tools/llava/llava.cpp

Lines changed: 5 additions & 1 deletion
@@ -2,6 +2,7 @@
 #include "llava.h"
 
 #include "llama.h"
+#include "ggml-cpp.h"
 
 #include <algorithm>
 #include <cerrno>

@@ -209,7 +210,10 @@ static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *>
     struct ggml_tensor *flatten = ggml_view_2d(model.ctx, permuted_cont, clip_n_mmproj_embd(ctx_clip), num_patches_height * num_patches_width * num_patches_per_side * num_patches_per_side, size_ele * clip_n_mmproj_embd(ctx_clip), 0);
     // ggml_tensor_printf(flatten,"flatten",__LINE__,false,false);
     ggml_build_forward_expand(gf, flatten);
-    ggml_graph_compute_with_ctx(model.ctx, gf, 1);
+
+    ggml_backend_ptr backend { ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr) };
+    ggml_backend_graph_compute(backend.get(), gf);
+
     struct ggml_tensor* result = ggml_graph_node(gf, -1);
 
     memcpy(image_embd_out, image_embd_v[0], clip_embd_nbytes(ctx_clip)); // main image as global context
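Note: the legacy ggml_graph_compute_with_ctx path is replaced by an explicitly created CPU backend; ggml_backend_ptr comes from the newly included ggml-cpp.h and frees the backend when it leaves scope. The same pattern in isolation, as a sketch (the helper name is illustrative):

#include "ggml-backend.h"
#include "ggml-cpp.h"

static enum ggml_status compute_on_cpu(struct ggml_cgraph * gf) {
    // RAII: ggml_backend_ptr calls ggml_backend_free on scope exit.
    ggml_backend_ptr backend { ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr) };
    if (!backend) {
        return GGML_STATUS_FAILED;
    }
    return ggml_backend_graph_compute(backend.get(), gf);
}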
