
Commit aa51d1d

Merge branch 'ggerganov:master' into patch-4
2 parents: a79d81d + dd3a6ce

4 files changed: 123 additions and 56 deletions

CMakePresets.json

Lines changed: 19 additions & 15 deletions
@@ -24,11 +24,12 @@
             "CMAKE_INSTALL_RPATH": "$ORIGIN;$ORIGIN/.."
         }
     },
-    { "name": "debug", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "Debug" } },
-    { "name": "release", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "Release" } },
-    { "name": "reldbg", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "RelWithDebInfo" } },
-    { "name": "static", "hidden": true, "cacheVariables": { "GGML_STATIC": "ON" } },
-    { "name": "sycl_f16", "hidden": true, "cacheVariables": { "GGML_SYCL_F16": "ON" } },
+    { "name": "debug", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "Debug" } },
+    { "name": "release", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "Release" } },
+    { "name": "reldbg", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "RelWithDebInfo" } },
+    { "name": "static", "hidden": true, "cacheVariables": { "GGML_STATIC": "ON" } },
+    { "name": "sycl_f16", "hidden": true, "cacheVariables": { "GGML_SYCL_F16": "ON" } },
+    { "name": "vulkan", "hidden": true, "cacheVariables": { "GGML_VULKAN": "ON" } },
 
     {
         "name": "arm64-windows-msvc", "hidden": true,
@@ -57,25 +58,28 @@
         }
     },
 
-    { "name": "arm64-windows-llvm-debug" , "inherits": [ "base", "arm64-windows-llvm", "debug" ] },
-    { "name": "arm64-windows-llvm-release", "inherits": [ "base", "arm64-windows-llvm", "reldbg" ] },
-    { "name": "arm64-windows-llvm+static-release", "inherits": [ "base", "arm64-windows-llvm", "reldbg", "static" ] },
+    { "name": "arm64-windows-llvm-debug", "inherits": [ "base", "arm64-windows-llvm", "debug" ] },
+    { "name": "arm64-windows-llvm-release", "inherits": [ "base", "arm64-windows-llvm", "reldbg" ] },
+    { "name": "arm64-windows-llvm+static-release", "inherits": [ "base", "arm64-windows-llvm", "reldbg", "static" ] },
 
-    { "name": "arm64-apple-clang-debug" , "inherits": [ "base", "arm64-apple-clang", "debug" ] },
-    { "name": "arm64-apple-clang-release" , "inherits": [ "base", "arm64-apple-clang", "reldbg" ] },
-    { "name": "arm64-apple-clang+static-release" , "inherits": [ "base", "arm64-apple-clang", "reldbg", "static" ] },
+    { "name": "arm64-apple-clang-debug", "inherits": [ "base", "arm64-apple-clang", "debug" ] },
+    { "name": "arm64-apple-clang-release", "inherits": [ "base", "arm64-apple-clang", "reldbg" ] },
+    { "name": "arm64-apple-clang+static-release", "inherits": [ "base", "arm64-apple-clang", "reldbg", "static" ] },
 
-    { "name": "arm64-windows-msvc-debug" , "inherits": [ "base", "arm64-windows-msvc", "debug" ] },
+    { "name": "arm64-windows-msvc-debug", "inherits": [ "base", "arm64-windows-msvc", "debug" ] },
     { "name": "arm64-windows-msvc-release", "inherits": [ "base", "arm64-windows-msvc", "reldbg" ] },
     { "name": "arm64-windows-msvc+static-release", "inherits": [ "base", "arm64-windows-msvc", "reldbg", "static" ] },
 
-    { "name": "x64-windows-msvc-debug" , "inherits": [ "base", "debug" ] },
+    { "name": "x64-windows-msvc-debug", "inherits": [ "base", "debug" ] },
     { "name": "x64-windows-msvc-release", "inherits": [ "base", "reldbg" ] },
     { "name": "x64-windows-msvc+static-release", "inherits": [ "base", "reldbg", "static" ] },
 
-    { "name": "x64-windows-sycl-debug" , "inherits": [ "sycl-base", "debug" ] },
+    { "name": "x64-windows-sycl-debug", "inherits": [ "sycl-base", "debug" ] },
     { "name": "x64-windows-sycl-debug-f16", "inherits": [ "sycl-base", "debug", "sycl_f16" ] },
     { "name": "x64-windows-sycl-release", "inherits": [ "sycl-base", "release" ] },
-    { "name": "x64-windows-sycl-release-f16", "inherits": [ "sycl-base", "release", "sycl_f16" ] }
+    { "name": "x64-windows-sycl-release-f16", "inherits": [ "sycl-base", "release", "sycl_f16" ] },
+
+    { "name": "x64-windows-vulkan-debug", "inherits": [ "base", "vulkan", "debug" ] },
+    { "name": "x64-windows-vulkan-release", "inherits": [ "base", "vulkan", "release" ] }
     ]
 }
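The substantive change here is the new hidden "vulkan" preset (setting GGML_VULKAN=ON) plus the two x64-windows-vulkan-* configure presets that inherit from it; the other re-listed preset lines appear to differ only in column alignment, which the flattened rendering above does not preserve. Assuming the Vulkan SDK is installed and discoverable by CMake, a Vulkan-enabled Windows build would presumably be configured with `cmake --preset x64-windows-vulkan-release` and then built from the binary directory that the base preset defines.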

ggml/src/ggml-aarch64.c

Lines changed: 43 additions & 14 deletions
@@ -8,19 +8,42 @@
 
 #define UNUSED GGML_UNUSED
 
-static block_q4_0x4 make_block_q4_0x4(block_q4_0 * in, unsigned int blck_size_interleave, unsigned int xor_mask) {
+static block_q4_0x4 make_block_q4_0x4(block_q4_0 * in, unsigned int blck_size_interleave) {
     block_q4_0x4 out;
 
     for (int i = 0; i < 4; i++) {
         out.d[i] = in[i].d;
     }
 
-    for (int i = 0; i < QK4_0 * 2; i++) {
-        int src_offset = (i / (4 * blck_size_interleave)) * blck_size_interleave;
-        int src_id = (i % (4 * blck_size_interleave)) / blck_size_interleave;
-        src_offset += (i % blck_size_interleave);
+    const int end = QK4_0 * 2 / blck_size_interleave;
 
-        out.qs[i] = in[src_id].qs[src_offset] ^ xor_mask;
+    if (blck_size_interleave == 8) {
+        const uint64_t xor_mask = 0x8888888888888888ULL;
+        for (int i = 0; i < end; ++i) {
+            int src_id = i % 4;
+            int src_offset = (i / 4) * blck_size_interleave;
+            int dst_offset = i * blck_size_interleave;
+
+            uint64_t elems;
+            // Using memcpy to avoid unaligned memory accesses
+            memcpy(&elems, &in[src_id].qs[src_offset], sizeof(uint64_t));
+            elems ^= xor_mask;
+            memcpy(&out.qs[dst_offset], &elems, sizeof(uint64_t));
+        }
+    } else if (blck_size_interleave == 4) {
+        const uint32_t xor_mask = 0x88888888;
+        for (int i = 0; i < end; ++i) {
+            int src_id = i % 4;
+            int src_offset = (i / 4) * blck_size_interleave;
+            int dst_offset = i * blck_size_interleave;
+
+            uint32_t elems;
+            memcpy(&elems, &in[src_id].qs[src_offset], sizeof(uint32_t));
+            elems ^= xor_mask;
+            memcpy(&out.qs[dst_offset], &elems, sizeof(uint32_t));
+        }
+    } else {
+        GGML_ASSERT(false);
     }
 
     return out;
@@ -30,19 +53,25 @@ static block_q4_0x4 make_block_q4_0x4(block_q4_0 * in, unsigned int blck_size_in
 // returns an interleaved block_q4_0x8
 // in the interleaved block_q4_0x8, place deltas for 8 block_q4_0 blocks
 // first, then interleave quants from 8 block_q4_0s in blocks of blck_size_interleave
-static block_q4_0x8 make_block_q4_0x8(block_q4_0 * in, unsigned int blck_size_interleave, unsigned int xor_mask) {
+static block_q4_0x8 make_block_q4_0x8(block_q4_0 * in, unsigned int blck_size_interleave) {
     block_q4_0x8 out;
 
     for (int i = 0; i < 8; i++) {
         out.d[i] = in[i].d;
     }
 
-    for (int i = 0; i < QK4_0 * 4; i++) {
-        int src_offset = (i / (8 * blck_size_interleave)) * blck_size_interleave;
-        int src_id = (i % (8 * blck_size_interleave)) / blck_size_interleave;
-        src_offset += (i % blck_size_interleave);
+    const int end = QK4_0 * 4 / blck_size_interleave;
+    const uint64_t xor_mask = 0x8888888888888888ULL;
+
+    for (int i = 0; i < end; ++i) {
+        int src_id = i % 8;
+        int src_offset = (i / 8) * blck_size_interleave;
+        int dst_offset = i * blck_size_interleave;
 
-        out.qs[i] = in[src_id].qs[src_offset] ^ xor_mask;
+        uint64_t elems;
+        memcpy(&elems, &in[src_id].qs[src_offset], sizeof(uint64_t));
+        elems ^= xor_mask;
+        memcpy(&out.qs[dst_offset], &elems, sizeof(uint64_t));
     }
 
     return out;
@@ -71,11 +100,11 @@ static size_t quantize_q4_0_nr_bl(const float * restrict src, void * restrict ds
         }
 
         if (nrows_interleaved == 8) {
-            *(block_q4_0x8 *) out_ptr = make_block_q4_0x8(dst_tmp, blck_size_interleave, 0x88);
+            *(block_q4_0x8 *) out_ptr = make_block_q4_0x8(dst_tmp, blck_size_interleave);
             out_ptr = (block_q4_0x8 *) out_ptr + 1;
         }
         else if (nrows_interleaved == 4) {
-            *(block_q4_0x4 *) out_ptr = make_block_q4_0x4(dst_tmp, blck_size_interleave, 0x88);
+            *(block_q4_0x4 *) out_ptr = make_block_q4_0x4(dst_tmp, blck_size_interleave);
             out_ptr = (block_q4_0x4 *) out_ptr + 1;
         }
     }
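The old per-byte loop XORed each quant byte with a scalar xor_mask; the new version loads 8 (or 4) bytes at a time, applies the repeated 0x88 pattern with a single wide XOR, and goes through memcpy for the loads and stores so the code stays well-defined even when the quant buffers are not naturally aligned. A minimal standalone sketch of that trick, separate from the actual block_q4_0 layout (the function name and sizes here are illustrative only, not part of ggml):

#include <stdint.h>
#include <string.h>
#include <stdio.h>

// XOR n bytes (n must be a multiple of 8) with the byte pattern 0x88,
// processing 8 bytes per iteration. memcpy keeps the loads/stores legal
// for unaligned pointers; compilers lower it to plain word moves.
static void xor_0x88_wide(uint8_t * dst, const uint8_t * src, int n) {
    const uint64_t xor_mask = 0x8888888888888888ULL;
    for (int i = 0; i < n; i += 8) {
        uint64_t elems;
        memcpy(&elems, src + i, sizeof(elems));  // unaligned-safe load
        elems ^= xor_mask;                       // flip 0x88 in all 8 bytes at once
        memcpy(dst + i, &elems, sizeof(elems));  // unaligned-safe store
    }
}

int main(void) {
    uint8_t in[16], out[16];
    for (int i = 0; i < 16; i++) in[i] = (uint8_t) i;
    xor_0x88_wide(out, in, 16);
    for (int i = 0; i < 16; i++) printf("%02x ", out[i]); // prints i ^ 0x88 for each byte
    printf("\n");
    return 0;
}

The 8-byte and 4-byte paths in the patch are the same idea specialized to blck_size_interleave values of 8 and 4, with src_id stepping round-robin through the four (or eight) input blocks being interleaved.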

ggml/src/ggml-cpu/ggml-cpu-aarch64.c

Lines changed: 43 additions & 14 deletions
@@ -3387,19 +3387,42 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
 }
 
 // FIXME: this code is duplicated from ggml-aarch64.c
-static block_q4_0x4 make_block_q4_0x4(block_q4_0 * in, unsigned int blck_size_interleave, unsigned int xor_mask) {
+static block_q4_0x4 make_block_q4_0x4(block_q4_0 * in, unsigned int blck_size_interleave) {
     block_q4_0x4 out;
 
     for (int i = 0; i < 4; i++) {
         out.d[i] = in[i].d;
     }
 
-    for (int i = 0; i < QK4_0 * 2; i++) {
-        int src_offset = (i / (4 * blck_size_interleave)) * blck_size_interleave;
-        int src_id = (i % (4 * blck_size_interleave)) / blck_size_interleave;
-        src_offset += (i % blck_size_interleave);
+    const int end = QK4_0 * 2 / blck_size_interleave;
 
-        out.qs[i] = in[src_id].qs[src_offset] ^ xor_mask;
+    if (blck_size_interleave == 8) {
+        const uint64_t xor_mask = 0x8888888888888888ULL;
+        for (int i = 0; i < end; ++i) {
+            int src_id = i % 4;
+            int src_offset = (i / 4) * blck_size_interleave;
+            int dst_offset = i * blck_size_interleave;
+
+            uint64_t elems;
+            // Using memcpy to avoid unaligned memory accesses
+            memcpy(&elems, &in[src_id].qs[src_offset], sizeof(uint64_t));
+            elems ^= xor_mask;
+            memcpy(&out.qs[dst_offset], &elems, sizeof(uint64_t));
+        }
+    } else if (blck_size_interleave == 4) {
+        const uint32_t xor_mask = 0x88888888;
+        for (int i = 0; i < end; ++i) {
+            int src_id = i % 4;
+            int src_offset = (i / 4) * blck_size_interleave;
+            int dst_offset = i * blck_size_interleave;
+
+            uint32_t elems;
+            memcpy(&elems, &in[src_id].qs[src_offset], sizeof(uint32_t));
+            elems ^= xor_mask;
+            memcpy(&out.qs[dst_offset], &elems, sizeof(uint32_t));
+        }
+    } else {
+        GGML_ASSERT(false);
     }
 
     return out;
@@ -3409,19 +3432,25 @@ static block_q4_0x4 make_block_q4_0x4(block_q4_0 * in, unsigned int blck_size_in
 // returns an interleaved block_q4_0x8
 // in the interleaved block_q4_0x8, place deltas for 8 block_q4_0 blocks
 // first, then interleave quants from 8 block_q4_0s in blocks of blck_size_interleave
-static block_q4_0x8 make_block_q4_0x8(block_q4_0 * in, unsigned int blck_size_interleave, unsigned int xor_mask) {
+static block_q4_0x8 make_block_q4_0x8(block_q4_0 * in, unsigned int blck_size_interleave) {
     block_q4_0x8 out;
 
     for (int i = 0; i < 8; i++) {
         out.d[i] = in[i].d;
     }
 
-    for (int i = 0; i < QK4_0 * 4; i++) {
-        int src_offset = (i / (8 * blck_size_interleave)) * blck_size_interleave;
-        int src_id = (i % (8 * blck_size_interleave)) / blck_size_interleave;
-        src_offset += (i % blck_size_interleave);
+    const int end = QK4_0 * 4 / blck_size_interleave;
+    const uint64_t xor_mask = 0x8888888888888888ULL;
+
+    for (int i = 0; i < end; ++i) {
+        int src_id = i % 8;
+        int src_offset = (i / 8) * blck_size_interleave;
+        int dst_offset = i * blck_size_interleave;
 
-        out.qs[i] = in[src_id].qs[src_offset] ^ xor_mask;
+        uint64_t elems;
+        memcpy(&elems, &in[src_id].qs[src_offset], sizeof(uint64_t));
+        elems ^= xor_mask;
+        memcpy(&out.qs[dst_offset], &elems, sizeof(uint64_t));
     }
 
     return out;
@@ -3449,7 +3478,7 @@ static int repack_q4_0_to_q4_0_4_bl(struct ggml_tensor * t, int interleave_block
             for (int i = 0; i < nrows_interleaved; i++) {
                 dst_tmp[i] = src[x + i * nblocks];
             }
-            *dst++ = make_block_q4_0x4(dst_tmp, interleave_block, 0x88);
+            *dst++ = make_block_q4_0x4(dst_tmp, interleave_block);
         }
         src += nrows_interleaved * nblocks;
     }
@@ -3480,7 +3509,7 @@ static int repack_q4_0_to_q4_0_8_bl(struct ggml_tensor *t, int interleave_block,
             for (int i = 0; i < nrows_interleaved; i++ ) {
                 dst_tmp[i] = src[x + i * nblocks];
             }
-            *dst++ = make_block_q4_0x8(dst_tmp, interleave_block, 0x88);
+            *dst++ = make_block_q4_0x8(dst_tmp, interleave_block);
         }
         src += nrows_interleaved * nblocks;
     }

src/llama.cpp

Lines changed: 18 additions & 13 deletions
@@ -2907,9 +2907,15 @@ struct llama_model {
     // for quantize-stats only
     std::vector<std::pair<std::string, struct ggml_tensor *>> tensors_by_name;
 
-    int64_t t_load_us = 0;
+    int64_t t_load_us = 0;
     int64_t t_start_us = 0;
 
+    // total number of parameters in the model
+    uint64_t n_elements = 0;
+
+    // total size of all the tensors in the model in bytes
+    size_t n_bytes = 0;
+
     // keep track of loaded lora adapters
     std::set<struct llama_lora_adapter *> lora_adapters;
 
@@ -4275,8 +4281,8 @@ struct llama_model_loader {
     int n_tensors = 0;
     int n_created = 0;
 
-    int64_t n_elements = 0;
-    size_t n_bytes = 0;
+    uint64_t n_elements = 0;
+    size_t n_bytes = 0;
 
     bool use_mmap = false;
     bool check_tensors;
@@ -5344,6 +5350,11 @@ static const char * llama_model_vocab_type_name(enum llama_vocab_type type){
     }
 }
 
+static void llm_load_stats(llama_model_loader & ml, llama_model & model) {
+    model.n_elements = ml.n_elements;
+    model.n_bytes = ml.n_bytes;
+}
+
 static void llm_load_arch(llama_model_loader & ml, llama_model & model) {
     model.arch = ml.get_arch();
     if (model.arch == LLM_ARCH_UNKNOWN) {
@@ -9256,6 +9267,7 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam
         throw std::runtime_error("error loading model vocabulary: " + std::string(e.what()));
     }
 
+    llm_load_stats(ml, model);
     llm_load_print_meta(ml, model);
 
     if (model.vocab.type != LLAMA_VOCAB_TYPE_NONE &&
@@ -18601,6 +18613,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     llama_model model;
     llm_load_arch(ml, model);
     llm_load_hparams(ml, model);
+    llm_load_stats(ml, model);
 
     struct quantize_state_internal qs(model, params);
 
@@ -19953,19 +19966,11 @@ int32_t llama_model_desc(const struct llama_model * model, char * buf, size_t bu
 }
 
 uint64_t llama_model_size(const struct llama_model * model) {
-    uint64_t size = 0;
-    for (const auto & it : model->tensors_by_name) {
-        size += ggml_nbytes(it.second);
-    }
-    return size;
+    return model->n_bytes;
 }
 
 uint64_t llama_model_n_params(const struct llama_model * model) {
-    uint64_t nparams = 0;
-    for (const auto & it : model->tensors_by_name) {
-        nparams += ggml_nelements(it.second);
-    }
-    return nparams;
+    return model->n_elements;
 }
 
 struct ggml_tensor * llama_get_model_tensor(struct llama_model * model, const char * name) {
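Taken together, the llama.cpp changes cache totals the model loader already computes: the new llm_load_stats() copies the loader's n_elements and n_bytes into llama_model, it is called both in the normal load path and in llama_model_quantize_internal, and llama_model_size() / llama_model_n_params() now return the stored values instead of re-summing ggml_nbytes / ggml_nelements over tensors_by_name on every call. Presumably this also makes the counts available during quantization, where the old per-tensor loops would not have had the full tensor list to walk; that reading is inferred from where the new call is placed rather than stated in the commit.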
