
Commit 6d7b58e

ikawrakow and Iwan Kawrakow authored
Revert #79 (#192)
* Revert "Do not quantize activations if not necessary (#79)"

  This reverts commit 0bf4d99.

* Fixed compilation after revert

---------

Co-authored-by: Iwan Kawrakow <[email protected]>
1 parent 4601a8c commit 6d7b58e

File tree

2 files changed: +16 −48 lines


ggml/include/ggml.h

Lines changed: 0 additions & 1 deletion
@@ -724,7 +724,6 @@ extern "C" {
     // since https://github.com/ggerganov/ggml/issues/287
     struct ggml_cplan {
         size_t    work_size; // size of work buffer, calculated by `ggml_graph_plan()`
-        size_t    q_size;
         uint8_t * work_data; // work buffer, to be allocated by caller before calling to `ggml_graph_compute()`

         int n_threads;
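For callers nothing changes structurally: a plan still only exposes work_size and work_data, and the buffer is allocated by the caller between planning and compute. A minimal usage sketch (the malloc-based allocation and the run_graph helper are illustrative, not part of this commit):

#include <stdlib.h>
#include "ggml.h"

// Sketch only: plan a graph, allocate the work buffer the plan asks for,
// then run the graph on n_threads threads. The buffer is caller-owned.
void run_graph(struct ggml_cgraph * graph, int n_threads) {
    struct ggml_cplan cplan = ggml_graph_plan(graph, n_threads);
    if (cplan.work_size > 0) {
        cplan.work_data = malloc(cplan.work_size);
    }
    ggml_graph_compute(graph, &cplan);
    free(cplan.work_data);
}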

ggml/src/ggml.c

Lines changed: 16 additions & 47 deletions
@@ -2586,7 +2586,6 @@ struct ggml_compute_params {

     // work buffer for all threads
     size_t wsize;
-    size_t qsize;
     void * wdata;

     struct ggml_compute_state_shared * shared;
@@ -13940,7 +13939,7 @@ static void ggml_compute_forward_mul_mat_one_chunk(
         return;
     }

-    const void * wdata = (src1->type == vec_dot_type) ? src1->data : (char *)params->wdata + params->wsize - params->qsize + GGML_MAX_NAME;
+    const void * wdata = (src1->type == vec_dot_type) ? src1->data : (char *)params->wdata;
     const size_t row_size = ggml_row_size(vec_dot_type, ne10);

     assert(ne12 % ne02 == 0);
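The two arms of that change correspond to two work-buffer layouts: #79 placed the quantized copy of src1 at the end of the work buffer, behind a GGML_MAX_NAME header caching the name of the tensor last quantized there, while the reverted code puts it back at the start of wdata. A rough picture, as a comment (illustrative only, not part of the diff):

/*
 * With #79 (now reverted):
 *   params->wdata: [ generic scratch .......... ][ src1->name ][ quantized src1 ]
 *                                                 ^ wdata + wsize - qsize
 * After the revert:
 *   params->wdata: [ quantized src1 ][ remaining scratch ........................ ]
 *                   ^ wdata
 */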
@@ -14110,12 +14109,7 @@ UseGgmlGemm1:;
 #endif

     if (src1->type != vec_dot_type) {
-        char * wdata = (char *)params->wdata + params->wsize - params->qsize;
-
-        if (strncmp(src1->name, wdata - GGML_MAX_NAME, GGML_MAX_NAME) == 0) {
-            goto AlreadyQuantized;
-        }
-        wdata += GGML_MAX_NAME;
+        char * wdata = params->wdata;

 #if IK_PRINT_TIMING
         int64_t t1 = ggml_time_us();
@@ -14125,7 +14119,7 @@ UseGgmlGemm1:;
         const size_t nbw2 = nbw1*ne11;
         const size_t nbw3 = nbw2*ne12;

-        assert(params->qsize >= ne13*nbw3);
+        assert(params->wsize >= ne13*nbw3);
         GGML_ASSERT(src1->type == GGML_TYPE_F32);

         for (int64_t i13 = 0; i13 < ne13; ++i13) {
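The strides and the assert above belong to the quantization loop that follows unchanged in the source: each thread converts its share of src1 rows from F32 into vec_dot_type directly inside the work buffer. A sketch of that loop, assuming the usual per-row from_float converter of type_traits[vec_dot_type] and the standard nb11/nb12/nb13 strides of src1 (names taken from the surrounding function, not from this diff):

// Sketch only: thread ith quantizes rows i11 = ith, ith+nth, ... of src1
// into wdata using the row strides computed above.
for (int64_t i13 = 0; i13 < ne13; ++i13) {
    for (int64_t i12 = 0; i12 < ne12; ++i12) {
        for (int64_t i11 = ith; i11 < ne11; i11 += nth) {
            from_float((float *)((char *)src1->data + i13*nb13 + i12*nb12 + i11*nb11), // F32 source row
                       (void *)(wdata + i13*nbw3 + i12*nbw2 + i11*nbw1),               // quantized destination row
                       ne10);
        }
    }
}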
@@ -14157,17 +14151,14 @@ UseGgmlGemm1:;
 #endif

         if (ith == 0) {
-            wdata -= GGML_MAX_NAME;
-            memcpy(wdata, src1->name, GGML_MAX_NAME);
             // Every thread starts at ith, so the first unprocessed chunk is nth. This save a bit of coordination right at the start.
             //atomic_store(&params->shared->current_chunk, nth);
         }

-AlreadyQuantized:;
+        ggml_barrier(params->shared);
     }

-    const void * wdata = (src1->type == vec_dot_type) ? src1->data
-                       : (const void *)((const char *)params->wdata + params->wsize - params->qsize + GGML_MAX_NAME);
+    const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;

 #if GGML_USE_IQK_MULMAT
     if (src1->type != vec_dot_type && dst->type == GGML_TYPE_F32) {
@@ -14354,10 +14345,9 @@ static void ggml_compute_forward_mul_mat_id(
     const int n_ids = ids->ne[0]; // n_expert_used
     const int n_as  = ne02;       // n_expert

-    char * qdata = (char *)params->wdata + params->wsize - params->qsize;
-
-    char * wdata_src1_end = (src1->type == vec_dot_type) ? qdata :
-            qdata + GGML_PAD(GGML_MAX_NAME + ggml_row_size(vec_dot_type, ggml_nelements(src1)), sizeof(int64_t));
+    char * wdata_src1_end = (src1->type == vec_dot_type) ?
+            (char *) params->wdata :
+            (char *) params->wdata + GGML_PAD(ggml_row_size(vec_dot_type, ggml_nelements(src1)), sizeof(int64_t));

     struct mmid_row_mapping {
         int32_t i1;
@@ -14367,19 +14357,14 @@ static void ggml_compute_forward_mul_mat_id(
     int64_t * matrix_row_counts = (int64_t *) (wdata_src1_end); // [n_as]
     struct mmid_row_mapping * matrix_rows = (struct mmid_row_mapping *)(matrix_row_counts + n_as); // [n_as][ne11]

-    bool store_name = false;
     if (src1->type != vec_dot_type) {
-        if (strncmp(src1->name, qdata, GGML_MAX_NAME) == 0) {
-            goto QuantizationAlreadyDone;
-        }
-        store_name = true;
-        char * wdata = qdata + GGML_MAX_NAME;
+        char * wdata = params->wdata;

         const size_t nbw1 = ggml_row_size(vec_dot_type, ne10);
         const size_t nbw2 = nbw1*ne11;
         const size_t nbw3 = nbw2*ne12;

-        assert(params->qsize >= ne13*nbw3);
+        assert(params->wsize >= ne13*nbw3);
         GGML_ASSERT(src1->type == GGML_TYPE_F32);

         for (int64_t i13 = 0; i13 < ne13; ++i13) {
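For MUL_MAT_ID the revert restores the pre-#79 buffer layout: the quantized copy of src1 comes first, followed by the expert bookkeeping arrays that wdata_src1_end points at. Roughly, as a comment (illustrative only, not part of the diff):

/*
 * params->wdata:
 *   [ quantized src1 : ggml_row_size(vec_dot_type, ggml_nelements(src1)) bytes, padded to int64_t ]
 *   [ matrix_row_counts : int64_t[n_as]                                                           ]
 *   [ matrix_rows       : struct mmid_row_mapping[n_as][ne11]                                     ]
 * With #79 this block additionally began with a GGML_MAX_NAME name header and lived at
 * params->wdata + params->wsize - params->qsize instead of at params->wdata.
 */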
@@ -14395,12 +14380,7 @@ static void ggml_compute_forward_mul_mat_id(

 #define MMID_MATRIX_ROW(row_id, i1) matrix_rows[(row_id)*ne12 + (i1)]

-QuantizationAlreadyDone:;
     if (ith == 0) {
-        if (store_name) {
-            memcpy(qdata, src1->name, GGML_MAX_NAME);
-        }
-
         // initialize matrix_row_counts
         memset(matrix_row_counts, 0, n_as*sizeof(int64_t));

@@ -14429,7 +14409,7 @@ QuantizationAlreadyDone:;

         const char * src0_cur = (const char *) src0->data + cur_a*nb02;

-        const void * wdata = (src1->type == vec_dot_type) ? src1->data : qdata + GGML_MAX_NAME;
+        const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
         const size_t row_size = ggml_row_size(vec_dot_type, ne10);

         const int64_t nr0 = ne01; // src0 rows
@@ -21017,7 +20997,6 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threads) {
     }

     size_t work_size = 0;
-    size_t q_size = 0;

     struct ggml_cplan cplan;
     memset(&cplan, 0, sizeof(struct ggml_cplan));
@@ -21033,7 +21012,6 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threads) {
         max_tasks = MAX(max_tasks, n_tasks);

         size_t cur = 0;
-        size_t cur_q = 0;

         switch (node->op) {
             case GGML_OP_CPY:
@@ -21064,8 +21042,7 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threads) {
                     const enum ggml_type vec_dot_type = type_traits[node->src[0]->type].vec_dot_type;

                     if (node->src[1]->type != vec_dot_type) {
-                        cur_q = ggml_row_size(vec_dot_type, node->src[1]->ne[0]) * ggml_nrows(node->src[1]);
-                        //cur_q = ggml_row_size(vec_dot_type, ggml_nelements(node->src[1]));
+                        cur = ggml_row_size(vec_dot_type, node->src[1]->ne[0]) * ggml_nrows(node->src[1]);
                     }
                 } break;
             case GGML_OP_MUL_MAT_ID:
case GGML_OP_MUL_MAT_ID:
@@ -21075,13 +21052,12 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa
2107521052
const struct ggml_tensor * src1 = node->src[1];
2107621053
const enum ggml_type vec_dot_type = type_traits[src0->type].vec_dot_type;
2107721054
if (src1->type != vec_dot_type) {
21078-
cur_q += ggml_row_size(vec_dot_type, node->src[1]->ne[0]) * ggml_nrows(node->src[1]);
21079-
//cur_q += ggml_row_size(vec_dot_type, ggml_nelements(src1));
21055+
cur += ggml_row_size(vec_dot_type, node->src[1]->ne[0]) * ggml_nrows(node->src[1]);
2108021056
}
2108121057
const int n_as = src0->ne[2];
21082-
cur_q += GGML_PAD(cur, sizeof(int64_t)); // align
21083-
cur_q += n_as * sizeof(int64_t); // matrix_row_counts
21084-
cur_q += n_as * src1->ne[2] * sizeof(int64_t); // matrix_rows
21058+
cur += GGML_PAD(cur, sizeof(int64_t)); // align
21059+
cur += n_as * sizeof(int64_t); // matrix_row_counts
21060+
cur += n_as * src1->ne[2] * sizeof(int64_t); // matrix_rows
2108521061
} break;
2108621062
case GGML_OP_OUT_PROD:
2108721063
{
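As a worked example of this sizing, with hypothetical tensor sizes that are not taken from the commit: an F32 src1 with ne10 = 4096 columns and 32 rows quantized to Q8_0, whose 34-byte blocks hold 32 values each. After the revert this amount is accumulated into cur and hence into work_size, rather than into the separate q_size budget that #79 maintained.

#include <stddef.h>
#include <stdio.h>

int main(void) {
    // Hypothetical sizes: ne10 = 4096 columns, 32 rows of src1, vec_dot_type = Q8_0.
    const size_t ne10     = 4096;
    const size_t nrows    = 32;
    const size_t row_size = ne10 / 32 * 34;   // one quantized Q8_0 row: 4352 bytes
    const size_t cur      = row_size * nrows; // quantized copy of src1: 139264 bytes
    printf("row_size = %zu bytes, cur = %zu bytes\n", row_size, cur);
    return 0;
}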
@@ -21170,20 +21146,14 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threads) {
         }

         work_size = MAX(work_size, cur);
-        q_size = MAX(q_size, cur_q);
     }

     if (work_size > 0) {
         work_size += CACHE_LINE_SIZE*(n_threads - 1);
     }
-    if (q_size > 0) {
-        q_size += GGML_MAX_NAME;
-    }
-    work_size += q_size;

     cplan.n_threads = MIN(max_tasks, n_threads);
     cplan.work_size = work_size;
-    cplan.q_size    = q_size;
     cplan.work_data = NULL;

     return cplan;
@@ -21201,7 +21171,6 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
             /*.ith   =*/ state->ith,
             /*.nth   =*/ state->shared->n_threads,
             /*.wsize =*/ cplan->work_size,
-            /*.qsize =*/ cplan->q_size,
             /*.wdata =*/ cplan->work_data,
             /*.shared=*/ state->shared,
         };

0 commit comments