
Commit 3d77cd8

Reapply "Revert do not quantize activations if not necessary"

1 parent 3d34d33 · commit 3d77cd8

File tree

2 files changed: +18 -52 lines changed

ggml/include/ggml-cpu.h (0 additions & 1 deletion)

```diff
@@ -34,7 +34,6 @@ extern "C" {
     // since https://github.com/ggerganov/ggml/issues/287
     struct ggml_cplan {
         size_t    work_size; // size of work buffer, calculated by `ggml_graph_plan()`
-        size_t    q_size;
         uint8_t * work_data; // work buffer, to be allocated by caller before calling to `ggml_graph_compute()`
 
         int n_threads;
```
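
With `q_size` removed, the quantization scratch is accounted for inside `work_size`, and the caller still only has to allocate one buffer. A minimal sketch of the usual plan/compute flow against the ggml CPU API (the helper name and error handling are illustrative, not part of this commit):

```c
#include <stdint.h>
#include <stdlib.h>

#include "ggml.h"
#include "ggml-cpu.h"

// Hypothetical helper: plan a graph, allocate the single work buffer sized by
// cplan.work_size (after this change it already covers quantization scratch),
// run the graph, and free the buffer. Error handling omitted for brevity.
static enum ggml_status compute_graph(struct ggml_cgraph * graph, int n_threads) {
    struct ggml_cplan cplan = ggml_graph_plan(graph, n_threads, /*threadpool=*/NULL);

    uint8_t * work = NULL;
    if (cplan.work_size > 0) {
        work = malloc(cplan.work_size);
    }
    cplan.work_data = work;

    enum ggml_status status = ggml_graph_compute(graph, &cplan);

    free(work);
    return status;
}
```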

ggml/src/ggml-cpu/ggml-cpu.c (18 additions & 51 deletions)

```diff
@@ -1373,7 +1373,6 @@ struct ggml_compute_params {
 
     // work buffer for all threads
     size_t wsize;
-    size_t qsize;
     void * wdata;
 
     struct ggml_threadpool * threadpool;
```

```diff
@@ -7598,18 +7597,13 @@ UseGgmlGemm1:;
 #endif
 
     if (src1->type != vec_dot_type) {
-        char * wdata = (char *)params->wdata + params->wsize - params->qsize;
-
-        if (strncmp(src1->name, wdata - GGML_MAX_NAME, GGML_MAX_NAME) == 0) {
-            goto AlreadyQunatized;
-        }
-        wdata += GGML_MAX_NAME;
+        char * wdata = params->wdata;
 
         const size_t nbw1 = ggml_row_size(vec_dot_type, ne10);
         const size_t nbw2 = nbw1*ne11;
         const size_t nbw3 = nbw2*ne12;
 
-        assert(params->qsize >= ne13*nbw3);
+        assert(params->wsize >= ne13*nbw3);
         GGML_ASSERT(src1->type == GGML_TYPE_F32);
 
         for (int64_t i13 = 0; i13 < ne13; ++i13) {
@@ -7630,21 +7624,14 @@ UseGgmlGemm1:;
                 }
             }
         }
+    }
 
-        ggml_barrier(params->threadpool);
-
-        if (ith == 0) {
-            wdata -= GGML_MAX_NAME;
-            memcpy(wdata, src1->name, GGML_MAX_NAME);
-            // Every thread starts at ith, so the first unprocessed chunk is nth. This save a bit of coordination right at the start.
-            atomic_store_explicit(&params->threadpool->current_chunk, nth, memory_order_relaxed);
-        }
-
-AlreadyQunatized:;
+    if (ith == 0) {
+        // Every thread starts at ith, so the first unprocessed chunk is nth. This save a bit of coordination right at the start.
+        atomic_store_explicit(&params->threadpool->current_chunk, nth, memory_order_relaxed);
     }
 
-    const void * wdata = (src1->type == vec_dot_type) ? src1->data
-                       : (const void *)((const char *)params->wdata + params->wsize - params->qsize + GGML_MAX_NAME);
+    ggml_barrier(params->threadpool);
 
 #if GGML_USE_LLAMAFILE
     if (src1->type != vec_dot_type) {
```
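
The two hunks above drop the name-tag fast path (`strncmp` against `src1->name` plus the `AlreadyQunatized` label), so `src1` is always re-quantized into the start of `params->wdata`, with rows interleaved across threads before a single `ggml_barrier`. A tiny standalone illustration of that interleaving pattern (thread and row counts are made-up values):

```c
#include <stdio.h>

// Illustration of the row interleaving used by the quantization loop above:
// thread `ith` of `nth` converts rows ith, ith + nth, ith + 2*nth, ... of src1,
// then all threads meet at ggml_barrier() before the matmul starts.
int main(void) {
    const int nth  = 4;   // hypothetical thread count
    const int ne11 = 10;  // hypothetical number of src1 rows

    for (int ith = 0; ith < nth; ++ith) {
        printf("thread %d quantizes rows:", ith);
        for (int i11 = ith; i11 < ne11; i11 += nth) {
            printf(" %d", i11);
        }
        printf("\n");
    }
    return 0;
}
```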

```diff
@@ -7795,10 +7782,9 @@ static void ggml_compute_forward_mul_mat_id(
     const int n_ids = ids->ne[0]; // n_expert_used
     const int n_as  = ne02;       // n_expert
 
-    char * qdata = (char *)params->wdata + params->wsize - params->qsize;
-
-    char * wdata_src1_end = (src1->type == vec_dot_type) ? qdata :
-            qdata + GGML_PAD(GGML_MAX_NAME + ggml_row_size(vec_dot_type, ggml_nelements(src1)), sizeof(int64_t));
+    char * wdata_src1_end = (src1->type == vec_dot_type) ?
+            (char *) params->wdata :
+            (char *) params->wdata + GGML_PAD(ggml_row_size(vec_dot_type, ggml_nelements(src1)), sizeof(int64_t));
 
     struct mmid_row_mapping {
         int32_t i1;
@@ -7808,19 +7794,14 @@ static void ggml_compute_forward_mul_mat_id(
     int64_t * matrix_row_counts = (int64_t *) (wdata_src1_end); // [n_as]
     struct mmid_row_mapping * matrix_rows = (struct mmid_row_mapping *)(matrix_row_counts + n_as); // [n_as][ne11]
 
-    bool store_name = false;
     if (src1->type != vec_dot_type) {
-        if (strncmp(src1->name, qdata, GGML_MAX_NAME) == 0) {
-            goto QuantizationAlreadyDone;
-        }
-        store_name = true;
-        char * wdata = qdata + GGML_MAX_NAME;
+        char * wdata = params->wdata;
 
         const size_t nbw1 = ggml_row_size(vec_dot_type, ne10);
         const size_t nbw2 = nbw1*ne11;
         const size_t nbw3 = nbw2*ne12;
 
-        assert(params->qsize >= ne13*nbw3);
+        assert(params->wsize >= ne13*nbw3);
         GGML_ASSERT(src1->type == GGML_TYPE_F32);
 
         for (int64_t i13 = 0; i13 < ne13; ++i13) {
@@ -7836,12 +7817,7 @@ static void ggml_compute_forward_mul_mat_id(
 
 #define MMID_MATRIX_ROW(row_id, i1) matrix_rows[(row_id)*ne12 + (i1)]
 
-QuantizationAlreadyDone:;
     if (ith == 0) {
-        if (store_name) {
-            memcpy(qdata, src1->name, GGML_MAX_NAME);
-        }
-
         // initialize matrix_row_counts
         memset(matrix_row_counts, 0, n_as*sizeof(int64_t));
 
@@ -7870,7 +7846,7 @@ QuantizationAlreadyDone:;
 
         const char * src0_cur = (const char *) src0->data + cur_a*nb02;
 
-        const void * wdata = (src1->type == vec_dot_type) ? src1->data : qdata + GGML_MAX_NAME;
+        const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
         const size_t row_size = ggml_row_size(vec_dot_type, ne10);
 
         const int64_t nr0 = ne01; // src0 rows
```
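
`ggml_compute_forward_mul_mat_id` gets the same treatment: the quantized copy of `src1` now sits at the start of `params->wdata` (no `GGML_MAX_NAME` tag in front), followed by the int64-aligned `matrix_row_counts` and `matrix_rows` bookkeeping. A small standalone sketch of that layout arithmetic, using made-up sizes:

```c
#include <stdint.h>
#include <stdio.h>

// Same padding macro ggml uses for the alignment step in the hunks above.
#define GGML_PAD(x, n) (((x) + (n) - 1) & ~((n) - 1))

// Work-buffer layout for MUL_MAT_ID after this change:
//   [ quantized src1 | matrix_row_counts[n_as] | matrix_rows[n_as * ne_used] ]
// All sizes below are hypothetical; each matrix_rows entry is a pair of
// int32_t (i1, i2), i.e. sizeof(int64_t), matching the planner's accounting.
int main(void) {
    const size_t quantized_src1 = 4096; // stands in for ggml_row_size(vec_dot_type, ggml_nelements(src1))
    const size_t n_as           = 8;    // number of experts (hypothetical)
    const size_t ne_used        = 16;   // rows tracked per expert (hypothetical)

    const size_t off_counts = GGML_PAD(quantized_src1, sizeof(int64_t));
    const size_t off_rows   = off_counts + n_as * sizeof(int64_t);
    const size_t total      = off_rows   + n_as * ne_used * sizeof(int64_t);

    printf("matrix_row_counts at %zu, matrix_rows at %zu, total %zu bytes\n",
           off_counts, off_rows, total);
    return 0;
}
```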

```diff
@@ -13369,7 +13345,6 @@ struct ggml_cplan ggml_graph_plan(
     }
 
     size_t work_size = 0;
-    size_t q_size    = 0;
 
     struct ggml_cplan cplan;
     memset(&cplan, 0, sizeof(struct ggml_cplan));
@@ -13385,7 +13360,6 @@ struct ggml_cplan ggml_graph_plan(
         max_tasks = MAX(max_tasks, n_tasks);
 
         size_t cur = 0;
-        size_t cur_q = 0;
 
         switch (node->op) {
             case GGML_OP_CPY:
@@ -13425,7 +13399,7 @@ struct ggml_cplan ggml_graph_plan(
                     } else
 #endif
                     if (node->src[1]->type != vec_dot_type) {
-                        cur_q = ggml_row_size(vec_dot_type, ggml_nelements(node->src[1]));
+                        cur = ggml_row_size(vec_dot_type, ggml_nelements(node->src[1]));
                     }
                 } break;
             case GGML_OP_MUL_MAT_ID:
@@ -13435,12 +13409,12 @@ struct ggml_cplan ggml_graph_plan(
                     const struct ggml_tensor * src1 = node->src[1];
                     const enum ggml_type vec_dot_type = type_traits_cpu[src0->type].vec_dot_type;
                     if (src1->type != vec_dot_type) {
-                        cur_q += ggml_row_size(vec_dot_type, ggml_nelements(src1));
+                        cur += ggml_row_size(vec_dot_type, ggml_nelements(src1));
                     }
                     const int n_as = src0->ne[2];
-                    cur_q += GGML_PAD(cur, sizeof(int64_t)); // align
-                    cur_q += n_as * sizeof(int64_t); // matrix_row_counts
-                    cur_q += n_as * src1->ne[2] * sizeof(int64_t); // matrix_rows
+                    cur += GGML_PAD(cur, sizeof(int64_t)); // align
+                    cur += n_as * sizeof(int64_t); // matrix_row_counts
+                    cur += n_as * src1->ne[2] * sizeof(int64_t); // matrix_rows
                 } break;
             case GGML_OP_OUT_PROD:
                 {
@@ -13529,21 +13503,15 @@ struct ggml_cplan ggml_graph_plan(
         }
 
         work_size = MAX(work_size, cur);
-        q_size = MAX(q_size, cur_q);
     }
 
     if (work_size > 0) {
         work_size += CACHE_LINE_SIZE*(n_threads);
     }
-    if (q_size > 0) {
-        q_size += GGML_MAX_NAME;
-    }
-    work_size += q_size;
 
     cplan.threadpool = threadpool;
     cplan.n_threads  = MIN(max_tasks, n_threads);
     cplan.work_size  = work_size;
-    cplan.q_size     = q_size;
     cplan.work_data  = NULL;
 
     return cplan;
```
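
On the planning side, the separate `q_size`/`cur_q` accumulators are gone and everything flows through `cur`/`work_size`: the plan keeps the largest per-node requirement and adds only the per-thread cache-line padding on top. A toy standalone version of that accounting (all values hypothetical):

```c
#include <stddef.h>
#include <stdio.h>

#define MAX(a, b) ((a) > (b) ? (a) : (b))

// Toy version of the sizing loop above: each node reports one `cur` that now
// already includes any quantization scratch; the plan keeps the maximum and
// pads it per thread. CACHE_LINE_SIZE and the node sizes are made-up numbers.
int main(void) {
    const size_t CACHE_LINE_SIZE = 64;
    const int    n_threads       = 4;

    const size_t cur_per_node[] = { 1024, 32768, 4096 }; // hypothetical per-node scratch
    size_t work_size = 0;

    for (size_t i = 0; i < sizeof(cur_per_node)/sizeof(cur_per_node[0]); ++i) {
        work_size = MAX(work_size, cur_per_node[i]);
    }
    if (work_size > 0) {
        work_size += CACHE_LINE_SIZE * (size_t) n_threads;
    }

    printf("work_size = %zu bytes\n", work_size);
    return 0;
}
```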

```diff
@@ -13562,7 +13530,6 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
         /*.ith       =*/ state->ith,
         /*.nth       =*/ atomic_load_explicit(&tp->n_threads_cur, memory_order_relaxed),
         /*.wsize     =*/ cplan->work_size,
-        /*.qsize     =*/ cplan->q_size,
         /*.wdata     =*/ cplan->work_data,
         /*.threadpool=*/ tp,
     };
```
