Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion ggml/include/ggml.h
Original file line number Diff line number Diff line change
Expand Up @@ -724,7 +724,6 @@ extern "C" {
// since https://github.com/ggerganov/ggml/issues/287
struct ggml_cplan {
size_t work_size; // size of work buffer, calculated by `ggml_graph_plan()`
size_t q_size;
uint8_t * work_data; // work buffer, to be allocated by caller before calling to `ggml_graph_compute()`

int n_threads;
Expand Down
63 changes: 16 additions & 47 deletions ggml/src/ggml.c
Original file line number Diff line number Diff line change
Expand Up @@ -2586,7 +2586,6 @@ struct ggml_compute_params {

// work buffer for all threads
size_t wsize;
size_t qsize;
void * wdata;

struct ggml_compute_state_shared * shared;
Expand Down Expand Up @@ -13940,7 +13939,7 @@ static void ggml_compute_forward_mul_mat_one_chunk(
return;
}

const void * wdata = (src1->type == vec_dot_type) ? src1->data : (char *)params->wdata + params->wsize - params->qsize + GGML_MAX_NAME;
const void * wdata = (src1->type == vec_dot_type) ? src1->data : (char *)params->wdata;
const size_t row_size = ggml_row_size(vec_dot_type, ne10);

assert(ne12 % ne02 == 0);
Expand Down Expand Up @@ -14110,12 +14109,7 @@ UseGgmlGemm1:;
#endif

if (src1->type != vec_dot_type) {
char * wdata = (char *)params->wdata + params->wsize - params->qsize;

if (strncmp(src1->name, wdata - GGML_MAX_NAME, GGML_MAX_NAME) == 0) {
goto AlreadyQuantized;
}
wdata += GGML_MAX_NAME;
char * wdata = params->wdata;

#if IK_PRINT_TIMING
int64_t t1 = ggml_time_us();
Expand All @@ -14125,7 +14119,7 @@ UseGgmlGemm1:;
const size_t nbw2 = nbw1*ne11;
const size_t nbw3 = nbw2*ne12;

assert(params->qsize >= ne13*nbw3);
assert(params->wsize >= ne13*nbw3);
GGML_ASSERT(src1->type == GGML_TYPE_F32);

for (int64_t i13 = 0; i13 < ne13; ++i13) {
Expand Down Expand Up @@ -14157,17 +14151,14 @@ UseGgmlGemm1:;
#endif

if (ith == 0) {
wdata -= GGML_MAX_NAME;
memcpy(wdata, src1->name, GGML_MAX_NAME);
// Every thread starts at ith, so the first unprocessed chunk is nth. This save a bit of coordination right at the start.
//atomic_store(&params->shared->current_chunk, nth);
}

AlreadyQuantized:;
ggml_barrier(params->shared);
}

const void * wdata = (src1->type == vec_dot_type) ? src1->data
: (const void *)((const char *)params->wdata + params->wsize - params->qsize + GGML_MAX_NAME);
const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;

#if GGML_USE_IQK_MULMAT
if (src1->type != vec_dot_type && dst->type == GGML_TYPE_F32) {
Expand Down Expand Up @@ -14354,10 +14345,9 @@ static void ggml_compute_forward_mul_mat_id(
const int n_ids = ids->ne[0]; // n_expert_used
const int n_as = ne02; // n_expert

char * qdata = (char *)params->wdata + params->wsize - params->qsize;

char * wdata_src1_end = (src1->type == vec_dot_type) ? qdata :
qdata + GGML_PAD(GGML_MAX_NAME + ggml_row_size(vec_dot_type, ggml_nelements(src1)), sizeof(int64_t));
char * wdata_src1_end = (src1->type == vec_dot_type) ?
(char *) params->wdata :
(char *) params->wdata + GGML_PAD(ggml_row_size(vec_dot_type, ggml_nelements(src1)), sizeof(int64_t));

struct mmid_row_mapping {
int32_t i1;
Expand All @@ -14367,19 +14357,14 @@ static void ggml_compute_forward_mul_mat_id(
int64_t * matrix_row_counts = (int64_t *) (wdata_src1_end); // [n_as]
struct mmid_row_mapping * matrix_rows = (struct mmid_row_mapping *)(matrix_row_counts + n_as); // [n_as][ne11]

bool store_name = false;
if (src1->type != vec_dot_type) {
if (strncmp(src1->name, qdata, GGML_MAX_NAME) == 0) {
goto QuantizationAlreadyDone;
}
store_name = true;
char * wdata = qdata + GGML_MAX_NAME;
char * wdata = params->wdata;

const size_t nbw1 = ggml_row_size(vec_dot_type, ne10);
const size_t nbw2 = nbw1*ne11;
const size_t nbw3 = nbw2*ne12;

assert(params->qsize >= ne13*nbw3);
assert(params->wsize >= ne13*nbw3);
GGML_ASSERT(src1->type == GGML_TYPE_F32);

for (int64_t i13 = 0; i13 < ne13; ++i13) {
Expand All @@ -14395,12 +14380,7 @@ static void ggml_compute_forward_mul_mat_id(

#define MMID_MATRIX_ROW(row_id, i1) matrix_rows[(row_id)*ne12 + (i1)]

QuantizationAlreadyDone:;
if (ith == 0) {
if (store_name) {
memcpy(qdata, src1->name, GGML_MAX_NAME);
}

// initialize matrix_row_counts
memset(matrix_row_counts, 0, n_as*sizeof(int64_t));

Expand Down Expand Up @@ -14429,7 +14409,7 @@ QuantizationAlreadyDone:;

const char * src0_cur = (const char *) src0->data + cur_a*nb02;

const void * wdata = (src1->type == vec_dot_type) ? src1->data : qdata + GGML_MAX_NAME;
const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
const size_t row_size = ggml_row_size(vec_dot_type, ne10);

const int64_t nr0 = ne01; // src0 rows
Expand Down Expand Up @@ -21017,7 +20997,6 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa
}

size_t work_size = 0;
size_t q_size = 0;

struct ggml_cplan cplan;
memset(&cplan, 0, sizeof(struct ggml_cplan));
Expand All @@ -21033,7 +21012,6 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa
max_tasks = MAX(max_tasks, n_tasks);

size_t cur = 0;
size_t cur_q = 0;

switch (node->op) {
case GGML_OP_CPY:
Expand Down Expand Up @@ -21064,8 +21042,7 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa
const enum ggml_type vec_dot_type = type_traits[node->src[0]->type].vec_dot_type;

if (node->src[1]->type != vec_dot_type) {
cur_q = ggml_row_size(vec_dot_type, node->src[1]->ne[0]) * ggml_nrows(node->src[1]);
//cur_q = ggml_row_size(vec_dot_type, ggml_nelements(node->src[1]));
cur = ggml_row_size(vec_dot_type, node->src[1]->ne[0]) * ggml_nrows(node->src[1]);
}
} break;
case GGML_OP_MUL_MAT_ID:
Expand All @@ -21075,13 +21052,12 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa
const struct ggml_tensor * src1 = node->src[1];
const enum ggml_type vec_dot_type = type_traits[src0->type].vec_dot_type;
if (src1->type != vec_dot_type) {
cur_q += ggml_row_size(vec_dot_type, node->src[1]->ne[0]) * ggml_nrows(node->src[1]);
//cur_q += ggml_row_size(vec_dot_type, ggml_nelements(src1));
cur += ggml_row_size(vec_dot_type, node->src[1]->ne[0]) * ggml_nrows(node->src[1]);
}
const int n_as = src0->ne[2];
cur_q += GGML_PAD(cur, sizeof(int64_t)); // align
cur_q += n_as * sizeof(int64_t); // matrix_row_counts
cur_q += n_as * src1->ne[2] * sizeof(int64_t); // matrix_rows
cur += GGML_PAD(cur, sizeof(int64_t)); // align
cur += n_as * sizeof(int64_t); // matrix_row_counts
cur += n_as * src1->ne[2] * sizeof(int64_t); // matrix_rows
} break;
case GGML_OP_OUT_PROD:
{
Expand Down Expand Up @@ -21170,20 +21146,14 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa
}

work_size = MAX(work_size, cur);
q_size = MAX(q_size, cur_q);
}

if (work_size > 0) {
work_size += CACHE_LINE_SIZE*(n_threads - 1);
}
if (q_size > 0) {
q_size += GGML_MAX_NAME;
}
work_size += q_size;

cplan.n_threads = MIN(max_tasks, n_threads);
cplan.work_size = work_size;
cplan.q_size = q_size;
cplan.work_data = NULL;

return cplan;
Expand All @@ -21201,7 +21171,6 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
/*.ith =*/ state->ith,
/*.nth =*/ state->shared->n_threads,
/*.wsize =*/ cplan->work_size,
/*.qsize =*/ cplan->q_size,
/*.wdata =*/ cplan->work_data,
/*.shared=*/ state->shared,
};
Expand Down