192 changes: 121 additions & 71 deletions ggml/src/ggml-cpu/ggml-cpu.c
@@ -7599,7 +7599,6 @@ UseGgmlGemm2:;
if ((nr0 % 2 != 0) || (ne11 % 2 != 0) || ((ir0_end - ir0_start) % 2 != 0) || ((ir1_end - ir1_start) % 2 != 0)) {
num_rows_per_vec_dot = 1;
}

ggml_compute_forward_mul_mat_one_chunk(params, dst, src0->type, num_rows_per_vec_dot, ir0_start, ir0_end, ir1_start, ir1_end);

if (nth >= nchunk0 * nchunk1) {
@@ -7612,6 +7611,75 @@ UseGgmlGemm2:;

// ggml_compute_forward_mul_mat_id

#define MMID_MATRIX_ROW(row_id, i1) matrix_rows[(row_id)*ne12 + (i1)]

struct mmid_row_mapping {
int32_t i1;
int32_t i2;
};
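// Maps the k-th row routed to expert row_id back to its source position: i1 is the
// selected expert index (which becomes the dst row) and i2 the row index in src1,
// as consumed by the chunk kernel below.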

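// Computes one [ir0_start, ir0_end) x [ir1_start, ir1_end) tile of the mul_mat_id for
// expert cur_a. The tile is walked in 16x16 blocks; each block accumulates into a local
// tmp buffer and is written back with a single memcpy, which the pre-chunking code used
// as an attempt to reduce false sharing between threads.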
static void ggml_compute_forward_mul_mat_id_one_chunk(
struct ggml_tensor * dst,
const struct ggml_tensor * src0,
const struct ggml_tensor * src1,
const int64_t cur_a,
const int64_t ir0_start,
const int64_t ir0_end,
const int64_t ir1_start,
const int64_t ir1_end,
const char * src0_cur,
const struct mmid_row_mapping * matrix_rows,
const size_t row_size,
const bool src1_cont,
const void * wdata) {

GGML_TENSOR_BINARY_OP_LOCALS

const enum ggml_type type = src0->type;

ggml_vec_dot_t const vec_dot = type_traits_cpu[type].vec_dot;
enum ggml_type const vec_dot_type = type_traits_cpu[type].vec_dot_type;

const int64_t blck_0 = 16;
const int64_t blck_1 = 16;

float tmp[16];

for (int64_t iir1 = ir1_start; iir1 < ir1_end; iir1 += blck_1) {
for (int64_t iir0 = ir0_start; iir0 < ir0_end; iir0 += blck_0) {
for (int64_t ir1 = iir1; ir1 < iir1 + blck_1 && ir1 < ir1_end; ++ir1) {
const int64_t _i12 = ir1; // logical row index for this expert

struct mmid_row_mapping row_mapping = MMID_MATRIX_ROW(cur_a, _i12);
const int id = row_mapping.i1; // selected expert index

const int64_t i11 = id % ne11;
const int64_t i12 = row_mapping.i2; // row index in src1

const int64_t i1 = id; // selected expert index
const int64_t i2 = i12; // row

// desc: when src1 is not a contiguous memory block we have to calculate the offset using the strides
// if it is, then we have either copied the data to params->wdata and made it contiguous or we are using
// the original src1 data pointer, so we should index using the indices directly
// TODO: this is a bit of a hack, we should probably have a better way to handle this
const char * src1_col = (const char *) wdata +
(src1_cont || src1->type != vec_dot_type
? (i11 + i12*ne11)*row_size
: (i11*nb11 + i12*nb12));

float * dst_col = (float *) ((char *) dst->data + (i1*nb1 + i2*nb2));

for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir0_end; ++ir0) {
vec_dot(ne00, &tmp[ir0 - iir0], 0, src0_cur + ir0*nb01, 0, src1_col, 0, 1);
}

memcpy(&dst_col[iir0], tmp, (MIN(iir0 + blck_0, ir0_end) - iir0)*sizeof(float));
}
}
}
}

static void ggml_compute_forward_mul_mat_id(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
@@ -7629,7 +7697,6 @@ static void ggml_compute_forward_mul_mat_id(

const bool src1_cont = ggml_is_contiguous(src1);

ggml_vec_dot_t const vec_dot = type_traits_cpu[type].vec_dot;
enum ggml_type const vec_dot_type = type_traits_cpu[type].vec_dot_type;
ggml_from_float_t const from_float = type_traits_cpu[vec_dot_type].from_float;

@@ -7651,11 +7718,6 @@ static void ggml_compute_forward_mul_mat_id(
(char *) params->wdata :
(char *) params->wdata + GGML_PAD(ggml_row_size(vec_dot_type, ggml_nelements(src1)), sizeof(int64_t));

struct mmid_row_mapping {
int32_t i1;
int32_t i2;
};

int64_t * matrix_row_counts = (int64_t *) (wdata_src1_end); // [n_as]
struct mmid_row_mapping * matrix_rows = (struct mmid_row_mapping *)(matrix_row_counts + n_as); // [n_as][ne11]

@@ -7670,8 +7732,8 @@ static void ggml_compute_forward_mul_mat_id(
GGML_ASSERT(src1->type == GGML_TYPE_F32);

for (int64_t i13 = 0; i13 < ne13; ++i13) {
for (int64_t i12 = 0; i12 < ne12; ++i12) {
for (int64_t i11 = ith; i11 < ne11; i11 += nth) {
for (int64_t i12 = ith; i12 < ne12; i12 += nth) {
for (int64_t i11 = 0; i11 < ne11; ++i11) {
from_float((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11),
(void *) (wdata + i13*nbw3 + i12*nbw2 + i11*nbw1),
ne10);
@@ -7680,9 +7742,10 @@ static void ggml_compute_forward_mul_mat_id(
}
}

#define MMID_MATRIX_ROW(row_id, i1) matrix_rows[(row_id)*ne12 + (i1)]

if (ith == 0) {
// Every thread starts at ith, so the first unprocessed chunk is nth. This saves a bit of coordination right at the start.
atomic_store_explicit(&params->threadpool->current_chunk, nth, memory_order_relaxed);

// initialize matrix_row_counts
memset(matrix_row_counts, 0, n_as*sizeof(int64_t));

@@ -7701,92 +7764,79 @@ static void ggml_compute_forward_mul_mat_id(

ggml_barrier(params->threadpool);

// compute each matrix multiplication in sequence
const int64_t rows_total = ggml_nelements(ids);
int64_t rows_processed = 0;
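// rows_total and rows_processed are computed identically on every thread, so all
// threads take the early break below in the same iteration once every row of ids
// has been handled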

for (int cur_a = 0; cur_a < n_as; ++cur_a) {
const int64_t cne1 = matrix_row_counts[cur_a];

if (cne1 == 0) {
continue;
}

const char * src0_cur = (const char *) src0->data + cur_a*nb02;
rows_processed += cne1;

const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
const char * src0_cur = (const char *) src0->data + cur_a * nb02;
const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
const size_t row_size = ggml_row_size(vec_dot_type, ne10);

const int64_t nr0 = ne01; // src0 rows
const int64_t nr1 = cne1; // src1 rows

// distribute the thread work across the inner or outer loop based on which one is larger

const int64_t nth0 = nr0 > nr1 ? nth : 1; // parallelize by src0 rows
const int64_t nth1 = nr0 > nr1 ? 1 : nth; // parallelize by src1 rows

const int64_t ith0 = ith % nth0;
const int64_t ith1 = ith / nth0;

const int64_t dr0 = (nr0 + nth0 - 1)/nth0;
const int64_t dr1 = (nr1 + nth1 - 1)/nth1;
const int64_t nr0 = ne01;
const int64_t nr1 = cne1;

const int64_t ir010 = dr0*ith0;
const int64_t ir011 = MIN(ir010 + dr0, nr0);
int chunk_size = 16;
if (nr0 == 1 || nr1 == 1) {
chunk_size = 64;
}
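// matvec-like shapes get larger chunks, the same step-up the regular mul_mat path uses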

const int64_t ir110 = dr1*ith1;
const int64_t ir111 = MIN(ir110 + dr1, nr1);
int64_t nchunk0 = (nr0 + chunk_size - 1) / chunk_size;
int64_t nchunk1 = (nr1 + chunk_size - 1) / chunk_size;

// threads with no work simply yield (not sure if it helps)
//if (ir010 >= ir011 || ir110 >= ir111) {
// sched_yield();
// continue;
//}
if (nchunk0 * nchunk1 < nth * 4 || ggml_is_numa()) {
nchunk0 = nr0 > nr1 ? nth : 1;
nchunk1 = nr0 > nr1 ? 1 : nth;
}
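// (fallback mirrors ggml_compute_forward_mul_mat: with too few chunks to keep nth
// threads busy, or on NUMA systems, use exactly one chunk per thread along the
// larger of the two dimensions)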

// block-tiling attempt
const int64_t blck_0 = 16;
const int64_t blck_1 = 16;
const int64_t dr0 = (nr0 + nchunk0 - 1) / nchunk0;
const int64_t dr1 = (nr1 + nchunk1 - 1) / nchunk1;

// attempt to reduce false-sharing (does not seem to make a difference)
float tmp[16];
int current_chunk = ith;
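// each thread claims chunk ith first, then grabs further chunks with an atomic
// fetch-add on threadpool->current_chunk, the same work-stealing scheme the
// regular mul_mat path uses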

for (int64_t iir1 = ir110; iir1 < ir111; iir1 += blck_1) {
for (int64_t iir0 = ir010; iir0 < ir011; iir0 += blck_0) {
for (int64_t ir1 = iir1; ir1 < iir1 + blck_1 && ir1 < ir111; ++ir1) {
const int64_t _i12 = ir1; // logical row index for this expert
while (current_chunk < nchunk0 * nchunk1) {
const int64_t ith0 = current_chunk % nchunk0;
const int64_t ith1 = current_chunk / nchunk0;

struct mmid_row_mapping row_mapping = MMID_MATRIX_ROW(cur_a, _i12);
const int id = row_mapping.i1; // selected expert index
const int64_t ir0_start = dr0 * ith0;
const int64_t ir0_end = MIN(ir0_start + dr0, nr0);

const int64_t i11 = id % ne11;
const int64_t i12 = row_mapping.i2; // row index in src1
const int64_t ir1_start = dr1 * ith1;
const int64_t ir1_end = MIN(ir1_start + dr1, nr1);

const int64_t i1 = id; // selected expert index
const int64_t i2 = i12; // row
ggml_compute_forward_mul_mat_id_one_chunk(
dst, src0, src1, cur_a,
ir0_start, ir0_end, ir1_start, ir1_end,
src0_cur, matrix_rows, row_size, src1_cont, wdata
);

// desc: when src1 is not a contiguous memory block we have to calculate the offset using the strides
// if it is, then we have either copied the data to params->wdata and made it contiguous or we are using
// the original src1 data pointer, so we should index using the indices directly
// TODO: this is a bit of a hack, we should probably have a better way to handle this
const char * src1_col = (const char *) wdata +
(src1_cont || src1->type != vec_dot_type
? (i11 + i12*ne11)*row_size
: (i11*nb11 + i12*nb12));
if (nth >= nchunk0 * nchunk1) {
break;
}

float * dst_col = (float *) ((char *) dst->data + (i1*nb1 + i2*nb2));
current_chunk = atomic_fetch_add_explicit(&params->threadpool->current_chunk, 1, memory_order_relaxed);
}

//for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ++ir0) {
// vec_dot(ne00, &dst_col[ir0], src0_row + ir0*nb01, src1_col);
//}
if (rows_processed == rows_total) {
break;
}

for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ++ir0) {
vec_dot(ne00, &tmp[ir0 - iir0], 0, src0_cur + ir0*nb01, 0, src1_col, 0, 1);
}
ggml_barrier(params->threadpool);

memcpy(&dst_col[iir0], tmp, (MIN(iir0 + blck_0, ir011) - iir0)*sizeof(float));
}
}
if (ith == 0) {
// Every thread starts at ith, so the first unprocessed chunk is nth. This saves a bit of coordination right at the start.
atomic_store_explicit(&params->threadpool->current_chunk, nth, memory_order_relaxed);
}
}

#undef MMID_MATRIX_ROW
ggml_barrier(params->threadpool);
}
}

// ggml_compute_forward_out_prod
17 changes: 16 additions & 1 deletion tests/test-backend-ops.cpp
@@ -4329,6 +4329,21 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_perf() {
}
}

#if 0
for (int bs : {1, 64}) {
for (ggml_type type_a : {GGML_TYPE_Q4_0}) {
for (ggml_type type_b : {GGML_TYPE_F32}) {
int n_experts = 256;
int n_used = 8;
int n_embd = 7168;
int n_ff = 2048;
test_cases.emplace_back(new test_mul_mat_id(type_a, type_b, n_experts, n_used, true, n_embd, bs, n_ff));
//test_cases.emplace_back(new test_mul_mat(type_a, type_b, n_embd, bs, n_ff, {1, 1}, {1, 1}));
}
}
}
#endif
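// (the #if 0 block above is a large MoE mul_mat_id perf shape, 256 experts with 8
// used, that is useful when profiling the CPU mul_mat_id path; switch it to #if 1
// to enable it)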

for (int K : {3, 5}) {
for (int IC : {256, 2560}) {
for (int IW_IH : {32, 64, 256}) {
@@ -4462,7 +4477,7 @@ int main(int argc, char ** argv) {
auto ggml_backend_set_n_threads_fn = (ggml_backend_set_n_threads_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_n_threads");
if (ggml_backend_set_n_threads_fn) {
// TODO: better value for n_threads
ggml_backend_set_n_threads_fn(backend, std::thread::hardware_concurrency());
ggml_backend_set_n_threads_fn(backend, std::thread::hardware_concurrency() / 2);
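// hardware_concurrency() typically counts SMT siblings as well, so halving it keeps
// the benchmark at roughly one thread per physical core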
}

printf(" Device description: %s\n", ggml_backend_dev_description(dev));