Skip to content

Commit f73300f

Browse files
committed
llamafile_sgemm MMA for Q8_0
Signed-off-by: amritahs <[email protected]>
1 parent 057dc7c commit f73300f

File tree

3 files changed

+692
-35
lines changed

3 files changed

+692
-35
lines changed

examples/benchmark/benchmark-matmult.cpp

Lines changed: 30 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -32,15 +32,20 @@ static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph *
3232
}
3333

3434
static float tensor_sum_elements(const ggml_tensor * tensor) {
35-
double sum = 0;
35+
double sum1;
36+
//printf("sum inside = %f\n", sum1);
3637
if (tensor->type == GGML_TYPE_F32) {
3738
for (int j = 0; j < tensor->ne[1]; j++) {
3839
for (int k = 0; k < tensor->ne[0]; k++) {
39-
sum += ((float *) tensor->data)[j*tensor->ne[0] + k];
40+
//printf("sum inside = %f\n", sum1);
41+
printf("%f \t ", ((float *) tensor->data)[j*tensor->ne[0] + k]);
42+
sum1 = sum1 + ((float *) tensor->data)[j*tensor->ne[0] + k];
43+
//printf("sum inside = %f\n", sum1);
4044
}
45+
printf("\n");
4146
}
4247
}
43-
return sum;
48+
return sum1;
4449
}
4550

4651
static void tensor_dump(const ggml_tensor * tensor, const char * name) {
@@ -109,9 +114,14 @@ int main(int argc, char ** argv) {
109114

110115
#undef VERBOSE_DEBUGGING
111116
#ifndef VERBOSE_DEBUGGING
117+
/*
112118
const int sizey = 4096;
113119
const int sizex = 11008;
114120
const int sizez = 128;
121+
*/
122+
const int sizey = 40;
123+
const int sizex = 32*128;
124+
const int sizez = 2;
115125
#else
116126
/* Working - let's increase size */
117127
const int sizey = 1;
@@ -126,13 +136,14 @@ int main(int argc, char ** argv) {
126136
//printf("Memsize required = %i\n", sizex*sizex);
127137

128138
// TODO: perform the bench for all types or for a user specified type
129-
const ggml_type qtype = GGML_TYPE_Q4_1;
139+
const ggml_type qtype = GGML_TYPE_Q8_0;
130140

131141
size_t ctx_size = 0;
132142
ctx_size += ggml_row_size(GGML_TYPE_F32, sizex*sizey);
133143
ctx_size += ggml_row_size(GGML_TYPE_F32, sizex*sizey);
134144
ctx_size += ggml_row_size(GGML_TYPE_F32, sizex*sizez);
135145
ctx_size += ggml_row_size(qtype, sizex*sizey);
146+
ctx_size += ggml_row_size(qtype, sizex*sizez);
136147
ctx_size += ggml_row_size(qtype, sizex*sizey);
137148
ctx_size += ggml_row_size(GGML_TYPE_F32, sizex*sizey); // BLAS
138149
ctx_size += ggml_row_size(GGML_TYPE_F32, sizex*sizey); // BLAS
@@ -156,15 +167,15 @@ int main(int argc, char ** argv) {
156167
printf("Creating new tensors\n");
157168
// printf("Creating new tensor m1\n");
158169
struct ggml_tensor * m11 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, sizex, sizey);
159-
ggml_set_f32(m11, 1.0f);
170+
ggml_set_f32(m11, -1.23f);
160171

161172
// printf("Creating new tensor m1\n");
162173
struct ggml_tensor * m12 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, sizex, sizey);
163174
ggml_set_f32(m12, 1.5f);
164175

165176
// printf("Creating new tensor m2\n");
166177
struct ggml_tensor * m2 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, sizex, sizez);
167-
ggml_set_f32(m2, 2.0f);
178+
ggml_set_f32(m2, -12.23f);
168179

169180
printf("\n------ Test 1 - Matrix Mult via F32 code\n");
170181
// printf("Creating new tensor m11xm2\n");
@@ -176,27 +187,34 @@ int main(int argc, char ** argv) {
176187

177188
printf("n_threads=%i\n", benchmark_params.n_threads);
178189

179-
TENSOR_DUMP(m11);
180-
TENSOR_DUMP(m2);
190+
//TENSOR_DUMP(m11);
191+
//TENSOR_DUMP(m2);
181192

182193
std::vector<uint8_t> work_buffer;
183194

184195
ggml_graph_compute_helper(work_buffer, gf, benchmark_params.n_threads);
185196

186-
TENSOR_DUMP(ggml_graph_node(gf, 0));
197+
//TENSOR_DUMP(ggml_graph_node(gf, 0));
187198

188199
printf("\n------ Test 2 - Matrix Mult via %s code\n", ggml_type_name(qtype));
189200

190201
int32_t nelements = sizex*sizey;
202+
int32_t nelements2 = sizex*sizez;
191203

192204
// Set up the benchmark matrices
193205
// printf("Creating new tensor q11 & Running quantize\n");
194206
struct ggml_tensor * q11 = ggml_new_tensor_2d(ctx, qtype, sizex, sizey);
195207
ggml_quantize_chunk(qtype, (const float *) m11->data, q11->data, 0, nelements/m11->ne[0], m11->ne[0], nullptr);
208+
//TENSOR_DUMP(q11);
196209

210+
// printf("Creating new tensor q2 & Running quantize\n");
211+
struct ggml_tensor * q2 = ggml_new_tensor_2d(ctx, qtype, sizex, sizez);
212+
ggml_quantize_chunk(qtype, (const float *) m2->data, q2->data, 0, nelements2/m2->ne[0], m2->ne[0], nullptr);
213+
//TENSOR_DUMP(q2);
214+
197215
// Set up the compute graph
198216
// printf("Creating new tensor q31\n");
199-
struct ggml_tensor * q31 = ggml_mul_mat(ctx, q11, m2);
217+
struct ggml_tensor * q31 = ggml_mul_mat(ctx, q11, q2);
200218

201219
// printf("Creating compute graph\n");
202220
struct ggml_cgraph * gf31 = ggml_new_graph(ctx);
@@ -235,6 +253,7 @@ int main(int argc, char ** argv) {
235253
long long int start = ggml_time_us();
236254
//printf("Running ggml_graph_compute\n");
237255
ggml_graph_compute_helper(work_buffer, gf31, benchmark_params.n_threads);
256+
TENSOR_DUMP(ggml_graph_node(gf31, 0));
238257

239258
long long int stop = ggml_time_us();
240259
long long int usec = stop-start;
@@ -247,7 +266,7 @@ int main(int argc, char ** argv) {
247266
usec,gflops);
248267

249268
#ifdef VERBOSE_DEBUGGING
250-
TENSOR_DUMP("res",gf31.nodes[0])
269+
//TENSOR_DUMP("res",gf31.nodes[0])
251270
#endif
252271

253272
// Check that the matrix multiplication result is in the right ballpark

ggml/src/ggml.c

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2068,7 +2068,8 @@ inline static void ggml_vec_add1_f32(const int n, float * z, const float * x, co
20682068
inline static void ggml_vec_acc_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] += x[i]; }
20692069
inline static void ggml_vec_acc1_f32(const int n, float * y, const float v) { for (int i = 0; i < n; ++i) y[i] += v; }
20702070
inline static void ggml_vec_sub_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i] - y[i]; }
2071-
inline static void ggml_vec_set_f32 (const int n, float * x, const float v) { for (int i = 0; i < n; ++i) x[i] = v; }
2071+
//inline static void ggml_vec_set_f32 (const int n, float * x, const float v) { for (int i = 0; i < n; ++i) x[i] = v; }
2072+
inline static void ggml_vec_set_f32 (const int n, float * x, float v) { for (int i = 0; i < n; ++i) x[i] = v++; }
20722073
inline static void ggml_vec_cpy_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i]; }
20732074
inline static void ggml_vec_neg_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = -x[i]; }
20742075
inline static void ggml_vec_mul_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]*y[i]; }
@@ -4209,9 +4210,10 @@ struct ggml_tensor * ggml_set_f32(struct ggml_tensor * tensor, float value) {
42094210
} break;
42104211
case GGML_TYPE_F32:
42114212
{
4213+
float v = value;
42124214
assert(tensor->nb[0] == sizeof(float));
42134215
for (int i = 0; i < n; i++) {
4214-
ggml_vec_set_f32(nc, (float *)(data + i*n1), value);
4216+
ggml_vec_set_f32(nc, (float *)(data + i*n1), v++);
42154217
}
42164218
} break;
42174219
default:

0 commit comments

Comments
 (0)