Skip to content

Commit f73300f

Browse files
committed
llamafile_sgemm MMA for Q8_0
Signed-off-by: amritahs <[email protected]>
1 parent 057dc7c commit f73300f

File tree

3 files changed

+692
-35
lines changed

3 files changed

+692
-35
lines changed

examples/benchmark/benchmark-matmult.cpp

Lines changed: 30 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -32,15 +32,20 @@ static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph *
3232
}
3333

3434
static float tensor_sum_elements(const ggml_tensor * tensor) {
35-
double sum = 0;
35+
double sum1;
36+
//printf("sum inside = %f\n", sum1);
3637
if (tensor->type == GGML_TYPE_F32) {
3738
for (int j = 0; j < tensor->ne[1]; j++) {
3839
for (int k = 0; k < tensor->ne[0]; k++) {
39-
sum += ((float *) tensor->data)[j*tensor->ne[0] + k];
40+
//printf("sum inside = %f\n", sum1);
41+
printf("%f \t ", ((float *) tensor->data)[j*tensor->ne[0] + k]);
42+
sum1 = sum1 + ((float *) tensor->data)[j*tensor->ne[0] + k];
43+
//printf("sum inside = %f\n", sum1);
4044
}
45+
printf("\n");
4146
}
4247
}
43-
return sum;
48+
return sum1;
4449
}
4550

4651
static void tensor_dump(const ggml_tensor * tensor, const char * name) {
@@ -109,9 +114,14 @@ int main(int argc, char ** argv) {
109114

110115
#undef VERBOSE_DEBUGGING
111116
#ifndef VERBOSE_DEBUGGING
117+
/*
112118
const int sizey = 4096;
113119
const int sizex = 11008;
114120
const int sizez = 128;
121+
*/
122+
const int sizey = 40;
123+
const int sizex = 32*128;
124+
const int sizez = 2;
115125
#else
116126
/* Working - let's increase size */
117127
const int sizey = 1;
@@ -126,13 +136,14 @@ int main(int argc, char ** argv) {
126136
//printf("Memsize required = %i\n", sizex*sizex);
127137

128138
// TODO: perform the bench for all types or for a user specified type
129-
const ggml_type qtype = GGML_TYPE_Q4_1;
139+
const ggml_type qtype = GGML_TYPE_Q8_0;
130140

131141
size_t ctx_size = 0;
132142
ctx_size += ggml_row_size(GGML_TYPE_F32, sizex*sizey);
133143
ctx_size += ggml_row_size(GGML_TYPE_F32, sizex*sizey);
134144
ctx_size += ggml_row_size(GGML_TYPE_F32, sizex*sizez);
135145
ctx_size += ggml_row_size(qtype, sizex*sizey);
146+
ctx_size += ggml_row_size(qtype, sizex*sizez);
136147
ctx_size += ggml_row_size(qtype, sizex*sizey);
137148
ctx_size += ggml_row_size(GGML_TYPE_F32, sizex*sizey); // BLAS
138149
ctx_size += ggml_row_size(GGML_TYPE_F32, sizex*sizey); // BLAS
@@ -156,15 +167,15 @@ int main(int argc, char ** argv) {
156167
printf("Creating new tensors\n");
157168
// printf("Creating new tensor m1\n");
158169
struct ggml_tensor * m11 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, sizex, sizey);
159-
ggml_set_f32(m11, 1.0f);
170+
ggml_set_f32(m11, -1.23f);
160171

161172
// printf("Creating new tensor m1\n");
162173
struct ggml_tensor * m12 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, sizex, sizey);
163174
ggml_set_f32(m12, 1.5f);
164175

165176
// printf("Creating new tensor m2\n");
166177
struct ggml_tensor * m2 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, sizex, sizez);
167-
ggml_set_f32(m2, 2.0f);
178+
ggml_set_f32(m2, -12.23f);
168179

169180
printf("\n------ Test 1 - Matrix Mult via F32 code\n");
170181
// printf("Creating new tensor m11xm2\n");
@@ -176,27 +187,34 @@ int main(int argc, char ** argv) {
176187

177188
printf("n_threads=%i\n", benchmark_params.n_threads);
178189

179-
TENSOR_DUMP(m11);
180-
TENSOR_DUMP(m2);
190+
//TENSOR_DUMP(m11);
191+
//TENSOR_DUMP(m2);
181192

182193
std::vector<uint8_t> work_buffer;
183194

184195
ggml_graph_compute_helper(work_buffer, gf, benchmark_params.n_threads);
185196

186-
TENSOR_DUMP(ggml_graph_node(gf, 0));
197+
//TENSOR_DUMP(ggml_graph_node(gf, 0));
187198

188199
printf("\n------ Test 2 - Matrix Mult via %s code\n", ggml_type_name(qtype));
189200

190201
int32_t nelements = sizex*sizey;
202+
int32_t nelements2 = sizex*sizez;
191203

192204
// Set up the benchmark matrices
193205
// printf("Creating new tensor q11 & Running quantize\n");
194206
struct ggml_tensor * q11 = ggml_new_tensor_2d(ctx, qtype, sizex, sizey);
195207
ggml_quantize_chunk(qtype, (const float *) m11->data, q11->data, 0, nelements/m11->ne[0], m11->ne[0], nullptr);
208+
//TENSOR_DUMP(q11);
196209

210+
// printf("Creating new tensor q2 & Running quantize\n");
211+
struct ggml_tensor * q2 = ggml_new_tensor_2d(ctx, qtype, sizex, sizez);
212+
ggml_quantize_chunk(qtype, (const float *) m2->data, q2->data, 0, nelements2/m2->ne[0], m2->ne[0], nullptr);
213+
//TENSOR_DUMP(q2);
214+
197215
// Set up the compute graph
198216
// printf("Creating new tensor q31\n");
199-
struct ggml_tensor * q31 = ggml_mul_mat(ctx, q11, m2);
217+
struct ggml_tensor * q31 = ggml_mul_mat(ctx, q11, q2);
200218

201219
// printf("Creating compute graph\n");
202220
struct ggml_cgraph * gf31 = ggml_new_graph(ctx);
@@ -235,6 +253,7 @@ int main(int argc, char ** argv) {
235253
long long int start = ggml_time_us();
236254
//printf("Running ggml_graph_compute\n");
237255
ggml_graph_compute_helper(work_buffer, gf31, benchmark_params.n_threads);
256+
TENSOR_DUMP(ggml_graph_node(gf31, 0));
238257

239258
long long int stop = ggml_time_us();
240259
long long int usec = stop-start;
@@ -247,7 +266,7 @@ int main(int argc, char ** argv) {
247266
usec,gflops);
248267

249268
#ifdef VERBOSE_DEBUGGING
250-
TENSOR_DUMP("res",gf31.nodes[0])
269+
//TENSOR_DUMP("res",gf31.nodes[0])
251270
#endif
252271

253272
// Check that the matrix multiplication result is in the right ballpark

ggml/src/ggml.c

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2068,7 +2068,8 @@ inline static void ggml_vec_add1_f32(const int n, float * z, const float * x, co
20682068
inline static void ggml_vec_acc_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] += x[i]; }
20692069
inline static void ggml_vec_acc1_f32(const int n, float * y, const float v) { for (int i = 0; i < n; ++i) y[i] += v; }
20702070
inline static void ggml_vec_sub_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i] - y[i]; }
2071-
inline static void ggml_vec_set_f32 (const int n, float * x, const float v) { for (int i = 0; i < n; ++i) x[i] = v; }
2071+
//inline static void ggml_vec_set_f32 (const int n, float * x, const float v) { for (int i = 0; i < n; ++i) x[i] = v; }
2072+
inline static void ggml_vec_set_f32 (const int n, float * x, float v) { for (int i = 0; i < n; ++i) x[i] = v++; }
20722073
inline static void ggml_vec_cpy_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i]; }
20732074
inline static void ggml_vec_neg_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = -x[i]; }
20742075
inline static void ggml_vec_mul_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]*y[i]; }
@@ -4209,9 +4210,10 @@ struct ggml_tensor * ggml_set_f32(struct ggml_tensor * tensor, float value) {
42094210
} break;
42104211
case GGML_TYPE_F32:
42114212
{
4213+
float v = value;
42124214
assert(tensor->nb[0] == sizeof(float));
42134215
for (int i = 0; i < n; i++) {
4214-
ggml_vec_set_f32(nc, (float *)(data + i*n1), value);
4216+
ggml_vec_set_f32(nc, (float *)(data + i*n1), v++);
42154217
}
42164218
} break;
42174219
default:

0 commit comments

Comments
 (0)