
Commit 06b41e6

zdnn: initial matmul refactor
Signed-off-by: Aaron Teo <[email protected]>
1 parent 432cf43 commit 06b41e6

5 files changed: 100 additions & 78 deletions

ggml/src/ggml-zdnn/.gitignore

Lines changed: 1 addition & 0 deletions

@@ -0,0 +1 @@
+zdnn.h

ggml/src/ggml-zdnn/common.hpp

Lines changed: 10 additions & 0 deletions

@@ -0,0 +1,10 @@
+#ifndef GGML_ZDNN_COMMON_HPP
+#define GGML_ZDNN_COMMON_HPP
+
+#include "ggml.h"
+#include "ggml-impl.h"
+#include "ggml-zdnn-impl.h"
+
+#include "zdnn.h"
+
+#endif // GGML_ZDNN_COMMON_HPP

ggml/src/ggml-zdnn/ggml-zdnn.cpp

Lines changed: 5 additions & 78 deletions

@@ -1,10 +1,11 @@
-#include "zdnn.h"
-#include "ggml-zdnn.h"
 #include "ggml-zdnn-impl.h"
-
 #include "ggml-impl.h"
 #include "ggml-backend-impl.h"
 
+#include "ggml-zdnn/common.hpp"
+#include "ggml-zdnn/mmf.hpp"
+#include "ggml.h"
+
 #include <vector>
 #include <memory>
 #include <csignal>
@@ -88,80 +89,6 @@ inline void ggml_zdnn_init_tensor(ggml_backend_zdnn_buffer * buffer, const ggml_
     ZDNN_CHECK(zdnn_init_ztensor_with_malloc(&buffer->pre_tfm_desc, &buffer->tfm_desc, &buffer->ztensor));
 }
 
-static void ggml_zdnn_mul_mat_op(ggml_backend_zdnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    GGML_TENSOR_BINARY_OP_LOCALS;
-
-    const enum ggml_type type = src0->type;
-
-    GGML_ASSERT(ne0 == ne01);
-    GGML_ASSERT(ne1 == ne11);
-    GGML_ASSERT(ne2 == ne12);
-    GGML_ASSERT(ne3 == ne13);
-
-    // we don't support permuted src0 or src1
-    GGML_ASSERT(nb00 == ggml_type_size(type));
-    GGML_ASSERT(nb10 == ggml_type_size(src1->type));
-
-    // dst cannot be transposed or permuted
-    GGML_ASSERT(nb0 == sizeof(float));
-    GGML_ASSERT(nb0 <= nb1);
-    GGML_ASSERT(nb1 <= nb2);
-    GGML_ASSERT(nb2 <= nb3);
-
-    const ggml_tensor * weights = src0;
-    const ggml_tensor * inputs  = src1;
-    ggml_tensor       * output  = dst;
-
-    ggml_backend_zdnn_buffer * weights_extra = (ggml_backend_zdnn_buffer *)weights->extra;
-    ggml_backend_zdnn_buffer * inputs_extra  = (ggml_backend_zdnn_buffer *)inputs->extra;
-    ggml_backend_zdnn_buffer * output_extra  = (ggml_backend_zdnn_buffer *)output->extra;
-    ggml_backend_zdnn_buffer * bias_extra    = (ggml_backend_zdnn_buffer *)output_extra->extra;
-
-    const int64_t weights_rows = ne01;
-    const int64_t weights_cols = ne00;
-    const int64_t inputs_rows  = ne11;
-    const int64_t inputs_cols  = ne10;
-
-    assert(inputs_cols == weights_cols);
-
-    const int64_t output_rows = ne1;
-    const int64_t output_cols = ne0;
-
-    // GGML_LOG_INFO("%s: tensor '%s' tensor dimensions: [%ld, %ld, %ld, %ld] pre_tfm_desc dimensions: [%ld, %ld, %ld, %ld]\n",
-    //               __func__, weights_extra->name,
-    //               weights->ne[3], weights->ne[2], weights->ne[1], weights->ne[0],
-    //               weights_extra->pre_tfm_desc.dim1,
-    //               weights_extra->pre_tfm_desc.dim2,
-    //               weights_extra->pre_tfm_desc.dim3,
-    //               weights_extra->pre_tfm_desc.dim4);
-
-    // GGML_LOG_INFO("%s: tensor '%s' tensor dimensions: [%ld, %ld, %ld, %ld] pre_tfm_desc dimensions: [%ld, %ld, %ld, %ld]\n",
-    //               __func__, inputs_extra->name,
-    //               inputs->ne[3], inputs->ne[2], inputs->ne[1], inputs->ne[0],
-    //               inputs_extra->pre_tfm_desc.dim1,
-    //               inputs_extra->pre_tfm_desc.dim2,
-    //               inputs_extra->pre_tfm_desc.dim3,
-    //               inputs_extra->pre_tfm_desc.dim4);
-
-    GGML_ASSERT(weights_extra->pre_tfm_desc.dim1 == weights->ne[0] && "weights_extra->pre_tfm_desc.dim1 must match weights->ne[0]");
-    GGML_ASSERT(weights_extra->pre_tfm_desc.dim2 == weights->ne[1] && "weights_extra->pre_tfm_desc.dim2 must match weights->ne[1]");
-    GGML_ASSERT(inputs_extra->pre_tfm_desc.dim1  == inputs->ne[0]  && "inputs_extra->pre_tfm_desc.dim1 must match inputs->ne[0]");
-    GGML_ASSERT(inputs_extra->pre_tfm_desc.dim2  == inputs->ne[1]  && "inputs_extra->pre_tfm_desc.dim2 must match inputs->ne[1]");
-
-    ZDNN_CHECK(zdnn_matmul_transpose_op(&inputs_extra->ztensor, &weights_extra->ztensor, &bias_extra->ztensor,
-                                        false, true, MATMUL_OP_ADDITION, &output_extra->ztensor));
-    // TODO: Remove in the future as we are currently DLF16 -> FP32 then in the next op, FP32 -> DLF16 again. Inefficient.
-    ZDNN_CHECK(zdnn_transform_origtensor(&output_extra->ztensor, output->data));
-
-    GGML_UNUSED(ctx);
-    GGML_UNUSED(weights_rows);
-    GGML_UNUSED(weights_cols);
-    GGML_UNUSED(inputs_rows);
-    GGML_UNUSED(inputs_cols);
-    GGML_UNUSED(output_rows);
-    GGML_UNUSED(output_cols);
-}
-
 static void ggml_zdnn_mul_mat_dispatch(ggml_backend_zdnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     // debug helpers
     // GGML_LOG_INFO("%s: use_mul_mat_vec = %d\n", __func__, use_mul_mat_vec);
@@ -174,7 +101,7 @@ static void ggml_zdnn_mul_mat_dispatch(ggml_backend_zdnn_context * ctx, const gg
     // GGML_LOG_INFO("%s: src0 is contiguous %d, transposed %d, type = %s, name = %s\n", __func__, ggml_is_contiguous(src0), ggml_is_transposed(src0), ggml_type_name(src0->type), src0->name);
     // GGML_LOG_INFO("%s: src1 is contiguous %d, transposed %d, type = %s, name = %s\n", __func__, ggml_is_contiguous(src1), ggml_is_transposed(src1), ggml_type_name(src1->type), src1->name);
 
-    ggml_zdnn_mul_mat_op(ctx, src0, src1, dst);
+    ggml_zdnn_mul_mat_f(ctx, src0, src1, dst);
 }
 
 static bool ggml_zdnn_compute_forward(ggml_backend_zdnn_context * ctx, ggml_tensor * dst) {
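The dispatcher's contract with the relocated kernel is ggml's usual mul_mat shape convention: for src0 (weights) of shape [K, M] and src1 (inputs) of shape [K, N], dst comes out as [M, N], which is exactly what the ne0 == ne01 / ne1 == ne11 asserts in the kernel check. A minimal sketch of that contract through the public ggml API (a hypothetical standalone test, not part of this commit):

// Sketch only: builds the same mul_mat node that
// ggml_zdnn_mul_mat_dispatch would receive from the graph.
#include "ggml.h"

int main(void) {
    struct ggml_init_params params = {
        /*.mem_size   =*/ 16 * 1024 * 1024,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
    };
    struct ggml_context * ctx = ggml_init(params);

    // K = 64 shared dim, M = 32 weight rows, N = 8 input rows.
    struct ggml_tensor * w = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 64, 32); // src0
    struct ggml_tensor * x = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 64,  8); // src1

    // dst is [M, N]: ne0 == ne01 and ne1 == ne11, as the kernel asserts.
    struct ggml_tensor * y = ggml_mul_mat(ctx, w, x);
    GGML_ASSERT(y->ne[0] == w->ne[1]);
    GGML_ASSERT(y->ne[1] == x->ne[1]);

    ggml_free(ctx);
    return 0;
}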

ggml/src/ggml-zdnn/mmf.cpp

Lines changed: 76 additions & 0 deletions

@@ -0,0 +1,76 @@
+#include "ggml.h"
+#include "mmf.hpp"
+
+void ggml_zdnn_mul_mat_f(ggml_backend_zdnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    GGML_TENSOR_BINARY_OP_LOCALS;
+
+    const enum ggml_type type = src0->type;
+
+    GGML_ASSERT(ne0 == ne01);
+    GGML_ASSERT(ne1 == ne11);
+    GGML_ASSERT(ne2 == ne12);
+    GGML_ASSERT(ne3 == ne13);
+
+    // we don't support permuted src0 or src1
+    GGML_ASSERT(nb00 == ggml_type_size(type));
+    GGML_ASSERT(nb10 == ggml_type_size(src1->type));
+
+    // dst cannot be transposed or permuted
+    GGML_ASSERT(nb0 == sizeof(float));
+    GGML_ASSERT(nb0 <= nb1);
+    GGML_ASSERT(nb1 <= nb2);
+    GGML_ASSERT(nb2 <= nb3);
+
+    const ggml_tensor * weights = src0;
+    const ggml_tensor * inputs  = src1;
+    ggml_tensor       * output  = dst;
+
+    ggml_backend_zdnn_buffer * weights_extra = (ggml_backend_zdnn_buffer *)weights->extra;
+    ggml_backend_zdnn_buffer * inputs_extra  = (ggml_backend_zdnn_buffer *)inputs->extra;
+    ggml_backend_zdnn_buffer * output_extra  = (ggml_backend_zdnn_buffer *)output->extra;
+    ggml_backend_zdnn_buffer * bias_extra    = (ggml_backend_zdnn_buffer *)output_extra->extra;
+
+    const int64_t weights_rows = ne01;
+    const int64_t weights_cols = ne00;
+    const int64_t inputs_rows  = ne11;
+    const int64_t inputs_cols  = ne10;
+
+    assert(inputs_cols == weights_cols);
+
+    const int64_t output_rows = ne1;
+    const int64_t output_cols = ne0;
+
+    // GGML_LOG_INFO("%s: tensor '%s' tensor dimensions: [%ld, %ld, %ld, %ld] pre_tfm_desc dimensions: [%ld, %ld, %ld, %ld]\n",
+    //               __func__, weights_extra->name,
+    //               weights->ne[3], weights->ne[2], weights->ne[1], weights->ne[0],
+    //               weights_extra->pre_tfm_desc.dim1,
+    //               weights_extra->pre_tfm_desc.dim2,
+    //               weights_extra->pre_tfm_desc.dim3,
+    //               weights_extra->pre_tfm_desc.dim4);
+
+    // GGML_LOG_INFO("%s: tensor '%s' tensor dimensions: [%ld, %ld, %ld, %ld] pre_tfm_desc dimensions: [%ld, %ld, %ld, %ld]\n",
+    //               __func__, inputs_extra->name,
+    //               inputs->ne[3], inputs->ne[2], inputs->ne[1], inputs->ne[0],
+    //               inputs_extra->pre_tfm_desc.dim1,
+    //               inputs_extra->pre_tfm_desc.dim2,
+    //               inputs_extra->pre_tfm_desc.dim3,
+    //               inputs_extra->pre_tfm_desc.dim4);
+
+    GGML_ASSERT(weights_extra->pre_tfm_desc.dim1 == weights->ne[0] && "weights_extra->pre_tfm_desc.dim1 must match weights->ne[0]");
+    GGML_ASSERT(weights_extra->pre_tfm_desc.dim2 == weights->ne[1] && "weights_extra->pre_tfm_desc.dim2 must match weights->ne[1]");
+    GGML_ASSERT(inputs_extra->pre_tfm_desc.dim1  == inputs->ne[0]  && "inputs_extra->pre_tfm_desc.dim1 must match inputs->ne[0]");
+    GGML_ASSERT(inputs_extra->pre_tfm_desc.dim2  == inputs->ne[1]  && "inputs_extra->pre_tfm_desc.dim2 must match inputs->ne[1]");
+
+    ZDNN_CHECK(zdnn_matmul_transpose_op(&inputs_extra->ztensor, &weights_extra->ztensor, &bias_extra->ztensor,
+                                        false, true, MATMUL_OP_ADDITION, &output_extra->ztensor));
+    // TODO: Remove in the future as we are currently DLF16 -> FP32 then in the next op, FP32 -> DLF16 again. Inefficient.
+    ZDNN_CHECK(zdnn_transform_origtensor(&output_extra->ztensor, output->data));
+
+    GGML_UNUSED(ctx);
+    GGML_UNUSED(weights_rows);
+    GGML_UNUSED(weights_cols);
+    GGML_UNUSED(inputs_rows);
+    GGML_UNUSED(inputs_cols);
+    GGML_UNUSED(output_rows);
+    GGML_UNUSED(output_cols);
+}
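The heart of the new file is the single zDNN call near the end. Restated below with explanatory comments as a sketch: the argument roles follow the call site above, and the DLF16 detail comes from the TODO note.

// Annotated restatement of the core call in ggml_zdnn_mul_mat_f:
// output = inputs x weights^T + bias, computed in zDNN's internal
// DLF16 layout, then un-transformed back to FP32 for the next op.
ZDNN_CHECK(zdnn_matmul_transpose_op(
    &inputs_extra->ztensor,    // a: activations (src1)
    &weights_extra->ztensor,   // b: weights (src0)
    &bias_extra->ztensor,      // c: bias, stashed on the output buffer's extra
    false,                     // do not transpose a
    true,                      // transpose b, giving a x b^T
    MATMUL_OP_ADDITION,        // fuse "+ c" into the matmul
    &output_extra->ztensor));  // result ztensor, still in DLF16
ZDNN_CHECK(zdnn_transform_origtensor(&output_extra->ztensor, output->data));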

ggml/src/ggml-zdnn/mmf.hpp

Lines changed: 8 additions & 0 deletions

@@ -0,0 +1,8 @@
+#ifndef GGML_ZDNN_MMF_HPP
+#define GGML_ZDNN_MMF_HPP
+
+#include "common.hpp"
+
+void ggml_zdnn_mul_mat_f(ggml_backend_zdnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
+
+#endif // GGML_ZDNN_MMF_HPP
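The header is intentionally minimal so the pattern can be repeated as more ops move out of ggml-zdnn.cpp; a hypothetical sibling header (file and function names are illustrative only, not part of this commit) would follow the same shape:

// Hypothetical sibling op header following the mmf.hpp pattern;
// GGML_ZDNN_MMQ_HPP and ggml_zdnn_mul_mat_q are illustrative names.
#ifndef GGML_ZDNN_MMQ_HPP
#define GGML_ZDNN_MMQ_HPP

#include "common.hpp"

void ggml_zdnn_mul_mat_q(ggml_backend_zdnn_context * ctx,
                         const ggml_tensor * src0,
                         const ggml_tensor * src1,
                         ggml_tensor * dst);

#endif // GGML_ZDNN_MMQ_HPP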
