Commit 26f7e08

Merged functions, fixed the errors in the workflow.

1 parent 67e87d8
4 files changed: 107 additions & 132 deletions

include/cten.h

Lines changed: 0 additions & 17 deletions
@@ -273,23 +273,6 @@ Tensor Tensor_divf(Tensor self, float other);
  */
 Tensor Tensor_powf(Tensor self, float other);
 
-/**
- * @brief Performs batch matrix multiplication for two 3D tensors.
- * For each batch index, multiplies the corresponding {m, n} and {n, p} matrices:
- * - self: shape {batch, m, n}
- * - other: shape {batch, n, p}
- * Returns a tensor of shape {batch, m, p} where each slice is the matrix product of the input slices.
- * Only supports strictly matched batch sizes and no broadcasting.
- * Each batch slice is extracted using Tensor_batch_slice, and standard Tensor_matmul is applied.
- * Prints the dimensions for each batch multiplication for debugging.
- * The output tensor contains all resulting batch matrix products.
- *
- * @param self Input tensor of shape {batch, m, n}
- * @param other Input tensor of shape {batch, n, p}
- * @return Output tensor of shape {batch, m, p} with the results of all batch multiplications
- */
-Tensor Tensor_matmul_batch(Tensor self, Tensor other);
-
 /**
  * @brief Matrix multiplication of two tensors
  * @param self First tensor (left operand)
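With the separate batch entry point removed from the header, batched products now go through `Tensor_matmul`, which dispatches on operand dimensionality. A minimal usage sketch, assuming only the `Tensor_new` constructor and the four-element `TensorShape` array visible elsewhere in this commit; the shapes here are illustrative, not from the test suite:

```c
/* Hypothetical sketch: batched multiply via the merged Tensor_matmul. */
TensorShape a_shape = {2, 3, 4, 0};  /* batch of two 3x4 matrices */
TensorShape b_shape = {2, 4, 5, 0};  /* batch of two 4x5 matrices */
Tensor a = Tensor_new(a_shape, false);
Tensor b = Tensor_new(b_shape, false);
Tensor c = Tensor_matmul(a, b);      /* result shape {2, 3, 5} */
```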

src/operator.c

Lines changed: 57 additions & 94 deletions
@@ -229,126 +229,89 @@ static Tensor GradFn_matmul(Tensor self, int i) {
     return Tensor_transpose(Tensor_detach(self.node->inputs[1 - i]));
 }
 
-Tensor Tensor_batch_slice(Tensor t, int batch_idx, int group_idx) {
-    int dim = TensorShape_dim(t.shape);
-
-    int m, n, offset;
-    TensorShape slice_shape = {0, 0, 0, 0};
-
-    if (dim == 3) {
-        int b = t.shape[0]; m = t.shape[1]; n = t.shape[2];
-        assert(batch_idx >= 0 && batch_idx < b);
-
-        offset = batch_idx * m * n;
-        slice_shape[0] = m; slice_shape[1] = n;
-    } else if (dim == 4) {
-        int b = t.shape[0], g = t.shape[1];
-        m = t.shape[2]; n = t.shape[3];
-
-        assert(batch_idx >= 0 && batch_idx < b);
-        assert(group_idx >= 0 && group_idx < g);
-        offset = (batch_idx * g + group_idx) * m * n;
-        slice_shape[0] = m; slice_shape[1] = n;
-    } else {
-        assert(0);
-    }
-
-    Tensor res = Tensor_new(slice_shape, t.node != NULL);
-    memcpy(res.data->flex, t.data->flex + offset, sizeof(float) * m * n);
-    return res;
-}
-
-Tensor Tensor_matmul_batch(Tensor self, Tensor other) {
+Tensor Tensor_matmul(Tensor self, Tensor other) {
     int self_dim = TensorShape_dim(self.shape);
     int other_dim = TensorShape_dim(other.shape);
 
-    assert((self_dim == 3 || self_dim == 4) && (other_dim == 3 || other_dim == 4));
+    assert(self_dim >= 2);
+    assert(other_dim >= 2);
 
-    // broadcasting
-    int batch = (self.shape[0] > other.shape[0]) ? self.shape[0] : other.shape[0];
+    int batch_self = (self_dim >= 3) ? self.shape[0] : 1;
+    int batch_other = (other_dim >= 3) ? other.shape[0] : 1;
+    int batch = (batch_self > batch_other) ? batch_self : batch_other;
 
-    int self_g = (self_dim == 4) ? self.shape[1] : 1;
-    int other_g = (other_dim == 4) ? other.shape[1] : 1;
-    int group = (self_g > other_g) ? self_g : other_g;
+    int group_self = (self_dim == 4) ? self.shape[1] : 1;
+    int group_other = (other_dim == 4) ? other.shape[1] : 1;
+    int group = (group_self > group_other) ? group_self : group_other;
 
     int m = self.shape[self_dim - 2];
     int n = self.shape[self_dim - 1];
     int p = other.shape[other_dim - 1];
-    // {b,g,m,n} * {b,g,n,p} -> {b,g,m,p} (g=1 for 3D)
-
-    assert(n == other.shape[other_dim - 2]);
 
-    TensorShape res_shape = {batch, m, p, 0};
-    if (group > 1) {
-        res_shape[0] = batch;
-        res_shape[1] = group;
-        res_shape[2] = m;
-        res_shape[3] = p;
-    }
-
-    Tensor res = Tensor_new(res_shape, self.node != NULL || other.node != NULL);
-    for(int b = 0; b < batch; b++) {
-        int selfbatch = self.shape[0] <= b ? self.shape[0] - 1 : b;
-        int otherbatch = other.shape[0] <= b ? other.shape[0] - 1 : b;
-
-        for(int g = 0; g < group; g++) {
-            int selfgroup = self_g <= g ? self_g - 1 : g;
-            int othergroup = other_g <= g ? other_g - 1 : g;
+    assert(n == other.shape[other_dim - 2]);
 
-            Tensor self_slice = Tensor_batch_slice(self, selfbatch, selfgroup);
-            Tensor other_slice = Tensor_batch_slice(other, otherbatch, othergroup);
-            Tensor res_slice = Tensor_matmul(self_slice, other_slice);
+    bool has4D = (self_dim == 4 || other_dim == 4);
 
-            int offset = ((batch > 1) ? b * group + g : g) * m * p;
-            memcpy(res.data->flex + offset, res_slice.data->flex, sizeof(float) * m * p);
+    TensorShape res_shape = {0, 0, 0, 0};
+    if (self_dim <= 2 && other_dim <= 2) {
+        res_shape[0] = m;
+        res_shape[1] = p;
+    } else {
+        res_shape[0] = batch;
+        if (has4D) {
+            res_shape[1] = group;
+            res_shape[2] = m;
+            res_shape[3] = p;
+        } else {
+            res_shape[1] = m;
+            res_shape[2] = p;
+            res_shape[3] = 0;
         }
     }
 
-    if(res.node != NULL) {
-        res.node->grad_fn = GradFn_matmul;
-        res.node->inputs[0] = self;
-        res.node->inputs[1] = other;
-        res.node->n_inputs = 2;
-        res.node->name = "MatmulBatch";
-    }
-    return res;
-}
-
-Tensor Tensor_matmul(Tensor self, Tensor other) {
-    int self_dim = TensorShape_dim(self.shape);
-    int other_dim = TensorShape_dim(other.shape);
+    Tensor res = Tensor_new(res_shape, self.node != NULL || other.node != NULL);
 
-    assert(self_dim >= 2);
-    assert(other_dim >= 2);
+    for (int b = 0; b < batch; b++) {
+        int self_b = (batch_self <= b) ? batch_self - 1 : b;
+        int other_b = (batch_other <= b) ? batch_other - 1 : b;
 
-    if (self_dim > 2 || other_dim > 2) {
-        return Tensor_matmul_batch(self, other);
-    }
+        for (int g = 0; g < group; g++) {
+            int self_g = (group_self <= g) ? group_self - 1 : g;
+            int other_g = (group_other <= g) ? group_other - 1 : g;
 
-    int m = self.shape[self_dim - 2];
-    int n = self.shape[self_dim - 1];
-    int p = other.shape[other_dim - 1];
+            int offset_self = 0;
+            if (self_dim == 4) {
+                offset_self = self_b * self.shape[1] * m * n + self_g * m * n;
+            } else if (self_dim == 3) {
+                offset_self = self_b * m * n;
+            }
 
-    assert(n == other.shape[other_dim - 2]);
+            int offset_other = 0;
+            if (other_dim == 4) {
+                offset_other = other_b * other.shape[1] * n * p + other_g * n * p;
+            } else if (other_dim == 3) {
+                offset_other = other_b * n * p;
+            }
 
-    TensorShape res_shape;
-    memcpy(res_shape, self.shape, sizeof(TensorShape));
-    res_shape[self_dim - 1] = p;
+            int offset_res = ((batch > 1) ? b * group + g : g) * m * p;
 
-    // here weight/bias have .node != NULL, so res have GradNode
-    Tensor res = Tensor_new(res_shape, self.node != NULL || other.node != NULL);
+            float* self_ptr = self.data->flex + offset_self;
+            float* other_ptr = other.data->flex + offset_other;
+            float* res_ptr = res.data->flex + offset_res;
 
-    for(int i = 0; i < m; i++) {
-        for(int j = 0; j < p; j++) {
-            float sum = 0;
-            for(int k = 0; k < n; k++) {
-                sum += self.data->flex[i * n + k] * other.data->flex[k * p + j];
+            for (int i = 0; i < m; i++) {
+                for (int j = 0; j < p; j++) {
+                    float sum = 0;
+                    for (int k = 0; k < n; k++) {
+                        sum += self_ptr[i * n + k] * other_ptr[k * p + j];
+                    }
+                    res_ptr[i * p + j] = sum;
+                }
             }
-            res.data->flex[i * p + j] = sum;
         }
     }
 
-    if(res.node != NULL) {
+    if (res.node != NULL) {
         res.node->grad_fn = GradFn_matmul;
         res.node->inputs[0] = self;
         res.node->inputs[1] = other;
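The merged function drops the slice-and-memcpy round trip in favor of direct offset arithmetic: slice (b, g) of a row-major {batch, group, m, n} tensor starts at flat index (b * group + g) * m * n, and a size-1 batch or group dimension broadcasts by clamping its index to its only slice. A standalone sketch of just that indexing rule (plain C, not cten code; the sizes are invented for illustration):

```c
#include <stdio.h>

int main(void) {
    int batch = 2, group = 3, m = 2, n = 2;
    int batch_self = 1; /* this operand has batch 1, so it broadcasts over b */
    for (int b = 0; b < batch; b++) {
        /* same clamp as the merged Tensor_matmul above */
        int self_b = (batch_self <= b) ? batch_self - 1 : b;
        for (int g = 0; g < group; g++) {
            int offset = (self_b * group + g) * m * n;
            printf("b=%d g=%d -> reads slice at flat offset %d\n", b, g, offset);
        }
    }
    return 0;
}
```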

src/utils.c

Lines changed: 29 additions & 0 deletions
@@ -222,6 +222,35 @@ TensorMaxMinResult Tensor_min_dim(Tensor self, int dim) {
     return result;
 }
 
+Tensor Tensor_batch_slice(Tensor t, int batch_idx, int group_idx) {
+    int dim = TensorShape_dim(t.shape);
+
+    int m, n, offset;
+    TensorShape slice_shape = {0, 0, 0, 0};
+
+    if (dim == 3) {
+        int b = t.shape[0]; m = t.shape[1]; n = t.shape[2];
+        assert(batch_idx >= 0 && batch_idx < b);
+
+        offset = batch_idx * m * n;
+        slice_shape[0] = m; slice_shape[1] = n;
+    } else if (dim == 4) {
+        int b = t.shape[0], g = t.shape[1];
+        m = t.shape[2]; n = t.shape[3];
+
+        assert(batch_idx >= 0 && batch_idx < b);
+        assert(group_idx >= 0 && group_idx < g);
+        offset = (batch_idx * g + group_idx) * m * n;
+        slice_shape[0] = m; slice_shape[1] = n;
+    } else {
+        assert(0);
+    }
+
+    Tensor res = Tensor_new(slice_shape, t.node != NULL);
+    memcpy(res.data->flex, t.data->flex + offset, sizeof(float) * m * n);
+    return res;
+}
+
 void cten_assert(bool cond, const char* fmt, ...) {
     if(!cond) {
         va_list args;
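Note that `Tensor_batch_slice`, now relocated to utils, copies rather than views: it allocates a fresh {m, n} tensor and memcpys m * n floats starting at the computed offset, and `group_idx` is unused for 3D inputs. A usage sketch with illustrative shapes:

```c
/* Pull the second 3x4 matrix out of a {2, 3, 4} tensor. */
TensorShape t_shape = {2, 3, 4, 0};
Tensor t = Tensor_new(t_shape, false);
Tensor slice = Tensor_batch_slice(t, 1, 0); /* a copy, not a view */
/* slice.shape is {3, 4, 0, 0}; writes to slice never touch t.   */
```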

tests/Operator/test_matmul.c

Lines changed: 21 additions & 21 deletions
@@ -282,18 +282,17 @@ void test_matmul_operator() {
             float exp_d[] = {1.0745f, 1.4433f, 0.7899f, 1.5456f, 1.4509f, 0.6064f,
                 0.9774f, 0.4197f, 1.1520f, 1.0043f, 1.7620f, 1.9396f, 1.4062f, 1.9461f, 1.9424f,
                 0.5314f, 0.8391f, 0.8748f, 0.3471f, 1.1284f, 1.1388f, 1.1492f, 1.0333f,
-                0.8970f, 1.6950f, 0.9817f, 1.0865f, 1.0302f, 0.7693f, 1.6373f};
+                0.8970f, 1.6950f, 0.9817f, 1.0865f, 1.0302f, 0.7693f, 1.6372f};
 
             Tensor t1 = create_test_tensor(s1_shape, d1, false);
             Tensor t2 = create_test_tensor(s2_shape, d2, false);
             Tensor expected_res = create_test_tensor(exp_shape, exp_d, false);
             Tensor actual_res = Tensor_matmul(t1, t2);
 
-            compare_tensors(&actual_res, &expected_res, op_name, tc_name, 1,
-                            TEST_FLOAT_TOLERANCE);
+            compare_tensors(&actual_res, &expected_res, op_name, tc_name, 1, TEST_FLOAT_TOLERANCE);
         }
 
-        // Sub-test case 1.1: Batch matrix multiplication using integers only (2x3x4 * 2x4x5)
+        // Sub-test case 2: Batch matrix multiplication using integers only (2x3x4 * 2x4x5)
         {
             TensorShape s1_shape = {2, 3, 4};
             float d1[] = {
@@ -332,10 +331,10 @@ void test_matmul_operator() {
             Tensor expected_res = create_test_tensor(exp_shape, exp_d, false);
             Tensor actual_res = Tensor_matmul(t1, t2);
 
-            compare_tensors(&actual_res, &expected_res, op_name, tc_name, 5, TEST_FLOAT_TOLERANCE);
+            compare_tensors(&actual_res, &expected_res, op_name, tc_name, 2, TEST_FLOAT_TOLERANCE);
         }
 
-        // Sub-test 2: Batch of identity matrices — result should equal second operand
+        // Sub-test 3: Batch of identity matrices — result should equal second operand
         // s1: {3,2,2} (3 identity matrices), s2: {3,2,2}
         {
             TensorShape s1_shape = {3, 2, 2};
@@ -362,10 +361,10 @@ void test_matmul_operator() {
            Tensor expected_res = create_test_tensor(exp_shape, exp_d, false);
             Tensor actual_res = Tensor_matmul(t1, t2);
 
-            compare_tensors(&actual_res, &expected_res, op_name, tc_name, 2, TEST_FLOAT_TOLERANCE);
+            compare_tensors(&actual_res, &expected_res, op_name, tc_name, 3, TEST_FLOAT_TOLERANCE);
         }
 
-        // Sub-test 3: Rectangular per-batch multiply (2 batches): {2,1,3} @ {2,3,2} -> {2,1,2}
+        // Sub-test 4: Rectangular per-batch multiply (2 batches): {2,1,3} @ {2,3,2} -> {2,1,2}
         {
            TensorShape s1_shape = {2, 1, 3};
             float d1[] = {
@@ -388,10 +387,10 @@ void test_matmul_operator() {
             Tensor expected_res = create_test_tensor(exp_shape, exp_d, false);
             Tensor actual_res = Tensor_matmul(t1, t2);
 
-            compare_tensors(&actual_res, &expected_res, op_name, tc_name, 3, TEST_FLOAT_TOLERANCE);
+            compare_tensors(&actual_res, &expected_res, op_name, tc_name, 4, TEST_FLOAT_TOLERANCE);
         }
 
-        // Sub-test 4: Batch of column-result matrices using ones to test reduction (4 batches): {4,2,3}@{4,3,1} -> {4,2,1}
+        // Sub-test 5: Batch of column-result matrices using ones to test reduction (4 batches): {4,2,3}@{4,3,1} -> {4,2,1}
         {
             TensorShape s1_shape = {4, 2, 3};
             // each 2x3 filled with ones
@@ -411,7 +410,7 @@ void test_matmul_operator() {
             Tensor expected_res = create_test_tensor(exp_shape, exp_d, false);
             Tensor actual_res = Tensor_matmul(t1, t2);
 
-            compare_tensors(&actual_res, &expected_res, op_name, tc_name, 4, TEST_FLOAT_TOLERANCE);
+            compare_tensors(&actual_res, &expected_res, op_name, tc_name, 5, TEST_FLOAT_TOLERANCE);
         }
     }
 
@@ -463,10 +462,10 @@ void test_matmul_operator() {
 
         TensorShape exp_shape = {4, 3};
         float exp_d[] = {
-            0.7616f, 1.3740f, 1.5423f, // Row 0
-            0.3637f, 1.5308f, 1.3906f, // Row 1
-            0.5558f, 1.1748f, 1.4725f, // Row 2
-            0.3675f, 0.9730f, 0.9582f, // Row 3
+            0.7617f, 1.3740f, 1.5422f, // Row 0
+            0.3638f, 1.5307f, 1.3906f, // Row 1
+            0.5559f, 1.1747f, 1.4724f, // Row 2
+            0.3675f, 0.9729f, 0.9581f, // Row 3
         };
 
         Tensor t1 = create_test_tensor(s1_shape, d1, false);
@@ -537,8 +536,8 @@ void test_matmul_operator() {
             0.4677f, 0.3816f,
             1.1133f, 0.8875f,
 
-            0.8504f, 0.6607f,
-            0.3593f, 0.1660f,
+            0.8505f, 0.6607f,
+            0.3593f, 0.1659f,
         };
 
         Tensor t1 = create_test_tensor(s1_shape, d1, false);
@@ -604,15 +603,16 @@ void test_matmul_operator() {
             4.0f, 5.0f,
             10.0f, 11.0f,
 
-            5.0f, 8.0f,
-            14.0f, 20.0f,
+            4.0f, 8.0f,
+            13.0f, 20.0f,
         };
 
         Tensor t1 = create_test_tensor(s1_shape, d1, false);
         Tensor t2 = create_test_tensor(s2_shape, d2, false);
         Tensor expected_res = create_test_tensor(exp_shape, exp_d, false);
         Tensor actual_res = Tensor_matmul(t1, t2);
 
+
         compare_tensors(&actual_res, &expected_res, op_name, tc_name, 5, TEST_FLOAT_TOLERANCE);
     }
 
@@ -643,14 +643,14 @@ void test_matmul_operator() {
             1.0f, 2.0f,
             1.0f, 1.0f,
 
-            2.0f, 2.0f,
+            3.0f, 2.0f,
             3.0f, 2.0f,
 
             1.0f, 1.0f,
             2.0f, 1.0f,
 
-            0.0f, 2.0f,
             1.0f, 3.0f,
+            3.0f, 3.0f,
         };
 
         Tensor t1 = create_test_tensor(s1_shape, d1, false);
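Several of the float edits above change only the last printed digit, which suggests the constants were re-derived at higher precision. One way to generate such expected values is a reference multiply in double precision printed to four decimals; a sketch with placeholder matrices (not the actual test data):

```c
#include <stdio.h>

/* Reference matmul in double precision: c (m x p) = a (m x n) * b (n x p). */
static void matmul_ref(const double* a, const double* b, double* c,
                       int m, int n, int p) {
    for (int i = 0; i < m; i++)
        for (int j = 0; j < p; j++) {
            double sum = 0.0;
            for (int k = 0; k < n; k++) sum += a[i * n + k] * b[k * p + j];
            c[i * p + j] = sum;
        }
}

int main(void) {
    double a[] = {0.1, 0.2, 0.3, 0.4}; /* placeholder 2x2 */
    double b[] = {0.5, 0.6, 0.7, 0.8}; /* placeholder 2x2 */
    double c[4];
    matmul_ref(a, b, c, 2, 2, 2);
    for (int i = 0; i < 4; i++) printf("%.4ff\n", c[i]);
    return 0;
}
```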
