
Commit cd2a310

Merge pull request #24 from Advaitgaur004/backward-3
Fix: Stabilize Backpropagation for Sum and Mean
2 parents 451cc3f + b707ee5 commit cd2a310

File tree

9 files changed, +705 −72 lines changed


include/cten.h

Lines changed: 2 additions & 1 deletion
@@ -134,4 +134,5 @@ void cten_assert_dim(const char* title, int a, int b);
 bool cten_elemwise_broadcast(Tensor* a, Tensor* b);
 int load_iris_dataset(const float (**X)[4], const int** y);
 Tensor Tensor_reduce_dim(Tensor self, int dim, const char* operation);
-Tensor reduce_gradient_for_broadcasting(Tensor grad, TensorShape original_shape, TensorShape broadcasted_shape);
+Tensor reduce_gradient_for_broadcasting(Tensor grad, TensorShape original_shape, TensorShape broadcasted_shape);
+Tensor Tensor_unsqueeze(Tensor self, int dim);
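The Tensor_unsqueeze declaration is the new helper this PR introduces. As a hypothetical usage sketch (not code from this commit, but following the Tensor_new and TensorShape initializer style visible elsewhere in this diff), it inserts a size-1 axis at the given position so a reduced gradient can later be broadcast back against its input:

    TensorShape s = {2, 3};
    Tensor t = Tensor_new(s, false);     // shape {2, 3}
    Tensor u = Tensor_unsqueeze(t, 1);   // shape {2, 1, 3}; element count unchanged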

src/basic.c

Lines changed: 26 additions & 1 deletion
@@ -149,7 +149,32 @@ void Tensor_backward(Tensor self, Tensor grad) {
         // Step 1: Get the local gradient (the partial derivative). --> For z = f(x, y), this would be dz/dx or dz/dy.
         Tensor input_grad = self.node->grad_fn(self, i);
 
-        // Step 2: Apply the chain rule. --> The gradient flowing to the input is upstream_grad * local_grad.
+        // This is the gradient flowing from the output, which we need to propagate backwards.
+        Tensor grad = self.node->grad;
+        int input_ndim = TensorShape_dim(input_tensor.shape);
+        int grad_ndim = TensorShape_dim(grad.shape);
+
+        if ((strcmp(self.node->name, "Sum") == 0 || strcmp(self.node->name, "Mean") == 0) && input_ndim > grad_ndim) {
+            // Find the dimension that was reduced. We assume the non-reduced dimensions match in size.
+            int unsqueeze_dim = -1;
+            int grad_idx = 0;
+            for (int dim_idx = 0; dim_idx < input_ndim; ++dim_idx) {
+                if (grad_idx >= grad_ndim || input_tensor.shape[dim_idx] != grad.shape[grad_idx]) {
+                    // Yes, this is the dimension that was removed.
+                    unsqueeze_dim = dim_idx;
+                    break;
+                }
+                grad_idx++;
+            }
+
+            if (unsqueeze_dim != -1) {
+                grad = Tensor_unsqueeze(grad, unsqueeze_dim);
+            } else {
+                cten_assert(false, "Could not deduce unsqueeze dimension.");
+            }
+        }
+
+        // Step 2: Apply the chain rule (upstream_grad * local_grad)
         Tensor combined_grad;
         if(strcmp(self.node->name, "Matmul") == 0) {
             if (i == 0) {
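To make the new shape handling concrete: after a per-axis Sum or Mean, the upstream gradient has one fewer dimension than the input (summing a {2, 3} tensor over dim 0 leaves a {3} gradient), so the code above finds the missing axis and unsqueezes the gradient there before the chain-rule multiply. Below is a standalone sketch of that deduction logic in plain C, outside the cten API, under the same assumption stated in the diff comment (non-reduced dimensions keep their sizes):

    #include <stdio.h>

    /* Return the index of the axis removed by a reduction, or -1 if none. */
    static int find_reduced_dim(const int* in_shape, int in_ndim,
                                const int* out_shape, int out_ndim) {
        int out_idx = 0;
        for (int d = 0; d < in_ndim; ++d) {
            /* The first position where the shapes disagree (or where the
             * output shape runs out) is taken to be the reduced axis. */
            if (out_idx >= out_ndim || in_shape[d] != out_shape[out_idx]) return d;
            out_idx++;
        }
        return -1;
    }

    int main(void) {
        int input[] = {2, 3};  /* input to Sum/Mean */
        int grad[]  = {3};     /* upstream gradient after reducing dim 0 */
        /* Prints 0: unsqueezing the gradient at dim 0 gives {1, 3}, which
         * broadcasts against the {2, 3} input in the chain rule. */
        printf("reduced dim = %d\n", find_reduced_dim(input, 2, grad, 1));
        return 0;
    }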

src/operator.c

Lines changed: 33 additions & 4 deletions
@@ -105,11 +105,40 @@ void Tensor_argmax(Tensor self, int* out) {
 }
 
 Tensor GradFn_mean(Tensor self, int i) {
-    // f(x) = mean(x); f'(x) = 1 / x.numel()
-    Tensor res = Tensor_new(self.shape, false);
-    for(int i = 0; i < res.data->numel; i++) {
-        res.data->flex[i] = 1.0f / self.data->numel;
+    Tensor input_tensor = self.node->inputs[i];
+    int divisor;
+
+    if (TensorShape_numel(self.shape) == 1 && TensorShape_numel(input_tensor.shape) > 1) {
+        divisor = TensorShape_numel(input_tensor.shape);
+    } else {
+        int input_ndim = TensorShape_dim(input_tensor.shape);
+        int output_ndim = TensorShape_dim(self.shape);
+        if (input_ndim > output_ndim) {
+            int out_idx = 0;
+            int reduced_dim_size = 1;
+            for(int d=0; d < input_ndim; ++d) {
+                if(out_idx >= output_ndim || input_tensor.shape[d] != self.shape[out_idx]) {
+                    reduced_dim_size = input_tensor.shape[d];
+                    break;
+                }
+                out_idx++;
+            }
+            divisor = reduced_dim_size;
+        } else {
+            // scalar input
+            divisor = TensorShape_numel(input_tensor.shape);
+        }
     }
+
+    // gradient ==> SAME SHAPE as the ORIGINAL INPUT.
+    Tensor res = Tensor_new(input_tensor.shape, false);
+
+    // gradient value is 1 divided by the number of elements that were averaged.
+    float grad_val = 1.0f / divisor;
+
+    for(int j = 0; j < res.data->numel; j++) {
+        res.data->flex[j] = grad_val;
+    }
     return res;
 }
 
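The important change is that the divisor now depends on how the mean was taken (a full mean divides by the input's element count, a per-axis mean divides by the size of the removed axis) and that the gradient is allocated with the input's shape rather than the output's. A self-contained sketch of the same divisor rule in plain C, outside cten, with illustrative shapes:

    #include <stdio.h>

    static int numel(const int* shape, int ndim) {
        int n = 1;
        for (int i = 0; i < ndim; ++i) n *= shape[i];
        return n;
    }

    /* Divisor for the mean gradient: a full reduction divides by every element,
     * an axis reduction divides by the size of the axis that was removed. */
    static int mean_divisor(const int* in, int in_ndim, const int* out, int out_ndim) {
        if (numel(out, out_ndim) == 1 && numel(in, in_ndim) > 1)
            return numel(in, in_ndim);                 /* full mean */
        if (in_ndim > out_ndim) {
            int out_idx = 0;
            for (int d = 0; d < in_ndim; ++d) {        /* find the removed axis */
                if (out_idx >= out_ndim || in[d] != out[out_idx]) return in[d];
                out_idx++;
            }
        }
        return numel(in, in_ndim);                     /* scalar input */
    }

    int main(void) {
        int in[] = {2, 3};
        int full[] = {1};   /* mean over everything */
        int dim1[] = {2};   /* mean over dim 1 only */
        printf("full mean:  grad = 1/%d per element\n", mean_divisor(in, 2, full, 1)); /* 1/6 */
        printf("dim-1 mean: grad = 1/%d per element\n", mean_divisor(in, 2, dim1, 1)); /* 1/3 */
        return 0;
    }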

src/utils.c

Lines changed: 23 additions & 0 deletions
@@ -342,5 +342,28 @@ Tensor Tensor_reduce_dim(Tensor self, int dim, const char* operation) {
         }
     }
 
+    return res;
+}
+
+Tensor Tensor_unsqueeze(Tensor self, int dim) {
+    int old_ndim = TensorShape_dim(self.shape);
+    cten_assert(dim >= 0 && dim <= old_ndim, "Unsqueeze dim out of bounds");
+
+    TensorShape new_shape = {0};
+    int old_idx = 0;
+    // insert a '1' at the 'dim' position in the new shape.
+    for (int i = 0; i < old_ndim + 1 && i < 4; i++) {
+        if (i == dim) {
+            new_shape[i] = 1;
+        } else {
+            if(old_idx < 4) {
+                new_shape[i] = self.shape[old_idx++];
+            }
+        }
+    }
+
+    Tensor res = self;
+    memcpy(res.shape, new_shape, sizeof(TensorShape));
+
     return res;
 }
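One note on the approach (editorial, not from the commit): inserting a size-1 dimension changes neither the element count nor the row-major element order, so returning a copy of the Tensor struct that shares the original data buffer and only rewrites the shape metadata is sufficient. A small hypothetical sanity check, reusing calls already visible in this diff:

    TensorShape s = {4, 5};
    Tensor a = Tensor_new(s, false);
    Tensor b = Tensor_unsqueeze(a, 0);   // shape becomes {1, 4, 5}
    // Metadata-only change: the element count must be preserved.
    cten_assert(TensorShape_numel(a.shape) == TensorShape_numel(b.shape),
                "unsqueeze changed numel");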

tests/Backward/test_linear_backward.c

Lines changed: 44 additions & 45 deletions
@@ -163,66 +163,65 @@ void test_linear_backward() {
         }
     }
 
-    // TODO: Tensor_sum and Tensor_mean backward is in working progress
-    // // Test Case 4: Chained operations with linear
-    // {
-    //     const char* tc_name = "Chained_operations_with_linear";
-    //     // Sub-test 1: Linear followed by sum
-    //     {
-    //         TensorShape input_shape = {2, 3}; // batch_size=2, input_features=3
-    //         TensorShape weight_shape = {3, 2}; // input_features=3, output_features=2
-    //         TensorShape bias_shape = {1, 2}; // output_features=2
+    // Test Case 4: Chained operations with linear
+    {
+        const char* tc_name = "Chained_operations_with_linear";
+        // Sub-test 1: Linear followed by sum
+        {
+            TensorShape input_shape = {2, 3}; // batch_size=2, input_features=3
+            TensorShape weight_shape = {3, 2}; // input_features=3, output_features=2
+            TensorShape bias_shape = {1, 2}; // output_features=2
 
-    //         float input_data[] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f};
-    //         float weight_data[] = {0.1f, 0.2f, 0.3f, 0.4f, 0.5f, 0.6f};
-    //         float bias_data[] = {0.1f, 0.2f};
+            float input_data[] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f};
+            float weight_data[] = {0.1f, 0.2f, 0.3f, 0.4f, 0.5f, 0.6f};
+            float bias_data[] = {0.1f, 0.2f};
 
-    //         // Expected gradients
-    //         float exp_grad_bias[] = {1.0f, 1.0f}; // For sum reduction
+            // Expected gradients
+            float exp_grad_bias[] = {2.0f, 2.0f}; // For sum reduction
 
-    //         Tensor input = create_test_tensor(input_shape, input_data, true);
-    //         Tensor weight = create_test_tensor(weight_shape, weight_data, true);
-    //         Tensor bias = create_test_tensor(bias_shape, bias_data, true);
+            Tensor input = create_test_tensor(input_shape, input_data, true);
+            Tensor weight = create_test_tensor(weight_shape, weight_data, true);
+            Tensor bias = create_test_tensor(bias_shape, bias_data, true);
 
-    //         Tensor output = nn_linear(input, weight, bias);
-    //         Tensor sum_output = Tensor_sum(output);
+            Tensor output = nn_linear(input, weight, bias);
+            Tensor sum_output = Tensor_sum(output);
 
-    //         Tensor_backward(sum_output, (Tensor){0});
+            Tensor_backward(sum_output, (Tensor){0});
 
-    //         Tensor expected_grad_bias = create_test_tensor(bias_shape, exp_grad_bias, false);
+            Tensor expected_grad_bias = create_test_tensor(bias_shape, exp_grad_bias, false);
 
-    //         // Focus on bias gradient
-    //         compare_tensors(&bias.node->grad, &expected_grad_bias, op_name, tc_name, 1, TEST_FLOAT_TOLERANCE);
-    //     }
+            // Focus on bias gradient
+            compare_tensors(&bias.node->grad, &expected_grad_bias, op_name, tc_name, 1, TEST_FLOAT_TOLERANCE);
+        }
 
-    //     // Sub-test 2: Linear followed by mean
-    //     {
-    //         TensorShape input_shape = {2, 3}; // batch_size=2, input_features=3
-    //         TensorShape weight_shape = {3, 2}; // input_features=3, output_features=2
-    //         TensorShape bias_shape = {1, 2}; // output_features=2
+        // Sub-test 2: Linear followed by mean
+        {
+            TensorShape input_shape = {2, 3}; // batch_size=2, input_features=3
+            TensorShape weight_shape = {3, 2}; // input_features=3, output_features=2
+            TensorShape bias_shape = {1, 2}; // output_features=2
 
-    //         float input_data[] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f};
-    //         float weight_data[] = {0.1f, 0.2f, 0.3f, 0.4f, 0.5f, 0.6f};
-    //         float bias_data[] = {0.1f, 0.2f};
+            float input_data[] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f};
+            float weight_data[] = {0.1f, 0.2f, 0.3f, 0.4f, 0.5f, 0.6f};
+            float bias_data[] = {0.1f, 0.2f};
 
-    //         // Expected gradients
-    //         float exp_grad_bias[] = {0.25f, 0.25f}; // For mean reduction (1/4)
+            // Expected gradients
+            float exp_grad_bias[] = {0.5f, 0.5f}; // For mean reduction (1/2)
 
-    //         Tensor input = create_test_tensor(input_shape, input_data, true);
-    //         Tensor weight = create_test_tensor(weight_shape, weight_data, true);
-    //         Tensor bias = create_test_tensor(bias_shape, bias_data, true);
+            Tensor input = create_test_tensor(input_shape, input_data, true);
+            Tensor weight = create_test_tensor(weight_shape, weight_data, true);
+            Tensor bias = create_test_tensor(bias_shape, bias_data, true);
 
-    //         Tensor output = nn_linear(input, weight, bias);
-    //         Tensor mean_output = Tensor_mean(output);
+            Tensor output = nn_linear(input, weight, bias);
+            Tensor mean_output = Tensor_mean(output);
 
-    //         Tensor_backward(mean_output, (Tensor){0});
+            Tensor_backward(mean_output, (Tensor){0});
 
-    //         Tensor expected_grad_bias = create_test_tensor(bias_shape, exp_grad_bias, false);
+            Tensor expected_grad_bias = create_test_tensor(bias_shape, exp_grad_bias, false);
 
-    //         // Focus on bias gradient
-    //         compare_tensors(&bias.node->grad, &expected_grad_bias, op_name, tc_name, 2, TEST_FLOAT_TOLERANCE);
-    //     }
-    // }
+            // Focus on bias gradient
+            compare_tensors(&bias.node->grad, &expected_grad_bias, op_name, tc_name, 2, TEST_FLOAT_TOLERANCE);
+        }
+    }
 
     cten_free(pool_id);
 }
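As an informal cross-check of the re-enabled expected values (editorial derivation, not part of the commit): the linear output has shape {2, 2}, and each bias entry accumulates gradient over the batch dimension of size 2.

    Sum:  d(sum)/d(out[i][j])  = 1,   so grad_bias[j] = 1 + 1     = 2.0
    Mean: d(mean)/d(out[i][j]) = 1/4, so grad_bias[j] = 1/4 + 1/4 = 0.5

These match the updated {2.0f, 2.0f} and {0.5f, 0.5f} expectations above.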

0 commit comments