Skip to content

Commit 45025b0

Browse files
committed
feat: Add DepthwiseConv2D operation and update DepthwiseSeparableConvolutionalLayer
DepthwiseConv2D implements depthwise convolution where each input channel is convolved with its own set of filters (no channel mixing).

Key features:
- Each input channel filtered independently with multiplier filters
- Output channels = in_channels * multiplier
- Critical for MobileNets and efficient architectures
- Dramatically reduces computation vs standard convolution

Layer updates:
- DepthwiseSeparableConvolutionalLayer now uses DepthwiseConv2D + Conv2D operations
- Handles NHWC/NCHW format conversion between layer and operations
- Full autodiff support with parameter gradient computation

Total TensorOperations: 36 (35 previous + 1 new)
Layers with autodiff: 18 (17 previous + 1 new)
1 parent b85a266 commit 45025b0

File tree

2 files changed

+460
-7
lines changed

2 files changed

+460
-7
lines changed

src/Autodiff/TensorOperations.cs

Lines changed: 253 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3843,5 +3843,258 @@ void BackwardFunction(Tensor<T> gradient)
38433843

38443844
return node;
38453845
}
3846+
3847+
/// <summary>
/// Performs depthwise 2D convolution where each input channel is convolved with its own set of filters.
/// </summary>
/// <param name="input">Input tensor of shape [batch, in_channels, height, width]</param>
/// <param name="kernel">Kernel tensor of shape [in_channels, multiplier, kernel_height, kernel_width]</param>
/// <param name="bias">Optional bias tensor of shape [in_channels * multiplier]</param>
/// <param name="stride">Stride for the convolution, defaults to [1, 1]</param>
/// <param name="padding">Padding for the convolution, defaults to [0, 0]</param>
/// <returns>Output tensor of shape [batch, in_channels * multiplier, out_height, out_width]</returns>
/// <exception cref="ArgumentException">
/// Thrown when input or kernel is not 4D, when input channels do not match kernel input channels,
/// when stride or padding is not a 2-element array, when the bias length does not equal
/// in_channels * multiplier, or when the kernel is larger than the padded input (so the
/// computed output dimensions would be non-positive).
/// </exception>
/// <remarks>
/// <para>
/// Depthwise convolution applies a separate filter to each input channel independently, with no mixing
/// across channels. This is in contrast to standard convolution which mixes all input channels.
/// Each input channel gets 'multiplier' filters applied to it, producing 'multiplier' output channels.
/// The total output channels is in_channels * multiplier.
/// </para>
/// <para>
/// This operation is commonly used in MobileNets and other efficient architectures, often followed
/// by a pointwise (1x1) convolution to mix channels. The combination dramatically reduces
/// computational cost compared to standard convolution.
/// </para>
/// <para>
/// Forward pass computes the depthwise convolution by applying each filter only to its corresponding
/// input channel. Backward pass computes gradients with respect to input, kernel, and bias.
/// </para>
/// </remarks>
public static ComputationNode<T> DepthwiseConv2D(
    ComputationNode<T> input,
    ComputationNode<T> kernel,
    ComputationNode<T>? bias = null,
    int[]? stride = null,
    int[]? padding = null)
{
    var numOps = MathHelper.GetNumericOperations<T>();
    var inputShape = input.Value.Shape;
    var kernelShape = kernel.Value.Shape;

    // Validate input shape (must be 4D: [batch, in_channels, height, width])
    if (inputShape.Length != 4)
        throw new ArgumentException("Input must be 4D tensor [batch, in_channels, height, width]");

    // Validate kernel shape (must be 4D: [in_channels, multiplier, kernel_height, kernel_width])
    if (kernelShape.Length != 4)
        throw new ArgumentException("Kernel must be 4D tensor [in_channels, multiplier, kernel_height, kernel_width]");

    if (inputShape[1] != kernelShape[0])
        throw new ArgumentException($"Input channels ({inputShape[1]}) must match kernel input channels ({kernelShape[0]})");

    // Default stride and padding
    stride ??= new int[] { 1, 1 };
    padding ??= new int[] { 0, 0 };

    if (stride.Length != 2 || padding.Length != 2)
        throw new ArgumentException("Stride and padding must be 2D arrays [height, width]");

    int batch = inputShape[0];
    int inChannels = inputShape[1];
    int inHeight = inputShape[2];
    int inWidth = inputShape[3];
    int multiplier = kernelShape[1];
    int kernelHeight = kernelShape[2];
    int kernelWidth = kernelShape[3];
    int strideH = stride[0];
    int strideW = stride[1];
    int padH = padding[0];
    int padW = padding[1];

    // Calculate output dimensions (standard convolution output-size formula)
    int outHeight = (inHeight + 2 * padH - kernelHeight) / strideH + 1;
    int outWidth = (inWidth + 2 * padW - kernelWidth) / strideW + 1;
    int outChannels = inChannels * multiplier;

    // FIX: fail fast with a clear message when the kernel exceeds the padded input.
    // Previously a non-positive dimension flowed into the output-tensor constructor,
    // producing a cryptic downstream error instead of an argument validation error.
    if (outHeight <= 0 || outWidth <= 0)
        throw new ArgumentException(
            $"Computed output dimensions ({outHeight}x{outWidth}) must be positive; " +
            $"kernel ({kernelHeight}x{kernelWidth}) with padding ({padH},{padW}) exceeds input ({inHeight}x{inWidth})");

    // Validate bias if provided
    if (bias != null)
    {
        var biasShape = bias.Value.Shape;
        if (biasShape.Length != 1 || biasShape[0] != outChannels)
            throw new ArgumentException($"Bias must be 1D tensor of length {outChannels}");
    }

    var outputShape = new int[] { batch, outChannels, outHeight, outWidth };
    var result = new Tensor<T>(outputShape);

    // Forward pass: Depthwise convolution
    // For each input channel c, apply multiplier filters to produce multiplier output channels
    for (int b = 0; b < batch; b++)
    {
        for (int ic = 0; ic < inChannels; ic++)
        {
            for (int m = 0; m < multiplier; m++)
            {
                int oc = ic * multiplier + m; // Output channel index

                // Hoist the per-channel bias lookup out of the spatial loops (loop-invariant)
                T biasVal = bias != null ? bias.Value[oc] : numOps.Zero;

                for (int oh = 0; oh < outHeight; oh++)
                {
                    for (int ow = 0; ow < outWidth; ow++)
                    {
                        T sum = numOps.Zero;

                        // Convolve with the kernel for this input channel and multiplier
                        for (int kh = 0; kh < kernelHeight; kh++)
                        {
                            for (int kw = 0; kw < kernelWidth; kw++)
                            {
                                int ih = oh * strideH + kh - padH;
                                int iw = ow * strideW + kw - padW;

                                // Check bounds (padding is implicit - zero outside bounds)
                                if (ih >= 0 && ih < inHeight && iw >= 0 && iw < inWidth)
                                {
                                    T inputVal = input.Value[b, ic, ih, iw];
                                    T kernelVal = kernel.Value[ic, m, kh, kw];
                                    sum = numOps.Add(sum, numOps.Multiply(inputVal, kernelVal));
                                }
                            }
                        }

                        // Add bias if provided
                        if (bias != null)
                            sum = numOps.Add(sum, biasVal);

                        result[b, oc, oh, ow] = sum;
                    }
                }
            }
        }
    }

    void BackwardFunction(Tensor<T> gradient)
    {
        // Gradient w.r.t. input: scatter each output gradient back through the kernel
        // weights onto the (single) input channel that produced it.
        if (input.RequiresGradient)
        {
            if (input.Gradient == null)
                input.Gradient = new Tensor<T>(inputShape);

            for (int b = 0; b < batch; b++)
            {
                for (int ic = 0; ic < inChannels; ic++)
                {
                    for (int m = 0; m < multiplier; m++)
                    {
                        int oc = ic * multiplier + m;
                        for (int oh = 0; oh < outHeight; oh++)
                        {
                            for (int ow = 0; ow < outWidth; ow++)
                            {
                                T grad = gradient[b, oc, oh, ow];

                                for (int kh = 0; kh < kernelHeight; kh++)
                                {
                                    for (int kw = 0; kw < kernelWidth; kw++)
                                    {
                                        int ih = oh * strideH + kh - padH;
                                        int iw = ow * strideW + kw - padW;

                                        if (ih >= 0 && ih < inHeight && iw >= 0 && iw < inWidth)
                                        {
                                            T kernelVal = kernel.Value[ic, m, kh, kw];
                                            T delta = numOps.Multiply(grad, kernelVal);
                                            input.Gradient[b, ic, ih, iw] = numOps.Add(
                                                input.Gradient[b, ic, ih, iw], delta);
                                        }
                                    }
                                }
                            }
                        }
                    }
                }
            }
        }

        // Gradient w.r.t. kernel: correlate the output gradient with the corresponding
        // input-channel values at each receptive-field position.
        if (kernel.RequiresGradient)
        {
            if (kernel.Gradient == null)
                kernel.Gradient = new Tensor<T>(kernelShape);

            for (int b = 0; b < batch; b++)
            {
                for (int ic = 0; ic < inChannels; ic++)
                {
                    for (int m = 0; m < multiplier; m++)
                    {
                        int oc = ic * multiplier + m;
                        for (int oh = 0; oh < outHeight; oh++)
                        {
                            for (int ow = 0; ow < outWidth; ow++)
                            {
                                T grad = gradient[b, oc, oh, ow];

                                for (int kh = 0; kh < kernelHeight; kh++)
                                {
                                    for (int kw = 0; kw < kernelWidth; kw++)
                                    {
                                        int ih = oh * strideH + kh - padH;
                                        int iw = ow * strideW + kw - padW;

                                        if (ih >= 0 && ih < inHeight && iw >= 0 && iw < inWidth)
                                        {
                                            T inputVal = input.Value[b, ic, ih, iw];
                                            T delta = numOps.Multiply(grad, inputVal);
                                            kernel.Gradient[ic, m, kh, kw] = numOps.Add(
                                                kernel.Gradient[ic, m, kh, kw], delta);
                                        }
                                    }
                                }
                            }
                        }
                    }
                }
            }
        }

        // Gradient w.r.t. bias: sum the output gradient over batch and spatial dims per channel.
        if (bias != null && bias.RequiresGradient)
        {
            if (bias.Gradient == null)
                bias.Gradient = new Tensor<T>(new int[] { outChannels });

            for (int b = 0; b < batch; b++)
            {
                for (int oc = 0; oc < outChannels; oc++)
                {
                    for (int oh = 0; oh < outHeight; oh++)
                    {
                        for (int ow = 0; ow < outWidth; ow++)
                        {
                            bias.Gradient[oc] = numOps.Add(bias.Gradient[oc], gradient[b, oc, oh, ow]);
                        }
                    }
                }
            }
        }
    }

    var parents = bias != null
        ? new List<ComputationNode<T>> { input, kernel, bias }
        : new List<ComputationNode<T>> { input, kernel };

    var node = new ComputationNode<T>(
        value: result,
        requiresGradient: input.RequiresGradient || kernel.RequiresGradient || (bias?.RequiresGradient ?? false),
        parents: parents,
        backwardFunction: BackwardFunction,
        name: null);

    // Record on the active gradient tape, if one is recording
    var tape = GradientTape<T>.Current;
    if (tape != null && tape.IsRecording)
        tape.RecordOperation(node);

    return node;
}
38464099
}
38474100
}

0 commit comments

Comments
 (0)