Commit d35b63a

Port conv1d codes to matVec format
1 parent: 18d062f

File tree

7 files changed: +314 −103 lines changed


c_reference/include/conv1d.h

Lines changed: 27 additions & 17 deletions
@@ -4,10 +4,14 @@
 #ifndef __CONV1D_H__
 #define __CONV1D_H__

+// Currently dilation is not supported. We have coded separate functions for regular and depthwise conv1d (and low-rank versions). They currently do not support the use of groups
+// We use a custom matVec with offset (in utils) for our conv codes. This allows us to decompose the operation using the low-rank property and decrease the run-time
+// The unoptimized version would be to first compute the weights and then perform the convolution
+
 /**
  * @brief Model parameters for the 1D Convolution Layer
- * @var W pointer to convolution weights W, size for regular = out_channels * in_channels * kernel_size, size for depth based = out_channels * kernel_size
- * @var B pointer to the bias vector for the convolution, size = out_channels
+ * @var W pointer to the flattened conv weights, original shape for regular = [out_channels, kernel_size, in_channels], shape for depthwise = [in_channels, kernel_size, 1]
+ * @var B pointer to the bias vector, original shape = [out_channels]
  */
 typedef struct ConvLayers_Params {
   const float* const W;
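
The header comments above carry the core idea of the port. As a rough illustration (not code from this commit; the function name and the padding-free simplification are assumptions), one output time step of conv1d with the new [out_channels, kernel_size, in_channels] weight layout reduces to a single row-major matrix-vector product over a contiguous input window:

/* Illustrative sketch only: with W flattened row-major as
   [out_channels, kernel_size, in_channels] and the input stored as
   [in_time, in_channels], the kernel_size * in_channels window for one
   output step is contiguous, so no time/channel permutation is needed. */
static void conv1d_one_step_sketch(float* out_step, const float* W,
  const float* B, const float* window, unsigned out_channels,
  unsigned kernel_size, unsigned in_channels) {
  unsigned ncols = kernel_size * in_channels;
  for (unsigned co = 0; co < out_channels; co++) {
    float acc = B[co];
    for (unsigned i = 0; i < ncols; i++)
      acc += W[co * ncols + i] * window[i]; /* one weight row . one window */
    out_step[co] = acc;
  }
}

Sliding the window across in_time (with stride and padding handled by the caller) repeats this product out_time times, which is what the offset matVec in utils enables without reshuffling the input.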
@@ -34,8 +38,8 @@ typedef struct ConvLayers_Params {
  * 2: tanh
  * 3: relu
  */
-int conv1d(float* output_signal, unsigned out_time, unsigned out_channels, const float* input_signal,
-  unsigned in_time, unsigned in_channels, unsigned padding, unsigned kernel_size,
+int conv1d(float* output_signal, unsigned out_time, unsigned out_channels, const float* input_signal,
+  unsigned in_time, unsigned in_channels, unsigned padding, unsigned kernel_size,
   const void* params, unsigned stride, unsigned activation);

 /**
@@ -58,15 +62,15 @@ int conv1d(float* output_signal, unsigned out_time, unsigned out_channels, const
  * 2: tanh
  * 3: relu
  */
-int conv1d_depth(float* output_signal, unsigned out_time, const float* input_signal,
-  unsigned in_time, unsigned in_channels, unsigned padding, unsigned kernel_size,
+int conv1d_depth(float* output_signal, unsigned out_time, const float* input_signal,
+  unsigned in_time, unsigned in_channels, unsigned padding, unsigned kernel_size,
   const void* params, unsigned stride, unsigned activation);

 /**
  * @brief Model parameters for the 1D Low Rank Convolution Layer.
- * @var W1 pointer to the 1st low-rank component of the weights, size = out_channels * rank
- * @var W2 pointer to the 2nd low-rank component of the weights, size for regular = rank * in_channels * kernel_size, size for depthwise = rank * kernel_size
- * @var B pointer to the bias vector for the convolution, shape = [out_channels]
+ * @var W1 pointer to the flattened 1st low-rank component of the weights, original shape = [out_channels, rank]. For depthwise, out_channels = in_channels
+ * @var W2 pointer to the flattened 2nd low-rank component of the weights, original shape for regular = [rank, kernel_size, in_channels], shape for depthwise = [rank, kernel_size, 1]
+ * @var B pointer to the flattened bias vector for the convolution, original shape = [out_channels]
  * @var rank rank of the weight tensor. A low-rank decomposition typically used to reduce computation and storage
  */
 typedef struct ConvLayers_LR_Params {
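
To see why the factorization above pays off in storage (illustrative numbers, not from this commit): with out_channels = 32, in_channels = 32, kernel_size = 5, and rank = 8, the full weight tensor needs 32 * 5 * 32 = 5120 floats, while W1 and W2 together need 32 * 8 + 8 * 5 * 32 = 256 + 1280 = 1536 floats, roughly a 3.3x reduction.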
@@ -78,8 +82,6 @@ typedef struct ConvLayers_LR_Params {

 /**
  * @brief Model definition for the 1D Low-Rank Convolution Layer. Currently only for dilation = 1
- * @brief Identical to the non-low-rank form. One modification is the multiplication of the weights handled within the layer
- * @brief The Weights W1 and W2 are multiplied within the layer using a matmul function from utils. Operation : W1 * W2
  * @param[out] output_signal pointer to the output signal, size = out_time * out_channels
  * @param[in] out_time number of time steps in the output
  * @param[in] out_channels number of output channels for the output of the conv layer
@@ -98,14 +100,12 @@
  * 2: tanh
  * 3: relu
  */
-int conv1d_lr(float* output_signal, unsigned out_time, unsigned out_channels, const float* input_signal,
-  unsigned in_time, unsigned in_channels, unsigned padding, unsigned kernel_size,
+int conv1d_lr(float* output_signal, unsigned out_time, unsigned out_channels, const float* input_signal,
+  unsigned in_time, unsigned in_channels, unsigned padding, unsigned kernel_size,
   const void* params, unsigned stride, unsigned activation);

 /**
  * @brief Model definition for the 1D Low-Rank Depthwise Convolution Layer. Currently only for dilation = 1
- * @brief Identical to the non-low-rank form. One modification is the multiplication of the weights handled within the layer
- * @brief The Weights W1 and W2 are multiplied within the layer using a matmul function from utils. Operation : W1 * W2
  * @param[out] output_signal pointer to the output signal, size = out_time * in_channels
  * NOTE: out_channels == in_channels for depthwise conv
  * @param[in] out_time number of time steps in the output
@@ -123,9 +123,19 @@ int conv1d_lr(float* output_signal, unsigned out_time, unsigned out_channels, co
  * 1: sigmoid
  * 2: tanh
  * 3: relu
+Note for the usage of conv1d_depth_lr:
+The depthwise with low-rank conv1d code currently uses an unoptimized implementation based on the computation of the conv weights, followed by the depthwise convolution
+The reason for using the unoptimized implementation is the violation of the depthwise constraints when the low-rank decomposition is applied
+The use of depthwise convolution imposes a constraint on the out_channels of the weight matrix. When the low-rank decomposition is applied on top of this matrix, these constraints are violated
+The decomposition converts the depthwise conv into a fully-connected layer and a convolution layer with weight [rank, kernel_size, 1]
+The new smaller weight matrix resembles a depthwise conv, but here, typically, in_channels > rank. This violates the matrix constraints for depthwise convolution
+Hence, due to the violation, we cannot split the operation and need to use the unoptimized solution with full-rank weight computation followed by convolution
+
+The depthwise with low-rank code is recommended for extreme storage constraints with no major constraints on the computation cost
+For all other purposes, we recommend a combination of depthwise conv, low-rank conv and regular conv
  */
-int conv1d_depth_lr(float* output_signal, unsigned out_time, const float* input_signal,
-  unsigned in_time, unsigned in_channels, unsigned padding, unsigned kernel_size,
+int conv1d_depth_lr(float* output_signal, unsigned out_time, const float* input_signal,
+  unsigned in_time, unsigned in_channels, unsigned padding, unsigned kernel_size,
   const void* params, unsigned stride, unsigned activation);

 // Auxiliary Layers
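
The note on conv1d_depth_lr above amounts to a two-step fallback. A hedged sketch under assumed names (the function name and the caller-provided temp_W scratch buffer are illustrative, not the commit's implementation; matMul and conv1d_depth follow the declarations shown in this diff):

#include "conv1d.h"
#include "utils.h"

/* Sketch of the unoptimized depthwise low-rank path: materialize the
   full-rank depthwise weights W = W1 * W2 with matMul, then run the plain
   depthwise convolution on the result. temp_W is an assumed scratch buffer
   of size in_channels * kernel_size. */
int conv1d_depth_lr_sketch(float* output_signal, unsigned out_time,
  const float* input_signal, unsigned in_time, unsigned in_channels,
  unsigned padding, unsigned kernel_size, const ConvLayers_LR_Params* lr,
  unsigned stride, unsigned activation, float* temp_W) {
  for (unsigned i = 0; i < in_channels * kernel_size; i++)
    temp_W[i] = 0.0f; /* pre-zero: matMul accumulates alpha * ret */
  /* [in_channels, rank] x [rank, kernel_size] -> [in_channels, kernel_size];
     alpha = 0, beta = 1, so temp_W ends up holding exactly W1 * W2 */
  matMul(lr->W1, lr->W2, in_channels, lr->rank, kernel_size,
    0.0f, 1.0f, temp_W);
  ConvLayers_Params conv_params = { temp_W, lr->B };
  return conv1d_depth(output_signal, out_time, input_signal,
    in_time, in_channels, padding, kernel_size,
    &conv_params, stride, activation);
}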

c_reference/include/utils.h

Lines changed: 18 additions & 1 deletion
@@ -31,6 +31,23 @@ void matVec(const float* const mat, const float* const vec,
   float alpha, float beta,
   float* const ret);

+/* Matrix-vector multiplication with a row offset
+   This function was developed primarily for the conv1d functions. It helps bypass the permutation of the time and channel axes
+   ret is of size nrows, vec is of size ncols
+   mat is of size nrows * ncols, stored in row major
+   depthwise changes the matVec to depthwise-specific convolutions
+   row_stride is the offset factor between two adjacent rows
+   Note : this matrix-vector multiplication is useful for matrices where a certain number of columns are dropped
+   For a normal matVec case, this value will be ncols
+   Eg : for a 400 x 400 matrix and a 100-length vector, we can consider the top 400 x 100 elements for the multiplication. For this eg, ncols will be 100 and row_stride will be 400
+   vec_stride is the offset factor between 2 elements in a vector, i.e. the elements of the vector are placed at "n" intervals
+   For a normal matVec case, this value will be 1
+   Eg : for matVec with a 400 x 100 matrix, a vector of length 100 is needed. So it's possible to pass a 400-length vector and consider every 4th element. For this eg, ncols will be 100 and vec_stride will be 4 */
+void offset_matVec_conv1d(const float* mat, const float* vec,
+  unsigned nrows, unsigned ncols,
+  unsigned row_stride, unsigned vec_stride,
+  unsigned depthwise, float* ret);
+
 /* Scaled matrix-matrix multiplication: ret = alpha * ret + beta * matA * matB
    matA first matrix; size = nrows * ncommon
    matB second matrix; size = ncommon * ncols
@@ -41,7 +58,7 @@ void matVec(const float* const mat, const float* const vec,
    beta scaling factor for the result of the multiplication (matA * matB)
    ret matrix multiplication output
 */
-void matmul(const float* const matA, const float* const matB,
+void matMul(const float* const matA, const float* const matB,
   unsigned nrows, unsigned ncommon, unsigned ncols,
   float alpha, float beta,
   float* const ret);
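
To make the row_stride / vec_stride contract above concrete, here is an illustrative reference loop (a sketch of the documented semantics, not the commit's code; the depthwise flag is left out for brevity):

/* Reference semantics for the documented contract: row r of mat starts at
   mat + r * row_stride (row_stride >= ncols lets trailing columns be
   dropped), and vec is read at vec_stride-element intervals. */
void offset_matVec_sketch(const float* mat, const float* vec,
  unsigned nrows, unsigned ncols,
  unsigned row_stride, unsigned vec_stride, float* ret) {
  for (unsigned r = 0; r < nrows; r++) {
    float acc = 0.0f;
    const float* row = mat + r * row_stride; /* jump over dropped columns */
    for (unsigned c = 0; c < ncols; c++)
      acc += row[c] * vec[c * vec_stride]; /* strided vector access */
    ret[r] = acc;
  }
}

With row_stride == ncols and vec_stride == 1 this collapses to the normal matVec case called out in the comment; the 400 x 100 examples above correspond to (ncols = 100, row_stride = 400) and (ncols = 100, vec_stride = 4).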
