Skip to content

Commit 64278b2

Browse files
committed
Add more opt + remove Gelu fusion for now
1 parent 7cf2da6 commit 64278b2

File tree

9 files changed

+446
-181
lines changed

9 files changed

+446
-181
lines changed

cmake/onnxruntime_mlas.cmake

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ onnxruntime_add_static_library(onnxruntime_mlas
2323
${MLAS_SRC_DIR}/qgemm.cpp
2424
${MLAS_SRC_DIR}/qdwconv.cpp
2525
${MLAS_SRC_DIR}/convolve.cpp
26+
${MLAS_SRC_DIR}/sconv_nchw_depthwise_multiplier_greater_than_1.cpp
2627
${MLAS_SRC_DIR}/convsym.cpp
2728
${MLAS_SRC_DIR}/pooling.cpp
2829
${MLAS_SRC_DIR}/transpose.cpp
@@ -115,7 +116,7 @@ function(setup_mlas_source_for_windows)
115116
${MLAS_SRC_DIR}/eltwise_kernel_neon.cpp
116117
${MLAS_SRC_DIR}/eltwise_kernel_neon_fp16.cpp
117118
${MLAS_SRC_DIR}/sqnbitgemm_kernel_neon_int8_i8mm.cpp
118-
${MLAS_SRC_DIR}/sconv_nchw_kernel_neon.cpp
119+
${MLAS_SRC_DIR}/sconv_nchw_depthwise_multiplier_1.cpp
119120
)
120121

121122
set(mlas_platform_preprocess_srcs
@@ -488,7 +489,7 @@ else()
488489
${MLAS_SRC_DIR}/eltwise_kernel_neon.h
489490
${MLAS_SRC_DIR}/eltwise_kernel_neon.cpp
490491
${MLAS_SRC_DIR}/sqnbitgemm_kernel_neon_int8_i8mm.cpp
491-
${MLAS_SRC_DIR}/sconv_nchw_kernel_neon.cpp
492+
${MLAS_SRC_DIR}/sconv_nchw_depthwise_multiplier_1.cpp
492493
)
493494

494495
# Conditionally add the SVE implementation if compiler supports it

onnxruntime/core/mlas/inc/mlas.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -877,6 +877,7 @@ enum MLAS_CONV_ALGORITHM {
877877
MlasConvAlgorithmGemmDirect,
878878
MlasConvAlgorithmExpandThenGemm,
879879
MlasConvAlgorithmExpandThenGemmSegmented,
880+
MlasConvAlgorithmDepthwiseWithMultiplier,
880881
#if defined(MLAS_TARGET_WASM_SCALAR) || defined(MLAS_TARGET_ARM64)
881882
MlasConvAlgorithmDepthwise,
882883
#endif

onnxruntime/core/mlas/lib/convolve.cpp

Lines changed: 126 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -892,6 +892,77 @@ Return Value:
892892

893893
#endif
894894

895+
void
896+
MlasDepthwiseWithMultiplierThreaded(
897+
void* Context,
898+
ptrdiff_t Index
899+
)
900+
/*++
901+
902+
Routine Description:
903+
904+
This routine is invoked from a worker thread to execute a segment of a
905+
convolution operation.
906+
907+
If using this, the entire convolution operation is parallelized on the
908+
(batch size * group count) parameter and this routine has logic to
909+
perform a specific thread's shard of the entire Convolution operation.
910+
911+
Arguments:
912+
913+
Context - Supplies the pointer to the context for the threaded operation.
914+
915+
Index - Supplies the current index of the threaded operation.
916+
917+
Return Value:
918+
919+
None.
920+
921+
--*/
922+
{
923+
MLAS_CONV_WORK_BLOCK* WorkBlock = (MLAS_CONV_WORK_BLOCK*)Context;
924+
925+
const MLAS_CONV_PARAMETERS* Parameters = WorkBlock->Parameters;
926+
const size_t GroupCount = Parameters->GroupCount;
927+
const size_t BatchGroupCount = Parameters->BatchCount * GroupCount;
928+
929+
size_t BatchGroupStart;
930+
size_t BatchGroupRemaining;
931+
932+
MlasPartitionWork(Index, WorkBlock->TargetThreadCount, BatchGroupCount,
933+
&BatchGroupStart, &BatchGroupRemaining);
934+
935+
size_t BatchGroupEnd = BatchGroupStart + BatchGroupRemaining;
936+
937+
const size_t FilterCount = Parameters->FilterCount;
938+
const size_t OutputSize = Parameters->OutputSize;
939+
const size_t K = Parameters->K;
940+
941+
const size_t InputGroupSize = Parameters->InputChannels * Parameters->InputSize;
942+
const size_t OutputGroupSize = FilterCount * OutputSize;
943+
const size_t FilterGroupSize = FilterCount * K;
944+
945+
const float* input = WorkBlock->Input + BatchGroupStart * InputGroupSize;
946+
float* output = WorkBlock->Output + BatchGroupStart * OutputGroupSize;
947+
948+
for (size_t bg = BatchGroupStart; bg < BatchGroupEnd; bg++) {
949+
size_t group = bg % GroupCount;
950+
951+
const float* filter = WorkBlock->Filter + group * FilterGroupSize;
952+
const float* bias = WorkBlock->Bias;
953+
if (bias != nullptr) {
954+
bias += group * FilterCount;
955+
}
956+
957+
MlasConvDepthwiseWithMultiplierFloat_CHW(Parameters, input, filter, output);
958+
MlasActivation(Parameters->Activation, output, bias, FilterCount,
959+
OutputSize, OutputSize);
960+
961+
input += InputGroupSize;
962+
output += OutputGroupSize;
963+
}
964+
}
965+
895966
inline
896967
bool
897968
MlasConvTryMultithread(
@@ -1106,7 +1177,6 @@ Return Value:
11061177
return;
11071178
}
11081179

1109-
11101180
#if defined(MLAS_TARGET_WASM_SCALAR) || defined(MLAS_TARGET_ARM64)
11111181

11121182
if (Algorithm == MlasConvAlgorithmDepthwise && ((BatchCount > 1) || (GroupCount > 1))) {
@@ -1135,6 +1205,28 @@ Return Value:
11351205

11361206
#endif
11371207

1208+
if (Algorithm == MlasConvAlgorithmDepthwiseWithMultiplier && ((BatchCount > 1) || (GroupCount > 1))) {
1209+
const size_t BatchGroupCount = BatchCount * GroupCount;
1210+
ptrdiff_t TargetThreadCount = MlasGetMaximumThreadCount(ThreadPool);
1211+
1212+
if (static_cast<size_t>(TargetThreadCount) >= BatchGroupCount) {
1213+
TargetThreadCount = static_cast<ptrdiff_t>(BatchGroupCount);
1214+
}
1215+
1216+
MLAS_CONV_WORK_BLOCK WorkBlock;
1217+
WorkBlock.Parameters = Parameters;
1218+
WorkBlock.Input = Input;
1219+
WorkBlock.Filter = Filter;
1220+
WorkBlock.Bias = Bias;
1221+
WorkBlock.WorkingBuffer = nullptr;
1222+
WorkBlock.Output = Output;
1223+
WorkBlock.TargetThreadCount = TargetThreadCount;
1224+
1225+
MlasExecuteThreaded(MlasDepthwiseWithMultiplierThreaded, &WorkBlock,
1226+
TargetThreadCount, ThreadPool);
1227+
return;
1228+
}
1229+
11381230
//
11391231
// Iterate over each batch and group.
11401232
//
@@ -1209,6 +1301,13 @@ Return Value:
12091301

12101302
#endif
12111303

1304+
case MlasConvAlgorithmDepthwiseWithMultiplier:
1305+
{
1306+
MlasConvDepthwiseWithMultiplierFloat_CHW(Parameters, Input, filter, Output);
1307+
MlasActivation(Parameters->Activation, Output, bias, FilterCount, OutputSize, OutputSize);
1308+
break;
1309+
}
1310+
12121311
case MlasConvAlgorithmExpandThenGemmSegmented:
12131312
{
12141313
//
@@ -1453,6 +1552,26 @@ Return Value:
14531552

14541553
} else {
14551554

1555+
// Commonly found in MobileNet like models, where the depthwise convolution with
1556+
// depth_multiplier = 2 is used together with 7x7 kernel shape, stride = 2 and dilation = 1.
1557+
// This is a very specific scenario, but it is worth to have a specialized kernel for it given
1558+
// the popularity of MobileNet models.
1559+
if (Dimensions == 2
1560+
// depthwise convolution
1561+
&& Parameters->GroupCount > 1
1562+
&& Parameters->InputChannels == 1
1563+
// depth_multiplier = 2
1564+
&& Parameters->FilterCount == 2
1565+
// current scope for specialized kernel is for the 7x7 kernel shape
1566+
&& Parameters->KernelShape[0] == 7 && Parameters->KernelShape[1] == 7
1567+
// keep this specialized kernel only for stride = 2x2
1568+
&& Parameters->StrideShape[0] == 2 && Parameters->StrideShape[1] == 2
1569+
// keep this specialized kernel only for dilation = 1x1
1570+
&& Parameters->DilationShape[0] == 1 && Parameters->DilationShape[1] == 1) {
1571+
Parameters->Algorithm = MlasConvAlgorithmDepthwiseWithMultiplier;
1572+
return;
1573+
}
1574+
14561575
#if defined(MLAS_TARGET_WASM_SCALAR) || defined(MLAS_TARGET_ARM64)
14571576

14581577
// Scalar (WASM_SCALAR) / vectorized (ARM64) direct conv for depthwise convolution.
@@ -1468,12 +1587,12 @@ Return Value:
14681587
#endif
14691588

14701589
if (Dimensions == 2
1471-
&& Parameters->FilterCount == 1 && Parameters->InputChannels == 1
1472-
&& Parameters->KernelShape[0] == 3 && Parameters->KernelShape[1] == 3
1473-
&& Parameters->Padding[0] <= 1 && Parameters->Padding[1] <= 1
1474-
&& Parameters->Padding[2] <= 1 && Parameters->Padding[3] <= 1
1475-
&& depthwise_conv_stride_support_check
1476-
&& Parameters->DilationShape[0] == 1 && Parameters->DilationShape[1] == 1) {
1590+
&& Parameters->FilterCount == 1 && Parameters->InputChannels == 1
1591+
&& Parameters->KernelShape[0] == 3 && Parameters->KernelShape[1] == 3
1592+
&& Parameters->Padding[0] <= 1 && Parameters->Padding[1] <= 1
1593+
&& Parameters->Padding[2] <= 1 && Parameters->Padding[3] <= 1
1594+
&& depthwise_conv_stride_support_check
1595+
&& Parameters->DilationShape[0] == 1 && Parameters->DilationShape[1] == 1) {
14771596

14781597
*WorkingBufferSize = Parameters->InputShape[1] + 2;
14791598
Parameters->Algorithm = MlasConvAlgorithmDepthwise;

onnxruntime/core/mlas/lib/mlasi.h

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1638,8 +1638,6 @@ MlasFp32FromBits(
16381638
#endif
16391639

16401640
#if defined(MLAS_TARGET_WASM_SCALAR) || defined(MLAS_TARGET_ARM64)
1641-
1642-
16431641
void
16441642
MLASCALL
16451643
MlasConvDepthwiseFloat_CHW(
@@ -1652,6 +1650,13 @@ MlasConvDepthwiseFloat_CHW(
16521650

16531651
#endif
16541652

1653+
void
1654+
MlasConvDepthwiseWithMultiplierFloat_CHW(
1655+
const MLAS_CONV_PARAMETERS* Parameters,
1656+
const float* Input,
1657+
const float* Filter,
1658+
float* Output
1659+
);
16551660

16561661
//
16571662
// Define the missing ARM64 NEON intrinsic macros from arm64_neon.h that enable

onnxruntime/core/mlas/lib/sconv_nchw_kernel_neon.cpp renamed to onnxruntime/core/mlas/lib/sconv_nchw_depthwise_multiplier_1.cpp

Lines changed: 47 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -6,17 +6,18 @@ Licensed under the MIT License.
66
77
Module Name:
88
9-
sconv_nchw_kernel_neon.cpp
9+
sconv_nchw_depthwise_multiplier_1.cpp
1010
1111
Abstract:
1212
13-
This module implements the single precision NCHW convolution kernels for ARM NEON.
13+
This module implements the single precision NCHW depthwise convolution kernels
14+
for depth multiplier 1.
1415
1516
--*/
1617

1718

1819
#include "mlasi.h"
19-
#include <arm_neon.h>
20+
#include <cassert>
2021

2122
MLAS_FORCEINLINE float DepthwiseSampleValue(
2223
const float* row,
@@ -50,7 +51,7 @@ MLAS_FORCEINLINE float DepthwiseAccumulateRowScalar(
5051
}
5152

5253
MLAS_FORCEINLINE void DepthwiseAccumulateRowVector(
53-
float32x4_t& acc,
54+
MLAS_FLOAT32X4& acc,
5455
const float* row,
5556
size_t base,
5657
float w0,
@@ -63,9 +64,9 @@ MLAS_FORCEINLINE void DepthwiseAccumulateRowVector(
6364
}
6465

6566
const float* r = row + base;
66-
const float32x4_t c0 = MlasLoadFloat32x4(r);
67-
const float32x4_t c1 = MlasLoadFloat32x4(r + 1);
68-
const float32x4_t c2 = MlasLoadFloat32x4(r + 2);
67+
const MLAS_FLOAT32X4 c0 = MlasLoadFloat32x4(r);
68+
const MLAS_FLOAT32X4 c1 = MlasLoadFloat32x4(r + 1);
69+
const MLAS_FLOAT32X4 c2 = MlasLoadFloat32x4(r + 2);
6970

7071
acc = MlasMultiplyAddFloat32x4(c0, w0, acc);
7172
acc = MlasMultiplyAddFloat32x4(c1, w1, acc);
@@ -107,12 +108,31 @@ MLAS_FORCEINLINE float DepthwiseComputeEdge(
107108
return acc;
108109
}
109110

110-
static void DepthwiseConv3x3Stride1PadLe1Neon(
111+
static
112+
void
113+
MlasConv2dSingleChannel_CHW_Kernel3x3_Pad01_Dilation1(
111114
const MLAS_CONV_PARAMETERS* Parameters,
112115
const float* Input,
113116
const float* Filter,
114117
float* Output
115-
)
118+
)
119+
/*++
120+
121+
Routine Description:
122+
123+
This routine is an inner kernel to compute convolution on one channel input with one filter channel.
124+
125+
Arguments:
126+
127+
Parameters - conv parameters calculated based on conv parameters like padding, strides, dilations, etc.
128+
129+
Input - input channel data start. Input is NCHW, so this pointer points to single H x W image data.
130+
131+
Filter - Whole filters are of F x CpG x FH x FW; this pointer points to single FH x FW filter data.
132+
133+
Output - the whole output is of N x F x OH x OW. This pointer points to single OH x OW output image data.
134+
135+
--*/
116136
{
117137
const size_t H = Parameters->InputShape[0];
118138
const size_t W = Parameters->InputShape[1];
@@ -185,14 +205,14 @@ static void DepthwiseConv3x3Stride1PadLe1Neon(
185205
}
186206

187207
const size_t base = static_cast<size_t>(iw);
188-
float32x4_t acc = MlasZeroFloat32x4();
208+
MLAS_FLOAT32X4 acc = MlasZeroFloat32x4();
189209

190210
DepthwiseAccumulateRowVector(acc, row0, base, w00, w01, w02);
191211
DepthwiseAccumulateRowVector(acc, row1, base, w10, w11, w12);
192212
DepthwiseAccumulateRowVector(acc, row2, base, w20, w21, w22);
193213

194214
if (accumulate_output) {
195-
const float32x4_t prev = MlasLoadFloat32x4(out_row + ow);
215+
const MLAS_FLOAT32X4 prev = MlasLoadFloat32x4(out_row + ow);
196216
acc = MlasMultiplyAddFloat32x4(prev, beta, acc);
197217
}
198218

@@ -230,35 +250,6 @@ static void DepthwiseConv3x3Stride1PadLe1Neon(
230250
}
231251
}
232252

233-
static
234-
void
235-
MlasConv2dSingleChannel_CHW_Kernel3x3_Pad01_Dilation1(
236-
const MLAS_CONV_PARAMETERS* Parameters,
237-
const float* Input,
238-
const float* Filter,
239-
float* Output
240-
)
241-
/*++
242-
243-
Routine Description:
244-
245-
This routine is an inner kernel to compute convolution on one channel input with one filter channel.
246-
247-
Arguments:
248-
249-
Parameters - conv parameters calculated based on conv parameters like padding, strides, dilations, etc.
250-
251-
Input - input channel data start. Input is NCHW, so this pointer points to single H x W image data.
252-
253-
Filter - Whole filters are of F x CpG x FH x FW, this filter points to single FH x FW filter data.
254-
255-
Output - whole output are of N x F x OH x OW. This pointer points to single OH x OW output image data.
256-
257-
--*/
258-
{
259-
DepthwiseConv3x3Stride1PadLe1Neon(Parameters, Input, Filter, Output);
260-
}
261-
262253
void MlasConvDepthwiseFloat_CHW(
263254
const MLAS_CONV_PARAMETERS* Parameters,
264255
const float* Input,
@@ -292,6 +283,22 @@ Routine Description:
292283
293284
--*/
294285
{
286+
assert(Parameters->Dimensions == 2);
287+
assert(Parameters->FilterCount == 1);
288+
assert(Parameters->InputChannels == 1);
289+
assert(Parameters->KernelShape[0] == 3);
290+
assert(Parameters->KernelShape[1] == 3);
291+
assert(Parameters->StrideShape[0] == 1);
292+
assert(Parameters->StrideShape[1] == 1);
293+
assert(Parameters->DilationShape[0] == 1);
294+
assert(Parameters->DilationShape[1] == 1);
295+
assert(Parameters->Padding[0] <= 1);
296+
assert(Parameters->Padding[1] <= 1);
297+
assert(Parameters->Padding[2] <= 1);
298+
assert(Parameters->Padding[3] <= 1);
299+
295300
MLAS_UNREFERENCED_PARAMETER(Zeros);
301+
302+
// Kernel dispatch
296303
MlasConv2dSingleChannel_CHW_Kernel3x3_Pad01_Dilation1(Parameters, Input, Filter, Output);
297304
}

0 commit comments

Comments
 (0)