
Commit f3e4f2a

mcfi authored and facebook-github-bot committed
Enable arm64 convolution for fbgemm through the reference convolution APIs (#5126)
Summary:

X-link: facebookresearch/FBGEMM#2128

This diff adds convolution support to arm64 fbgemm by reusing the existing reference implementations:

1. Introduced conv_requant_ref, which invokes the reference conv_ref and requantize_u8acc32_ref, and added it in the places where only x86 conv implementations were available.
2. Changed convolution weight packing to either do nothing or call transposeConvWeights.

This diff unblocks fbgemm users' convolution code on Arm64. We plan to follow up with diffs that optimize each kind of convolution (e.g., depthwise, directconv, etc.).

Differential Revision: D86548699
1 parent 1fd545d commit f3e4f2a

16 files changed: +757 -84 lines
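
Before the per-file diffs: the core pattern the summary describes is "run the plain reference convolution into int32 accumulators, then requantize to uint8". Below is a minimal, self-contained C++ sketch of that two-step pattern. It is a toy (1-D convolution, single channel, per-tensor quantization, hypothetical helper names), not fbgemm's actual conv_requant_ref, conv_ref, or requantize_u8acc32_ref, whose real signatures live in src/RefImplementations.h.

// Toy sketch of the "reference conv, then requantize" pattern; names are
// hypothetical and heavily simplified relative to fbgemm's real kernels.
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <vector>

// Step 1: plain reference convolution (1-D, stride 1) accumulating into
// int32, analogous to what fbgemm's conv_ref does.
static void conv1d_i32_ref(
    const std::vector<uint8_t>& A, // quantized activations
    int32_t A_zero_point,
    const std::vector<int8_t>& W, // quantized weights
    std::vector<int32_t>& C) { // int32 accumulators
  const std::size_t K = W.size();
  C.assign(A.size() - K + 1, 0);
  for (std::size_t o = 0; o < C.size(); ++o) {
    for (std::size_t k = 0; k < K; ++k) {
      C[o] += (int32_t(A[o + k]) - A_zero_point) * int32_t(W[k]);
    }
  }
}

// Step 2: requantize int32 accumulators back to uint8 with a per-tensor
// scale and zero point, analogous to fbgemm's requantize_u8acc32_ref.
static void requant_u8_ref(
    const std::vector<int32_t>& C_i32,
    float C_multiplier,
    int32_t C_zero_point,
    bool fuse_relu,
    std::vector<uint8_t>& C_u8) {
  C_u8.resize(C_i32.size());
  for (std::size_t i = 0; i < C_i32.size(); ++i) {
    int32_t q = int32_t(std::lrintf(float(C_i32[i]) * C_multiplier)) + C_zero_point;
    if (fuse_relu) q = std::max(q, C_zero_point); // ReLU clamps to the zero point
    C_u8[i] = uint8_t(std::clamp(q, 0, 255));
  }
}

int main() {
  const std::vector<uint8_t> A = {10, 12, 9, 11, 14};
  const std::vector<int8_t> W = {1, -2, 1};
  std::vector<int32_t> acc;
  std::vector<uint8_t> out;
  conv1d_i32_ref(A, /*A_zero_point=*/10, W, acc);
  requant_u8_ref(acc, /*C_multiplier=*/0.5f, /*C_zero_point=*/128,
                 /*fuse_relu=*/false, out);
  for (uint8_t v : out) std::printf("%d ", int(v)); // prints: 126 130 128
  std::printf("\n");
  return 0;
}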

bench/EmbeddingSpMDMNBitBenchmark.cc

Lines changed: 2 additions & 1 deletion
@@ -206,7 +206,8 @@ static int run_benchmark(
         /*output_bit_rate=*/-1);
 #endif

-  vector<OutType>& output = has_weight ? output_slws : output_sls;
+  [[maybe_unused]] vector<OutType>& output =
+      has_weight ? output_slws : output_sls;
   for (bool flush_cache : {false, true}) {
     bool success_ref = false;
     // Reference implementation

include/fbgemm/Fbgemm.h

Lines changed: 0 additions & 8 deletions
@@ -161,11 +161,7 @@ class PackMatrix {
    * @brief Actual packing of a block of the source matrix in pmat buffer.
    */
   void pack(const block_type_t& block) {
-#if defined(FBGEMM_FBCODE) || !defined(__aarch64__)
     static_cast<PT*>(this)->pack(block);
-#else
-    throw std::runtime_error("PackMatrix::pack() not implemented for aarch64");
-#endif // __aarch64__
   }

   std::int32_t numRows() const {
@@ -616,11 +612,9 @@ class FBGEMM_API PackWeightsForConv {
     return W_im2col_packed_;
   }

-#if defined(FBGEMM_FBCODE) || !defined(__aarch64__)
   std::shared_ptr<PackedDepthWiseConvMatrix> getPackedWForDepthwise() {
     return W_dw_packed_;
   }
-#endif // __aarch64__

   std::shared_ptr<PackedDirectConvMatrix> getPackedWForDirectconv() {
     return W_dc_packed_;
@@ -672,10 +666,8 @@ class FBGEMM_API PackWeightsForConv {
   const conv_param_t<SPATIAL_DIM> conv_param_;
   // Packed weights if we use im2col based convolution implementation
   std::shared_ptr<PackBMatrix<T, accT>> W_im2col_packed_;
-#if defined(FBGEMM_FBCODE) || !defined(__aarch64__)
   // Packed weights if we use depthwise convolution implementation
   std::shared_ptr<PackedDepthWiseConvMatrix> W_dw_packed_;
-#endif // __aarch64__
   // Packed weights if we use direct convolution implementation
   std::shared_ptr<PackedDirectConvMatrix> W_dc_packed_;
   // Packed weights if we use groupwise (small channels per group) convolution
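
The packing-side change the summary mentions (point 2) lands in files not excerpted on this page. As a rough illustration, on the reference path weight "packing" can reduce to a layout transpose or a plain copy, since the reference convolution consumes unpacked weights directly. A hedged sketch follows: pack_weights_for_ref_conv is a hypothetical name, and while transposeConvWeights is a real fbgemm helper, the exact wiring in this diff may differ.

// Hedged sketch only, not the actual diff code.
#include <cstddef>
#include <cstdint>
#include <cstring>

#include "fbgemm/Fbgemm.h" // conv_param_t; transposeConvWeights comes via fbgemm/Utils.h

template <int SPATIAL_DIM>
void pack_weights_for_ref_conv( // hypothetical helper
    const fbgemm::conv_param_t<SPATIAL_DIM>& conv_p,
    const std::int8_t* src, // unpacked source weights
    std::int8_t* dst, // "packed" destination, same element count
    std::size_t num_elements,
    bool needs_transpose) {
  if (needs_transpose) {
    // Reorder weights into the layout the reference convolution expects.
    fbgemm::transposeConvWeights<SPATIAL_DIM>(conv_p, src, dst);
  } else {
    // Otherwise "packing" is a no-op copy.
    std::memcpy(dst, src, num_elements * sizeof(std::int8_t));
  }
}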

src/Fbgemm.cc

Lines changed: 5 additions & 0 deletions
@@ -203,6 +203,8 @@ void fbgemmPacked(

 template <int SPATIAL_DIM>
 bool fbgemmOptimizedGConv(const conv_param_t<SPATIAL_DIM>& conv_p) {
+#if defined(__x86_64__) || defined(__i386__) || \
+    (defined(_MSC_VER) && (defined(_M_X64) || defined(_M_IX86)))
   if constexpr (SPATIAL_DIM == 1)
     return false;

@@ -255,6 +257,9 @@ bool fbgemmOptimizedGConv(const conv_param_t<SPATIAL_DIM>& conv_p) {
         return areEqual(std::forward<decltype(PH1)>(PH1), 2);
       })) &&
       !conv_p.transposed;
+#else
+  return false;
+#endif
 }

 template FBGEMM_API bool fbgemmOptimizedGConv(const conv_param_t<1>& conv_p);

src/FbgemmConv.cc

Lines changed: 65 additions & 10 deletions
@@ -12,6 +12,7 @@
 #include <numeric>
 #include <stdexcept> // for logic_error
 #include <vector>
+#include "RefImplementations.h"
 #include "fbgemm/Fbgemm.h"

 namespace fbgemm {
@@ -138,10 +139,6 @@ int fbgemmConv(

   switch (ConvFastPath<SPATIAL_DIM, ACC_T>(conv_p)) {
     case optimized_conv_t::depthwise: {
-#if defined(__aarch64__)
-      throw std::runtime_error(
-          "fbgemmConv<processOutputType, SPATIAL_DIM, ACC_T>(): No fallback available for aarch64");
-#else
       // 2D and 3D depthwise fast path
       // std::cout << "Depthwise fast path" << std::endl;
       if constexpr (SPATIAL_DIM == 3) {
@@ -220,7 +217,6 @@ int fbgemmConv(
       throw std::runtime_error(msg);
       }
       break;
-#endif // __aarch64__
     }
     case optimized_conv_t::groupwise: {
       // optimized groupwise convolution
@@ -242,6 +238,8 @@ int fbgemmConv(
       break;
     }
     case optimized_conv_t::pointwise: {
+#if defined(__x86_64__) || defined(__i386__) || \
+    (defined(_MSC_VER) && (defined(_M_X64) || defined(_M_IX86)))
       std::vector<int32_t> row_offset_buf(
           PackAWithRowOffset<uint8_t>::rowOffsetBufferSize(blocking_params));
       int image_dim = std::accumulate(
@@ -271,16 +269,42 @@ int fbgemmConv(
           thread_id,
           num_threads,
           blocking_params);
+#else
+      DoNothing<> doNothingObj{};
+      ReQuantizeOutput<
+          processOutputType::RELU_FUSED,
+          processOutputType::QGRANType,
+          typename processOutputType::BIAS_T>
+          reqObj(
+              doNothingObj,
+              outProcess.getCMultiplier(),
+              outProcess.getCZeroPoint(),
+              outProcess.getAZeroPoint(),
+              outProcess.getBZeroPoint(),
+              nullptr, /* row offset buffer */
+              outProcess.getColOffsets(),
+              outProcess.getBias(),
+              conv_p.OC,
+              conv_p.G,
+              outProcess.getActWScale());
+
+      conv_requant_ref(
+          conv_p,
+          activations,
+          packed_weights.getPackedWForPointwise()->getBuf(),
+          false,
+          out,
+          outBuffer,
+          reqObj,
+          thread_id,
+          num_threads);
+#endif
       break;
     }
     case optimized_conv_t::directconv: {
       // specialized direct convolution path
       // std::cout << "Directconv fast path" << std::endl;
       if constexpr (SPATIAL_DIM == 2) {
-#if defined(__aarch64__)
-        throw std::runtime_error(
-            "fbgemmConv<processOutputType, SPATIAL_DIM, ACC_T>(): No fallback available for aarch64");
-#else
         fbgemmDirectConv<SPATIAL_DIM, processOutputType::QGRANType>(
             conv_p,
             // Aint8,
@@ -292,7 +316,6 @@ int fbgemmConv(
             outProcess.getBias(),
             thread_id,
             num_threads);
-#endif
       } else {
         assert(false && "1d/3d direct conv not supported");
       }
@@ -302,6 +325,8 @@ int fbgemmConv(
       break;
     }
     case optimized_conv_t::im2col: {
+#if defined(__x86_64__) || defined(__i386__) || \
+    (defined(_MSC_VER) && (defined(_M_X64) || defined(_M_IX86)))
       // All other convolutions go through im2col-based implementation
       // std::cout << "Im2col path" << std::endl;
       std::vector<int32_t> row_offset_buf(
@@ -352,6 +377,36 @@ int fbgemmConv(
           thread_id,
           num_threads,
           blocking_params);
+#else
+      DoNothing<> doNothingObj{};
+      ReQuantizeOutput<
+          processOutputType::RELU_FUSED,
+          processOutputType::QGRANType,
+          typename processOutputType::BIAS_T>
+          reqObj(
+              doNothingObj,
+              outProcess.getCMultiplier(),
+              outProcess.getCZeroPoint(),
+              outProcess.getAZeroPoint(),
+              outProcess.getBZeroPoint(),
+              nullptr, /* row offset buffer */
+              outProcess.getColOffsets(),
+              outProcess.getBias(),
+              conv_p.OC,
+              conv_p.G,
+              outProcess.getActWScale());
+
+      conv_requant_ref(
+          conv_p,
+          activations,
+          packed_weights.getPackedWForIm2col()->getBuf(),
+          false,
+          out,
+          outBuffer,
+          reqObj,
+          thread_id,
+          num_threads);
+#endif
       break;
     }
   } // switch
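
For orientation, this is the shape of the new fallback entry point as it can be inferred from the call sites above. The real declaration lives in src/RefImplementations.h and may differ; in particular, the semantics of the bool argument are not visible in this diff.

// Inferred prototype, hedged; not the actual declaration.
template <int SPATIAL_DIM, typename ReqOutT>
void conv_requant_ref(
    const fbgemm::conv_param_t<SPATIAL_DIM>& conv_p,
    const std::uint8_t* activations, // quantized input
    const std::int8_t* weights, // unpacked (reference-layout) weights
    bool flag, // always `false` at the call sites above; semantics unknown
    std::uint8_t* out, // final quantized output
    std::int32_t* out_buffer, // int32 scratch (nullptr in the depthwise path)
    ReqOutT& req_obj, // ReQuantizeOutput<...> carrying scales/zero points
    int thread_id,
    int num_threads);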

src/FbgemmI8Depthwise3DAvx2.cc

Lines changed: 71 additions & 16 deletions
@@ -980,6 +980,9 @@ void depthwise_3d_same_pad(
     // In C2, batch size 0 is allowed, so we should just early return.
     return;
   }
+
+#if defined(__x86_64__) || defined(__i386__) || \
+    (defined(_MSC_VER) && (defined(_M_X64) || defined(_M_IX86)))
   if (fuse_relu) {
     depthwise_3d_same_pad_<true /*FUSE_RELU*/, Q_GRAN>(
         conv_p,
@@ -1011,24 +1014,76 @@ void depthwise_3d_same_pad(
         thread_id,
         num_threads);
   }
+#else
+  DoNothing<> doNothingObj{};
+  if (fuse_relu) {
+    ReQuantizeOutput<true, Q_GRAN, BIAS_TYPE> reqObj(
+        doNothingObj,
+        C_multiplier,
+        C_zero_point,
+        A_zero_point,
+        B_zero_point,
+        nullptr, /* row offset buffer */
+        col_offsets,
+        bias,
+        conv_p.OC,
+        conv_p.G,
+        act_times_w_scale);
+
+    conv_requant_ref(
+        conv_p,
+        A,
+        B.PackedMat(),
+        false,
+        C,
+        nullptr,
+        reqObj,
+        thread_id,
+        num_threads);
+  } else {
+    ReQuantizeOutput<false, Q_GRAN, BIAS_TYPE> reqObj(
+        doNothingObj,
+        C_multiplier,
+        C_zero_point,
+        A_zero_point,
+        B_zero_point,
+        nullptr, /* row offset buffer */
+        col_offsets,
+        bias,
+        conv_p.OC,
+        conv_p.G,
+        act_times_w_scale);
+
+    conv_requant_ref(
+        conv_p,
+        A,
+        B.PackedMat(),
+        false,
+        C,
+        nullptr,
+        reqObj,
+        thread_id,
+        num_threads);
+  }
+#endif
 }

-#define INSTANTIATE_BASE(Q_GRAN, BIAS_TYPE)                   \
-  template FBGEMM_API void                                    \
-  depthwise_3d_same_pad<QuantizationGranularity::Q_GRAN>(     \
-      const conv_param_t<3>& conv_p,                          \
-      int32_t A_zero_point,                                   \
-      const uint8_t* A,                                       \
-      const int32_t* B_zero_point,                            \
-      const PackedDepthWiseConvMatrix& B,                     \
-      const float* C_multiplier,                              \
-      int32_t C_zero_point,                                   \
-      uint8_t* C,                                             \
-      const int32_t* col_offsets,                             \
-      const BIAS_TYPE* bias,                                  \
-      bool fuse_relu,                                         \
-      const float* act_times_w_scale,                         \
-      int thread_id,                                          \
+#define INSTANTIATE_BASE(Q_GRAN, BIAS_TYPE)                           \
+  template FBGEMM_API void                                            \
+  depthwise_3d_same_pad<fbgemm::QuantizationGranularity::Q_GRAN>(     \
+      const fbgemm::conv_param_t<3>& conv_p,                          \
+      int32_t A_zero_point,                                           \
+      const uint8_t* A,                                               \
+      const int32_t* B_zero_point,                                    \
+      const fbgemm::PackedDepthWiseConvMatrix& B,                     \
+      const float* C_multiplier,                                      \
+      int32_t C_zero_point,                                           \
+      uint8_t* C,                                                     \
+      const int32_t* col_offsets,                                     \
+      const BIAS_TYPE* bias,                                          \
+      bool fuse_relu,                                                 \
+      const float* act_times_w_scale,                                 \
+      int thread_id,                                                  \
       int num_threads);

 #define INSTANTIATE_BIAS_T(Q_GRAN) \

src/FbgemmI8DepthwiseAvx2-inl.h

Lines changed: 2 additions & 1 deletion
@@ -13,7 +13,8 @@
 #include <cmath> // for lrintf and sqrt
 #include <cstdint>
 #include <type_traits> // for is_same
-
+#include "RefImplementations.h"
+#include "fbgemm/Fbgemm.h"
 #if defined(__x86_64__) || defined(__i386__) || \
     (defined(_MSC_VER) && (defined(_M_X64) || defined(_M_IX86)))
 #include <immintrin.h>
