
Commit 7de5a5d

[CPU][ARM] int8 Convolution support (#30457)
### Details:
- This is the initial support of int8 convolution on ARM
- Limitations:
  - s32 bias support only
  - i8 / u8 output support only

### Tickets:
- CVS-167319
1 parent 892e720 commit 7de5a5d

File tree

11 files changed: +440 −17 lines changed

src/plugins/intel_cpu/src/nodes/conv.cpp

Lines changed: 7 additions & 0 deletions

@@ -427,6 +427,13 @@ std::tuple<ov::element::Type, ov::element::Type> Convolution::getDstAndSumPrecis
     }
 };
 
+    // ACL requires dst precision matches src precision for int8
+#if defined(OPENVINO_ARCH_ARM) || defined(OPENVINO_ARCH_ARM64)
+    if (canBeExecutedInInt8()) {
+        return {getOriginalInputPrecisionAtPort(0), ov::element::dynamic};
+    }
+#endif
+
     auto dstType = getOriginalOutputPrecisionAtPort(0);
 
     // make sure dst type is equal to the output type of the last fused node
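For context on this guard: ACL rejects a quantized int8 src combined with a float dst in NEConvolutionLayer::validate(), which is why the dst precision is pinned to the src precision here. Below is a minimal standalone sketch of that check; it is not part of the commit, and it assumes Arm Compute Library headers are available and that the ACL builds we are aware of reject the float-dst combination:

```cpp
#include <arm_compute/core/Error.h>
#include <arm_compute/core/QuantizationInfo.h>
#include <arm_compute/core/TensorInfo.h>
#include <arm_compute/core/Types.h>
#include <arm_compute/runtime/NEON/functions/NEConvolutionLayer.h>

#include <iostream>

int main() {
    using namespace arm_compute;
    // NCHW 1x16x8x8 int8 input and 16 int8 3x3 kernels (TensorShape order is W, H, C, N).
    TensorInfo src(TensorShape(8U, 8U, 16U, 1U), 1, DataType::QASYMM8_SIGNED, QuantizationInfo(1.0F));
    TensorInfo wei(TensorShape(3U, 3U, 16U, 16U), 1, DataType::QASYMM8_SIGNED, QuantizationInfo(0.02F));
    TensorInfo dst_i8(TensorShape(6U, 6U, 16U, 1U), 1, DataType::QASYMM8_SIGNED, QuantizationInfo(0.5F));
    TensorInfo dst_f32(TensorShape(6U, 6U, 16U, 1U), 1, DataType::F32);

    const PadStrideInfo conv_info(1, 1, 0, 0);  // stride 1, no padding
    const Status ok = NEConvolutionLayer::validate(&src, &wei, nullptr, &dst_i8, conv_info);
    const Status bad = NEConvolutionLayer::validate(&src, &wei, nullptr, &dst_f32, conv_info);
    std::cout << "i8 dst:  " << (static_cast<bool>(ok) ? "supported" : ok.error_description()) << '\n';
    std::cout << "f32 dst: " << (static_cast<bool>(bad) ? "supported" : bad.error_description()) << '\n';
}
```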

src/plugins/intel_cpu/src/nodes/executors/acl/acl_common_executor.cpp

Lines changed: 3 additions & 1 deletion

@@ -26,7 +26,9 @@ static const std::unordered_map<int, ACLArgs> argConvert = {{ARG_SRC_0, ACL_SRC_
                                                             {ARG_BIAS, ACL_BIAS},
                                                             {ARG_WEI, ACL_WEI},
                                                             {ARG_DST, ACL_DST},
-                                                            {ARG_DST_DEQ_SCALE, ACL_DST_DEQ_SCALE}};
+                                                            {ARG_DST_DEQ_SCALE, ACL_DST_DEQ_SCALE},
+                                                            {ARG_ATTR_ZERO_POINTS | ARG_SRC_0, ACL_SRC_0_ZERO_POINTS},
+                                                            {ARG_ATTR_ZERO_POINTS | ARG_DST, ACL_DST_ZERO_POINTS}};
 
 using ACLTypes = std::array<arm_compute::DataType, ACLArgs::COUNT_OF_ARGS>;
 using ACLLayouts = std::array<arm_compute::DataLayout, ACLArgs::COUNT_OF_ARGS>;

src/plugins/intel_cpu/src/nodes/executors/acl/acl_common_executor.hpp

Lines changed: 2 additions & 0 deletions

@@ -12,11 +12,13 @@ namespace ov::intel_cpu {
 
 enum ACLArgs : uint8_t {
     ACL_SRC_0,
+    ACL_SRC_0_ZERO_POINTS,
     ACL_SRC_1,
     ACL_SRC_2,
     ACL_BIAS,
     ACL_WEI,
     ACL_DST,
+    ACL_DST_ZERO_POINTS,
     ACL_DST_DEQ_SCALE,
     COUNT_OF_ARGS
 };
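The two new enum slots pair with the argConvert additions above: the zero-points attribute flag is OR-ed onto a base argument id, so one flat map can resolve both a tensor and the zero points attached to it. Here is a self-contained toy sketch of that composite-key pattern; the id values are hypothetical stand-ins for the real constants in nodes/executors/memory_arguments.hpp:

```cpp
#include <cstdint>
#include <iostream>
#include <unordered_map>

// Hypothetical ids; the real values live in memory_arguments.hpp.
enum MemArg : int {
    ARG_SRC_0 = 1,
    ARG_DST = 3,
    ARG_ATTR_ZERO_POINTS = 1 << 10,  // attribute flag OR-ed onto a base id
};

enum ACLArgs : uint8_t { ACL_SRC_0, ACL_SRC_0_ZERO_POINTS, ACL_DST, ACL_DST_ZERO_POINTS };

int main() {
    // Same composite-key pattern as argConvert in acl_common_executor.cpp.
    const std::unordered_map<int, ACLArgs> argConvert = {
        {ARG_SRC_0, ACL_SRC_0},
        {ARG_DST, ACL_DST},
        {ARG_ATTR_ZERO_POINTS | ARG_SRC_0, ACL_SRC_0_ZERO_POINTS},
        {ARG_ATTR_ZERO_POINTS | ARG_DST, ACL_DST_ZERO_POINTS},
    };

    std::cout << int(argConvert.at(ARG_ATTR_ZERO_POINTS | ARG_SRC_0)) << '\n';  // prints 1
    std::cout << int(argConvert.at(ARG_DST)) << '\n';                           // prints 2
}
```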

src/plugins/intel_cpu/src/nodes/executors/acl/acl_conv.cpp

Lines changed: 164 additions & 0 deletions

@@ -0,0 +1,164 @@
+// Copyright (C) 2018-2025 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "acl_conv.hpp"
+
+#include <arm_compute/core/CoreTypes.h>
+#include <arm_compute/core/Error.h>
+#include <arm_compute/core/QuantizationInfo.h>
+#include <arm_compute/core/TensorInfo.h>
+#include <arm_compute/core/TensorShape.h>
+#include <arm_compute/core/Types.h>
+#include <arm_compute/runtime/NEON/functions/NEConvolutionLayer.h>
+
+#include <any>
+#include <cmath>
+#include <memory>
+
+#include "acl_utils.hpp"
+#include "cpu_shape.h"
+#include "memory_desc/cpu_memory_desc.h"
+#include "nodes/common/cpu_convert.h"
+#include "nodes/executors/acl/acl_common_executor.hpp"
+#include "nodes/executors/convolution_config.hpp"
+#include "nodes/executors/debug_messages.hpp"
+#include "nodes/executors/executor.hpp"
+#include "nodes/executors/memory_arguments.hpp"
+#include "openvino/core/except.hpp"
+#include "openvino/core/type/element_type.hpp"
+#include "post_ops.hpp"
+#include "utils/general_utils.h"
+
+namespace ov::intel_cpu {
+
+ACLConvolutionExecutor::ACLConvolutionExecutor(const ConvAttrs& attrs,
+                                               const MemoryArgs& memory,
+                                               [[maybe_unused]] const ExecutorContext::CPtr& context)
+    : weightScale(attrs.dqScales) {
+    MemoryDescPtr srcMemPtr = memory.at(ARG_SRC_0)->getDescPtr();
+    MemoryDescPtr weiMemPtr = memory.at(ARG_WEI)->getDescPtr();
+    MemoryDescPtr dstMemPtr = memory.at(ARG_DST)->getDescPtr();
+
+    Shape weiShape = weiMemPtr->getShape();
+    Shape srcShape = srcMemPtr->getShape();
+    Shape dstShape = dstMemPtr->getShape();
+
+    const auto with_groups = static_cast<const int>(weiShape.getRank() == srcShape.getRank() + 1);
+    const int kh = weiShape.getDims()[with_groups + srcShape.getRank() - 2];
+    const int kw = weiShape.getDims()[with_groups + srcShape.getRank() - 1];
+    const int oc = dstShape.getDims()[1];
+
+    weightsInfo = arm_compute::WeightsInfo(false, kw, kh, oc, false, arm_compute::WeightFormat::UNSPECIFIED);
+    auto paddingLeft = (attrs.paddingL.size() >= 2U) ? attrs.paddingL[1] : attrs.paddingL[0];
+    auto paddingRight = (attrs.paddingR.size() >= 2U) ? attrs.paddingR[1] : attrs.paddingR[0];
+    auto paddingTop = (attrs.paddingL.size() >= 2U) ? attrs.paddingL[0] : 0;
+    auto paddingBottom = (attrs.paddingR.size() >= 2U) ? attrs.paddingR[0] : 0;
+    padStrideInfo = arm_compute::PadStrideInfo(attrs.stride[0],
+                                               attrs.stride[1],
+                                               paddingLeft,
+                                               paddingRight,
+                                               paddingTop,
+                                               paddingBottom,
+                                               arm_compute::DimensionRoundingType::FLOOR);
+    dilation = arm_compute::Size2D(attrs.dilation[1] + 1, attrs.dilation[0] + 1);
+
+    if (attrs.postOps.size() == 1) {
+        if (const auto* const activation = std::any_cast<ActivationPostOp>(attrs.postOps.data())) {
+            activationLayerInfo = getActivationLayerInfo(convertToEltwiseAlgorithm(activation->type()),
+                                                         activation->alpha(),
+                                                         activation->beta(),
+                                                         activation->gamma());
+        } else if (const auto* const fq = std::any_cast<FakeQuantizePostOp>(attrs.postOps.data())) {
+            fqInputScale = fq->inputScale();
+            fqInputShift = fq->inputShift();
+            fqOutputScale = fq->outputScale();
+            fqOutputShift = fq->outputShift();
+            if (fqOutputScale.size() == 1 && fqOutputScale[0] == 1.0F && fqOutputShift.size() == 1 &&
+                fqOutputShift[0] == std::trunc(fqOutputShift[0])) {
+                for (auto& v : fqInputShift) {
+                    v += fqOutputShift[0];
+                }
+                fqOutputShift.clear();
+            }
+        } else {
+            OPENVINO_THROW("ACLConvolutionExecutor: the executor supports FakeQuantize and Activation post ops only");
+        }
+    } else if (attrs.postOps.size() > 1) {
+        OPENVINO_THROW("ACLConvolutionExecutor: ACL does not support more than 1 post op");
+    }
+}
+
+bool ACLConvolutionExecutor::supports(const ConvConfig& config) {
+    bool isQuantized = any_of(config.descs.at(ARG_SRC)->getPrecision(), ov::element::u8, ov::element::i8) &&
+                       config.descs.at(ARG_WEI)->getPrecision() == ov::element::i8;
+
+    VERIFY(isQuantized, UNSUPPORTED_SRC_PRECISIONS);
+    VERIFY(config.attrs.postOps.size() <= 1U, UNSUPPORTED_BY_EXECUTOR);
+
+    return true;
+}
+
+arm_compute::Status ACLConvolutionExecutor::validateTensorsInfo(const ACLInfos& aclMemoryInfos) {
+    // Note: LPT propagates the dequantization scales from src and weights to the conv output, and the resulting
+    // scale is applied as the weight scale. So the quantization configuration is formed in the following way:
+    // - src: quantization info is always trivial
+    // - weights: scale is equal to the result dequantization scale after Convolution propagated by LPT;
+    //            shift is not supported
+    // - destination: scale is formed based on requantization FakeQuantize parameters: scale = 1.0 / input scale,
+    //                shift = input shift
+    aclMemoryInfos[ACLArgs::ACL_SRC_0]->set_quantization_info(arm_compute::QuantizationInfo(1.0));
+    aclMemoryInfos[ACLArgs::ACL_WEI]->set_quantization_info(
+        arm_compute::QuantizationInfo(weightScale.empty() ? 1.0F : weightScale[0]));
+    aclMemoryInfos[ACLArgs::ACL_DST]->set_quantization_info(
+        arm_compute::QuantizationInfo(fqInputScale.empty() ? 1.0F : 1.0F / fqInputScale[0],
+                                      fqInputShift.empty() ? 0 : fqInputShift[0]));
+
+    return arm_compute::NEConvolutionLayer::validate(aclMemoryInfos[ACLArgs::ACL_SRC_0].get(),
+                                                     aclMemoryInfos[ACLArgs::ACL_WEI].get(),
+                                                     aclMemoryInfos[ACLArgs::ACL_BIAS].get(),
+                                                     aclMemoryInfos[ACLArgs::ACL_DST].get(),
+                                                     padStrideInfo,
+                                                     weightsInfo,
+                                                     dilation,
+                                                     activationLayerInfo);
+}
+
+ACLFunction ACLConvolutionExecutor::configureFunction(const ACLTensors& aclMemoryTensors) {
+    auto neConv = std::make_unique<arm_compute::NEConvolutionLayer>();
+
+    neConv->configure(aclMemoryTensors[ACLArgs::ACL_SRC_0].get(),
+                      aclMemoryTensors[ACLArgs::ACL_WEI].get(),
+                      aclMemoryTensors[ACLArgs::ACL_BIAS].get(),
+                      aclMemoryTensors[ACLArgs::ACL_DST].get(),
+                      padStrideInfo,
+                      weightsInfo,
+                      dilation,
+                      activationLayerInfo);
+    return neConv;
+}
+
+std::shared_ptr<arm_compute::TensorInfo> ACLConvolutionExecutor::initTensorInfo(
+    const arm_compute::TensorShape& tensorShape,
+    const arm_compute::DataType& dataType,
+    const arm_compute::DataLayout& dataLayout) {
+    arm_compute::DataType result = arm_compute::DataType::UNKNOWN;
+    switch (dataType) {
+    case arm_compute::DataType::S8: {
+        result = arm_compute::DataType::QASYMM8_SIGNED;
+        break;
+    }
+    case arm_compute::DataType::U8: {
+        result = arm_compute::DataType::QASYMM8;
+        break;
+    }
+    default: {
+        result = dataType;
+        break;
+    }
+    }
+
+    return ACLCommonExecutor::initTensorInfo(tensorShape, result, dataLayout);
+}
+
+} // namespace ov::intel_cpu
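The quantization setup in validateTensorsInfo() is worth a worked number. ACL quantizes a stored value as q = round(real / scale) + offset, so giving the dst a QuantizationInfo of (1.0 / fqInputScale, fqInputShift) makes ACL's store step compute exactly the requantizing FakeQuantize mapping q = real * inputScale + inputShift; that is also why the constructor only folds fqOutputShift into fqInputShift when the shift is integral. A small sketch with made-up numbers (not from the commit):

```cpp
#include <cmath>
#include <cstdio>

int main() {
    const float weight_scale = 0.02F;    // LPT-propagated dequantization scale
    const float fq_input_scale = 0.5F;   // requantizing FakeQuantize input scale
    const float fq_input_shift = 10.0F;  // requantizing FakeQuantize input shift (integral)

    const int acc = 1234;  // int32 accumulator: sum(src_q * wei_q) + bias
    const float real = acc * 1.0F /* trivial src scale */ * weight_scale;

    // ACL's store step with dst QuantizationInfo(1 / fq_input_scale, fq_input_shift):
    const float dst_scale = 1.0F / fq_input_scale;
    const int dst_offset = static_cast<int>(fq_input_shift);
    const int q = static_cast<int>(std::lround(real / dst_scale)) + dst_offset;

    // Matches the FakeQuantize mapping round(real * 0.5 + 10) because the shift is integral.
    std::printf("real = %.2f -> quantized = %d\n", real, q);  // real = 24.68 -> quantized = 22
}
```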

src/plugins/intel_cpu/src/nodes/executors/acl/acl_conv.hpp

Lines changed: 42 additions & 0 deletions

@@ -0,0 +1,42 @@
+// Copyright (C) 2018-2025 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include "acl_common_executor.hpp"
+#include "nodes/executors/convolution_config.hpp"
+
+namespace ov::intel_cpu {
+
+class ACLConvolutionExecutor : public ACLCommonExecutor {
+public:
+    ACLConvolutionExecutor(const ConvAttrs& attrs, const MemoryArgs& memory, const ExecutorContext::CPtr& context);
+
+    static bool supports(const ConvConfig& config);
+    void updateTensorsShapes(ACLShapes& aclMemoryShapes) override {}
+    arm_compute::Status validateTensorsInfo(const ACLInfos& aclMemoryInfos) override;
+    ACLFunction configureFunction(const ACLTensors& aclMemoryTensors) override;
+
+protected:
+    std::shared_ptr<arm_compute::TensorInfo> initTensorInfo(const arm_compute::TensorShape& tensorShape,
+                                                            const arm_compute::DataType& dataType,
+                                                            const arm_compute::DataLayout& dataLayout) override;
+
+private:
+    ConvAttrs convAttrs;
+    arm_compute::PadStrideInfo padStrideInfo;
+    arm_compute::WeightsInfo weightsInfo;
+    arm_compute::Size2D dilation;
+    arm_compute::ActivationLayerInfo activationLayerInfo;
+
+    std::vector<float> fqInputScale;
+    std::vector<float> fqOutputScale;
+    std::vector<float> fqInputShift;
+    std::vector<float> fqOutputShift;
+    std::vector<float> weightScale;
+};
+
+using ACLConvolutionExecutorPtr = std::shared_ptr<ACLConvolutionExecutor>;
+
+} // namespace ov::intel_cpu

src/plugins/intel_cpu/src/nodes/executors/acl/acl_lowp_fullyconnected.cpp

Lines changed: 4 additions & 2 deletions

@@ -41,8 +41,10 @@ static bool checkPostOps(const PostOps& postOps) {
         return false;
     }
 
-    const auto& activation = std::any_cast<const ActivationPostOp&>(postOps[0]);
-    return checkActivationLayerInfo(convertToEltwiseAlgorithm(activation.type()));
+    if (const auto& activation = std::any_cast<const ActivationPostOp>(postOps.data())) {
+        return checkActivationLayerInfo(convertToEltwiseAlgorithm(activation->type()));
+    }
+    return false;
 }
 
 static void initFCAttrs(const FCAttrs& attrs,
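The change swaps the throwing reference form of std::any_cast for the pointer form: when the first post op is not an activation, the pointer overload returns nullptr and checkPostOps() now falls through to return false instead of raising std::bad_any_cast. A minimal illustration of the non-throwing overload:

```cpp
#include <any>
#include <iostream>
#include <string>

int main() {
    std::any op = std::string("relu");

    // Pointer overload: returns nullptr on a type mismatch, never throws.
    if (const auto* name = std::any_cast<std::string>(&op)) {
        std::cout << "activation: " << *name << '\n';
    }
    if (std::any_cast<int>(&op) == nullptr) {
        std::cout << "not an int post op; no exception thrown\n";
    }
}
```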

src/plugins/intel_cpu/src/nodes/executors/convolution_implementations.cpp

Lines changed: 30 additions & 1 deletion

@@ -29,6 +29,10 @@
 # include "nodes/executors/executor.hpp"
 #endif
 
+#if defined(OV_CPU_WITH_ACL)
+# include "nodes/executors/acl/acl_conv.hpp"
+#endif
+
 namespace ov::intel_cpu {
 
 using namespace ov::element;
@@ -64,6 +68,12 @@ static const TypeMapping dnnlConvTypeMapping {
     {{_any, _any, _any, _any}, {just<f32>(), just<f32>(), just<f32>(), just<f32>()}},
     // @todo explicitly cover configuration limitations for oneDNN on ARM
 };
+
+static const TypeMapping aclLowpConvTypeMapping {
+    // {src, wei, bia, dst}          pt<src, wei, bias, dst>
+    {{_u8, _u8 | _i8, _any, _u8},    {bypass(), bypass(), just<i32>(), bypass()}},
+    {{_i8, _i8, _any, _i8},          {bypass(), bypass(), just<i32>(), bypass()}},
+};
 // clang-format on
 struct CreateOptimalConfigDefault {
     std::optional<ConvConfig> operator()(const ConvConfig& config) const {
@@ -73,6 +83,14 @@ struct CreateOptimalConfigDefault {
     LayoutConfig layoutConfig;
 };
 
+struct CreateOptimalConfigAclLowp {
+    std::optional<ConvConfig> operator()(const ConvConfig& config) const {
+        return createOptimalConfigCommon(config, aclLowpConvTypeMapping, layoutConfig, dnnlConvolutionMappingNotation);
+    }
+
+    LayoutConfig layoutConfig;
+};
+
 [[maybe_unused]] static inline bool isQuantized(const ConvConfig& config) {
     return any_of(config.descs.at(ARG_SRC)->getPrecision(), ov::element::u8, ov::element::i8) &&
            config.descs.at(ARG_WEI)->getPrecision() == ov::element::i8;
@@ -233,7 +251,7 @@ const std::vector<ExecutorImplementation<ConvAttrs>>& getImplementations() {
             [](const ConvConfig& config, const MemoryFormatFilter& memoryFormatFilter) -> bool {
                 VERIFY(MatchesMemoryFormatFilter(config.descs, LayoutConfig{LayoutType::nspc, LayoutType::ncsp, LayoutType::nspc, LayoutType::nspc},
                                                  memoryFormatFilter, dnnlConvolutionMappingNotation), MEMORY_FORMAT_MISMATCH);
-
+                VERIFY(!isQuantized(config), UNSUPPORTED_SRC_PRECISIONS);
                 return true;
             },
             CreateOptimalConfigDefault{{LayoutType::nspc, LayoutType::ncsp, LayoutType::nspc, LayoutType::nspc}},
@@ -256,6 +274,17 @@ const std::vector<ExecutorImplementation<ConvAttrs>>& getImplementations() {
             AcceptsAnyShape<ConvAttrs>,
             CreateDnnlDefault<DnnlConvolutionPrimitive, ConvAttrs>{}
         )
+        OV_CPU_INSTANCE_ACL(
+            "convolution_acl_lowp", ExecutorType::Acl, OperationType::Convolution,
+            // supports
+            [](const ConvConfig& config, [[maybe_unused]] const MemoryFormatFilter& memoryFormatFilter) -> bool {
+                VERIFY(ACLConvolutionExecutor::supports(config), UNSUPPORTED_BY_EXECUTOR);
+                return true;
+            },
+            CreateOptimalConfigAclLowp{{LayoutType::ncsp, LayoutType::ncsp, LayoutType::ncsp, LayoutType::ncsp}},
+            AcceptsAnyShape<ConvAttrs>,
+            CreateDefault<ACLConvolutionExecutor, ConvAttrs>{}
+        )
     };
 
     return convolutionImplementations;
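Reading the aclLowpConvTypeMapping rows: the src, wei, and dst precisions pass through unchanged (bypass()), while the bias is always materialized as i32 (just<i32>()), which matches the s32-bias-only limitation called out in the commit description. A toy rendering of what the two rows express, not the plugin's real TypeMapping machinery:

```cpp
#include <array>
#include <iostream>
#include <string>

// Toy stand-in for the two aclLowpConvTypeMapping rows.
std::array<std::string, 4> resolveLowpConfig(const std::string& src, const std::string& wei) {
    if (src == "u8" && (wei == "u8" || wei == "i8")) {
        return {src, wei, "i32", "u8"};  // row 1: {_u8, _u8 | _i8, _any, _u8}
    }
    if (src == "i8" && wei == "i8") {
        return {src, wei, "i32", "i8"};  // row 2: {_i8, _i8, _any, _i8}
    }
    return {"", "", "", ""};  // no lowp ACL config for this combination
}

int main() {
    const auto cfg = resolveLowpConfig("u8", "i8");
    std::cout << "src=" << cfg[0] << " wei=" << cfg[1] << " bia=" << cfg[2] << " dst=" << cfg[3] << '\n';
}
```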

src/plugins/intel_cpu/src/nodes/fake_quantize.cpp

Lines changed: 9 additions & 7 deletions

@@ -1781,11 +1781,11 @@ void FakeQuantize::executeReference() {
         parallel_nd(N, C, D, H, W, [&](dim_t n, dim_t c, dim_t d, dim_t h, dim_t w) {
             size_t src_off = n * s_str[0];
             if (srcDims.size() == 5) {
-                src_off += d * s_str[2] + h * s_str[3] + w * s_str[4];
+                src_off += c * s_str[1] + d * s_str[2] + h * s_str[3] + w * s_str[4];
             } else if (srcDims.size() == 4) {
-                src_off += h * s_str[2] + w * s_str[3];
+                src_off += c * s_str[1] + h * s_str[2] + w * s_str[3];
             } else if (srcDims.size() == 3) {
-                src_off += h * s_str[2];
+                src_off += c * s_str[1] + h * s_str[2];
             } else if (srcDims.size() == 2) {
                 src_off += c * s_str[1];
             }
@@ -1809,13 +1809,15 @@ void FakeQuantize::executeReference() {
             dst_val = roundf(dst_val);
             dst_val = dst_val * osc + osh;
 
-            size_t dst_off = n * d_str[0] + c * d_str[1];
+            size_t dst_off = n * d_str[0];
             if (dstDims.size() == 5) {
-                dst_off += d * d_str[2] + h * d_str[3] + w * d_str[4];
+                dst_off += c * d_str[1] + d * d_str[2] + h * d_str[3] + w * d_str[4];
             } else if (dstDims.size() == 4) {
-                dst_off += h * d_str[2] + w * d_str[3];
+                dst_off += c * d_str[1] + h * d_str[2] + w * d_str[3];
             } else if (dstDims.size() == 3) {
-                dst_off += h * d_str[2];
+                dst_off += c * d_str[1] + h * d_str[2];
+            } else if (dstDims.size() == 2) {
+                dst_off += c * d_str[1];
             }
 
             dst[dst_off] = dst_val;
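The reference-path change fixes an indexing bug: for 3D, 4D, and 5D shapes the source offset never added the channel term c * s_str[1], so every channel read the data of channel 0 (the destination side already included the term and is only restructured to match, gaining an explicit 2D branch). A tiny arithmetic sketch with hypothetical dense strides:

```cpp
#include <cstddef>
#include <iostream>

int main() {
    // Strides of a dense 2x3x4x5 NCHW tensor.
    const std::size_t s[4] = {3 * 4 * 5, 4 * 5, 5, 1};
    const std::size_t n = 1, c = 2, h = 3, w = 4;

    const std::size_t buggy = n * s[0] + h * s[2] + w * s[3];             // channel term dropped
    const std::size_t fixed = n * s[0] + c * s[1] + h * s[2] + w * s[3];  // as in the patch
    std::cout << buggy << " vs " << fixed << '\n';                        // 79 vs 119
}
```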

src/plugins/intel_cpu/src/nodes_factory.cpp

Lines changed: 2 additions & 0 deletions

@@ -119,6 +119,7 @@
 #endif
 
 #if defined(OPENVINO_ARCH_ARM64)
+# include "nodes/fake_quantize.h"
 # include "nodes/paged_attn.h"
 #endif
 
@@ -246,6 +247,7 @@ Node::NodesFactory::NodesFactory() : Factory("NodesFactory") {
         INTEL_CPU_NODE(PagedAttention, Type::PagedAttention);
         INTEL_CPU_NODE(RMSNorm, Type::RMS);
 #elif defined(OPENVINO_ARCH_ARM64)
+        INTEL_CPU_NODE(FakeQuantize, Type::FakeQuantize);
         INTEL_CPU_NODE(PagedAttention, Type::PagedAttention);
 #endif
 }
