
Commit 7de5a5d

[CPU][ARM] int8 Convolution support (#30457)
### Details:
- This is the initial support of int8 convolution on ARM
- Limitations:
  - s32 bias support only
  - i8 / u8 output support only

### Tickets:
- CVS-167319
1 parent 892e720 commit 7de5a5d

File tree

11 files changed: +440 −17 lines changed

src/plugins/intel_cpu/src/nodes/conv.cpp

Lines changed: 7 additions & 0 deletions

@@ -427,6 +427,13 @@ std::tuple<ov::element::Type, ov::element::Type> Convolution::getDstAndSumPrecis
     }
 };
 
+    // ACL requires dst precision matches src precision for int8
+#if defined(OPENVINO_ARCH_ARM) || defined(OPENVINO_ARCH_ARM64)
+    if (canBeExecutedInInt8()) {
+        return {getOriginalInputPrecisionAtPort(0), ov::element::dynamic};
+    }
+#endif
+
     auto dstType = getOriginalOutputPrecisionAtPort(0);
 
     // make sure dst type is equal to the output type of the last fused node
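For context on this guard: ACL rejects a quantized int8 src combined with a float dst in NEConvolutionLayer::validate(), which is why the dst precision is pinned to the src precision here. Below is a minimal standalone sketch of that check; it is not part of the commit, and it assumes Arm Compute Library headers are available and that the ACL builds we are aware of reject the float-dst combination:

```cpp
#include <arm_compute/core/Error.h>
#include <arm_compute/core/QuantizationInfo.h>
#include <arm_compute/core/TensorInfo.h>
#include <arm_compute/core/Types.h>
#include <arm_compute/runtime/NEON/functions/NEConvolutionLayer.h>

#include <iostream>

int main() {
    using namespace arm_compute;
    // NCHW 1x16x8x8 int8 input and 16 int8 3x3 kernels (TensorShape order is W, H, C, N).
    TensorInfo src(TensorShape(8U, 8U, 16U, 1U), 1, DataType::QASYMM8_SIGNED, QuantizationInfo(1.0F));
    TensorInfo wei(TensorShape(3U, 3U, 16U, 16U), 1, DataType::QASYMM8_SIGNED, QuantizationInfo(0.02F));
    TensorInfo dst_i8(TensorShape(6U, 6U, 16U, 1U), 1, DataType::QASYMM8_SIGNED, QuantizationInfo(0.5F));
    TensorInfo dst_f32(TensorShape(6U, 6U, 16U, 1U), 1, DataType::F32);

    const PadStrideInfo conv_info(1, 1, 0, 0);  // stride 1, no padding
    const Status ok = NEConvolutionLayer::validate(&src, &wei, nullptr, &dst_i8, conv_info);
    const Status bad = NEConvolutionLayer::validate(&src, &wei, nullptr, &dst_f32, conv_info);
    std::cout << "i8 dst:  " << (static_cast<bool>(ok) ? "supported" : ok.error_description()) << '\n';
    std::cout << "f32 dst: " << (static_cast<bool>(bad) ? "supported" : bad.error_description()) << '\n';
}
```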

src/plugins/intel_cpu/src/nodes/executors/acl/acl_common_executor.cpp

Lines changed: 3 additions & 1 deletion

@@ -26,7 +26,9 @@ static const std::unordered_map<int, ACLArgs> argConvert = {{ARG_SRC_0, ACL_SRC_
                                                             {ARG_BIAS, ACL_BIAS},
                                                             {ARG_WEI, ACL_WEI},
                                                             {ARG_DST, ACL_DST},
-                                                            {ARG_DST_DEQ_SCALE, ACL_DST_DEQ_SCALE}};
+                                                            {ARG_DST_DEQ_SCALE, ACL_DST_DEQ_SCALE},
+                                                            {ARG_ATTR_ZERO_POINTS | ARG_SRC_0, ACL_SRC_0_ZERO_POINTS},
+                                                            {ARG_ATTR_ZERO_POINTS | ARG_DST, ACL_DST_ZERO_POINTS}};
 
 using ACLTypes = std::array<arm_compute::DataType, ACLArgs::COUNT_OF_ARGS>;
 using ACLLayouts = std::array<arm_compute::DataLayout, ACLArgs::COUNT_OF_ARGS>;

src/plugins/intel_cpu/src/nodes/executors/acl/acl_common_executor.hpp

Lines changed: 2 additions & 0 deletions

@@ -12,11 +12,13 @@ namespace ov::intel_cpu {
 
 enum ACLArgs : uint8_t {
     ACL_SRC_0,
+    ACL_SRC_0_ZERO_POINTS,
     ACL_SRC_1,
     ACL_SRC_2,
     ACL_BIAS,
     ACL_WEI,
     ACL_DST,
+    ACL_DST_ZERO_POINTS,
     ACL_DST_DEQ_SCALE,
     COUNT_OF_ARGS
 };
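The two new enum slots pair with the argConvert additions above: the zero-points attribute flag is OR-ed onto a base argument id, so one flat map can resolve both a tensor and the zero points attached to it. Here is a self-contained toy sketch of that composite-key pattern; the id values are hypothetical stand-ins for the real constants in nodes/executors/memory_arguments.hpp:

```cpp
#include <cstdint>
#include <iostream>
#include <unordered_map>

// Hypothetical ids; the real values live in memory_arguments.hpp.
enum MemArg : int {
    ARG_SRC_0 = 1,
    ARG_DST = 3,
    ARG_ATTR_ZERO_POINTS = 1 << 10,  // attribute flag OR-ed onto a base id
};

enum ACLArgs : uint8_t { ACL_SRC_0, ACL_SRC_0_ZERO_POINTS, ACL_DST, ACL_DST_ZERO_POINTS };

int main() {
    // Same composite-key pattern as argConvert in acl_common_executor.cpp.
    const std::unordered_map<int, ACLArgs> argConvert = {
        {ARG_SRC_0, ACL_SRC_0},
        {ARG_DST, ACL_DST},
        {ARG_ATTR_ZERO_POINTS | ARG_SRC_0, ACL_SRC_0_ZERO_POINTS},
        {ARG_ATTR_ZERO_POINTS | ARG_DST, ACL_DST_ZERO_POINTS},
    };

    std::cout << int(argConvert.at(ARG_ATTR_ZERO_POINTS | ARG_SRC_0)) << '\n';  // prints 1
    std::cout << int(argConvert.at(ARG_DST)) << '\n';                           // prints 2
}
```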

src/plugins/intel_cpu/src/nodes/executors/acl/acl_conv.cpp

Lines changed: 164 additions & 0 deletions

@@ -0,0 +1,164 @@
+// Copyright (C) 2018-2025 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "acl_conv.hpp"
+
+#include <arm_compute/core/CoreTypes.h>
+#include <arm_compute/core/Error.h>
+#include <arm_compute/core/QuantizationInfo.h>
+#include <arm_compute/core/TensorInfo.h>
+#include <arm_compute/core/TensorShape.h>
+#include <arm_compute/core/Types.h>
+#include <arm_compute/runtime/NEON/functions/NEConvolutionLayer.h>
+
+#include <any>
+#include <cmath>
+#include <memory>
+
+#include "acl_utils.hpp"
+#include "cpu_shape.h"
+#include "memory_desc/cpu_memory_desc.h"
+#include "nodes/common/cpu_convert.h"
+#include "nodes/executors/acl/acl_common_executor.hpp"
+#include "nodes/executors/convolution_config.hpp"
+#include "nodes/executors/debug_messages.hpp"
+#include "nodes/executors/executor.hpp"
+#include "nodes/executors/memory_arguments.hpp"
+#include "openvino/core/except.hpp"
+#include "openvino/core/type/element_type.hpp"
+#include "post_ops.hpp"
+#include "utils/general_utils.h"
+
+namespace ov::intel_cpu {
+
+ACLConvolutionExecutor::ACLConvolutionExecutor(const ConvAttrs& attrs,
+                                               const MemoryArgs& memory,
+                                               [[maybe_unused]] const ExecutorContext::CPtr& context)
+    : weightScale(attrs.dqScales) {
+    MemoryDescPtr srcMemPtr = memory.at(ARG_SRC_0)->getDescPtr();
+    MemoryDescPtr weiMemPtr = memory.at(ARG_WEI)->getDescPtr();
+    MemoryDescPtr dstMemPtr = memory.at(ARG_DST)->getDescPtr();
+
+    Shape weiShape = weiMemPtr->getShape();
+    Shape srcShape = srcMemPtr->getShape();
+    Shape dstShape = dstMemPtr->getShape();
+
+    const auto with_groups = static_cast<const int>(weiShape.getRank() == srcShape.getRank() + 1);
+    const int kh = weiShape.getDims()[with_groups + srcShape.getRank() - 2];
+    const int kw = weiShape.getDims()[with_groups + srcShape.getRank() - 1];
+    const int oc = dstShape.getDims()[1];
+
+    weightsInfo = arm_compute::WeightsInfo(false, kw, kh, oc, false, arm_compute::WeightFormat::UNSPECIFIED);
+    auto paddingLeft = (attrs.paddingL.size() >= 2U) ? attrs.paddingL[1] : attrs.paddingL[0];
+    auto paddingRight = (attrs.paddingR.size() >= 2U) ? attrs.paddingR[1] : attrs.paddingR[0];
+    auto paddingTop = (attrs.paddingL.size() >= 2U) ? attrs.paddingL[0] : 0;
+    auto paddingBottom = (attrs.paddingR.size() >= 2U) ? attrs.paddingR[0] : 0;
+    padStrideInfo = arm_compute::PadStrideInfo(attrs.stride[0],
+                                               attrs.stride[1],
+                                               paddingLeft,
+                                               paddingRight,
+                                               paddingTop,
+                                               paddingBottom,
+                                               arm_compute::DimensionRoundingType::FLOOR);
+    dilation = arm_compute::Size2D(attrs.dilation[1] + 1, attrs.dilation[0] + 1);
+
+    if (attrs.postOps.size() == 1) {
+        if (const auto* const activation = std::any_cast<ActivationPostOp>(attrs.postOps.data())) {
+            activationLayerInfo = getActivationLayerInfo(convertToEltwiseAlgorithm(activation->type()),
+                                                         activation->alpha(),
+                                                         activation->beta(),
+                                                         activation->gamma());
+        } else if (const auto* const fq = std::any_cast<FakeQuantizePostOp>(attrs.postOps.data())) {
+            fqInputScale = fq->inputScale();
+            fqInputShift = fq->inputShift();
+            fqOutputScale = fq->outputScale();
+            fqOutputShift = fq->outputShift();
+            if (fqOutputScale.size() == 1 && fqOutputScale[0] == 1.0F && fqOutputShift.size() == 1 &&
+                fqOutputShift[0] == std::trunc(fqOutputShift[0])) {
+                for (auto& v : fqInputShift) {
+                    v += fqOutputShift[0];
+                }
+                fqOutputShift.clear();
+            }
+        } else {
+            OPENVINO_THROW("ACLConvolutionExecutor: the executor supports FakeQuantize and Activation post ops only");
+        }
+    } else if (attrs.postOps.size() > 1) {
+        OPENVINO_THROW("ACLConvolutionExecutor: ACL does not support more than 1 post op");
+    }
+}
+
+bool ACLConvolutionExecutor::supports(const ConvConfig& config) {
+    bool isQuantized = any_of(config.descs.at(ARG_SRC)->getPrecision(), ov::element::u8, ov::element::i8) &&
+                       config.descs.at(ARG_WEI)->getPrecision() == ov::element::i8;
+
+    VERIFY(isQuantized, UNSUPPORTED_SRC_PRECISIONS);
+    VERIFY(config.attrs.postOps.size() <= 1U, UNSUPPORTED_BY_EXECUTOR);
+
+    return true;
+}
+
+arm_compute::Status ACLConvolutionExecutor::validateTensorsInfo(const ACLInfos& aclMemoryInfos) {
+    // Note: LPT propagates the dequantization scales from src and weights to the conv output, and the resulting
+    // scale is applied as the weight scale. So the quantization configuration is formed in the following way:
+    // - src: quantization info is always trivial
+    // - weights: scale is equal to the result dequantization scale after Convolution propagated by LPT;
+    //            shift is not supported
+    // - destination: scale is formed based on requantization FakeQuantize parameters: scale = 1.0 / input scale,
+    //                shift = input shift
+    aclMemoryInfos[ACLArgs::ACL_SRC_0]->set_quantization_info(arm_compute::QuantizationInfo(1.0));
+    aclMemoryInfos[ACLArgs::ACL_WEI]->set_quantization_info(
+        arm_compute::QuantizationInfo(weightScale.empty() ? 1.0F : weightScale[0]));
+    aclMemoryInfos[ACLArgs::ACL_DST]->set_quantization_info(
+        arm_compute::QuantizationInfo(fqInputScale.empty() ? 1.0F : 1.0F / fqInputScale[0],
+                                      fqInputShift.empty() ? 0 : fqInputShift[0]));
+
+    return arm_compute::NEConvolutionLayer::validate(aclMemoryInfos[ACLArgs::ACL_SRC_0].get(),
+                                                     aclMemoryInfos[ACLArgs::ACL_WEI].get(),
+                                                     aclMemoryInfos[ACLArgs::ACL_BIAS].get(),
+                                                     aclMemoryInfos[ACLArgs::ACL_DST].get(),
+                                                     padStrideInfo,
+                                                     weightsInfo,
+                                                     dilation,
+                                                     activationLayerInfo);
+}
+
+ACLFunction ACLConvolutionExecutor::configureFunction(const ACLTensors& aclMemoryTensors) {
+    auto neConv = std::make_unique<arm_compute::NEConvolutionLayer>();
+
+    neConv->configure(aclMemoryTensors[ACLArgs::ACL_SRC_0].get(),
+                      aclMemoryTensors[ACLArgs::ACL_WEI].get(),
+                      aclMemoryTensors[ACLArgs::ACL_BIAS].get(),
+                      aclMemoryTensors[ACLArgs::ACL_DST].get(),
+                      padStrideInfo,
+                      weightsInfo,
+                      dilation,
+                      activationLayerInfo);
+    return neConv;
+}
+
+std::shared_ptr<arm_compute::TensorInfo> ACLConvolutionExecutor::initTensorInfo(
+    const arm_compute::TensorShape& tensorShape,
+    const arm_compute::DataType& dataType,
+    const arm_compute::DataLayout& dataLayout) {
+    arm_compute::DataType result = arm_compute::DataType::UNKNOWN;
+    switch (dataType) {
+    case arm_compute::DataType::S8: {
+        result = arm_compute::DataType::QASYMM8_SIGNED;
+        break;
+    }
+    case arm_compute::DataType::U8: {
+        result = arm_compute::DataType::QASYMM8;
+        break;
+    }
+    default: {
+        result = dataType;
+        break;
+    }
+    }
+
+    return ACLCommonExecutor::initTensorInfo(tensorShape, result, dataLayout);
+}
+
+} // namespace ov::intel_cpu
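The quantization setup in validateTensorsInfo() is worth a worked number. ACL quantizes a stored value as q = round(real / scale) + offset, so giving the dst a QuantizationInfo of (1.0 / fqInputScale, fqInputShift) makes ACL's store step compute exactly the requantizing FakeQuantize mapping q = real * inputScale + inputShift; that is also why the constructor only folds fqOutputShift into fqInputShift when the shift is integral. A small sketch with made-up numbers (not from the commit):

```cpp
#include <cmath>
#include <cstdio>

int main() {
    const float weight_scale = 0.02F;    // LPT-propagated dequantization scale
    const float fq_input_scale = 0.5F;   // requantizing FakeQuantize input scale
    const float fq_input_shift = 10.0F;  // requantizing FakeQuantize input shift (integral)

    const int acc = 1234;  // int32 accumulator: sum(src_q * wei_q) + bias
    const float real = acc * 1.0F /* trivial src scale */ * weight_scale;

    // ACL's store step with dst QuantizationInfo(1 / fq_input_scale, fq_input_shift):
    const float dst_scale = 1.0F / fq_input_scale;
    const int dst_offset = static_cast<int>(fq_input_shift);
    const int q = static_cast<int>(std::lround(real / dst_scale)) + dst_offset;

    // Matches the FakeQuantize mapping round(real * 0.5 + 10) because the shift is integral.
    std::printf("real = %.2f -> quantized = %d\n", real, q);  // real = 24.68 -> quantized = 22
}
```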

src/plugins/intel_cpu/src/nodes/executors/acl/acl_conv.hpp

Lines changed: 42 additions & 0 deletions

@@ -0,0 +1,42 @@
+// Copyright (C) 2018-2025 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include "acl_common_executor.hpp"
+#include "nodes/executors/convolution_config.hpp"
+
+namespace ov::intel_cpu {
+
+class ACLConvolutionExecutor : public ACLCommonExecutor {
+public:
+    ACLConvolutionExecutor(const ConvAttrs& attrs, const MemoryArgs& memory, const ExecutorContext::CPtr& context);
+
+    static bool supports(const ConvConfig& config);
+    void updateTensorsShapes(ACLShapes& aclMemoryShapes) override {}
+    arm_compute::Status validateTensorsInfo(const ACLInfos& aclMemoryInfos) override;
+    ACLFunction configureFunction(const ACLTensors& aclMemoryTensors) override;
+
+protected:
+    std::shared_ptr<arm_compute::TensorInfo> initTensorInfo(const arm_compute::TensorShape& tensorShape,
+                                                            const arm_compute::DataType& dataType,
+                                                            const arm_compute::DataLayout& dataLayout) override;
+
+private:
+    ConvAttrs convAttrs;
+    arm_compute::PadStrideInfo padStrideInfo;
+    arm_compute::WeightsInfo weightsInfo;
+    arm_compute::Size2D dilation;
+    arm_compute::ActivationLayerInfo activationLayerInfo;
+
+    std::vector<float> fqInputScale;
+    std::vector<float> fqOutputScale;
+    std::vector<float> fqInputShift;
+    std::vector<float> fqOutputShift;
+    std::vector<float> weightScale;
+};
+
+using ACLConvolutionExecutorPtr = std::shared_ptr<ACLConvolutionExecutor>;
+
+} // namespace ov::intel_cpu

src/plugins/intel_cpu/src/nodes/executors/acl/acl_lowp_fullyconnected.cpp

Lines changed: 4 additions & 2 deletions

@@ -41,8 +41,10 @@ static bool checkPostOps(const PostOps& postOps) {
         return false;
     }
 
-    const auto& activation = std::any_cast<const ActivationPostOp&>(postOps[0]);
-    return checkActivationLayerInfo(convertToEltwiseAlgorithm(activation.type()));
+    if (const auto& activation = std::any_cast<const ActivationPostOp>(postOps.data())) {
+        return checkActivationLayerInfo(convertToEltwiseAlgorithm(activation->type()));
+    }
+    return false;
 }
 
 static void initFCAttrs(const FCAttrs& attrs,
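The change swaps the throwing reference form of std::any_cast for the pointer form: when the first post op is not an activation, the pointer overload returns nullptr and checkPostOps() now falls through to return false instead of raising std::bad_any_cast. A minimal illustration of the non-throwing overload:

```cpp
#include <any>
#include <iostream>
#include <string>

int main() {
    std::any op = std::string("relu");

    // Pointer overload: returns nullptr on a type mismatch, never throws.
    if (const auto* name = std::any_cast<std::string>(&op)) {
        std::cout << "activation: " << *name << '\n';
    }
    if (std::any_cast<int>(&op) == nullptr) {
        std::cout << "not an int post op; no exception thrown\n";
    }
}
```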

src/plugins/intel_cpu/src/nodes/executors/convolution_implementations.cpp

Lines changed: 30 additions & 1 deletion

@@ -29,6 +29,10 @@
 # include "nodes/executors/executor.hpp"
 #endif
 
+#if defined(OV_CPU_WITH_ACL)
+# include "nodes/executors/acl/acl_conv.hpp"
+#endif
+
 namespace ov::intel_cpu {
 
 using namespace ov::element;
@@ -64,6 +68,12 @@ static const TypeMapping dnnlConvTypeMapping {
     {{_any, _any, _any, _any}, {just<f32>(), just<f32>(), just<f32>(), just<f32>()}},
     // @todo explicitly cover configuration limitations for oneDNN on ARM
 };
+
+static const TypeMapping aclLowpConvTypeMapping {
+    // {src, wei, bia, dst}          pt<src, wei, bias, dst>
+    {{_u8, _u8 | _i8, _any, _u8},    {bypass(), bypass(), just<i32>(), bypass()}},
+    {{_i8, _i8, _any, _i8},          {bypass(), bypass(), just<i32>(), bypass()}},
+};
 // clang-format on
 struct CreateOptimalConfigDefault {
     std::optional<ConvConfig> operator()(const ConvConfig& config) const {
@@ -73,6 +83,14 @@ struct CreateOptimalConfigDefault {
     LayoutConfig layoutConfig;
 };
 
+struct CreateOptimalConfigAclLowp {
+    std::optional<ConvConfig> operator()(const ConvConfig& config) const {
+        return createOptimalConfigCommon(config, aclLowpConvTypeMapping, layoutConfig, dnnlConvolutionMappingNotation);
+    }
+
+    LayoutConfig layoutConfig;
+};
+
 [[maybe_unused]] static inline bool isQuantized(const ConvConfig& config) {
     return any_of(config.descs.at(ARG_SRC)->getPrecision(), ov::element::u8, ov::element::i8) &&
            config.descs.at(ARG_WEI)->getPrecision() == ov::element::i8;
@@ -233,7 +251,7 @@ const std::vector<ExecutorImplementation<ConvAttrs>>& getImplementations() {
             [](const ConvConfig& config, const MemoryFormatFilter& memoryFormatFilter) -> bool {
                 VERIFY(MatchesMemoryFormatFilter(config.descs, LayoutConfig{LayoutType::nspc, LayoutType::ncsp, LayoutType::nspc, LayoutType::nspc},
                                                  memoryFormatFilter, dnnlConvolutionMappingNotation), MEMORY_FORMAT_MISMATCH);
-
+                VERIFY(!isQuantized(config), UNSUPPORTED_SRC_PRECISIONS);
                 return true;
             },
             CreateOptimalConfigDefault{{LayoutType::nspc, LayoutType::ncsp, LayoutType::nspc, LayoutType::nspc}},
@@ -256,6 +274,17 @@ const std::vector<ExecutorImplementation<ConvAttrs>>& getImplementations() {
             AcceptsAnyShape<ConvAttrs>,
             CreateDnnlDefault<DnnlConvolutionPrimitive, ConvAttrs>{}
         )
+        OV_CPU_INSTANCE_ACL(
+            "convolution_acl_lowp", ExecutorType::Acl, OperationType::Convolution,
+            // supports
+            [](const ConvConfig& config, [[maybe_unused]] const MemoryFormatFilter& memoryFormatFilter) -> bool {
+                VERIFY(ACLConvolutionExecutor::supports(config), UNSUPPORTED_BY_EXECUTOR);
+                return true;
+            },
+            CreateOptimalConfigAclLowp{{LayoutType::ncsp, LayoutType::ncsp, LayoutType::ncsp, LayoutType::ncsp}},
+            AcceptsAnyShape<ConvAttrs>,
+            CreateDefault<ACLConvolutionExecutor, ConvAttrs>{}
+        )
     };
 
     return convolutionImplementations;
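Reading the aclLowpConvTypeMapping rows: the src, wei, and dst precisions pass through unchanged (bypass()), while the bias is always materialized as i32 (just<i32>()), which matches the s32-bias-only limitation called out in the commit description. A toy rendering of what the two rows express, not the plugin's real TypeMapping machinery:

```cpp
#include <array>
#include <iostream>
#include <string>

// Toy stand-in for the two aclLowpConvTypeMapping rows.
std::array<std::string, 4> resolveLowpConfig(const std::string& src, const std::string& wei) {
    if (src == "u8" && (wei == "u8" || wei == "i8")) {
        return {src, wei, "i32", "u8"};  // row 1: {_u8, _u8 | _i8, _any, _u8}
    }
    if (src == "i8" && wei == "i8") {
        return {src, wei, "i32", "i8"};  // row 2: {_i8, _i8, _any, _i8}
    }
    return {"", "", "", ""};  // no lowp ACL config for this combination
}

int main() {
    const auto cfg = resolveLowpConfig("u8", "i8");
    std::cout << "src=" << cfg[0] << " wei=" << cfg[1] << " bia=" << cfg[2] << " dst=" << cfg[3] << '\n';
}
```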

src/plugins/intel_cpu/src/nodes/fake_quantize.cpp

Lines changed: 9 additions & 7 deletions

@@ -1781,11 +1781,11 @@ void FakeQuantize::executeReference() {
         parallel_nd(N, C, D, H, W, [&](dim_t n, dim_t c, dim_t d, dim_t h, dim_t w) {
             size_t src_off = n * s_str[0];
             if (srcDims.size() == 5) {
-                src_off += d * s_str[2] + h * s_str[3] + w * s_str[4];
+                src_off += c * s_str[1] + d * s_str[2] + h * s_str[3] + w * s_str[4];
             } else if (srcDims.size() == 4) {
-                src_off += h * s_str[2] + w * s_str[3];
+                src_off += c * s_str[1] + h * s_str[2] + w * s_str[3];
             } else if (srcDims.size() == 3) {
-                src_off += h * s_str[2];
+                src_off += c * s_str[1] + h * s_str[2];
             } else if (srcDims.size() == 2) {
                 src_off += c * s_str[1];
             }
@@ -1809,13 +1809,15 @@ void FakeQuantize::executeReference() {
             dst_val = roundf(dst_val);
             dst_val = dst_val * osc + osh;
 
-            size_t dst_off = n * d_str[0] + c * d_str[1];
+            size_t dst_off = n * d_str[0];
             if (dstDims.size() == 5) {
-                dst_off += d * d_str[2] + h * d_str[3] + w * d_str[4];
+                dst_off += c * d_str[1] + d * d_str[2] + h * d_str[3] + w * d_str[4];
             } else if (dstDims.size() == 4) {
-                dst_off += h * d_str[2] + w * d_str[3];
+                dst_off += c * d_str[1] + h * d_str[2] + w * d_str[3];
             } else if (dstDims.size() == 3) {
-                dst_off += h * d_str[2];
+                dst_off += c * d_str[1] + h * d_str[2];
+            } else if (dstDims.size() == 2) {
+                dst_off += c * d_str[1];
             }
 
             dst[dst_off] = dst_val;
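The reference-path change fixes an indexing bug: for 3D, 4D, and 5D shapes the source offset never added the channel term c * s_str[1], so every channel read the data of channel 0 (the destination side already included the term and is only restructured to match, gaining an explicit 2D branch). A tiny arithmetic sketch with hypothetical dense strides:

```cpp
#include <cstddef>
#include <iostream>

int main() {
    // Strides of a dense 2x3x4x5 NCHW tensor.
    const std::size_t s[4] = {3 * 4 * 5, 4 * 5, 5, 1};
    const std::size_t n = 1, c = 2, h = 3, w = 4;

    const std::size_t buggy = n * s[0] + h * s[2] + w * s[3];             // channel term dropped
    const std::size_t fixed = n * s[0] + c * s[1] + h * s[2] + w * s[3];  // as in the patch
    std::cout << buggy << " vs " << fixed << '\n';                        // 79 vs 119
}
```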

src/plugins/intel_cpu/src/nodes_factory.cpp

Lines changed: 2 additions & 0 deletions

@@ -119,6 +119,7 @@
 #endif
 
 #if defined(OPENVINO_ARCH_ARM64)
+# include "nodes/fake_quantize.h"
 # include "nodes/paged_attn.h"
 #endif
 
@@ -246,6 +247,7 @@ Node::NodesFactory::NodesFactory() : Factory("NodesFactory") {
         INTEL_CPU_NODE(PagedAttention, Type::PagedAttention);
         INTEL_CPU_NODE(RMSNorm, Type::RMS);
 #elif defined(OPENVINO_ARCH_ARM64)
+        INTEL_CPU_NODE(FakeQuantize, Type::FakeQuantize);
         INTEL_CPU_NODE(PagedAttention, Type::PagedAttention);
 #endif
 }
