
Commit 5e6848d

[Cherry-pick] Support compile for arm ft (#25241)
1 parent 4d8c10a commit 5e6848d

9 files changed, +71 −55 lines

CMakeLists.txt

Lines changed: 7 additions & 0 deletions
```diff
@@ -88,6 +88,7 @@ option(WITH_DGC "Use DGC(Deep Gradient Compression) or not" ${WITH_DISTRIBUTE}
 option(SANITIZER_TYPE "Choose the type of sanitizer, options are: Address, Leak, Memory, Thread, Undefined" OFF)
 option(WITH_LITE "Compile Paddle Fluid with Lite Engine" OFF)
 option(WITH_NCCL "Compile PaddlePaddle with NCCL support" ON)
+option(WITH_ARM "Compile PaddlePaddle with arm support" OFF)
 
 # PY_VERSION
 if(NOT PY_VERSION)
@@ -199,6 +200,12 @@ if(WITH_AMD_GPU)
   include(hip)
 endif(WITH_AMD_GPU)
 
+if(WITH_ARM)
+  set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fPIC")
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC")
+  add_definitions(-DPADDLE_WITH_ARM)
+endif()
+
 set(PADDLE_PYTHON_BUILD_DIR "${CMAKE_CURRENT_BINARY_DIR}/python/build")
 
 set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG")
```
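
The `add_definitions(-DPADDLE_WITH_ARM)` line is what lets C++ sources such as search_compute.h (below) switch off x86-only code paths at preprocessing time. A minimal sketch of the pattern, illustrative rather than code from this commit:

```cpp
// Illustrative only: how a translation unit reacts to the
// PADDLE_WITH_ARM macro that CMake now defines globally.
#if defined(PADDLE_WITH_ARM)
// ARM build: x86 intrinsic headers like <immintrin.h> must not be
// included, and -m64 must not be passed to the compiler.
#else
#include <immintrin.h>  // x86 SIMD intrinsics (AVX/SSE)
#endif
```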

cmake/external/openblas.cmake

Lines changed: 3 additions & 0 deletions
```diff
@@ -19,6 +19,9 @@ SET(CBLAS_SOURCE_DIR ${THIRD_PARTY_PATH}/openblas/src/extern_openblas)
 SET(CBLAS_INSTALL_DIR ${THIRD_PARTY_PATH}/install/openblas)
 SET(CBLAS_REPOSITORY https://github.com/xianyi/OpenBLAS.git)
 SET(CBLAS_TAG v0.3.7)
+IF(WITH_ARM)
+    SET(CBLAS_TAG v0.2.18)
+ENDIF()
 cache_third_party(extern_openblas
     REPOSITORY   ${CBLAS_REPOSITORY}
     TAG          ${CBLAS_TAG}
```

cmake/flags.cmake

Lines changed: 1 addition & 1 deletion
```diff
@@ -187,7 +187,7 @@ set(GPU_COMMON_FLAGS
     -Wno-error=unused-function  # Warnings in Numpy Header.
     -Wno-error=array-bounds # Warnings in Eigen::array
 )
-if (NOT WITH_NV_JETSON)
+if (NOT WITH_NV_JETSON AND NOT WITH_ARM)
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -m64")
 endif()
 endif(NOT WIN32)
```

paddle/fluid/operators/match_matrix_tensor_op.cc

Lines changed: 2 additions & 2 deletions
```diff
@@ -288,8 +288,8 @@ class CPUMatchMatrixTensorOPGradKernel : public framework::OpKernel<T> {
           auto* r_data = bottom_r_data + (offset_r[b] + j) * dim_in;
           auto* r_diff = bottom_r_diff + (offset_r[b] + j) * dim_in;
           if (diff != 0.0) {
-            avx_axpy(r_data, l_trans_diff, dim_in, diff);
-            avx_axpy(l_trans_data, r_diff, dim_in, diff);
+            axpy(r_data, l_trans_diff, dim_in, diff);
+            axpy(l_trans_data, r_diff, dim_in, diff);
           }
         }
       }
```
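
The renamed call sites keep BLAS axpy semantics, y[i] += alpha * x[i]. For reference, a scalar sketch equivalent to what the SIMD paths in search_compute.h compute; the helper name `axpy_ref` is hypothetical:

```cpp
#include <cstddef>

// Hypothetical scalar reference for the axpy calls above:
// accumulates y[i] += alpha * x[i] over the whole range.
template <typename T>
inline void axpy_ref(const T* x, T* y, std::size_t len, T alpha) {
  for (std::size_t i = 0; i < len; ++i) {
    y[i] += alpha * x[i];
  }
}
```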

paddle/fluid/operators/pyramid_hash_op.cc

Lines changed: 4 additions & 6 deletions
```diff
@@ -385,8 +385,8 @@ class CPUPyramidHashOPKernel : public framework::OpKernel<T> {
     }
     auto weight_type = _blobs_0->type();
     if (_is_training == 0 && weight_type != framework::proto::VarType::INT8) {
-      avx_axpy_noadd(top_data, top_data, top->dims()[0] * top->dims()[1],
-                     _drop_out_percent);
+      axpy_noadd(top_data, top_data, top->dims()[0] * top->dims()[1],
+                 _drop_out_percent);
     }
   }
 };
@@ -451,7 +451,7 @@ class CPUPyramidHashOPGradKernel : public framework::OpKernel<T> {
                   int _space_len) const {
     for (int j = 0; j != _num_emb; j += _rand_len) {
       unsigned int pos = XXH32(hash_id, len * sizeof(T), j) % _space_len;
-      avx_axpy(top_pos + j, weights + pos, _rand_len, mlr);
+      axpy(top_pos + j, weights + pos, _rand_len, mlr);
     }
   }
 
@@ -525,9 +525,7 @@ REGISTER_OPERATOR(pyramid_hash_grad, ops::PyramidHashOpGrad);
 
 REGISTER_OP_CPU_KERNEL(
     pyramid_hash, ops::CPUPyramidHashOPKernel<plt::CPUDeviceContext, float>,
-    ops::CPUPyramidHashOPKernel<plt::CPUDeviceContext, double>,
     ops::CPUPyramidHashOPKernel<plt::CPUDeviceContext, int8_t>);
 REGISTER_OP_CPU_KERNEL(
     pyramid_hash_grad,
-    ops::CPUPyramidHashOPGradKernel<plt::CPUDeviceContext, float>,
-    ops::CPUPyramidHashOPGradKernel<plt::CPUDeviceContext, double>);
+    ops::CPUPyramidHashOPGradKernel<plt::CPUDeviceContext, float>);
```

The double-precision kernel registrations are removed, presumably because the renamed `axpy`/`axpy_noadd` helpers in search_compute.h (below) are now templates that expand to single-precision SIMD macros only, so a `double` instantiation would no longer compile.

paddle/fluid/operators/search_compute.h

Lines changed: 46 additions & 44 deletions
```diff
@@ -14,7 +14,9 @@ limitations under the License. */
 
 #pragma once
 
+#if !defined(PADDLE_WITH_ARM)
 #include <immintrin.h>
+#endif
 #include <cfloat>
 #include <cmath>
 #include <cstring>
@@ -72,6 +74,8 @@ void call_gemm_batched(const framework::ExecutionContext& ctx,
   }
 }
 
+#if !defined(PADDLE_WITH_ARM)
+
 #define __m256x __m256
 
 static const unsigned int AVX_STEP_SIZE = 8;
@@ -83,16 +87,25 @@ static const unsigned int AVX_CUT_LEN_MASK = 7U;
 #define _mm256_store_px _mm256_storeu_ps
 #define _mm256_broadcast_sx _mm256_broadcast_ss
 
-#define _mm256_mul_pd _mm256_mul_pd
-#define _mm256_add_pd _mm256_add_pd
-#define _mm256_load_pd _mm256_loadu_pd
-#define _mm256_store_pd _mm256_storeu_pd
-#define _mm256_broadcast_sd _mm256_broadcast_sd
+#define __m128x __m128
+
+static const unsigned int SSE_STEP_SIZE = 2;
+static const unsigned int SSE_CUT_LEN_MASK = 1U;
+
+#define _mm_add_px _mm_add_ps
+#define _mm_mul_px _mm_mul_ps
+#define _mm_load_px _mm_loadu_ps
+#define _mm_store_px _mm_storeu_ps
+#define _mm_load1_px _mm_load1_ps
+
+#endif
 
-inline void avx_axpy(const float* x, float* y, size_t len, const float alpha) {
+template <typename T>
+inline void axpy(const T* x, T* y, size_t len, const T alpha) {
   unsigned int jjj, lll;
   jjj = lll = 0;
 
+#ifdef PADDLE_WITH_AVX
   lll = len & ~AVX_CUT_LEN_MASK;
   __m256x mm_alpha = _mm256_broadcast_sx(&alpha);
   for (jjj = 0; jjj < lll; jjj += AVX_STEP_SIZE) {
@@ -101,66 +114,55 @@ inline void avx_axpy(const float* x, float* y, size_t len, const float alpha) {
         _mm256_add_px(_mm256_load_px(y + jjj),
                       _mm256_mul_px(mm_alpha, _mm256_load_px(x + jjj))));
   }
-
-  for (; jjj < len; jjj++) {
-    y[jjj] += alpha * x[jjj];
+#elif defined(PADDLE_WITH_ARM)
+  PADDLE_THROW(platform::errors::Unimplemented("axpy is not supported"));
+#else
+  lll = len & ~SSE_CUT_LEN_MASK;
+  __m128x mm_alpha = _mm_load1_px(&alpha);
+  for (jjj = 0; jjj < lll; jjj += SSE_STEP_SIZE) {
+    _mm_store_px(y + jjj,
+                 _mm_add_px(_mm_load_px(y + jjj),
+                            _mm_mul_px(mm_alpha, _mm_load_px(x + jjj))));
   }
-}
-
-inline void avx_axpy(const double* x, double* y, size_t len,
-                     const float alpha) {
-  unsigned int jjj, lll;
-  jjj = lll = 0;
 
-  lll = len & ~AVX_CUT_LEN_MASK;
-  double alpha_d = static_cast<double>(alpha);
-
-  __m256d mm_alpha = _mm256_broadcast_sd(&alpha_d);
-  for (jjj = 0; jjj < lll; jjj += AVX_STEP_SIZE) {
-    _mm256_store_pd(
-        y + jjj,
-        _mm256_add_pd(_mm256_load_pd(y + jjj),
-                      _mm256_mul_pd(mm_alpha, _mm256_load_pd(x + jjj))));
-  }
+#endif
 
   for (; jjj < len; jjj++) {
     y[jjj] += alpha * x[jjj];
   }
 }
-inline void avx_axpy_noadd(const double* x, double* y, size_t len,
-                           const float alpha) {
-  unsigned int jjj, lll;
-  jjj = lll = 0;
-  double alpha_d = static_cast<double>(alpha);
-  lll = len & ~AVX_CUT_LEN_MASK;
-  __m256d mm_alpha = _mm256_broadcast_sd(&alpha_d);
-  for (jjj = 0; jjj < lll; jjj += AVX_STEP_SIZE) {
-    _mm256_store_pd(y + jjj, _mm256_mul_pd(mm_alpha, _mm256_load_pd(x + jjj)));
-  }
 
-  for (; jjj < len; jjj++) {
-    y[jjj] = alpha * x[jjj];
-  }
-}
-inline void avx_axpy_noadd(const float* x, float* y, size_t len,
-                           const float alpha) {
+template <typename T>
+inline void axpy_noadd(const T* x, T* y, size_t len, const T alpha) {
   unsigned int jjj, lll;
   jjj = lll = 0;
 
+#ifdef PADDLE_WITH_AVX
   lll = len & ~AVX_CUT_LEN_MASK;
   __m256x mm_alpha = _mm256_broadcast_sx(&alpha);
   for (jjj = 0; jjj < lll; jjj += AVX_STEP_SIZE) {
     _mm256_store_px(y + jjj, _mm256_mul_px(mm_alpha, _mm256_load_px(x + jjj)));
   }
+#elif defined(PADDLE_WITH_ARM)
+  PADDLE_THROW(platform::errors::Unimplemented("axpy_noadd is not supported"));
+#else
+  lll = len & ~SSE_CUT_LEN_MASK;
+  __m128x mm_alpha = _mm_load1_px(&alpha);
+  for (jjj = 0; jjj < lll; jjj += SSE_STEP_SIZE) {
+    _mm_store_px(y + jjj, _mm_mul_px(mm_alpha, _mm_load_px(x + jjj)));
+  }
+
+#endif
 
   for (; jjj < len; jjj++) {
     y[jjj] = alpha * x[jjj];
   }
 }
-inline void avx_axpy_noadd(const int8_t* x, int8_t* y, size_t len,
-                           const float alpha) {
+
+inline void axpy_noadd(const int8_t* x, int8_t* y, size_t len,
+                       const float alpha) {
   PADDLE_THROW(platform::errors::Unimplemented(
-      "int8_t input of avx_axpy_noadd is not supported"));
+      "int8_t input of axpy_noadd is not supported"));
 }
 
 }  // namespace operators
```
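
On ARM this commit deliberately stubs both helpers with PADDLE_THROW rather than providing a NEON path. For comparison, a hypothetical NEON axpy for float, not part of this commit and assuming only the standard `<arm_neon.h>` intrinsics, could look like:

```cpp
#include <arm_neon.h>
#include <cstddef>

// Hypothetical NEON implementation of axpy (y += alpha * x); the
// commit above throws Unimplemented on ARM instead of doing this.
inline void axpy_neon(const float* x, float* y, std::size_t len,
                      float alpha) {
  std::size_t i = 0;
  const float32x4_t valpha = vdupq_n_f32(alpha);  // broadcast alpha
  for (; i + 4 <= len; i += 4) {
    float32x4_t vx = vld1q_f32(x + i);
    float32x4_t vy = vld1q_f32(y + i);
    // vmlaq_f32(a, b, c) computes a + b * c lane-wise.
    vst1q_f32(y + i, vmlaq_f32(vy, valpha, vx));
  }
  for (; i < len; ++i) {  // scalar remainder
    y[i] += alpha * x[i];
  }
}
```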

paddle/fluid/platform/cpu_info.cc

Lines changed: 2 additions & 0 deletions
```diff
@@ -139,6 +139,7 @@ bool MayIUse(const cpu_isa_t cpu_isa) {
   if (cpu_isa == isa_any) {
     return true;
   } else {
+#if !defined(WITH_NV_JETSON) && !defined(PADDLE_WITH_ARM)
     int reg[4];
     cpuid(reg, 0);
     int nIds = reg[0];
@@ -168,6 +169,7 @@ bool MayIUse(const cpu_isa_t cpu_isa) {
   }
 }
 #endif
+#endif
 
 }  // namespace platform
 }  // namespace paddle
```
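
The guarded block exists because the CPUID instruction (and `<cpuid.h>`) is x86-only, so the runtime ISA probe in MayIUse must be compiled out on ARM. For context, a minimal sketch of this kind of x86 feature probe, here checking the AVX bit; illustrative only, assuming GCC/Clang's `__get_cpuid`:

```cpp
#include <cpuid.h>  // GCC/Clang wrapper around the x86 CPUID instruction

// Illustrative x86-only probe: AVX support is bit 28 of ECX for
// CPUID leaf 1. ARM has no CPUID, hence the PADDLE_WITH_ARM guard.
inline bool cpu_has_avx() {
  unsigned int eax = 0, ebx = 0, ecx = 0, edx = 0;
  if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx)) return false;
  return (ecx & (1u << 28)) != 0;
}
```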

paddle/fluid/platform/cpu_info.h

Lines changed: 2 additions & 0 deletions
```diff
@@ -40,12 +40,14 @@ limitations under the License. */
 #ifdef _WIN32
 #define cpuid(reg, x) __cpuidex(reg, x, 0)
 #else
+#if !defined(WITH_NV_JETSON) && !defined(PADDLE_WITH_ARM)
 #include <cpuid.h>
 inline void cpuid(int reg[4], int x) {
   __cpuid_count(x, 0, reg[0], reg[1], reg[2], reg[3]);
 }
 #endif
 #endif
+#endif
 
 namespace paddle {
 namespace platform {
```

python/setup.py.in

Lines changed: 4 additions & 2 deletions
```diff
@@ -6,6 +6,7 @@ import shutil
 import sys
 import fnmatch
 import errno
+import platform
 
 from contextlib import contextmanager
 from setuptools import Command
@@ -301,8 +302,9 @@ if '${CMAKE_BUILD_TYPE}' == 'Release':
             command = "install_name_tool -id \"@loader_path/../libs/\" ${PADDLE_BINARY_DIR}/python/paddle/fluid/${FLUID_CORE_NAME}" + '.so'
         else:
             command = "patchelf --set-rpath '$ORIGIN/../libs/' ${PADDLE_BINARY_DIR}/python/paddle/fluid/${FLUID_CORE_NAME}" + '.so'
-        if os.system(command) != 0:
-            raise Exception("patch ${FLUID_CORE_NAME}.%s failed, command: %s" % (ext_name, command))
+        if platform.machine() != 'aarch64':
+            if os.system(command) != 0:
+                raise Exception("patch ${FLUID_CORE_NAME}.%s failed, command: %s" % (ext_name, command))
 
 ext_modules = [Extension('_foo', ['stub.cc'])]
 if os.name == 'nt':
```
