diff --git a/CMakeLists.txt b/CMakeLists.txt index 62b99d136..144c8b343 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -112,7 +112,7 @@ set(SOURCES src/cpu/backend.cc src/cpu/cpu_info.cc src/cpu/cpu_isa.cc - src/cpu/kernels.cc + #src/cpu/kernels.cc src/cpu/parallel.cc src/cpu/primitives.cc src/decoding.cc @@ -242,7 +242,10 @@ if(CMAKE_SYSTEM_PROCESSOR MATCHES "(arm64)|(aarch64)" elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "(x86_64)|(amd64)|(AMD64)") add_definitions(-DCT2_X86_BUILD) set(CT2_BUILD_ARCH "x86_64") - +elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "(ppc64le)|(PPC64LE)") + add_definitions(-DCT2_PPC64LE_BUILD) + set(CT2_BUILD_ARCH "ppc64le") + if(BUILD_SHARED_LIBS) set(CMAKE_POSITION_INDEPENDENT_CODE ON) endif() @@ -269,6 +272,8 @@ if(ENABLE_CPU_DISPATCH) endif() elseif(CT2_BUILD_ARCH STREQUAL "arm64") ct2_compile_kernels_for_isa(neon "-DUSE_NEON") + elseif(CT2_BUILD_ARCH STREQUAL "ppc64le") + ct2_compile_kernels_for_isa(ppc64le "-mcpu=power10 -O3 -flto") endif() endif() diff --git a/docker/Dockerfile.ppc64le b/docker/Dockerfile.ppc64le new file mode 100644 index 000000000..e608d359a --- /dev/null +++ b/docker/Dockerfile.ppc64le @@ -0,0 +1,91 @@ +FROM ppc64le/ubuntu:22.04 as builder + +RUN apt-get update && \ + apt-get install -y --no-install-recommends \ + python3-dev \ + python3-pip \ + wget \ + git \ + build-essential \ + && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* + + +WORKDIR /root + +RUN python3 -m pip --no-cache-dir install cmake==3.22.* + +RUN wget -qO- https://public.dhe.ibm.com/software/server/POWER/Linux/toolchain/at/ubuntu/dists/jammy/615d762f.gpg.key | tee /etc/apt/trusted.gpg.d/615d762f.asc && \ + echo "deb [signed-by=/etc/apt/trusted.gpg.d/615d762f.asc] https://public.dhe.ibm.com/software/server/POWER/Linux/toolchain/at/ubuntu jammy at17.0" >> /etc/apt/sources.list && \ + cat /etc/apt/sources.list && \ + cat /etc/apt/trusted.gpg.d/615d762f.asc && \ + apt update && \ + cat /etc/apt/sources.list && \ + apt install -y 
advance-toolchain-at17.0-runtime advance-toolchain-at17.0-devel advance-toolchain-at17.0-perf advance-toolchain-at17.0-mcore-libs + +ENV SLEEF_VERSION=3.6.1 +RUN wget -q https://github.com/shibatch/sleef/archive/refs/tags/${SLEEF_VERSION}.tar.gz && \ + tar xf *.tar.gz && \ + rm *.tar.gz && \ + cd sleef* && \ + mkdir build && \ + cd build && \ + cmake -DSLEEF_BUILD_INLINE_HEADERS=TRUE -DCMAKE_CXX_FLAGS='-mcpu=power10 -mtune=power10 -O3 -std=gnu++11 -maltivec -mabi=altivec -mstrict-align ' -DCMAKE_C_COMPILER=/opt/at17.0/bin/gcc -DCMAKE_CXX_COMPILER=/opt/at17.0/bin/g++ -DAT_PATH=/opt/at17.0/ -DBUILD_SHARED_LIBS=FALSE -DBUILD_TESTS=FALSE -DENFORCE_VSX3=TRUE -DSLEEF_SHOW_CONFIG=1 -DCMAKE_BUILD_TYPE=Release .. && \ + cd .. && \ + cmake --build build -j --clean-first && \ + cmake --install build --prefix=/usr/ + + +ENV ONEDNN_VERSION=3.1.1 +RUN wget -q https://github.com/oneapi-src/oneDNN/archive/refs/tags/v${ONEDNN_VERSION}.tar.gz && \ + tar xf *.tar.gz && \ + rm *.tar.gz && \ + cd oneDNN-* && \ + cmake -DCMAKE_BUILD_TYPE=Release -DONEDNN_LIBRARY_TYPE=STATIC -DONEDNN_BUILD_EXAMPLES=OFF -DONEDNN_BUILD_TESTS=OFF -DONEDNN_ENABLE_WORKLOAD=INFERENCE -DONEDNN_ENABLE_PRIMITIVE="CONVOLUTION;REORDER" -DONEDNN_BUILD_GRAPH=OFF -DCMAKE_CXX_FLAGS='-mcpu=power10 -mtune=power10 -O3 -maltivec' -DOPENMP_RUNTIME=COMP . && \ + make -j$(nproc) install && \ + cd .. && \ + rm -r oneDNN-* + +COPY third_party third_party +COPY cli cli +COPY include include +COPY src src +COPY cmake cmake +COPY python python +COPY CMakeLists.txt . + +ARG CXX_FLAGS +ENV CXX_FLAGS=${CXX_FLAGS:-"-mcpu=power10 -mtune=power10 -O3 -ffp-contract=off"} + +ENV CTRANSLATE2_ROOT=/opt/ctranslate2 + +RUN mkdir build && \ + cd build && \ + cmake -DCMAKE_INSTALL_PREFIX=${CTRANSLATE2_ROOT} \ + -DWITH_CUDA=OFF -DWITH_MKL=OFF -DWITH_OPENBLAS=OFF \ + -DWITH_DNNL=ON -DOPENMP_RUNTIME=COMP \ + -DCMAKE_CXX_FLAGS="${CXX_FLAGS}" \ + -DCMAKE_BUILD_TYPE=Release \ + .. 
&& \ + VERBOSE=1 make -j$(nproc) install + +ENV LANG=en_US.UTF-8 +COPY README.md . + +RUN cd python && \ + python3 -m pip --no-cache-dir install -r install_requirements.txt && \ + python3 setup-ppc64le.py bdist_wheel --dist-dir $CTRANSLATE2_ROOT + + +ENV CTRANSLATE2_ROOT=/opt/ctranslate2 +ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$CTRANSLATE2_ROOT/lib + +#COPY --from=builder $CTRANSLATE2_ROOT $CTRANSLATE2_ROOT +RUN pip3 install --force-reinstall ninja + + +RUN python3 -m pip --no-cache-dir install $CTRANSLATE2_ROOT/*.whl && \ + rm $CTRANSLATE2_ROOT/*.whl + +ENTRYPOINT ["/opt/ctranslate2/bin/ct2-translator"] diff --git a/docs/ppc64le.md b/docs/ppc64le.md new file mode 100644 index 000000000..4828aad2b --- /dev/null +++ b/docs/ppc64le.md @@ -0,0 +1,56 @@ +# IBM Power10 -ppc64le + +CTranslate2 fully supports the IBM Power10 MMA and VSX extensions. Each Power10 core has 4 Matrix Math Accelerator units. For optimum performance use at least SMT4; in some cases SMT8 seems to perform better, but it is advisable to try out both. A simple way to test this is to use the --intra_threads parameter to control the number of threads CTranslate2 is executing. At maximum this should be 8*number of physical cores (SMT-8). + +Based on preliminary testing, a Power10 core offers 27-42% higher tokens/s compared to an Intel Gold core. + +It should be possible to build for Power9, but the missing MMA units will have a significant impact on performance. + +OneDNN is used for int8 matrix math that fully utilizes the MMA units; it should be possible to build with OpenBLAS for 16-bit MMA usage. + +## Build docker / podman container + +This is the easy way: +```git clone --recursive https://github.com/OpenNMT/CTranslate2/ +cd CTranslate2/docker +podman build -t elinar.ai/ct2-ppc64le -f Dockerfile.ppc64le ..
+ +``` + +Then run the CTranslate2 container (substitute the mount point, MODEL_LOCATION and SRC_FILE): +```podman run --security-opt=label=disable --ipc=host --ulimit=host -it --rm -v /tmp:/tmp elinar.ai/ct2-ppc64le --model MODEL_LOCATION --src SRC_FILE --intra_threads 16``` + +## Install from sources +This build has been tested on RHEL 9 / ppc64le and requires IBM Advance Toolchain 17.0 ( https://www.ibm.com/support/pages/advance-toolchain-linux-power ) +``` +#sleef: +git clone -b 3.6.1 https://github.com/shibatch/sleef + +cd sleef +mkdir build && cd build +cmake -DSLEEF_BUILD_INLINE_HEADERS=TRUE -DCMAKE_CXX_FLAGS='-mcpu=power10 -mtune=power10 -O3 -std=gnu++11 -maltivec -mabi=altivec -mstrict-align ' -DCMAKE_C_COMPILER=/opt/at17.0/bin/gcc -DCMAKE_CXX_COMPILER=/opt/at17.0/bin/g++ -DAT_PATH=/opt/at17.0/ -DBUILD_SHARED_LIBS=FALSE -DBUILD_TESTS=FALSE -DENFORCE_VSX3=TRUE -DSLEEF_SHOW_CONFIG=1 -DCMAKE_BUILD_TYPE=Release .. + +cmake --build build -j --clean-first +sudo cmake --install build --prefix=/usr/ + + +#OneDNN; +git clone -b v3.2 --recursive https://github.com/oneapi-src/oneDNN +cd oneDNN +mkdir build && cd build +cmake -DCMAKE_CXX_FLAGS='-mcpu=power10 -mtune=power10 -O3 -maltivec' -DOPENMP_RUNTIME=COMP .. +make -j16 +sudo make install + + +git clone --recursive https://github.com/Dagamies/CTranslate2 +cd CTranslate2 +mkdir build +cd build +cmake -DWITH_CUDA=OFF -DWITH_MKL=OFF -DWITH_OPENBLAS=OFF -DWITH_DNNL=ON -DCMAKE_CXX_FLAGS='-mcpu=power10 -mtune=power10 -O3 -ffp-contract=off' -DOPENMP_RUNTIME=COMP ..
+make -j16 +sudo make install +sudo ldconfig -v +export LD_LIBRARY_PATH=/usr/local/lib64/ + +``` \ No newline at end of file diff --git a/python/setup-ppc64le.py b/python/setup-ppc64le.py new file mode 100644 index 000000000..51a21fd43 --- /dev/null +++ b/python/setup-ppc64le.py @@ -0,0 +1,126 @@ +import glob +import os +import sys + +import pybind11 + +from pybind11.setup_helpers import ParallelCompile +from setuptools import Extension, find_packages, setup + +base_dir = os.path.dirname(os.path.abspath(__file__)) +include_dirs = [pybind11.get_include()] +library_dirs = [] + + +def _get_long_description(): + readme_path = os.path.join(base_dir, "README.md") + if not os.path.exists(readme_path): + return "" + with open(readme_path, encoding="utf-8") as readme_file: + return readme_file.read() + + +def _get_project_version(): + version_path = os.path.join(base_dir, "ctranslate2", "version.py") + version = {} + with open(version_path, encoding="utf-8") as fp: + exec(fp.read(), version) + return version["__version__"] + + +def _maybe_add_library_root(lib_name): + if "%s_ROOT" % lib_name in os.environ: + root = os.environ["%s_ROOT" % lib_name] + include_dirs.append("%s/include" % root) + for lib_dir in ("lib", "lib64"): + path = "%s/%s" % (root, lib_dir) + if os.path.exists(path): + library_dirs.append(path) + break + + +_maybe_add_library_root("CTRANSLATE2") + +cflags = ["-std=c++17", "-fvisibility=hidden"] +ldflags = [] +package_data = {} +if sys.platform == "darwin": + # std::visit requires macOS 10.14 + cflags.append("-mmacosx-version-min=10.14") + ldflags.append("-Wl,-rpath,/usr/local/lib") +elif sys.platform == "win32": + cflags = ["/std:c++17", "/d2FH4-"] + package_data["ctranslate2"] = ["*.dll"] + +ctranslate2_module = Extension( + "ctranslate2._ext", + sources=glob.glob(os.path.join("cpp", "*.cc")), + extra_compile_args=cflags, + extra_link_args=ldflags, + include_dirs=include_dirs, + library_dirs=library_dirs, + libraries=["ctranslate2"], +) + 
+ParallelCompile("CMAKE_BUILD_PARALLEL_LEVEL").install() + +setup( + name="ctranslate2", + version=_get_project_version(), + license="MIT", + description="Fast inference engine for Transformer models", + long_description=_get_long_description(), + long_description_content_type="text/markdown", + author="OpenNMT", + url="https://opennmt.net", + classifiers=[ + "Development Status :: 5 - Production/Stable", + "Environment :: GPU :: NVIDIA CUDA :: 11.0", + "Environment :: GPU :: NVIDIA CUDA :: 11.1", + "Environment :: GPU :: NVIDIA CUDA :: 11.2", + "Environment :: GPU :: NVIDIA CUDA :: 11.3", + "Environment :: GPU :: NVIDIA CUDA :: 11.4", + "Environment :: GPU :: NVIDIA CUDA :: 11.5", + "Environment :: GPU :: NVIDIA CUDA :: 11.6", + "Environment :: GPU :: NVIDIA CUDA :: 11.7", + "Environment :: GPU :: NVIDIA CUDA :: 11.8", + "Intended Audience :: Developers", + "Intended Audience :: Science/Research", + "License :: OSI Approved :: MIT License", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3 :: Only", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Topic :: Scientific/Engineering :: Artificial Intelligence", + ], + project_urls={ + "Documentation": "https://opennmt.net/CTranslate2", + "Forum": "https://forum.opennmt.net", + "Gitter": "https://gitter.im/OpenNMT/CTranslate2", + "Source": "https://github.com/OpenNMT/CTranslate2", + }, + keywords="opennmt nmt neural machine translation cuda mkl inference quantization", + packages=find_packages(exclude=["bin"]), + package_data=package_data, + ext_modules=[ctranslate2_module], + python_requires=">=3.8", + install_requires=[ + "setuptools", + "numpy==1.25.2", + "pyyaml>=5.3,<7", + ], + entry_points={ + "console_scripts": [ + "ct2-fairseq-converter=ctranslate2.converters.fairseq:main", + 
"ct2-marian-converter=ctranslate2.converters.marian:main", + "ct2-openai-gpt2-converter=ctranslate2.converters.openai_gpt2:main", + "ct2-opennmt-py-converter=ctranslate2.converters.opennmt_py:main", + "ct2-opennmt-tf-converter=ctranslate2.converters.opennmt_tf:main", + "ct2-opus-mt-converter=ctranslate2.converters.opus_mt:main", + "ct2-transformers-converter=ctranslate2.converters.transformers:main", + ], + }, +) diff --git a/src/cpu/cpu_info.cc b/src/cpu/cpu_info.cc index 9030ac7a4..c320dae71 100644 --- a/src/cpu/cpu_info.cc +++ b/src/cpu/cpu_info.cc @@ -58,4 +58,20 @@ namespace ctranslate2 { } } +#elif defined(CT2_PPC64LE_BUILD) + +namespace ctranslate2 { + namespace cpu { + + const char* cpu_vendor() { + return "POWER"; + } + + bool cpu_supports_power10() { + return true; + } + + } +} + #endif diff --git a/src/cpu/cpu_info.h b/src/cpu/cpu_info.h index c2951bcc0..0c696805a 100644 --- a/src/cpu/cpu_info.h +++ b/src/cpu/cpu_info.h @@ -14,6 +14,8 @@ namespace ctranslate2 { bool cpu_supports_avx512(); #elif defined(CT2_ARM64_BUILD) bool cpu_supports_neon(); +#elif defined(CT2_PPC64LE_BUILD) + bool cpu_supports_power10(); #endif } diff --git a/src/cpu/cpu_isa.cc b/src/cpu/cpu_isa.cc index c16aeda22..c84c2a669 100644 --- a/src/cpu/cpu_isa.cc +++ b/src/cpu/cpu_isa.cc @@ -35,7 +35,11 @@ namespace ctranslate2 { #elif defined(CT2_ARM64_BUILD) case CpuIsa::NEON: return "NEON"; +#elif defined(CT2_PPC64LE_BUILD) + case CpuIsa::POWER10: + return "POWER10"; #endif + default: return "GENERIC"; } @@ -54,6 +58,9 @@ namespace ctranslate2 { #elif defined(CT2_ARM64_BUILD) if (env_isa == "NEON") return try_isa(env_isa, CpuIsa::NEON, cpu_supports_neon()); +#elif defined(CT2_PPC64LE_BUILD) + if (env_isa == "POWER10") + return try_isa(env_isa, CpuIsa::POWER10, cpu_supports_power10()); #endif if (env_isa == "GENERIC") return CpuIsa::GENERIC; @@ -71,6 +78,9 @@ namespace ctranslate2 { # elif defined(CT2_ARM64_BUILD) if (cpu_supports_neon()) return CpuIsa::NEON; +# elif 
defined(CT2_PPC64LE_BUILD) + if (cpu_supports_power10()) + return CpuIsa::POWER10; # endif #endif diff --git a/src/cpu/cpu_isa.h b/src/cpu/cpu_isa.h index 4f42bdf26..b32379c7b 100644 --- a/src/cpu/cpu_isa.h +++ b/src/cpu/cpu_isa.h @@ -6,13 +6,15 @@ namespace ctranslate2 { namespace cpu { enum class CpuIsa { - GENERIC, + GENERIC,POWER10, #if defined(CT2_X86_BUILD) AVX, AVX2, AVX512, #elif defined(CT2_ARM64_BUILD) NEON, + /*#elif defined(CT2_PPC64LE_BUILD) + POWER10,*/ #endif }; @@ -54,6 +56,11 @@ namespace ctranslate2 { CPU_ISA_CASE(cpu::CpuIsa::NEON, SINGLE_ARG(STMTS)) \ CPU_ISA_DEFAULT(cpu::CpuIsa::GENERIC, SINGLE_ARG(STMTS)) \ } +#elif defined(CT2_PPC64LE_BUILD) +# define CPU_ISA_DISPATCH(STMTS) \ + switch (cpu::get_cpu_isa()) { \ + CPU_ISA_DEFAULT(cpu::CpuIsa::POWER10, SINGLE_ARG(STMTS)) \ + } #endif #elif defined(__AVX512F__) # define CPU_ISA_DISPATCH(STMTS) \ diff --git a/src/cpu/kernels.cc b/src/cpu/kernels.cc index c1f48553d..cf186a74d 100644 --- a/src/cpu/kernels.cc +++ b/src/cpu/kernels.cc @@ -1,10 +1,13 @@ #include "cpu/kernels.h" - +//#include "cpu/cpu_isa.h" #include #if defined(__AVX512F__) # define TARGET_ISA CpuIsa::AVX512 # include "cpu/vec_avx512.h" +#elif defined(CT2_PPC64LE_BUILD) +# define TARGET_ISA CpuIsa::POWER10 +# include "cpu/vec_power10.h" #elif defined(__AVX2__) # define TARGET_ISA CpuIsa::AVX2 # include "cpu/vec_avx.h" @@ -14,6 +17,9 @@ #elif (defined(__ARM_NEON) && !defined(CT2_WITH_CPU_DISPATCH)) || defined(USE_NEON) # define TARGET_ISA CpuIsa::NEON # include "cpu/vec_neon.h" +//#elif defined(CT2_PPC64LE_BUILD) +//# define TARGET_ISA CpuIsa::GENERIC +//# include "cpu/vec_power10.h" #else # define TARGET_ISA CpuIsa::GENERIC # include "cpu/vec.h" diff --git a/src/cpu/vec_power10.h b/src/cpu/vec_power10.h new file mode 100644 index 000000000..c9c71abe3 --- /dev/null +++ b/src/cpu/vec_power10.h @@ -0,0 +1,236 @@ +#pragma once + + +#include +#include +#include +#include +#include + +#include + +#include "vec.h" + +#if defined(__GNUC__) || 
defined(__clang__) +# define __ct2_align16__ __attribute__((aligned(16))) +#else +# define __ct2_align16__ +#endif + +namespace ctranslate2 { + namespace cpu { + + #define ALIGNMENT_VALUE 16u + + template<> + struct Vec { + + using value_type = __ct2_align16__ __vector float; + using mask_type = __ct2_align16__ __vector bool int; + static constexpr dim_t width = 4; + + static inline value_type unaligned_load(const float* ptr){ + return (value_type){*ptr,*(ptr+1),*(ptr+2),*(ptr+3)}; + } + + + static inline value_type load(float value) { + return (value_type){value,value,value,value}; + } + + static inline value_type load(const float* ptr) { + return (value_type){*ptr,*(ptr+1),*(ptr+2),*(ptr+3)}; + } + + static inline value_type load(const float* ptr, dim_t count, float default_value = float(0)) { + if (count == width) { + return load(ptr); + } else { + __ct2_align16__ float tmp_values[width]; + std::fill(tmp_values, tmp_values + width, default_value); + std::copy(ptr, ptr + count, tmp_values); + return load(tmp_values); + } + } + + static inline value_type load_and_convert(const int32_t* ptr) { + return vec_ctf((vector signed int){*ptr,*(ptr+1),*(ptr+2),*(ptr+3)},0); + } + + static inline value_type load_and_convert(const int32_t* ptr, + dim_t count, + int32_t default_value = 0) { + if (count == width) { + return load_and_convert(ptr); + } else { + __ct2_align16__ int32_t tmp_values[width]; + std::fill(tmp_values, tmp_values + width, default_value); + std::copy(ptr, ptr + count, tmp_values); + return load_and_convert(tmp_values); + } + } + static inline void unaligned_store(value_type value, float* ptr) { + vec_xst(value,0,ptr); + } + + static inline void store(value_type value, float* ptr) { + if (((uintptr_t)ptr % ALIGNMENT_VALUE) != 0) + { + unaligned_store(value,ptr); + } else + vec_st(value,0,ptr); + } + + static inline void store(value_type value, float* ptr, dim_t count) { + if (count == width) { + store(value,ptr); + } else { + __ct2_align16__ float 
tmp_values[width]; + store(value,tmp_values); + std::copy(tmp_values, tmp_values + count, ptr); + } + } + + static inline value_type bit_and(value_type a, value_type b) { + return vec_and(a,b); + } + + static inline value_type bit_xor(value_type a, value_type b) { + return vec_xor(a,b); + } + + static inline mask_type lt(value_type a, value_type b) { + return vec_cmplt(a,b); + } + + static inline value_type select(mask_type mask, value_type a, value_type b) { + return vec_sel(a,b,mask); + } + + static inline value_type abs(value_type a) { + return vec_abs(a); + } + + static inline value_type neg(value_type a) { + return vec_neg(a); + } + + static inline value_type rcp(value_type a) { + return vec_re(a); + } + + static inline value_type exp(value_type a) { + return Sleef_expf4_u10vsx3(a); + } + + static inline value_type log(value_type a) { + return Sleef_logf4_u35vsx3(a); + + } + static inline value_type sin(value_type a) { + return Sleef_sinf4_u35vsx3(a); + } + + static inline value_type cos(value_type a) { + return Sleef_cosf4_u35vsx3(a); + + } + + static inline value_type tanh(value_type a) { + return Sleef_tanhf4_u35vsx3(a); + + } + + static inline value_type erf(value_type a) { + return Sleef_erff4_u10vsx3(a); + } + + static inline value_type max(value_type a, value_type b) { + return vec_max(a, b); + } + + static inline value_type min(value_type a, value_type b) { + return vec_min(a, b); + } + + static inline value_type add(value_type a, value_type b) { + return vec_add(a,b); + } + + static inline value_type sub(value_type a, value_type b) { + return vec_sub(a,b); + } + + static inline value_type mul(value_type a, value_type b) { + return vec_mul(a,b); + } + + static inline value_type div(value_type a, value_type b) { + return vec_div(a,b); + } + + static inline value_type mul_add(value_type a, value_type b, value_type c) { + + return vec_madd(a,b,c); + } + + static inline float reduce_add(value_type a) { + + + unsigned long __element_selector_10 = 1 & 0x03; 
+ unsigned long __element_selector_32 = (1 >> 2) & 0x03; + unsigned long __element_selector_54 = (1 >> 4) & 0x03; + unsigned long __element_selector_76 = (1 >> 6) & 0x03; + static const unsigned int __permute_selectors[4] = + { +#ifdef __LITTLE_ENDIAN__ + 0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C +#else + 0x00010203, 0x04050607, 0x08090A0B, 0x0C0D0E0F +#endif + }; + __vector unsigned int __t; + __t[0] = __permute_selectors[__element_selector_10]; + __t[1] = __permute_selectors[__element_selector_32]; + __t[2] = __permute_selectors[__element_selector_54] + 0x10101010; + __t[3] = __permute_selectors[__element_selector_76] + 0x10101010; + + __vector unsigned long long v1 = vec_mergel((__vector unsigned long long)a,(__vector unsigned long long)a); + value_type v2 = (value_type)a + (value_type)v1; + value_type v3 = vec_perm (v2, v2,(__vector unsigned char) __t); + return v2[0]+v3[0]; + } + + static inline float reduce_max(value_type a) { + float t0 = a[0] > a[1] ? a[0] : a[1]; + float t1 = a[2] > a[3] ? a[2] : a[3]; + return t0 > t1 ? 
t0 : t1; + } + + static inline value_type round(value_type a) { + return vec_round(a); + } + + static inline void convert_and_store(value_type v, int8_t *a, dim_t count) { + auto i32 = vec_cts(v,0); + + int8_t tmp[4]; + tmp[0]=i32[0]; + tmp[1]=i32[1]; + tmp[2]=i32[2]; + tmp[3]=i32[3]; + std::copy(tmp, tmp + count, a); + } + + static inline void convert_and_store(value_type v, uint8_t *a, dim_t count) { + auto u32 = vec_ctu(v,0); + uint8_t tmp[4]; + tmp[0]=u32[0]; + tmp[1]=u32[1]; + tmp[2]=u32[2]; + tmp[3]=u32[3]; + std::copy(tmp, tmp + count, a); + } + }; + } +} diff --git a/src/utils.cc b/src/utils.cc index 4f8bde57c..571fbb07c 100644 --- a/src/utils.cc +++ b/src/utils.cc @@ -42,6 +42,10 @@ namespace ctranslate2 { spdlog::info("CPU: {} (NEON={})", cpu::cpu_vendor(), cpu::cpu_supports_neon()); +#elif defined(CT2_PPC64LE_BUILD) + spdlog::info("CPU: {} (NEON={})", + cpu::cpu_vendor(), + cpu::cpu_supports_power10()); #endif spdlog::info(" - Selected ISA: {}", cpu::isa_to_str(cpu::get_cpu_isa())); spdlog::info(" - Use Intel MKL: {}", cpu::mayiuse_mkl());