diff --git a/.github/actions/multi-functest/action.yml b/.github/actions/multi-functest/action.yml index 252931918..64ccfebd1 100644 --- a/.github/actions/multi-functest/action.yml +++ b/.github/actions/multi-functest/action.yml @@ -119,7 +119,7 @@ runs: nix-verbose: ${{ inputs.nix-verbose }} gh_token: ${{ inputs.gh_token }} custom_shell: ${{ inputs.custom_shell }} - cflags: "${{ inputs.cflags }} -DMLK_FORCE_PPC64LE" + cflags: "${{ inputs.cflags }} -DMLK_FORCE_PPC64LE -mvsx" cross_prefix: powerpc64le-unknown-linux-gnu- exec_wrapper: qemu-ppc64le opt: ${{ inputs.opt }} diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 9f6f6fd39..b51ab25e2 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -134,16 +134,15 @@ jobs: runs-on: ${{ matrix.target.runner }} steps: - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 - - name: build + test + - name: build + test (no-opt) uses: ./.github/actions/multi-functest with: nix-shell: ${{ matrix.target.nix_shell }} nix-cache: ${{ matrix.target.mode == 'native' && 'false' || 'true' }} gh_token: ${{ secrets.GITHUB_TOKEN }} compile_mode: ${{ matrix.target.mode }} - # There is no native code yet on PPC64LE, R-V or AArch64_be, so no point running opt tests - opt: ${{ (matrix.target.arch != 'ppc64le' && matrix.target.arch != 'riscv64' && matrix.target.arch != 'riscv32' && matrix.target.arch != 'aarch64_be') && 'all' || 'no_opt' }} - - name: build + test (+debug+memsan+ubsan) + opt: 'no_opt' + - name: build + test (+debug+memsan+ubsan, native) uses: ./.github/actions/multi-functest if: ${{ matrix.target.mode == 'native' }} with: @@ -151,6 +150,17 @@ jobs: compile_mode: native cflags: "-DMLKEM_DEBUG -fsanitize=address -fsanitize=undefined -fno-sanitize-recover=all" check_namespace: 'false' + - name: build + test (+debug, cross, opt) + uses: ./.github/actions/multi-functest + # There is no native code yet on riscv64, riscv32 or AArch64_be, so no point running opt tests + if: ${{ 
matrix.target.mode != 'native' && (matrix.target.arch != 'riscv64' && matrix.target.arch != 'riscv32' && matrix.target.arch != 'aarch64_be') }} + with: + nix-shell: ${{ matrix.target.nix_shell }} + nix-cache: ${{ matrix.target.mode == 'native' && 'false' || 'true' }} + gh_token: ${{ secrets.GITHUB_TOKEN }} + compile_mode: ${{ matrix.target.mode }} + cflags: "-DMLKEM_DEBUG" + opt: 'opt' backend_tests: name: AArch64 FIPS202 backends (${{ matrix.backend }}) strategy: diff --git a/BIBLIOGRAPHY.md b/BIBLIOGRAPHY.md index f10a15f6e..d75d368ef 100644 --- a/BIBLIOGRAPHY.md +++ b/BIBLIOGRAPHY.md @@ -29,6 +29,7 @@ source code and documentation. - [examples/basic_deterministic/mlkem_native/custom_no_randomized_config.h](examples/basic_deterministic/mlkem_native/custom_no_randomized_config.h) - [integration/liboqs/config_aarch64.h](integration/liboqs/config_aarch64.h) - [integration/liboqs/config_c.h](integration/liboqs/config_c.h) + - [integration/liboqs/config_ppc64le.h](integration/liboqs/config_ppc64le.h) - [integration/liboqs/config_x86_64.h](integration/liboqs/config_x86_64.h) - [mlkem/src/config.h](mlkem/src/config.h) - [mlkem/src/kem.c](mlkem/src/kem.c) diff --git a/dev/ppc64le/README.md b/dev/ppc64le/README.md new file mode 100644 index 000000000..5125a40ea --- /dev/null +++ b/dev/ppc64le/README.md @@ -0,0 +1,6 @@ +[//]: # (SPDX-License-Identifier: CC-BY-4.0) + +# ppc64le backend (little endian) + +This directory contains a native backend for little endian POWER 8 (ppc64le) and above systems. + diff --git a/dev/ppc64le/meta.h b/dev/ppc64le/meta.h new file mode 100644 index 000000000..8fec0c2ad --- /dev/null +++ b/dev/ppc64le/meta.h @@ -0,0 +1,53 @@ +/* + * Copyright (c) The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +#ifndef MLK_DEV_PPC64LE_META_H +#define MLK_DEV_PPC64LE_META_H + +/* Identifier for this backend so that source and assembly files + * in the build can be appropriately guarded. 
*/ +#define MLK_ARITH_BACKEND_PPC64LE + +#define MLK_ARITH_BACKEND_NAME PPC64LE_DEFAULT + +/* Set of primitives that this backend replaces */ +#define MLK_USE_NATIVE_NTT +#define MLK_USE_NATIVE_INTT +#define MLK_USE_NATIVE_POLY_REDUCE +#define MLK_USE_NATIVE_POLY_TOMONT + +#if !defined(__ASSEMBLER__) +#include <stdint.h> +#include "../../common.h" +#include "../../params.h" +#include "../api.h" +#include "src/arith_native_ppc64le.h" + +static MLK_INLINE int mlk_ntt_native(int16_t data[MLKEM_N]) +{ + mlk_ntt_ppc(data, mlk_ppc_qdata); + return MLK_NATIVE_FUNC_SUCCESS; +} + +static MLK_INLINE int mlk_intt_native(int16_t data[MLKEM_N]) +{ + mlk_intt_ppc(data, mlk_ppc_qdata); + return MLK_NATIVE_FUNC_SUCCESS; +} + +static MLK_INLINE int mlk_poly_reduce_native(int16_t data[MLKEM_N]) +{ + mlk_reduce_ppc(data, mlk_ppc_qdata); + return MLK_NATIVE_FUNC_SUCCESS; +} + +static MLK_INLINE int mlk_poly_tomont_native(int16_t data[MLKEM_N]) +{ + mlk_poly_tomont_ppc(data, mlk_ppc_qdata); + return MLK_NATIVE_FUNC_SUCCESS; +} +#endif /* !__ASSEMBLER__ */ + +#endif /* !MLK_DEV_PPC64LE_META_H */ diff --git a/dev/ppc64le/src/arith_native_ppc64le.h b/dev/ppc64le/src/arith_native_ppc64le.h new file mode 100644 index 000000000..282b3566c --- /dev/null +++ b/dev/ppc64le/src/arith_native_ppc64le.h @@ -0,0 +1,24 @@ +/* + * Copyright (c) The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ +#ifndef MLK_DEV_PPC64LE_SRC_ARITH_NATIVE_PPC64LE_H +#define MLK_DEV_PPC64LE_SRC_ARITH_NATIVE_PPC64LE_H + +#include <stdint.h> +#include "../../../common.h" +#include "consts.h" + +#define mlk_ntt_ppc MLK_NAMESPACE(ntt_ppc) +void mlk_ntt_ppc(int16_t *, const int16_t *); + +#define mlk_intt_ppc MLK_NAMESPACE(intt_ppc) +void mlk_intt_ppc(int16_t *, const int16_t *); + +#define mlk_reduce_ppc MLK_NAMESPACE(reduce_ppc) +void mlk_reduce_ppc(int16_t *r, const int16_t *); + +#define mlk_poly_tomont_ppc MLK_NAMESPACE(poly_tomont_ppc) +void mlk_poly_tomont_ppc(int16_t *, const int16_t *); + +#endif /* 
!MLK_DEV_PPC64LE_SRC_ARITH_NATIVE_PPC64LE_H */ diff --git a/dev/ppc64le/src/consts.c b/dev/ppc64le/src/consts.c new file mode 100644 index 000000000..c9c869a60 --- /dev/null +++ b/dev/ppc64le/src/consts.c @@ -0,0 +1,77 @@ +/* + * Copyright (c) The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +#include "../../../common.h" + +#if defined(MLK_ARITH_BACKEND_PPC64LE) && \ + !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED) + +#include "consts.h" + +MLK_ALIGN const int16_t mlk_ppc_qdata[] = { + /* -Q */ + /* check-magic: -3329 == -1 * MLKEM_Q */ + -3329, + -3329, + -3329, + -3329, + -3329, + -3329, + -3329, + -3329, + /* QINV */ + /* check-magic: -3327 == pow(MLKEM_Q,-1,2^16) */ + -3327, + -3327, + -3327, + -3327, + -3327, + -3327, + -3327, + -3327, + /* Q */ + 3329, + 3329, + 3329, + 3329, + 3329, + 3329, + 3329, + 3329, + /* check-magic: 20159 == round(2^26 / MLKEM_Q) */ + 20159, + 20159, + 20159, + 20159, + 20159, + 20159, + 20159, + 20159, + /* check-magic: 1441 == pow(2,32-7,MLKEM_Q) */ + 1441, + 1441, + 1441, + 1441, + 1441, + 1441, + 1441, + 1441, + /* check-magic: 1353 == pow(2, 32, MLKEM_Q) */ + 1353, + 1353, + 1353, + 1353, + 1353, + 1353, + 1353, + 1353, +/* zetas for NTT */ +#include "consts_ntt.inc" + , +/* zetas for invNTT */ +#include "consts_intt.inc" +}; + +#endif /* MLK_ARITH_BACKEND_PPC64LE && !MLK_CONFIG_MULTILEVEL_NO_SHARED */ diff --git a/dev/ppc64le/src/consts.h b/dev/ppc64le/src/consts.h new file mode 100644 index 000000000..59de765cf --- /dev/null +++ b/dev/ppc64le/src/consts.h @@ -0,0 +1,29 @@ +/* + * Copyright (c) The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +#ifndef MLK_DEV_PPC64LE_SRC_CONSTS_H +#define MLK_DEV_PPC64LE_SRC_CONSTS_H +#include "../../../common.h" + +/* Offsets into the constant table */ +/* check-magic: off */ +#define NQ_OFFSET 0 +#define QINV_OFFSET 16 +#define Q_OFFSET 32 +#define C20159_OFFSET 48 +#define C1441_OFFSET 64 +#define 
C1353_OFFSET 80 +#define ZETA_NTT_OFFSET 96 +#define ZETA_NTT_OFFSET64 1104 +#define IZETA_NTT_OFFSET127 1616 +#define IZETA_NTT_OFFSET63 2128 +/* check-magic: on */ + +#ifndef __ASSEMBLER__ +#define mlk_ppc_qdata MLK_NAMESPACE(ppc_qdata) +extern const int16_t mlk_ppc_qdata[]; +#endif + +#endif /* !MLK_DEV_PPC64LE_SRC_CONSTS_H */ diff --git a/dev/ppc64le/src/consts_intt.inc b/dev/ppc64le/src/consts_intt.inc new file mode 100644 index 000000000..7cd95fcd0 --- /dev/null +++ b/dev/ppc64le/src/consts_intt.inc @@ -0,0 +1,90 @@ +/* + * Copyright (c) The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + + /* For intt Len=2, offset IZETA_NTT_OFFSET64 */ + -1103, -1103, -1103, -1103, 430, 430, 430, 430, 555, 555, 555, 555, 843, + 843, 843, 843, -1251, -1251, -1251, -1251, 871, 871, 871, 871, 1550, 1550, + 1550, 1550, 105, 105, 105, 105, 422, 422, 422, 422, 587, 587, 587, 587, 177, + 177, 177, 177, -235, -235, -235, -235, -291, -291, -291, -291, -460, -460, + -460, -460, 1574, 1574, 1574, 1574, 1653, 1653, 1653, 1653, -246, -246, + -246, -246, 778, 778, 778, 778, 1159, 1159, 1159, 1159, -147, -147, -147, + -147, -777, -777, -777, -777, 1483, 1483, 1483, 1483, -602, -602, -602, + -602, 1119, 1119, 1119, 1119, -1590, -1590, -1590, -1590, 644, 644, 644, + 644, -872, -872, -872, -872, 349, 349, 349, 349, 418, 418, 418, 418, 329, + 329, 329, 329, -156, -156, -156, -156, -75, -75, -75, -75, 817, 817, 817, + 817, 1097, 1097, 1097, 1097, 603, 603, 603, 603, 610, 610, 610, 610, 1322, + 1322, 1322, 1322, -1285, -1285, -1285, -1285, -1465, -1465, -1465, -1465, + 384, 384, 384, 384, -1215, -1215, -1215, -1215, -136, -136, -136, -136, + 1218, 1218, 1218, 1218, -1335, -1335, -1335, -1335, -874, -874, -874, -874, + 220, 220, 220, 220, -1187, -1187, -1187, -1187, -1659, -1659, -1659, -1659, + -1185, -1185, -1185, -1185, -1530, -1530, -1530, -1530, -1278, -1278, -1278, + -1278, 794, 794, 794, 794, -1510, -1510, -1510, -1510, -854, -854, -854, + -854, 
-870, -870, -870, -870, 478, 478, 478, 478, -108, -108, -108, -108, + -308, -308, -308, -308, 996, 996, 996, 996, 991, 991, 991, 991, 958, 958, + 958, 958, -1460, -1460, -1460, -1460, 1522, 1522, 1522, 1522, 1628, 1628, + 1628, 1628, + /* For intt Len=2, offset IZETA_NTT_OFFSET127 */ + 1628, 1628, 1628, 1628, 1522, 1522, 1522, 1522, -1460, -1460, -1460, -1460, + 958, 958, 958, 958, 991, 991, 991, 991, 996, 996, 996, 996, -308, -308, + -308, -308, -108, -108, -108, -108, 478, 478, 478, 478, -870, -870, -870, + -870, -854, -854, -854, -854, -1510, -1510, -1510, -1510, 794, 794, 794, + 794, -1278, -1278, -1278, -1278, -1530, -1530, -1530, -1530, -1185, -1185, + -1185, -1185, -1659, -1659, -1659, -1659, -1187, -1187, -1187, -1187, 220, + 220, 220, 220, -874, -874, -874, -874, -1335, -1335, -1335, -1335, 1218, + 1218, 1218, 1218, -136, -136, -136, -136, -1215, -1215, -1215, -1215, 384, + 384, 384, 384, -1465, -1465, -1465, -1465, -1285, -1285, -1285, -1285, 1322, + 1322, 1322, 1322, 610, 610, 610, 610, 603, 603, 603, 603, 1097, 1097, 1097, + 1097, 817, 817, 817, 817, -75, -75, -75, -75, -156, -156, -156, -156, 329, + 329, 329, 329, 418, 418, 418, 418, 349, 349, 349, 349, -872, -872, -872, + -872, 644, 644, 644, 644, -1590, -1590, -1590, -1590, 1119, 1119, 1119, + 1119, -602, -602, -602, -602, 1483, 1483, 1483, 1483, -777, -777, -777, + -777, -147, -147, -147, -147, 1159, 1159, 1159, 1159, 778, 778, 778, 778, + -246, -246, -246, -246, 1653, 1653, 1653, 1653, 1574, 1574, 1574, 1574, + -460, -460, -460, -460, -291, -291, -291, -291, -235, -235, -235, -235, 177, + 177, 177, 177, 587, 587, 587, 587, 422, 422, 422, 422, 105, 105, 105, 105, + 1550, 1550, 1550, 1550, 871, 871, 871, 871, -1251, -1251, -1251, -1251, 843, + 843, 843, 843, 555, 555, 555, 555, 430, 430, 430, 430, -1103, -1103, -1103, + -1103, + /* For intt Len=4 and others, offset IZETA_NTT_OFFSET63 */ + -1275, -1275, -1275, -1275, -1275, -1275, -1275, -1275, 677, 677, 677, 677, + 677, 677, 677, 677, -1065, -1065, 
-1065, -1065, -1065, -1065, -1065, -1065, + 448, 448, 448, 448, 448, 448, 448, 448, -725, -725, -725, -725, -725, -725, + -725, -725, -1508, -1508, -1508, -1508, -1508, -1508, -1508, -1508, 961, + 961, 961, 961, 961, 961, 961, 961, -398, -398, -398, -398, -398, -398, -398, + -398, -951, -951, -951, -951, -951, -951, -951, -951, -247, -247, -247, + -247, -247, -247, -247, -247, -1421, -1421, -1421, -1421, -1421, -1421, + -1421, -1421, 107, 107, 107, 107, 107, 107, 107, 107, 830, 830, 830, 830, + 830, 830, 830, 830, -271, -271, -271, -271, -271, -271, -271, -271, -90, + -90, -90, -90, -90, -90, -90, -90, -853, -853, -853, -853, -853, -853, -853, + -853, 1469, 1469, 1469, 1469, 1469, 1469, 1469, 1469, 126, 126, 126, 126, + 126, 126, 126, 126, -1162, -1162, -1162, -1162, -1162, -1162, -1162, -1162, + -1618, -1618, -1618, -1618, -1618, -1618, -1618, -1618, -666, -666, -666, + -666, -666, -666, -666, -666, -320, -320, -320, -320, -320, -320, -320, + -320, -8, -8, -8, -8, -8, -8, -8, -8, 516, 516, 516, 516, 516, 516, 516, + 516, -1544, -1544, -1544, -1544, -1544, -1544, -1544, -1544, -282, -282, + -282, -282, -282, -282, -282, -282, 1491, 1491, 1491, 1491, 1491, 1491, + 1491, 1491, -1293, -1293, -1293, -1293, -1293, -1293, -1293, -1293, 1015, + 1015, 1015, 1015, 1015, 1015, 1015, 1015, -552, -552, -552, -552, -552, + -552, -552, -552, 652, 652, 652, 652, 652, 652, 652, 652, 1223, 1223, 1223, + 1223, 1223, 1223, 1223, 1223, -1571, -1571, -1571, -1571, -1571, -1571, + -1571, -1571, -205, -205, -205, -205, -205, -205, -205, -205, 411, 411, 411, + 411, 411, 411, 411, 411, -1542, -1542, -1542, -1542, -1542, -1542, -1542, + -1542, 608, 608, 608, 608, 608, 608, 608, 608, 732, 732, 732, 732, 732, 732, + 732, 732, 1017, 1017, 1017, 1017, 1017, 1017, 1017, 1017, -681, -681, -681, + -681, -681, -681, -681, -681, -130, -130, -130, -130, -130, -130, -130, + -130, -1602, -1602, -1602, -1602, -1602, -1602, -1602, -1602, 1458, 1458, + 1458, 1458, 1458, 1458, 1458, 1458, -829, -829, -829, 
-829, -829, -829, + -829, -829, 383, 383, 383, 383, 383, 383, 383, 383, 264, 264, 264, 264, 264, + 264, 264, 264, -1325, -1325, -1325, -1325, -1325, -1325, -1325, -1325, 573, + 573, 573, 573, 573, 573, 573, 573, 1468, 1468, 1468, 1468, 1468, 1468, 1468, + 1468, -1474, -1474, -1474, -1474, -1474, -1474, -1474, -1474, -1202, -1202, + -1202, -1202, -1202, -1202, -1202, -1202, 962, 962, 962, 962, 962, 962, 962, + 962, 182, 182, 182, 182, 182, 182, 182, 182, 1577, 1577, 1577, 1577, 1577, + 1577, 1577, 1577, 622, 622, 622, 622, 622, 622, 622, 622, -171, -171, -171, + -171, -171, -171, -171, -171, 202, 202, 202, 202, 202, 202, 202, 202, 287, + 287, 287, 287, 287, 287, 287, 287, 1422, 1422, 1422, 1422, 1422, 1422, 1422, + 1422, 1493, 1493, 1493, 1493, 1493, 1493, 1493, 1493, -1517, -1517, -1517, + -1517, -1517, -1517, -1517, -1517, -359, -359, -359, -359, -359, -359, -359, + -359, -758, -758, -758, -758, -758, -758, -758, -758 diff --git a/dev/ppc64le/src/consts_ntt.inc b/dev/ppc64le/src/consts_ntt.inc new file mode 100644 index 000000000..bfb64e722 --- /dev/null +++ b/dev/ppc64le/src/consts_ntt.inc @@ -0,0 +1,45 @@ +/* + * Copyright (c) The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + /* For ntt Len=128, offset 96 */ + -758, -758, -758, -758, -758, -758, -758, -758, -359, -359, -359, -359, + -359, -359, -359, -359, -1517, -1517, -1517, -1517, -1517, -1517, -1517, + -1517, 1493, 1493, 1493, 1493, 1493, 1493, 1493, 1493, 1422, 1422, 1422, + 1422, 1422, 1422, 1422, 1422, 287, 287, 287, 287, 287, 287, 287, 287, 202, + 202, 202, 202, 202, 202, 202, 202, -171, -171, -171, -171, -171, -171, -171, + -171, 622, 622, 622, 622, 622, 622, 622, 622, 1577, 1577, 1577, 1577, 1577, + 1577, 1577, 1577, 182, 182, 182, 182, 182, 182, 182, 182, 962, 962, 962, + 962, 962, 962, 962, 962, -1202, -1202, -1202, -1202, -1202, -1202, -1202, + -1202, -1474, -1474, -1474, -1474, -1474, -1474, -1474, -1474, 1468, 1468, + 1468, 1468, 1468, 1468, 1468, 1468, 
573, 573, 573, 573, 573, 573, 573, 573, + -1325, -1325, -1325, -1325, -1325, -1325, -1325, -1325, 264, 264, 264, 264, + 264, 264, 264, 264, 383, 383, 383, 383, 383, 383, 383, 383, -829, -829, + -829, -829, -829, -829, -829, -829, 1458, 1458, 1458, 1458, 1458, 1458, + 1458, 1458, -1602, -1602, -1602, -1602, -1602, -1602, -1602, -1602, -130, + -130, -130, -130, -130, -130, -130, -130, -681, -681, -681, -681, -681, + -681, -681, -681, 1017, 1017, 1017, 1017, 1017, 1017, 1017, 1017, 732, 732, + 732, 732, 732, 732, 732, 732, 608, 608, 608, 608, 608, 608, 608, 608, -1542, + -1542, -1542, -1542, -1542, -1542, -1542, -1542, 411, 411, 411, 411, 411, + 411, 411, 411, -205, -205, -205, -205, -205, -205, -205, -205, -1571, -1571, + -1571, -1571, -1571, -1571, -1571, -1571, 1223, 1223, 1223, 1223, 1223, + 1223, 1223, 1223, 652, 652, 652, 652, 652, 652, 652, 652, -552, -552, -552, + -552, -552, -552, -552, -552, 1015, 1015, 1015, 1015, 1015, 1015, 1015, + 1015, -1293, -1293, -1293, -1293, -1293, -1293, -1293, -1293, 1491, 1491, + 1491, 1491, 1491, 1491, 1491, 1491, -282, -282, -282, -282, -282, -282, + -282, -282, -1544, -1544, -1544, -1544, -1544, -1544, -1544, -1544, 516, + 516, 516, 516, 516, 516, 516, 516, -8, -8, -8, -8, -8, -8, -8, -8, -320, + -320, -320, -320, -320, -320, -320, -320, -666, -666, -666, -666, -666, + -666, -666, -666, -1618, -1618, -1618, -1618, -1618, -1618, -1618, -1618, + -1162, -1162, -1162, -1162, -1162, -1162, -1162, -1162, 126, 126, 126, 126, + 126, 126, 126, 126, 1469, 1469, 1469, 1469, 1469, 1469, 1469, 1469, -853, + -853, -853, -853, -853, -853, -853, -853, -90, -90, -90, -90, -90, -90, -90, + -90, -271, -271, -271, -271, -271, -271, -271, -271, 830, 830, 830, 830, + 830, 830, 830, 830, 107, 107, 107, 107, 107, 107, 107, 107, -1421, -1421, + -1421, -1421, -1421, -1421, -1421, -1421, -247, -247, -247, -247, -247, + -247, -247, -247, -951, -951, -951, -951, -951, -951, -951, -951, -398, + -398, -398, -398, -398, -398, -398, -398, 961, 961, 961, 961, 
961, 961, 961, + 961, -1508, -1508, -1508, -1508, -1508, -1508, -1508, -1508, -725, -725, + -725, -725, -725, -725, -725, -725, 448, 448, 448, 448, 448, 448, 448, 448, + -1065, -1065, -1065, -1065, -1065, -1065, -1065, -1065, 677, 677, 677, 677, + 677, 677, 677, 677, -1275, -1275, -1275, -1275, -1275, -1275, -1275, -1275 diff --git a/dev/ppc64le/src/intt_ppc.S b/dev/ppc64le/src/intt_ppc.S new file mode 100644 index 000000000..3d056c850 --- /dev/null +++ b/dev/ppc64le/src/intt_ppc.S @@ -0,0 +1,658 @@ +/* + * Copyright (c) The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +/* + * Copyright 2025- IBM Corp. + * + *=================================================================================== + * Written by Danny Tsen + * + */ + +#include "../../../common.h" +#if defined(MLK_ARITH_BACKEND_PPC64LE) && \ + !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED) +/* simpasm: header-end */ + +#include "consts.h" + +// Barrett reduce constants +#define V20159 0 +#define V_25 1 +#define V_26 2 +#define V_MKQ 3 + +// Montgomery reduce constants +#define V_QINV 2 +#define V_NMKQ 5 +#define V_Z0 7 +#define V_Z1 8 +#define V_Z2 9 +#define V_Z3 10 +#define V_ZETA 10 +#define V1441 10 + +.macro Load_4Coeffs start next step + mr 9, \start // j + add 10, 7, 9 // J + len*2 + addi 16, 9, \next + addi 17, 10, \step + addi 18, 16, \next + addi 19, 17, \step + addi 20, 18, \next + addi 21, 19, \step + lxvd2x 32+8, 3, 10 // r[j+len] + lxvd2x 32+12, 3, 17 // r[j+len] + lxvd2x 32+16, 3, 19 // r[j+len] + lxvd2x 32+20, 3, 21 // r[j+len] + xxpermdi 32+8, 32+8, 32+8, 2 + xxpermdi 32+12, 32+12, 32+12, 2 + xxpermdi 32+16, 32+16, 32+16, 2 + xxpermdi 32+20, 32+20, 32+20, 2 + + lxvd2x 32+21, 3, 9 + lxvd2x 32+22, 3, 16 + lxvd2x 32+23, 3, 18 + lxvd2x 32+24, 3, 20 + xxpermdi 32+21, 32+21, 32+21, 2 + xxpermdi 32+22, 32+22, 32+22, 2 + xxpermdi 32+23, 32+23, 32+23, 2 + xxpermdi 32+24, 32+24, 32+24, 2 + + vsubuhm 25, 8, 21 // r[j+len] - t + vsubuhm 26, 12, 22 // r[j+len] - 
t + vsubuhm 30, 16, 23 // r[j+len] - t + vsubuhm 31, 20, 24 // r[j+len] - t + vadduhm 8, 8, 21 // r[j+len] + t + vadduhm 12, 12, 22 // r[j+len] + t + vadduhm 16, 16, 23 // r[j+len] + t + vadduhm 20, 20, 24 // r[j+len] + t +.endm + +.macro BREDUCE_4X _v0 _v1 _v2 _v3 + vxor 7, 7, 7 + xxlor 32+3, 6, 6 # V_MKQ + xxlor 32+1, 7, 7 # V_25 + xxlor 32+2, 8, 8 # V_26 + // Multiply Odd/Even signed halfword; + // Results word bound by 2^32 in abs value. + vmulosh 6, 8, V20159 + vmulesh 5, 8, V20159 + vmulosh 11, 12, V20159 + vmulesh 10, 12, V20159 + vmulosh 15, 16, V20159 + vmulesh 14, 16, V20159 + vmulosh 19, 20, V20159 + vmulesh 18, 20, V20159 + xxmrglw 32+4, 32+5, 32+6 + xxmrghw 32+5, 32+5, 32+6 + xxmrglw 32+9, 32+10, 32+11 + xxmrghw 32+10, 32+10, 32+11 + xxmrglw 32+13, 32+14, 32+15 + xxmrghw 32+14, 32+14, 32+15 + xxmrglw 32+17, 32+18, 32+19 + xxmrghw 32+18, 32+18, 32+19 + vadduwm 4, 4, V_25 + vadduwm 5, 5, V_25 + vadduwm 9, 9, V_25 + vadduwm 10, 10, V_25 + vadduwm 13, 13, V_25 + vadduwm 14, 14, V_25 + vadduwm 17, 17, V_25 + vadduwm 18, 18, V_25 + // Right shift and pack lower halfword, + // results bond to 2^16 in abs value + vsraw 4, 4, V_26 + vsraw 5, 5, V_26 + vsraw 9, 9, V_26 + vsraw 10, 10, V_26 + vsraw 13, 13, V_26 + vsraw 14, 14, V_26 + vsraw 17, 17, V_26 + vsraw 18, 18, V_26 + vpkuwum 4, 5, 4 + vsubuhm 4, 7, 4 + vpkuwum 9, 10, 9 + vsubuhm 9, 7, 9 + vpkuwum 13, 14, 13 + vsubuhm 13, 7, 13 + vpkuwum 17, 18, 17 + vsubuhm 17, 7, 17 + // Modulo multify-Low unsigned halfword; + // results bond to 2^16 * q in abs value. 
+ vmladduhm \_v0, 4, V_MKQ, 8 + vmladduhm \_v1, 9, V_MKQ, 12 + vmladduhm \_v2, 13, V_MKQ, 16 + vmladduhm \_v3, 17, V_MKQ, 20 +.endm + +//----------------------------------- +// MREDUCE_4X(len, start, _vz0, _vz1, _vz2, _vz3) +// +.macro MREDUCE_4X _vz0 _vz1 _vz2 _vz3 _vo0 _vo1 _vo2 _vo3 + // Modular multiplication bond by 2^16 * q in abs value + vmladduhm 15, 25, \_vz0, 3 + vmladduhm 20, 26, \_vz1, 3 + vmladduhm 27, 30, \_vz2, 3 + vmladduhm 28, 31, \_vz3, 3 + + // Signed multiply-high-round; outputs are bound by 2^15 * q in abs value + vmhraddshs 14, 25, \_vz0, 3 + vmhraddshs 19, 26, \_vz1, 3 + vmhraddshs 24, 30, \_vz2, 3 + vmhraddshs 29, 31, \_vz3, 3 + + vmladduhm 15, 15, V_QINV, 3 + vmladduhm 20, 20, V_QINV, 3 + vmladduhm 25, 27, V_QINV, 3 + vmladduhm 30, 28, V_QINV, 3 + + vmhraddshs 15, 15, V_NMKQ, 14 + vmhraddshs 20, 20, V_NMKQ, 19 + vmhraddshs 25, 25, V_NMKQ, 24 + vmhraddshs 30, 30, V_NMKQ, 29 + + vsrah \_vo0, 15, 4 // >> 1 + vsrah \_vo1, 20, 4 // >> 1 + vsrah \_vo2, 25, 4 // >> 1 + vsrah \_vo3, 30, 4 // >> 1 +.endm + +.macro Set_mont_consts + xxlor 32+5, 0, 0 // V_NMKQ + xxlor 32+2, 2, 2 // V_QINV + xxlor 32+3, 3, 3 // 0 + xxlor 32+4, 4, 4 // 1 +.endm + +.macro Load_next_4zetas + lxv 32+V_Z0, 0(14) + lxv 32+V_Z1, 16(14) + lxv 32+V_Z2, 32(14) + lxv 32+V_Z3, 48(14) + addi 14, 14, 64 +.endm + +.macro Write_B4C _vs0 _vs1 _vs2 _vs3 + stxvx \_vs0, 3, 9 + stxvx \_vs1, 3, 16 + stxvx \_vs2, 3, 18 + stxvx \_vs3, 3, 20 +.endm + +.macro Write_M4C _vs0 _vs1 _vs2 _vs3 + stxvx \_vs0, 3, 10 + stxvx \_vs1, 3, 17 + stxvx \_vs2, 3, 19 + stxvx \_vs3, 3, 21 +.endm + +.macro Reload_4coeffs + lxv 32+25, 0(3) + lxv 32+26, 16(3) + lxv 32+30, 32(3) + lxv 32+31, 48(3) + addi 3, 3, 64 +.endm + +.macro MWrite_8X _vs0 _vs1 _vs2 _vs3 _vs4 _vs5 _vs6 _vs7 + stxv \_vs0, -128(3) + stxv \_vs1, -112(3) + stxv \_vs2, -96(3) + stxv \_vs3, -80(3) + stxv \_vs4, -64(3) + stxv \_vs5, -48(3) + stxv \_vs6, -32(3) + stxv \_vs7, -16(3) +.endm + +.macro Write_Len2_4C _vs0 _vs1 _vs2 _vs3 + xxmrglw 32+12, 
\_vs0, 10 + xxmrghw 32+11, \_vs0, 10 + xxpermdi 10, 32+12, 32+11, 3 + xxmrglw 32+16, \_vs1, 11 + xxmrghw 32+15, \_vs1, 11 + xxpermdi 11, 32+16, 32+15, 3 + xxmrglw 32+12, \_vs2, 12 + xxmrghw 32+11, \_vs2, 12 + xxpermdi 12, 32+12, 32+11, 3 + xxmrglw 32+16, \_vs3, 13 + xxmrghw 32+15, \_vs3, 13 + xxpermdi 13, 32+16, 32+15, 3 + stxvd2x 10, 3, 9 + stxvd2x 11, 3, 16 + stxvd2x 12, 3, 18 + stxvd2x 13, 3, 20 +.endm + +.macro Write_Len4_4C _vs0 _vs1 _vs2 _vs3 + xxpermdi 10, 10, \_vs0, 3 + xxpermdi 11, 11, \_vs1, 3 + xxpermdi 12, 12, \_vs2, 3 + xxpermdi 13, 13, \_vs3, 3 + stxvd2x 10, 3, 9 + stxvd2x 11, 3, 16 + stxvd2x 12, 3, 18 + stxvd2x 13, 3, 20 +.endm + +// intt +// t = r[j]; +// r[j] = barrett_reduce(t + r[j + len]); +// r[j + len] = r[j + len] - t; +// r[j + len] = fqmul(zeta, r[j + len]); + +// +// mlk_intt_ppc(r) +// +.text +.global MLK_ASM_NAMESPACE(intt_ppc) +.balign 16 +MLK_ASM_FN_SYMBOL(intt_ppc) + + stdu 1, -352(1) + mflr 0 + std 14, 56(1) + std 15, 64(1) + std 16, 72(1) + std 17, 80(1) + std 18, 88(1) + std 19, 96(1) + std 20, 104(1) + std 21, 112(1) + stxv 32+20, 128(1) + stxv 32+21, 144(1) + stxv 32+22, 160(1) + stxv 32+23, 176(1) + stxv 32+24, 192(1) + stxv 32+25, 208(1) + stxv 32+26, 224(1) + stxv 32+27, 240(1) + stxv 32+28, 256(1) + stxv 32+29, 272(1) + stxv 32+30, 288(1) + stxv 32+31, 304(1) + + // init vectors and constants + // Setup for Montgomery reduce + lxv 0, 0(4) + + lxv 32+V_QINV, QINV_OFFSET(4) # QINV + xxlxor 32+3, 32+3, 32+3 + vspltish 4, 1 + xxlor 2, 32+2, 32+2 # QINV + xxlor 3, 32+3, 32+3 # 0 + xxlor 4, 32+4, 32+4 # 1 + + // Setup for Barrett reduce + lxv 6, Q_OFFSET(4) # V_MKQ + lxv 32+V20159, C20159_OFFSET(4) # V20159 + lxv 7, 0(4) # V_25 + + // xxspltiw 8, 26 # for power9 and above + vspltisw 8, 13 + vadduwm 8, 8, 8 + xxlor 8, 32+8, 32+8 # V_26 store at vs8 + + vspltisw 9, 1 + vsubuwm 10, 8, 9 # 25 + vslw 9, 9, 10 + xxlor 7, 32+9, 32+9 # V_25 syore at vs7 + +.align 4 + // + // 1. 
len = 2, start = 0, 4, 8, 12,...244, 248, 252 + // Update zetas vectors, each vector has 2 zetas + addi 14, 4, IZETA_NTT_OFFSET127 + li 7, 4 + li 15, 4 + mtctr 15 + li 5, 0 +intt_ppc__Loop2: + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + xxlor 10, 32+4, 32+4 + xxlor 11, 32+9, 32+9 + xxlor 12, 32+13, 32+13 + xxlor 13, 32+17, 32+17 + Set_mont_consts + Load_next_4zetas + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 + Write_Len2_4C 32+13, 32+18, 32+23, 32+28 + + addi 5, 5, 64 + + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + xxlor 10, 32+4, 32+4 + xxlor 11, 32+9, 32+9 + xxlor 12, 32+13, 32+13 + xxlor 13, 32+17, 32+17 + Set_mont_consts + Load_next_4zetas + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 + Write_Len2_4C 32+13, 32+18, 32+23, 32+28 + addi 5, 5, 64 + bdnz intt_ppc__Loop2 + +.align 4 + // 2. len = 4, start = 0, 8, 16, 24,...232, 240, 248 + addi 14, 4, IZETA_NTT_OFFSET63 + li 5, 0 + li 7, 8 + li 15, 4 # loops + mtctr 15 +intt_ppc__Loop4: + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + xxlor 10, 32+4, 32+4 + xxlor 11, 32+9, 32+9 + xxlor 12, 32+13, 32+13 + xxlor 13, 32+17, 32+17 + Set_mont_consts + Load_next_4zetas + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 + Write_Len4_4C 32+13, 32+18, 32+23, 32+28 + addi 5, 5, 64 + + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + xxlor 10, 32+4, 32+4 + xxlor 11, 32+9, 32+9 + xxlor 12, 32+13, 32+13 + xxlor 13, 32+17, 32+17 + Set_mont_consts + Load_next_4zetas + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 + Write_Len4_4C 32+13, 32+18, 32+23, 32+28 + addi 5, 5, 64 + bdnz intt_ppc__Loop4 + +.align 4 + // 3. 
len = 8, start = 0, 16, 32, 48,...208, 224, 240 + // addi 14, 14, 512 + li 7, 16 + li 5, 0 + + Load_4Coeffs 5, 32, 32 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + Load_next_4zetas + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + li 5, 128 + + Load_4Coeffs 5, 32, 32 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + Load_next_4zetas + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + li 5, 256 + + Load_4Coeffs 5, 32, 32 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + Load_next_4zetas + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + li 5, 384 + + Load_4Coeffs 5, 32, 32 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + Load_next_4zetas + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + +.align 4 + // 4. 
len = 16, start = 0, 32, 64,,...160, 192, 224 + // addi 14, 14, 768 + li 5, 0 + li 7, 32 + + Load_4Coeffs 5, 64, 64 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + Load_next_4zetas + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + li 5, 16 + Load_4Coeffs 5, 64, 64 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + addi 14, 14, -64 + Load_next_4zetas + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + + li 5, 256 + Load_4Coeffs 5, 64, 64 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + Load_next_4zetas + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + + li 5, 272 + Load_4Coeffs 5, 64, 64 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + addi 14, 14, -64 + Load_next_4zetas + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + +.align 4 + // 5. 
len = 32, start = 0, 64, 128, 192 + // addi 14, 14, 896 + li 5, 0 + li 7, 64 + + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + li 5, 128 + + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + li 5, 256 + + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + li 5, 384 + + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + +.align 4 + // 6. 
len = 64, start = 0, 128 + // addi 14, 14, 960 + li 5, 0 + li 7, 128 + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + li 5, 64 + + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + lxv 32+10, -16(14) + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + li 5, 256 + + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + li 5, 320 + + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + lxv 32+10, -16(14) + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + +.align 4 + // 7. 
len = 128, start = 0 + // addi 14, 14, 992 + li 5, 0 # start + li 7, 256 # len * 2 + + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + lvx V_ZETA, 0, 14 + xxlor 9, 32+10, 32+10 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + li 5, 64 + + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + xxlor 32+10, 9, 9 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + li 5, 128 + + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + xxlor 32+10, 9, 9 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + li 5, 192 + + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + xxlor 32+10, 9, 9 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + +.align 4 + // + // Montgomery reduce loops with constant 1441 + // + addi 14, 4, C1441_OFFSET + lvx V1441, 0, 14 + + Reload_4coeffs + MREDUCE_4X V1441, V1441, V1441, V1441, 6, 7, 8, 9 + Reload_4coeffs + MREDUCE_4X V1441, V1441, V1441, V1441, 13, 18, 23, 28 + MWrite_8X 32+6, 32+7, 32+8, 32+9, 32+13, 32+18, 32+23, 32+28 + + Reload_4coeffs + MREDUCE_4X V1441, V1441, V1441, V1441, 6, 7, 8, 9 + Reload_4coeffs + MREDUCE_4X V1441, V1441, V1441, V1441, 13, 18, 23, 28 + MWrite_8X 32+6, 32+7, 32+8, 32+9, 32+13, 32+18, 32+23, 32+28 + + Reload_4coeffs + MREDUCE_4X V1441, V1441, V1441, V1441, 6, 7, 8, 9 + Reload_4coeffs + MREDUCE_4X V1441, V1441, V1441, V1441, 13, 18, 23, 28 + MWrite_8X 32+6, 32+7, 32+8, 32+9, 32+13, 32+18, 32+23, 32+28 + + Reload_4coeffs + MREDUCE_4X V1441, V1441, V1441, V1441, 6, 7, 8, 9 + Reload_4coeffs + MREDUCE_4X V1441, V1441, V1441, V1441, 13, 18, 23, 28 + MWrite_8X 32+6, 32+7, 32+8, 32+9, 32+13, 32+18, 32+23, 32+28 + + 
lxv 32+20, 128(1) + lxv 32+21, 144(1) + lxv 32+22, 160(1) + lxv 32+23, 176(1) + lxv 32+24, 192(1) + lxv 32+25, 208(1) + lxv 32+26, 224(1) + lxv 32+27, 240(1) + lxv 32+28, 256(1) + lxv 32+29, 272(1) + lxv 32+30, 288(1) + lxv 32+31, 304(1) + ld 14, 56(1) + ld 15, 64(1) + ld 16, 72(1) + ld 17, 80(1) + ld 18, 88(1) + ld 19, 96(1) + ld 20, 104(1) + ld 21, 112(1) + + mtlr 0 + addi 1, 1, 352 + blr + +/* To facilitate single-compilation-unit (SCU) builds, undefine all macros. + * Don't modify by hand -- this is auto-generated by scripts/autogen. */ +#undef V20159 +#undef V_25 +#undef V_26 +#undef V_MKQ +#undef V_QINV +#undef V_NMKQ +#undef V_Z0 +#undef V_Z1 +#undef V_Z2 +#undef V_Z3 +#undef V_ZETA +#undef V1441 + +/* simpasm: footer-start */ +#endif /* MLK_ARITH_BACKEND_PPC64LE && !MLK_CONFIG_MULTILEVEL_NO_SHARED */ diff --git a/dev/ppc64le/src/ntt_ppc.S b/dev/ppc64le/src/ntt_ppc.S new file mode 100644 index 000000000..1f6519ff4 --- /dev/null +++ b/dev/ppc64le/src/ntt_ppc.S @@ -0,0 +1,398 @@ +/* + * Copyright (c) The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +/* + * Copyright 2025- IBM Corp. 
+ * + *=================================================================================== + * Written by Danny Tsen + * + */ + +#include "../../../common.h" +#if defined(MLK_ARITH_BACKEND_PPC64LE) && \ + !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED) +/* simpasm: header-end */ + +#include "consts.h" + +#define V_QINV 2 +#define V_NMKQ 5 +#define V_Z0 7 +#define V_Z1 8 +#define V_Z2 9 +#define V_Z3 10 +#define V_ZETA 10 + +// montgomery_reduce +// t = a * QINV +// t = (a - (int32_t)t*_MLKEM_Q) >> 16 +// +//----------------------------------- +// MREDUCE_4X(start, _vz0, _vz1, _vz2, _vz3) + +.macro MREDUCE_4X start next step _vz0 _vz1 _vz2 _vz3 + mr 9, \start + add 10, 7, 9 // J + len*2 + addi 16, 9, \next + addi 17, 10, \step + addi 18, 16, \next + addi 19, 17, \step + addi 20, 18, \next + addi 21, 19, \step + lxvd2x 32+13, 3, 10 // r[j+len] + lxvd2x 32+18, 3, 17 // r[j+len] + lxvd2x 32+23, 3, 19 // r[j+len] + lxvd2x 32+28, 3, 21 // r[j+len] + xxpermdi 32+13, 32+13, 32+13, 2 + xxpermdi 32+18, 32+18, 32+18, 2 + xxpermdi 32+23, 32+23, 32+23, 2 + xxpermdi 32+28, 32+28, 32+28, 2 + + // fqmul = zeta * coefficient + // Modular multification bond by 2^16 * q in abs value + vmladduhm 15, 13, \_vz0, 3 + vmladduhm 20, 18, \_vz1, 3 + vmladduhm 25, 23, \_vz2, 3 + vmladduhm 30, 28, \_vz3, 3 + + // Signed multiply-high-round; outputs are bound by 2^15 * q in abs value + vmhraddshs 14, 13, \_vz0, 3 + vmhraddshs 19, 18, \_vz1, 3 + vmhraddshs 24, 23, \_vz2, 3 + vmhraddshs 29, 28, \_vz3, 3 + + vmladduhm 15, 15, V_QINV, 3 + vmladduhm 20, 20, V_QINV, 3 + vmladduhm 25, 25, V_QINV, 3 + vmladduhm 30, 30, V_QINV, 3 + + vmhraddshs 15, 15, V_NMKQ, 14 + vmhraddshs 20, 20, V_NMKQ, 19 + vmhraddshs 25, 25, V_NMKQ, 24 + vmhraddshs 30, 30, V_NMKQ, 29 + + vsrah 13, 15, 4 // >> 1 + vsrah 18, 20, 4 // >> 1 + vsrah 23, 25, 4 // >> 1 + vsrah 28, 30, 4 // >> 1 + + lxvd2x 32+12, 3, 9 // r[j] + lxvd2x 32+17, 3, 16 // r[j] + lxvd2x 32+22, 3, 18 // r[j] + lxvd2x 32+27, 3, 20 // r[j] + xxpermdi 32+12, 32+12, 
32+12, 2 + xxpermdi 32+17, 32+17, 32+17, 2 + xxpermdi 32+22, 32+22, 32+22, 2 + xxpermdi 32+27, 32+27, 32+27, 2 + + // Since the result of the Montgomery multiplication is bounded + // by q in absolute value. + // Finally to complete the final update of the results with add/sub + vsubuhm 16, 12, 13 // r - t + vadduhm 15, 13, 12 // r + t + vsubuhm 21, 17, 18 // r - t + vadduhm 20, 18, 17 // r + t + vsubuhm 26, 22, 23 // r - t + vadduhm 25, 23, 22 // r + t + vsubuhm 31, 27, 28 // r - t + vadduhm 30, 28, 27 // r + t +.endm + +.macro Write_One + stxvx 32+15, 3, 9 + stxvx 32+16, 3, 10 + stxvx 32+20, 3, 16 + stxvx 32+21, 3, 17 + stxvx 32+25, 3, 18 + stxvx 32+26, 3, 19 + stxvx 32+30, 3, 20 + stxvx 32+31, 3, 21 +.endm + +.macro Write_Two + xxpermdi 32+17, 32+16, 32+15, 3 + xxpermdi 32+22, 32+21, 32+20, 3 + xxpermdi 32+27, 32+26, 32+25, 3 + xxpermdi 32+29, 32+31, 32+30, 3 + + stxvx 32+17, 3, 9 + stxvx 32+22, 3, 16 + stxvx 32+27, 3, 18 + stxvx 32+29, 3, 20 +.endm + +.macro Write_Three + xxmrglw 32+14, 32+16, 32+15 + xxmrghw 32+13, 32+16, 32+15 + xxpermdi 32+17, 32+13, 32+14, 3 + xxmrglw 32+19, 32+21, 32+20 + xxmrghw 32+18, 32+21, 32+20 + xxpermdi 32+22, 32+18, 32+19, 3 + xxmrglw 32+14, 32+26, 32+25 + xxmrghw 32+13, 32+26, 32+25 + xxpermdi 32+27, 32+13, 32+14, 3 + xxmrglw 32+24, 32+31, 32+30 + xxmrghw 32+23, 32+31, 32+30 + xxpermdi 32+29, 32+23, 32+24, 3 + stxvx 32+17, 3, 9 + stxvx 32+22, 3, 16 + stxvx 32+27, 3, 18 + stxvx 32+29, 3, 20 +.endm + +.macro Load_next_4zetas + lxv 32+V_Z0, 0(14) + lxv 32+V_Z1, 16(14) + lxv 32+V_Z2, 32(14) + lxv 32+V_Z3, 48(14) + addi 14, 14, 64 +.endm + +// +// mlk_ntt_ppc(int16_t *r) +// +.text +.global MLK_ASM_NAMESPACE(ntt_ppc) +.balign 16 +MLK_ASM_FN_SYMBOL(ntt_ppc) + + stdu 1, -352(1) + mflr 0 + std 14, 56(1) + std 15, 64(1) + std 16, 72(1) + std 17, 80(1) + std 18, 88(1) + std 19, 96(1) + std 20, 104(1) + std 21, 112(1) + stxv 32+20, 128(1) + stxv 32+21, 144(1) + stxv 32+22, 160(1) + stxv 32+23, 176(1) + stxv 32+24, 192(1) + stxv 32+25, 208(1) 
+ stxv 32+26, 224(1) + stxv 32+27, 240(1) + stxv 32+28, 256(1) + stxv 32+29, 272(1) + stxv 32+30, 288(1) + stxv 32+31, 304(1) + + // get MLKEM_Q + lvx V_NMKQ,0,4 + + // zetas array + addi 14, 4, ZETA_NTT_OFFSET + + vxor 3, 3, 3 + vspltish 4, 1 + + lxv 32+V_QINV, QINV_OFFSET(4) + +.align 4 + // + // Compute coefficients of the NTT based on the following loop. + // for (len = 128; len ≥ 2; len = len/2) + // + // 1. len = 128, start = 0 + // + li 5, 0 # start + li 7, 256 # len * 2 + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + + MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + Write_One + li 5, 64 + MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + Write_One + li 5, 128 + MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + Write_One + li 5, 192 + MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + Write_One + +.align 4 + // 2. len = 64, start = 0, 128 + // k += 2 + li 5, 0 + li 7, 128 + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + Write_One + li 5, 64 + MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + Write_One + li 5, 256 + + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + Write_One + li 5, 320 + MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + Write_One + +.align 4 + // 3. len = 32, start = 0, 64, 128, 192 + // k += 4 + li 5, 0 + li 7, 64 + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + Write_One + // li 5, 64 + li 5, 128 + + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + Write_One + // li 5, 128 + li 5, 256 + + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + Write_One + // li 5, 192 + li 5, 384 + + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + Write_One + +.align 4 + // 4. 
len = 16, start = 0, 32, 64,,...160, 192, 224 + // k += 8 + li 5, 0 + li 7, 32 + Load_next_4zetas + MREDUCE_4X 5, 64, 64, V_Z0, V_Z1, V_Z2, V_Z3 + Write_One + li 5, 16 + MREDUCE_4X 5, 64, 64, V_Z0, V_Z1, V_Z2, V_Z3 + Write_One + + Load_next_4zetas + li 5, 256 + MREDUCE_4X 5, 64, 64, V_Z0, V_Z1, V_Z2, V_Z3 + Write_One + li 5, 272 + MREDUCE_4X 5, 64, 64, V_Z0, V_Z1, V_Z2, V_Z3 + Write_One + +.align 4 + // 5. len = 8, start = 0, 16, 32, 48,...208, 224, 240 + // k += 16 + li 5, 0 + li 7, 16 + Load_next_4zetas + MREDUCE_4X 5, 32, 32, V_Z0, V_Z1, V_Z2, V_Z3 + Write_One + li 5, 128 + + Load_next_4zetas + MREDUCE_4X 5, 32, 32, V_Z0, V_Z1, V_Z2, V_Z3 + Write_One + li 5, 256 + + Load_next_4zetas + MREDUCE_4X 5, 32, 32, V_Z0, V_Z1, V_Z2, V_Z3 + Write_One + li 5, 384 + + Load_next_4zetas + MREDUCE_4X 5, 32, 32, V_Z0, V_Z1, V_Z2, V_Z3 + Write_One + + // 6. len = 4, start = 0, 8, 16, 24,...232, 240, 248 + // k += 32 + li 15, 4 # loops + mtctr 15 + li 5, 0 + li 7, 8 +.align 4 +ntt_ppc__Len4: + Load_next_4zetas + MREDUCE_4X 5, 16, 16, V_Z0, V_Z1, V_Z2, V_Z3 + Write_Two + addi 5, 5, 64 + + Load_next_4zetas + MREDUCE_4X 5, 16, 16, V_Z0, V_Z1, V_Z2, V_Z3 + Write_Two + addi 5, 5, 64 + + bdnz ntt_ppc__Len4 + + // 7. 
len = 2, start = 0, 4, 8, 12,...244, 248, 252 + // k += 64 + // Update zetas vectors, each vector has 2 zetas + + addi 14, 4, ZETA_NTT_OFFSET64 + + li 15, 4 + mtctr 15 + li 5, 0 + li 7, 4 +.align 4 +ntt_ppc__Len2: + Load_next_4zetas + MREDUCE_4X 5, 16, 16, V_Z0, V_Z1, V_Z2, V_Z3 + Write_Three + addi 5, 5, 64 + + Load_next_4zetas + MREDUCE_4X 5, 16, 16, V_Z0, V_Z1, V_Z2, V_Z3 + Write_Three + addi 5, 5, 64 + + bdnz ntt_ppc__Len2 + + lxv 32+20, 128(1) + lxv 32+21, 144(1) + lxv 32+22, 160(1) + lxv 32+23, 176(1) + lxv 32+24, 192(1) + lxv 32+25, 208(1) + lxv 32+26, 224(1) + lxv 32+27, 240(1) + lxv 32+28, 256(1) + lxv 32+29, 272(1) + lxv 32+30, 288(1) + lxv 32+31, 304(1) + ld 14, 56(1) + ld 15, 64(1) + ld 16, 72(1) + ld 17, 80(1) + ld 18, 88(1) + ld 19, 96(1) + ld 20, 104(1) + ld 21, 112(1) + + mtlr 0 + addi 1, 1, 352 + blr + +/* To facilitate single-compilation-unit (SCU) builds, undefine all macros. + * Don't modify by hand -- this is auto-generated by scripts/autogen. */ +#undef V_QINV +#undef V_NMKQ +#undef V_Z0 +#undef V_Z1 +#undef V_Z2 +#undef V_Z3 +#undef V_ZETA + +/* simpasm: footer-start */ +#endif /* MLK_ARITH_BACKEND_PPC64LE && !MLK_CONFIG_MULTILEVEL_NO_SHARED */ diff --git a/dev/ppc64le/src/poly_tomont.S b/dev/ppc64le/src/poly_tomont.S new file mode 100644 index 000000000..a8628f43c --- /dev/null +++ b/dev/ppc64le/src/poly_tomont.S @@ -0,0 +1,157 @@ +/* + * Copyright (c) The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +/* + * Copyright 2025- IBM Corp. 
+ * + *=================================================================================== + * Written by Danny Tsen + * + */ + +// Poly_tomont: Inplace conversion of all coefficients of a polynomial +// from normal domain to Montgomery domain +// +// Arguments:*r: pointer to input/output polynomial + +#include "../../../common.h" +#if defined(MLK_ARITH_BACKEND_PPC64LE) && \ + !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED) +/* simpasm: header-end */ + +#include "consts.h" + +#define V1353 0 +#define V_QINV 2 +#define V_NMKQ 5 + +// montgomery_reduce +// t = a * QINV +// t = (a - (int32_t)t*_MLKEM_Q) >> 16 +// +//----------------------------------- +// MREDUCE_4X(_v0, _v1, _v2, _v3) + +.macro MREDUCE_4X _v0 _v1 _v2 _v3 + lxvd2x 32+13, 0, 3 + addi 3, 3, 16 + lxvd2x 32+18, 0, 3 + addi 3, 3, 16 + lxvd2x 32+23, 0, 3 + addi 3, 3, 16 + lxvd2x 32+7, 0, 3 + addi 3, 3, 16 + + vmladduhm 15, 13, V1353, 3 + vmladduhm 20, 18, V1353, 3 + vmladduhm 25, 23, V1353, 3 + vmladduhm 9, 7, V1353, 3 + + vmhraddshs 14, 13, V1353, 3 + vmhraddshs 19, 18, V1353, 3 + vmhraddshs 24, 23, V1353, 3 + vmhraddshs 8, 7, V1353, 3 + + vmladduhm 15, 15, V_QINV, 3 + vmladduhm 20, 20, V_QINV, 3 + vmladduhm 25, 25, V_QINV, 3 + vmladduhm 9, 9, V_QINV, 3 + + vmhraddshs 15, 15, V_NMKQ, 14 + vmhraddshs 20, 20, V_NMKQ, 19 + vmhraddshs 25, 25, V_NMKQ, 24 + vmhraddshs 9, 9, V_NMKQ, 8 + + vsrah \_v0, 15, 4 // >> 1 + vsrah \_v1, 20, 4 // >> 1 + vsrah \_v2, 25, 4 // >> 1 + vsrah \_v3, 9, 4 // >> 1 +.endm + +.macro Write_8X + stxvd2x 32+27, 4, 3 + stxvd2x 32+28, 5, 3 + stxvd2x 32+29, 6, 3 + stxvd2x 32+30, 7, 3 + stxvd2x 32+13, 8, 3 + stxvd2x 32+18, 9, 3 + stxvd2x 32+23, 10, 3 + stxvd2x 32+7, 11, 3 +.endm + +.text +.global MLK_ASM_NAMESPACE(poly_tomont_ppc) +.balign 16 +MLK_ASM_FN_SYMBOL(poly_tomont_ppc) + stdu 1, -320(1) + mflr 0 + + stxv 32+20, 128(1) + stxv 32+21, 144(1) + stxv 32+22, 160(1) + stxv 32+23, 176(1) + stxv 32+24, 192(1) + stxv 32+25, 208(1) + stxv 32+26, 224(1) + stxv 32+27, 240(1) + stxv 32+28, 256(1) + 
stxv 32+29, 272(1) + stxv 32+30, 288(1) + + lxv 32+V_NMKQ, NQ_OFFSET(4) + lxv 32+V_QINV, QINV_OFFSET(4) + lxv 32+V1353, C1353_OFFSET(4) + + vxor 3, 3, 3 + vspltish 4, 1 + + li 4, -128 + li 5, -112 + li 6, -96 + li 7, -80 + li 8, -64 + li 9, -48 + li 10, -32 + li 11, -16 + + MREDUCE_4X 27, 28, 29, 30 + MREDUCE_4X 13, 18, 23, 7 + Write_8X + + MREDUCE_4X 27, 28, 29, 30 + MREDUCE_4X 13, 18, 23, 7 + Write_8X + + MREDUCE_4X 27, 28, 29, 30 + MREDUCE_4X 13, 18, 23, 7 + Write_8X + + MREDUCE_4X 27, 28, 29, 30 + MREDUCE_4X 13, 18, 23, 7 + Write_8X + + lxv 32+20, 128(1) + lxv 32+21, 144(1) + lxv 32+22, 160(1) + lxv 32+23, 176(1) + lxv 32+24, 192(1) + lxv 32+25, 208(1) + lxv 32+26, 224(1) + lxv 32+27, 240(1) + lxv 32+28, 256(1) + lxv 32+29, 272(1) + lxv 32+30, 288(1) + mtlr 0 + addi 1, 1, 320 + blr + +/* To facilitate single-compilation-unit (SCU) builds, undefine all macros. + * Don't modify by hand -- this is auto-generated by scripts/autogen. */ +#undef V1353 +#undef V_QINV +#undef V_NMKQ + +/* simpasm: footer-start */ +#endif /* MLK_ARITH_BACKEND_PPC64LE && !MLK_CONFIG_MULTILEVEL_NO_SHARED */ diff --git a/dev/ppc64le/src/reduce.S b/dev/ppc64le/src/reduce.S new file mode 100644 index 000000000..2ca8a5e35 --- /dev/null +++ b/dev/ppc64le/src/reduce.S @@ -0,0 +1,219 @@ +/* + * Copyright (c) The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + + +/* + * Copyright 2025- IBM Corp. 
+ * + *=================================================================================== + * Written by Danny Tsen + * + */ +#include "../../../common.h" +#if defined(MLK_ARITH_BACKEND_PPC64LE) && \ + !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED) +/* simpasm: header-end */ + +#include "consts.h" + +// poly_reduce: Applies Barrett reduction to all coefficients of a polynomial +// for details of the Barrett reduction +// +// Arguments: *r: pointer to input/output polynomial + +// Barrett reduce constatnts +#define V20159 0 +#define V_25 1 +#define V_26 2 +#define V_MKQ 3 + +.macro BREDUCE_4X _v0 _v1 _v2 _v3 + lxvd2x 32+8, 0, 3 + lxvd2x 32+12, 14, 3 + lxvd2x 32+16, 15, 3 + lxvd2x 32+20, 16, 3 + addi 3, 3, 64 + vmulosh 6, 8, V20159 + vmulesh 5, 8, V20159 + vmulosh 11, 12, V20159 + vmulesh 10, 12, V20159 + vmulosh 15, 16, V20159 + vmulesh 14, 16, V20159 + vmulosh 19, 20, V20159 + vmulesh 18, 20, V20159 + xxmrglw 32+4, 32+5, 32+6 + xxmrghw 32+5, 32+5, 32+6 + xxmrglw 32+9, 32+10, 32+11 + xxmrghw 32+10, 32+10, 32+11 + xxmrglw 32+13, 32+14, 32+15 + xxmrghw 32+14, 32+14, 32+15 + xxmrglw 32+17, 32+18, 32+19 + xxmrghw 32+18, 32+18, 32+19 + vadduwm 4, 4, V_25 + vadduwm 5, 5, V_25 + vadduwm 9, 9, V_25 + vadduwm 10, 10, V_25 + vadduwm 13, 13, V_25 + vadduwm 14, 14, V_25 + vadduwm 17, 17, V_25 + vadduwm 18, 18, V_25 + vsraw 4, 4, V_26 + vsraw 5, 5, V_26 + vsraw 9, 9, V_26 + vsraw 10, 10, V_26 + vsraw 13, 13, V_26 + vsraw 14, 14, V_26 + vsraw 17, 17, V_26 + vsraw 18, 18, V_26 + vpkuwum 4, 5, 4 + vsubuhm 4, 7, 4 + vpkuwum 9, 10, 9 + vsubuhm 9, 7, 9 + vpkuwum 13, 14, 13 + vsubuhm 13, 7, 13 + vpkuwum 17, 18, 17 + vsubuhm 17, 7, 17 + vmladduhm \_v0, 4, V_MKQ, 8 + vmladduhm \_v1, 9, V_MKQ, 12 + vmladduhm \_v2, 13, V_MKQ, 16 + vmladduhm \_v3, 17, V_MKQ, 20 +.endm + +.macro Write_8X + stxvd2x 32+21, 4, 3 + stxvd2x 32+22, 5, 3 + stxvd2x 32+23, 6, 3 + stxvd2x 32+24, 7, 3 + stxvd2x 32+4, 8, 3 + stxvd2x 32+9, 9, 3 + stxvd2x 32+13, 10, 3 + stxvd2x 32+17, 11, 3 +.endm + +// +// Conditional 
addition to get unsigned canonical representative +// +.macro To_unsigned_16 + lxv 32+12, 0(3) + lxv 32+13, 16(3) + lxv 32+14, 32(3) + lxv 32+15, 48(3) + addi 3, 3, 64 + vsrh 1, 12, 10 + vsrh 0, 13, 10 + vsrh 3, 14, 10 + vsrh 2, 15, 10 + vadduhm 7, 12, 11 + vadduhm 8, 13, 11 + vadduhm 5, 14, 11 + vadduhm 6, 15, 11 + vcmpequh 1, 1, 9 + vcmpequh 0, 0, 9 + vcmpequh 3, 3, 9 + vcmpequh 2, 2, 9 + xxsel 32+1, 32+7,32+12, 32+1 + xxsel 32+0, 32+8,32+13, 32+0 + xxsel 32+3, 32+5,32+14, 32+3 + xxsel 32+2, 32+6,32+15, 32+2 + stxv 32+3, -32(3) + stxv 32+2, -16(3) + stxv 32+1, -64(3) + stxv 32+0, -48(3) +.endm + +.text +.global MLK_ASM_NAMESPACE(reduce_ppc) +.balign 16 +MLK_ASM_FN_SYMBOL(reduce_ppc) + stdu 1, -224(1) + mflr 0 + std 14, 96(1) + std 15, 104(1) + std 16, 112(1) + stxv 32+20, 128(1) + stxv 32+21, 144(1) + stxv 32+22, 160(1) + stxv 32+23, 176(1) + stxv 32+24, 192(1) + + vxor 7, 7, 7 + + lxv 32+V_MKQ, Q_OFFSET(4) + lxv 32+V20159, C20159_OFFSET(4) + + vspltisw V_26, 13 + vadduwm V_26, V_26, V_26 + vspltisw 4, 1 + vsubuwm 5, V_26, 4 + vslw V_25, 4, 5 + + li 4, -128 + li 5, -112 + li 6, -96 + li 7, -80 + li 8, -64 + li 9, -48 + li 10, -32 + li 11, -16 + + li 14, 16 + li 15, 32 + li 16, 48 + + BREDUCE_4X 21, 22, 23, 24 + BREDUCE_4X 4, 9, 13, 17 + Write_8X + + BREDUCE_4X 21, 22, 23, 24 + BREDUCE_4X 4, 9, 13, 17 + Write_8X + + BREDUCE_4X 21, 22, 23, 24 + BREDUCE_4X 4, 9, 13, 17 + Write_8X + + BREDUCE_4X 21, 22, 23, 24 + BREDUCE_4X 4, 9, 13, 17 + Write_8X + +.align 4 + // + // To unsigned canonical + // + addi 3, 3, -512 + xxspltib 32+9 ,0 + vspltish 10, 15 + vmr 11, V_MKQ + + To_unsigned_16 + To_unsigned_16 + To_unsigned_16 + To_unsigned_16 + To_unsigned_16 + To_unsigned_16 + To_unsigned_16 + To_unsigned_16 + + ld 14, 96(1) + ld 15, 104(1) + ld 16, 112(1) + lxv 32+20, 128(1) + lxv 32+21, 144(1) + lxv 32+22, 160(1) + lxv 32+23, 176(1) + lxv 32+24, 192(1) + mtlr 0 + addi 1, 1, 224 + blr + +/* To facilitate single-compilation-unit (SCU) builds, undefine all macros. 
+ * Don't modify by hand -- this is auto-generated by scripts/autogen. */ +#undef V20159 +#undef V_25 +#undef V_26 +#undef V_MKQ + +/* simpasm: footer-start */ +#endif /* MLK_ARITH_BACKEND_PPC64LE && !MLK_CONFIG_MULTILEVEL_NO_SHARED */ diff --git a/integration/liboqs/ML-KEM-1024_META.yml b/integration/liboqs/ML-KEM-1024_META.yml index 7d8e50d4c..9c7fe672a 100644 --- a/integration/liboqs/ML-KEM-1024_META.yml +++ b/integration/liboqs/ML-KEM-1024_META.yml @@ -89,3 +89,22 @@ implementations: - Darwin required_flags: - asimd +- name: ppc64le + version: FIPS203 + folder_name: . + compile_opts: -DMLK_CONFIG_PARAMETER_SET=1024 -DMLK_CONFIG_FILE="../../integration/liboqs/config_ppc64le.h" + signature_keypair: PQCP_MLKEM_NATIVE_MLKEM1024_PPC64LE_keypair + signature_keypair_derand: PQCP_MLKEM_NATIVE_MLKEM1024_PPC64LE_keypair_derand + signature_enc: PQCP_MLKEM_NATIVE_MLKEM1024_PPC64LE_enc + signature_dec: PQCP_MLKEM_NATIVE_MLKEM1024_PPC64LE_dec + sources: integration/liboqs/config_ppc64le.h integration/liboqs/fips202_glue.h integration/liboqs/fips202x4_glue.h + mlkem/src/cbmc.h mlkem/src/common.h mlkem/src/compress.c mlkem/src/compress.h + mlkem/src/debug.c mlkem/src/debug.h mlkem/src/indcpa.c mlkem/src/indcpa.h mlkem/src/kem.c + mlkem/src/kem.h mlkem/src/native/api.h mlkem/src/native/meta.h mlkem/src/params.h + mlkem/src/poly.c mlkem/src/poly.h mlkem/src/poly_k.c mlkem/src/poly_k.h mlkem/src/randombytes.h + mlkem/src/sampling.c mlkem/src/sampling.h mlkem/src/symmetric.h mlkem/src/sys.h + mlkem/src/verify.c mlkem/src/verify.h mlkem/src/zetas.inc mlkem/src/native/ppc64le + supported_platforms: + - architecture: ppc64le + operating_systems: + - Linux diff --git a/integration/liboqs/ML-KEM-512_META.yml b/integration/liboqs/ML-KEM-512_META.yml index aa88537d3..f46dbfdbf 100644 --- a/integration/liboqs/ML-KEM-512_META.yml +++ b/integration/liboqs/ML-KEM-512_META.yml @@ -89,3 +89,22 @@ implementations: - Darwin required_flags: - asimd +- name: ppc64le + version: FIPS203 + 
folder_name: . + compile_opts: -DMLK_CONFIG_PARAMETER_SET=512 -DMLK_CONFIG_FILE="../../integration/liboqs/config_ppc64le.h" + signature_keypair: PQCP_MLKEM_NATIVE_MLKEM512_PPC64LE_keypair + signature_keypair_derand: PQCP_MLKEM_NATIVE_MLKEM512_PPC64LE_keypair_derand + signature_enc: PQCP_MLKEM_NATIVE_MLKEM512_PPC64LE_enc + signature_dec: PQCP_MLKEM_NATIVE_MLKEM512_PPC64LE_dec + sources: integration/liboqs/config_ppc64le.h integration/liboqs/fips202_glue.h integration/liboqs/fips202x4_glue.h + mlkem/src/cbmc.h mlkem/src/common.h mlkem/src/compress.c mlkem/src/compress.h + mlkem/src/debug.c mlkem/src/debug.h mlkem/src/indcpa.c mlkem/src/indcpa.h mlkem/src/kem.c + mlkem/src/kem.h mlkem/src/native/api.h mlkem/src/native/meta.h mlkem/src/params.h + mlkem/src/poly.c mlkem/src/poly.h mlkem/src/poly_k.c mlkem/src/poly_k.h mlkem/src/randombytes.h + mlkem/src/sampling.c mlkem/src/sampling.h mlkem/src/symmetric.h mlkem/src/sys.h + mlkem/src/verify.c mlkem/src/verify.h mlkem/src/zetas.inc mlkem/src/native/ppc64le + supported_platforms: + - architecture: ppc64le + operating_systems: + - Linux diff --git a/integration/liboqs/ML-KEM-768_META.yml b/integration/liboqs/ML-KEM-768_META.yml index 254d67478..1b01c4d42 100644 --- a/integration/liboqs/ML-KEM-768_META.yml +++ b/integration/liboqs/ML-KEM-768_META.yml @@ -89,3 +89,22 @@ implementations: - Darwin required_flags: - asimd +- name: ppc64le + version: FIPS203 + folder_name: . 
+ compile_opts: -DMLK_CONFIG_PARAMETER_SET=768 -DMLK_CONFIG_FILE="../../integration/liboqs/config_ppc64le.h" + signature_keypair: PQCP_MLKEM_NATIVE_MLKEM768_PPC64LE_keypair + signature_keypair_derand: PQCP_MLKEM_NATIVE_MLKEM768_PPC64LE_keypair_derand + signature_enc: PQCP_MLKEM_NATIVE_MLKEM768_PPC64LE_enc + signature_dec: PQCP_MLKEM_NATIVE_MLKEM768_PPC64LE_dec + sources: integration/liboqs/config_ppc64le.h integration/liboqs/fips202_glue.h integration/liboqs/fips202x4_glue.h + mlkem/src/cbmc.h mlkem/src/common.h mlkem/src/compress.c mlkem/src/compress.h + mlkem/src/debug.c mlkem/src/debug.h mlkem/src/indcpa.c mlkem/src/indcpa.h mlkem/src/kem.c + mlkem/src/kem.h mlkem/src/native/api.h mlkem/src/native/meta.h mlkem/src/params.h + mlkem/src/poly.c mlkem/src/poly.h mlkem/src/poly_k.c mlkem/src/poly_k.h mlkem/src/randombytes.h + mlkem/src/sampling.c mlkem/src/sampling.h mlkem/src/symmetric.h mlkem/src/sys.h + mlkem/src/verify.c mlkem/src/verify.h mlkem/src/zetas.inc mlkem/src/native/ppc64le + supported_platforms: + - architecture: ppc64le + operating_systems: + - Linux diff --git a/integration/liboqs/config_ppc64le.h b/integration/liboqs/config_ppc64le.h new file mode 100644 index 000000000..2fa1cdbcf --- /dev/null +++ b/integration/liboqs/config_ppc64le.h @@ -0,0 +1,266 @@ +/* + * Copyright (c) The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +/* References + * ========== + * + * - [FIPS140_3_IG] + * Implementation Guidance for FIPS 140-3 and the Cryptographic Module + * Validation Program National Institute of Standards and Technology + * https://csrc.nist.gov/projects/cryptographic-module-validation-program/fips-140-3-ig-announcements + */ + +#ifndef MLK_INTEGRATION_LIBOQS_CONFIG_PPC64LE_H +#define MLK_INTEGRATION_LIBOQS_CONFIG_PPC64LE_H + +/****************************************************************************** + * Name: MLK_CONFIG_PARAMETER_SET + * + * Description: Specifies the parameter set for ML-KEM + * - 
MLK_CONFIG_PARAMETER_SET=512 corresponds to ML-KEM-512 + * - MLK_CONFIG_PARAMETER_SET=768 corresponds to ML-KEM-768 + * - MLK_CONFIG_PARAMETER_SET=1024 corresponds to ML-KEM-1024 + * + * This can also be set using CFLAGS. + * + *****************************************************************************/ +#ifndef MLK_CONFIG_PARAMETER_SET +#define MLK_CONFIG_PARAMETER_SET \ + 768 /* Change this for different security strengths */ +#endif + +/****************************************************************************** + * Name: MLK_CONFIG_NAMESPACE_PREFIX + * + * Description: The prefix to use to namespace global symbols from mlkem/. + * + * In a multi-level build (that is, if either + * - MLK_CONFIG_MULTILEVEL_WITH_SHARED, or + * - MLK_CONFIG_MULTILEVEL_NO_SHARED, + * are set, level-dependent symbols will additionally be prefixed + * with the parameter set (512/768/1024). + * + * This can also be set using CFLAGS. + * + *****************************************************************************/ +#if MLK_CONFIG_PARAMETER_SET == 512 +#define MLK_CONFIG_NAMESPACE_PREFIX PQCP_MLKEM_NATIVE_MLKEM512_PPC64LE +#elif MLK_CONFIG_PARAMETER_SET == 768 +#define MLK_CONFIG_NAMESPACE_PREFIX PQCP_MLKEM_NATIVE_MLKEM768_PPC64LE +#elif MLK_CONFIG_PARAMETER_SET == 1024 +#define MLK_CONFIG_NAMESPACE_PREFIX PQCP_MLKEM_NATIVE_MLKEM1024_PPC64LE +#endif + +/****************************************************************************** + * Name: MLK_CONFIG_USE_NATIVE_BACKEND_ARITH + * + * Description: Determines whether an native arithmetic backend should be used. + * + * The arithmetic backend covers performance critical functions + * such as the number-theoretic transform (NTT). + * + * If this option is unset, the C backend will be used. + * + * If this option is set, the arithmetic backend to be use is + * determined by MLK_CONFIG_ARITH_BACKEND_FILE: If the latter is + * unset, the default backend for your the target architecture + * will be used. 
If set, it must be the name of a backend metadata + * file. + * + * This can also be set using CFLAGS. + * + *****************************************************************************/ +#define MLK_CONFIG_USE_NATIVE_BACKEND_ARITH + +/****************************************************************************** + * Name: MLK_CONFIG_ARITH_BACKEND_FILE + * + * Description: The arithmetic backend to use. + * + * If MLK_CONFIG_USE_NATIVE_BACKEND_ARITH is unset, this option + * is ignored. + * + * If MLK_CONFIG_USE_NATIVE_BACKEND_ARITH is set, this option must + * either be undefined or the filename of an arithmetic backend. + * If unset, the default backend will be used. + * + * This can be set using CFLAGS. + * + *****************************************************************************/ +#define MLK_CONFIG_ARITH_BACKEND_FILE "native/meta.h" + +/****************************************************************************** + * Name: MLK_CONFIG_FIPS202_CUSTOM_HEADER + * + * Description: Custom header to use for FIPS-202 + * + * This should only be set if you intend to use a custom + * FIPS-202 implementation, different from the one shipped + * with mlkem-native. + * + * If set, it must be the name of a file serving as the + * replacement for mlkem/fips202/fips202.h, and exposing + * the same API (see FIPS202.md). + * + *****************************************************************************/ +/* +#define MLK_CONFIG_FIPS202_CUSTOM_HEADER \ + "../../integration/liboqs/fips202_glue.h" +*/ + +/****************************************************************************** + * Name: MLK_CONFIG_FIPS202X4_CUSTOM_HEADER + * + * Description: Custom header to use for FIPS-202-X4 + * + * This should only be set if you intend to use a custom + * FIPS-202 implementation, different from the one shipped + * with mlkem-native. 
+ * + * If set, it must be the name of a file serving as the + * replacement for mlkem/fips202/fips202x4.h, and exposing + * the same API (see FIPS202.md). + * + *****************************************************************************/ +/* +#define MLK_CONFIG_FIPS202X4_CUSTOM_HEADER \ + "../../integration/liboqs/fips202x4_glue.h" +*/ + +/****************************************************************************** + * Name: MLK_CONFIG_CUSTOM_ZEROIZE + * + * Description: In compliance with FIPS 203 Section 3.3, mlkem-native zeroizes + * intermediate stack buffers before returning from function calls. + * + * Set this option and define `mlk_zeroize` if you want to + * use a custom method to zeroize intermediate stack buffers. + * The default implementation uses SecureZeroMemory on Windows + * and a memset + compiler barrier otherwise. If neither of those + * is available on the target platform, compilation will fail, + * and you will need to use MLK_CONFIG_CUSTOM_ZEROIZE to provide + * a custom implementation of `mlk_zeroize()`. + * + * WARNING: + * The explicit stack zeroization conducted by mlkem-native + * reduces the likelihood of data leaking on the stack, but + * does not eliminate it! The C standard makes no guarantee about + * where a compiler allocates structures and whether/where it makes + * copies of them. Also, in addition to entire structures, there + * may also be potentially exploitable leakage of individual values + * on the stack. + * + * If you need bullet-proof zeroization of the stack, you need to + * consider additional measures instead of of what this feature + * provides. In this case, you can set mlk_zeroize to a no-op. + * + *****************************************************************************/ +/* #define MLK_CONFIG_CUSTOM_ZEROIZE + #if !defined(__ASSEMBLER__) + #include + #include "sys.h" + static MLK_INLINE void mlk_zeroize(void *ptr, size_t len) + { + ... your implementation ... 
+ } + #endif +*/ + +/****************************************************************************** + * Name: MLK_CONFIG_CUSTOM_RANDOMBYTES + * + * Description: mlkem-native does not provide a secure randombytes + * implementation. Such an implementation has to provided by the + * consumer. + * + * If this option is not set, mlkem-native expects a function + * void randombytes(uint8_t *out, size_t outlen). + * + * Set this option and define `mlk_randombytes` if you want to + * use a custom method to sample randombytes with a different name + * or signature. + * + *****************************************************************************/ +#define MLK_CONFIG_CUSTOM_RANDOMBYTES +#if !defined(__ASSEMBLER__) +#include +#include +#include "../../mlkem/src/sys.h" +static MLK_INLINE void mlk_randombytes(uint8_t *ptr, size_t len) +{ + OQS_randombytes(ptr, len); +} +#endif /* !__ASSEMBLER__ */ + +/****************************************************************************** + * Name: MLK_CONFIG_NO_ASM + * + * Description: If this option is set, mlkem-native will be built without + * use of native code or inline assembly. + * + * By default, inline assembly is used to implement value barriers. + * Without inline assembly, mlkem-native will use a global volatile + * 'opt blocker' instead; see verify.h. + * + * Inline assembly is also used to implement a secure zeroization + * function on non-Windows platforms. If this option is set and + * the target platform is not Windows, you MUST set + * MLK_CONFIG_CUSTOM_ZEROIZE and provide a custom zeroization + * function. + * + * If this option is set, MLK_CONFIG_USE_NATIVE_BACKEND_FIPS202 and + * and MLK_CONFIG_USE_NATIVE_BACKEND_ARITH will be ignored, and no + *native backends will be used. 
+ * + *****************************************************************************/ +/* #define MLK_CONFIG_NO_ASM */ + +/****************************************************************************** + * Name: MLK_CONFIG_KEYGEN_PCT + * + * Description: Compliance with @[FIPS140_3_IG, p.87] requires a + * Pairwise Consistency Test (PCT) to be carried out on a freshly + * generated keypair before it can be exported. + * + * Set this option if such a check should be implemented. + * In this case, crypto_kem_keypair_derand and crypto_kem_keypair + * will return a non-zero error code if the PCT failed. + * + * NOTE: This feature will drastically lower the performance of + * key generation. + * + *****************************************************************************/ +/* #define MLK_CONFIG_KEYGEN_PCT */ + +/****************************************************************************** + * Name: MLK_CONFIG_KEYGEN_PCT_BREAKAGE_TEST + * + * Description: If this option is set, the user must provide a runtime + * function `static inline int mlk_break_pct() { ... }` to + * indicate whether the PCT should be made fail. + * + * This option only has an effect if MLK_CONFIG_KEYGEN_PCT is set. + * + *****************************************************************************/ +/* #define MLK_CONFIG_KEYGEN_PCT_BREAKAGE_TEST + #if !defined(__ASSEMBLER__) + #include "sys.h" + static MLK_INLINE int mlk_break_pct(void) + { + ... return 0/1 depending on whether PCT should be broken ... + } + #endif +*/ + +/* Enable valgrind-based assertions in mlkem-native through macro + * from libOQS. 
*/ +#if !defined(__ASSEMBLER__) +#include +#if defined(OQS_ENABLE_TEST_CONSTANT_TIME) +#define MLK_CONFIG_CT_TESTING_ENABLED +#endif +#endif /* !__ASSEMBLER__ */ + +#endif /* !MLK_INTEGRATION_LIBOQS_CONFIG_PPC64LE_H */ diff --git a/mlkem/mlkem_native.S b/mlkem/mlkem_native.S index bff040079..46bcb9772 100644 --- a/mlkem/mlkem_native.S +++ b/mlkem/mlkem_native.S @@ -85,6 +85,12 @@ #include "mlkem/src/native/x86_64/src/rej_uniform_asm.S" #include "mlkem/src/native/x86_64/src/tomont.S" #endif /* MLK_SYS_X86_64 */ +#if defined(MLK_SYS_PPC64LE) +#include "mlkem/src/native/ppc64le/src/intt_ppc.S" +#include "mlkem/src/native/ppc64le/src/ntt_ppc.S" +#include "mlkem/src/native/ppc64le/src/poly_tomont.S" +#include "mlkem/src/native/ppc64le/src/reduce.S" +#endif /* MLK_SYS_PPC64LE */ #endif /* MLK_CONFIG_USE_NATIVE_BACKEND_ARITH */ #if defined(MLK_CONFIG_USE_NATIVE_BACKEND_FIPS202) @@ -458,6 +464,33 @@ #undef MLK_NTT_BOUND /* mlkem/src/native/meta.h */ #undef MLK_NATIVE_META_H +/* mlkem/src/native/ppc64le/meta.h */ +#undef MLK_ARITH_BACKEND_NAME +#undef MLK_ARITH_BACKEND_PPC64LE +#undef MLK_NATIVE_PPC64LE_META_H +#undef MLK_USE_NATIVE_INTT +#undef MLK_USE_NATIVE_NTT +#undef MLK_USE_NATIVE_POLY_REDUCE +#undef MLK_USE_NATIVE_POLY_TOMONT +/* mlkem/src/native/ppc64le/src/arith_native_ppc64le.h */ +#undef MLK_NATIVE_PPC64LE_SRC_ARITH_NATIVE_PPC64LE_H +#undef mlk_intt_ppc +#undef mlk_ntt_ppc +#undef mlk_poly_tomont_ppc +#undef mlk_reduce_ppc +/* mlkem/src/native/ppc64le/src/consts.h */ +#undef C1353_OFFSET +#undef C1441_OFFSET +#undef C20159_OFFSET +#undef IZETA_NTT_OFFSET127 +#undef IZETA_NTT_OFFSET63 +#undef MLK_NATIVE_PPC64LE_SRC_CONSTS_H +#undef NQ_OFFSET +#undef QINV_OFFSET +#undef Q_OFFSET +#undef ZETA_NTT_OFFSET +#undef ZETA_NTT_OFFSET64 +#undef mlk_ppc_qdata #if defined(MLK_SYS_AARCH64) /* * Undefine macros from native code (Arith, AArch64) diff --git a/mlkem/mlkem_native.c b/mlkem/mlkem_native.c index 74c1f9387..0ee436f33 100644 --- a/mlkem/mlkem_native.c +++ 
b/mlkem/mlkem_native.c @@ -84,6 +84,9 @@ #include "src/native/x86_64/src/consts.c" #include "src/native/x86_64/src/rej_uniform_table.c" #endif +#if defined(MLK_SYS_PPC64LE) +#include "src/native/ppc64le/src/consts.c" +#endif #endif /* MLK_CONFIG_USE_NATIVE_BACKEND_ARITH */ #if defined(MLK_CONFIG_USE_NATIVE_BACKEND_FIPS202) @@ -445,6 +448,33 @@ #undef MLK_NTT_BOUND /* mlkem/src/native/meta.h */ #undef MLK_NATIVE_META_H +/* mlkem/src/native/ppc64le/meta.h */ +#undef MLK_ARITH_BACKEND_NAME +#undef MLK_ARITH_BACKEND_PPC64LE +#undef MLK_NATIVE_PPC64LE_META_H +#undef MLK_USE_NATIVE_INTT +#undef MLK_USE_NATIVE_NTT +#undef MLK_USE_NATIVE_POLY_REDUCE +#undef MLK_USE_NATIVE_POLY_TOMONT +/* mlkem/src/native/ppc64le/src/arith_native_ppc64le.h */ +#undef MLK_NATIVE_PPC64LE_SRC_ARITH_NATIVE_PPC64LE_H +#undef mlk_intt_ppc +#undef mlk_ntt_ppc +#undef mlk_poly_tomont_ppc +#undef mlk_reduce_ppc +/* mlkem/src/native/ppc64le/src/consts.h */ +#undef C1353_OFFSET +#undef C1441_OFFSET +#undef C20159_OFFSET +#undef IZETA_NTT_OFFSET127 +#undef IZETA_NTT_OFFSET63 +#undef MLK_NATIVE_PPC64LE_SRC_CONSTS_H +#undef NQ_OFFSET +#undef QINV_OFFSET +#undef Q_OFFSET +#undef ZETA_NTT_OFFSET +#undef ZETA_NTT_OFFSET64 +#undef mlk_ppc_qdata #if defined(MLK_SYS_AARCH64) /* * Undefine macros from native code (Arith, AArch64) diff --git a/mlkem/src/native/meta.h b/mlkem/src/native/meta.h index f2b9b848b..e39188323 100644 --- a/mlkem/src/native/meta.h +++ b/mlkem/src/native/meta.h @@ -18,4 +18,8 @@ #include "x86_64/meta.h" #endif +#ifdef MLK_SYS_PPC64LE +#include "ppc64le/meta.h" +#endif + #endif /* !MLK_NATIVE_META_H */ diff --git a/mlkem/src/native/ppc64le/README.md b/mlkem/src/native/ppc64le/README.md new file mode 100644 index 000000000..5125a40ea --- /dev/null +++ b/mlkem/src/native/ppc64le/README.md @@ -0,0 +1,6 @@ +[//]: # (SPDX-License-Identifier: CC-BY-4.0) + +# ppc64le backend (little endian) + +This directory contains a native backend for little endian POWER 8 (ppc64le) and above systems. 
+ diff --git a/mlkem/src/native/ppc64le/meta.h b/mlkem/src/native/ppc64le/meta.h new file mode 100644 index 000000000..c5694f9c2 --- /dev/null +++ b/mlkem/src/native/ppc64le/meta.h @@ -0,0 +1,53 @@ +/* + * Copyright (c) The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +#ifndef MLK_NATIVE_PPC64LE_META_H +#define MLK_NATIVE_PPC64LE_META_H + +/* Identifier for this backend so that source and assembly files + * in the build can be appropriately guarded. */ +#define MLK_ARITH_BACKEND_PPC64LE + +#define MLK_ARITH_BACKEND_NAME PPC64LE_DEFAULT + +/* Set of primitives that this backend replaces */ +#define MLK_USE_NATIVE_NTT +#define MLK_USE_NATIVE_INTT +#define MLK_USE_NATIVE_POLY_REDUCE +#define MLK_USE_NATIVE_POLY_TOMONT + +#if !defined(__ASSEMBLER__) +#include <stdint.h> +#include "../../common.h" +#include "../../params.h" +#include "../api.h" +#include "src/arith_native_ppc64le.h" + +static MLK_INLINE int mlk_ntt_native(int16_t data[MLKEM_N]) +{ + mlk_ntt_ppc(data, mlk_ppc_qdata); + return MLK_NATIVE_FUNC_SUCCESS; +} + +static MLK_INLINE int mlk_intt_native(int16_t data[MLKEM_N]) +{ + mlk_intt_ppc(data, mlk_ppc_qdata); + return MLK_NATIVE_FUNC_SUCCESS; +} + +static MLK_INLINE int mlk_poly_reduce_native(int16_t data[MLKEM_N]) +{ + mlk_reduce_ppc(data, mlk_ppc_qdata); + return MLK_NATIVE_FUNC_SUCCESS; +} + +static MLK_INLINE int mlk_poly_tomont_native(int16_t data[MLKEM_N]) +{ + mlk_poly_tomont_ppc(data, mlk_ppc_qdata); + return MLK_NATIVE_FUNC_SUCCESS; +} +#endif /* !__ASSEMBLER__ */ + +#endif /* !MLK_NATIVE_PPC64LE_META_H */ diff --git a/mlkem/src/native/ppc64le/src/arith_native_ppc64le.h b/mlkem/src/native/ppc64le/src/arith_native_ppc64le.h new file mode 100644 index 000000000..7ab3226c4 --- /dev/null +++ b/mlkem/src/native/ppc64le/src/arith_native_ppc64le.h @@ -0,0 +1,24 @@ +/* + * Copyright (c) The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ +#ifndef
MLK_NATIVE_PPC64LE_SRC_ARITH_NATIVE_PPC64LE_H +#define MLK_NATIVE_PPC64LE_SRC_ARITH_NATIVE_PPC64LE_H + +#include <stdint.h> +#include "../../../common.h" +#include "consts.h" + +#define mlk_ntt_ppc MLK_NAMESPACE(ntt_ppc) +void mlk_ntt_ppc(int16_t *, const int16_t *); + +#define mlk_intt_ppc MLK_NAMESPACE(intt_ppc) +void mlk_intt_ppc(int16_t *, const int16_t *); + +#define mlk_reduce_ppc MLK_NAMESPACE(reduce_ppc) +void mlk_reduce_ppc(int16_t *r, const int16_t *); + +#define mlk_poly_tomont_ppc MLK_NAMESPACE(poly_tomont_ppc) +void mlk_poly_tomont_ppc(int16_t *, const int16_t *); + +#endif /* !MLK_NATIVE_PPC64LE_SRC_ARITH_NATIVE_PPC64LE_H */ diff --git a/mlkem/src/native/ppc64le/src/consts.c b/mlkem/src/native/ppc64le/src/consts.c new file mode 100644 index 000000000..c9c869a60 --- /dev/null +++ b/mlkem/src/native/ppc64le/src/consts.c @@ -0,0 +1,77 @@ +/* + * Copyright (c) The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +#include "../../../common.h" + +#if defined(MLK_ARITH_BACKEND_PPC64LE) && \ + !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED) + +#include "consts.h" + +MLK_ALIGN const int16_t mlk_ppc_qdata[] = { + /* -Q */ + /* check-magic: -3329 == -1 * MLKEM_Q */ + -3329, + -3329, + -3329, + -3329, + -3329, + -3329, + -3329, + -3329, + /* QINV */ + /* check-magic: -3327 == pow(MLKEM_Q,-1,2^16) */ + -3327, + -3327, + -3327, + -3327, + -3327, + -3327, + -3327, + -3327, + /* Q */ + 3329, + 3329, + 3329, + 3329, + 3329, + 3329, + 3329, + 3329, + /* check-magic: 20159 == round(2^26 / MLKEM_Q) */ + 20159, + 20159, + 20159, + 20159, + 20159, + 20159, + 20159, + 20159, + /* check-magic: 1441 == pow(2,32-7,MLKEM_Q) */ + 1441, + 1441, + 1441, + 1441, + 1441, + 1441, + 1441, + 1441, + /* check-magic: 1353 == pow(2, 32, MLKEM_Q) */ + 1353, + 1353, + 1353, + 1353, + 1353, + 1353, + 1353, + 1353, +/* zetas for NTT */ +#include "consts_ntt.inc" + , +/* zetas for invNTT */ +#include "consts_intt.inc" +}; + +#endif /* MLK_ARITH_BACKEND_PPC64LE &&
!MLK_CONFIG_MULTILEVEL_NO_SHARED */ diff --git a/mlkem/src/native/ppc64le/src/consts.h b/mlkem/src/native/ppc64le/src/consts.h new file mode 100644 index 000000000..38246248d --- /dev/null +++ b/mlkem/src/native/ppc64le/src/consts.h @@ -0,0 +1,29 @@ +/* + * Copyright (c) The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +#ifndef MLK_NATIVE_PPC64LE_SRC_CONSTS_H +#define MLK_NATIVE_PPC64LE_SRC_CONSTS_H +#include "../../../common.h" + +/* Offsets into the constant table */ +/* check-magic: off */ +#define NQ_OFFSET 0 +#define QINV_OFFSET 16 +#define Q_OFFSET 32 +#define C20159_OFFSET 48 +#define C1441_OFFSET 64 +#define C1353_OFFSET 80 +#define ZETA_NTT_OFFSET 96 +#define ZETA_NTT_OFFSET64 1104 +#define IZETA_NTT_OFFSET127 1616 +#define IZETA_NTT_OFFSET63 2128 +/* check-magic: on */ + +#ifndef __ASSEMBLER__ +#define mlk_ppc_qdata MLK_NAMESPACE(ppc_qdata) +extern const int16_t mlk_ppc_qdata[]; +#endif + +#endif /* !MLK_NATIVE_PPC64LE_SRC_CONSTS_H */ diff --git a/mlkem/src/native/ppc64le/src/consts_intt.inc b/mlkem/src/native/ppc64le/src/consts_intt.inc new file mode 100644 index 000000000..7cd95fcd0 --- /dev/null +++ b/mlkem/src/native/ppc64le/src/consts_intt.inc @@ -0,0 +1,90 @@ +/* + * Copyright (c) The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + + /* For intt Len=2, offset IZETA_NTT_OFFSET64 */ + -1103, -1103, -1103, -1103, 430, 430, 430, 430, 555, 555, 555, 555, 843, + 843, 843, 843, -1251, -1251, -1251, -1251, 871, 871, 871, 871, 1550, 1550, + 1550, 1550, 105, 105, 105, 105, 422, 422, 422, 422, 587, 587, 587, 587, 177, + 177, 177, 177, -235, -235, -235, -235, -291, -291, -291, -291, -460, -460, + -460, -460, 1574, 1574, 1574, 1574, 1653, 1653, 1653, 1653, -246, -246, + -246, -246, 778, 778, 778, 778, 1159, 1159, 1159, 1159, -147, -147, -147, + -147, -777, -777, -777, -777, 1483, 1483, 1483, 1483, -602, -602, -602, + -602, 1119, 1119, 1119, 1119, -1590, -1590, -1590, -1590, 
644, 644, 644, + 644, -872, -872, -872, -872, 349, 349, 349, 349, 418, 418, 418, 418, 329, + 329, 329, 329, -156, -156, -156, -156, -75, -75, -75, -75, 817, 817, 817, + 817, 1097, 1097, 1097, 1097, 603, 603, 603, 603, 610, 610, 610, 610, 1322, + 1322, 1322, 1322, -1285, -1285, -1285, -1285, -1465, -1465, -1465, -1465, + 384, 384, 384, 384, -1215, -1215, -1215, -1215, -136, -136, -136, -136, + 1218, 1218, 1218, 1218, -1335, -1335, -1335, -1335, -874, -874, -874, -874, + 220, 220, 220, 220, -1187, -1187, -1187, -1187, -1659, -1659, -1659, -1659, + -1185, -1185, -1185, -1185, -1530, -1530, -1530, -1530, -1278, -1278, -1278, + -1278, 794, 794, 794, 794, -1510, -1510, -1510, -1510, -854, -854, -854, + -854, -870, -870, -870, -870, 478, 478, 478, 478, -108, -108, -108, -108, + -308, -308, -308, -308, 996, 996, 996, 996, 991, 991, 991, 991, 958, 958, + 958, 958, -1460, -1460, -1460, -1460, 1522, 1522, 1522, 1522, 1628, 1628, + 1628, 1628, + /* For intt Len=2, offset IZETA_NTT_OFFSET127 */ + 1628, 1628, 1628, 1628, 1522, 1522, 1522, 1522, -1460, -1460, -1460, -1460, + 958, 958, 958, 958, 991, 991, 991, 991, 996, 996, 996, 996, -308, -308, + -308, -308, -108, -108, -108, -108, 478, 478, 478, 478, -870, -870, -870, + -870, -854, -854, -854, -854, -1510, -1510, -1510, -1510, 794, 794, 794, + 794, -1278, -1278, -1278, -1278, -1530, -1530, -1530, -1530, -1185, -1185, + -1185, -1185, -1659, -1659, -1659, -1659, -1187, -1187, -1187, -1187, 220, + 220, 220, 220, -874, -874, -874, -874, -1335, -1335, -1335, -1335, 1218, + 1218, 1218, 1218, -136, -136, -136, -136, -1215, -1215, -1215, -1215, 384, + 384, 384, 384, -1465, -1465, -1465, -1465, -1285, -1285, -1285, -1285, 1322, + 1322, 1322, 1322, 610, 610, 610, 610, 603, 603, 603, 603, 1097, 1097, 1097, + 1097, 817, 817, 817, 817, -75, -75, -75, -75, -156, -156, -156, -156, 329, + 329, 329, 329, 418, 418, 418, 418, 349, 349, 349, 349, -872, -872, -872, + -872, 644, 644, 644, 644, -1590, -1590, -1590, -1590, 1119, 1119, 1119, + 1119, 
-602, -602, -602, -602, 1483, 1483, 1483, 1483, -777, -777, -777, + -777, -147, -147, -147, -147, 1159, 1159, 1159, 1159, 778, 778, 778, 778, + -246, -246, -246, -246, 1653, 1653, 1653, 1653, 1574, 1574, 1574, 1574, + -460, -460, -460, -460, -291, -291, -291, -291, -235, -235, -235, -235, 177, + 177, 177, 177, 587, 587, 587, 587, 422, 422, 422, 422, 105, 105, 105, 105, + 1550, 1550, 1550, 1550, 871, 871, 871, 871, -1251, -1251, -1251, -1251, 843, + 843, 843, 843, 555, 555, 555, 555, 430, 430, 430, 430, -1103, -1103, -1103, + -1103, + /* For intt Len=4 and others, offset IZETA_NTT_OFFSET63 */ + -1275, -1275, -1275, -1275, -1275, -1275, -1275, -1275, 677, 677, 677, 677, + 677, 677, 677, 677, -1065, -1065, -1065, -1065, -1065, -1065, -1065, -1065, + 448, 448, 448, 448, 448, 448, 448, 448, -725, -725, -725, -725, -725, -725, + -725, -725, -1508, -1508, -1508, -1508, -1508, -1508, -1508, -1508, 961, + 961, 961, 961, 961, 961, 961, 961, -398, -398, -398, -398, -398, -398, -398, + -398, -951, -951, -951, -951, -951, -951, -951, -951, -247, -247, -247, + -247, -247, -247, -247, -247, -1421, -1421, -1421, -1421, -1421, -1421, + -1421, -1421, 107, 107, 107, 107, 107, 107, 107, 107, 830, 830, 830, 830, + 830, 830, 830, 830, -271, -271, -271, -271, -271, -271, -271, -271, -90, + -90, -90, -90, -90, -90, -90, -90, -853, -853, -853, -853, -853, -853, -853, + -853, 1469, 1469, 1469, 1469, 1469, 1469, 1469, 1469, 126, 126, 126, 126, + 126, 126, 126, 126, -1162, -1162, -1162, -1162, -1162, -1162, -1162, -1162, + -1618, -1618, -1618, -1618, -1618, -1618, -1618, -1618, -666, -666, -666, + -666, -666, -666, -666, -666, -320, -320, -320, -320, -320, -320, -320, + -320, -8, -8, -8, -8, -8, -8, -8, -8, 516, 516, 516, 516, 516, 516, 516, + 516, -1544, -1544, -1544, -1544, -1544, -1544, -1544, -1544, -282, -282, + -282, -282, -282, -282, -282, -282, 1491, 1491, 1491, 1491, 1491, 1491, + 1491, 1491, -1293, -1293, -1293, -1293, -1293, -1293, -1293, -1293, 1015, + 1015, 1015, 1015, 1015, 
1015, 1015, 1015, -552, -552, -552, -552, -552, + -552, -552, -552, 652, 652, 652, 652, 652, 652, 652, 652, 1223, 1223, 1223, + 1223, 1223, 1223, 1223, 1223, -1571, -1571, -1571, -1571, -1571, -1571, + -1571, -1571, -205, -205, -205, -205, -205, -205, -205, -205, 411, 411, 411, + 411, 411, 411, 411, 411, -1542, -1542, -1542, -1542, -1542, -1542, -1542, + -1542, 608, 608, 608, 608, 608, 608, 608, 608, 732, 732, 732, 732, 732, 732, + 732, 732, 1017, 1017, 1017, 1017, 1017, 1017, 1017, 1017, -681, -681, -681, + -681, -681, -681, -681, -681, -130, -130, -130, -130, -130, -130, -130, + -130, -1602, -1602, -1602, -1602, -1602, -1602, -1602, -1602, 1458, 1458, + 1458, 1458, 1458, 1458, 1458, 1458, -829, -829, -829, -829, -829, -829, + -829, -829, 383, 383, 383, 383, 383, 383, 383, 383, 264, 264, 264, 264, 264, + 264, 264, 264, -1325, -1325, -1325, -1325, -1325, -1325, -1325, -1325, 573, + 573, 573, 573, 573, 573, 573, 573, 1468, 1468, 1468, 1468, 1468, 1468, 1468, + 1468, -1474, -1474, -1474, -1474, -1474, -1474, -1474, -1474, -1202, -1202, + -1202, -1202, -1202, -1202, -1202, -1202, 962, 962, 962, 962, 962, 962, 962, + 962, 182, 182, 182, 182, 182, 182, 182, 182, 1577, 1577, 1577, 1577, 1577, + 1577, 1577, 1577, 622, 622, 622, 622, 622, 622, 622, 622, -171, -171, -171, + -171, -171, -171, -171, -171, 202, 202, 202, 202, 202, 202, 202, 202, 287, + 287, 287, 287, 287, 287, 287, 287, 1422, 1422, 1422, 1422, 1422, 1422, 1422, + 1422, 1493, 1493, 1493, 1493, 1493, 1493, 1493, 1493, -1517, -1517, -1517, + -1517, -1517, -1517, -1517, -1517, -359, -359, -359, -359, -359, -359, -359, + -359, -758, -758, -758, -758, -758, -758, -758, -758 diff --git a/mlkem/src/native/ppc64le/src/consts_ntt.inc b/mlkem/src/native/ppc64le/src/consts_ntt.inc new file mode 100644 index 000000000..bfb64e722 --- /dev/null +++ b/mlkem/src/native/ppc64le/src/consts_ntt.inc @@ -0,0 +1,45 @@ +/* + * Copyright (c) The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + 
/* For ntt Len=128, offset 96 */ + -758, -758, -758, -758, -758, -758, -758, -758, -359, -359, -359, -359, + -359, -359, -359, -359, -1517, -1517, -1517, -1517, -1517, -1517, -1517, + -1517, 1493, 1493, 1493, 1493, 1493, 1493, 1493, 1493, 1422, 1422, 1422, + 1422, 1422, 1422, 1422, 1422, 287, 287, 287, 287, 287, 287, 287, 287, 202, + 202, 202, 202, 202, 202, 202, 202, -171, -171, -171, -171, -171, -171, -171, + -171, 622, 622, 622, 622, 622, 622, 622, 622, 1577, 1577, 1577, 1577, 1577, + 1577, 1577, 1577, 182, 182, 182, 182, 182, 182, 182, 182, 962, 962, 962, + 962, 962, 962, 962, 962, -1202, -1202, -1202, -1202, -1202, -1202, -1202, + -1202, -1474, -1474, -1474, -1474, -1474, -1474, -1474, -1474, 1468, 1468, + 1468, 1468, 1468, 1468, 1468, 1468, 573, 573, 573, 573, 573, 573, 573, 573, + -1325, -1325, -1325, -1325, -1325, -1325, -1325, -1325, 264, 264, 264, 264, + 264, 264, 264, 264, 383, 383, 383, 383, 383, 383, 383, 383, -829, -829, + -829, -829, -829, -829, -829, -829, 1458, 1458, 1458, 1458, 1458, 1458, + 1458, 1458, -1602, -1602, -1602, -1602, -1602, -1602, -1602, -1602, -130, + -130, -130, -130, -130, -130, -130, -130, -681, -681, -681, -681, -681, + -681, -681, -681, 1017, 1017, 1017, 1017, 1017, 1017, 1017, 1017, 732, 732, + 732, 732, 732, 732, 732, 732, 608, 608, 608, 608, 608, 608, 608, 608, -1542, + -1542, -1542, -1542, -1542, -1542, -1542, -1542, 411, 411, 411, 411, 411, + 411, 411, 411, -205, -205, -205, -205, -205, -205, -205, -205, -1571, -1571, + -1571, -1571, -1571, -1571, -1571, -1571, 1223, 1223, 1223, 1223, 1223, + 1223, 1223, 1223, 652, 652, 652, 652, 652, 652, 652, 652, -552, -552, -552, + -552, -552, -552, -552, -552, 1015, 1015, 1015, 1015, 1015, 1015, 1015, + 1015, -1293, -1293, -1293, -1293, -1293, -1293, -1293, -1293, 1491, 1491, + 1491, 1491, 1491, 1491, 1491, 1491, -282, -282, -282, -282, -282, -282, + -282, -282, -1544, -1544, -1544, -1544, -1544, -1544, -1544, -1544, 516, + 516, 516, 516, 516, 516, 516, 516, -8, -8, -8, -8, -8, -8, 
-8, -8, -320, + -320, -320, -320, -320, -320, -320, -320, -666, -666, -666, -666, -666, + -666, -666, -666, -1618, -1618, -1618, -1618, -1618, -1618, -1618, -1618, + -1162, -1162, -1162, -1162, -1162, -1162, -1162, -1162, 126, 126, 126, 126, + 126, 126, 126, 126, 1469, 1469, 1469, 1469, 1469, 1469, 1469, 1469, -853, + -853, -853, -853, -853, -853, -853, -853, -90, -90, -90, -90, -90, -90, -90, + -90, -271, -271, -271, -271, -271, -271, -271, -271, 830, 830, 830, 830, + 830, 830, 830, 830, 107, 107, 107, 107, 107, 107, 107, 107, -1421, -1421, + -1421, -1421, -1421, -1421, -1421, -1421, -247, -247, -247, -247, -247, + -247, -247, -247, -951, -951, -951, -951, -951, -951, -951, -951, -398, + -398, -398, -398, -398, -398, -398, -398, 961, 961, 961, 961, 961, 961, 961, + 961, -1508, -1508, -1508, -1508, -1508, -1508, -1508, -1508, -725, -725, + -725, -725, -725, -725, -725, -725, 448, 448, 448, 448, 448, 448, 448, 448, + -1065, -1065, -1065, -1065, -1065, -1065, -1065, -1065, 677, 677, 677, 677, + 677, 677, 677, 677, -1275, -1275, -1275, -1275, -1275, -1275, -1275, -1275 diff --git a/mlkem/src/native/ppc64le/src/intt_ppc.S b/mlkem/src/native/ppc64le/src/intt_ppc.S new file mode 100644 index 000000000..de8b634a4 --- /dev/null +++ b/mlkem/src/native/ppc64le/src/intt_ppc.S @@ -0,0 +1,3190 @@ +/* + * Copyright (c) The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +/* + * Copyright 2025- IBM Corp. + * + *=================================================================================== + * Written by Danny Tsen + * + */ + +#include "../../../common.h" +#if defined(MLK_ARITH_BACKEND_PPC64LE) && \ + !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED) + +/* + * WARNING: This file is auto-derived from the mlkem-native source file + * dev/ppc64le/src/intt_ppc.S using scripts/simpasm. Do not modify it directly. 
+ */ + + +.text +.balign 16 +.global MLK_ASM_NAMESPACE(intt_ppc) +MLK_ASM_FN_SYMBOL(intt_ppc) + + .cfi_startproc + stdu 1, -352(1) + mflr 0 + std 14, 56(1) + std 15, 64(1) + std 16, 72(1) + std 17, 80(1) + std 18, 88(1) + std 19, 96(1) + std 20, 104(1) + std 21, 112(1) + stxv 52, 128(1) + stxv 53, 144(1) + stxv 54, 160(1) + stxv 55, 176(1) + stxv 56, 192(1) + stxv 57, 208(1) + stxv 58, 224(1) + stxv 59, 240(1) + stxv 60, 256(1) + stxv 61, 272(1) + stxv 62, 288(1) + stxv 63, 304(1) + lxv 0, 0(4) + lxv 34, 16(4) + xxlxor 35, 35, 35 + vspltish 4, 1 + xxlor 2, 34, 34 + xxlor 3, 35, 35 + xxlor 4, 36, 36 + lxv 6, 32(4) + lxv 32, 48(4) + lxv 7, 0(4) + vspltisw 8, 13 + vadduwm 8, 8, 8 + xxlor 8, 40, 40 + vspltisw 9, 1 + vsubuwm 10, 8, 9 + vslw 9, 9, 10 + xxlor 7, 41, 41 + ori 2, 2, 0 + addi 14, 4, 1616 + li 7, 4 + li 15, 4 + mtctr 15 + li 5, 0 + +Lintt_ppc__Loop2: + mr 9, 5 + add 10, 7, 9 + addi 16, 9, 16 + addi 17, 10, 16 + addi 18, 16, 16 + addi 19, 17, 16 + addi 20, 18, 16 + addi 21, 19, 16 + lxvd2x 40, 3, 10 + lxvd2x 44, 3, 17 + lxvd2x 48, 3, 19 + lxvd2x 52, 3, 21 + xxswapd 40, 40 + xxswapd 44, 44 + xxswapd 48, 48 + xxswapd 52, 52 + lxvd2x 53, 3, 9 + lxvd2x 54, 3, 16 + lxvd2x 55, 3, 18 + lxvd2x 56, 3, 20 + xxswapd 53, 53 + xxswapd 54, 54 + xxswapd 55, 55 + xxswapd 56, 56 + vsubuhm 25, 8, 21 + vsubuhm 26, 12, 22 + vsubuhm 30, 16, 23 + vsubuhm 31, 20, 24 + vadduhm 8, 8, 21 + vadduhm 12, 12, 22 + vadduhm 16, 16, 23 + vadduhm 20, 20, 24 + vxor 7, 7, 7 + xxlor 35, 6, 6 + xxlor 33, 7, 7 + xxlor 34, 8, 8 + vmulosh 6, 8, 0 + vmulesh 5, 8, 0 + vmulosh 11, 12, 0 + vmulesh 10, 12, 0 + vmulosh 15, 16, 0 + vmulesh 14, 16, 0 + vmulosh 19, 20, 0 + vmulesh 18, 20, 0 + xxmrglw 36, 37, 38 + xxmrghw 37, 37, 38 + xxmrglw 41, 42, 43 + xxmrghw 42, 42, 43 + xxmrglw 45, 46, 47 + xxmrghw 46, 46, 47 + xxmrglw 49, 50, 51 + xxmrghw 50, 50, 51 + vadduwm 4, 4, 1 + vadduwm 5, 5, 1 + vadduwm 9, 9, 1 + vadduwm 10, 10, 1 + vadduwm 13, 13, 1 + vadduwm 14, 14, 1 + vadduwm 17, 17, 1 + vadduwm 18, 18, 1 + 
vsraw 4, 4, 2 + vsraw 5, 5, 2 + vsraw 9, 9, 2 + vsraw 10, 10, 2 + vsraw 13, 13, 2 + vsraw 14, 14, 2 + vsraw 17, 17, 2 + vsraw 18, 18, 2 + vpkuwum 4, 5, 4 + vsubuhm 4, 7, 4 + vpkuwum 9, 10, 9 + vsubuhm 9, 7, 9 + vpkuwum 13, 14, 13 + vsubuhm 13, 7, 13 + vpkuwum 17, 18, 17 + vsubuhm 17, 7, 17 + vmladduhm 4, 4, 3, 8 + vmladduhm 9, 9, 3, 12 + vmladduhm 13, 13, 3, 16 + vmladduhm 17, 17, 3, 20 + xxlor 10, 36, 36 + xxlor 11, 41, 41 + xxlor 12, 45, 45 + xxlor 13, 49, 49 + xxlor 37, 0, 0 + xxlor 34, 2, 2 + xxlor 35, 3, 3 + xxlor 36, 4, 4 + lxv 39, 0(14) + lxv 40, 16(14) + lxv 41, 32(14) + lxv 42, 48(14) + addi 14, 14, 64 + vmladduhm 15, 25, 7, 3 + vmladduhm 20, 26, 8, 3 + vmladduhm 27, 30, 9, 3 + vmladduhm 28, 31, 10, 3 + vmhraddshs 14, 25, 7, 3 + vmhraddshs 19, 26, 8, 3 + vmhraddshs 24, 30, 9, 3 + vmhraddshs 29, 31, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 27, 2, 3 + vmladduhm 30, 28, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 + xxmrglw 44, 45, 10 + xxmrghw 43, 45, 10 + xxmrgld 10, 44, 43 + xxmrglw 48, 50, 11 + xxmrghw 47, 50, 11 + xxmrgld 11, 48, 47 + xxmrglw 44, 55, 12 + xxmrghw 43, 55, 12 + xxmrgld 12, 44, 43 + xxmrglw 48, 60, 13 + xxmrghw 47, 60, 13 + xxmrgld 13, 48, 47 + stxvd2x 10, 3, 9 + stxvd2x 11, 3, 16 + stxvd2x 12, 3, 18 + stxvd2x 13, 3, 20 + addi 5, 5, 64 + mr 9, 5 + add 10, 7, 9 + addi 16, 9, 16 + addi 17, 10, 16 + addi 18, 16, 16 + addi 19, 17, 16 + addi 20, 18, 16 + addi 21, 19, 16 + lxvd2x 40, 3, 10 + lxvd2x 44, 3, 17 + lxvd2x 48, 3, 19 + lxvd2x 52, 3, 21 + xxswapd 40, 40 + xxswapd 44, 44 + xxswapd 48, 48 + xxswapd 52, 52 + lxvd2x 53, 3, 9 + lxvd2x 54, 3, 16 + lxvd2x 55, 3, 18 + lxvd2x 56, 3, 20 + xxswapd 53, 53 + xxswapd 54, 54 + xxswapd 55, 55 + xxswapd 56, 56 + vsubuhm 25, 8, 21 + vsubuhm 26, 12, 22 + vsubuhm 30, 16, 23 + vsubuhm 31, 20, 24 + vadduhm 8, 8, 21 + vadduhm 12, 12, 22 + 
vadduhm 16, 16, 23 + vadduhm 20, 20, 24 + vxor 7, 7, 7 + xxlor 35, 6, 6 + xxlor 33, 7, 7 + xxlor 34, 8, 8 + vmulosh 6, 8, 0 + vmulesh 5, 8, 0 + vmulosh 11, 12, 0 + vmulesh 10, 12, 0 + vmulosh 15, 16, 0 + vmulesh 14, 16, 0 + vmulosh 19, 20, 0 + vmulesh 18, 20, 0 + xxmrglw 36, 37, 38 + xxmrghw 37, 37, 38 + xxmrglw 41, 42, 43 + xxmrghw 42, 42, 43 + xxmrglw 45, 46, 47 + xxmrghw 46, 46, 47 + xxmrglw 49, 50, 51 + xxmrghw 50, 50, 51 + vadduwm 4, 4, 1 + vadduwm 5, 5, 1 + vadduwm 9, 9, 1 + vadduwm 10, 10, 1 + vadduwm 13, 13, 1 + vadduwm 14, 14, 1 + vadduwm 17, 17, 1 + vadduwm 18, 18, 1 + vsraw 4, 4, 2 + vsraw 5, 5, 2 + vsraw 9, 9, 2 + vsraw 10, 10, 2 + vsraw 13, 13, 2 + vsraw 14, 14, 2 + vsraw 17, 17, 2 + vsraw 18, 18, 2 + vpkuwum 4, 5, 4 + vsubuhm 4, 7, 4 + vpkuwum 9, 10, 9 + vsubuhm 9, 7, 9 + vpkuwum 13, 14, 13 + vsubuhm 13, 7, 13 + vpkuwum 17, 18, 17 + vsubuhm 17, 7, 17 + vmladduhm 4, 4, 3, 8 + vmladduhm 9, 9, 3, 12 + vmladduhm 13, 13, 3, 16 + vmladduhm 17, 17, 3, 20 + xxlor 10, 36, 36 + xxlor 11, 41, 41 + xxlor 12, 45, 45 + xxlor 13, 49, 49 + xxlor 37, 0, 0 + xxlor 34, 2, 2 + xxlor 35, 3, 3 + xxlor 36, 4, 4 + lxv 39, 0(14) + lxv 40, 16(14) + lxv 41, 32(14) + lxv 42, 48(14) + addi 14, 14, 64 + vmladduhm 15, 25, 7, 3 + vmladduhm 20, 26, 8, 3 + vmladduhm 27, 30, 9, 3 + vmladduhm 28, 31, 10, 3 + vmhraddshs 14, 25, 7, 3 + vmhraddshs 19, 26, 8, 3 + vmhraddshs 24, 30, 9, 3 + vmhraddshs 29, 31, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 27, 2, 3 + vmladduhm 30, 28, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 + xxmrglw 44, 45, 10 + xxmrghw 43, 45, 10 + xxmrgld 10, 44, 43 + xxmrglw 48, 50, 11 + xxmrghw 47, 50, 11 + xxmrgld 11, 48, 47 + xxmrglw 44, 55, 12 + xxmrghw 43, 55, 12 + xxmrgld 12, 44, 43 + xxmrglw 48, 60, 13 + xxmrghw 47, 60, 13 + xxmrgld 13, 48, 47 + stxvd2x 10, 3, 9 + stxvd2x 11, 3, 16 + stxvd2x 12, 3, 
18 + stxvd2x 13, 3, 20 + addi 5, 5, 64 + bdnz Lintt_ppc__Loop2 + nop + ori 2, 2, 0 + addi 14, 4, 2128 + li 5, 0 + li 7, 8 + li 15, 4 + mtctr 15 + +Lintt_ppc__Loop4: + mr 9, 5 + add 10, 7, 9 + addi 16, 9, 16 + addi 17, 10, 16 + addi 18, 16, 16 + addi 19, 17, 16 + addi 20, 18, 16 + addi 21, 19, 16 + lxvd2x 40, 3, 10 + lxvd2x 44, 3, 17 + lxvd2x 48, 3, 19 + lxvd2x 52, 3, 21 + xxswapd 40, 40 + xxswapd 44, 44 + xxswapd 48, 48 + xxswapd 52, 52 + lxvd2x 53, 3, 9 + lxvd2x 54, 3, 16 + lxvd2x 55, 3, 18 + lxvd2x 56, 3, 20 + xxswapd 53, 53 + xxswapd 54, 54 + xxswapd 55, 55 + xxswapd 56, 56 + vsubuhm 25, 8, 21 + vsubuhm 26, 12, 22 + vsubuhm 30, 16, 23 + vsubuhm 31, 20, 24 + vadduhm 8, 8, 21 + vadduhm 12, 12, 22 + vadduhm 16, 16, 23 + vadduhm 20, 20, 24 + vxor 7, 7, 7 + xxlor 35, 6, 6 + xxlor 33, 7, 7 + xxlor 34, 8, 8 + vmulosh 6, 8, 0 + vmulesh 5, 8, 0 + vmulosh 11, 12, 0 + vmulesh 10, 12, 0 + vmulosh 15, 16, 0 + vmulesh 14, 16, 0 + vmulosh 19, 20, 0 + vmulesh 18, 20, 0 + xxmrglw 36, 37, 38 + xxmrghw 37, 37, 38 + xxmrglw 41, 42, 43 + xxmrghw 42, 42, 43 + xxmrglw 45, 46, 47 + xxmrghw 46, 46, 47 + xxmrglw 49, 50, 51 + xxmrghw 50, 50, 51 + vadduwm 4, 4, 1 + vadduwm 5, 5, 1 + vadduwm 9, 9, 1 + vadduwm 10, 10, 1 + vadduwm 13, 13, 1 + vadduwm 14, 14, 1 + vadduwm 17, 17, 1 + vadduwm 18, 18, 1 + vsraw 4, 4, 2 + vsraw 5, 5, 2 + vsraw 9, 9, 2 + vsraw 10, 10, 2 + vsraw 13, 13, 2 + vsraw 14, 14, 2 + vsraw 17, 17, 2 + vsraw 18, 18, 2 + vpkuwum 4, 5, 4 + vsubuhm 4, 7, 4 + vpkuwum 9, 10, 9 + vsubuhm 9, 7, 9 + vpkuwum 13, 14, 13 + vsubuhm 13, 7, 13 + vpkuwum 17, 18, 17 + vsubuhm 17, 7, 17 + vmladduhm 4, 4, 3, 8 + vmladduhm 9, 9, 3, 12 + vmladduhm 13, 13, 3, 16 + vmladduhm 17, 17, 3, 20 + xxlor 10, 36, 36 + xxlor 11, 41, 41 + xxlor 12, 45, 45 + xxlor 13, 49, 49 + xxlor 37, 0, 0 + xxlor 34, 2, 2 + xxlor 35, 3, 3 + xxlor 36, 4, 4 + lxv 39, 0(14) + lxv 40, 16(14) + lxv 41, 32(14) + lxv 42, 48(14) + addi 14, 14, 64 + vmladduhm 15, 25, 7, 3 + vmladduhm 20, 26, 8, 3 + vmladduhm 27, 30, 9, 3 + 
vmladduhm 28, 31, 10, 3 + vmhraddshs 14, 25, 7, 3 + vmhraddshs 19, 26, 8, 3 + vmhraddshs 24, 30, 9, 3 + vmhraddshs 29, 31, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 27, 2, 3 + vmladduhm 30, 28, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 + xxmrgld 10, 10, 45 + xxmrgld 11, 11, 50 + xxmrgld 12, 12, 55 + xxmrgld 13, 13, 60 + stxvd2x 10, 3, 9 + stxvd2x 11, 3, 16 + stxvd2x 12, 3, 18 + stxvd2x 13, 3, 20 + addi 5, 5, 64 + mr 9, 5 + add 10, 7, 9 + addi 16, 9, 16 + addi 17, 10, 16 + addi 18, 16, 16 + addi 19, 17, 16 + addi 20, 18, 16 + addi 21, 19, 16 + lxvd2x 40, 3, 10 + lxvd2x 44, 3, 17 + lxvd2x 48, 3, 19 + lxvd2x 52, 3, 21 + xxswapd 40, 40 + xxswapd 44, 44 + xxswapd 48, 48 + xxswapd 52, 52 + lxvd2x 53, 3, 9 + lxvd2x 54, 3, 16 + lxvd2x 55, 3, 18 + lxvd2x 56, 3, 20 + xxswapd 53, 53 + xxswapd 54, 54 + xxswapd 55, 55 + xxswapd 56, 56 + vsubuhm 25, 8, 21 + vsubuhm 26, 12, 22 + vsubuhm 30, 16, 23 + vsubuhm 31, 20, 24 + vadduhm 8, 8, 21 + vadduhm 12, 12, 22 + vadduhm 16, 16, 23 + vadduhm 20, 20, 24 + vxor 7, 7, 7 + xxlor 35, 6, 6 + xxlor 33, 7, 7 + xxlor 34, 8, 8 + vmulosh 6, 8, 0 + vmulesh 5, 8, 0 + vmulosh 11, 12, 0 + vmulesh 10, 12, 0 + vmulosh 15, 16, 0 + vmulesh 14, 16, 0 + vmulosh 19, 20, 0 + vmulesh 18, 20, 0 + xxmrglw 36, 37, 38 + xxmrghw 37, 37, 38 + xxmrglw 41, 42, 43 + xxmrghw 42, 42, 43 + xxmrglw 45, 46, 47 + xxmrghw 46, 46, 47 + xxmrglw 49, 50, 51 + xxmrghw 50, 50, 51 + vadduwm 4, 4, 1 + vadduwm 5, 5, 1 + vadduwm 9, 9, 1 + vadduwm 10, 10, 1 + vadduwm 13, 13, 1 + vadduwm 14, 14, 1 + vadduwm 17, 17, 1 + vadduwm 18, 18, 1 + vsraw 4, 4, 2 + vsraw 5, 5, 2 + vsraw 9, 9, 2 + vsraw 10, 10, 2 + vsraw 13, 13, 2 + vsraw 14, 14, 2 + vsraw 17, 17, 2 + vsraw 18, 18, 2 + vpkuwum 4, 5, 4 + vsubuhm 4, 7, 4 + vpkuwum 9, 10, 9 + vsubuhm 9, 7, 9 + vpkuwum 13, 14, 13 + vsubuhm 13, 7, 13 + vpkuwum 17, 18, 17 + vsubuhm 
17, 7, 17 + vmladduhm 4, 4, 3, 8 + vmladduhm 9, 9, 3, 12 + vmladduhm 13, 13, 3, 16 + vmladduhm 17, 17, 3, 20 + xxlor 10, 36, 36 + xxlor 11, 41, 41 + xxlor 12, 45, 45 + xxlor 13, 49, 49 + xxlor 37, 0, 0 + xxlor 34, 2, 2 + xxlor 35, 3, 3 + xxlor 36, 4, 4 + lxv 39, 0(14) + lxv 40, 16(14) + lxv 41, 32(14) + lxv 42, 48(14) + addi 14, 14, 64 + vmladduhm 15, 25, 7, 3 + vmladduhm 20, 26, 8, 3 + vmladduhm 27, 30, 9, 3 + vmladduhm 28, 31, 10, 3 + vmhraddshs 14, 25, 7, 3 + vmhraddshs 19, 26, 8, 3 + vmhraddshs 24, 30, 9, 3 + vmhraddshs 29, 31, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 27, 2, 3 + vmladduhm 30, 28, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 + xxmrgld 10, 10, 45 + xxmrgld 11, 11, 50 + xxmrgld 12, 12, 55 + xxmrgld 13, 13, 60 + stxvd2x 10, 3, 9 + stxvd2x 11, 3, 16 + stxvd2x 12, 3, 18 + stxvd2x 13, 3, 20 + addi 5, 5, 64 + bdnz Lintt_ppc__Loop4 + nop + ori 2, 2, 0 + li 7, 16 + li 5, 0 + mr 9, 5 + add 10, 7, 9 + addi 16, 9, 32 + addi 17, 10, 32 + addi 18, 16, 32 + addi 19, 17, 32 + addi 20, 18, 32 + addi 21, 19, 32 + lxvd2x 40, 3, 10 + lxvd2x 44, 3, 17 + lxvd2x 48, 3, 19 + lxvd2x 52, 3, 21 + xxswapd 40, 40 + xxswapd 44, 44 + xxswapd 48, 48 + xxswapd 52, 52 + lxvd2x 53, 3, 9 + lxvd2x 54, 3, 16 + lxvd2x 55, 3, 18 + lxvd2x 56, 3, 20 + xxswapd 53, 53 + xxswapd 54, 54 + xxswapd 55, 55 + xxswapd 56, 56 + vsubuhm 25, 8, 21 + vsubuhm 26, 12, 22 + vsubuhm 30, 16, 23 + vsubuhm 31, 20, 24 + vadduhm 8, 8, 21 + vadduhm 12, 12, 22 + vadduhm 16, 16, 23 + vadduhm 20, 20, 24 + vxor 7, 7, 7 + xxlor 35, 6, 6 + xxlor 33, 7, 7 + xxlor 34, 8, 8 + vmulosh 6, 8, 0 + vmulesh 5, 8, 0 + vmulosh 11, 12, 0 + vmulesh 10, 12, 0 + vmulosh 15, 16, 0 + vmulesh 14, 16, 0 + vmulosh 19, 20, 0 + vmulesh 18, 20, 0 + xxmrglw 36, 37, 38 + xxmrghw 37, 37, 38 + xxmrglw 41, 42, 43 + xxmrghw 42, 42, 43 + xxmrglw 45, 46, 47 + xxmrghw 46, 46, 47 
+ xxmrglw 49, 50, 51 + xxmrghw 50, 50, 51 + vadduwm 4, 4, 1 + vadduwm 5, 5, 1 + vadduwm 9, 9, 1 + vadduwm 10, 10, 1 + vadduwm 13, 13, 1 + vadduwm 14, 14, 1 + vadduwm 17, 17, 1 + vadduwm 18, 18, 1 + vsraw 4, 4, 2 + vsraw 5, 5, 2 + vsraw 9, 9, 2 + vsraw 10, 10, 2 + vsraw 13, 13, 2 + vsraw 14, 14, 2 + vsraw 17, 17, 2 + vsraw 18, 18, 2 + vpkuwum 4, 5, 4 + vsubuhm 4, 7, 4 + vpkuwum 9, 10, 9 + vsubuhm 9, 7, 9 + vpkuwum 13, 14, 13 + vsubuhm 13, 7, 13 + vpkuwum 17, 18, 17 + vsubuhm 17, 7, 17 + vmladduhm 4, 4, 3, 8 + vmladduhm 9, 9, 3, 12 + vmladduhm 13, 13, 3, 16 + vmladduhm 17, 17, 3, 20 + stxvx 36, 3, 9 + stxvx 41, 3, 16 + stxvx 45, 3, 18 + stxvx 49, 3, 20 + xxlor 37, 0, 0 + xxlor 34, 2, 2 + xxlor 35, 3, 3 + xxlor 36, 4, 4 + lxv 39, 0(14) + lxv 40, 16(14) + lxv 41, 32(14) + lxv 42, 48(14) + addi 14, 14, 64 + vmladduhm 15, 25, 7, 3 + vmladduhm 20, 26, 8, 3 + vmladduhm 27, 30, 9, 3 + vmladduhm 28, 31, 10, 3 + vmhraddshs 14, 25, 7, 3 + vmhraddshs 19, 26, 8, 3 + vmhraddshs 24, 30, 9, 3 + vmhraddshs 29, 31, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 27, 2, 3 + vmladduhm 30, 28, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 + stxvx 45, 3, 10 + stxvx 50, 3, 17 + stxvx 55, 3, 19 + stxvx 60, 3, 21 + li 5, 128 + mr 9, 5 + add 10, 7, 9 + addi 16, 9, 32 + addi 17, 10, 32 + addi 18, 16, 32 + addi 19, 17, 32 + addi 20, 18, 32 + addi 21, 19, 32 + lxvd2x 40, 3, 10 + lxvd2x 44, 3, 17 + lxvd2x 48, 3, 19 + lxvd2x 52, 3, 21 + xxswapd 40, 40 + xxswapd 44, 44 + xxswapd 48, 48 + xxswapd 52, 52 + lxvd2x 53, 3, 9 + lxvd2x 54, 3, 16 + lxvd2x 55, 3, 18 + lxvd2x 56, 3, 20 + xxswapd 53, 53 + xxswapd 54, 54 + xxswapd 55, 55 + xxswapd 56, 56 + vsubuhm 25, 8, 21 + vsubuhm 26, 12, 22 + vsubuhm 30, 16, 23 + vsubuhm 31, 20, 24 + vadduhm 8, 8, 21 + vadduhm 12, 12, 22 + vadduhm 16, 16, 23 + vadduhm 20, 20, 24 + vxor 7, 7, 7 + xxlor 35, 6, 6 + 
xxlor 33, 7, 7 + xxlor 34, 8, 8 + vmulosh 6, 8, 0 + vmulesh 5, 8, 0 + vmulosh 11, 12, 0 + vmulesh 10, 12, 0 + vmulosh 15, 16, 0 + vmulesh 14, 16, 0 + vmulosh 19, 20, 0 + vmulesh 18, 20, 0 + xxmrglw 36, 37, 38 + xxmrghw 37, 37, 38 + xxmrglw 41, 42, 43 + xxmrghw 42, 42, 43 + xxmrglw 45, 46, 47 + xxmrghw 46, 46, 47 + xxmrglw 49, 50, 51 + xxmrghw 50, 50, 51 + vadduwm 4, 4, 1 + vadduwm 5, 5, 1 + vadduwm 9, 9, 1 + vadduwm 10, 10, 1 + vadduwm 13, 13, 1 + vadduwm 14, 14, 1 + vadduwm 17, 17, 1 + vadduwm 18, 18, 1 + vsraw 4, 4, 2 + vsraw 5, 5, 2 + vsraw 9, 9, 2 + vsraw 10, 10, 2 + vsraw 13, 13, 2 + vsraw 14, 14, 2 + vsraw 17, 17, 2 + vsraw 18, 18, 2 + vpkuwum 4, 5, 4 + vsubuhm 4, 7, 4 + vpkuwum 9, 10, 9 + vsubuhm 9, 7, 9 + vpkuwum 13, 14, 13 + vsubuhm 13, 7, 13 + vpkuwum 17, 18, 17 + vsubuhm 17, 7, 17 + vmladduhm 4, 4, 3, 8 + vmladduhm 9, 9, 3, 12 + vmladduhm 13, 13, 3, 16 + vmladduhm 17, 17, 3, 20 + stxvx 36, 3, 9 + stxvx 41, 3, 16 + stxvx 45, 3, 18 + stxvx 49, 3, 20 + xxlor 37, 0, 0 + xxlor 34, 2, 2 + xxlor 35, 3, 3 + xxlor 36, 4, 4 + lxv 39, 0(14) + lxv 40, 16(14) + lxv 41, 32(14) + lxv 42, 48(14) + addi 14, 14, 64 + vmladduhm 15, 25, 7, 3 + vmladduhm 20, 26, 8, 3 + vmladduhm 27, 30, 9, 3 + vmladduhm 28, 31, 10, 3 + vmhraddshs 14, 25, 7, 3 + vmhraddshs 19, 26, 8, 3 + vmhraddshs 24, 30, 9, 3 + vmhraddshs 29, 31, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 27, 2, 3 + vmladduhm 30, 28, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 + stxvx 45, 3, 10 + stxvx 50, 3, 17 + stxvx 55, 3, 19 + stxvx 60, 3, 21 + li 5, 256 + mr 9, 5 + add 10, 7, 9 + addi 16, 9, 32 + addi 17, 10, 32 + addi 18, 16, 32 + addi 19, 17, 32 + addi 20, 18, 32 + addi 21, 19, 32 + lxvd2x 40, 3, 10 + lxvd2x 44, 3, 17 + lxvd2x 48, 3, 19 + lxvd2x 52, 3, 21 + xxswapd 40, 40 + xxswapd 44, 44 + xxswapd 48, 48 + xxswapd 52, 52 + lxvd2x 53, 3, 9 + lxvd2x 
54, 3, 16 + lxvd2x 55, 3, 18 + lxvd2x 56, 3, 20 + xxswapd 53, 53 + xxswapd 54, 54 + xxswapd 55, 55 + xxswapd 56, 56 + vsubuhm 25, 8, 21 + vsubuhm 26, 12, 22 + vsubuhm 30, 16, 23 + vsubuhm 31, 20, 24 + vadduhm 8, 8, 21 + vadduhm 12, 12, 22 + vadduhm 16, 16, 23 + vadduhm 20, 20, 24 + vxor 7, 7, 7 + xxlor 35, 6, 6 + xxlor 33, 7, 7 + xxlor 34, 8, 8 + vmulosh 6, 8, 0 + vmulesh 5, 8, 0 + vmulosh 11, 12, 0 + vmulesh 10, 12, 0 + vmulosh 15, 16, 0 + vmulesh 14, 16, 0 + vmulosh 19, 20, 0 + vmulesh 18, 20, 0 + xxmrglw 36, 37, 38 + xxmrghw 37, 37, 38 + xxmrglw 41, 42, 43 + xxmrghw 42, 42, 43 + xxmrglw 45, 46, 47 + xxmrghw 46, 46, 47 + xxmrglw 49, 50, 51 + xxmrghw 50, 50, 51 + vadduwm 4, 4, 1 + vadduwm 5, 5, 1 + vadduwm 9, 9, 1 + vadduwm 10, 10, 1 + vadduwm 13, 13, 1 + vadduwm 14, 14, 1 + vadduwm 17, 17, 1 + vadduwm 18, 18, 1 + vsraw 4, 4, 2 + vsraw 5, 5, 2 + vsraw 9, 9, 2 + vsraw 10, 10, 2 + vsraw 13, 13, 2 + vsraw 14, 14, 2 + vsraw 17, 17, 2 + vsraw 18, 18, 2 + vpkuwum 4, 5, 4 + vsubuhm 4, 7, 4 + vpkuwum 9, 10, 9 + vsubuhm 9, 7, 9 + vpkuwum 13, 14, 13 + vsubuhm 13, 7, 13 + vpkuwum 17, 18, 17 + vsubuhm 17, 7, 17 + vmladduhm 4, 4, 3, 8 + vmladduhm 9, 9, 3, 12 + vmladduhm 13, 13, 3, 16 + vmladduhm 17, 17, 3, 20 + stxvx 36, 3, 9 + stxvx 41, 3, 16 + stxvx 45, 3, 18 + stxvx 49, 3, 20 + xxlor 37, 0, 0 + xxlor 34, 2, 2 + xxlor 35, 3, 3 + xxlor 36, 4, 4 + lxv 39, 0(14) + lxv 40, 16(14) + lxv 41, 32(14) + lxv 42, 48(14) + addi 14, 14, 64 + vmladduhm 15, 25, 7, 3 + vmladduhm 20, 26, 8, 3 + vmladduhm 27, 30, 9, 3 + vmladduhm 28, 31, 10, 3 + vmhraddshs 14, 25, 7, 3 + vmhraddshs 19, 26, 8, 3 + vmhraddshs 24, 30, 9, 3 + vmhraddshs 29, 31, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 27, 2, 3 + vmladduhm 30, 28, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 + stxvx 45, 3, 10 + stxvx 50, 3, 17 + stxvx 55, 3, 19 + stxvx 60, 3, 21 
+ li 5, 384 + mr 9, 5 + add 10, 7, 9 + addi 16, 9, 32 + addi 17, 10, 32 + addi 18, 16, 32 + addi 19, 17, 32 + addi 20, 18, 32 + addi 21, 19, 32 + lxvd2x 40, 3, 10 + lxvd2x 44, 3, 17 + lxvd2x 48, 3, 19 + lxvd2x 52, 3, 21 + xxswapd 40, 40 + xxswapd 44, 44 + xxswapd 48, 48 + xxswapd 52, 52 + lxvd2x 53, 3, 9 + lxvd2x 54, 3, 16 + lxvd2x 55, 3, 18 + lxvd2x 56, 3, 20 + xxswapd 53, 53 + xxswapd 54, 54 + xxswapd 55, 55 + xxswapd 56, 56 + vsubuhm 25, 8, 21 + vsubuhm 26, 12, 22 + vsubuhm 30, 16, 23 + vsubuhm 31, 20, 24 + vadduhm 8, 8, 21 + vadduhm 12, 12, 22 + vadduhm 16, 16, 23 + vadduhm 20, 20, 24 + vxor 7, 7, 7 + xxlor 35, 6, 6 + xxlor 33, 7, 7 + xxlor 34, 8, 8 + vmulosh 6, 8, 0 + vmulesh 5, 8, 0 + vmulosh 11, 12, 0 + vmulesh 10, 12, 0 + vmulosh 15, 16, 0 + vmulesh 14, 16, 0 + vmulosh 19, 20, 0 + vmulesh 18, 20, 0 + xxmrglw 36, 37, 38 + xxmrghw 37, 37, 38 + xxmrglw 41, 42, 43 + xxmrghw 42, 42, 43 + xxmrglw 45, 46, 47 + xxmrghw 46, 46, 47 + xxmrglw 49, 50, 51 + xxmrghw 50, 50, 51 + vadduwm 4, 4, 1 + vadduwm 5, 5, 1 + vadduwm 9, 9, 1 + vadduwm 10, 10, 1 + vadduwm 13, 13, 1 + vadduwm 14, 14, 1 + vadduwm 17, 17, 1 + vadduwm 18, 18, 1 + vsraw 4, 4, 2 + vsraw 5, 5, 2 + vsraw 9, 9, 2 + vsraw 10, 10, 2 + vsraw 13, 13, 2 + vsraw 14, 14, 2 + vsraw 17, 17, 2 + vsraw 18, 18, 2 + vpkuwum 4, 5, 4 + vsubuhm 4, 7, 4 + vpkuwum 9, 10, 9 + vsubuhm 9, 7, 9 + vpkuwum 13, 14, 13 + vsubuhm 13, 7, 13 + vpkuwum 17, 18, 17 + vsubuhm 17, 7, 17 + vmladduhm 4, 4, 3, 8 + vmladduhm 9, 9, 3, 12 + vmladduhm 13, 13, 3, 16 + vmladduhm 17, 17, 3, 20 + stxvx 36, 3, 9 + stxvx 41, 3, 16 + stxvx 45, 3, 18 + stxvx 49, 3, 20 + xxlor 37, 0, 0 + xxlor 34, 2, 2 + xxlor 35, 3, 3 + xxlor 36, 4, 4 + lxv 39, 0(14) + lxv 40, 16(14) + lxv 41, 32(14) + lxv 42, 48(14) + addi 14, 14, 64 + vmladduhm 15, 25, 7, 3 + vmladduhm 20, 26, 8, 3 + vmladduhm 27, 30, 9, 3 + vmladduhm 28, 31, 10, 3 + vmhraddshs 14, 25, 7, 3 + vmhraddshs 19, 26, 8, 3 + vmhraddshs 24, 30, 9, 3 + vmhraddshs 29, 31, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 
20, 20, 2, 3 + vmladduhm 25, 27, 2, 3 + vmladduhm 30, 28, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 + stxvx 45, 3, 10 + stxvx 50, 3, 17 + stxvx 55, 3, 19 + stxvx 60, 3, 21 + nop + nop + ori 2, 2, 0 + li 5, 0 + li 7, 32 + mr 9, 5 + add 10, 7, 9 + addi 16, 9, 64 + addi 17, 10, 64 + addi 18, 16, 64 + addi 19, 17, 64 + addi 20, 18, 64 + addi 21, 19, 64 + lxvd2x 40, 3, 10 + lxvd2x 44, 3, 17 + lxvd2x 48, 3, 19 + lxvd2x 52, 3, 21 + xxswapd 40, 40 + xxswapd 44, 44 + xxswapd 48, 48 + xxswapd 52, 52 + lxvd2x 53, 3, 9 + lxvd2x 54, 3, 16 + lxvd2x 55, 3, 18 + lxvd2x 56, 3, 20 + xxswapd 53, 53 + xxswapd 54, 54 + xxswapd 55, 55 + xxswapd 56, 56 + vsubuhm 25, 8, 21 + vsubuhm 26, 12, 22 + vsubuhm 30, 16, 23 + vsubuhm 31, 20, 24 + vadduhm 8, 8, 21 + vadduhm 12, 12, 22 + vadduhm 16, 16, 23 + vadduhm 20, 20, 24 + vxor 7, 7, 7 + xxlor 35, 6, 6 + xxlor 33, 7, 7 + xxlor 34, 8, 8 + vmulosh 6, 8, 0 + vmulesh 5, 8, 0 + vmulosh 11, 12, 0 + vmulesh 10, 12, 0 + vmulosh 15, 16, 0 + vmulesh 14, 16, 0 + vmulosh 19, 20, 0 + vmulesh 18, 20, 0 + xxmrglw 36, 37, 38 + xxmrghw 37, 37, 38 + xxmrglw 41, 42, 43 + xxmrghw 42, 42, 43 + xxmrglw 45, 46, 47 + xxmrghw 46, 46, 47 + xxmrglw 49, 50, 51 + xxmrghw 50, 50, 51 + vadduwm 4, 4, 1 + vadduwm 5, 5, 1 + vadduwm 9, 9, 1 + vadduwm 10, 10, 1 + vadduwm 13, 13, 1 + vadduwm 14, 14, 1 + vadduwm 17, 17, 1 + vadduwm 18, 18, 1 + vsraw 4, 4, 2 + vsraw 5, 5, 2 + vsraw 9, 9, 2 + vsraw 10, 10, 2 + vsraw 13, 13, 2 + vsraw 14, 14, 2 + vsraw 17, 17, 2 + vsraw 18, 18, 2 + vpkuwum 4, 5, 4 + vsubuhm 4, 7, 4 + vpkuwum 9, 10, 9 + vsubuhm 9, 7, 9 + vpkuwum 13, 14, 13 + vsubuhm 13, 7, 13 + vpkuwum 17, 18, 17 + vsubuhm 17, 7, 17 + vmladduhm 4, 4, 3, 8 + vmladduhm 9, 9, 3, 12 + vmladduhm 13, 13, 3, 16 + vmladduhm 17, 17, 3, 20 + stxvx 36, 3, 9 + stxvx 41, 3, 16 + stxvx 45, 3, 18 + stxvx 49, 3, 20 + xxlor 37, 0, 0 + xxlor 34, 2, 2 + xxlor 35, 
3, 3 + xxlor 36, 4, 4 + lxv 39, 0(14) + lxv 40, 16(14) + lxv 41, 32(14) + lxv 42, 48(14) + addi 14, 14, 64 + vmladduhm 15, 25, 7, 3 + vmladduhm 20, 26, 8, 3 + vmladduhm 27, 30, 9, 3 + vmladduhm 28, 31, 10, 3 + vmhraddshs 14, 25, 7, 3 + vmhraddshs 19, 26, 8, 3 + vmhraddshs 24, 30, 9, 3 + vmhraddshs 29, 31, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 27, 2, 3 + vmladduhm 30, 28, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 + stxvx 45, 3, 10 + stxvx 50, 3, 17 + stxvx 55, 3, 19 + stxvx 60, 3, 21 + li 5, 16 + mr 9, 5 + add 10, 7, 9 + addi 16, 9, 64 + addi 17, 10, 64 + addi 18, 16, 64 + addi 19, 17, 64 + addi 20, 18, 64 + addi 21, 19, 64 + lxvd2x 40, 3, 10 + lxvd2x 44, 3, 17 + lxvd2x 48, 3, 19 + lxvd2x 52, 3, 21 + xxswapd 40, 40 + xxswapd 44, 44 + xxswapd 48, 48 + xxswapd 52, 52 + lxvd2x 53, 3, 9 + lxvd2x 54, 3, 16 + lxvd2x 55, 3, 18 + lxvd2x 56, 3, 20 + xxswapd 53, 53 + xxswapd 54, 54 + xxswapd 55, 55 + xxswapd 56, 56 + vsubuhm 25, 8, 21 + vsubuhm 26, 12, 22 + vsubuhm 30, 16, 23 + vsubuhm 31, 20, 24 + vadduhm 8, 8, 21 + vadduhm 12, 12, 22 + vadduhm 16, 16, 23 + vadduhm 20, 20, 24 + vxor 7, 7, 7 + xxlor 35, 6, 6 + xxlor 33, 7, 7 + xxlor 34, 8, 8 + vmulosh 6, 8, 0 + vmulesh 5, 8, 0 + vmulosh 11, 12, 0 + vmulesh 10, 12, 0 + vmulosh 15, 16, 0 + vmulesh 14, 16, 0 + vmulosh 19, 20, 0 + vmulesh 18, 20, 0 + xxmrglw 36, 37, 38 + xxmrghw 37, 37, 38 + xxmrglw 41, 42, 43 + xxmrghw 42, 42, 43 + xxmrglw 45, 46, 47 + xxmrghw 46, 46, 47 + xxmrglw 49, 50, 51 + xxmrghw 50, 50, 51 + vadduwm 4, 4, 1 + vadduwm 5, 5, 1 + vadduwm 9, 9, 1 + vadduwm 10, 10, 1 + vadduwm 13, 13, 1 + vadduwm 14, 14, 1 + vadduwm 17, 17, 1 + vadduwm 18, 18, 1 + vsraw 4, 4, 2 + vsraw 5, 5, 2 + vsraw 9, 9, 2 + vsraw 10, 10, 2 + vsraw 13, 13, 2 + vsraw 14, 14, 2 + vsraw 17, 17, 2 + vsraw 18, 18, 2 + vpkuwum 4, 5, 4 + vsubuhm 4, 7, 4 + vpkuwum 9, 10, 9 + 
vsubuhm 9, 7, 9 + vpkuwum 13, 14, 13 + vsubuhm 13, 7, 13 + vpkuwum 17, 18, 17 + vsubuhm 17, 7, 17 + vmladduhm 4, 4, 3, 8 + vmladduhm 9, 9, 3, 12 + vmladduhm 13, 13, 3, 16 + vmladduhm 17, 17, 3, 20 + stxvx 36, 3, 9 + stxvx 41, 3, 16 + stxvx 45, 3, 18 + stxvx 49, 3, 20 + xxlor 37, 0, 0 + xxlor 34, 2, 2 + xxlor 35, 3, 3 + xxlor 36, 4, 4 + addi 14, 14, -64 + lxv 39, 0(14) + lxv 40, 16(14) + lxv 41, 32(14) + lxv 42, 48(14) + addi 14, 14, 64 + vmladduhm 15, 25, 7, 3 + vmladduhm 20, 26, 8, 3 + vmladduhm 27, 30, 9, 3 + vmladduhm 28, 31, 10, 3 + vmhraddshs 14, 25, 7, 3 + vmhraddshs 19, 26, 8, 3 + vmhraddshs 24, 30, 9, 3 + vmhraddshs 29, 31, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 27, 2, 3 + vmladduhm 30, 28, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 + stxvx 45, 3, 10 + stxvx 50, 3, 17 + stxvx 55, 3, 19 + stxvx 60, 3, 21 + li 5, 256 + mr 9, 5 + add 10, 7, 9 + addi 16, 9, 64 + addi 17, 10, 64 + addi 18, 16, 64 + addi 19, 17, 64 + addi 20, 18, 64 + addi 21, 19, 64 + lxvd2x 40, 3, 10 + lxvd2x 44, 3, 17 + lxvd2x 48, 3, 19 + lxvd2x 52, 3, 21 + xxswapd 40, 40 + xxswapd 44, 44 + xxswapd 48, 48 + xxswapd 52, 52 + lxvd2x 53, 3, 9 + lxvd2x 54, 3, 16 + lxvd2x 55, 3, 18 + lxvd2x 56, 3, 20 + xxswapd 53, 53 + xxswapd 54, 54 + xxswapd 55, 55 + xxswapd 56, 56 + vsubuhm 25, 8, 21 + vsubuhm 26, 12, 22 + vsubuhm 30, 16, 23 + vsubuhm 31, 20, 24 + vadduhm 8, 8, 21 + vadduhm 12, 12, 22 + vadduhm 16, 16, 23 + vadduhm 20, 20, 24 + vxor 7, 7, 7 + xxlor 35, 6, 6 + xxlor 33, 7, 7 + xxlor 34, 8, 8 + vmulosh 6, 8, 0 + vmulesh 5, 8, 0 + vmulosh 11, 12, 0 + vmulesh 10, 12, 0 + vmulosh 15, 16, 0 + vmulesh 14, 16, 0 + vmulosh 19, 20, 0 + vmulesh 18, 20, 0 + xxmrglw 36, 37, 38 + xxmrghw 37, 37, 38 + xxmrglw 41, 42, 43 + xxmrghw 42, 42, 43 + xxmrglw 45, 46, 47 + xxmrghw 46, 46, 47 + xxmrglw 49, 50, 51 + xxmrghw 50, 50, 51 + vadduwm 4, 4, 
1 + vadduwm 5, 5, 1 + vadduwm 9, 9, 1 + vadduwm 10, 10, 1 + vadduwm 13, 13, 1 + vadduwm 14, 14, 1 + vadduwm 17, 17, 1 + vadduwm 18, 18, 1 + vsraw 4, 4, 2 + vsraw 5, 5, 2 + vsraw 9, 9, 2 + vsraw 10, 10, 2 + vsraw 13, 13, 2 + vsraw 14, 14, 2 + vsraw 17, 17, 2 + vsraw 18, 18, 2 + vpkuwum 4, 5, 4 + vsubuhm 4, 7, 4 + vpkuwum 9, 10, 9 + vsubuhm 9, 7, 9 + vpkuwum 13, 14, 13 + vsubuhm 13, 7, 13 + vpkuwum 17, 18, 17 + vsubuhm 17, 7, 17 + vmladduhm 4, 4, 3, 8 + vmladduhm 9, 9, 3, 12 + vmladduhm 13, 13, 3, 16 + vmladduhm 17, 17, 3, 20 + stxvx 36, 3, 9 + stxvx 41, 3, 16 + stxvx 45, 3, 18 + stxvx 49, 3, 20 + xxlor 37, 0, 0 + xxlor 34, 2, 2 + xxlor 35, 3, 3 + xxlor 36, 4, 4 + lxv 39, 0(14) + lxv 40, 16(14) + lxv 41, 32(14) + lxv 42, 48(14) + addi 14, 14, 64 + vmladduhm 15, 25, 7, 3 + vmladduhm 20, 26, 8, 3 + vmladduhm 27, 30, 9, 3 + vmladduhm 28, 31, 10, 3 + vmhraddshs 14, 25, 7, 3 + vmhraddshs 19, 26, 8, 3 + vmhraddshs 24, 30, 9, 3 + vmhraddshs 29, 31, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 27, 2, 3 + vmladduhm 30, 28, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 + stxvx 45, 3, 10 + stxvx 50, 3, 17 + stxvx 55, 3, 19 + stxvx 60, 3, 21 + li 5, 272 + mr 9, 5 + add 10, 7, 9 + addi 16, 9, 64 + addi 17, 10, 64 + addi 18, 16, 64 + addi 19, 17, 64 + addi 20, 18, 64 + addi 21, 19, 64 + lxvd2x 40, 3, 10 + lxvd2x 44, 3, 17 + lxvd2x 48, 3, 19 + lxvd2x 52, 3, 21 + xxswapd 40, 40 + xxswapd 44, 44 + xxswapd 48, 48 + xxswapd 52, 52 + lxvd2x 53, 3, 9 + lxvd2x 54, 3, 16 + lxvd2x 55, 3, 18 + lxvd2x 56, 3, 20 + xxswapd 53, 53 + xxswapd 54, 54 + xxswapd 55, 55 + xxswapd 56, 56 + vsubuhm 25, 8, 21 + vsubuhm 26, 12, 22 + vsubuhm 30, 16, 23 + vsubuhm 31, 20, 24 + vadduhm 8, 8, 21 + vadduhm 12, 12, 22 + vadduhm 16, 16, 23 + vadduhm 20, 20, 24 + vxor 7, 7, 7 + xxlor 35, 6, 6 + xxlor 33, 7, 7 + xxlor 34, 8, 8 + vmulosh 6, 8, 0 + 
vmulesh 5, 8, 0 + vmulosh 11, 12, 0 + vmulesh 10, 12, 0 + vmulosh 15, 16, 0 + vmulesh 14, 16, 0 + vmulosh 19, 20, 0 + vmulesh 18, 20, 0 + xxmrglw 36, 37, 38 + xxmrghw 37, 37, 38 + xxmrglw 41, 42, 43 + xxmrghw 42, 42, 43 + xxmrglw 45, 46, 47 + xxmrghw 46, 46, 47 + xxmrglw 49, 50, 51 + xxmrghw 50, 50, 51 + vadduwm 4, 4, 1 + vadduwm 5, 5, 1 + vadduwm 9, 9, 1 + vadduwm 10, 10, 1 + vadduwm 13, 13, 1 + vadduwm 14, 14, 1 + vadduwm 17, 17, 1 + vadduwm 18, 18, 1 + vsraw 4, 4, 2 + vsraw 5, 5, 2 + vsraw 9, 9, 2 + vsraw 10, 10, 2 + vsraw 13, 13, 2 + vsraw 14, 14, 2 + vsraw 17, 17, 2 + vsraw 18, 18, 2 + vpkuwum 4, 5, 4 + vsubuhm 4, 7, 4 + vpkuwum 9, 10, 9 + vsubuhm 9, 7, 9 + vpkuwum 13, 14, 13 + vsubuhm 13, 7, 13 + vpkuwum 17, 18, 17 + vsubuhm 17, 7, 17 + vmladduhm 4, 4, 3, 8 + vmladduhm 9, 9, 3, 12 + vmladduhm 13, 13, 3, 16 + vmladduhm 17, 17, 3, 20 + stxvx 36, 3, 9 + stxvx 41, 3, 16 + stxvx 45, 3, 18 + stxvx 49, 3, 20 + xxlor 37, 0, 0 + xxlor 34, 2, 2 + xxlor 35, 3, 3 + xxlor 36, 4, 4 + addi 14, 14, -64 + lxv 39, 0(14) + lxv 40, 16(14) + lxv 41, 32(14) + lxv 42, 48(14) + addi 14, 14, 64 + vmladduhm 15, 25, 7, 3 + vmladduhm 20, 26, 8, 3 + vmladduhm 27, 30, 9, 3 + vmladduhm 28, 31, 10, 3 + vmhraddshs 14, 25, 7, 3 + vmhraddshs 19, 26, 8, 3 + vmhraddshs 24, 30, 9, 3 + vmhraddshs 29, 31, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 27, 2, 3 + vmladduhm 30, 28, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 + stxvx 45, 3, 10 + stxvx 50, 3, 17 + stxvx 55, 3, 19 + stxvx 60, 3, 21 + ori 2, 2, 0 + li 5, 0 + li 7, 64 + mr 9, 5 + add 10, 7, 9 + addi 16, 9, 16 + addi 17, 10, 16 + addi 18, 16, 16 + addi 19, 17, 16 + addi 20, 18, 16 + addi 21, 19, 16 + lxvd2x 40, 3, 10 + lxvd2x 44, 3, 17 + lxvd2x 48, 3, 19 + lxvd2x 52, 3, 21 + xxswapd 40, 40 + xxswapd 44, 44 + xxswapd 48, 48 + xxswapd 52, 52 + lxvd2x 53, 3, 9 + lxvd2x 54, 3, 16 
+ lxvd2x 55, 3, 18 + lxvd2x 56, 3, 20 + xxswapd 53, 53 + xxswapd 54, 54 + xxswapd 55, 55 + xxswapd 56, 56 + vsubuhm 25, 8, 21 + vsubuhm 26, 12, 22 + vsubuhm 30, 16, 23 + vsubuhm 31, 20, 24 + vadduhm 8, 8, 21 + vadduhm 12, 12, 22 + vadduhm 16, 16, 23 + vadduhm 20, 20, 24 + vxor 7, 7, 7 + xxlor 35, 6, 6 + xxlor 33, 7, 7 + xxlor 34, 8, 8 + vmulosh 6, 8, 0 + vmulesh 5, 8, 0 + vmulosh 11, 12, 0 + vmulesh 10, 12, 0 + vmulosh 15, 16, 0 + vmulesh 14, 16, 0 + vmulosh 19, 20, 0 + vmulesh 18, 20, 0 + xxmrglw 36, 37, 38 + xxmrghw 37, 37, 38 + xxmrglw 41, 42, 43 + xxmrghw 42, 42, 43 + xxmrglw 45, 46, 47 + xxmrghw 46, 46, 47 + xxmrglw 49, 50, 51 + xxmrghw 50, 50, 51 + vadduwm 4, 4, 1 + vadduwm 5, 5, 1 + vadduwm 9, 9, 1 + vadduwm 10, 10, 1 + vadduwm 13, 13, 1 + vadduwm 14, 14, 1 + vadduwm 17, 17, 1 + vadduwm 18, 18, 1 + vsraw 4, 4, 2 + vsraw 5, 5, 2 + vsraw 9, 9, 2 + vsraw 10, 10, 2 + vsraw 13, 13, 2 + vsraw 14, 14, 2 + vsraw 17, 17, 2 + vsraw 18, 18, 2 + vpkuwum 4, 5, 4 + vsubuhm 4, 7, 4 + vpkuwum 9, 10, 9 + vsubuhm 9, 7, 9 + vpkuwum 13, 14, 13 + vsubuhm 13, 7, 13 + vpkuwum 17, 18, 17 + vsubuhm 17, 7, 17 + vmladduhm 4, 4, 3, 8 + vmladduhm 9, 9, 3, 12 + vmladduhm 13, 13, 3, 16 + vmladduhm 17, 17, 3, 20 + stxvx 36, 3, 9 + stxvx 41, 3, 16 + stxvx 45, 3, 18 + stxvx 49, 3, 20 + xxlor 37, 0, 0 + xxlor 34, 2, 2 + xxlor 35, 3, 3 + xxlor 36, 4, 4 + lvx 10, 0, 14 + addi 14, 14, 16 + vmladduhm 15, 25, 10, 3 + vmladduhm 20, 26, 10, 3 + vmladduhm 27, 30, 10, 3 + vmladduhm 28, 31, 10, 3 + vmhraddshs 14, 25, 10, 3 + vmhraddshs 19, 26, 10, 3 + vmhraddshs 24, 30, 10, 3 + vmhraddshs 29, 31, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 27, 2, 3 + vmladduhm 30, 28, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 + stxvx 45, 3, 10 + stxvx 50, 3, 17 + stxvx 55, 3, 19 + stxvx 60, 3, 21 + li 5, 128 + mr 9, 5 + add 10, 7, 9 + addi 16, 9, 16 + 
addi 17, 10, 16 + addi 18, 16, 16 + addi 19, 17, 16 + addi 20, 18, 16 + addi 21, 19, 16 + lxvd2x 40, 3, 10 + lxvd2x 44, 3, 17 + lxvd2x 48, 3, 19 + lxvd2x 52, 3, 21 + xxswapd 40, 40 + xxswapd 44, 44 + xxswapd 48, 48 + xxswapd 52, 52 + lxvd2x 53, 3, 9 + lxvd2x 54, 3, 16 + lxvd2x 55, 3, 18 + lxvd2x 56, 3, 20 + xxswapd 53, 53 + xxswapd 54, 54 + xxswapd 55, 55 + xxswapd 56, 56 + vsubuhm 25, 8, 21 + vsubuhm 26, 12, 22 + vsubuhm 30, 16, 23 + vsubuhm 31, 20, 24 + vadduhm 8, 8, 21 + vadduhm 12, 12, 22 + vadduhm 16, 16, 23 + vadduhm 20, 20, 24 + vxor 7, 7, 7 + xxlor 35, 6, 6 + xxlor 33, 7, 7 + xxlor 34, 8, 8 + vmulosh 6, 8, 0 + vmulesh 5, 8, 0 + vmulosh 11, 12, 0 + vmulesh 10, 12, 0 + vmulosh 15, 16, 0 + vmulesh 14, 16, 0 + vmulosh 19, 20, 0 + vmulesh 18, 20, 0 + xxmrglw 36, 37, 38 + xxmrghw 37, 37, 38 + xxmrglw 41, 42, 43 + xxmrghw 42, 42, 43 + xxmrglw 45, 46, 47 + xxmrghw 46, 46, 47 + xxmrglw 49, 50, 51 + xxmrghw 50, 50, 51 + vadduwm 4, 4, 1 + vadduwm 5, 5, 1 + vadduwm 9, 9, 1 + vadduwm 10, 10, 1 + vadduwm 13, 13, 1 + vadduwm 14, 14, 1 + vadduwm 17, 17, 1 + vadduwm 18, 18, 1 + vsraw 4, 4, 2 + vsraw 5, 5, 2 + vsraw 9, 9, 2 + vsraw 10, 10, 2 + vsraw 13, 13, 2 + vsraw 14, 14, 2 + vsraw 17, 17, 2 + vsraw 18, 18, 2 + vpkuwum 4, 5, 4 + vsubuhm 4, 7, 4 + vpkuwum 9, 10, 9 + vsubuhm 9, 7, 9 + vpkuwum 13, 14, 13 + vsubuhm 13, 7, 13 + vpkuwum 17, 18, 17 + vsubuhm 17, 7, 17 + vmladduhm 4, 4, 3, 8 + vmladduhm 9, 9, 3, 12 + vmladduhm 13, 13, 3, 16 + vmladduhm 17, 17, 3, 20 + stxvx 36, 3, 9 + stxvx 41, 3, 16 + stxvx 45, 3, 18 + stxvx 49, 3, 20 + xxlor 37, 0, 0 + xxlor 34, 2, 2 + xxlor 35, 3, 3 + xxlor 36, 4, 4 + lvx 10, 0, 14 + addi 14, 14, 16 + vmladduhm 15, 25, 10, 3 + vmladduhm 20, 26, 10, 3 + vmladduhm 27, 30, 10, 3 + vmladduhm 28, 31, 10, 3 + vmhraddshs 14, 25, 10, 3 + vmhraddshs 19, 26, 10, 3 + vmhraddshs 24, 30, 10, 3 + vmhraddshs 29, 31, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 27, 2, 3 + vmladduhm 30, 28, 2, 3 + vmhraddshs 15, 15, 5, 14 + 
vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 + stxvx 45, 3, 10 + stxvx 50, 3, 17 + stxvx 55, 3, 19 + stxvx 60, 3, 21 + li 5, 256 + mr 9, 5 + add 10, 7, 9 + addi 16, 9, 16 + addi 17, 10, 16 + addi 18, 16, 16 + addi 19, 17, 16 + addi 20, 18, 16 + addi 21, 19, 16 + lxvd2x 40, 3, 10 + lxvd2x 44, 3, 17 + lxvd2x 48, 3, 19 + lxvd2x 52, 3, 21 + xxswapd 40, 40 + xxswapd 44, 44 + xxswapd 48, 48 + xxswapd 52, 52 + lxvd2x 53, 3, 9 + lxvd2x 54, 3, 16 + lxvd2x 55, 3, 18 + lxvd2x 56, 3, 20 + xxswapd 53, 53 + xxswapd 54, 54 + xxswapd 55, 55 + xxswapd 56, 56 + vsubuhm 25, 8, 21 + vsubuhm 26, 12, 22 + vsubuhm 30, 16, 23 + vsubuhm 31, 20, 24 + vadduhm 8, 8, 21 + vadduhm 12, 12, 22 + vadduhm 16, 16, 23 + vadduhm 20, 20, 24 + vxor 7, 7, 7 + xxlor 35, 6, 6 + xxlor 33, 7, 7 + xxlor 34, 8, 8 + vmulosh 6, 8, 0 + vmulesh 5, 8, 0 + vmulosh 11, 12, 0 + vmulesh 10, 12, 0 + vmulosh 15, 16, 0 + vmulesh 14, 16, 0 + vmulosh 19, 20, 0 + vmulesh 18, 20, 0 + xxmrglw 36, 37, 38 + xxmrghw 37, 37, 38 + xxmrglw 41, 42, 43 + xxmrghw 42, 42, 43 + xxmrglw 45, 46, 47 + xxmrghw 46, 46, 47 + xxmrglw 49, 50, 51 + xxmrghw 50, 50, 51 + vadduwm 4, 4, 1 + vadduwm 5, 5, 1 + vadduwm 9, 9, 1 + vadduwm 10, 10, 1 + vadduwm 13, 13, 1 + vadduwm 14, 14, 1 + vadduwm 17, 17, 1 + vadduwm 18, 18, 1 + vsraw 4, 4, 2 + vsraw 5, 5, 2 + vsraw 9, 9, 2 + vsraw 10, 10, 2 + vsraw 13, 13, 2 + vsraw 14, 14, 2 + vsraw 17, 17, 2 + vsraw 18, 18, 2 + vpkuwum 4, 5, 4 + vsubuhm 4, 7, 4 + vpkuwum 9, 10, 9 + vsubuhm 9, 7, 9 + vpkuwum 13, 14, 13 + vsubuhm 13, 7, 13 + vpkuwum 17, 18, 17 + vsubuhm 17, 7, 17 + vmladduhm 4, 4, 3, 8 + vmladduhm 9, 9, 3, 12 + vmladduhm 13, 13, 3, 16 + vmladduhm 17, 17, 3, 20 + stxvx 36, 3, 9 + stxvx 41, 3, 16 + stxvx 45, 3, 18 + stxvx 49, 3, 20 + xxlor 37, 0, 0 + xxlor 34, 2, 2 + xxlor 35, 3, 3 + xxlor 36, 4, 4 + lvx 10, 0, 14 + addi 14, 14, 16 + vmladduhm 15, 25, 10, 3 + vmladduhm 20, 26, 10, 3 + vmladduhm 27, 30, 
10, 3 + vmladduhm 28, 31, 10, 3 + vmhraddshs 14, 25, 10, 3 + vmhraddshs 19, 26, 10, 3 + vmhraddshs 24, 30, 10, 3 + vmhraddshs 29, 31, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 27, 2, 3 + vmladduhm 30, 28, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 + stxvx 45, 3, 10 + stxvx 50, 3, 17 + stxvx 55, 3, 19 + stxvx 60, 3, 21 + li 5, 384 + mr 9, 5 + add 10, 7, 9 + addi 16, 9, 16 + addi 17, 10, 16 + addi 18, 16, 16 + addi 19, 17, 16 + addi 20, 18, 16 + addi 21, 19, 16 + lxvd2x 40, 3, 10 + lxvd2x 44, 3, 17 + lxvd2x 48, 3, 19 + lxvd2x 52, 3, 21 + xxswapd 40, 40 + xxswapd 44, 44 + xxswapd 48, 48 + xxswapd 52, 52 + lxvd2x 53, 3, 9 + lxvd2x 54, 3, 16 + lxvd2x 55, 3, 18 + lxvd2x 56, 3, 20 + xxswapd 53, 53 + xxswapd 54, 54 + xxswapd 55, 55 + xxswapd 56, 56 + vsubuhm 25, 8, 21 + vsubuhm 26, 12, 22 + vsubuhm 30, 16, 23 + vsubuhm 31, 20, 24 + vadduhm 8, 8, 21 + vadduhm 12, 12, 22 + vadduhm 16, 16, 23 + vadduhm 20, 20, 24 + vxor 7, 7, 7 + xxlor 35, 6, 6 + xxlor 33, 7, 7 + xxlor 34, 8, 8 + vmulosh 6, 8, 0 + vmulesh 5, 8, 0 + vmulosh 11, 12, 0 + vmulesh 10, 12, 0 + vmulosh 15, 16, 0 + vmulesh 14, 16, 0 + vmulosh 19, 20, 0 + vmulesh 18, 20, 0 + xxmrglw 36, 37, 38 + xxmrghw 37, 37, 38 + xxmrglw 41, 42, 43 + xxmrghw 42, 42, 43 + xxmrglw 45, 46, 47 + xxmrghw 46, 46, 47 + xxmrglw 49, 50, 51 + xxmrghw 50, 50, 51 + vadduwm 4, 4, 1 + vadduwm 5, 5, 1 + vadduwm 9, 9, 1 + vadduwm 10, 10, 1 + vadduwm 13, 13, 1 + vadduwm 14, 14, 1 + vadduwm 17, 17, 1 + vadduwm 18, 18, 1 + vsraw 4, 4, 2 + vsraw 5, 5, 2 + vsraw 9, 9, 2 + vsraw 10, 10, 2 + vsraw 13, 13, 2 + vsraw 14, 14, 2 + vsraw 17, 17, 2 + vsraw 18, 18, 2 + vpkuwum 4, 5, 4 + vsubuhm 4, 7, 4 + vpkuwum 9, 10, 9 + vsubuhm 9, 7, 9 + vpkuwum 13, 14, 13 + vsubuhm 13, 7, 13 + vpkuwum 17, 18, 17 + vsubuhm 17, 7, 17 + vmladduhm 4, 4, 3, 8 + vmladduhm 9, 9, 3, 12 + vmladduhm 13, 13, 3, 16 + 
vmladduhm 17, 17, 3, 20 + stxvx 36, 3, 9 + stxvx 41, 3, 16 + stxvx 45, 3, 18 + stxvx 49, 3, 20 + xxlor 37, 0, 0 + xxlor 34, 2, 2 + xxlor 35, 3, 3 + xxlor 36, 4, 4 + lvx 10, 0, 14 + addi 14, 14, 16 + vmladduhm 15, 25, 10, 3 + vmladduhm 20, 26, 10, 3 + vmladduhm 27, 30, 10, 3 + vmladduhm 28, 31, 10, 3 + vmhraddshs 14, 25, 10, 3 + vmhraddshs 19, 26, 10, 3 + vmhraddshs 24, 30, 10, 3 + vmhraddshs 29, 31, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 27, 2, 3 + vmladduhm 30, 28, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 + stxvx 45, 3, 10 + stxvx 50, 3, 17 + stxvx 55, 3, 19 + stxvx 60, 3, 21 + nop + nop + ori 2, 2, 0 + li 5, 0 + li 7, 128 + mr 9, 5 + add 10, 7, 9 + addi 16, 9, 16 + addi 17, 10, 16 + addi 18, 16, 16 + addi 19, 17, 16 + addi 20, 18, 16 + addi 21, 19, 16 + lxvd2x 40, 3, 10 + lxvd2x 44, 3, 17 + lxvd2x 48, 3, 19 + lxvd2x 52, 3, 21 + xxswapd 40, 40 + xxswapd 44, 44 + xxswapd 48, 48 + xxswapd 52, 52 + lxvd2x 53, 3, 9 + lxvd2x 54, 3, 16 + lxvd2x 55, 3, 18 + lxvd2x 56, 3, 20 + xxswapd 53, 53 + xxswapd 54, 54 + xxswapd 55, 55 + xxswapd 56, 56 + vsubuhm 25, 8, 21 + vsubuhm 26, 12, 22 + vsubuhm 30, 16, 23 + vsubuhm 31, 20, 24 + vadduhm 8, 8, 21 + vadduhm 12, 12, 22 + vadduhm 16, 16, 23 + vadduhm 20, 20, 24 + vxor 7, 7, 7 + xxlor 35, 6, 6 + xxlor 33, 7, 7 + xxlor 34, 8, 8 + vmulosh 6, 8, 0 + vmulesh 5, 8, 0 + vmulosh 11, 12, 0 + vmulesh 10, 12, 0 + vmulosh 15, 16, 0 + vmulesh 14, 16, 0 + vmulosh 19, 20, 0 + vmulesh 18, 20, 0 + xxmrglw 36, 37, 38 + xxmrghw 37, 37, 38 + xxmrglw 41, 42, 43 + xxmrghw 42, 42, 43 + xxmrglw 45, 46, 47 + xxmrghw 46, 46, 47 + xxmrglw 49, 50, 51 + xxmrghw 50, 50, 51 + vadduwm 4, 4, 1 + vadduwm 5, 5, 1 + vadduwm 9, 9, 1 + vadduwm 10, 10, 1 + vadduwm 13, 13, 1 + vadduwm 14, 14, 1 + vadduwm 17, 17, 1 + vadduwm 18, 18, 1 + vsraw 4, 4, 2 + vsraw 5, 5, 2 + vsraw 9, 9, 2 + vsraw 10, 
10, 2 + vsraw 13, 13, 2 + vsraw 14, 14, 2 + vsraw 17, 17, 2 + vsraw 18, 18, 2 + vpkuwum 4, 5, 4 + vsubuhm 4, 7, 4 + vpkuwum 9, 10, 9 + vsubuhm 9, 7, 9 + vpkuwum 13, 14, 13 + vsubuhm 13, 7, 13 + vpkuwum 17, 18, 17 + vsubuhm 17, 7, 17 + vmladduhm 4, 4, 3, 8 + vmladduhm 9, 9, 3, 12 + vmladduhm 13, 13, 3, 16 + vmladduhm 17, 17, 3, 20 + stxvx 36, 3, 9 + stxvx 41, 3, 16 + stxvx 45, 3, 18 + stxvx 49, 3, 20 + xxlor 37, 0, 0 + xxlor 34, 2, 2 + xxlor 35, 3, 3 + xxlor 36, 4, 4 + lvx 10, 0, 14 + addi 14, 14, 16 + vmladduhm 15, 25, 10, 3 + vmladduhm 20, 26, 10, 3 + vmladduhm 27, 30, 10, 3 + vmladduhm 28, 31, 10, 3 + vmhraddshs 14, 25, 10, 3 + vmhraddshs 19, 26, 10, 3 + vmhraddshs 24, 30, 10, 3 + vmhraddshs 29, 31, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 27, 2, 3 + vmladduhm 30, 28, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 + stxvx 45, 3, 10 + stxvx 50, 3, 17 + stxvx 55, 3, 19 + stxvx 60, 3, 21 + li 5, 64 + mr 9, 5 + add 10, 7, 9 + addi 16, 9, 16 + addi 17, 10, 16 + addi 18, 16, 16 + addi 19, 17, 16 + addi 20, 18, 16 + addi 21, 19, 16 + lxvd2x 40, 3, 10 + lxvd2x 44, 3, 17 + lxvd2x 48, 3, 19 + lxvd2x 52, 3, 21 + xxswapd 40, 40 + xxswapd 44, 44 + xxswapd 48, 48 + xxswapd 52, 52 + lxvd2x 53, 3, 9 + lxvd2x 54, 3, 16 + lxvd2x 55, 3, 18 + lxvd2x 56, 3, 20 + xxswapd 53, 53 + xxswapd 54, 54 + xxswapd 55, 55 + xxswapd 56, 56 + vsubuhm 25, 8, 21 + vsubuhm 26, 12, 22 + vsubuhm 30, 16, 23 + vsubuhm 31, 20, 24 + vadduhm 8, 8, 21 + vadduhm 12, 12, 22 + vadduhm 16, 16, 23 + vadduhm 20, 20, 24 + vxor 7, 7, 7 + xxlor 35, 6, 6 + xxlor 33, 7, 7 + xxlor 34, 8, 8 + vmulosh 6, 8, 0 + vmulesh 5, 8, 0 + vmulosh 11, 12, 0 + vmulesh 10, 12, 0 + vmulosh 15, 16, 0 + vmulesh 14, 16, 0 + vmulosh 19, 20, 0 + vmulesh 18, 20, 0 + xxmrglw 36, 37, 38 + xxmrghw 37, 37, 38 + xxmrglw 41, 42, 43 + xxmrghw 42, 42, 43 + xxmrglw 45, 46, 47 + 
xxmrghw 46, 46, 47 + xxmrglw 49, 50, 51 + xxmrghw 50, 50, 51 + vadduwm 4, 4, 1 + vadduwm 5, 5, 1 + vadduwm 9, 9, 1 + vadduwm 10, 10, 1 + vadduwm 13, 13, 1 + vadduwm 14, 14, 1 + vadduwm 17, 17, 1 + vadduwm 18, 18, 1 + vsraw 4, 4, 2 + vsraw 5, 5, 2 + vsraw 9, 9, 2 + vsraw 10, 10, 2 + vsraw 13, 13, 2 + vsraw 14, 14, 2 + vsraw 17, 17, 2 + vsraw 18, 18, 2 + vpkuwum 4, 5, 4 + vsubuhm 4, 7, 4 + vpkuwum 9, 10, 9 + vsubuhm 9, 7, 9 + vpkuwum 13, 14, 13 + vsubuhm 13, 7, 13 + vpkuwum 17, 18, 17 + vsubuhm 17, 7, 17 + vmladduhm 4, 4, 3, 8 + vmladduhm 9, 9, 3, 12 + vmladduhm 13, 13, 3, 16 + vmladduhm 17, 17, 3, 20 + stxvx 36, 3, 9 + stxvx 41, 3, 16 + stxvx 45, 3, 18 + stxvx 49, 3, 20 + xxlor 37, 0, 0 + xxlor 34, 2, 2 + xxlor 35, 3, 3 + xxlor 36, 4, 4 + lxv 42, -16(14) + vmladduhm 15, 25, 10, 3 + vmladduhm 20, 26, 10, 3 + vmladduhm 27, 30, 10, 3 + vmladduhm 28, 31, 10, 3 + vmhraddshs 14, 25, 10, 3 + vmhraddshs 19, 26, 10, 3 + vmhraddshs 24, 30, 10, 3 + vmhraddshs 29, 31, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 27, 2, 3 + vmladduhm 30, 28, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 + stxvx 45, 3, 10 + stxvx 50, 3, 17 + stxvx 55, 3, 19 + stxvx 60, 3, 21 + li 5, 256 + mr 9, 5 + add 10, 7, 9 + addi 16, 9, 16 + addi 17, 10, 16 + addi 18, 16, 16 + addi 19, 17, 16 + addi 20, 18, 16 + addi 21, 19, 16 + lxvd2x 40, 3, 10 + lxvd2x 44, 3, 17 + lxvd2x 48, 3, 19 + lxvd2x 52, 3, 21 + xxswapd 40, 40 + xxswapd 44, 44 + xxswapd 48, 48 + xxswapd 52, 52 + lxvd2x 53, 3, 9 + lxvd2x 54, 3, 16 + lxvd2x 55, 3, 18 + lxvd2x 56, 3, 20 + xxswapd 53, 53 + xxswapd 54, 54 + xxswapd 55, 55 + xxswapd 56, 56 + vsubuhm 25, 8, 21 + vsubuhm 26, 12, 22 + vsubuhm 30, 16, 23 + vsubuhm 31, 20, 24 + vadduhm 8, 8, 21 + vadduhm 12, 12, 22 + vadduhm 16, 16, 23 + vadduhm 20, 20, 24 + vxor 7, 7, 7 + xxlor 35, 6, 6 + xxlor 33, 7, 7 + xxlor 34, 8, 8 + vmulosh 
6, 8, 0 + vmulesh 5, 8, 0 + vmulosh 11, 12, 0 + vmulesh 10, 12, 0 + vmulosh 15, 16, 0 + vmulesh 14, 16, 0 + vmulosh 19, 20, 0 + vmulesh 18, 20, 0 + xxmrglw 36, 37, 38 + xxmrghw 37, 37, 38 + xxmrglw 41, 42, 43 + xxmrghw 42, 42, 43 + xxmrglw 45, 46, 47 + xxmrghw 46, 46, 47 + xxmrglw 49, 50, 51 + xxmrghw 50, 50, 51 + vadduwm 4, 4, 1 + vadduwm 5, 5, 1 + vadduwm 9, 9, 1 + vadduwm 10, 10, 1 + vadduwm 13, 13, 1 + vadduwm 14, 14, 1 + vadduwm 17, 17, 1 + vadduwm 18, 18, 1 + vsraw 4, 4, 2 + vsraw 5, 5, 2 + vsraw 9, 9, 2 + vsraw 10, 10, 2 + vsraw 13, 13, 2 + vsraw 14, 14, 2 + vsraw 17, 17, 2 + vsraw 18, 18, 2 + vpkuwum 4, 5, 4 + vsubuhm 4, 7, 4 + vpkuwum 9, 10, 9 + vsubuhm 9, 7, 9 + vpkuwum 13, 14, 13 + vsubuhm 13, 7, 13 + vpkuwum 17, 18, 17 + vsubuhm 17, 7, 17 + vmladduhm 4, 4, 3, 8 + vmladduhm 9, 9, 3, 12 + vmladduhm 13, 13, 3, 16 + vmladduhm 17, 17, 3, 20 + stxvx 36, 3, 9 + stxvx 41, 3, 16 + stxvx 45, 3, 18 + stxvx 49, 3, 20 + xxlor 37, 0, 0 + xxlor 34, 2, 2 + xxlor 35, 3, 3 + xxlor 36, 4, 4 + lvx 10, 0, 14 + addi 14, 14, 16 + vmladduhm 15, 25, 10, 3 + vmladduhm 20, 26, 10, 3 + vmladduhm 27, 30, 10, 3 + vmladduhm 28, 31, 10, 3 + vmhraddshs 14, 25, 10, 3 + vmhraddshs 19, 26, 10, 3 + vmhraddshs 24, 30, 10, 3 + vmhraddshs 29, 31, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 27, 2, 3 + vmladduhm 30, 28, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 + stxvx 45, 3, 10 + stxvx 50, 3, 17 + stxvx 55, 3, 19 + stxvx 60, 3, 21 + li 5, 320 + mr 9, 5 + add 10, 7, 9 + addi 16, 9, 16 + addi 17, 10, 16 + addi 18, 16, 16 + addi 19, 17, 16 + addi 20, 18, 16 + addi 21, 19, 16 + lxvd2x 40, 3, 10 + lxvd2x 44, 3, 17 + lxvd2x 48, 3, 19 + lxvd2x 52, 3, 21 + xxswapd 40, 40 + xxswapd 44, 44 + xxswapd 48, 48 + xxswapd 52, 52 + lxvd2x 53, 3, 9 + lxvd2x 54, 3, 16 + lxvd2x 55, 3, 18 + lxvd2x 56, 3, 20 + xxswapd 53, 53 + xxswapd 54, 54 + 
xxswapd 55, 55 + xxswapd 56, 56 + vsubuhm 25, 8, 21 + vsubuhm 26, 12, 22 + vsubuhm 30, 16, 23 + vsubuhm 31, 20, 24 + vadduhm 8, 8, 21 + vadduhm 12, 12, 22 + vadduhm 16, 16, 23 + vadduhm 20, 20, 24 + vxor 7, 7, 7 + xxlor 35, 6, 6 + xxlor 33, 7, 7 + xxlor 34, 8, 8 + vmulosh 6, 8, 0 + vmulesh 5, 8, 0 + vmulosh 11, 12, 0 + vmulesh 10, 12, 0 + vmulosh 15, 16, 0 + vmulesh 14, 16, 0 + vmulosh 19, 20, 0 + vmulesh 18, 20, 0 + xxmrglw 36, 37, 38 + xxmrghw 37, 37, 38 + xxmrglw 41, 42, 43 + xxmrghw 42, 42, 43 + xxmrglw 45, 46, 47 + xxmrghw 46, 46, 47 + xxmrglw 49, 50, 51 + xxmrghw 50, 50, 51 + vadduwm 4, 4, 1 + vadduwm 5, 5, 1 + vadduwm 9, 9, 1 + vadduwm 10, 10, 1 + vadduwm 13, 13, 1 + vadduwm 14, 14, 1 + vadduwm 17, 17, 1 + vadduwm 18, 18, 1 + vsraw 4, 4, 2 + vsraw 5, 5, 2 + vsraw 9, 9, 2 + vsraw 10, 10, 2 + vsraw 13, 13, 2 + vsraw 14, 14, 2 + vsraw 17, 17, 2 + vsraw 18, 18, 2 + vpkuwum 4, 5, 4 + vsubuhm 4, 7, 4 + vpkuwum 9, 10, 9 + vsubuhm 9, 7, 9 + vpkuwum 13, 14, 13 + vsubuhm 13, 7, 13 + vpkuwum 17, 18, 17 + vsubuhm 17, 7, 17 + vmladduhm 4, 4, 3, 8 + vmladduhm 9, 9, 3, 12 + vmladduhm 13, 13, 3, 16 + vmladduhm 17, 17, 3, 20 + stxvx 36, 3, 9 + stxvx 41, 3, 16 + stxvx 45, 3, 18 + stxvx 49, 3, 20 + xxlor 37, 0, 0 + xxlor 34, 2, 2 + xxlor 35, 3, 3 + xxlor 36, 4, 4 + lxv 42, -16(14) + vmladduhm 15, 25, 10, 3 + vmladduhm 20, 26, 10, 3 + vmladduhm 27, 30, 10, 3 + vmladduhm 28, 31, 10, 3 + vmhraddshs 14, 25, 10, 3 + vmhraddshs 19, 26, 10, 3 + vmhraddshs 24, 30, 10, 3 + vmhraddshs 29, 31, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 27, 2, 3 + vmladduhm 30, 28, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 + stxvx 45, 3, 10 + stxvx 50, 3, 17 + stxvx 55, 3, 19 + stxvx 60, 3, 21 + ori 2, 2, 0 + li 5, 0 + li 7, 256 + mr 9, 5 + add 10, 7, 9 + addi 16, 9, 16 + addi 17, 10, 16 + addi 18, 16, 16 + addi 19, 17, 16 + addi 20, 
18, 16 + addi 21, 19, 16 + lxvd2x 40, 3, 10 + lxvd2x 44, 3, 17 + lxvd2x 48, 3, 19 + lxvd2x 52, 3, 21 + xxswapd 40, 40 + xxswapd 44, 44 + xxswapd 48, 48 + xxswapd 52, 52 + lxvd2x 53, 3, 9 + lxvd2x 54, 3, 16 + lxvd2x 55, 3, 18 + lxvd2x 56, 3, 20 + xxswapd 53, 53 + xxswapd 54, 54 + xxswapd 55, 55 + xxswapd 56, 56 + vsubuhm 25, 8, 21 + vsubuhm 26, 12, 22 + vsubuhm 30, 16, 23 + vsubuhm 31, 20, 24 + vadduhm 8, 8, 21 + vadduhm 12, 12, 22 + vadduhm 16, 16, 23 + vadduhm 20, 20, 24 + vxor 7, 7, 7 + xxlor 35, 6, 6 + xxlor 33, 7, 7 + xxlor 34, 8, 8 + vmulosh 6, 8, 0 + vmulesh 5, 8, 0 + vmulosh 11, 12, 0 + vmulesh 10, 12, 0 + vmulosh 15, 16, 0 + vmulesh 14, 16, 0 + vmulosh 19, 20, 0 + vmulesh 18, 20, 0 + xxmrglw 36, 37, 38 + xxmrghw 37, 37, 38 + xxmrglw 41, 42, 43 + xxmrghw 42, 42, 43 + xxmrglw 45, 46, 47 + xxmrghw 46, 46, 47 + xxmrglw 49, 50, 51 + xxmrghw 50, 50, 51 + vadduwm 4, 4, 1 + vadduwm 5, 5, 1 + vadduwm 9, 9, 1 + vadduwm 10, 10, 1 + vadduwm 13, 13, 1 + vadduwm 14, 14, 1 + vadduwm 17, 17, 1 + vadduwm 18, 18, 1 + vsraw 4, 4, 2 + vsraw 5, 5, 2 + vsraw 9, 9, 2 + vsraw 10, 10, 2 + vsraw 13, 13, 2 + vsraw 14, 14, 2 + vsraw 17, 17, 2 + vsraw 18, 18, 2 + vpkuwum 4, 5, 4 + vsubuhm 4, 7, 4 + vpkuwum 9, 10, 9 + vsubuhm 9, 7, 9 + vpkuwum 13, 14, 13 + vsubuhm 13, 7, 13 + vpkuwum 17, 18, 17 + vsubuhm 17, 7, 17 + vmladduhm 4, 4, 3, 8 + vmladduhm 9, 9, 3, 12 + vmladduhm 13, 13, 3, 16 + vmladduhm 17, 17, 3, 20 + stxvx 36, 3, 9 + stxvx 41, 3, 16 + stxvx 45, 3, 18 + stxvx 49, 3, 20 + xxlor 37, 0, 0 + xxlor 34, 2, 2 + xxlor 35, 3, 3 + xxlor 36, 4, 4 + lvx 10, 0, 14 + xxlor 9, 42, 42 + vmladduhm 15, 25, 10, 3 + vmladduhm 20, 26, 10, 3 + vmladduhm 27, 30, 10, 3 + vmladduhm 28, 31, 10, 3 + vmhraddshs 14, 25, 10, 3 + vmhraddshs 19, 26, 10, 3 + vmhraddshs 24, 30, 10, 3 + vmhraddshs 29, 31, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 27, 2, 3 + vmladduhm 30, 28, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 
5, 29 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 + stxvx 45, 3, 10 + stxvx 50, 3, 17 + stxvx 55, 3, 19 + stxvx 60, 3, 21 + li 5, 64 + mr 9, 5 + add 10, 7, 9 + addi 16, 9, 16 + addi 17, 10, 16 + addi 18, 16, 16 + addi 19, 17, 16 + addi 20, 18, 16 + addi 21, 19, 16 + lxvd2x 40, 3, 10 + lxvd2x 44, 3, 17 + lxvd2x 48, 3, 19 + lxvd2x 52, 3, 21 + xxswapd 40, 40 + xxswapd 44, 44 + xxswapd 48, 48 + xxswapd 52, 52 + lxvd2x 53, 3, 9 + lxvd2x 54, 3, 16 + lxvd2x 55, 3, 18 + lxvd2x 56, 3, 20 + xxswapd 53, 53 + xxswapd 54, 54 + xxswapd 55, 55 + xxswapd 56, 56 + vsubuhm 25, 8, 21 + vsubuhm 26, 12, 22 + vsubuhm 30, 16, 23 + vsubuhm 31, 20, 24 + vadduhm 8, 8, 21 + vadduhm 12, 12, 22 + vadduhm 16, 16, 23 + vadduhm 20, 20, 24 + vxor 7, 7, 7 + xxlor 35, 6, 6 + xxlor 33, 7, 7 + xxlor 34, 8, 8 + vmulosh 6, 8, 0 + vmulesh 5, 8, 0 + vmulosh 11, 12, 0 + vmulesh 10, 12, 0 + vmulosh 15, 16, 0 + vmulesh 14, 16, 0 + vmulosh 19, 20, 0 + vmulesh 18, 20, 0 + xxmrglw 36, 37, 38 + xxmrghw 37, 37, 38 + xxmrglw 41, 42, 43 + xxmrghw 42, 42, 43 + xxmrglw 45, 46, 47 + xxmrghw 46, 46, 47 + xxmrglw 49, 50, 51 + xxmrghw 50, 50, 51 + vadduwm 4, 4, 1 + vadduwm 5, 5, 1 + vadduwm 9, 9, 1 + vadduwm 10, 10, 1 + vadduwm 13, 13, 1 + vadduwm 14, 14, 1 + vadduwm 17, 17, 1 + vadduwm 18, 18, 1 + vsraw 4, 4, 2 + vsraw 5, 5, 2 + vsraw 9, 9, 2 + vsraw 10, 10, 2 + vsraw 13, 13, 2 + vsraw 14, 14, 2 + vsraw 17, 17, 2 + vsraw 18, 18, 2 + vpkuwum 4, 5, 4 + vsubuhm 4, 7, 4 + vpkuwum 9, 10, 9 + vsubuhm 9, 7, 9 + vpkuwum 13, 14, 13 + vsubuhm 13, 7, 13 + vpkuwum 17, 18, 17 + vsubuhm 17, 7, 17 + vmladduhm 4, 4, 3, 8 + vmladduhm 9, 9, 3, 12 + vmladduhm 13, 13, 3, 16 + vmladduhm 17, 17, 3, 20 + stxvx 36, 3, 9 + stxvx 41, 3, 16 + stxvx 45, 3, 18 + stxvx 49, 3, 20 + xxlor 37, 0, 0 + xxlor 34, 2, 2 + xxlor 35, 3, 3 + xxlor 36, 4, 4 + xxlor 42, 9, 9 + vmladduhm 15, 25, 10, 3 + vmladduhm 20, 26, 10, 3 + vmladduhm 27, 30, 10, 3 + vmladduhm 28, 31, 10, 3 + vmhraddshs 14, 25, 10, 3 + vmhraddshs 19, 26, 10, 3 + 
vmhraddshs 24, 30, 10, 3 + vmhraddshs 29, 31, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 27, 2, 3 + vmladduhm 30, 28, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 + stxvx 45, 3, 10 + stxvx 50, 3, 17 + stxvx 55, 3, 19 + stxvx 60, 3, 21 + li 5, 128 + mr 9, 5 + add 10, 7, 9 + addi 16, 9, 16 + addi 17, 10, 16 + addi 18, 16, 16 + addi 19, 17, 16 + addi 20, 18, 16 + addi 21, 19, 16 + lxvd2x 40, 3, 10 + lxvd2x 44, 3, 17 + lxvd2x 48, 3, 19 + lxvd2x 52, 3, 21 + xxswapd 40, 40 + xxswapd 44, 44 + xxswapd 48, 48 + xxswapd 52, 52 + lxvd2x 53, 3, 9 + lxvd2x 54, 3, 16 + lxvd2x 55, 3, 18 + lxvd2x 56, 3, 20 + xxswapd 53, 53 + xxswapd 54, 54 + xxswapd 55, 55 + xxswapd 56, 56 + vsubuhm 25, 8, 21 + vsubuhm 26, 12, 22 + vsubuhm 30, 16, 23 + vsubuhm 31, 20, 24 + vadduhm 8, 8, 21 + vadduhm 12, 12, 22 + vadduhm 16, 16, 23 + vadduhm 20, 20, 24 + vxor 7, 7, 7 + xxlor 35, 6, 6 + xxlor 33, 7, 7 + xxlor 34, 8, 8 + vmulosh 6, 8, 0 + vmulesh 5, 8, 0 + vmulosh 11, 12, 0 + vmulesh 10, 12, 0 + vmulosh 15, 16, 0 + vmulesh 14, 16, 0 + vmulosh 19, 20, 0 + vmulesh 18, 20, 0 + xxmrglw 36, 37, 38 + xxmrghw 37, 37, 38 + xxmrglw 41, 42, 43 + xxmrghw 42, 42, 43 + xxmrglw 45, 46, 47 + xxmrghw 46, 46, 47 + xxmrglw 49, 50, 51 + xxmrghw 50, 50, 51 + vadduwm 4, 4, 1 + vadduwm 5, 5, 1 + vadduwm 9, 9, 1 + vadduwm 10, 10, 1 + vadduwm 13, 13, 1 + vadduwm 14, 14, 1 + vadduwm 17, 17, 1 + vadduwm 18, 18, 1 + vsraw 4, 4, 2 + vsraw 5, 5, 2 + vsraw 9, 9, 2 + vsraw 10, 10, 2 + vsraw 13, 13, 2 + vsraw 14, 14, 2 + vsraw 17, 17, 2 + vsraw 18, 18, 2 + vpkuwum 4, 5, 4 + vsubuhm 4, 7, 4 + vpkuwum 9, 10, 9 + vsubuhm 9, 7, 9 + vpkuwum 13, 14, 13 + vsubuhm 13, 7, 13 + vpkuwum 17, 18, 17 + vsubuhm 17, 7, 17 + vmladduhm 4, 4, 3, 8 + vmladduhm 9, 9, 3, 12 + vmladduhm 13, 13, 3, 16 + vmladduhm 17, 17, 3, 20 + stxvx 36, 3, 9 + stxvx 41, 3, 16 + stxvx 45, 3, 18 + stxvx 
49, 3, 20 + xxlor 37, 0, 0 + xxlor 34, 2, 2 + xxlor 35, 3, 3 + xxlor 36, 4, 4 + xxlor 42, 9, 9 + vmladduhm 15, 25, 10, 3 + vmladduhm 20, 26, 10, 3 + vmladduhm 27, 30, 10, 3 + vmladduhm 28, 31, 10, 3 + vmhraddshs 14, 25, 10, 3 + vmhraddshs 19, 26, 10, 3 + vmhraddshs 24, 30, 10, 3 + vmhraddshs 29, 31, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 27, 2, 3 + vmladduhm 30, 28, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 + stxvx 45, 3, 10 + stxvx 50, 3, 17 + stxvx 55, 3, 19 + stxvx 60, 3, 21 + li 5, 192 + mr 9, 5 + add 10, 7, 9 + addi 16, 9, 16 + addi 17, 10, 16 + addi 18, 16, 16 + addi 19, 17, 16 + addi 20, 18, 16 + addi 21, 19, 16 + lxvd2x 40, 3, 10 + lxvd2x 44, 3, 17 + lxvd2x 48, 3, 19 + lxvd2x 52, 3, 21 + xxswapd 40, 40 + xxswapd 44, 44 + xxswapd 48, 48 + xxswapd 52, 52 + lxvd2x 53, 3, 9 + lxvd2x 54, 3, 16 + lxvd2x 55, 3, 18 + lxvd2x 56, 3, 20 + xxswapd 53, 53 + xxswapd 54, 54 + xxswapd 55, 55 + xxswapd 56, 56 + vsubuhm 25, 8, 21 + vsubuhm 26, 12, 22 + vsubuhm 30, 16, 23 + vsubuhm 31, 20, 24 + vadduhm 8, 8, 21 + vadduhm 12, 12, 22 + vadduhm 16, 16, 23 + vadduhm 20, 20, 24 + vxor 7, 7, 7 + xxlor 35, 6, 6 + xxlor 33, 7, 7 + xxlor 34, 8, 8 + vmulosh 6, 8, 0 + vmulesh 5, 8, 0 + vmulosh 11, 12, 0 + vmulesh 10, 12, 0 + vmulosh 15, 16, 0 + vmulesh 14, 16, 0 + vmulosh 19, 20, 0 + vmulesh 18, 20, 0 + xxmrglw 36, 37, 38 + xxmrghw 37, 37, 38 + xxmrglw 41, 42, 43 + xxmrghw 42, 42, 43 + xxmrglw 45, 46, 47 + xxmrghw 46, 46, 47 + xxmrglw 49, 50, 51 + xxmrghw 50, 50, 51 + vadduwm 4, 4, 1 + vadduwm 5, 5, 1 + vadduwm 9, 9, 1 + vadduwm 10, 10, 1 + vadduwm 13, 13, 1 + vadduwm 14, 14, 1 + vadduwm 17, 17, 1 + vadduwm 18, 18, 1 + vsraw 4, 4, 2 + vsraw 5, 5, 2 + vsraw 9, 9, 2 + vsraw 10, 10, 2 + vsraw 13, 13, 2 + vsraw 14, 14, 2 + vsraw 17, 17, 2 + vsraw 18, 18, 2 + vpkuwum 4, 5, 4 + vsubuhm 4, 7, 4 + vpkuwum 9, 10, 9 + 
vsubuhm 9, 7, 9 + vpkuwum 13, 14, 13 + vsubuhm 13, 7, 13 + vpkuwum 17, 18, 17 + vsubuhm 17, 7, 17 + vmladduhm 4, 4, 3, 8 + vmladduhm 9, 9, 3, 12 + vmladduhm 13, 13, 3, 16 + vmladduhm 17, 17, 3, 20 + stxvx 36, 3, 9 + stxvx 41, 3, 16 + stxvx 45, 3, 18 + stxvx 49, 3, 20 + xxlor 37, 0, 0 + xxlor 34, 2, 2 + xxlor 35, 3, 3 + xxlor 36, 4, 4 + xxlor 42, 9, 9 + vmladduhm 15, 25, 10, 3 + vmladduhm 20, 26, 10, 3 + vmladduhm 27, 30, 10, 3 + vmladduhm 28, 31, 10, 3 + vmhraddshs 14, 25, 10, 3 + vmhraddshs 19, 26, 10, 3 + vmhraddshs 24, 30, 10, 3 + vmhraddshs 29, 31, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 27, 2, 3 + vmladduhm 30, 28, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 + stxvx 45, 3, 10 + stxvx 50, 3, 17 + stxvx 55, 3, 19 + stxvx 60, 3, 21 + nop + ori 2, 2, 0 + addi 14, 4, 64 + lvx 10, 0, 14 + lxv 57, 0(3) + lxv 58, 16(3) + lxv 62, 32(3) + lxv 63, 48(3) + addi 3, 3, 64 + vmladduhm 15, 25, 10, 3 + vmladduhm 20, 26, 10, 3 + vmladduhm 27, 30, 10, 3 + vmladduhm 28, 31, 10, 3 + vmhraddshs 14, 25, 10, 3 + vmhraddshs 19, 26, 10, 3 + vmhraddshs 24, 30, 10, 3 + vmhraddshs 29, 31, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 27, 2, 3 + vmladduhm 30, 28, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 6, 15, 4 + vsrah 7, 20, 4 + vsrah 8, 25, 4 + vsrah 9, 30, 4 + lxv 57, 0(3) + lxv 58, 16(3) + lxv 62, 32(3) + lxv 63, 48(3) + addi 3, 3, 64 + vmladduhm 15, 25, 10, 3 + vmladduhm 20, 26, 10, 3 + vmladduhm 27, 30, 10, 3 + vmladduhm 28, 31, 10, 3 + vmhraddshs 14, 25, 10, 3 + vmhraddshs 19, 26, 10, 3 + vmhraddshs 24, 30, 10, 3 + vmhraddshs 29, 31, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 27, 2, 3 + vmladduhm 30, 28, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 
25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 + stxv 38, -128(3) + stxv 39, -112(3) + stxv 40, -96(3) + stxv 41, -80(3) + stxv 45, -64(3) + stxv 50, -48(3) + stxv 55, -32(3) + stxv 60, -16(3) + lxv 57, 0(3) + lxv 58, 16(3) + lxv 62, 32(3) + lxv 63, 48(3) + addi 3, 3, 64 + vmladduhm 15, 25, 10, 3 + vmladduhm 20, 26, 10, 3 + vmladduhm 27, 30, 10, 3 + vmladduhm 28, 31, 10, 3 + vmhraddshs 14, 25, 10, 3 + vmhraddshs 19, 26, 10, 3 + vmhraddshs 24, 30, 10, 3 + vmhraddshs 29, 31, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 27, 2, 3 + vmladduhm 30, 28, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 6, 15, 4 + vsrah 7, 20, 4 + vsrah 8, 25, 4 + vsrah 9, 30, 4 + lxv 57, 0(3) + lxv 58, 16(3) + lxv 62, 32(3) + lxv 63, 48(3) + addi 3, 3, 64 + vmladduhm 15, 25, 10, 3 + vmladduhm 20, 26, 10, 3 + vmladduhm 27, 30, 10, 3 + vmladduhm 28, 31, 10, 3 + vmhraddshs 14, 25, 10, 3 + vmhraddshs 19, 26, 10, 3 + vmhraddshs 24, 30, 10, 3 + vmhraddshs 29, 31, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 27, 2, 3 + vmladduhm 30, 28, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 + stxv 38, -128(3) + stxv 39, -112(3) + stxv 40, -96(3) + stxv 41, -80(3) + stxv 45, -64(3) + stxv 50, -48(3) + stxv 55, -32(3) + stxv 60, -16(3) + lxv 57, 0(3) + lxv 58, 16(3) + lxv 62, 32(3) + lxv 63, 48(3) + addi 3, 3, 64 + vmladduhm 15, 25, 10, 3 + vmladduhm 20, 26, 10, 3 + vmladduhm 27, 30, 10, 3 + vmladduhm 28, 31, 10, 3 + vmhraddshs 14, 25, 10, 3 + vmhraddshs 19, 26, 10, 3 + vmhraddshs 24, 30, 10, 3 + vmhraddshs 29, 31, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 27, 2, 3 + vmladduhm 30, 28, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 
25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 6, 15, 4 + vsrah 7, 20, 4 + vsrah 8, 25, 4 + vsrah 9, 30, 4 + lxv 57, 0(3) + lxv 58, 16(3) + lxv 62, 32(3) + lxv 63, 48(3) + addi 3, 3, 64 + vmladduhm 15, 25, 10, 3 + vmladduhm 20, 26, 10, 3 + vmladduhm 27, 30, 10, 3 + vmladduhm 28, 31, 10, 3 + vmhraddshs 14, 25, 10, 3 + vmhraddshs 19, 26, 10, 3 + vmhraddshs 24, 30, 10, 3 + vmhraddshs 29, 31, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 27, 2, 3 + vmladduhm 30, 28, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 + stxv 38, -128(3) + stxv 39, -112(3) + stxv 40, -96(3) + stxv 41, -80(3) + stxv 45, -64(3) + stxv 50, -48(3) + stxv 55, -32(3) + stxv 60, -16(3) + lxv 57, 0(3) + lxv 58, 16(3) + lxv 62, 32(3) + lxv 63, 48(3) + addi 3, 3, 64 + vmladduhm 15, 25, 10, 3 + vmladduhm 20, 26, 10, 3 + vmladduhm 27, 30, 10, 3 + vmladduhm 28, 31, 10, 3 + vmhraddshs 14, 25, 10, 3 + vmhraddshs 19, 26, 10, 3 + vmhraddshs 24, 30, 10, 3 + vmhraddshs 29, 31, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 27, 2, 3 + vmladduhm 30, 28, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 6, 15, 4 + vsrah 7, 20, 4 + vsrah 8, 25, 4 + vsrah 9, 30, 4 + lxv 57, 0(3) + lxv 58, 16(3) + lxv 62, 32(3) + lxv 63, 48(3) + addi 3, 3, 64 + vmladduhm 15, 25, 10, 3 + vmladduhm 20, 26, 10, 3 + vmladduhm 27, 30, 10, 3 + vmladduhm 28, 31, 10, 3 + vmhraddshs 14, 25, 10, 3 + vmhraddshs 19, 26, 10, 3 + vmhraddshs 24, 30, 10, 3 + vmhraddshs 29, 31, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 27, 2, 3 + vmladduhm 30, 28, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 + stxv 38, -128(3) + stxv 39, 
-112(3) + stxv 40, -96(3) + stxv 41, -80(3) + stxv 45, -64(3) + stxv 50, -48(3) + stxv 55, -32(3) + stxv 60, -16(3) + lxv 52, 128(1) + lxv 53, 144(1) + lxv 54, 160(1) + lxv 55, 176(1) + lxv 56, 192(1) + lxv 57, 208(1) + lxv 58, 224(1) + lxv 59, 240(1) + lxv 60, 256(1) + lxv 61, 272(1) + lxv 62, 288(1) + lxv 63, 304(1) + ld 14, 56(1) + ld 15, 64(1) + ld 16, 72(1) + ld 16, 72(1) + ld 17, 80(1) + ld 18, 88(1) + ld 19, 96(1) + ld 20, 104(1) + ld 21, 112(1) + mtlr 0 + addi 1, 1, 352 + blr + .cfi_endproc + +#endif /* MLK_ARITH_BACKEND_PPC64LE && !MLK_CONFIG_MULTILEVEL_NO_SHARED */ diff --git a/mlkem/src/native/ppc64le/src/ntt_ppc.S b/mlkem/src/native/ppc64le/src/ntt_ppc.S new file mode 100644 index 000000000..dd0aef877 --- /dev/null +++ b/mlkem/src/native/ppc64le/src/ntt_ppc.S @@ -0,0 +1,1661 @@ +/* + * Copyright (c) The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +/* + * Copyright 2025- IBM Corp. + * + *=================================================================================== + * Written by Danny Tsen + * + */ + +#include "../../../common.h" +#if defined(MLK_ARITH_BACKEND_PPC64LE) && \ + !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED) + +/* + * WARNING: This file is auto-derived from the mlkem-native source file + * dev/ppc64le/src/ntt_ppc.S using scripts/simpasm. Do not modify it directly. 
+ */ + + +.text +.balign 16 +.global MLK_ASM_NAMESPACE(ntt_ppc) +MLK_ASM_FN_SYMBOL(ntt_ppc) + + .cfi_startproc + stdu 1, -352(1) + mflr 0 + std 14, 56(1) + std 15, 64(1) + std 16, 72(1) + std 17, 80(1) + std 18, 88(1) + std 19, 96(1) + std 20, 104(1) + std 21, 112(1) + stxv 52, 128(1) + stxv 53, 144(1) + stxv 54, 160(1) + stxv 55, 176(1) + stxv 56, 192(1) + stxv 57, 208(1) + stxv 58, 224(1) + stxv 59, 240(1) + stxv 60, 256(1) + stxv 61, 272(1) + stxv 62, 288(1) + stxv 63, 304(1) + lvx 5, 0, 4 + addi 14, 4, 96 + vxor 3, 3, 3 + vspltish 4, 1 + lxv 34, 16(4) + ori 2, 2, 0 + li 5, 0 + li 7, 256 + lvx 10, 0, 14 + addi 14, 14, 16 + mr 9, 5 + add 10, 7, 9 + addi 16, 9, 16 + addi 17, 10, 16 + addi 18, 16, 16 + addi 19, 17, 16 + addi 20, 18, 16 + addi 21, 19, 16 + lxvd2x 45, 3, 10 + lxvd2x 50, 3, 17 + lxvd2x 55, 3, 19 + lxvd2x 60, 3, 21 + xxswapd 45, 45 + xxswapd 50, 50 + xxswapd 55, 55 + xxswapd 60, 60 + vmladduhm 15, 13, 10, 3 + vmladduhm 20, 18, 10, 3 + vmladduhm 25, 23, 10, 3 + vmladduhm 30, 28, 10, 3 + vmhraddshs 14, 13, 10, 3 + vmhraddshs 19, 18, 10, 3 + vmhraddshs 24, 23, 10, 3 + vmhraddshs 29, 28, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 25, 2, 3 + vmladduhm 30, 30, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 + lxvd2x 44, 3, 9 + lxvd2x 49, 3, 16 + lxvd2x 54, 3, 18 + lxvd2x 59, 3, 20 + xxswapd 44, 44 + xxswapd 49, 49 + xxswapd 54, 54 + xxswapd 59, 59 + vsubuhm 16, 12, 13 + vadduhm 15, 13, 12 + vsubuhm 21, 17, 18 + vadduhm 20, 18, 17 + vsubuhm 26, 22, 23 + vadduhm 25, 23, 22 + vsubuhm 31, 27, 28 + vadduhm 30, 28, 27 + stxvx 47, 3, 9 + stxvx 48, 3, 10 + stxvx 52, 3, 16 + stxvx 53, 3, 17 + stxvx 57, 3, 18 + stxvx 58, 3, 19 + stxvx 62, 3, 20 + stxvx 63, 3, 21 + li 5, 64 + mr 9, 5 + add 10, 7, 9 + addi 16, 9, 16 + addi 17, 10, 16 + addi 18, 16, 16 + addi 19, 17, 16 + addi 20, 18, 16 + addi 21, 19, 16 + 
lxvd2x 45, 3, 10 + lxvd2x 50, 3, 17 + lxvd2x 55, 3, 19 + lxvd2x 60, 3, 21 + xxswapd 45, 45 + xxswapd 50, 50 + xxswapd 55, 55 + xxswapd 60, 60 + vmladduhm 15, 13, 10, 3 + vmladduhm 20, 18, 10, 3 + vmladduhm 25, 23, 10, 3 + vmladduhm 30, 28, 10, 3 + vmhraddshs 14, 13, 10, 3 + vmhraddshs 19, 18, 10, 3 + vmhraddshs 24, 23, 10, 3 + vmhraddshs 29, 28, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 25, 2, 3 + vmladduhm 30, 30, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 + lxvd2x 44, 3, 9 + lxvd2x 49, 3, 16 + lxvd2x 54, 3, 18 + lxvd2x 59, 3, 20 + xxswapd 44, 44 + xxswapd 49, 49 + xxswapd 54, 54 + xxswapd 59, 59 + vsubuhm 16, 12, 13 + vadduhm 15, 13, 12 + vsubuhm 21, 17, 18 + vadduhm 20, 18, 17 + vsubuhm 26, 22, 23 + vadduhm 25, 23, 22 + vsubuhm 31, 27, 28 + vadduhm 30, 28, 27 + stxvx 47, 3, 9 + stxvx 48, 3, 10 + stxvx 52, 3, 16 + stxvx 53, 3, 17 + stxvx 57, 3, 18 + stxvx 58, 3, 19 + stxvx 62, 3, 20 + stxvx 63, 3, 21 + li 5, 128 + mr 9, 5 + add 10, 7, 9 + addi 16, 9, 16 + addi 17, 10, 16 + addi 18, 16, 16 + addi 19, 17, 16 + addi 20, 18, 16 + addi 21, 19, 16 + lxvd2x 45, 3, 10 + lxvd2x 50, 3, 17 + lxvd2x 55, 3, 19 + lxvd2x 60, 3, 21 + xxswapd 45, 45 + xxswapd 50, 50 + xxswapd 55, 55 + xxswapd 60, 60 + vmladduhm 15, 13, 10, 3 + vmladduhm 20, 18, 10, 3 + vmladduhm 25, 23, 10, 3 + vmladduhm 30, 28, 10, 3 + vmhraddshs 14, 13, 10, 3 + vmhraddshs 19, 18, 10, 3 + vmhraddshs 24, 23, 10, 3 + vmhraddshs 29, 28, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 25, 2, 3 + vmladduhm 30, 30, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 + lxvd2x 44, 3, 9 + lxvd2x 49, 3, 16 + lxvd2x 54, 3, 18 + lxvd2x 59, 3, 20 + xxswapd 44, 44 + xxswapd 49, 49 + xxswapd 54, 54 + 
xxswapd 59, 59 + vsubuhm 16, 12, 13 + vadduhm 15, 13, 12 + vsubuhm 21, 17, 18 + vadduhm 20, 18, 17 + vsubuhm 26, 22, 23 + vadduhm 25, 23, 22 + vsubuhm 31, 27, 28 + vadduhm 30, 28, 27 + stxvx 47, 3, 9 + stxvx 48, 3, 10 + stxvx 52, 3, 16 + stxvx 53, 3, 17 + stxvx 57, 3, 18 + stxvx 58, 3, 19 + stxvx 62, 3, 20 + stxvx 63, 3, 21 + li 5, 192 + mr 9, 5 + add 10, 7, 9 + addi 16, 9, 16 + addi 17, 10, 16 + addi 18, 16, 16 + addi 19, 17, 16 + addi 20, 18, 16 + addi 21, 19, 16 + lxvd2x 45, 3, 10 + lxvd2x 50, 3, 17 + lxvd2x 55, 3, 19 + lxvd2x 60, 3, 21 + xxswapd 45, 45 + xxswapd 50, 50 + xxswapd 55, 55 + xxswapd 60, 60 + vmladduhm 15, 13, 10, 3 + vmladduhm 20, 18, 10, 3 + vmladduhm 25, 23, 10, 3 + vmladduhm 30, 28, 10, 3 + vmhraddshs 14, 13, 10, 3 + vmhraddshs 19, 18, 10, 3 + vmhraddshs 24, 23, 10, 3 + vmhraddshs 29, 28, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 25, 2, 3 + vmladduhm 30, 30, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 + lxvd2x 44, 3, 9 + lxvd2x 49, 3, 16 + lxvd2x 54, 3, 18 + lxvd2x 59, 3, 20 + xxswapd 44, 44 + xxswapd 49, 49 + xxswapd 54, 54 + xxswapd 59, 59 + vsubuhm 16, 12, 13 + vadduhm 15, 13, 12 + vsubuhm 21, 17, 18 + vadduhm 20, 18, 17 + vsubuhm 26, 22, 23 + vadduhm 25, 23, 22 + vsubuhm 31, 27, 28 + vadduhm 30, 28, 27 + stxvx 47, 3, 9 + stxvx 48, 3, 10 + stxvx 52, 3, 16 + stxvx 53, 3, 17 + stxvx 57, 3, 18 + stxvx 58, 3, 19 + stxvx 62, 3, 20 + stxvx 63, 3, 21 + ori 2, 2, 0 + li 5, 0 + li 7, 128 + lvx 10, 0, 14 + addi 14, 14, 16 + mr 9, 5 + add 10, 7, 9 + addi 16, 9, 16 + addi 17, 10, 16 + addi 18, 16, 16 + addi 19, 17, 16 + addi 20, 18, 16 + addi 21, 19, 16 + lxvd2x 45, 3, 10 + lxvd2x 50, 3, 17 + lxvd2x 55, 3, 19 + lxvd2x 60, 3, 21 + xxswapd 45, 45 + xxswapd 50, 50 + xxswapd 55, 55 + xxswapd 60, 60 + vmladduhm 15, 13, 10, 3 + vmladduhm 20, 18, 10, 3 + vmladduhm 25, 23, 10, 3 + vmladduhm 30, 
28, 10, 3 + vmhraddshs 14, 13, 10, 3 + vmhraddshs 19, 18, 10, 3 + vmhraddshs 24, 23, 10, 3 + vmhraddshs 29, 28, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 25, 2, 3 + vmladduhm 30, 30, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 + lxvd2x 44, 3, 9 + lxvd2x 49, 3, 16 + lxvd2x 54, 3, 18 + lxvd2x 59, 3, 20 + xxswapd 44, 44 + xxswapd 49, 49 + xxswapd 54, 54 + xxswapd 59, 59 + vsubuhm 16, 12, 13 + vadduhm 15, 13, 12 + vsubuhm 21, 17, 18 + vadduhm 20, 18, 17 + vsubuhm 26, 22, 23 + vadduhm 25, 23, 22 + vsubuhm 31, 27, 28 + vadduhm 30, 28, 27 + stxvx 47, 3, 9 + stxvx 48, 3, 10 + stxvx 52, 3, 16 + stxvx 53, 3, 17 + stxvx 57, 3, 18 + stxvx 58, 3, 19 + stxvx 62, 3, 20 + stxvx 63, 3, 21 + li 5, 64 + mr 9, 5 + add 10, 7, 9 + addi 16, 9, 16 + addi 17, 10, 16 + addi 18, 16, 16 + addi 19, 17, 16 + addi 20, 18, 16 + addi 21, 19, 16 + lxvd2x 45, 3, 10 + lxvd2x 50, 3, 17 + lxvd2x 55, 3, 19 + lxvd2x 60, 3, 21 + xxswapd 45, 45 + xxswapd 50, 50 + xxswapd 55, 55 + xxswapd 60, 60 + vmladduhm 15, 13, 10, 3 + vmladduhm 20, 18, 10, 3 + vmladduhm 25, 23, 10, 3 + vmladduhm 30, 28, 10, 3 + vmhraddshs 14, 13, 10, 3 + vmhraddshs 19, 18, 10, 3 + vmhraddshs 24, 23, 10, 3 + vmhraddshs 29, 28, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 25, 2, 3 + vmladduhm 30, 30, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 + lxvd2x 44, 3, 9 + lxvd2x 49, 3, 16 + lxvd2x 54, 3, 18 + lxvd2x 59, 3, 20 + xxswapd 44, 44 + xxswapd 49, 49 + xxswapd 54, 54 + xxswapd 59, 59 + vsubuhm 16, 12, 13 + vadduhm 15, 13, 12 + vsubuhm 21, 17, 18 + vadduhm 20, 18, 17 + vsubuhm 26, 22, 23 + vadduhm 25, 23, 22 + vsubuhm 31, 27, 28 + vadduhm 30, 28, 27 + stxvx 47, 3, 9 + stxvx 48, 3, 10 + stxvx 52, 3, 16 + 
stxvx 53, 3, 17 + stxvx 57, 3, 18 + stxvx 58, 3, 19 + stxvx 62, 3, 20 + stxvx 63, 3, 21 + li 5, 256 + lvx 10, 0, 14 + addi 14, 14, 16 + mr 9, 5 + add 10, 7, 9 + addi 16, 9, 16 + addi 17, 10, 16 + addi 18, 16, 16 + addi 19, 17, 16 + addi 20, 18, 16 + addi 21, 19, 16 + lxvd2x 45, 3, 10 + lxvd2x 50, 3, 17 + lxvd2x 55, 3, 19 + lxvd2x 60, 3, 21 + xxswapd 45, 45 + xxswapd 50, 50 + xxswapd 55, 55 + xxswapd 60, 60 + vmladduhm 15, 13, 10, 3 + vmladduhm 20, 18, 10, 3 + vmladduhm 25, 23, 10, 3 + vmladduhm 30, 28, 10, 3 + vmhraddshs 14, 13, 10, 3 + vmhraddshs 19, 18, 10, 3 + vmhraddshs 24, 23, 10, 3 + vmhraddshs 29, 28, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 25, 2, 3 + vmladduhm 30, 30, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 + lxvd2x 44, 3, 9 + lxvd2x 49, 3, 16 + lxvd2x 54, 3, 18 + lxvd2x 59, 3, 20 + xxswapd 44, 44 + xxswapd 49, 49 + xxswapd 54, 54 + xxswapd 59, 59 + vsubuhm 16, 12, 13 + vadduhm 15, 13, 12 + vsubuhm 21, 17, 18 + vadduhm 20, 18, 17 + vsubuhm 26, 22, 23 + vadduhm 25, 23, 22 + vsubuhm 31, 27, 28 + vadduhm 30, 28, 27 + stxvx 47, 3, 9 + stxvx 48, 3, 10 + stxvx 52, 3, 16 + stxvx 53, 3, 17 + stxvx 57, 3, 18 + stxvx 58, 3, 19 + stxvx 62, 3, 20 + stxvx 63, 3, 21 + li 5, 320 + mr 9, 5 + add 10, 7, 9 + addi 16, 9, 16 + addi 17, 10, 16 + addi 18, 16, 16 + addi 19, 17, 16 + addi 20, 18, 16 + addi 21, 19, 16 + lxvd2x 45, 3, 10 + lxvd2x 50, 3, 17 + lxvd2x 55, 3, 19 + lxvd2x 60, 3, 21 + xxswapd 45, 45 + xxswapd 50, 50 + xxswapd 55, 55 + xxswapd 60, 60 + vmladduhm 15, 13, 10, 3 + vmladduhm 20, 18, 10, 3 + vmladduhm 25, 23, 10, 3 + vmladduhm 30, 28, 10, 3 + vmhraddshs 14, 13, 10, 3 + vmhraddshs 19, 18, 10, 3 + vmhraddshs 24, 23, 10, 3 + vmhraddshs 29, 28, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 25, 2, 3 + vmladduhm 30, 30, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 
20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 + lxvd2x 44, 3, 9 + lxvd2x 49, 3, 16 + lxvd2x 54, 3, 18 + lxvd2x 59, 3, 20 + xxswapd 44, 44 + xxswapd 49, 49 + xxswapd 54, 54 + xxswapd 59, 59 + vsubuhm 16, 12, 13 + vadduhm 15, 13, 12 + vsubuhm 21, 17, 18 + vadduhm 20, 18, 17 + vsubuhm 26, 22, 23 + vadduhm 25, 23, 22 + vsubuhm 31, 27, 28 + vadduhm 30, 28, 27 + stxvx 47, 3, 9 + stxvx 48, 3, 10 + stxvx 52, 3, 16 + stxvx 53, 3, 17 + stxvx 57, 3, 18 + stxvx 58, 3, 19 + stxvx 62, 3, 20 + stxvx 63, 3, 21 + nop + nop + ori 2, 2, 0 + li 5, 0 + li 7, 64 + lvx 10, 0, 14 + addi 14, 14, 16 + mr 9, 5 + add 10, 7, 9 + addi 16, 9, 16 + addi 17, 10, 16 + addi 18, 16, 16 + addi 19, 17, 16 + addi 20, 18, 16 + addi 21, 19, 16 + lxvd2x 45, 3, 10 + lxvd2x 50, 3, 17 + lxvd2x 55, 3, 19 + lxvd2x 60, 3, 21 + xxswapd 45, 45 + xxswapd 50, 50 + xxswapd 55, 55 + xxswapd 60, 60 + vmladduhm 15, 13, 10, 3 + vmladduhm 20, 18, 10, 3 + vmladduhm 25, 23, 10, 3 + vmladduhm 30, 28, 10, 3 + vmhraddshs 14, 13, 10, 3 + vmhraddshs 19, 18, 10, 3 + vmhraddshs 24, 23, 10, 3 + vmhraddshs 29, 28, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 25, 2, 3 + vmladduhm 30, 30, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 + lxvd2x 44, 3, 9 + lxvd2x 49, 3, 16 + lxvd2x 54, 3, 18 + lxvd2x 59, 3, 20 + xxswapd 44, 44 + xxswapd 49, 49 + xxswapd 54, 54 + xxswapd 59, 59 + vsubuhm 16, 12, 13 + vadduhm 15, 13, 12 + vsubuhm 21, 17, 18 + vadduhm 20, 18, 17 + vsubuhm 26, 22, 23 + vadduhm 25, 23, 22 + vsubuhm 31, 27, 28 + vadduhm 30, 28, 27 + stxvx 47, 3, 9 + stxvx 48, 3, 10 + stxvx 52, 3, 16 + stxvx 53, 3, 17 + stxvx 57, 3, 18 + stxvx 58, 3, 19 + stxvx 62, 3, 20 + stxvx 63, 3, 21 + li 5, 128 + lvx 10, 0, 14 + addi 14, 14, 16 + mr 9, 5 + add 10, 7, 9 + addi 16, 9, 16 + addi 17, 10, 16 
+ addi 18, 16, 16 + addi 19, 17, 16 + addi 20, 18, 16 + addi 21, 19, 16 + lxvd2x 45, 3, 10 + lxvd2x 50, 3, 17 + lxvd2x 55, 3, 19 + lxvd2x 60, 3, 21 + xxswapd 45, 45 + xxswapd 50, 50 + xxswapd 55, 55 + xxswapd 60, 60 + vmladduhm 15, 13, 10, 3 + vmladduhm 20, 18, 10, 3 + vmladduhm 25, 23, 10, 3 + vmladduhm 30, 28, 10, 3 + vmhraddshs 14, 13, 10, 3 + vmhraddshs 19, 18, 10, 3 + vmhraddshs 24, 23, 10, 3 + vmhraddshs 29, 28, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 25, 2, 3 + vmladduhm 30, 30, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 + lxvd2x 44, 3, 9 + lxvd2x 49, 3, 16 + lxvd2x 54, 3, 18 + lxvd2x 59, 3, 20 + xxswapd 44, 44 + xxswapd 49, 49 + xxswapd 54, 54 + xxswapd 59, 59 + vsubuhm 16, 12, 13 + vadduhm 15, 13, 12 + vsubuhm 21, 17, 18 + vadduhm 20, 18, 17 + vsubuhm 26, 22, 23 + vadduhm 25, 23, 22 + vsubuhm 31, 27, 28 + vadduhm 30, 28, 27 + stxvx 47, 3, 9 + stxvx 48, 3, 10 + stxvx 52, 3, 16 + stxvx 53, 3, 17 + stxvx 57, 3, 18 + stxvx 58, 3, 19 + stxvx 62, 3, 20 + stxvx 63, 3, 21 + li 5, 256 + lvx 10, 0, 14 + addi 14, 14, 16 + mr 9, 5 + add 10, 7, 9 + addi 16, 9, 16 + addi 17, 10, 16 + addi 18, 16, 16 + addi 19, 17, 16 + addi 20, 18, 16 + addi 21, 19, 16 + lxvd2x 45, 3, 10 + lxvd2x 50, 3, 17 + lxvd2x 55, 3, 19 + lxvd2x 60, 3, 21 + xxswapd 45, 45 + xxswapd 50, 50 + xxswapd 55, 55 + xxswapd 60, 60 + vmladduhm 15, 13, 10, 3 + vmladduhm 20, 18, 10, 3 + vmladduhm 25, 23, 10, 3 + vmladduhm 30, 28, 10, 3 + vmhraddshs 14, 13, 10, 3 + vmhraddshs 19, 18, 10, 3 + vmhraddshs 24, 23, 10, 3 + vmhraddshs 29, 28, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 25, 2, 3 + vmladduhm 30, 30, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 + lxvd2x 44, 3, 9 + 
lxvd2x 49, 3, 16 + lxvd2x 54, 3, 18 + lxvd2x 59, 3, 20 + xxswapd 44, 44 + xxswapd 49, 49 + xxswapd 54, 54 + xxswapd 59, 59 + vsubuhm 16, 12, 13 + vadduhm 15, 13, 12 + vsubuhm 21, 17, 18 + vadduhm 20, 18, 17 + vsubuhm 26, 22, 23 + vadduhm 25, 23, 22 + vsubuhm 31, 27, 28 + vadduhm 30, 28, 27 + stxvx 47, 3, 9 + stxvx 48, 3, 10 + stxvx 52, 3, 16 + stxvx 53, 3, 17 + stxvx 57, 3, 18 + stxvx 58, 3, 19 + stxvx 62, 3, 20 + stxvx 63, 3, 21 + li 5, 384 + lvx 10, 0, 14 + addi 14, 14, 16 + mr 9, 5 + add 10, 7, 9 + addi 16, 9, 16 + addi 17, 10, 16 + addi 18, 16, 16 + addi 19, 17, 16 + addi 20, 18, 16 + addi 21, 19, 16 + lxvd2x 45, 3, 10 + lxvd2x 50, 3, 17 + lxvd2x 55, 3, 19 + lxvd2x 60, 3, 21 + xxswapd 45, 45 + xxswapd 50, 50 + xxswapd 55, 55 + xxswapd 60, 60 + vmladduhm 15, 13, 10, 3 + vmladduhm 20, 18, 10, 3 + vmladduhm 25, 23, 10, 3 + vmladduhm 30, 28, 10, 3 + vmhraddshs 14, 13, 10, 3 + vmhraddshs 19, 18, 10, 3 + vmhraddshs 24, 23, 10, 3 + vmhraddshs 29, 28, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 25, 2, 3 + vmladduhm 30, 30, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 + lxvd2x 44, 3, 9 + lxvd2x 49, 3, 16 + lxvd2x 54, 3, 18 + lxvd2x 59, 3, 20 + xxswapd 44, 44 + xxswapd 49, 49 + xxswapd 54, 54 + xxswapd 59, 59 + vsubuhm 16, 12, 13 + vadduhm 15, 13, 12 + vsubuhm 21, 17, 18 + vadduhm 20, 18, 17 + vsubuhm 26, 22, 23 + vadduhm 25, 23, 22 + vsubuhm 31, 27, 28 + vadduhm 30, 28, 27 + stxvx 47, 3, 9 + stxvx 48, 3, 10 + stxvx 52, 3, 16 + stxvx 53, 3, 17 + stxvx 57, 3, 18 + stxvx 58, 3, 19 + stxvx 62, 3, 20 + stxvx 63, 3, 21 + nop + nop + ori 2, 2, 0 + li 5, 0 + li 7, 32 + lxv 39, 0(14) + lxv 40, 16(14) + lxv 41, 32(14) + lxv 42, 48(14) + addi 14, 14, 64 + mr 9, 5 + add 10, 7, 9 + addi 16, 9, 64 + addi 17, 10, 64 + addi 18, 16, 64 + addi 19, 17, 64 + addi 20, 18, 64 + addi 21, 19, 64 + lxvd2x 45, 3, 10 + lxvd2x 50, 
3, 17 + lxvd2x 55, 3, 19 + lxvd2x 60, 3, 21 + xxswapd 45, 45 + xxswapd 50, 50 + xxswapd 55, 55 + xxswapd 60, 60 + vmladduhm 15, 13, 7, 3 + vmladduhm 20, 18, 8, 3 + vmladduhm 25, 23, 9, 3 + vmladduhm 30, 28, 10, 3 + vmhraddshs 14, 13, 7, 3 + vmhraddshs 19, 18, 8, 3 + vmhraddshs 24, 23, 9, 3 + vmhraddshs 29, 28, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 25, 2, 3 + vmladduhm 30, 30, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 + lxvd2x 44, 3, 9 + lxvd2x 49, 3, 16 + lxvd2x 54, 3, 18 + lxvd2x 59, 3, 20 + xxswapd 44, 44 + xxswapd 49, 49 + xxswapd 54, 54 + xxswapd 59, 59 + vsubuhm 16, 12, 13 + vadduhm 15, 13, 12 + vsubuhm 21, 17, 18 + vadduhm 20, 18, 17 + vsubuhm 26, 22, 23 + vadduhm 25, 23, 22 + vsubuhm 31, 27, 28 + vadduhm 30, 28, 27 + stxvx 47, 3, 9 + stxvx 48, 3, 10 + stxvx 52, 3, 16 + stxvx 53, 3, 17 + stxvx 57, 3, 18 + stxvx 58, 3, 19 + stxvx 62, 3, 20 + stxvx 63, 3, 21 + li 5, 16 + mr 9, 5 + add 10, 7, 9 + addi 16, 9, 64 + addi 17, 10, 64 + addi 18, 16, 64 + addi 19, 17, 64 + addi 20, 18, 64 + addi 21, 19, 64 + lxvd2x 45, 3, 10 + lxvd2x 50, 3, 17 + lxvd2x 55, 3, 19 + lxvd2x 60, 3, 21 + xxswapd 45, 45 + xxswapd 50, 50 + xxswapd 55, 55 + xxswapd 60, 60 + vmladduhm 15, 13, 7, 3 + vmladduhm 20, 18, 8, 3 + vmladduhm 25, 23, 9, 3 + vmladduhm 30, 28, 10, 3 + vmhraddshs 14, 13, 7, 3 + vmhraddshs 19, 18, 8, 3 + vmhraddshs 24, 23, 9, 3 + vmhraddshs 29, 28, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 25, 2, 3 + vmladduhm 30, 30, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 + lxvd2x 44, 3, 9 + lxvd2x 49, 3, 16 + lxvd2x 54, 3, 18 + lxvd2x 59, 3, 20 + xxswapd 44, 44 + xxswapd 49, 49 + xxswapd 54, 54 + xxswapd 59, 59 + vsubuhm 16, 12, 13 + vadduhm 
15, 13, 12 + vsubuhm 21, 17, 18 + vadduhm 20, 18, 17 + vsubuhm 26, 22, 23 + vadduhm 25, 23, 22 + vsubuhm 31, 27, 28 + vadduhm 30, 28, 27 + stxvx 47, 3, 9 + stxvx 48, 3, 10 + stxvx 52, 3, 16 + stxvx 53, 3, 17 + stxvx 57, 3, 18 + stxvx 58, 3, 19 + stxvx 62, 3, 20 + stxvx 63, 3, 21 + lxv 39, 0(14) + lxv 40, 16(14) + lxv 41, 32(14) + lxv 42, 48(14) + addi 14, 14, 64 + li 5, 256 + mr 9, 5 + add 10, 7, 9 + addi 16, 9, 64 + addi 17, 10, 64 + addi 18, 16, 64 + addi 19, 17, 64 + addi 20, 18, 64 + addi 21, 19, 64 + lxvd2x 45, 3, 10 + lxvd2x 50, 3, 17 + lxvd2x 55, 3, 19 + lxvd2x 60, 3, 21 + xxswapd 45, 45 + xxswapd 50, 50 + xxswapd 55, 55 + xxswapd 60, 60 + vmladduhm 15, 13, 7, 3 + vmladduhm 20, 18, 8, 3 + vmladduhm 25, 23, 9, 3 + vmladduhm 30, 28, 10, 3 + vmhraddshs 14, 13, 7, 3 + vmhraddshs 19, 18, 8, 3 + vmhraddshs 24, 23, 9, 3 + vmhraddshs 29, 28, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 25, 2, 3 + vmladduhm 30, 30, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 + lxvd2x 44, 3, 9 + lxvd2x 49, 3, 16 + lxvd2x 54, 3, 18 + lxvd2x 59, 3, 20 + xxswapd 44, 44 + xxswapd 49, 49 + xxswapd 54, 54 + xxswapd 59, 59 + vsubuhm 16, 12, 13 + vadduhm 15, 13, 12 + vsubuhm 21, 17, 18 + vadduhm 20, 18, 17 + vsubuhm 26, 22, 23 + vadduhm 25, 23, 22 + vsubuhm 31, 27, 28 + vadduhm 30, 28, 27 + stxvx 47, 3, 9 + stxvx 48, 3, 10 + stxvx 52, 3, 16 + stxvx 53, 3, 17 + stxvx 57, 3, 18 + stxvx 58, 3, 19 + stxvx 62, 3, 20 + stxvx 63, 3, 21 + li 5, 272 + mr 9, 5 + add 10, 7, 9 + addi 16, 9, 64 + addi 17, 10, 64 + addi 18, 16, 64 + addi 19, 17, 64 + addi 20, 18, 64 + addi 21, 19, 64 + lxvd2x 45, 3, 10 + lxvd2x 50, 3, 17 + lxvd2x 55, 3, 19 + lxvd2x 60, 3, 21 + xxswapd 45, 45 + xxswapd 50, 50 + xxswapd 55, 55 + xxswapd 60, 60 + vmladduhm 15, 13, 7, 3 + vmladduhm 20, 18, 8, 3 + vmladduhm 25, 23, 9, 3 + vmladduhm 30, 28, 10, 3 + vmhraddshs 14, 
13, 7, 3 + vmhraddshs 19, 18, 8, 3 + vmhraddshs 24, 23, 9, 3 + vmhraddshs 29, 28, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 25, 2, 3 + vmladduhm 30, 30, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 + lxvd2x 44, 3, 9 + lxvd2x 49, 3, 16 + lxvd2x 54, 3, 18 + lxvd2x 59, 3, 20 + xxswapd 44, 44 + xxswapd 49, 49 + xxswapd 54, 54 + xxswapd 59, 59 + vsubuhm 16, 12, 13 + vadduhm 15, 13, 12 + vsubuhm 21, 17, 18 + vadduhm 20, 18, 17 + vsubuhm 26, 22, 23 + vadduhm 25, 23, 22 + vsubuhm 31, 27, 28 + vadduhm 30, 28, 27 + stxvx 47, 3, 9 + stxvx 48, 3, 10 + stxvx 52, 3, 16 + stxvx 53, 3, 17 + stxvx 57, 3, 18 + stxvx 58, 3, 19 + stxvx 62, 3, 20 + stxvx 63, 3, 21 + ori 2, 2, 0 + li 5, 0 + li 7, 16 + lxv 39, 0(14) + lxv 40, 16(14) + lxv 41, 32(14) + lxv 42, 48(14) + addi 14, 14, 64 + mr 9, 5 + add 10, 7, 9 + addi 16, 9, 32 + addi 17, 10, 32 + addi 18, 16, 32 + addi 19, 17, 32 + addi 20, 18, 32 + addi 21, 19, 32 + lxvd2x 45, 3, 10 + lxvd2x 50, 3, 17 + lxvd2x 55, 3, 19 + lxvd2x 60, 3, 21 + xxswapd 45, 45 + xxswapd 50, 50 + xxswapd 55, 55 + xxswapd 60, 60 + vmladduhm 15, 13, 7, 3 + vmladduhm 20, 18, 8, 3 + vmladduhm 25, 23, 9, 3 + vmladduhm 30, 28, 10, 3 + vmhraddshs 14, 13, 7, 3 + vmhraddshs 19, 18, 8, 3 + vmhraddshs 24, 23, 9, 3 + vmhraddshs 29, 28, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 25, 2, 3 + vmladduhm 30, 30, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 + lxvd2x 44, 3, 9 + lxvd2x 49, 3, 16 + lxvd2x 54, 3, 18 + lxvd2x 59, 3, 20 + xxswapd 44, 44 + xxswapd 49, 49 + xxswapd 54, 54 + xxswapd 59, 59 + vsubuhm 16, 12, 13 + vadduhm 15, 13, 12 + vsubuhm 21, 17, 18 + vadduhm 20, 18, 17 + vsubuhm 26, 22, 23 + vadduhm 25, 23, 22 + vsubuhm 31, 27, 28 + 
vadduhm 30, 28, 27 + stxvx 47, 3, 9 + stxvx 48, 3, 10 + stxvx 52, 3, 16 + stxvx 53, 3, 17 + stxvx 57, 3, 18 + stxvx 58, 3, 19 + stxvx 62, 3, 20 + stxvx 63, 3, 21 + li 5, 128 + lxv 39, 0(14) + lxv 40, 16(14) + lxv 41, 32(14) + lxv 42, 48(14) + addi 14, 14, 64 + mr 9, 5 + add 10, 7, 9 + addi 16, 9, 32 + addi 17, 10, 32 + addi 18, 16, 32 + addi 19, 17, 32 + addi 20, 18, 32 + addi 21, 19, 32 + lxvd2x 45, 3, 10 + lxvd2x 50, 3, 17 + lxvd2x 55, 3, 19 + lxvd2x 60, 3, 21 + xxswapd 45, 45 + xxswapd 50, 50 + xxswapd 55, 55 + xxswapd 60, 60 + vmladduhm 15, 13, 7, 3 + vmladduhm 20, 18, 8, 3 + vmladduhm 25, 23, 9, 3 + vmladduhm 30, 28, 10, 3 + vmhraddshs 14, 13, 7, 3 + vmhraddshs 19, 18, 8, 3 + vmhraddshs 24, 23, 9, 3 + vmhraddshs 29, 28, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 25, 2, 3 + vmladduhm 30, 30, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 + lxvd2x 44, 3, 9 + lxvd2x 49, 3, 16 + lxvd2x 54, 3, 18 + lxvd2x 59, 3, 20 + xxswapd 44, 44 + xxswapd 49, 49 + xxswapd 54, 54 + xxswapd 59, 59 + vsubuhm 16, 12, 13 + vadduhm 15, 13, 12 + vsubuhm 21, 17, 18 + vadduhm 20, 18, 17 + vsubuhm 26, 22, 23 + vadduhm 25, 23, 22 + vsubuhm 31, 27, 28 + vadduhm 30, 28, 27 + stxvx 47, 3, 9 + stxvx 48, 3, 10 + stxvx 52, 3, 16 + stxvx 53, 3, 17 + stxvx 57, 3, 18 + stxvx 58, 3, 19 + stxvx 62, 3, 20 + stxvx 63, 3, 21 + li 5, 256 + lxv 39, 0(14) + lxv 40, 16(14) + lxv 41, 32(14) + lxv 42, 48(14) + addi 14, 14, 64 + mr 9, 5 + add 10, 7, 9 + addi 16, 9, 32 + addi 17, 10, 32 + addi 18, 16, 32 + addi 19, 17, 32 + addi 20, 18, 32 + addi 21, 19, 32 + lxvd2x 45, 3, 10 + lxvd2x 50, 3, 17 + lxvd2x 55, 3, 19 + lxvd2x 60, 3, 21 + xxswapd 45, 45 + xxswapd 50, 50 + xxswapd 55, 55 + xxswapd 60, 60 + vmladduhm 15, 13, 7, 3 + vmladduhm 20, 18, 8, 3 + vmladduhm 25, 23, 9, 3 + vmladduhm 30, 28, 10, 3 + vmhraddshs 14, 13, 7, 3 + vmhraddshs 19, 18, 8, 
3 + vmhraddshs 24, 23, 9, 3 + vmhraddshs 29, 28, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 25, 2, 3 + vmladduhm 30, 30, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 + lxvd2x 44, 3, 9 + lxvd2x 49, 3, 16 + lxvd2x 54, 3, 18 + lxvd2x 59, 3, 20 + xxswapd 44, 44 + xxswapd 49, 49 + xxswapd 54, 54 + xxswapd 59, 59 + vsubuhm 16, 12, 13 + vadduhm 15, 13, 12 + vsubuhm 21, 17, 18 + vadduhm 20, 18, 17 + vsubuhm 26, 22, 23 + vadduhm 25, 23, 22 + vsubuhm 31, 27, 28 + vadduhm 30, 28, 27 + stxvx 47, 3, 9 + stxvx 48, 3, 10 + stxvx 52, 3, 16 + stxvx 53, 3, 17 + stxvx 57, 3, 18 + stxvx 58, 3, 19 + stxvx 62, 3, 20 + stxvx 63, 3, 21 + li 5, 384 + lxv 39, 0(14) + lxv 40, 16(14) + lxv 41, 32(14) + lxv 42, 48(14) + addi 14, 14, 64 + mr 9, 5 + add 10, 7, 9 + addi 16, 9, 32 + addi 17, 10, 32 + addi 18, 16, 32 + addi 19, 17, 32 + addi 20, 18, 32 + addi 21, 19, 32 + lxvd2x 45, 3, 10 + lxvd2x 50, 3, 17 + lxvd2x 55, 3, 19 + lxvd2x 60, 3, 21 + xxswapd 45, 45 + xxswapd 50, 50 + xxswapd 55, 55 + xxswapd 60, 60 + vmladduhm 15, 13, 7, 3 + vmladduhm 20, 18, 8, 3 + vmladduhm 25, 23, 9, 3 + vmladduhm 30, 28, 10, 3 + vmhraddshs 14, 13, 7, 3 + vmhraddshs 19, 18, 8, 3 + vmhraddshs 24, 23, 9, 3 + vmhraddshs 29, 28, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 25, 2, 3 + vmladduhm 30, 30, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 + lxvd2x 44, 3, 9 + lxvd2x 49, 3, 16 + lxvd2x 54, 3, 18 + lxvd2x 59, 3, 20 + xxswapd 44, 44 + xxswapd 49, 49 + xxswapd 54, 54 + xxswapd 59, 59 + vsubuhm 16, 12, 13 + vadduhm 15, 13, 12 + vsubuhm 21, 17, 18 + vadduhm 20, 18, 17 + vsubuhm 26, 22, 23 + vadduhm 25, 23, 22 + vsubuhm 31, 27, 28 + vadduhm 30, 28, 27 + stxvx 47, 3, 9 + stxvx 48, 3, 10 + 
stxvx 52, 3, 16 + stxvx 53, 3, 17 + stxvx 57, 3, 18 + stxvx 58, 3, 19 + stxvx 62, 3, 20 + stxvx 63, 3, 21 + li 15, 4 + mtctr 15 + li 5, 0 + li 7, 8 + nop + nop + ori 2, 2, 0 + +Lntt_ppc__Len4: + lxv 39, 0(14) + lxv 40, 16(14) + lxv 41, 32(14) + lxv 42, 48(14) + addi 14, 14, 64 + mr 9, 5 + add 10, 7, 9 + addi 16, 9, 16 + addi 17, 10, 16 + addi 18, 16, 16 + addi 19, 17, 16 + addi 20, 18, 16 + addi 21, 19, 16 + lxvd2x 45, 3, 10 + lxvd2x 50, 3, 17 + lxvd2x 55, 3, 19 + lxvd2x 60, 3, 21 + xxswapd 45, 45 + xxswapd 50, 50 + xxswapd 55, 55 + xxswapd 60, 60 + vmladduhm 15, 13, 7, 3 + vmladduhm 20, 18, 8, 3 + vmladduhm 25, 23, 9, 3 + vmladduhm 30, 28, 10, 3 + vmhraddshs 14, 13, 7, 3 + vmhraddshs 19, 18, 8, 3 + vmhraddshs 24, 23, 9, 3 + vmhraddshs 29, 28, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 25, 2, 3 + vmladduhm 30, 30, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 + lxvd2x 44, 3, 9 + lxvd2x 49, 3, 16 + lxvd2x 54, 3, 18 + lxvd2x 59, 3, 20 + xxswapd 44, 44 + xxswapd 49, 49 + xxswapd 54, 54 + xxswapd 59, 59 + vsubuhm 16, 12, 13 + vadduhm 15, 13, 12 + vsubuhm 21, 17, 18 + vadduhm 20, 18, 17 + vsubuhm 26, 22, 23 + vadduhm 25, 23, 22 + vsubuhm 31, 27, 28 + vadduhm 30, 28, 27 + xxmrgld 49, 48, 47 + xxmrgld 54, 53, 52 + xxmrgld 59, 58, 57 + xxmrgld 61, 63, 62 + stxvx 49, 3, 9 + stxvx 54, 3, 16 + stxvx 59, 3, 18 + stxvx 61, 3, 20 + addi 5, 5, 64 + lxv 39, 0(14) + lxv 40, 16(14) + lxv 41, 32(14) + lxv 42, 48(14) + addi 14, 14, 64 + mr 9, 5 + add 10, 7, 9 + addi 16, 9, 16 + addi 17, 10, 16 + addi 18, 16, 16 + addi 19, 17, 16 + addi 20, 18, 16 + addi 21, 19, 16 + lxvd2x 45, 3, 10 + lxvd2x 50, 3, 17 + lxvd2x 55, 3, 19 + lxvd2x 60, 3, 21 + xxswapd 45, 45 + xxswapd 50, 50 + xxswapd 55, 55 + xxswapd 60, 60 + vmladduhm 15, 13, 7, 3 + vmladduhm 20, 18, 8, 3 + vmladduhm 25, 23, 9, 3 + vmladduhm 30, 28, 10, 3 + vmhraddshs 
14, 13, 7, 3 + vmhraddshs 19, 18, 8, 3 + vmhraddshs 24, 23, 9, 3 + vmhraddshs 29, 28, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 25, 2, 3 + vmladduhm 30, 30, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 + lxvd2x 44, 3, 9 + lxvd2x 49, 3, 16 + lxvd2x 54, 3, 18 + lxvd2x 59, 3, 20 + xxswapd 44, 44 + xxswapd 49, 49 + xxswapd 54, 54 + xxswapd 59, 59 + vsubuhm 16, 12, 13 + vadduhm 15, 13, 12 + vsubuhm 21, 17, 18 + vadduhm 20, 18, 17 + vsubuhm 26, 22, 23 + vadduhm 25, 23, 22 + vsubuhm 31, 27, 28 + vadduhm 30, 28, 27 + xxmrgld 49, 48, 47 + xxmrgld 54, 53, 52 + xxmrgld 59, 58, 57 + xxmrgld 61, 63, 62 + stxvx 49, 3, 9 + stxvx 54, 3, 16 + stxvx 59, 3, 18 + stxvx 61, 3, 20 + addi 5, 5, 64 + bdnz Lntt_ppc__Len4 + addi 14, 4, 1104 + li 15, 4 + mtctr 15 + li 5, 0 + li 7, 4 + nop + ori 2, 2, 0 + +Lntt_ppc__Len2: + lxv 39, 0(14) + lxv 40, 16(14) + lxv 41, 32(14) + lxv 42, 48(14) + addi 14, 14, 64 + mr 9, 5 + add 10, 7, 9 + addi 16, 9, 16 + addi 17, 10, 16 + addi 18, 16, 16 + addi 19, 17, 16 + addi 20, 18, 16 + addi 21, 19, 16 + lxvd2x 45, 3, 10 + lxvd2x 50, 3, 17 + lxvd2x 55, 3, 19 + lxvd2x 60, 3, 21 + xxswapd 45, 45 + xxswapd 50, 50 + xxswapd 55, 55 + xxswapd 60, 60 + vmladduhm 15, 13, 7, 3 + vmladduhm 20, 18, 8, 3 + vmladduhm 25, 23, 9, 3 + vmladduhm 30, 28, 10, 3 + vmhraddshs 14, 13, 7, 3 + vmhraddshs 19, 18, 8, 3 + vmhraddshs 24, 23, 9, 3 + vmhraddshs 29, 28, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 25, 2, 3 + vmladduhm 30, 30, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 + lxvd2x 44, 3, 9 + lxvd2x 49, 3, 16 + lxvd2x 54, 3, 18 + lxvd2x 59, 3, 20 + xxswapd 44, 44 + xxswapd 49, 49 + xxswapd 54, 54 + xxswapd 59, 59 + vsubuhm 16, 12, 13 + vadduhm 
15, 13, 12 + vsubuhm 21, 17, 18 + vadduhm 20, 18, 17 + vsubuhm 26, 22, 23 + vadduhm 25, 23, 22 + vsubuhm 31, 27, 28 + vadduhm 30, 28, 27 + xxmrglw 46, 48, 47 + xxmrghw 45, 48, 47 + xxmrgld 49, 45, 46 + xxmrglw 51, 53, 52 + xxmrghw 50, 53, 52 + xxmrgld 54, 50, 51 + xxmrglw 46, 58, 57 + xxmrghw 45, 58, 57 + xxmrgld 59, 45, 46 + xxmrglw 56, 63, 62 + xxmrghw 55, 63, 62 + xxmrgld 61, 55, 56 + stxvx 49, 3, 9 + stxvx 54, 3, 16 + stxvx 59, 3, 18 + stxvx 61, 3, 20 + addi 5, 5, 64 + lxv 39, 0(14) + lxv 40, 16(14) + lxv 41, 32(14) + lxv 42, 48(14) + addi 14, 14, 64 + mr 9, 5 + add 10, 7, 9 + addi 16, 9, 16 + addi 17, 10, 16 + addi 18, 16, 16 + addi 19, 17, 16 + addi 20, 18, 16 + addi 21, 19, 16 + lxvd2x 45, 3, 10 + lxvd2x 50, 3, 17 + lxvd2x 55, 3, 19 + lxvd2x 60, 3, 21 + xxswapd 45, 45 + xxswapd 50, 50 + xxswapd 55, 55 + xxswapd 60, 60 + vmladduhm 15, 13, 7, 3 + vmladduhm 20, 18, 8, 3 + vmladduhm 25, 23, 9, 3 + vmladduhm 30, 28, 10, 3 + vmhraddshs 14, 13, 7, 3 + vmhraddshs 19, 18, 8, 3 + vmhraddshs 24, 23, 9, 3 + vmhraddshs 29, 28, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 25, 2, 3 + vmladduhm 30, 30, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 + lxvd2x 44, 3, 9 + lxvd2x 49, 3, 16 + lxvd2x 54, 3, 18 + lxvd2x 59, 3, 20 + xxswapd 44, 44 + xxswapd 49, 49 + xxswapd 54, 54 + xxswapd 59, 59 + vsubuhm 16, 12, 13 + vadduhm 15, 13, 12 + vsubuhm 21, 17, 18 + vadduhm 20, 18, 17 + vsubuhm 26, 22, 23 + vadduhm 25, 23, 22 + vsubuhm 31, 27, 28 + vadduhm 30, 28, 27 + xxmrglw 46, 48, 47 + xxmrghw 45, 48, 47 + xxmrgld 49, 45, 46 + xxmrglw 51, 53, 52 + xxmrghw 50, 53, 52 + xxmrgld 54, 50, 51 + xxmrglw 46, 58, 57 + xxmrghw 45, 58, 57 + xxmrgld 59, 45, 46 + xxmrglw 56, 63, 62 + xxmrghw 55, 63, 62 + xxmrgld 61, 55, 56 + stxvx 49, 3, 9 + stxvx 54, 3, 16 + stxvx 59, 3, 18 + stxvx 61, 3, 20 + addi 5, 5, 64 + bdnz Lntt_ppc__Len2 + 
lxv 52, 128(1) + lxv 53, 144(1) + lxv 54, 160(1) + lxv 55, 176(1) + lxv 56, 192(1) + lxv 57, 208(1) + lxv 58, 224(1) + lxv 59, 240(1) + lxv 60, 256(1) + lxv 61, 272(1) + lxv 62, 288(1) + lxv 63, 304(1) + ld 14, 56(1) + ld 15, 64(1) + ld 16, 72(1) + ld 16, 72(1) + ld 17, 80(1) + ld 18, 88(1) + ld 19, 96(1) + ld 20, 104(1) + ld 21, 112(1) + mtlr 0 + addi 1, 1, 352 + blr + .cfi_endproc + +#endif /* MLK_ARITH_BACKEND_PPC64LE && !MLK_CONFIG_MULTILEVEL_NO_SHARED */ diff --git a/mlkem/src/native/ppc64le/src/poly_tomont.S b/mlkem/src/native/ppc64le/src/poly_tomont.S new file mode 100644 index 000000000..344579c4d --- /dev/null +++ b/mlkem/src/native/ppc64le/src/poly_tomont.S @@ -0,0 +1,333 @@ +/* + * Copyright (c) The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +/* + * Copyright 2025- IBM Corp. + * + *=================================================================================== + * Written by Danny Tsen + * + */ + +// Poly_tomont: Inplace conversion of all coefficients of a polynomial +// from normal domain to Montgomery domain +// +// Arguments:*r: pointer to input/output polynomial + +#include "../../../common.h" +#if defined(MLK_ARITH_BACKEND_PPC64LE) && \ + !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED) + +/* + * WARNING: This file is auto-derived from the mlkem-native source file + * dev/ppc64le/src/poly_tomont.S using scripts/simpasm. Do not modify it directly. 
+ */ + + +.text +.balign 16 +.global MLK_ASM_NAMESPACE(poly_tomont_ppc) +MLK_ASM_FN_SYMBOL(poly_tomont_ppc) + + .cfi_startproc + stdu 1, -320(1) + mflr 0 + stxv 52, 128(1) + stxv 53, 144(1) + stxv 54, 160(1) + stxv 55, 176(1) + stxv 56, 192(1) + stxv 57, 208(1) + stxv 58, 224(1) + stxv 59, 240(1) + stxv 60, 256(1) + stxv 61, 272(1) + stxv 62, 288(1) + lxv 37, 0(4) + lxv 34, 16(4) + lxv 32, 80(4) + vxor 3, 3, 3 + vspltish 4, 1 + li 4, -128 + li 5, -112 + li 6, -96 + li 7, -80 + li 8, -64 + li 9, -48 + li 10, -32 + li 11, -16 + lxvd2x 45, 0, 3 + addi 3, 3, 16 + lxvd2x 50, 0, 3 + addi 3, 3, 16 + lxvd2x 55, 0, 3 + addi 3, 3, 16 + lxvd2x 39, 0, 3 + addi 3, 3, 16 + vmladduhm 15, 13, 0, 3 + vmladduhm 20, 18, 0, 3 + vmladduhm 25, 23, 0, 3 + vmladduhm 9, 7, 0, 3 + vmhraddshs 14, 13, 0, 3 + vmhraddshs 19, 18, 0, 3 + vmhraddshs 24, 23, 0, 3 + vmhraddshs 8, 7, 0, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 25, 2, 3 + vmladduhm 9, 9, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 9, 9, 5, 8 + vsrah 27, 15, 4 + vsrah 28, 20, 4 + vsrah 29, 25, 4 + vsrah 30, 9, 4 + lxvd2x 45, 0, 3 + addi 3, 3, 16 + lxvd2x 50, 0, 3 + addi 3, 3, 16 + lxvd2x 55, 0, 3 + addi 3, 3, 16 + lxvd2x 39, 0, 3 + addi 3, 3, 16 + vmladduhm 15, 13, 0, 3 + vmladduhm 20, 18, 0, 3 + vmladduhm 25, 23, 0, 3 + vmladduhm 9, 7, 0, 3 + vmhraddshs 14, 13, 0, 3 + vmhraddshs 19, 18, 0, 3 + vmhraddshs 24, 23, 0, 3 + vmhraddshs 8, 7, 0, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 25, 2, 3 + vmladduhm 9, 9, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 9, 9, 5, 8 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 7, 9, 4 + stxvd2x 59, 4, 3 + stxvd2x 60, 5, 3 + stxvd2x 61, 6, 3 + stxvd2x 62, 7, 3 + stxvd2x 45, 8, 3 + stxvd2x 50, 9, 3 + stxvd2x 55, 10, 3 + stxvd2x 39, 11, 3 + lxvd2x 45, 0, 3 + addi 3, 3, 16 + lxvd2x 50, 0, 3 + addi 3, 3, 16 + lxvd2x 55, 0, 3 + addi 3, 
3, 16 + lxvd2x 39, 0, 3 + addi 3, 3, 16 + vmladduhm 15, 13, 0, 3 + vmladduhm 20, 18, 0, 3 + vmladduhm 25, 23, 0, 3 + vmladduhm 9, 7, 0, 3 + vmhraddshs 14, 13, 0, 3 + vmhraddshs 19, 18, 0, 3 + vmhraddshs 24, 23, 0, 3 + vmhraddshs 8, 7, 0, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 25, 2, 3 + vmladduhm 9, 9, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 9, 9, 5, 8 + vsrah 27, 15, 4 + vsrah 28, 20, 4 + vsrah 29, 25, 4 + vsrah 30, 9, 4 + lxvd2x 45, 0, 3 + addi 3, 3, 16 + lxvd2x 50, 0, 3 + addi 3, 3, 16 + lxvd2x 55, 0, 3 + addi 3, 3, 16 + lxvd2x 39, 0, 3 + addi 3, 3, 16 + vmladduhm 15, 13, 0, 3 + vmladduhm 20, 18, 0, 3 + vmladduhm 25, 23, 0, 3 + vmladduhm 9, 7, 0, 3 + vmhraddshs 14, 13, 0, 3 + vmhraddshs 19, 18, 0, 3 + vmhraddshs 24, 23, 0, 3 + vmhraddshs 8, 7, 0, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 25, 2, 3 + vmladduhm 9, 9, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 9, 9, 5, 8 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 7, 9, 4 + stxvd2x 59, 4, 3 + stxvd2x 60, 5, 3 + stxvd2x 61, 6, 3 + stxvd2x 62, 7, 3 + stxvd2x 45, 8, 3 + stxvd2x 50, 9, 3 + stxvd2x 55, 10, 3 + stxvd2x 39, 11, 3 + lxvd2x 45, 0, 3 + addi 3, 3, 16 + lxvd2x 50, 0, 3 + addi 3, 3, 16 + lxvd2x 55, 0, 3 + addi 3, 3, 16 + lxvd2x 39, 0, 3 + addi 3, 3, 16 + vmladduhm 15, 13, 0, 3 + vmladduhm 20, 18, 0, 3 + vmladduhm 25, 23, 0, 3 + vmladduhm 9, 7, 0, 3 + vmhraddshs 14, 13, 0, 3 + vmhraddshs 19, 18, 0, 3 + vmhraddshs 24, 23, 0, 3 + vmhraddshs 8, 7, 0, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 25, 2, 3 + vmladduhm 9, 9, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 9, 9, 5, 8 + vsrah 27, 15, 4 + vsrah 28, 20, 4 + vsrah 29, 25, 4 + vsrah 30, 9, 4 + lxvd2x 45, 0, 3 + addi 3, 3, 16 + lxvd2x 50, 0, 3 + addi 3, 3, 16 + lxvd2x 55, 0, 3 + addi 3, 3, 16 + lxvd2x 
39, 0, 3 + addi 3, 3, 16 + vmladduhm 15, 13, 0, 3 + vmladduhm 20, 18, 0, 3 + vmladduhm 25, 23, 0, 3 + vmladduhm 9, 7, 0, 3 + vmhraddshs 14, 13, 0, 3 + vmhraddshs 19, 18, 0, 3 + vmhraddshs 24, 23, 0, 3 + vmhraddshs 8, 7, 0, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 25, 2, 3 + vmladduhm 9, 9, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 9, 9, 5, 8 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 7, 9, 4 + stxvd2x 59, 4, 3 + stxvd2x 60, 5, 3 + stxvd2x 61, 6, 3 + stxvd2x 62, 7, 3 + stxvd2x 45, 8, 3 + stxvd2x 50, 9, 3 + stxvd2x 55, 10, 3 + stxvd2x 39, 11, 3 + lxvd2x 45, 0, 3 + addi 3, 3, 16 + lxvd2x 50, 0, 3 + addi 3, 3, 16 + lxvd2x 55, 0, 3 + addi 3, 3, 16 + lxvd2x 39, 0, 3 + addi 3, 3, 16 + vmladduhm 15, 13, 0, 3 + vmladduhm 20, 18, 0, 3 + vmladduhm 25, 23, 0, 3 + vmladduhm 9, 7, 0, 3 + vmhraddshs 14, 13, 0, 3 + vmhraddshs 19, 18, 0, 3 + vmhraddshs 24, 23, 0, 3 + vmhraddshs 8, 7, 0, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 25, 2, 3 + vmladduhm 9, 9, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 9, 9, 5, 8 + vsrah 27, 15, 4 + vsrah 28, 20, 4 + vsrah 29, 25, 4 + vsrah 30, 9, 4 + lxvd2x 45, 0, 3 + addi 3, 3, 16 + lxvd2x 50, 0, 3 + addi 3, 3, 16 + lxvd2x 55, 0, 3 + addi 3, 3, 16 + lxvd2x 39, 0, 3 + addi 3, 3, 16 + vmladduhm 15, 13, 0, 3 + vmladduhm 20, 18, 0, 3 + vmladduhm 25, 23, 0, 3 + vmladduhm 9, 7, 0, 3 + vmhraddshs 14, 13, 0, 3 + vmhraddshs 19, 18, 0, 3 + vmhraddshs 24, 23, 0, 3 + vmhraddshs 8, 7, 0, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 25, 2, 3 + vmladduhm 9, 9, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 9, 9, 5, 8 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 7, 9, 4 + stxvd2x 59, 4, 3 + stxvd2x 60, 5, 3 + stxvd2x 61, 6, 3 + stxvd2x 62, 7, 3 + stxvd2x 45, 8, 3 + stxvd2x 50, 9, 3 + stxvd2x 55, 
10, 3 + stxvd2x 39, 11, 3 + lxv 52, 128(1) + lxv 53, 144(1) + lxv 54, 160(1) + lxv 55, 176(1) + lxv 56, 192(1) + lxv 57, 208(1) + lxv 58, 224(1) + lxv 59, 240(1) + lxv 60, 256(1) + lxv 61, 272(1) + lxv 62, 288(1) + mtlr 0 + addi 1, 1, 320 + blr + .cfi_endproc + +#endif /* MLK_ARITH_BACKEND_PPC64LE && !MLK_CONFIG_MULTILEVEL_NO_SHARED */ diff --git a/mlkem/src/native/ppc64le/src/reduce.S b/mlkem/src/native/ppc64le/src/reduce.S new file mode 100644 index 000000000..0cf7783a4 --- /dev/null +++ b/mlkem/src/native/ppc64le/src/reduce.S @@ -0,0 +1,703 @@ +/* + * Copyright (c) The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + + +/* + * Copyright 2025- IBM Corp. + * + *=================================================================================== + * Written by Danny Tsen + * + */ +#include "../../../common.h" +#if defined(MLK_ARITH_BACKEND_PPC64LE) && \ + !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED) + +/* + * WARNING: This file is auto-derived from the mlkem-native source file + * dev/ppc64le/src/reduce.S using scripts/simpasm. Do not modify it directly. 
+ */ + + +.text +.balign 16 +.global MLK_ASM_NAMESPACE(reduce_ppc) +MLK_ASM_FN_SYMBOL(reduce_ppc) + + .cfi_startproc + stdu 1, -224(1) + mflr 0 + std 14, 96(1) + std 15, 104(1) + std 16, 112(1) + stxv 52, 128(1) + stxv 53, 144(1) + stxv 54, 160(1) + stxv 55, 176(1) + stxv 56, 192(1) + vxor 7, 7, 7 + lxv 35, 32(4) + lxv 32, 48(4) + vspltisw 2, 13 + vadduwm 2, 2, 2 + vspltisw 4, 1 + vsubuwm 5, 2, 4 + vslw 1, 4, 5 + li 4, -128 + li 5, -112 + li 6, -96 + li 7, -80 + li 8, -64 + li 9, -48 + li 10, -32 + li 11, -16 + li 14, 16 + li 15, 32 + li 16, 48 + lxvd2x 40, 0, 3 + lxvd2x 44, 14, 3 + lxvd2x 48, 15, 3 + lxvd2x 52, 16, 3 + addi 3, 3, 64 + vmulosh 6, 8, 0 + vmulesh 5, 8, 0 + vmulosh 11, 12, 0 + vmulesh 10, 12, 0 + vmulosh 15, 16, 0 + vmulesh 14, 16, 0 + vmulosh 19, 20, 0 + vmulesh 18, 20, 0 + xxmrglw 36, 37, 38 + xxmrghw 37, 37, 38 + xxmrglw 41, 42, 43 + xxmrghw 42, 42, 43 + xxmrglw 45, 46, 47 + xxmrghw 46, 46, 47 + xxmrglw 49, 50, 51 + xxmrghw 50, 50, 51 + vadduwm 4, 4, 1 + vadduwm 5, 5, 1 + vadduwm 9, 9, 1 + vadduwm 10, 10, 1 + vadduwm 13, 13, 1 + vadduwm 14, 14, 1 + vadduwm 17, 17, 1 + vadduwm 18, 18, 1 + vsraw 4, 4, 2 + vsraw 5, 5, 2 + vsraw 9, 9, 2 + vsraw 10, 10, 2 + vsraw 13, 13, 2 + vsraw 14, 14, 2 + vsraw 17, 17, 2 + vsraw 18, 18, 2 + vpkuwum 4, 5, 4 + vsubuhm 4, 7, 4 + vpkuwum 9, 10, 9 + vsubuhm 9, 7, 9 + vpkuwum 13, 14, 13 + vsubuhm 13, 7, 13 + vpkuwum 17, 18, 17 + vsubuhm 17, 7, 17 + vmladduhm 21, 4, 3, 8 + vmladduhm 22, 9, 3, 12 + vmladduhm 23, 13, 3, 16 + vmladduhm 24, 17, 3, 20 + lxvd2x 40, 0, 3 + lxvd2x 44, 14, 3 + lxvd2x 48, 15, 3 + lxvd2x 52, 16, 3 + addi 3, 3, 64 + vmulosh 6, 8, 0 + vmulesh 5, 8, 0 + vmulosh 11, 12, 0 + vmulesh 10, 12, 0 + vmulosh 15, 16, 0 + vmulesh 14, 16, 0 + vmulosh 19, 20, 0 + vmulesh 18, 20, 0 + xxmrglw 36, 37, 38 + xxmrghw 37, 37, 38 + xxmrglw 41, 42, 43 + xxmrghw 42, 42, 43 + xxmrglw 45, 46, 47 + xxmrghw 46, 46, 47 + xxmrglw 49, 50, 51 + xxmrghw 50, 50, 51 + vadduwm 4, 4, 1 + vadduwm 5, 5, 1 + vadduwm 9, 9, 1 + vadduwm 10, 
10, 1 + vadduwm 13, 13, 1 + vadduwm 14, 14, 1 + vadduwm 17, 17, 1 + vadduwm 18, 18, 1 + vsraw 4, 4, 2 + vsraw 5, 5, 2 + vsraw 9, 9, 2 + vsraw 10, 10, 2 + vsraw 13, 13, 2 + vsraw 14, 14, 2 + vsraw 17, 17, 2 + vsraw 18, 18, 2 + vpkuwum 4, 5, 4 + vsubuhm 4, 7, 4 + vpkuwum 9, 10, 9 + vsubuhm 9, 7, 9 + vpkuwum 13, 14, 13 + vsubuhm 13, 7, 13 + vpkuwum 17, 18, 17 + vsubuhm 17, 7, 17 + vmladduhm 4, 4, 3, 8 + vmladduhm 9, 9, 3, 12 + vmladduhm 13, 13, 3, 16 + vmladduhm 17, 17, 3, 20 + stxvd2x 53, 4, 3 + stxvd2x 54, 5, 3 + stxvd2x 55, 6, 3 + stxvd2x 56, 7, 3 + stxvd2x 36, 8, 3 + stxvd2x 41, 9, 3 + stxvd2x 45, 10, 3 + stxvd2x 49, 11, 3 + lxvd2x 40, 0, 3 + lxvd2x 44, 14, 3 + lxvd2x 48, 15, 3 + lxvd2x 52, 16, 3 + addi 3, 3, 64 + vmulosh 6, 8, 0 + vmulesh 5, 8, 0 + vmulosh 11, 12, 0 + vmulesh 10, 12, 0 + vmulosh 15, 16, 0 + vmulesh 14, 16, 0 + vmulosh 19, 20, 0 + vmulesh 18, 20, 0 + xxmrglw 36, 37, 38 + xxmrghw 37, 37, 38 + xxmrglw 41, 42, 43 + xxmrghw 42, 42, 43 + xxmrglw 45, 46, 47 + xxmrghw 46, 46, 47 + xxmrglw 49, 50, 51 + xxmrghw 50, 50, 51 + vadduwm 4, 4, 1 + vadduwm 5, 5, 1 + vadduwm 9, 9, 1 + vadduwm 10, 10, 1 + vadduwm 13, 13, 1 + vadduwm 14, 14, 1 + vadduwm 17, 17, 1 + vadduwm 18, 18, 1 + vsraw 4, 4, 2 + vsraw 5, 5, 2 + vsraw 9, 9, 2 + vsraw 10, 10, 2 + vsraw 13, 13, 2 + vsraw 14, 14, 2 + vsraw 17, 17, 2 + vsraw 18, 18, 2 + vpkuwum 4, 5, 4 + vsubuhm 4, 7, 4 + vpkuwum 9, 10, 9 + vsubuhm 9, 7, 9 + vpkuwum 13, 14, 13 + vsubuhm 13, 7, 13 + vpkuwum 17, 18, 17 + vsubuhm 17, 7, 17 + vmladduhm 21, 4, 3, 8 + vmladduhm 22, 9, 3, 12 + vmladduhm 23, 13, 3, 16 + vmladduhm 24, 17, 3, 20 + lxvd2x 40, 0, 3 + lxvd2x 44, 14, 3 + lxvd2x 48, 15, 3 + lxvd2x 52, 16, 3 + addi 3, 3, 64 + vmulosh 6, 8, 0 + vmulesh 5, 8, 0 + vmulosh 11, 12, 0 + vmulesh 10, 12, 0 + vmulosh 15, 16, 0 + vmulesh 14, 16, 0 + vmulosh 19, 20, 0 + vmulesh 18, 20, 0 + xxmrglw 36, 37, 38 + xxmrghw 37, 37, 38 + xxmrglw 41, 42, 43 + xxmrghw 42, 42, 43 + xxmrglw 45, 46, 47 + xxmrghw 46, 46, 47 + xxmrglw 49, 50, 51 + xxmrghw 
50, 50, 51 + vadduwm 4, 4, 1 + vadduwm 5, 5, 1 + vadduwm 9, 9, 1 + vadduwm 10, 10, 1 + vadduwm 13, 13, 1 + vadduwm 14, 14, 1 + vadduwm 17, 17, 1 + vadduwm 18, 18, 1 + vsraw 4, 4, 2 + vsraw 5, 5, 2 + vsraw 9, 9, 2 + vsraw 10, 10, 2 + vsraw 13, 13, 2 + vsraw 14, 14, 2 + vsraw 17, 17, 2 + vsraw 18, 18, 2 + vpkuwum 4, 5, 4 + vsubuhm 4, 7, 4 + vpkuwum 9, 10, 9 + vsubuhm 9, 7, 9 + vpkuwum 13, 14, 13 + vsubuhm 13, 7, 13 + vpkuwum 17, 18, 17 + vsubuhm 17, 7, 17 + vmladduhm 4, 4, 3, 8 + vmladduhm 9, 9, 3, 12 + vmladduhm 13, 13, 3, 16 + vmladduhm 17, 17, 3, 20 + stxvd2x 53, 4, 3 + stxvd2x 54, 5, 3 + stxvd2x 55, 6, 3 + stxvd2x 56, 7, 3 + stxvd2x 36, 8, 3 + stxvd2x 41, 9, 3 + stxvd2x 45, 10, 3 + stxvd2x 49, 11, 3 + lxvd2x 40, 0, 3 + lxvd2x 44, 14, 3 + lxvd2x 48, 15, 3 + lxvd2x 52, 16, 3 + addi 3, 3, 64 + vmulosh 6, 8, 0 + vmulesh 5, 8, 0 + vmulosh 11, 12, 0 + vmulesh 10, 12, 0 + vmulosh 15, 16, 0 + vmulesh 14, 16, 0 + vmulosh 19, 20, 0 + vmulesh 18, 20, 0 + xxmrglw 36, 37, 38 + xxmrghw 37, 37, 38 + xxmrglw 41, 42, 43 + xxmrghw 42, 42, 43 + xxmrglw 45, 46, 47 + xxmrghw 46, 46, 47 + xxmrglw 49, 50, 51 + xxmrghw 50, 50, 51 + vadduwm 4, 4, 1 + vadduwm 5, 5, 1 + vadduwm 9, 9, 1 + vadduwm 10, 10, 1 + vadduwm 13, 13, 1 + vadduwm 14, 14, 1 + vadduwm 17, 17, 1 + vadduwm 18, 18, 1 + vsraw 4, 4, 2 + vsraw 5, 5, 2 + vsraw 9, 9, 2 + vsraw 10, 10, 2 + vsraw 13, 13, 2 + vsraw 14, 14, 2 + vsraw 17, 17, 2 + vsraw 18, 18, 2 + vpkuwum 4, 5, 4 + vsubuhm 4, 7, 4 + vpkuwum 9, 10, 9 + vsubuhm 9, 7, 9 + vpkuwum 13, 14, 13 + vsubuhm 13, 7, 13 + vpkuwum 17, 18, 17 + vsubuhm 17, 7, 17 + vmladduhm 21, 4, 3, 8 + vmladduhm 22, 9, 3, 12 + vmladduhm 23, 13, 3, 16 + vmladduhm 24, 17, 3, 20 + lxvd2x 40, 0, 3 + lxvd2x 44, 14, 3 + lxvd2x 48, 15, 3 + lxvd2x 52, 16, 3 + addi 3, 3, 64 + vmulosh 6, 8, 0 + vmulesh 5, 8, 0 + vmulosh 11, 12, 0 + vmulesh 10, 12, 0 + vmulosh 15, 16, 0 + vmulesh 14, 16, 0 + vmulosh 19, 20, 0 + vmulesh 18, 20, 0 + xxmrglw 36, 37, 38 + xxmrghw 37, 37, 38 + xxmrglw 41, 42, 43 + xxmrghw 42, 
42, 43 + xxmrglw 45, 46, 47 + xxmrghw 46, 46, 47 + xxmrglw 49, 50, 51 + xxmrghw 50, 50, 51 + vadduwm 4, 4, 1 + vadduwm 5, 5, 1 + vadduwm 9, 9, 1 + vadduwm 10, 10, 1 + vadduwm 13, 13, 1 + vadduwm 14, 14, 1 + vadduwm 17, 17, 1 + vadduwm 18, 18, 1 + vsraw 4, 4, 2 + vsraw 5, 5, 2 + vsraw 9, 9, 2 + vsraw 10, 10, 2 + vsraw 13, 13, 2 + vsraw 14, 14, 2 + vsraw 17, 17, 2 + vsraw 18, 18, 2 + vpkuwum 4, 5, 4 + vsubuhm 4, 7, 4 + vpkuwum 9, 10, 9 + vsubuhm 9, 7, 9 + vpkuwum 13, 14, 13 + vsubuhm 13, 7, 13 + vpkuwum 17, 18, 17 + vsubuhm 17, 7, 17 + vmladduhm 4, 4, 3, 8 + vmladduhm 9, 9, 3, 12 + vmladduhm 13, 13, 3, 16 + vmladduhm 17, 17, 3, 20 + stxvd2x 53, 4, 3 + stxvd2x 54, 5, 3 + stxvd2x 55, 6, 3 + stxvd2x 56, 7, 3 + stxvd2x 36, 8, 3 + stxvd2x 41, 9, 3 + stxvd2x 45, 10, 3 + stxvd2x 49, 11, 3 + lxvd2x 40, 0, 3 + lxvd2x 44, 14, 3 + lxvd2x 48, 15, 3 + lxvd2x 52, 16, 3 + addi 3, 3, 64 + vmulosh 6, 8, 0 + vmulesh 5, 8, 0 + vmulosh 11, 12, 0 + vmulesh 10, 12, 0 + vmulosh 15, 16, 0 + vmulesh 14, 16, 0 + vmulosh 19, 20, 0 + vmulesh 18, 20, 0 + xxmrglw 36, 37, 38 + xxmrghw 37, 37, 38 + xxmrglw 41, 42, 43 + xxmrghw 42, 42, 43 + xxmrglw 45, 46, 47 + xxmrghw 46, 46, 47 + xxmrglw 49, 50, 51 + xxmrghw 50, 50, 51 + vadduwm 4, 4, 1 + vadduwm 5, 5, 1 + vadduwm 9, 9, 1 + vadduwm 10, 10, 1 + vadduwm 13, 13, 1 + vadduwm 14, 14, 1 + vadduwm 17, 17, 1 + vadduwm 18, 18, 1 + vsraw 4, 4, 2 + vsraw 5, 5, 2 + vsraw 9, 9, 2 + vsraw 10, 10, 2 + vsraw 13, 13, 2 + vsraw 14, 14, 2 + vsraw 17, 17, 2 + vsraw 18, 18, 2 + vpkuwum 4, 5, 4 + vsubuhm 4, 7, 4 + vpkuwum 9, 10, 9 + vsubuhm 9, 7, 9 + vpkuwum 13, 14, 13 + vsubuhm 13, 7, 13 + vpkuwum 17, 18, 17 + vsubuhm 17, 7, 17 + vmladduhm 21, 4, 3, 8 + vmladduhm 22, 9, 3, 12 + vmladduhm 23, 13, 3, 16 + vmladduhm 24, 17, 3, 20 + lxvd2x 40, 0, 3 + lxvd2x 44, 14, 3 + lxvd2x 48, 15, 3 + lxvd2x 52, 16, 3 + addi 3, 3, 64 + vmulosh 6, 8, 0 + vmulesh 5, 8, 0 + vmulosh 11, 12, 0 + vmulesh 10, 12, 0 + vmulosh 15, 16, 0 + vmulesh 14, 16, 0 + vmulosh 19, 20, 0 + vmulesh 18, 20, 
0 + xxmrglw 36, 37, 38 + xxmrghw 37, 37, 38 + xxmrglw 41, 42, 43 + xxmrghw 42, 42, 43 + xxmrglw 45, 46, 47 + xxmrghw 46, 46, 47 + xxmrglw 49, 50, 51 + xxmrghw 50, 50, 51 + vadduwm 4, 4, 1 + vadduwm 5, 5, 1 + vadduwm 9, 9, 1 + vadduwm 10, 10, 1 + vadduwm 13, 13, 1 + vadduwm 14, 14, 1 + vadduwm 17, 17, 1 + vadduwm 18, 18, 1 + vsraw 4, 4, 2 + vsraw 5, 5, 2 + vsraw 9, 9, 2 + vsraw 10, 10, 2 + vsraw 13, 13, 2 + vsraw 14, 14, 2 + vsraw 17, 17, 2 + vsraw 18, 18, 2 + vpkuwum 4, 5, 4 + vsubuhm 4, 7, 4 + vpkuwum 9, 10, 9 + vsubuhm 9, 7, 9 + vpkuwum 13, 14, 13 + vsubuhm 13, 7, 13 + vpkuwum 17, 18, 17 + vsubuhm 17, 7, 17 + vmladduhm 4, 4, 3, 8 + vmladduhm 9, 9, 3, 12 + vmladduhm 13, 13, 3, 16 + vmladduhm 17, 17, 3, 20 + stxvd2x 53, 4, 3 + stxvd2x 54, 5, 3 + stxvd2x 55, 6, 3 + stxvd2x 56, 7, 3 + stxvd2x 36, 8, 3 + stxvd2x 41, 9, 3 + stxvd2x 45, 10, 3 + stxvd2x 49, 11, 3 + nop + nop + ori 2, 2, 0 + addi 3, 3, -512 + xxspltib 41, 0 + vspltish 10, 15 + vmr 11, 3 + lxv 44, 0(3) + lxv 45, 16(3) + lxv 46, 32(3) + lxv 47, 48(3) + addi 3, 3, 64 + vsrh 1, 12, 10 + vsrh 0, 13, 10 + vsrh 3, 14, 10 + vsrh 2, 15, 10 + vadduhm 7, 12, 11 + vadduhm 8, 13, 11 + vadduhm 5, 14, 11 + vadduhm 6, 15, 11 + vcmpequh 1, 1, 9 + vcmpequh 0, 0, 9 + vcmpequh 3, 3, 9 + vcmpequh 2, 2, 9 + xxsel 33, 39, 44, 33 + xxsel 32, 40, 45, 32 + xxsel 35, 37, 46, 35 + xxsel 34, 38, 47, 34 + stxv 35, -32(3) + stxv 34, -16(3) + stxv 33, -64(3) + stxv 32, -48(3) + lxv 44, 0(3) + lxv 45, 16(3) + lxv 46, 32(3) + lxv 47, 48(3) + addi 3, 3, 64 + vsrh 1, 12, 10 + vsrh 0, 13, 10 + vsrh 3, 14, 10 + vsrh 2, 15, 10 + vadduhm 7, 12, 11 + vadduhm 8, 13, 11 + vadduhm 5, 14, 11 + vadduhm 6, 15, 11 + vcmpequh 1, 1, 9 + vcmpequh 0, 0, 9 + vcmpequh 3, 3, 9 + vcmpequh 2, 2, 9 + xxsel 33, 39, 44, 33 + xxsel 32, 40, 45, 32 + xxsel 35, 37, 46, 35 + xxsel 34, 38, 47, 34 + stxv 35, -32(3) + stxv 34, -16(3) + stxv 33, -64(3) + stxv 32, -48(3) + lxv 44, 0(3) + lxv 45, 16(3) + lxv 46, 32(3) + lxv 47, 48(3) + addi 3, 3, 64 + vsrh 1, 12, 10 + vsrh 
0, 13, 10 + vsrh 3, 14, 10 + vsrh 2, 15, 10 + vadduhm 7, 12, 11 + vadduhm 8, 13, 11 + vadduhm 5, 14, 11 + vadduhm 6, 15, 11 + vcmpequh 1, 1, 9 + vcmpequh 0, 0, 9 + vcmpequh 3, 3, 9 + vcmpequh 2, 2, 9 + xxsel 33, 39, 44, 33 + xxsel 32, 40, 45, 32 + xxsel 35, 37, 46, 35 + xxsel 34, 38, 47, 34 + stxv 35, -32(3) + stxv 34, -16(3) + stxv 33, -64(3) + stxv 32, -48(3) + lxv 44, 0(3) + lxv 45, 16(3) + lxv 46, 32(3) + lxv 47, 48(3) + addi 3, 3, 64 + vsrh 1, 12, 10 + vsrh 0, 13, 10 + vsrh 3, 14, 10 + vsrh 2, 15, 10 + vadduhm 7, 12, 11 + vadduhm 8, 13, 11 + vadduhm 5, 14, 11 + vadduhm 6, 15, 11 + vcmpequh 1, 1, 9 + vcmpequh 0, 0, 9 + vcmpequh 3, 3, 9 + vcmpequh 2, 2, 9 + xxsel 33, 39, 44, 33 + xxsel 32, 40, 45, 32 + xxsel 35, 37, 46, 35 + xxsel 34, 38, 47, 34 + stxv 35, -32(3) + stxv 34, -16(3) + stxv 33, -64(3) + stxv 32, -48(3) + lxv 44, 0(3) + lxv 45, 16(3) + lxv 46, 32(3) + lxv 47, 48(3) + addi 3, 3, 64 + vsrh 1, 12, 10 + vsrh 0, 13, 10 + vsrh 3, 14, 10 + vsrh 2, 15, 10 + vadduhm 7, 12, 11 + vadduhm 8, 13, 11 + vadduhm 5, 14, 11 + vadduhm 6, 15, 11 + vcmpequh 1, 1, 9 + vcmpequh 0, 0, 9 + vcmpequh 3, 3, 9 + vcmpequh 2, 2, 9 + xxsel 33, 39, 44, 33 + xxsel 32, 40, 45, 32 + xxsel 35, 37, 46, 35 + xxsel 34, 38, 47, 34 + stxv 35, -32(3) + stxv 34, -16(3) + stxv 33, -64(3) + stxv 32, -48(3) + lxv 44, 0(3) + lxv 45, 16(3) + lxv 46, 32(3) + lxv 47, 48(3) + addi 3, 3, 64 + vsrh 1, 12, 10 + vsrh 0, 13, 10 + vsrh 3, 14, 10 + vsrh 2, 15, 10 + vadduhm 7, 12, 11 + vadduhm 8, 13, 11 + vadduhm 5, 14, 11 + vadduhm 6, 15, 11 + vcmpequh 1, 1, 9 + vcmpequh 0, 0, 9 + vcmpequh 3, 3, 9 + vcmpequh 2, 2, 9 + xxsel 33, 39, 44, 33 + xxsel 32, 40, 45, 32 + xxsel 35, 37, 46, 35 + xxsel 34, 38, 47, 34 + stxv 35, -32(3) + stxv 34, -16(3) + stxv 33, -64(3) + stxv 32, -48(3) + lxv 44, 0(3) + lxv 45, 16(3) + lxv 46, 32(3) + lxv 47, 48(3) + addi 3, 3, 64 + vsrh 1, 12, 10 + vsrh 0, 13, 10 + vsrh 3, 14, 10 + vsrh 2, 15, 10 + vadduhm 7, 12, 11 + vadduhm 8, 13, 11 + vadduhm 5, 14, 11 + vadduhm 6, 15, 11 + 
vcmpequh 1, 1, 9 + vcmpequh 0, 0, 9 + vcmpequh 3, 3, 9 + vcmpequh 2, 2, 9 + xxsel 33, 39, 44, 33 + xxsel 32, 40, 45, 32 + xxsel 35, 37, 46, 35 + xxsel 34, 38, 47, 34 + stxv 35, -32(3) + stxv 34, -16(3) + stxv 33, -64(3) + stxv 32, -48(3) + lxv 44, 0(3) + lxv 45, 16(3) + lxv 46, 32(3) + lxv 47, 48(3) + addi 3, 3, 64 + vsrh 1, 12, 10 + vsrh 0, 13, 10 + vsrh 3, 14, 10 + vsrh 2, 15, 10 + vadduhm 7, 12, 11 + vadduhm 8, 13, 11 + vadduhm 5, 14, 11 + vadduhm 6, 15, 11 + vcmpequh 1, 1, 9 + vcmpequh 0, 0, 9 + vcmpequh 3, 3, 9 + vcmpequh 2, 2, 9 + xxsel 33, 39, 44, 33 + xxsel 32, 40, 45, 32 + xxsel 35, 37, 46, 35 + xxsel 34, 38, 47, 34 + stxv 35, -32(3) + stxv 34, -16(3) + stxv 33, -64(3) + stxv 32, -48(3) + ld 14, 96(1) + ld 15, 104(1) + ld 16, 112(1) + lxv 52, 128(1) + lxv 53, 144(1) + lxv 54, 160(1) + lxv 55, 176(1) + lxv 56, 192(1) + mtlr 0 + addi 1, 1, 224 + blr + .cfi_endproc + +#endif /* MLK_ARITH_BACKEND_PPC64LE && !MLK_CONFIG_MULTILEVEL_NO_SHARED */ diff --git a/scripts/autogen b/scripts/autogen index 819f00def..d80d0b724 100755 --- a/scripts/autogen +++ b/scripts/autogen @@ -1220,6 +1220,10 @@ def x86_64(c): return "/x86_64/" in c +def ppc64le(c): + return "/ppc64le/" in c + + def native_fips202(c): return native(c) and fips202(c) @@ -1252,6 +1256,10 @@ def native_arith_x86_64(c): return native_arith(c) and x86_64(c) +def native_arith_ppc64le(c): + return native_arith(c) and ppc64le(c) + + def native_arith_core(c): return ( native_arith(c) and not native_arith_x86_64(c) and not native_arith_aarch64(c) @@ -1431,6 +1439,10 @@ def gen_monolithic_source_file(dry_run=False): for c in filter(native_arith_x86_64, c_sources): yield f'#include "{c}"' yield "#endif" + yield "#if defined(MLK_SYS_PPC64LE)" + for c in filter(native_arith_ppc64le, c_sources): + yield f'#include "{c}"' + yield "#endif" yield "#endif" yield "" yield "#if defined(MLK_CONFIG_USE_NATIVE_BACKEND_FIPS202)" @@ -1515,6 +1527,10 @@ def gen_monolithic_asm_file(dry_run=False): for c in 
filter(native_arith_x86_64, asm_sources): yield f'#include "{c}"' yield "#endif" + yield "#if defined(MLK_SYS_PPC64LE)" + for c in filter(native_arith_ppc64le, asm_sources): + yield f'#include "{c}"' + yield "#endif" yield "#endif" yield "" yield "#if defined(MLK_CONFIG_USE_NATIVE_BACKEND_FIPS202)" @@ -1766,6 +1782,10 @@ def update_via_simpasm( source_arch = "aarch64" elif "x86_64" in infile_full: source_arch = "x86_64" + elif "ppc64le" in infile_full: + source_arch = "ppc64le" + elif "riscv64" in infile_full: + source_arch = "riscv64" else: raise Exception(f"Could not detect architecture of source file {infile_full}.") # Check native architecture @@ -1775,7 +1795,14 @@ def update_via_simpasm( native_arch = "x86_64" if native_arch != source_arch: - cross_prefix = f"{source_arch}-unknown-linux-gnu-" + arch_to_cross_prefix = { + "aarch64": "aarch64-unknown-linux-gnu-", + "x86_64": "x86_64-unknown-linux-gnu-", + "ppc64le": "powerpc64le-unknown-linux-gnu-", + "riscv64": "riscv64-unknown-linux-gnu-", + } + + cross_prefix = arch_to_cross_prefix[source_arch] cross_gcc = cross_prefix + "gcc" # Check if cross-compiler is present if shutil.which(cross_gcc) is None: @@ -1788,13 +1815,12 @@ def update_via_simpasm( with tempfile.NamedTemporaryFile(suffix=".S") as tmp: try: # Determine architecture from filename - arch = "aarch64" if "aarch64" in infile_full else "x86_64" cmd = [ "./scripts/simpasm", "--objdump=llvm-objdump", "--cfify", - "--arch=" + arch, + "--arch=" + source_arch, "-i", infile_full, "-o", @@ -2058,49 +2084,55 @@ def synchronize_backends( ), ) - synchronize_backend( - f"dev/aarch64_{ty}/src", - "mlkem/src/native/aarch64/src", - dry_run=dry_run, - delete=delete, - force_cross=force_cross, - no_simplify=no_simplify, - cflags="-Imlkem/src/native/aarch64/src", - ) - synchronize_backend( - "dev/fips202/aarch64/src", - "mlkem/src/fips202/native/aarch64/src", - dry_run=dry_run, - delete=delete, - force_cross=force_cross, - no_simplify=no_simplify, - 
cflags="-Imlkem/src/fips202/native/aarch64/src -march=armv8.4-a+sha3", - ) - synchronize_backend( - "dev/fips202/aarch64", - "mlkem/src/fips202/native/aarch64", - dry_run=dry_run, - delete=delete, - force_cross=force_cross, - no_simplify=no_simplify, - cflags="-Imlkem/src/fips202/native/aarch64 -march=armv8.4-a+sha3", - ) - synchronize_backend( - "dev/x86_64/src", - "mlkem/src/native/x86_64/src", - dry_run=dry_run, - delete=delete, - force_cross=force_cross, - no_simplify=no_simplify, - # Turn off control-flow protection (CET) explicitly. Newer versions of - # clang turn it on by default and insert endbr64 instructions at every - # global symbol. - # We insert endbr64 instruction manually via the MLK_ASM_FN_SYMBOL - # macro. - # This leads to duplicate endbr64 instructions causing a failure when - # comparing the object code before and after simplification. - cflags="-Imlkem/src/native/x86_64/src/ -mavx2 -mbmi2 -msse4 -fcf-protection=none", - ) + # Triples of + # - input backend directory under dev/ + # - output backend directory under mlkem/ + # - cflags + worklist = [ + ( + f"dev/aarch64_{ty}/src", + "mlkem/src/native/aarch64/src", + "-Imlkem/src/native/aarch64/src", + ), + ( + "dev/fips202/aarch64/src", + "mlkem/src/fips202/native/aarch64/src", + "-Imlkem/src/fips202/native/aarch64/src -march=armv8.4-a+sha3", + ), + ( + "dev/fips202/aarch64", + "mlkem/src/fips202/native/aarch64", + "-Imlkem/src/fips202/native/aarch64 -march=armv8.4-a+sha3", + ), + ( + "dev/x86_64/src", + "mlkem/src/native/x86_64/src", + # Turn off control-flow protection (CET) explicitly. Newer versions of + # clang turn it on by default and insert endbr64 instructions at every + # global symbol. + # We insert endbr64 instruction manually via the MLK_ASM_FN_SYMBOL + # macro. + # This leads to duplicate endbr64 instructions causing a failure when + # comparing the object code before and after simplification. 
+ "-Imlkem/src/native/x86_64/src/ -mavx2 -mbmi2 -msse4 -fcf-protection=none", + ), + ( + "dev/ppc64le/src", + "mlkem/src/native/ppc64le/src", + "-Imlkem/src/native/ppc64le/src -mvsx", + ), + ] + + for in_dir, out_dir, cflags in worklist: + synchronize_backend( + in_dir, + out_dir, + dry_run=dry_run, + delete=delete, + force_cross=force_cross, + no_simplify=no_simplify, + cflags=cflags, + ) def adjust_header_guard_for_filename(content, header_file): diff --git a/scripts/cfify b/scripts/cfify index a08d23707..fca0381fd 100755 --- a/scripts/cfify +++ b/scripts/cfify @@ -226,6 +226,19 @@ def add_cfi_directives(text, arch): i += 1 continue + elif arch == "riscv64": + # No special handling of riscv64 for now + pass + elif arch == "ppc64le": + # ppc64le: blr -> .cfi_endproc after blr + match = re.match(r"(\s*)blr\s*$", line, re.IGNORECASE) + if match: + indent = match.group(1) + result.append(line) + result.append(f"{indent}.cfi_endproc") + i += 1 + continue + result.append(line) i += 1 @@ -246,7 +259,7 @@ def main(): ) parser.add_argument( "--arch", - choices=["aarch64", "x86_64"], + choices=["aarch64", "x86_64", "riscv64", "ppc64le"], default="aarch64", help="Target architecture (default: aarch64)", ) diff --git a/scripts/simpasm b/scripts/simpasm index 5afa6bd9a..5a02221d6 100755 --- a/scripts/simpasm +++ b/scripts/simpasm @@ -246,7 +246,7 @@ def simplify(logger, args, asm_input, asm_output=None): logger.debug(f"Using raw global symbol {sym} going forward ...") cmd = [args.objdump, "--disassemble", tmp_objfile0] - if platform.system() == "Darwin": + if platform.system() == "Darwin" and args.arch == "aarch64": cmd += ["--triple=aarch64"] logger.debug(f"Disassembling temporary object file {tmp_objfile0} ...") @@ -255,6 +255,12 @@ def simplify(logger, args, asm_input, asm_output=None): logger.debug("Patching up disassembly ...") simplified = patchup_disasm(disasm, cfify=args.cfify) + # On ppc64le we're using 16 byte alignment + if args.arch == "ppc64le": + align = 16 + 
else: + align = 4 + autogen_header = [ "", "/*", @@ -264,7 +270,7 @@ def simplify(logger, args, asm_input, asm_output=None): "", "", ".text", - ".balign 4", + f".balign {align}", ] if args.preserve_preprocessor_directives is False: diff --git a/test/mk/components.mk b/test/mk/components.mk index cdcc3eb5d..88158f703 100644 --- a/test/mk/components.mk +++ b/test/mk/components.mk @@ -8,6 +8,7 @@ endif SOURCES += $(wildcard mlkem/src/*.c) ifeq ($(OPT),1) SOURCES += $(wildcard mlkem/src/native/aarch64/src/*.[csS]) $(wildcard mlkem/src/native/x86_64/src/*.[csS]) + SOURCES += $(wildcard mlkem/src/native/ppc64le/src/*.[csS]) CFLAGS += -DMLK_CONFIG_USE_NATIVE_BACKEND_ARITH -DMLK_CONFIG_USE_NATIVE_BACKEND_FIPS202 endif