From 379b393df264e0b06e2b394c1262120a696bf3d4 Mon Sep 17 00:00:00 2001 From: Danny Tsen Date: Mon, 8 Sep 2025 11:53:55 -0400 Subject: [PATCH 1/6] Added optimized ppc64le support functions for ML-KEM. The supported native functions include: 1. MLK_USE_NATIVE_NTT (ntt_ppc.S) 2. MLK_USE_NATIVE_INTT (intt_ppc.S) 3. MLK_USE_NATIVE_POLY_REDUCE (reduce.S) 4. MLK_USE_NATIVE_POLY_TOMONT (poly_tomont.S) And other interface functions and headers. Signed-off-by: Danny Tsen --- BIBLIOGRAPHY.md | 1 + dev/ppc64le/README.md | 6 + dev/ppc64le/meta.h | 53 ++ dev/ppc64le/src/arith_native_ppc64le.h | 24 + dev/ppc64le/src/consts.c | 155 ++++ dev/ppc64le/src/consts.h | 26 + dev/ppc64le/src/intt_ppc.S | 672 +++++++++++++++++ dev/ppc64le/src/ntt_ppc.S | 408 +++++++++++ dev/ppc64le/src/poly_tomont.S | 161 ++++ dev/ppc64le/src/reduce.S | 223 ++++++ integration/liboqs/ML-KEM-1024_META.yml | 19 + integration/liboqs/ML-KEM-512_META.yml | 19 + integration/liboqs/ML-KEM-768_META.yml | 19 + integration/liboqs/config_ppc64le.h | 266 +++++++ mlkem/mlkem_native.S | 27 + mlkem/mlkem_native.c | 27 + mlkem/src/native/meta.h | 4 + mlkem/src/native/ppc64le/README.md | 6 + mlkem/src/native/ppc64le/meta.h | 53 ++ .../native/ppc64le/src/arith_native_ppc64le.h | 24 + mlkem/src/native/ppc64le/src/consts.c | 155 ++++ mlkem/src/native/ppc64le/src/consts.h | 26 + mlkem/src/native/ppc64le/src/intt_ppc.S | 685 ++++++++++++++++++ mlkem/src/native/ppc64le/src/ntt_ppc.S | 412 +++++++++++ mlkem/src/native/ppc64le/src/poly_tomont.S | 165 +++++ mlkem/src/native/ppc64le/src/reduce.S | 228 ++++++ test/mk/components.mk | 1 + 27 files changed, 3865 insertions(+) create mode 100644 dev/ppc64le/README.md create mode 100644 dev/ppc64le/meta.h create mode 100644 dev/ppc64le/src/arith_native_ppc64le.h create mode 100644 dev/ppc64le/src/consts.c create mode 100644 dev/ppc64le/src/consts.h create mode 100644 dev/ppc64le/src/intt_ppc.S create mode 100644 dev/ppc64le/src/ntt_ppc.S create mode 100644 dev/ppc64le/src/poly_tomont.S create mode 100644 dev/ppc64le/src/reduce.S create mode 100644 integration/liboqs/config_ppc64le.h create mode 100644 mlkem/src/native/ppc64le/README.md create mode 100644 mlkem/src/native/ppc64le/meta.h create mode 100644 mlkem/src/native/ppc64le/src/arith_native_ppc64le.h create mode 100644 mlkem/src/native/ppc64le/src/consts.c create mode 100644 mlkem/src/native/ppc64le/src/consts.h create mode 100644 mlkem/src/native/ppc64le/src/intt_ppc.S create mode 100644 mlkem/src/native/ppc64le/src/ntt_ppc.S create mode 100644 mlkem/src/native/ppc64le/src/poly_tomont.S create mode 100644 mlkem/src/native/ppc64le/src/reduce.S diff --git a/BIBLIOGRAPHY.md b/BIBLIOGRAPHY.md index f10a15f6e..d75d368ef 100644 --- a/BIBLIOGRAPHY.md +++ b/BIBLIOGRAPHY.md @@ -29,6 +29,7 @@ source code and documentation. 
- [examples/basic_deterministic/mlkem_native/custom_no_randomized_config.h](examples/basic_deterministic/mlkem_native/custom_no_randomized_config.h) - [integration/liboqs/config_aarch64.h](integration/liboqs/config_aarch64.h) - [integration/liboqs/config_c.h](integration/liboqs/config_c.h) + - [integration/liboqs/config_ppc64le.h](integration/liboqs/config_ppc64le.h) - [integration/liboqs/config_x86_64.h](integration/liboqs/config_x86_64.h) - [mlkem/src/config.h](mlkem/src/config.h) - [mlkem/src/kem.c](mlkem/src/kem.c) diff --git a/dev/ppc64le/README.md b/dev/ppc64le/README.md new file mode 100644 index 000000000..5125a40ea --- /dev/null +++ b/dev/ppc64le/README.md @@ -0,0 +1,6 @@ +[//]: # (SPDX-License-Identifier: CC-BY-4.0) + +# ppc64le backend (little endian) + +This directory contains a native backend for little endian POWER 8 (ppc64le) and above systems. + diff --git a/dev/ppc64le/meta.h b/dev/ppc64le/meta.h new file mode 100644 index 000000000..34f8cbec6 --- /dev/null +++ b/dev/ppc64le/meta.h @@ -0,0 +1,53 @@ +/* + * Copyright (c) The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +#ifndef MLK_DEV_PPC64LE_META_H +#define MLK_DEV_PPC64LE_META_H + +/* Identifier for this backend so that source and assembly files + * in the build can be appropriately guarded. */ +#define MLK_ARITH_BACKEND_PPC64LE_DEFAULT + +#define MLK_ARITH_BACKEND_NAME PPC64LE_DEFAULT + +/* Set of primitives that this backend replaces */ +#define MLK_USE_NATIVE_NTT +#define MLK_USE_NATIVE_INTT +#define MLK_USE_NATIVE_POLY_REDUCE +#define MLK_USE_NATIVE_POLY_TOMONT + +#if !defined(__ASSEMBLER__) +#include +#include "../../common.h" +#include "../../params.h" +#include "../api.h" +#include "src/arith_native_ppc64le.h" + +static MLK_INLINE int mlk_ntt_native(int16_t data[MLKEM_N]) +{ + mlk_ntt_ppc(data, mlk_ppc_qdata); + return MLK_NATIVE_FUNC_SUCCESS; +} + +static MLK_INLINE int mlk_intt_native(int16_t data[MLKEM_N]) +{ + mlk_intt_ppc(data, mlk_ppc_qdata); + return MLK_NATIVE_FUNC_SUCCESS; +} + +static MLK_INLINE int mlk_poly_reduce_native(int16_t data[MLKEM_N]) +{ + mlk_reduce_ppc(data, mlk_ppc_qdata); + return MLK_NATIVE_FUNC_SUCCESS; +} + +static MLK_INLINE int mlk_poly_tomont_native(int16_t data[MLKEM_N]) +{ + mlk_poly_tomont_ppc(data, mlk_ppc_qdata); + return MLK_NATIVE_FUNC_SUCCESS; +} +#endif /* !__ASSEMBLER__ */ + +#endif /* !MLK_DEV_PPC64LE_META_H */ diff --git a/dev/ppc64le/src/arith_native_ppc64le.h b/dev/ppc64le/src/arith_native_ppc64le.h new file mode 100644 index 000000000..1c7534668 --- /dev/null +++ b/dev/ppc64le/src/arith_native_ppc64le.h @@ -0,0 +1,24 @@ +/* + * Copyright (c) 2024-2025 The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 + */ +#ifndef MLK_DEV_PPC64LE_SRC_ARITH_NATIVE_PPC64LE_H +#define MLK_DEV_PPC64LE_SRC_ARITH_NATIVE_PPC64LE_H + +#include +#include "../../../common.h" +#include "consts.h" + +#define mlk_ntt_ppc MLK_NAMESPACE(ntt_ppc) +void mlk_ntt_ppc(int16_t *, const int16_t *); + +#define mlk_intt_ppc MLK_NAMESPACE(intt_ppc) +void mlk_intt_ppc(int16_t *, const int16_t *); + +#define mlk_reduce_ppc MLK_NAMESPACE(reduce_ppc) +void mlk_reduce_ppc(int16_t *r, const int16_t *); + +#define mlk_poly_tomont_ppc MLK_NAMESPACE(poly_tomont_ppc) +void mlk_poly_tomont_ppc(int16_t *, const int16_t *); + +#endif /* !MLK_DEV_PPC64LE_SRC_ARITH_NATIVE_PPC64LE_H */ diff --git a/dev/ppc64le/src/consts.c b/dev/ppc64le/src/consts.c new file mode 100644 index 000000000..4c2fbdf61 --- /dev/null +++ b/dev/ppc64le/src/consts.c @@ -0,0 +1,155 @@ 
+/* + * Copyright (c) The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +#include "../../../common.h" + +#if defined(MLK_ARITH_BACKEND_PPC64LE_DEFAULT) && \ + !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED) + +#include "consts.h" + +MLK_ALIGN const int16_t mlk_ppc_qdata[1568] = { + /* -Q */ + -3329, -3329, -3329, -3329, -3329, -3329, -3329, -3329, + /* QINV */ + -3327, -3327, -3327, -3327, -3327, -3327, -3327, -3327, + /* Q */ + 3329, 3329, 3329, 3329, 3329, 3329, 3329, 3329, + /* const 20159 for reduce.S and intt */ + 20159, 20159, 20159, 20159, 20159, 20159, 20159, 20159, + /* const 1441 for intt */ + 1441, 1441, 1441, 1441, 1441, 1441, 1441, 1441, + /* for poly_tomont.S */ + 1353, 1353, 1353, 1353, 1353, 1353, 1353, 1353, + /* zetas */ + /* For ntt Len=128, offset 96 */ + -758, -758, -758, -758, -758, -758, -758, -758, -359, -359, -359, -359, + -359, -359, -359, -359, -1517, -1517, -1517, -1517, -1517, -1517, -1517, + -1517, 1493, 1493, 1493, 1493, 1493, 1493, 1493, 1493, 1422, 1422, 1422, + 1422, 1422, 1422, 1422, 1422, 287, 287, 287, 287, 287, 287, 287, 287, 202, + 202, 202, 202, 202, 202, 202, 202, -171, -171, -171, -171, -171, -171, -171, + -171, 622, 622, 622, 622, 622, 622, 622, 622, 1577, 1577, 1577, 1577, 1577, + 1577, 1577, 1577, 182, 182, 182, 182, 182, 182, 182, 182, 962, 962, 962, + 962, 962, 962, 962, 962, -1202, -1202, -1202, -1202, -1202, -1202, -1202, + -1202, -1474, -1474, -1474, -1474, -1474, -1474, -1474, -1474, 1468, 1468, + 1468, 1468, 1468, 1468, 1468, 1468, 573, 573, 573, 573, 573, 573, 573, 573, + -1325, -1325, -1325, -1325, -1325, -1325, -1325, -1325, 264, 264, 264, 264, + 264, 264, 264, 264, 383, 383, 383, 383, 383, 383, 383, 383, -829, -829, + -829, -829, -829, -829, -829, -829, 1458, 1458, 1458, 1458, 1458, 1458, + 1458, 1458, -1602, -1602, -1602, -1602, -1602, -1602, -1602, -1602, -130, + -130, -130, -130, -130, -130, -130, -130, -681, -681, -681, -681, -681, + -681, -681, -681, 1017, 1017, 1017, 1017, 1017, 1017, 1017, 1017, 732, 732, + 732, 732, 732, 732, 732, 732, 608, 608, 608, 608, 608, 608, 608, 608, -1542, + -1542, -1542, -1542, -1542, -1542, -1542, -1542, 411, 411, 411, 411, 411, + 411, 411, 411, -205, -205, -205, -205, -205, -205, -205, -205, -1571, -1571, + -1571, -1571, -1571, -1571, -1571, -1571, 1223, 1223, 1223, 1223, 1223, + 1223, 1223, 1223, 652, 652, 652, 652, 652, 652, 652, 652, -552, -552, -552, + -552, -552, -552, -552, -552, 1015, 1015, 1015, 1015, 1015, 1015, 1015, + 1015, -1293, -1293, -1293, -1293, -1293, -1293, -1293, -1293, 1491, 1491, + 1491, 1491, 1491, 1491, 1491, 1491, -282, -282, -282, -282, -282, -282, + -282, -282, -1544, -1544, -1544, -1544, -1544, -1544, -1544, -1544, 516, + 516, 516, 516, 516, 516, 516, 516, -8, -8, -8, -8, -8, -8, -8, -8, -320, + -320, -320, -320, -320, -320, -320, -320, -666, -666, -666, -666, -666, + -666, -666, -666, -1618, -1618, -1618, -1618, -1618, -1618, -1618, -1618, + -1162, -1162, -1162, -1162, -1162, -1162, -1162, -1162, 126, 126, 126, 126, + 126, 126, 126, 126, 1469, 1469, 1469, 1469, 1469, 1469, 1469, 1469, -853, + -853, -853, -853, -853, -853, -853, -853, -90, -90, -90, -90, -90, -90, -90, + -90, -271, -271, -271, -271, -271, -271, -271, -271, 830, 830, 830, 830, + 830, 830, 830, 830, 107, 107, 107, 107, 107, 107, 107, 107, -1421, -1421, + -1421, -1421, -1421, -1421, -1421, -1421, -247, -247, -247, -247, -247, + -247, -247, -247, -951, -951, -951, -951, -951, -951, -951, -951, -398, + -398, -398, -398, -398, -398, -398, -398, 961, 961, 961, 961, 961, 
961, 961, + 961, -1508, -1508, -1508, -1508, -1508, -1508, -1508, -1508, -725, -725, + -725, -725, -725, -725, -725, -725, 448, 448, 448, 448, 448, 448, 448, 448, + -1065, -1065, -1065, -1065, -1065, -1065, -1065, -1065, 677, 677, 677, 677, + 677, 677, 677, 677, -1275, -1275, -1275, -1275, -1275, -1275, -1275, -1275, + /* For intt Len=2, offset IZETA_NTT_OFFSET64 */ + -1103, -1103, -1103, -1103, 430, 430, 430, 430, 555, 555, 555, 555, 843, + 843, 843, 843, -1251, -1251, -1251, -1251, 871, 871, 871, 871, 1550, 1550, + 1550, 1550, 105, 105, 105, 105, 422, 422, 422, 422, 587, 587, 587, 587, 177, + 177, 177, 177, -235, -235, -235, -235, -291, -291, -291, -291, -460, -460, + -460, -460, 1574, 1574, 1574, 1574, 1653, 1653, 1653, 1653, -246, -246, + -246, -246, 778, 778, 778, 778, 1159, 1159, 1159, 1159, -147, -147, -147, + -147, -777, -777, -777, -777, 1483, 1483, 1483, 1483, -602, -602, -602, + -602, 1119, 1119, 1119, 1119, -1590, -1590, -1590, -1590, 644, 644, 644, + 644, -872, -872, -872, -872, 349, 349, 349, 349, 418, 418, 418, 418, 329, + 329, 329, 329, -156, -156, -156, -156, -75, -75, -75, -75, 817, 817, 817, + 817, 1097, 1097, 1097, 1097, 603, 603, 603, 603, 610, 610, 610, 610, 1322, + 1322, 1322, 1322, -1285, -1285, -1285, -1285, -1465, -1465, -1465, -1465, + 384, 384, 384, 384, -1215, -1215, -1215, -1215, -136, -136, -136, -136, + 1218, 1218, 1218, 1218, -1335, -1335, -1335, -1335, -874, -874, -874, -874, + 220, 220, 220, 220, -1187, -1187, -1187, -1187, -1659, -1659, -1659, -1659, + -1185, -1185, -1185, -1185, -1530, -1530, -1530, -1530, -1278, -1278, -1278, + -1278, 794, 794, 794, 794, -1510, -1510, -1510, -1510, -854, -854, -854, + -854, -870, -870, -870, -870, 478, 478, 478, 478, -108, -108, -108, -108, + -308, -308, -308, -308, 996, 996, 996, 996, 991, 991, 991, 991, 958, 958, + 958, 958, -1460, -1460, -1460, -1460, 1522, 1522, 1522, 1522, 1628, 1628, + 1628, 1628, + /* For intt Len=2, offset IZETA_NTT_OFFSET127 */ + 1628, 1628, 1628, 1628, 1522, 1522, 1522, 1522, -1460, -1460, -1460, -1460, + 958, 958, 958, 958, 991, 991, 991, 991, 996, 996, 996, 996, -308, -308, + -308, -308, -108, -108, -108, -108, 478, 478, 478, 478, -870, -870, -870, + -870, -854, -854, -854, -854, -1510, -1510, -1510, -1510, 794, 794, 794, + 794, -1278, -1278, -1278, -1278, -1530, -1530, -1530, -1530, -1185, -1185, + -1185, -1185, -1659, -1659, -1659, -1659, -1187, -1187, -1187, -1187, 220, + 220, 220, 220, -874, -874, -874, -874, -1335, -1335, -1335, -1335, 1218, + 1218, 1218, 1218, -136, -136, -136, -136, -1215, -1215, -1215, -1215, 384, + 384, 384, 384, -1465, -1465, -1465, -1465, -1285, -1285, -1285, -1285, 1322, + 1322, 1322, 1322, 610, 610, 610, 610, 603, 603, 603, 603, 1097, 1097, 1097, + 1097, 817, 817, 817, 817, -75, -75, -75, -75, -156, -156, -156, -156, 329, + 329, 329, 329, 418, 418, 418, 418, 349, 349, 349, 349, -872, -872, -872, + -872, 644, 644, 644, 644, -1590, -1590, -1590, -1590, 1119, 1119, 1119, + 1119, -602, -602, -602, -602, 1483, 1483, 1483, 1483, -777, -777, -777, + -777, -147, -147, -147, -147, 1159, 1159, 1159, 1159, 778, 778, 778, 778, + -246, -246, -246, -246, 1653, 1653, 1653, 1653, 1574, 1574, 1574, 1574, + -460, -460, -460, -460, -291, -291, -291, -291, -235, -235, -235, -235, 177, + 177, 177, 177, 587, 587, 587, 587, 422, 422, 422, 422, 105, 105, 105, 105, + 1550, 1550, 1550, 1550, 871, 871, 871, 871, -1251, -1251, -1251, -1251, 843, + 843, 843, 843, 555, 555, 555, 555, 430, 430, 430, 430, -1103, -1103, -1103, + -1103, + /* For intt Len=4 and others, offset IZETA_NTT_OFFSET63 
*/ + -1275, -1275, -1275, -1275, -1275, -1275, -1275, -1275, 677, 677, 677, 677, + 677, 677, 677, 677, -1065, -1065, -1065, -1065, -1065, -1065, -1065, -1065, + 448, 448, 448, 448, 448, 448, 448, 448, -725, -725, -725, -725, -725, -725, + -725, -725, -1508, -1508, -1508, -1508, -1508, -1508, -1508, -1508, 961, + 961, 961, 961, 961, 961, 961, 961, -398, -398, -398, -398, -398, -398, -398, + -398, -951, -951, -951, -951, -951, -951, -951, -951, -247, -247, -247, + -247, -247, -247, -247, -247, -1421, -1421, -1421, -1421, -1421, -1421, + -1421, -1421, 107, 107, 107, 107, 107, 107, 107, 107, 830, 830, 830, 830, + 830, 830, 830, 830, -271, -271, -271, -271, -271, -271, -271, -271, -90, + -90, -90, -90, -90, -90, -90, -90, -853, -853, -853, -853, -853, -853, -853, + -853, 1469, 1469, 1469, 1469, 1469, 1469, 1469, 1469, 126, 126, 126, 126, + 126, 126, 126, 126, -1162, -1162, -1162, -1162, -1162, -1162, -1162, -1162, + -1618, -1618, -1618, -1618, -1618, -1618, -1618, -1618, -666, -666, -666, + -666, -666, -666, -666, -666, -320, -320, -320, -320, -320, -320, -320, + -320, -8, -8, -8, -8, -8, -8, -8, -8, 516, 516, 516, 516, 516, 516, 516, + 516, -1544, -1544, -1544, -1544, -1544, -1544, -1544, -1544, -282, -282, + -282, -282, -282, -282, -282, -282, 1491, 1491, 1491, 1491, 1491, 1491, + 1491, 1491, -1293, -1293, -1293, -1293, -1293, -1293, -1293, -1293, 1015, + 1015, 1015, 1015, 1015, 1015, 1015, 1015, -552, -552, -552, -552, -552, + -552, -552, -552, 652, 652, 652, 652, 652, 652, 652, 652, 1223, 1223, 1223, + 1223, 1223, 1223, 1223, 1223, -1571, -1571, -1571, -1571, -1571, -1571, + -1571, -1571, -205, -205, -205, -205, -205, -205, -205, -205, 411, 411, 411, + 411, 411, 411, 411, 411, -1542, -1542, -1542, -1542, -1542, -1542, -1542, + -1542, 608, 608, 608, 608, 608, 608, 608, 608, 732, 732, 732, 732, 732, 732, + 732, 732, 1017, 1017, 1017, 1017, 1017, 1017, 1017, 1017, -681, -681, -681, + -681, -681, -681, -681, -681, -130, -130, -130, -130, -130, -130, -130, + -130, -1602, -1602, -1602, -1602, -1602, -1602, -1602, -1602, 1458, 1458, + 1458, 1458, 1458, 1458, 1458, 1458, -829, -829, -829, -829, -829, -829, + -829, -829, 383, 383, 383, 383, 383, 383, 383, 383, 264, 264, 264, 264, 264, + 264, 264, 264, -1325, -1325, -1325, -1325, -1325, -1325, -1325, -1325, 573, + 573, 573, 573, 573, 573, 573, 573, 1468, 1468, 1468, 1468, 1468, 1468, 1468, + 1468, -1474, -1474, -1474, -1474, -1474, -1474, -1474, -1474, -1202, -1202, + -1202, -1202, -1202, -1202, -1202, -1202, 962, 962, 962, 962, 962, 962, 962, + 962, 182, 182, 182, 182, 182, 182, 182, 182, 1577, 1577, 1577, 1577, 1577, + 1577, 1577, 1577, 622, 622, 622, 622, 622, 622, 622, 622, -171, -171, -171, + -171, -171, -171, -171, -171, 202, 202, 202, 202, 202, 202, 202, 202, 287, + 287, 287, 287, 287, 287, 287, 287, 1422, 1422, 1422, 1422, 1422, 1422, 1422, + 1422, 1493, 1493, 1493, 1493, 1493, 1493, 1493, 1493, -1517, -1517, -1517, + -1517, -1517, -1517, -1517, -1517, -359, -359, -359, -359, -359, -359, -359, + -359, -758, -758, -758, -758, -758, -758, -758, -758}; + +#endif /* MLK_ARITH_BACKEND_PPC64LE_DEFAULT && \ + !MLK_CONFIG_MULTILEVEL_NO_SHARED */ diff --git a/dev/ppc64le/src/consts.h b/dev/ppc64le/src/consts.h new file mode 100644 index 000000000..d424601ac --- /dev/null +++ b/dev/ppc64le/src/consts.h @@ -0,0 +1,26 @@ +/* + * Copyright (c) The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +#ifndef MLK_DEV_PPC64LE_SRC_CONSTS_H +#define MLK_DEV_PPC64LE_SRC_CONSTS_H +#include "../../../common.h" + +#define 
NQ_OFFSET 0 +#define QINV_OFFSET 16 +#define Q_OFFSET 32 +#define C20159_OFFSET 48 +#define C1441_OFFSET 64 +#define C1353_OFFSET 80 +#define ZETA_NTT_OFFSET 96 +#define ZETA_NTT_OFFSET64 1104 +#define IZETA_NTT_OFFSET127 1616 +#define IZETA_NTT_OFFSET63 2128 + +#ifndef __ASSEMBLER__ +#define mlk_ppc_qdata MLK_NAMESPACE(ppc_qdata) +extern const int16_t mlk_ppc_qdata[]; +#endif + +#endif /* !MLK_DEV_PPC64LE_SRC_CONSTS_H */ diff --git a/dev/ppc64le/src/intt_ppc.S b/dev/ppc64le/src/intt_ppc.S new file mode 100644 index 000000000..1f4b48e42 --- /dev/null +++ b/dev/ppc64le/src/intt_ppc.S @@ -0,0 +1,672 @@ +/* + * Copyright (c) The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +# +# Copyright 2025- IBM Corp. +# +#=================================================================================== +# Written by Danny Tsen +# + +#include "../../../common.h" +#if defined(MLK_ARITH_BACKEND_PPC64LE_DEFAULT) && \ + !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED) +/* simpasm: header-end */ + +#include "consts.h" + +.machine "any" +.text + +# Barrett reduce constatnts +#define V20159 0 +#define V_25 1 +#define V_26 2 +#define V_MKQ 3 + +# Montgomery reduce constatnts +#define V_QINV 2 +#define V_NMKQ 5 +#define V_Z0 7 +#define V_Z1 8 +#define V_Z2 9 +#define V_Z3 10 +#define V_ZETA 10 +#define V1441 10 + +.macro Load_4Coeffs start next step + mr 9, \start # j + add 10, 7, 9 # J + len*2 + addi 16, 9, \next + addi 17, 10, \step + addi 18, 16, \next + addi 19, 17, \step + addi 20, 18, \next + addi 21, 19, \step + lxvd2x 32+8, 3, 10 # r[j+len] + lxvd2x 32+12, 3, 17 # r[j+len] + lxvd2x 32+16, 3, 19 # r[j+len] + lxvd2x 32+20, 3, 21 # r[j+len] + xxpermdi 32+8, 32+8, 32+8, 2 + xxpermdi 32+12, 32+12, 32+12, 2 + xxpermdi 32+16, 32+16, 32+16, 2 + xxpermdi 32+20, 32+20, 32+20, 2 + + lxvd2x 32+21, 3, 9 + lxvd2x 32+22, 3, 16 + lxvd2x 32+23, 3, 18 + lxvd2x 32+24, 3, 20 + xxpermdi 32+21, 32+21, 32+21, 2 + xxpermdi 32+22, 32+22, 32+22, 2 + xxpermdi 32+23, 32+23, 32+23, 2 + xxpermdi 32+24, 32+24, 32+24, 2 + + vsubuhm 25, 8, 21 # r[j+len] - t + vsubuhm 26, 12, 22 # r[j+len] - t + vsubuhm 30, 16, 23 # r[j+len] - t + vsubuhm 31, 20, 24 # r[j+len] - t + vadduhm 8, 8, 21 # r[j+len] + t + vadduhm 12, 12, 22 # r[j+len] + t + vadduhm 16, 16, 23 # r[j+len] + t + vadduhm 20, 20, 24 # r[j+len] + t +.endm + +.macro BREDUCE_4X _v0 _v1 _v2 _v3 + vxor 7, 7, 7 + xxlor 32+3, 6, 6 # V_MKQ + xxlor 32+1, 7, 7 # V_25 + xxlor 32+2, 8, 8 # V_26 + # Multify Odd/Even signed halfword; + # Results word bound by 2^32 in abs value. 
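+  # Scalar view of this Barrett reduction, per 16-bit lane, where
+  # 20159 == round(2^26 / MLKEM_Q):
+  #   t = (20159 * a + 2^25) >> 26;  a = a - t * MLKEM_Q
+  # The closing vmladduhm realizes "a - t*q" as "a + (-t)*q".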
+ vmulosh 6, 8, V20159 + vmulesh 5, 8, V20159 + vmulosh 11, 12, V20159 + vmulesh 10, 12, V20159 + vmulosh 15, 16, V20159 + vmulesh 14, 16, V20159 + vmulosh 19, 20, V20159 + vmulesh 18, 20, V20159 + xxmrglw 32+4, 32+5, 32+6 + xxmrghw 32+5, 32+5, 32+6 + xxmrglw 32+9, 32+10, 32+11 + xxmrghw 32+10, 32+10, 32+11 + xxmrglw 32+13, 32+14, 32+15 + xxmrghw 32+14, 32+14, 32+15 + xxmrglw 32+17, 32+18, 32+19 + xxmrghw 32+18, 32+18, 32+19 + vadduwm 4, 4, V_25 + vadduwm 5, 5, V_25 + vadduwm 9, 9, V_25 + vadduwm 10, 10, V_25 + vadduwm 13, 13, V_25 + vadduwm 14, 14, V_25 + vadduwm 17, 17, V_25 + vadduwm 18, 18, V_25 + # Right shift and pack lower halfword, + # results bond to 2^16 in abs value + vsraw 4, 4, V_26 + vsraw 5, 5, V_26 + vsraw 9, 9, V_26 + vsraw 10, 10, V_26 + vsraw 13, 13, V_26 + vsraw 14, 14, V_26 + vsraw 17, 17, V_26 + vsraw 18, 18, V_26 + vpkuwum 4, 5, 4 + vsubuhm 4, 7, 4 + vpkuwum 9, 10, 9 + vsubuhm 9, 7, 9 + vpkuwum 13, 14, 13 + vsubuhm 13, 7, 13 + vpkuwum 17, 18, 17 + vsubuhm 17, 7, 17 + # Modulo multify-Low unsigned halfword; + # results bond to 2^16 * q in abs value. + vmladduhm \_v0, 4, V_MKQ, 8 + vmladduhm \_v1, 9, V_MKQ, 12 + vmladduhm \_v2, 13, V_MKQ, 16 + vmladduhm \_v3, 17, V_MKQ, 20 +.endm + +#----------------------------------- +# MREDUCE_4X(len, start, _vz0, _vz1, _vz2, _vz3) +# +.macro MREDUCE_4X _vz0 _vz1 _vz2 _vz3 _vo0 _vo1 _vo2 _vo3 + # Modular multification bond by 2^16 * q in abs value + vmladduhm 15, 25, \_vz0, 3 + vmladduhm 20, 26, \_vz1, 3 + vmladduhm 27, 30, \_vz2, 3 + vmladduhm 28, 31, \_vz3, 3 + + # Signed multiply-high-round; outputs are bound by 2^15 * q in abs value + vmhraddshs 14, 25, \_vz0, 3 + vmhraddshs 19, 26, \_vz1, 3 + vmhraddshs 24, 30, \_vz2, 3 + vmhraddshs 29, 31, \_vz3, 3 + + vmladduhm 15, 15, V_QINV, 3 + vmladduhm 20, 20, V_QINV, 3 + vmladduhm 25, 27, V_QINV, 3 + vmladduhm 30, 28, V_QINV, 3 + + vmhraddshs 15, 15, V_NMKQ, 14 + vmhraddshs 20, 20, V_NMKQ, 19 + vmhraddshs 25, 25, V_NMKQ, 24 + vmhraddshs 30, 30, V_NMKQ, 29 + + vsrah \_vo0, 15, 4 # >> 1 + vsrah \_vo1, 20, 4 # >> 1 + vsrah \_vo2, 25, 4 # >> 1 + vsrah \_vo3, 30, 4 # >> 1 +.endm + +.macro Set_mont_consts + xxlor 32+5, 0, 0 # V_NMKQ + xxlor 32+2, 2, 2 # V_QINV + xxlor 32+3, 3, 3 # 0 + xxlor 32+4, 4, 4 # 1 +.endm + +.macro Load_next_4zetas + lxv 32+V_Z0, 0(14) + lxv 32+V_Z1, 16(14) + lxv 32+V_Z2, 32(14) + lxv 32+V_Z3, 48(14) + addi 14, 14, 64 +.endm + +.macro Write_B4C _vs0 _vs1 _vs2 _vs3 + stxvx \_vs0, 3, 9 + stxvx \_vs1, 3, 16 + stxvx \_vs2, 3, 18 + stxvx \_vs3, 3, 20 +.endm + +.macro Write_M4C _vs0 _vs1 _vs2 _vs3 + stxvx \_vs0, 3, 10 + stxvx \_vs1, 3, 17 + stxvx \_vs2, 3, 19 + stxvx \_vs3, 3, 21 +.endm + +.macro Reload_4coeffs + lxv 32+25, 0(3) + lxv 32+26, 16(3) + lxv 32+30, 32(3) + lxv 32+31, 48(3) + addi 3, 3, 64 +.endm + +.macro MWrite_8X _vs0 _vs1 _vs2 _vs3 _vs4 _vs5 _vs6 _vs7 + stxv \_vs0, -128(3) + stxv \_vs1, -112(3) + stxv \_vs2, -96(3) + stxv \_vs3, -80(3) + stxv \_vs4, -64(3) + stxv \_vs5, -48(3) + stxv \_vs6, -32(3) + stxv \_vs7, -16(3) +.endm + +.macro Write_Len2_4C _vs0 _vs1 _vs2 _vs3 + xxmrglw 32+12, \_vs0, 10 + xxmrghw 32+11, \_vs0, 10 + xxpermdi 10, 32+12, 32+11, 3 + xxmrglw 32+16, \_vs1, 11 + xxmrghw 32+15, \_vs1, 11 + xxpermdi 11, 32+16, 32+15, 3 + xxmrglw 32+12, \_vs2, 12 + xxmrghw 32+11, \_vs2, 12 + xxpermdi 12, 32+12, 32+11, 3 + xxmrglw 32+16, \_vs3, 13 + xxmrghw 32+15, \_vs3, 13 + xxpermdi 13, 32+16, 32+15, 3 + stxvd2x 10, 3, 9 + stxvd2x 11, 3, 16 + stxvd2x 12, 3, 18 + stxvd2x 13, 3, 20 +.endm + +.macro Write_Len4_4C _vs0 _vs1 _vs2 _vs3 + xxpermdi 10, 10, \_vs0, 3 + 
xxpermdi 11, 11, \_vs1, 3 + xxpermdi 12, 12, \_vs2, 3 + xxpermdi 13, 13, \_vs3, 3 + stxvd2x 10, 3, 9 + stxvd2x 11, 3, 16 + stxvd2x 12, 3, 18 + stxvd2x 13, 3, 20 +.endm + +# intt +# t = r[j]; +# r[j] = barrett_reduce(t + r[j + len]); +# r[j + len] = r[j + len] - t; +# r[j + len] = fqmul(zeta, r[j + len]); + +# +# mlk_intt_ppc(r) +# +.global MLK_ASM_NAMESPACE(intt_ppc) +.align 4 +MLK_ASM_FN_SYMBOL(intt_ppc) + + stdu 1, -352(1) + mflr 0 + std 14, 56(1) + std 15, 64(1) + std 16, 72(1) + std 17, 80(1) + std 18, 88(1) + std 19, 96(1) + std 20, 104(1) + std 21, 112(1) + stxv 32+20, 128(1) + stxv 32+21, 144(1) + stxv 32+22, 160(1) + stxv 32+23, 176(1) + stxv 32+24, 192(1) + stxv 32+25, 208(1) + stxv 32+26, 224(1) + stxv 32+27, 240(1) + stxv 32+28, 256(1) + stxv 32+29, 272(1) + stxv 32+30, 288(1) + stxv 32+31, 304(1) + + # init vectors and constants + # Setup for Montgomery reduce + lxv 0, 0(4) + + lxv 32+V_QINV, QINV_OFFSET(4) # QINV + xxlxor 32+3, 32+3, 32+3 + vspltish 4, 1 + xxlor 2, 32+2, 32+2 # QINV + xxlor 3, 32+3, 32+3 # 0 + xxlor 4, 32+4, 32+4 # 1 + + # Setup for Barrett reduce + lxv 6, Q_OFFSET(4) # V_MKQ + lxv 32+V20159, C20159_OFFSET(4) # V20159 + lxv 7, 0(4) # V_25 + + #xxspltiw 8, 26 # for power9 and above + vspltisw 8, 13 + vadduwm 8, 8, 8 + xxlor 8, 32+8, 32+8 # V_26 store at vs8 + + vspltisw 9, 1 + vsubuwm 10, 8, 9 # 25 + vslw 9, 9, 10 + xxlor 7, 32+9, 32+9 # V_25 syore at vs7 + +.align 4 +#__Len2: + # + # 1. len = 2, start = 0, 4, 8, 12,...244, 248, 252 + # Update zetas vectors, each vector has 2 zetas + addi 14, 4, IZETA_NTT_OFFSET127 + li 7, 4 + li 15, 4 + mtctr 15 + li 5, 0 +intt_ppc__Loop2: + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + xxlor 10, 32+4, 32+4 + xxlor 11, 32+9, 32+9 + xxlor 12, 32+13, 32+13 + xxlor 13, 32+17, 32+17 + Set_mont_consts + Load_next_4zetas + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 + Write_Len2_4C 32+13, 32+18, 32+23, 32+28 + + addi 5, 5, 64 + + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + xxlor 10, 32+4, 32+4 + xxlor 11, 32+9, 32+9 + xxlor 12, 32+13, 32+13 + xxlor 13, 32+17, 32+17 + Set_mont_consts + Load_next_4zetas + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 + Write_Len2_4C 32+13, 32+18, 32+23, 32+28 + addi 5, 5, 64 + bdnz intt_ppc__Loop2 + +.align 4 +#__Len4: + # + # 2. len = 4, start = 0, 8, 16, 24,...232, 240, 248 + addi 14, 4, IZETA_NTT_OFFSET63 + li 5, 0 + li 7, 8 + li 15, 4 # loops + mtctr 15 +intt_ppc__Loop4: + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + xxlor 10, 32+4, 32+4 + xxlor 11, 32+9, 32+9 + xxlor 12, 32+13, 32+13 + xxlor 13, 32+17, 32+17 + Set_mont_consts + Load_next_4zetas + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 + Write_Len4_4C 32+13, 32+18, 32+23, 32+28 + addi 5, 5, 64 + + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + xxlor 10, 32+4, 32+4 + xxlor 11, 32+9, 32+9 + xxlor 12, 32+13, 32+13 + xxlor 13, 32+17, 32+17 + Set_mont_consts + Load_next_4zetas + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 + Write_Len4_4C 32+13, 32+18, 32+23, 32+28 + addi 5, 5, 64 + bdnz intt_ppc__Loop4 + +.align 4 +#__Len8: + # 3. 
len = 8, start = 0, 16, 32, 48,...208, 224, 240 + #addi 14, 14, 512 + li 7, 16 + li 5, 0 + + Load_4Coeffs 5, 32, 32 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + Load_next_4zetas + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + li 5, 128 + + Load_4Coeffs 5, 32, 32 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + Load_next_4zetas + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + li 5, 256 + + Load_4Coeffs 5, 32, 32 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + Load_next_4zetas + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + li 5, 384 + + Load_4Coeffs 5, 32, 32 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + Load_next_4zetas + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + +.align 4 +#__Len16: + # + # 4. len = 16, start = 0, 32, 64,,...160, 192, 224 + #addi 14, 14, 768 + li 5, 0 + li 7, 32 + + Load_4Coeffs 5, 64, 64 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + Load_next_4zetas + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + li 5, 16 + Load_4Coeffs 5, 64, 64 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + addi 14, 14, -64 + Load_next_4zetas + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + + li 5, 256 + Load_4Coeffs 5, 64, 64 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + Load_next_4zetas + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + + li 5, 272 + Load_4Coeffs 5, 64, 64 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + addi 14, 14, -64 + Load_next_4zetas + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + +.align 4 +#__Len32: + # + # 5. len = 32, start = 0, 64, 128, 192 + #addi 14, 14, 896 + li 5, 0 + li 7, 64 + + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + li 5, 128 + + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + li 5, 256 + + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + li 5, 384 + + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + +.align 4 +#__Len64: + # + # 6. 
len = 64, start = 0, 128 + #addi 14, 14, 960 + li 5, 0 + li 7, 128 + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + li 5, 64 + + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + lxv 32+10, -16(14) + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + li 5, 256 + + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + li 5, 320 + + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + lxv 32+10, -16(14) + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + +.align 4 +#__Len128: + # 7. len = 128, start = 0 + # + #addi 14, 14, 992 + li 5, 0 # start + li 7, 256 # len * 2 + + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + lvx V_ZETA, 0, 14 + xxlor 9, 32+10, 32+10 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + li 5, 64 + + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + xxlor 32+10, 9, 9 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + li 5, 128 + + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + xxlor 32+10, 9, 9 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + li 5, 192 + + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + xxlor 32+10, 9, 9 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + +.align 4 + # + # Montgomery reduce loops with constant 1441 + # + addi 14, 4, C1441_OFFSET + lvx V1441, 0, 14 + + Reload_4coeffs + MREDUCE_4X V1441, V1441, V1441, V1441, 6, 7, 8, 9 + Reload_4coeffs + MREDUCE_4X V1441, V1441, V1441, V1441, 13, 18, 23, 28 + MWrite_8X 32+6, 32+7, 32+8, 32+9, 32+13, 32+18, 32+23, 32+28 + + Reload_4coeffs + MREDUCE_4X V1441, V1441, V1441, V1441, 6, 7, 8, 9 + Reload_4coeffs + MREDUCE_4X V1441, V1441, V1441, V1441, 13, 18, 23, 28 + MWrite_8X 32+6, 32+7, 32+8, 32+9, 32+13, 32+18, 32+23, 32+28 + + Reload_4coeffs + MREDUCE_4X V1441, V1441, V1441, V1441, 6, 7, 8, 9 + Reload_4coeffs + MREDUCE_4X V1441, V1441, V1441, V1441, 13, 18, 23, 28 + MWrite_8X 32+6, 32+7, 32+8, 32+9, 32+13, 32+18, 32+23, 32+28 + + Reload_4coeffs + MREDUCE_4X V1441, V1441, V1441, V1441, 6, 7, 8, 9 + Reload_4coeffs + MREDUCE_4X V1441, V1441, V1441, V1441, 13, 18, 23, 28 + MWrite_8X 32+6, 32+7, 32+8, 32+9, 32+13, 32+18, 32+23, 32+28 + + lxv 32+20, 128(1) + lxv 32+21, 144(1) + lxv 32+22, 160(1) + lxv 32+23, 176(1) + lxv 32+24, 192(1) + lxv 32+25, 208(1) + lxv 32+26, 224(1) + lxv 32+27, 240(1) + lxv 32+28, 256(1) + lxv 32+29, 272(1) + lxv 32+30, 288(1) + lxv 32+31, 304(1) + ld 14, 56(1) + ld 15, 64(1) + ld 16, 72(1) + ld 16, 72(1) + ld 17, 80(1) + ld 18, 88(1) + ld 19, 96(1) + ld 20, 104(1) + ld 21, 112(1) + + mtlr 0 + addi 1, 1, 352 + blr + +/* To facilitate single-compilation-unit (SCU) builds, undefine all 
macros. + * Don't modify by hand -- this is auto-generated by scripts/autogen. */ +#undef V20159 +#undef V_25 +#undef V_26 +#undef V_MKQ +#undef V_QINV +#undef V_NMKQ +#undef V_Z0 +#undef V_Z1 +#undef V_Z2 +#undef V_Z3 +#undef V_ZETA +#undef V1441 + +/* simpasm: footer-start */ +#endif /* MLK_ARITH_BACKEND_PPC64LE_DEFAULT && \ + !MLK_CONFIG_MULTILEVEL_NO_SHARED */ diff --git a/dev/ppc64le/src/ntt_ppc.S b/dev/ppc64le/src/ntt_ppc.S new file mode 100644 index 000000000..5bc1c34b8 --- /dev/null +++ b/dev/ppc64le/src/ntt_ppc.S @@ -0,0 +1,408 @@ +/* + * Copyright (c) The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +# +# Copyright 2025- IBM Corp. +# +#=================================================================================== +# Written by Danny Tsen +# + +#include "../../../common.h" +#if defined(MLK_ARITH_BACKEND_PPC64LE_DEFAULT) && \ + !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED) +/* simpasm: header-end */ + +#include "consts.h" + +#define V_QINV 2 +#define V_NMKQ 5 +#define V_Z0 7 +#define V_Z1 8 +#define V_Z2 9 +#define V_Z3 10 +#define V_ZETA 10 + +.machine "any" +.text + +# +# montgomery_reduce +# t = a * QINV +# t = (a - (int32_t)t*_MLKEM_Q) >> 16 +# +#----------------------------------- +# MREDUCE_4X(start, _vz0, _vz1, _vz2, _vz3) +# +.macro MREDUCE_4X start next step _vz0 _vz1 _vz2 _vz3 + mr 9, \start + add 10, 7, 9 # J + len*2 + addi 16, 9, \next + addi 17, 10, \step + addi 18, 16, \next + addi 19, 17, \step + addi 20, 18, \next + addi 21, 19, \step + lxvd2x 32+13, 3, 10 # r[j+len] + lxvd2x 32+18, 3, 17 # r[j+len] + lxvd2x 32+23, 3, 19 # r[j+len] + lxvd2x 32+28, 3, 21 # r[j+len] + xxpermdi 32+13, 32+13, 32+13, 2 + xxpermdi 32+18, 32+18, 32+18, 2 + xxpermdi 32+23, 32+23, 32+23, 2 + xxpermdi 32+28, 32+28, 32+28, 2 + + # fqmul = zeta * coefficient + # Modular multification bond by 2^16 * q in abs value + vmladduhm 15, 13, \_vz0, 3 + vmladduhm 20, 18, \_vz1, 3 + vmladduhm 25, 23, \_vz2, 3 + vmladduhm 30, 28, \_vz3, 3 + + # Signed multiply-high-round; outputs are bound by 2^15 * q in abs value + vmhraddshs 14, 13, \_vz0, 3 + vmhraddshs 19, 18, \_vz1, 3 + vmhraddshs 24, 23, \_vz2, 3 + vmhraddshs 29, 28, \_vz3, 3 + + vmladduhm 15, 15, V_QINV, 3 + vmladduhm 20, 20, V_QINV, 3 + vmladduhm 25, 25, V_QINV, 3 + vmladduhm 30, 30, V_QINV, 3 + + vmhraddshs 15, 15, V_NMKQ, 14 + vmhraddshs 20, 20, V_NMKQ, 19 + vmhraddshs 25, 25, V_NMKQ, 24 + vmhraddshs 30, 30, V_NMKQ, 29 + + vsrah 13, 15, 4 # >> 1 + vsrah 18, 20, 4 # >> 1 + vsrah 23, 25, 4 # >> 1 + vsrah 28, 30, 4 # >> 1 + + lxvd2x 32+12, 3, 9 # r[j] + lxvd2x 32+17, 3, 16 # r[j] + lxvd2x 32+22, 3, 18 # r[j] + lxvd2x 32+27, 3, 20 # r[j] + xxpermdi 32+12, 32+12, 32+12, 2 + xxpermdi 32+17, 32+17, 32+17, 2 + xxpermdi 32+22, 32+22, 32+22, 2 + xxpermdi 32+27, 32+27, 32+27, 2 + + # Since the result of the Montgomery multiplication is bounded + # by q in absolute value. 
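+  # Scalar view of this Cooley-Tukey butterfly (as in the C reference NTT):
+  #   t = fqmul(zeta, r[j + len]);
+  #   r[j + len] = r[j] - t;
+  #   r[j]       = r[j] + t;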
+ # Finally to complete the final update of the results with add/sub + vsubuhm 16, 12, 13 # r - t + vadduhm 15, 13, 12 # r + t + vsubuhm 21, 17, 18 # r - t + vadduhm 20, 18, 17 # r + t + vsubuhm 26, 22, 23 # r - t + vadduhm 25, 23, 22 # r + t + vsubuhm 31, 27, 28 # r - t + vadduhm 30, 28, 27 # r + t +.endm + +.macro Write_One + stxvx 32+15, 3, 9 + stxvx 32+16, 3, 10 + stxvx 32+20, 3, 16 + stxvx 32+21, 3, 17 + stxvx 32+25, 3, 18 + stxvx 32+26, 3, 19 + stxvx 32+30, 3, 20 + stxvx 32+31, 3, 21 +.endm + +.macro Write_Two + xxpermdi 32+17, 32+16, 32+15, 3 + xxpermdi 32+22, 32+21, 32+20, 3 + xxpermdi 32+27, 32+26, 32+25, 3 + xxpermdi 32+29, 32+31, 32+30, 3 + + stxvx 32+17, 3, 9 + stxvx 32+22, 3, 16 + stxvx 32+27, 3, 18 + stxvx 32+29, 3, 20 +.endm + +.macro Write_Three + xxmrglw 32+14, 32+16, 32+15 + xxmrghw 32+13, 32+16, 32+15 + xxpermdi 32+17, 32+13, 32+14, 3 + xxmrglw 32+19, 32+21, 32+20 + xxmrghw 32+18, 32+21, 32+20 + xxpermdi 32+22, 32+18, 32+19, 3 + xxmrglw 32+14, 32+26, 32+25 + xxmrghw 32+13, 32+26, 32+25 + xxpermdi 32+27, 32+13, 32+14, 3 + xxmrglw 32+24, 32+31, 32+30 + xxmrghw 32+23, 32+31, 32+30 + xxpermdi 32+29, 32+23, 32+24, 3 + stxvx 32+17, 3, 9 + stxvx 32+22, 3, 16 + stxvx 32+27, 3, 18 + stxvx 32+29, 3, 20 +.endm + +.macro Load_next_4zetas + lxv 32+V_Z0, 0(14) + lxv 32+V_Z1, 16(14) + lxv 32+V_Z2, 32(14) + lxv 32+V_Z3, 48(14) + addi 14, 14, 64 +.endm + +# +# mlk_ntt_ppc(int16_t *r) +# +.global MLK_ASM_NAMESPACE(ntt_ppc) +.align 4 +MLK_ASM_FN_SYMBOL(ntt_ppc) + + stdu 1, -352(1) + mflr 0 + std 14, 56(1) + std 15, 64(1) + std 16, 72(1) + std 17, 80(1) + std 18, 88(1) + std 19, 96(1) + std 20, 104(1) + std 21, 112(1) + stxv 32+20, 128(1) + stxv 32+21, 144(1) + stxv 32+22, 160(1) + stxv 32+23, 176(1) + stxv 32+24, 192(1) + stxv 32+25, 208(1) + stxv 32+26, 224(1) + stxv 32+27, 240(1) + stxv 32+28, 256(1) + stxv 32+29, 272(1) + stxv 32+30, 288(1) + stxv 32+31, 304(1) + + # get MLKEM_Q + lvx V_NMKQ,0,4 + + # zetas array + addi 14, 4, ZETA_NTT_OFFSET + + vxor 3, 3, 3 + vspltish 4, 1 + + lxv 32+V_QINV, QINV_OFFSET(4) + +.align 4 +#__Len128: + # + # Compute coefficients of the NTT based on the following loop. + # for (len = 128; len ≥ 2; len = len/2) + # + # 1. len = 128, start = 0 + # + li 5, 0 # start + li 7, 256 # len * 2 + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + + MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + Write_One + li 5, 64 + MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + Write_One + li 5, 128 + MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + Write_One + li 5, 192 + MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + Write_One + +.align 4 +#__Len64: + # + # 2. len = 64, start = 0, 128 + # k += 2 + li 5, 0 + li 7, 128 + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + Write_One + li 5, 64 + MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + Write_One + li 5, 256 + + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + Write_One + li 5, 320 + MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + Write_One + +.align 4 +#__Len32: + # + # 3. 
len = 32, start = 0, 64, 128, 192 + # k += 4 + li 5, 0 + li 7, 64 + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + Write_One + #li 5, 64 + li 5, 128 + + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + Write_One + #li 5, 128 + li 5, 256 + + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + Write_One + #li 5, 192 + li 5, 384 + + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + Write_One + +.align 4 +#__Len16: + # + # 4. len = 16, start = 0, 32, 64,,...160, 192, 224 + # k += 8 + li 5, 0 + li 7, 32 + Load_next_4zetas + MREDUCE_4X 5, 64, 64, V_Z0, V_Z1, V_Z2, V_Z3 + Write_One + li 5, 16 + MREDUCE_4X 5, 64, 64, V_Z0, V_Z1, V_Z2, V_Z3 + Write_One + + Load_next_4zetas + li 5, 256 + MREDUCE_4X 5, 64, 64, V_Z0, V_Z1, V_Z2, V_Z3 + Write_One + li 5, 272 + MREDUCE_4X 5, 64, 64, V_Z0, V_Z1, V_Z2, V_Z3 + Write_One + +.align 4 +#__Len8: + # + # 5. len = 8, start = 0, 16, 32, 48,...208, 224, 240 + # k += 16 + li 5, 0 + li 7, 16 + Load_next_4zetas + MREDUCE_4X 5, 32, 32, V_Z0, V_Z1, V_Z2, V_Z3 + Write_One + li 5, 128 + + Load_next_4zetas + MREDUCE_4X 5, 32, 32, V_Z0, V_Z1, V_Z2, V_Z3 + Write_One + li 5, 256 + + Load_next_4zetas + MREDUCE_4X 5, 32, 32, V_Z0, V_Z1, V_Z2, V_Z3 + Write_One + li 5, 384 + + Load_next_4zetas + MREDUCE_4X 5, 32, 32, V_Z0, V_Z1, V_Z2, V_Z3 + Write_One + + # + # 6. len = 4, start = 0, 8, 16, 24,...232, 240, 248 + # k += 32 + li 15, 4 # loops + mtctr 15 + li 5, 0 + li 7, 8 +.align 4 +ntt_ppc__Len4: + Load_next_4zetas + MREDUCE_4X 5, 16, 16, V_Z0, V_Z1, V_Z2, V_Z3 + Write_Two + addi 5, 5, 64 + + Load_next_4zetas + MREDUCE_4X 5, 16, 16, V_Z0, V_Z1, V_Z2, V_Z3 + Write_Two + addi 5, 5, 64 + + bdnz ntt_ppc__Len4 + + # + # 7. len = 2, start = 0, 4, 8, 12,...244, 248, 252 + # k += 64 + # Update zetas vectors, each vector has 2 zetas + + addi 14, 4, ZETA_NTT_OFFSET64 + + li 15, 4 + mtctr 15 + li 5, 0 + li 7, 4 +.align 4 +ntt_ppc__Len2: + Load_next_4zetas + MREDUCE_4X 5, 16, 16, V_Z0, V_Z1, V_Z2, V_Z3 + Write_Three + addi 5, 5, 64 + + Load_next_4zetas + MREDUCE_4X 5, 16, 16, V_Z0, V_Z1, V_Z2, V_Z3 + Write_Three + addi 5, 5, 64 + + bdnz ntt_ppc__Len2 + + lxv 32+20, 128(1) + lxv 32+21, 144(1) + lxv 32+22, 160(1) + lxv 32+23, 176(1) + lxv 32+24, 192(1) + lxv 32+25, 208(1) + lxv 32+26, 224(1) + lxv 32+27, 240(1) + lxv 32+28, 256(1) + lxv 32+29, 272(1) + lxv 32+30, 288(1) + lxv 32+31, 304(1) + ld 14, 56(1) + ld 15, 64(1) + ld 16, 72(1) + ld 16, 72(1) + ld 17, 80(1) + ld 18, 88(1) + ld 19, 96(1) + ld 20, 104(1) + ld 21, 112(1) + + mtlr 0 + addi 1, 1, 352 + blr + +/* To facilitate single-compilation-unit (SCU) builds, undefine all macros. + * Don't modify by hand -- this is auto-generated by scripts/autogen. */ +#undef V_QINV +#undef V_NMKQ +#undef V_ZETA + +/* simpasm: footer-start */ +#endif /* MLK_ARITH_BACKEND_PPC64LE_DEFAULT && \ + !MLK_CONFIG_MULTILEVEL_NO_SHARED */ diff --git a/dev/ppc64le/src/poly_tomont.S b/dev/ppc64le/src/poly_tomont.S new file mode 100644 index 000000000..b7b010aaf --- /dev/null +++ b/dev/ppc64le/src/poly_tomont.S @@ -0,0 +1,161 @@ +/* + * Copyright (c) The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +# +# Copyright 2025- IBM Corp. 
+# +#=================================================================================== +# Written by Danny Tsen +# + +# Poly_tomont: Inplace conversion of all coefficients of a polynomial +# from normal domain to Montgomery domain +# +# Arguments:*r: pointer to input/output polynomial +# + +#include "../../../common.h" +#if defined(MLK_ARITH_BACKEND_PPC64LE_DEFAULT) && \ + !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED) +/* simpasm: header-end */ + +#include "consts.h" + +#define V1353 0 +#define V_QINV 2 +#define V_NMKQ 5 + +.machine "any" +.text + +# +# montgomery_reduce +# t = a * QINV +# t = (a - (int32_t)t*_MLKEM_Q) >> 16 +# +#----------------------------------- +# MREDUCE_4X(_v0, _v1, _v2, _v3) +# +.macro MREDUCE_4X _v0 _v1 _v2 _v3 + lxvd2x 32+13, 0, 3 + addi 3, 3, 16 + lxvd2x 32+18, 0, 3 + addi 3, 3, 16 + lxvd2x 32+23, 0, 3 + addi 3, 3, 16 + lxvd2x 32+7, 0, 3 + addi 3, 3, 16 + + vmladduhm 15, 13, V1353, 3 + vmladduhm 20, 18, V1353, 3 + vmladduhm 25, 23, V1353, 3 + vmladduhm 9, 7, V1353, 3 + + vmhraddshs 14, 13, V1353, 3 + vmhraddshs 19, 18, V1353, 3 + vmhraddshs 24, 23, V1353, 3 + vmhraddshs 8, 7, V1353, 3 + + vmladduhm 15, 15, V_QINV, 3 + vmladduhm 20, 20, V_QINV, 3 + vmladduhm 25, 25, V_QINV, 3 + vmladduhm 9, 9, V_QINV, 3 + + vmhraddshs 15, 15, V_NMKQ, 14 + vmhraddshs 20, 20, V_NMKQ, 19 + vmhraddshs 25, 25, V_NMKQ, 24 + vmhraddshs 9, 9, V_NMKQ, 8 + + vsrah \_v0, 15, 4 # >> 1 + vsrah \_v1, 20, 4 # >> 1 + vsrah \_v2, 25, 4 # >> 1 + vsrah \_v3, 9, 4 # >> 1 +.endm + +.macro Write_8X + stxvd2x 32+27, 4, 3 + stxvd2x 32+28, 5, 3 + stxvd2x 32+29, 6, 3 + stxvd2x 32+30, 7, 3 + stxvd2x 32+13, 8, 3 + stxvd2x 32+18, 9, 3 + stxvd2x 32+23, 10, 3 + stxvd2x 32+7, 11, 3 +.endm + +.align 4 +.globl MLK_ASM_NAMESPACE(poly_tomont_ppc) +MLK_ASM_FN_SYMBOL(poly_tomont_ppc) + stdu 1, -320(1) + mflr 0 + + stxv 32+20, 128(1) + stxv 32+21, 144(1) + stxv 32+22, 160(1) + stxv 32+23, 176(1) + stxv 32+24, 192(1) + stxv 32+25, 208(1) + stxv 32+26, 224(1) + stxv 32+27, 240(1) + stxv 32+28, 256(1) + stxv 32+29, 272(1) + stxv 32+30, 288(1) + + lxv 32+V_NMKQ, NQ_OFFSET(4) + lxv 32+V_QINV, QINV_OFFSET(4) + lxv 32+V1353, C1353_OFFSET(4) + + vxor 3, 3, 3 + vspltish 4, 1 + + li 4, -128 + li 5, -112 + li 6, -96 + li 7, -80 + li 8, -64 + li 9, -48 + li 10, -32 + li 11, -16 + + MREDUCE_4X 27, 28, 29, 30 + MREDUCE_4X 13, 18, 23, 7 + Write_8X + + MREDUCE_4X 27, 28, 29, 30 + MREDUCE_4X 13, 18, 23, 7 + Write_8X + + MREDUCE_4X 27, 28, 29, 30 + MREDUCE_4X 13, 18, 23, 7 + Write_8X + + MREDUCE_4X 27, 28, 29, 30 + MREDUCE_4X 13, 18, 23, 7 + Write_8X + + lxv 32+20, 128(1) + lxv 32+21, 144(1) + lxv 32+22, 160(1) + lxv 32+23, 176(1) + lxv 32+24, 192(1) + lxv 32+25, 208(1) + lxv 32+26, 224(1) + lxv 32+27, 240(1) + lxv 32+28, 256(1) + lxv 32+29, 272(1) + lxv 32+30, 288(1) + mtlr 0 + addi 1, 1, 320 + blr + +/* To facilitate single-compilation-unit (SCU) builds, undefine all macros. + * Don't modify by hand -- this is auto-generated by scripts/autogen. */ +#undef V1353 +#undef V_QINV +#undef V_NMKQ + +/* simpasm: footer-start */ +#endif /* MLK_ARITH_BACKEND_PPC64LE_DEFAULT && \ + !MLK_CONFIG_MULTILEVEL_NO_SHARED */ diff --git a/dev/ppc64le/src/reduce.S b/dev/ppc64le/src/reduce.S new file mode 100644 index 000000000..dfb634392 --- /dev/null +++ b/dev/ppc64le/src/reduce.S @@ -0,0 +1,223 @@ +/* + * Copyright (c) The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +# +# Copyright 2025- IBM Corp. 
+# +#=================================================================================== +# Written by Danny Tsen +# + +# +# poly_reduce: Applies Barrett reduction to all coefficients of a polynomial +# for details of the Barrett reduction +# +# Arguments: *r: pointer to input/output polynomial +# + +#include "../../../common.h" +#if defined(MLK_ARITH_BACKEND_PPC64LE_DEFAULT) && \ + !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED) +/* simpasm: header-end */ + +#include "consts.h" + +# Barrett reduce constatnts +#define V20159 0 +#define V_25 1 +#define V_26 2 +#define V_MKQ 3 + +.machine "any" +.text + +.macro BREDUCE_4X _v0 _v1 _v2 _v3 + lxvd2x 32+8, 0, 3 + lxvd2x 32+12, 14, 3 + lxvd2x 32+16, 15, 3 + lxvd2x 32+20, 16, 3 + addi 3, 3, 64 + vmulosh 6, 8, V20159 + vmulesh 5, 8, V20159 + vmulosh 11, 12, V20159 + vmulesh 10, 12, V20159 + vmulosh 15, 16, V20159 + vmulesh 14, 16, V20159 + vmulosh 19, 20, V20159 + vmulesh 18, 20, V20159 + xxmrglw 32+4, 32+5, 32+6 + xxmrghw 32+5, 32+5, 32+6 + xxmrglw 32+9, 32+10, 32+11 + xxmrghw 32+10, 32+10, 32+11 + xxmrglw 32+13, 32+14, 32+15 + xxmrghw 32+14, 32+14, 32+15 + xxmrglw 32+17, 32+18, 32+19 + xxmrghw 32+18, 32+18, 32+19 + vadduwm 4, 4, V_25 + vadduwm 5, 5, V_25 + vadduwm 9, 9, V_25 + vadduwm 10, 10, V_25 + vadduwm 13, 13, V_25 + vadduwm 14, 14, V_25 + vadduwm 17, 17, V_25 + vadduwm 18, 18, V_25 + vsraw 4, 4, V_26 + vsraw 5, 5, V_26 + vsraw 9, 9, V_26 + vsraw 10, 10, V_26 + vsraw 13, 13, V_26 + vsraw 14, 14, V_26 + vsraw 17, 17, V_26 + vsraw 18, 18, V_26 + vpkuwum 4, 5, 4 + vsubuhm 4, 7, 4 + vpkuwum 9, 10, 9 + vsubuhm 9, 7, 9 + vpkuwum 13, 14, 13 + vsubuhm 13, 7, 13 + vpkuwum 17, 18, 17 + vsubuhm 17, 7, 17 + vmladduhm \_v0, 4, V_MKQ, 8 + vmladduhm \_v1, 9, V_MKQ, 12 + vmladduhm \_v2, 13, V_MKQ, 16 + vmladduhm \_v3, 17, V_MKQ, 20 +.endm + +.macro Write_8X + stxvd2x 32+21, 4, 3 + stxvd2x 32+22, 5, 3 + stxvd2x 32+23, 6, 3 + stxvd2x 32+24, 7, 3 + stxvd2x 32+4, 8, 3 + stxvd2x 32+9, 9, 3 + stxvd2x 32+13, 10, 3 + stxvd2x 32+17, 11, 3 +.endm + +# +# Conditional addition to get unsigned canonical representative +# +.macro To_unsigned_16 + lxv 32+12, 0(3) + lxv 32+13, 16(3) + lxv 32+14, 32(3) + lxv 32+15, 48(3) + addi 3, 3, 64 + vsrh 1, 12, 10 + vsrh 0, 13, 10 + vsrh 3, 14, 10 + vsrh 2, 15, 10 + vadduhm 7, 12, 11 + vadduhm 8, 13, 11 + vadduhm 5, 14, 11 + vadduhm 6, 15, 11 + vcmpequh 1, 1, 9 + vcmpequh 0, 0, 9 + vcmpequh 3, 3, 9 + vcmpequh 2, 2, 9 + xxsel 32+1, 32+7,32+12, 32+1 + xxsel 32+0, 32+8,32+13, 32+0 + xxsel 32+3, 32+5,32+14, 32+3 + xxsel 32+2, 32+6,32+15, 32+2 + stxv 32+3, -32(3) + stxv 32+2, -16(3) + stxv 32+1, -64(3) + stxv 32+0, -48(3) +.endm + +.align 4 +.globl MLK_ASM_NAMESPACE(reduce_ppc) +MLK_ASM_FN_SYMBOL(reduce_ppc) + stdu 1, -224(1) + mflr 0 + std 14, 96(1) + std 15, 104(1) + std 16, 112(1) + stxv 32+20, 128(1) + stxv 32+21, 144(1) + stxv 32+22, 160(1) + stxv 32+23, 176(1) + stxv 32+24, 192(1) + + vxor 7, 7, 7 + + lxv 32+V_MKQ, Q_OFFSET(4) + lxv 32+V20159, C20159_OFFSET(4) + + vspltisw V_26, 13 + vadduwm V_26, V_26, V_26 + vspltisw 4, 1 + vsubuwm 5, V_26, 4 + vslw V_25, 4, 5 + + li 4, -128 + li 5, -112 + li 6, -96 + li 7, -80 + li 8, -64 + li 9, -48 + li 10, -32 + li 11, -16 + + li 14, 16 + li 15, 32 + li 16, 48 + + BREDUCE_4X 21, 22, 23, 24 + BREDUCE_4X 4, 9, 13, 17 + Write_8X + + BREDUCE_4X 21, 22, 23, 24 + BREDUCE_4X 4, 9, 13, 17 + Write_8X + + BREDUCE_4X 21, 22, 23, 24 + BREDUCE_4X 4, 9, 13, 17 + Write_8X + + BREDUCE_4X 21, 22, 23, 24 + BREDUCE_4X 4, 9, 13, 17 + Write_8X + + # + # To unsigned canonical + # +.align 4 + addi 3, 3, -512 + xxspltib 
32+9 ,0 + vspltish 10, 15 + vmr 11, V_MKQ + + To_unsigned_16 + To_unsigned_16 + To_unsigned_16 + To_unsigned_16 + To_unsigned_16 + To_unsigned_16 + To_unsigned_16 + To_unsigned_16 + + ld 14, 96(1) + ld 15, 104(1) + ld 16, 112(1) + lxv 32+20, 128(1) + lxv 32+21, 144(1) + lxv 32+22, 160(1) + lxv 32+23, 176(1) + lxv 32+24, 192(1) + mtlr 0 + addi 1, 1, 224 + blr + +/* To facilitate single-compilation-unit (SCU) builds, undefine all macros. + * Don't modify by hand -- this is auto-generated by scripts/autogen. */ +#undef V20159 +#undef V_25 +#undef V_26 +#undef V_MKQ + +/* simpasm: footer-start */ +#endif /* MLK_ARITH_BACKEND_PPC64LE_DEFAULT && \ + !MLK_CONFIG_MULTILEVEL_NO_SHARED */ diff --git a/integration/liboqs/ML-KEM-1024_META.yml b/integration/liboqs/ML-KEM-1024_META.yml index 7d8e50d4c..9c7fe672a 100644 --- a/integration/liboqs/ML-KEM-1024_META.yml +++ b/integration/liboqs/ML-KEM-1024_META.yml @@ -89,3 +89,22 @@ implementations: - Darwin required_flags: - asimd +- name: ppc64le + version: FIPS203 + folder_name: . + compile_opts: -DMLK_CONFIG_PARAMETER_SET=1024 -DMLK_CONFIG_FILE="../../integration/liboqs/config_ppc64le.h" + signature_keypair: PQCP_MLKEM_NATIVE_MLKEM1024_PPC64LE_keypair + signature_keypair_derand: PQCP_MLKEM_NATIVE_MLKEM1024_PPC64LE_keypair_derand + signature_enc: PQCP_MLKEM_NATIVE_MLKEM1024_PPC64LE_enc + signature_dec: PQCP_MLKEM_NATIVE_MLKEM1024_PPC64LE_dec + sources: integration/liboqs/config_ppc64le.h integration/liboqs/fips202_glue.h integration/liboqs/fips202x4_glue.h + mlkem/src/cbmc.h mlkem/src/common.h mlkem/src/compress.c mlkem/src/compress.h + mlkem/src/debug.c mlkem/src/debug.h mlkem/src/indcpa.c mlkem/src/indcpa.h mlkem/src/kem.c + mlkem/src/kem.h mlkem/src/native/api.h mlkem/src/native/meta.h mlkem/src/params.h + mlkem/src/poly.c mlkem/src/poly.h mlkem/src/poly_k.c mlkem/src/poly_k.h mlkem/src/randombytes.h + mlkem/src/sampling.c mlkem/src/sampling.h mlkem/src/symmetric.h mlkem/src/sys.h + mlkem/src/verify.c mlkem/src/verify.h mlkem/src/zetas.inc mlkem/src/native/ppc64le + supported_platforms: + - architecture: ppc64le + operating_systems: + - Linux diff --git a/integration/liboqs/ML-KEM-512_META.yml b/integration/liboqs/ML-KEM-512_META.yml index aa88537d3..f46dbfdbf 100644 --- a/integration/liboqs/ML-KEM-512_META.yml +++ b/integration/liboqs/ML-KEM-512_META.yml @@ -89,3 +89,22 @@ implementations: - Darwin required_flags: - asimd +- name: ppc64le + version: FIPS203 + folder_name: . 
+ compile_opts: -DMLK_CONFIG_PARAMETER_SET=512 -DMLK_CONFIG_FILE="../../integration/liboqs/config_ppc64le.h" + signature_keypair: PQCP_MLKEM_NATIVE_MLKEM512_PPC64LE_keypair + signature_keypair_derand: PQCP_MLKEM_NATIVE_MLKEM512_PPC64LE_keypair_derand + signature_enc: PQCP_MLKEM_NATIVE_MLKEM512_PPC64LE_enc + signature_dec: PQCP_MLKEM_NATIVE_MLKEM512_PPC64LE_dec + sources: integration/liboqs/config_ppc64le.h integration/liboqs/fips202_glue.h integration/liboqs/fips202x4_glue.h + mlkem/src/cbmc.h mlkem/src/common.h mlkem/src/compress.c mlkem/src/compress.h + mlkem/src/debug.c mlkem/src/debug.h mlkem/src/indcpa.c mlkem/src/indcpa.h mlkem/src/kem.c + mlkem/src/kem.h mlkem/src/native/api.h mlkem/src/native/meta.h mlkem/src/params.h + mlkem/src/poly.c mlkem/src/poly.h mlkem/src/poly_k.c mlkem/src/poly_k.h mlkem/src/randombytes.h + mlkem/src/sampling.c mlkem/src/sampling.h mlkem/src/symmetric.h mlkem/src/sys.h + mlkem/src/verify.c mlkem/src/verify.h mlkem/src/zetas.inc mlkem/src/native/ppc64le + supported_platforms: + - architecture: ppc64le + operating_systems: + - Linux diff --git a/integration/liboqs/ML-KEM-768_META.yml b/integration/liboqs/ML-KEM-768_META.yml index 254d67478..1b01c4d42 100644 --- a/integration/liboqs/ML-KEM-768_META.yml +++ b/integration/liboqs/ML-KEM-768_META.yml @@ -89,3 +89,22 @@ implementations: - Darwin required_flags: - asimd +- name: ppc64le + version: FIPS203 + folder_name: . + compile_opts: -DMLK_CONFIG_PARAMETER_SET=768 -DMLK_CONFIG_FILE="....//integration/liboqs/config_ppc64le.h" + signature_keypair: PQCP_MLKEM_NATIVE_MLKEM768_PPC64LE_keypair + signature_keypair_derand: PQCP_MLKEM_NATIVE_MLKEM768_PPC64LE_keypair_derand + signature_enc: PQCP_MLKEM_NATIVE_MLKEM768_PPC64LE_enc + signature_dec: PQCP_MLKEM_NATIVE_MLKEM768_PPC64LE_dec + sources: integration/liboqs/config_ppc64le.h integration/liboqs/fips202_glue.h integration/liboqs/fips202x4_glue.h + mlkem/src/cbmc.h mlkem/src/common.h mlkem/src/compress.c mlkem/src/compress.h + mlkem/src/debug.c mlkem/src/debug.h mlkem/src/indcpa.c mlkem/src/indcpa.h mlkem/src/kem.c + mlkem/src/kem.h mlkem/src/native/api.h mlkem/src/native/meta.h mlkem/src/params.h + mlkem/src/poly.c mlkem/src/poly.h mlkem/src/poly_k.c mlkem/src/poly_k.h mlkem/src/randombytes.h + mlkem/src/sampling.c mlkem/src/sampling.h mlkem/src/symmetric.h mlkem/src/sys.h + mlkem/src/verify.c mlkem/src/verify.h mlkem/src/zetas.inc mlkem/src/native/ppc64le + supported_platforms: + - architecture: ppc64le + operating_systems: + - Linux diff --git a/integration/liboqs/config_ppc64le.h b/integration/liboqs/config_ppc64le.h new file mode 100644 index 000000000..2fa1cdbcf --- /dev/null +++ b/integration/liboqs/config_ppc64le.h @@ -0,0 +1,266 @@ +/* + * Copyright (c) The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +/* References + * ========== + * + * - [FIPS140_3_IG] + * Implementation Guidance for FIPS 140-3 and the Cryptographic Module + * Validation Program National Institute of Standards and Technology + * https://csrc.nist.gov/projects/cryptographic-module-validation-program/fips-140-3-ig-announcements + */ + +#ifndef MLK_INTEGRATION_LIBOQS_CONFIG_PPC64LE_H +#define MLK_INTEGRATION_LIBOQS_CONFIG_PPC64LE_H + +/****************************************************************************** + * Name: MLK_CONFIG_PARAMETER_SET + * + * Description: Specifies the parameter set for ML-KEM + * - MLK_CONFIG_PARAMETER_SET=512 corresponds to ML-KEM-512 + * - MLK_CONFIG_PARAMETER_SET=768 corresponds to ML-KEM-768 + * - 
MLK_CONFIG_PARAMETER_SET=1024 corresponds to ML-KEM-1024 + * + * This can also be set using CFLAGS. + * + *****************************************************************************/ +#ifndef MLK_CONFIG_PARAMETER_SET +#define MLK_CONFIG_PARAMETER_SET \ + 768 /* Change this for different security strengths */ +#endif + +/****************************************************************************** + * Name: MLK_CONFIG_NAMESPACE_PREFIX + * + * Description: The prefix to use to namespace global symbols from mlkem/. + * + * In a multi-level build (that is, if either + * - MLK_CONFIG_MULTILEVEL_WITH_SHARED, or + * - MLK_CONFIG_MULTILEVEL_NO_SHARED, + * are set, level-dependent symbols will additionally be prefixed + * with the parameter set (512/768/1024). + * + * This can also be set using CFLAGS. + * + *****************************************************************************/ +#if MLK_CONFIG_PARAMETER_SET == 512 +#define MLK_CONFIG_NAMESPACE_PREFIX PQCP_MLKEM_NATIVE_MLKEM512_PPC64LE +#elif MLK_CONFIG_PARAMETER_SET == 768 +#define MLK_CONFIG_NAMESPACE_PREFIX PQCP_MLKEM_NATIVE_MLKEM768_PPC64LE +#elif MLK_CONFIG_PARAMETER_SET == 1024 +#define MLK_CONFIG_NAMESPACE_PREFIX PQCP_MLKEM_NATIVE_MLKEM1024_PPC64LE +#endif + +/****************************************************************************** + * Name: MLK_CONFIG_USE_NATIVE_BACKEND_ARITH + * + * Description: Determines whether an native arithmetic backend should be used. + * + * The arithmetic backend covers performance critical functions + * such as the number-theoretic transform (NTT). + * + * If this option is unset, the C backend will be used. + * + * If this option is set, the arithmetic backend to be use is + * determined by MLK_CONFIG_ARITH_BACKEND_FILE: If the latter is + * unset, the default backend for your the target architecture + * will be used. If set, it must be the name of a backend metadata + * file. + * + * This can also be set using CFLAGS. + * + *****************************************************************************/ +#define MLK_CONFIG_USE_NATIVE_BACKEND_ARITH + +/****************************************************************************** + * Name: MLK_CONFIG_ARITH_BACKEND_FILE + * + * Description: The arithmetic backend to use. + * + * If MLK_CONFIG_USE_NATIVE_BACKEND_ARITH is unset, this option + * is ignored. + * + * If MLK_CONFIG_USE_NATIVE_BACKEND_ARITH is set, this option must + * either be undefined or the filename of an arithmetic backend. + * If unset, the default backend will be used. + * + * This can be set using CFLAGS. + * + *****************************************************************************/ +#define MLK_CONFIG_ARITH_BACKEND_FILE "native/meta.h" + +/****************************************************************************** + * Name: MLK_CONFIG_FIPS202_CUSTOM_HEADER + * + * Description: Custom header to use for FIPS-202 + * + * This should only be set if you intend to use a custom + * FIPS-202 implementation, different from the one shipped + * with mlkem-native. + * + * If set, it must be the name of a file serving as the + * replacement for mlkem/fips202/fips202.h, and exposing + * the same API (see FIPS202.md). 
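+ *
+ *              For this liboqs integration, a matching glue header
+ *              (integration/liboqs/fips202_glue.h) is listed in the META.yml
+ *              sources; uncommenting the define below selects it.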
+ * + *****************************************************************************/ +/* +#define MLK_CONFIG_FIPS202_CUSTOM_HEADER \ + "../../integration/liboqs/fips202_glue.h" +*/ + +/****************************************************************************** + * Name: MLK_CONFIG_FIPS202X4_CUSTOM_HEADER + * + * Description: Custom header to use for FIPS-202-X4 + * + * This should only be set if you intend to use a custom + * FIPS-202 implementation, different from the one shipped + * with mlkem-native. + * + * If set, it must be the name of a file serving as the + * replacement for mlkem/fips202/fips202x4.h, and exposing + * the same API (see FIPS202.md). + * + *****************************************************************************/ +/* +#define MLK_CONFIG_FIPS202X4_CUSTOM_HEADER \ + "../../integration/liboqs/fips202x4_glue.h" +*/ + +/****************************************************************************** + * Name: MLK_CONFIG_CUSTOM_ZEROIZE + * + * Description: In compliance with FIPS 203 Section 3.3, mlkem-native zeroizes + * intermediate stack buffers before returning from function calls. + * + * Set this option and define `mlk_zeroize` if you want to + * use a custom method to zeroize intermediate stack buffers. + * The default implementation uses SecureZeroMemory on Windows + * and a memset + compiler barrier otherwise. If neither of those + * is available on the target platform, compilation will fail, + * and you will need to use MLK_CONFIG_CUSTOM_ZEROIZE to provide + * a custom implementation of `mlk_zeroize()`. + * + * WARNING: + * The explicit stack zeroization conducted by mlkem-native + * reduces the likelihood of data leaking on the stack, but + * does not eliminate it! The C standard makes no guarantee about + * where a compiler allocates structures and whether/where it makes + * copies of them. Also, in addition to entire structures, there + * may also be potentially exploitable leakage of individual values + * on the stack. + * + * If you need bullet-proof zeroization of the stack, you need to + * consider additional measures instead of of what this feature + * provides. In this case, you can set mlk_zeroize to a no-op. + * + *****************************************************************************/ +/* #define MLK_CONFIG_CUSTOM_ZEROIZE + #if !defined(__ASSEMBLER__) + #include + #include "sys.h" + static MLK_INLINE void mlk_zeroize(void *ptr, size_t len) + { + ... your implementation ... + } + #endif +*/ + +/****************************************************************************** + * Name: MLK_CONFIG_CUSTOM_RANDOMBYTES + * + * Description: mlkem-native does not provide a secure randombytes + * implementation. Such an implementation has to provided by the + * consumer. + * + * If this option is not set, mlkem-native expects a function + * void randombytes(uint8_t *out, size_t outlen). + * + * Set this option and define `mlk_randombytes` if you want to + * use a custom method to sample randombytes with a different name + * or signature. 
+ * + *****************************************************************************/ +#define MLK_CONFIG_CUSTOM_RANDOMBYTES +#if !defined(__ASSEMBLER__) +#include +#include +#include "../../mlkem/src/sys.h" +static MLK_INLINE void mlk_randombytes(uint8_t *ptr, size_t len) +{ + OQS_randombytes(ptr, len); +} +#endif /* !__ASSEMBLER__ */ + +/****************************************************************************** + * Name: MLK_CONFIG_NO_ASM + * + * Description: If this option is set, mlkem-native will be built without + * use of native code or inline assembly. + * + * By default, inline assembly is used to implement value barriers. + * Without inline assembly, mlkem-native will use a global volatile + * 'opt blocker' instead; see verify.h. + * + * Inline assembly is also used to implement a secure zeroization + * function on non-Windows platforms. If this option is set and + * the target platform is not Windows, you MUST set + * MLK_CONFIG_CUSTOM_ZEROIZE and provide a custom zeroization + * function. + * + * If this option is set, MLK_CONFIG_USE_NATIVE_BACKEND_FIPS202 and + * and MLK_CONFIG_USE_NATIVE_BACKEND_ARITH will be ignored, and no + *native backends will be used. + * + *****************************************************************************/ +/* #define MLK_CONFIG_NO_ASM */ + +/****************************************************************************** + * Name: MLK_CONFIG_KEYGEN_PCT + * + * Description: Compliance with @[FIPS140_3_IG, p.87] requires a + * Pairwise Consistency Test (PCT) to be carried out on a freshly + * generated keypair before it can be exported. + * + * Set this option if such a check should be implemented. + * In this case, crypto_kem_keypair_derand and crypto_kem_keypair + * will return a non-zero error code if the PCT failed. + * + * NOTE: This feature will drastically lower the performance of + * key generation. + * + *****************************************************************************/ +/* #define MLK_CONFIG_KEYGEN_PCT */ + +/****************************************************************************** + * Name: MLK_CONFIG_KEYGEN_PCT_BREAKAGE_TEST + * + * Description: If this option is set, the user must provide a runtime + * function `static inline int mlk_break_pct() { ... }` to + * indicate whether the PCT should be made fail. + * + * This option only has an effect if MLK_CONFIG_KEYGEN_PCT is set. + * + *****************************************************************************/ +/* #define MLK_CONFIG_KEYGEN_PCT_BREAKAGE_TEST + #if !defined(__ASSEMBLER__) + #include "sys.h" + static MLK_INLINE int mlk_break_pct(void) + { + ... return 0/1 depending on whether PCT should be broken ... + } + #endif +*/ + +/* Enable valgrind-based assertions in mlkem-native through macro + * from libOQS. 
*/ +#if !defined(__ASSEMBLER__) +#include +#if defined(OQS_ENABLE_TEST_CONSTANT_TIME) +#define MLK_CONFIG_CT_TESTING_ENABLED +#endif +#endif /* !__ASSEMBLER__ */ + +#endif /* !MLK_INTEGRATION_LIBOQS_CONFIG_PPC64LE_H */ diff --git a/mlkem/mlkem_native.S b/mlkem/mlkem_native.S index bff040079..6f2a8b221 100644 --- a/mlkem/mlkem_native.S +++ b/mlkem/mlkem_native.S @@ -458,6 +458,33 @@ #undef MLK_NTT_BOUND /* mlkem/src/native/meta.h */ #undef MLK_NATIVE_META_H +/* mlkem/src/native/ppc64le/meta.h */ +#undef MLK_ARITH_BACKEND_NAME +#undef MLK_ARITH_BACKEND_PPC64LE_DEFAULT +#undef MLK_NATIVE_PPC64LE_META_H +#undef MLK_USE_NATIVE_INTT +#undef MLK_USE_NATIVE_NTT +#undef MLK_USE_NATIVE_POLY_REDUCE +#undef MLK_USE_NATIVE_POLY_TOMONT +/* mlkem/src/native/ppc64le/src/arith_native_ppc64le.h */ +#undef MLK_NATIVE_PPC64LE_SRC_ARITH_NATIVE_PPC64LE_H +#undef mlk_intt_ppc +#undef mlk_ntt_ppc +#undef mlk_poly_tomont_ppc +#undef mlk_reduce_ppc +/* mlkem/src/native/ppc64le/src/consts.h */ +#undef C1353_OFFSET +#undef C1441_OFFSET +#undef C20159_OFFSET +#undef IZETA_NTT_OFFSET127 +#undef IZETA_NTT_OFFSET63 +#undef MLK_NATIVE_PPC64LE_SRC_CONSTS_H +#undef NQ_OFFSET +#undef QINV_OFFSET +#undef Q_OFFSET +#undef ZETA_NTT_OFFSET +#undef ZETA_NTT_OFFSET64 +#undef mlk_ppc_qdata #if defined(MLK_SYS_AARCH64) /* * Undefine macros from native code (Arith, AArch64) diff --git a/mlkem/mlkem_native.c b/mlkem/mlkem_native.c index 74c1f9387..74903ed1d 100644 --- a/mlkem/mlkem_native.c +++ b/mlkem/mlkem_native.c @@ -445,6 +445,33 @@ #undef MLK_NTT_BOUND /* mlkem/src/native/meta.h */ #undef MLK_NATIVE_META_H +/* mlkem/src/native/ppc64le/meta.h */ +#undef MLK_ARITH_BACKEND_NAME +#undef MLK_ARITH_BACKEND_PPC64LE_DEFAULT +#undef MLK_NATIVE_PPC64LE_META_H +#undef MLK_USE_NATIVE_INTT +#undef MLK_USE_NATIVE_NTT +#undef MLK_USE_NATIVE_POLY_REDUCE +#undef MLK_USE_NATIVE_POLY_TOMONT +/* mlkem/src/native/ppc64le/src/arith_native_ppc64le.h */ +#undef MLK_NATIVE_PPC64LE_SRC_ARITH_NATIVE_PPC64LE_H +#undef mlk_intt_ppc +#undef mlk_ntt_ppc +#undef mlk_poly_tomont_ppc +#undef mlk_reduce_ppc +/* mlkem/src/native/ppc64le/src/consts.h */ +#undef C1353_OFFSET +#undef C1441_OFFSET +#undef C20159_OFFSET +#undef IZETA_NTT_OFFSET127 +#undef IZETA_NTT_OFFSET63 +#undef MLK_NATIVE_PPC64LE_SRC_CONSTS_H +#undef NQ_OFFSET +#undef QINV_OFFSET +#undef Q_OFFSET +#undef ZETA_NTT_OFFSET +#undef ZETA_NTT_OFFSET64 +#undef mlk_ppc_qdata #if defined(MLK_SYS_AARCH64) /* * Undefine macros from native code (Arith, AArch64) diff --git a/mlkem/src/native/meta.h b/mlkem/src/native/meta.h index f2b9b848b..e39188323 100644 --- a/mlkem/src/native/meta.h +++ b/mlkem/src/native/meta.h @@ -18,4 +18,8 @@ #include "x86_64/meta.h" #endif +#ifdef MLK_SYS_PPC64LE +#include "ppc64le/meta.h" +#endif + #endif /* !MLK_NATIVE_META_H */ diff --git a/mlkem/src/native/ppc64le/README.md b/mlkem/src/native/ppc64le/README.md new file mode 100644 index 000000000..5125a40ea --- /dev/null +++ b/mlkem/src/native/ppc64le/README.md @@ -0,0 +1,6 @@ +[//]: # (SPDX-License-Identifier: CC-BY-4.0) + +# ppc64le backend (little endian) + +This directory contains a native backend for little endian POWER 8 (ppc64le) and above systems. 
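For reference, the assembly kernels added below vectorize a handful of scalar modular-arithmetic primitives. The following C sketch (illustrative only, not part of the patch or the build) shows the per-coefficient operations that ntt_ppc.S, intt_ppc.S, reduce.S and poly_tomont.S carry out across the vector lanes, using the constants packed into mlk_ppc_qdata in consts.c: q = 3329, QINV = -3327 (q^-1 mod 2^16), the Barrett multiplier 20159 = round(2^26/q), 1441 = 2^25 mod q (the final scaling pass of the inverse NTT), and 1353 = 2^32 mod q (used by poly_tomont). Function names are illustrative.

#include <stdint.h>

#define MLKEM_Q 3329
#define QINV -3327 /* q^-1 mod 2^16 */

/* Montgomery reduction: computes a * 2^-16 mod q; the result is bounded
 * by q in absolute value for |a| <= 2^15 * q. */
static int16_t montgomery_reduce(int32_t a)
{
  int16_t t = (int16_t)a * QINV;
  return (int16_t)((a - (int32_t)t * MLKEM_Q) >> 16);
}

/* Montgomery multiplication: used with the zeta tables in ntt_ppc.S and
 * intt_ppc.S, with 1441 for the final inverse-NTT scaling, and with
 * 1353 = 2^32 mod q in poly_tomont.S (so the result is a * 2^16 mod q). */
static int16_t fqmul(int16_t a, int16_t b)
{
  return montgomery_reduce((int32_t)a * (int32_t)b);
}

/* Barrett reduction with 20159 = round(2^26 / q): returns the centered
 * representative of a mod q. This is the quotient-estimate-and-subtract
 * sequence implemented by BREDUCE_4X in reduce.S and intt_ppc.S. */
static int16_t barrett_reduce(int16_t a)
{
  const int32_t v = 20159;
  int16_t quot = (int16_t)((v * (int32_t)a + (1 << 25)) >> 26);
  return (int16_t)(a - quot * MLKEM_Q);
}

On top of this, reduce.S conditionally adds q to negative coefficients (the To_unsigned_16 macro) so that the polynomial ends up with unsigned canonical coefficients in [0, q).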
+ diff --git a/mlkem/src/native/ppc64le/meta.h b/mlkem/src/native/ppc64le/meta.h new file mode 100644 index 000000000..54b3ddd9c --- /dev/null +++ b/mlkem/src/native/ppc64le/meta.h @@ -0,0 +1,53 @@ +/* + * Copyright (c) The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +#ifndef MLK_NATIVE_PPC64LE_META_H +#define MLK_NATIVE_PPC64LE_META_H + +/* Identifier for this backend so that source and assembly files + * in the build can be appropriately guarded. */ +#define MLK_ARITH_BACKEND_PPC64LE_DEFAULT + +#define MLK_ARITH_BACKEND_NAME PPC64LE_DEFAULT + +/* Set of primitives that this backend replaces */ +#define MLK_USE_NATIVE_NTT +#define MLK_USE_NATIVE_INTT +#define MLK_USE_NATIVE_POLY_REDUCE +#define MLK_USE_NATIVE_POLY_TOMONT + +#if !defined(__ASSEMBLER__) +#include +#include "../../common.h" +#include "../../params.h" +#include "../api.h" +#include "src/arith_native_ppc64le.h" + +static MLK_INLINE int mlk_ntt_native(int16_t data[MLKEM_N]) +{ + mlk_ntt_ppc(data, mlk_ppc_qdata); + return MLK_NATIVE_FUNC_SUCCESS; +} + +static MLK_INLINE int mlk_intt_native(int16_t data[MLKEM_N]) +{ + mlk_intt_ppc(data, mlk_ppc_qdata); + return MLK_NATIVE_FUNC_SUCCESS; +} + +static MLK_INLINE int mlk_poly_reduce_native(int16_t data[MLKEM_N]) +{ + mlk_reduce_ppc(data, mlk_ppc_qdata); + return MLK_NATIVE_FUNC_SUCCESS; +} + +static MLK_INLINE int mlk_poly_tomont_native(int16_t data[MLKEM_N]) +{ + mlk_poly_tomont_ppc(data, mlk_ppc_qdata); + return MLK_NATIVE_FUNC_SUCCESS; +} +#endif /* !__ASSEMBLER__ */ + +#endif /* !MLK_NATIVE_PPC64LE_META_H */ diff --git a/mlkem/src/native/ppc64le/src/arith_native_ppc64le.h b/mlkem/src/native/ppc64le/src/arith_native_ppc64le.h new file mode 100644 index 000000000..dbcee3e3e --- /dev/null +++ b/mlkem/src/native/ppc64le/src/arith_native_ppc64le.h @@ -0,0 +1,24 @@ +/* + * Copyright (c) 2024-2025 The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 + */ +#ifndef MLK_NATIVE_PPC64LE_SRC_ARITH_NATIVE_PPC64LE_H +#define MLK_NATIVE_PPC64LE_SRC_ARITH_NATIVE_PPC64LE_H + +#include +#include "../../../common.h" +#include "consts.h" + +#define mlk_ntt_ppc MLK_NAMESPACE(ntt_ppc) +void mlk_ntt_ppc(int16_t *, const int16_t *); + +#define mlk_intt_ppc MLK_NAMESPACE(intt_ppc) +void mlk_intt_ppc(int16_t *, const int16_t *); + +#define mlk_reduce_ppc MLK_NAMESPACE(reduce_ppc) +void mlk_reduce_ppc(int16_t *r, const int16_t *); + +#define mlk_poly_tomont_ppc MLK_NAMESPACE(poly_tomont_ppc) +void mlk_poly_tomont_ppc(int16_t *, const int16_t *); + +#endif /* !MLK_NATIVE_PPC64LE_SRC_ARITH_NATIVE_PPC64LE_H */ diff --git a/mlkem/src/native/ppc64le/src/consts.c b/mlkem/src/native/ppc64le/src/consts.c new file mode 100644 index 000000000..4c2fbdf61 --- /dev/null +++ b/mlkem/src/native/ppc64le/src/consts.c @@ -0,0 +1,155 @@ +/* + * Copyright (c) The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +#include "../../../common.h" + +#if defined(MLK_ARITH_BACKEND_PPC64LE_DEFAULT) && \ + !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED) + +#include "consts.h" + +MLK_ALIGN const int16_t mlk_ppc_qdata[1568] = { + /* -Q */ + -3329, -3329, -3329, -3329, -3329, -3329, -3329, -3329, + /* QINV */ + -3327, -3327, -3327, -3327, -3327, -3327, -3327, -3327, + /* Q */ + 3329, 3329, 3329, 3329, 3329, 3329, 3329, 3329, + /* const 20159 for reduce.S and intt */ + 20159, 20159, 20159, 20159, 20159, 20159, 20159, 20159, + /* const 1441 for intt */ + 1441, 1441, 1441, 1441, 1441, 1441, 1441, 1441, + /* for poly_tomont.S */ + 1353, 1353, 
1353, 1353, 1353, 1353, 1353, 1353, + /* zetas */ + /* For ntt Len=128, offset 96 */ + -758, -758, -758, -758, -758, -758, -758, -758, -359, -359, -359, -359, + -359, -359, -359, -359, -1517, -1517, -1517, -1517, -1517, -1517, -1517, + -1517, 1493, 1493, 1493, 1493, 1493, 1493, 1493, 1493, 1422, 1422, 1422, + 1422, 1422, 1422, 1422, 1422, 287, 287, 287, 287, 287, 287, 287, 287, 202, + 202, 202, 202, 202, 202, 202, 202, -171, -171, -171, -171, -171, -171, -171, + -171, 622, 622, 622, 622, 622, 622, 622, 622, 1577, 1577, 1577, 1577, 1577, + 1577, 1577, 1577, 182, 182, 182, 182, 182, 182, 182, 182, 962, 962, 962, + 962, 962, 962, 962, 962, -1202, -1202, -1202, -1202, -1202, -1202, -1202, + -1202, -1474, -1474, -1474, -1474, -1474, -1474, -1474, -1474, 1468, 1468, + 1468, 1468, 1468, 1468, 1468, 1468, 573, 573, 573, 573, 573, 573, 573, 573, + -1325, -1325, -1325, -1325, -1325, -1325, -1325, -1325, 264, 264, 264, 264, + 264, 264, 264, 264, 383, 383, 383, 383, 383, 383, 383, 383, -829, -829, + -829, -829, -829, -829, -829, -829, 1458, 1458, 1458, 1458, 1458, 1458, + 1458, 1458, -1602, -1602, -1602, -1602, -1602, -1602, -1602, -1602, -130, + -130, -130, -130, -130, -130, -130, -130, -681, -681, -681, -681, -681, + -681, -681, -681, 1017, 1017, 1017, 1017, 1017, 1017, 1017, 1017, 732, 732, + 732, 732, 732, 732, 732, 732, 608, 608, 608, 608, 608, 608, 608, 608, -1542, + -1542, -1542, -1542, -1542, -1542, -1542, -1542, 411, 411, 411, 411, 411, + 411, 411, 411, -205, -205, -205, -205, -205, -205, -205, -205, -1571, -1571, + -1571, -1571, -1571, -1571, -1571, -1571, 1223, 1223, 1223, 1223, 1223, + 1223, 1223, 1223, 652, 652, 652, 652, 652, 652, 652, 652, -552, -552, -552, + -552, -552, -552, -552, -552, 1015, 1015, 1015, 1015, 1015, 1015, 1015, + 1015, -1293, -1293, -1293, -1293, -1293, -1293, -1293, -1293, 1491, 1491, + 1491, 1491, 1491, 1491, 1491, 1491, -282, -282, -282, -282, -282, -282, + -282, -282, -1544, -1544, -1544, -1544, -1544, -1544, -1544, -1544, 516, + 516, 516, 516, 516, 516, 516, 516, -8, -8, -8, -8, -8, -8, -8, -8, -320, + -320, -320, -320, -320, -320, -320, -320, -666, -666, -666, -666, -666, + -666, -666, -666, -1618, -1618, -1618, -1618, -1618, -1618, -1618, -1618, + -1162, -1162, -1162, -1162, -1162, -1162, -1162, -1162, 126, 126, 126, 126, + 126, 126, 126, 126, 1469, 1469, 1469, 1469, 1469, 1469, 1469, 1469, -853, + -853, -853, -853, -853, -853, -853, -853, -90, -90, -90, -90, -90, -90, -90, + -90, -271, -271, -271, -271, -271, -271, -271, -271, 830, 830, 830, 830, + 830, 830, 830, 830, 107, 107, 107, 107, 107, 107, 107, 107, -1421, -1421, + -1421, -1421, -1421, -1421, -1421, -1421, -247, -247, -247, -247, -247, + -247, -247, -247, -951, -951, -951, -951, -951, -951, -951, -951, -398, + -398, -398, -398, -398, -398, -398, -398, 961, 961, 961, 961, 961, 961, 961, + 961, -1508, -1508, -1508, -1508, -1508, -1508, -1508, -1508, -725, -725, + -725, -725, -725, -725, -725, -725, 448, 448, 448, 448, 448, 448, 448, 448, + -1065, -1065, -1065, -1065, -1065, -1065, -1065, -1065, 677, 677, 677, 677, + 677, 677, 677, 677, -1275, -1275, -1275, -1275, -1275, -1275, -1275, -1275, + /* For intt Len=2, offset IZETA_NTT_OFFSET64 */ + -1103, -1103, -1103, -1103, 430, 430, 430, 430, 555, 555, 555, 555, 843, + 843, 843, 843, -1251, -1251, -1251, -1251, 871, 871, 871, 871, 1550, 1550, + 1550, 1550, 105, 105, 105, 105, 422, 422, 422, 422, 587, 587, 587, 587, 177, + 177, 177, 177, -235, -235, -235, -235, -291, -291, -291, -291, -460, -460, + -460, -460, 1574, 1574, 1574, 1574, 1653, 1653, 1653, 1653, 
-246, -246, + -246, -246, 778, 778, 778, 778, 1159, 1159, 1159, 1159, -147, -147, -147, + -147, -777, -777, -777, -777, 1483, 1483, 1483, 1483, -602, -602, -602, + -602, 1119, 1119, 1119, 1119, -1590, -1590, -1590, -1590, 644, 644, 644, + 644, -872, -872, -872, -872, 349, 349, 349, 349, 418, 418, 418, 418, 329, + 329, 329, 329, -156, -156, -156, -156, -75, -75, -75, -75, 817, 817, 817, + 817, 1097, 1097, 1097, 1097, 603, 603, 603, 603, 610, 610, 610, 610, 1322, + 1322, 1322, 1322, -1285, -1285, -1285, -1285, -1465, -1465, -1465, -1465, + 384, 384, 384, 384, -1215, -1215, -1215, -1215, -136, -136, -136, -136, + 1218, 1218, 1218, 1218, -1335, -1335, -1335, -1335, -874, -874, -874, -874, + 220, 220, 220, 220, -1187, -1187, -1187, -1187, -1659, -1659, -1659, -1659, + -1185, -1185, -1185, -1185, -1530, -1530, -1530, -1530, -1278, -1278, -1278, + -1278, 794, 794, 794, 794, -1510, -1510, -1510, -1510, -854, -854, -854, + -854, -870, -870, -870, -870, 478, 478, 478, 478, -108, -108, -108, -108, + -308, -308, -308, -308, 996, 996, 996, 996, 991, 991, 991, 991, 958, 958, + 958, 958, -1460, -1460, -1460, -1460, 1522, 1522, 1522, 1522, 1628, 1628, + 1628, 1628, + /* For intt Len=2, offset IZETA_NTT_OFFSET127 */ + 1628, 1628, 1628, 1628, 1522, 1522, 1522, 1522, -1460, -1460, -1460, -1460, + 958, 958, 958, 958, 991, 991, 991, 991, 996, 996, 996, 996, -308, -308, + -308, -308, -108, -108, -108, -108, 478, 478, 478, 478, -870, -870, -870, + -870, -854, -854, -854, -854, -1510, -1510, -1510, -1510, 794, 794, 794, + 794, -1278, -1278, -1278, -1278, -1530, -1530, -1530, -1530, -1185, -1185, + -1185, -1185, -1659, -1659, -1659, -1659, -1187, -1187, -1187, -1187, 220, + 220, 220, 220, -874, -874, -874, -874, -1335, -1335, -1335, -1335, 1218, + 1218, 1218, 1218, -136, -136, -136, -136, -1215, -1215, -1215, -1215, 384, + 384, 384, 384, -1465, -1465, -1465, -1465, -1285, -1285, -1285, -1285, 1322, + 1322, 1322, 1322, 610, 610, 610, 610, 603, 603, 603, 603, 1097, 1097, 1097, + 1097, 817, 817, 817, 817, -75, -75, -75, -75, -156, -156, -156, -156, 329, + 329, 329, 329, 418, 418, 418, 418, 349, 349, 349, 349, -872, -872, -872, + -872, 644, 644, 644, 644, -1590, -1590, -1590, -1590, 1119, 1119, 1119, + 1119, -602, -602, -602, -602, 1483, 1483, 1483, 1483, -777, -777, -777, + -777, -147, -147, -147, -147, 1159, 1159, 1159, 1159, 778, 778, 778, 778, + -246, -246, -246, -246, 1653, 1653, 1653, 1653, 1574, 1574, 1574, 1574, + -460, -460, -460, -460, -291, -291, -291, -291, -235, -235, -235, -235, 177, + 177, 177, 177, 587, 587, 587, 587, 422, 422, 422, 422, 105, 105, 105, 105, + 1550, 1550, 1550, 1550, 871, 871, 871, 871, -1251, -1251, -1251, -1251, 843, + 843, 843, 843, 555, 555, 555, 555, 430, 430, 430, 430, -1103, -1103, -1103, + -1103, + /* For intt Len=4 and others, offset IZETA_NTT_OFFSET63 */ + -1275, -1275, -1275, -1275, -1275, -1275, -1275, -1275, 677, 677, 677, 677, + 677, 677, 677, 677, -1065, -1065, -1065, -1065, -1065, -1065, -1065, -1065, + 448, 448, 448, 448, 448, 448, 448, 448, -725, -725, -725, -725, -725, -725, + -725, -725, -1508, -1508, -1508, -1508, -1508, -1508, -1508, -1508, 961, + 961, 961, 961, 961, 961, 961, 961, -398, -398, -398, -398, -398, -398, -398, + -398, -951, -951, -951, -951, -951, -951, -951, -951, -247, -247, -247, + -247, -247, -247, -247, -247, -1421, -1421, -1421, -1421, -1421, -1421, + -1421, -1421, 107, 107, 107, 107, 107, 107, 107, 107, 830, 830, 830, 830, + 830, 830, 830, 830, -271, -271, -271, -271, -271, -271, -271, -271, -90, + -90, -90, -90, -90, -90, -90, -90, -853, -853, 
-853, -853, -853, -853, -853, + -853, 1469, 1469, 1469, 1469, 1469, 1469, 1469, 1469, 126, 126, 126, 126, + 126, 126, 126, 126, -1162, -1162, -1162, -1162, -1162, -1162, -1162, -1162, + -1618, -1618, -1618, -1618, -1618, -1618, -1618, -1618, -666, -666, -666, + -666, -666, -666, -666, -666, -320, -320, -320, -320, -320, -320, -320, + -320, -8, -8, -8, -8, -8, -8, -8, -8, 516, 516, 516, 516, 516, 516, 516, + 516, -1544, -1544, -1544, -1544, -1544, -1544, -1544, -1544, -282, -282, + -282, -282, -282, -282, -282, -282, 1491, 1491, 1491, 1491, 1491, 1491, + 1491, 1491, -1293, -1293, -1293, -1293, -1293, -1293, -1293, -1293, 1015, + 1015, 1015, 1015, 1015, 1015, 1015, 1015, -552, -552, -552, -552, -552, + -552, -552, -552, 652, 652, 652, 652, 652, 652, 652, 652, 1223, 1223, 1223, + 1223, 1223, 1223, 1223, 1223, -1571, -1571, -1571, -1571, -1571, -1571, + -1571, -1571, -205, -205, -205, -205, -205, -205, -205, -205, 411, 411, 411, + 411, 411, 411, 411, 411, -1542, -1542, -1542, -1542, -1542, -1542, -1542, + -1542, 608, 608, 608, 608, 608, 608, 608, 608, 732, 732, 732, 732, 732, 732, + 732, 732, 1017, 1017, 1017, 1017, 1017, 1017, 1017, 1017, -681, -681, -681, + -681, -681, -681, -681, -681, -130, -130, -130, -130, -130, -130, -130, + -130, -1602, -1602, -1602, -1602, -1602, -1602, -1602, -1602, 1458, 1458, + 1458, 1458, 1458, 1458, 1458, 1458, -829, -829, -829, -829, -829, -829, + -829, -829, 383, 383, 383, 383, 383, 383, 383, 383, 264, 264, 264, 264, 264, + 264, 264, 264, -1325, -1325, -1325, -1325, -1325, -1325, -1325, -1325, 573, + 573, 573, 573, 573, 573, 573, 573, 1468, 1468, 1468, 1468, 1468, 1468, 1468, + 1468, -1474, -1474, -1474, -1474, -1474, -1474, -1474, -1474, -1202, -1202, + -1202, -1202, -1202, -1202, -1202, -1202, 962, 962, 962, 962, 962, 962, 962, + 962, 182, 182, 182, 182, 182, 182, 182, 182, 1577, 1577, 1577, 1577, 1577, + 1577, 1577, 1577, 622, 622, 622, 622, 622, 622, 622, 622, -171, -171, -171, + -171, -171, -171, -171, -171, 202, 202, 202, 202, 202, 202, 202, 202, 287, + 287, 287, 287, 287, 287, 287, 287, 1422, 1422, 1422, 1422, 1422, 1422, 1422, + 1422, 1493, 1493, 1493, 1493, 1493, 1493, 1493, 1493, -1517, -1517, -1517, + -1517, -1517, -1517, -1517, -1517, -359, -359, -359, -359, -359, -359, -359, + -359, -758, -758, -758, -758, -758, -758, -758, -758}; + +#endif /* MLK_ARITH_BACKEND_PPC64LE_DEFAULT && \ + !MLK_CONFIG_MULTILEVEL_NO_SHARED */ diff --git a/mlkem/src/native/ppc64le/src/consts.h b/mlkem/src/native/ppc64le/src/consts.h new file mode 100644 index 000000000..49f519d0c --- /dev/null +++ b/mlkem/src/native/ppc64le/src/consts.h @@ -0,0 +1,26 @@ +/* + * Copyright (c) The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +#ifndef MLK_NATIVE_PPC64LE_SRC_CONSTS_H +#define MLK_NATIVE_PPC64LE_SRC_CONSTS_H +#include "../../../common.h" + +#define NQ_OFFSET 0 +#define QINV_OFFSET 16 +#define Q_OFFSET 32 +#define C20159_OFFSET 48 +#define C1441_OFFSET 64 +#define C1353_OFFSET 80 +#define ZETA_NTT_OFFSET 96 +#define ZETA_NTT_OFFSET64 1104 +#define IZETA_NTT_OFFSET127 1616 +#define IZETA_NTT_OFFSET63 2128 + +#ifndef __ASSEMBLER__ +#define mlk_ppc_qdata MLK_NAMESPACE(ppc_qdata) +extern const int16_t mlk_ppc_qdata[]; +#endif + +#endif /* !MLK_NATIVE_PPC64LE_SRC_CONSTS_H */ diff --git a/mlkem/src/native/ppc64le/src/intt_ppc.S b/mlkem/src/native/ppc64le/src/intt_ppc.S new file mode 100644 index 000000000..1a4975ba0 --- /dev/null +++ b/mlkem/src/native/ppc64le/src/intt_ppc.S @@ -0,0 +1,685 @@ +/* + * Copyright (c) The mlkem-native project 
authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +# +# Copyright 2025- IBM Corp. +# +#=================================================================================== +# Written by Danny Tsen +# + +#include "../../../common.h" +#if defined(MLK_ARITH_BACKEND_PPC64LE_DEFAULT) && \ + !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED) + +#include "consts.h" + +.machine "any" +.text + +# Barrett reduce constatnts +#define V20159 0 +#define V_25 1 +#define V_26 2 +#define V_MKQ 3 + +# Montgomery reduce constatnts +#define V_QINV 2 +#define V_NMKQ 5 +#define V_Z0 7 +#define V_Z1 8 +#define V_Z2 9 +#define V_Z3 10 +#define V_ZETA 10 +#define V1441 10 + +.macro Load_4Coeffs start next step + mr 9, \start # j + add 10, 7, 9 # J + len*2 + addi 16, 9, \next + addi 17, 10, \step + addi 18, 16, \next + addi 19, 17, \step + addi 20, 18, \next + addi 21, 19, \step + lxvd2x 32+8, 3, 10 # r[j+len] + lxvd2x 32+12, 3, 17 # r[j+len] + lxvd2x 32+16, 3, 19 # r[j+len] + lxvd2x 32+20, 3, 21 # r[j+len] + xxpermdi 32+8, 32+8, 32+8, 2 + xxpermdi 32+12, 32+12, 32+12, 2 + xxpermdi 32+16, 32+16, 32+16, 2 + xxpermdi 32+20, 32+20, 32+20, 2 + + lxvd2x 32+21, 3, 9 + lxvd2x 32+22, 3, 16 + lxvd2x 32+23, 3, 18 + lxvd2x 32+24, 3, 20 + xxpermdi 32+21, 32+21, 32+21, 2 + xxpermdi 32+22, 32+22, 32+22, 2 + xxpermdi 32+23, 32+23, 32+23, 2 + xxpermdi 32+24, 32+24, 32+24, 2 + + vsubuhm 25, 8, 21 # r[j+len] - t + vsubuhm 26, 12, 22 # r[j+len] - t + vsubuhm 30, 16, 23 # r[j+len] - t + vsubuhm 31, 20, 24 # r[j+len] - t + vadduhm 8, 8, 21 # r[j+len] + t + vadduhm 12, 12, 22 # r[j+len] + t + vadduhm 16, 16, 23 # r[j+len] + t + vadduhm 20, 20, 24 # r[j+len] + t +.endm + +.macro BREDUCE_4X _v0 _v1 _v2 _v3 + vxor 7, 7, 7 + xxlor 32+3, 6, 6 # V_MKQ + xxlor 32+1, 7, 7 # V_25 + xxlor 32+2, 8, 8 # V_26 + # Multify Odd/Even signed halfword; + # Results word bound by 2^32 in abs value. + vmulosh 6, 8, V20159 + vmulesh 5, 8, V20159 + vmulosh 11, 12, V20159 + vmulesh 10, 12, V20159 + vmulosh 15, 16, V20159 + vmulesh 14, 16, V20159 + vmulosh 19, 20, V20159 + vmulesh 18, 20, V20159 + xxmrglw 32+4, 32+5, 32+6 + xxmrghw 32+5, 32+5, 32+6 + xxmrglw 32+9, 32+10, 32+11 + xxmrghw 32+10, 32+10, 32+11 + xxmrglw 32+13, 32+14, 32+15 + xxmrghw 32+14, 32+14, 32+15 + xxmrglw 32+17, 32+18, 32+19 + xxmrghw 32+18, 32+18, 32+19 + vadduwm 4, 4, V_25 + vadduwm 5, 5, V_25 + vadduwm 9, 9, V_25 + vadduwm 10, 10, V_25 + vadduwm 13, 13, V_25 + vadduwm 14, 14, V_25 + vadduwm 17, 17, V_25 + vadduwm 18, 18, V_25 + # Right shift and pack lower halfword, + # results bond to 2^16 in abs value + vsraw 4, 4, V_26 + vsraw 5, 5, V_26 + vsraw 9, 9, V_26 + vsraw 10, 10, V_26 + vsraw 13, 13, V_26 + vsraw 14, 14, V_26 + vsraw 17, 17, V_26 + vsraw 18, 18, V_26 + vpkuwum 4, 5, 4 + vsubuhm 4, 7, 4 + vpkuwum 9, 10, 9 + vsubuhm 9, 7, 9 + vpkuwum 13, 14, 13 + vsubuhm 13, 7, 13 + vpkuwum 17, 18, 17 + vsubuhm 17, 7, 17 + # Modulo multify-Low unsigned halfword; + # results bond to 2^16 * q in abs value. 
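+ # i.e. result = r - quot*MLKEM_Q: the negated quotient (v4/v9/v13/v17)
+ # times q, added back to r (v8/v12/v16/v20).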
+ vmladduhm \_v0, 4, V_MKQ, 8 + vmladduhm \_v1, 9, V_MKQ, 12 + vmladduhm \_v2, 13, V_MKQ, 16 + vmladduhm \_v3, 17, V_MKQ, 20 +.endm + +#----------------------------------- +# MREDUCE_4X(len, start, _vz0, _vz1, _vz2, _vz3) +# +.macro MREDUCE_4X _vz0 _vz1 _vz2 _vz3 _vo0 _vo1 _vo2 _vo3 + # Modular multification bond by 2^16 * q in abs value + vmladduhm 15, 25, \_vz0, 3 + vmladduhm 20, 26, \_vz1, 3 + vmladduhm 27, 30, \_vz2, 3 + vmladduhm 28, 31, \_vz3, 3 + + # Signed multiply-high-round; outputs are bound by 2^15 * q in abs value + vmhraddshs 14, 25, \_vz0, 3 + vmhraddshs 19, 26, \_vz1, 3 + vmhraddshs 24, 30, \_vz2, 3 + vmhraddshs 29, 31, \_vz3, 3 + + vmladduhm 15, 15, V_QINV, 3 + vmladduhm 20, 20, V_QINV, 3 + vmladduhm 25, 27, V_QINV, 3 + vmladduhm 30, 28, V_QINV, 3 + + vmhraddshs 15, 15, V_NMKQ, 14 + vmhraddshs 20, 20, V_NMKQ, 19 + vmhraddshs 25, 25, V_NMKQ, 24 + vmhraddshs 30, 30, V_NMKQ, 29 + + vsrah \_vo0, 15, 4 # >> 1 + vsrah \_vo1, 20, 4 # >> 1 + vsrah \_vo2, 25, 4 # >> 1 + vsrah \_vo3, 30, 4 # >> 1 +.endm + +.macro Set_mont_consts + xxlor 32+5, 0, 0 # V_NMKQ + xxlor 32+2, 2, 2 # V_QINV + xxlor 32+3, 3, 3 # 0 + xxlor 32+4, 4, 4 # 1 +.endm + +.macro Load_next_4zetas + lxv 32+V_Z0, 0(14) + lxv 32+V_Z1, 16(14) + lxv 32+V_Z2, 32(14) + lxv 32+V_Z3, 48(14) + addi 14, 14, 64 +.endm + +.macro Write_B4C _vs0 _vs1 _vs2 _vs3 + stxvx \_vs0, 3, 9 + stxvx \_vs1, 3, 16 + stxvx \_vs2, 3, 18 + stxvx \_vs3, 3, 20 +.endm + +.macro Write_M4C _vs0 _vs1 _vs2 _vs3 + stxvx \_vs0, 3, 10 + stxvx \_vs1, 3, 17 + stxvx \_vs2, 3, 19 + stxvx \_vs3, 3, 21 +.endm + +.macro Reload_4coeffs + lxv 32+25, 0(3) + lxv 32+26, 16(3) + lxv 32+30, 32(3) + lxv 32+31, 48(3) + addi 3, 3, 64 +.endm + +.macro MWrite_8X _vs0 _vs1 _vs2 _vs3 _vs4 _vs5 _vs6 _vs7 + stxv \_vs0, -128(3) + stxv \_vs1, -112(3) + stxv \_vs2, -96(3) + stxv \_vs3, -80(3) + stxv \_vs4, -64(3) + stxv \_vs5, -48(3) + stxv \_vs6, -32(3) + stxv \_vs7, -16(3) +.endm + +.macro Write_Len2_4C _vs0 _vs1 _vs2 _vs3 + xxmrglw 32+12, \_vs0, 10 + xxmrghw 32+11, \_vs0, 10 + xxpermdi 10, 32+12, 32+11, 3 + xxmrglw 32+16, \_vs1, 11 + xxmrghw 32+15, \_vs1, 11 + xxpermdi 11, 32+16, 32+15, 3 + xxmrglw 32+12, \_vs2, 12 + xxmrghw 32+11, \_vs2, 12 + xxpermdi 12, 32+12, 32+11, 3 + xxmrglw 32+16, \_vs3, 13 + xxmrghw 32+15, \_vs3, 13 + xxpermdi 13, 32+16, 32+15, 3 + stxvd2x 10, 3, 9 + stxvd2x 11, 3, 16 + stxvd2x 12, 3, 18 + stxvd2x 13, 3, 20 +.endm + +.macro Write_Len4_4C _vs0 _vs1 _vs2 _vs3 + xxpermdi 10, 10, \_vs0, 3 + xxpermdi 11, 11, \_vs1, 3 + xxpermdi 12, 12, \_vs2, 3 + xxpermdi 13, 13, \_vs3, 3 + stxvd2x 10, 3, 9 + stxvd2x 11, 3, 16 + stxvd2x 12, 3, 18 + stxvd2x 13, 3, 20 +.endm + +# intt +# t = r[j]; +# r[j] = barrett_reduce(t + r[j + len]); +# r[j + len] = r[j + len] - t; +# r[j + len] = fqmul(zeta, r[j + len]); + +# +# mlk_intt_ppc(r) +# +.global MLK_ASM_NAMESPACE(intt_ppc) +.align 4 +MLK_ASM_FN_SYMBOL(intt_ppc) + + stdu 1, -352(1) + mflr 0 + std 14, 56(1) + std 15, 64(1) + std 16, 72(1) + std 17, 80(1) + std 18, 88(1) + std 19, 96(1) + std 20, 104(1) + std 21, 112(1) + stxv 32+20, 128(1) + stxv 32+21, 144(1) + stxv 32+22, 160(1) + stxv 32+23, 176(1) + stxv 32+24, 192(1) + stxv 32+25, 208(1) + stxv 32+26, 224(1) + stxv 32+27, 240(1) + stxv 32+28, 256(1) + stxv 32+29, 272(1) + stxv 32+30, 288(1) + stxv 32+31, 304(1) + + # init vectors and constants + # Setup for Montgomery reduce + lxv 0, 0(4) + + lxv 32+V_QINV, QINV_OFFSET(4) # QINV + xxlxor 32+3, 32+3, 32+3 + vspltish 4, 1 + xxlor 2, 32+2, 32+2 # QINV + xxlor 3, 32+3, 32+3 # 0 + xxlor 4, 32+4, 32+4 # 1 + + # Setup for Barrett 
reduce + lxv 6, Q_OFFSET(4) # V_MKQ + lxv 32+V20159, C20159_OFFSET(4) # V20159 + lxv 7, 0(4) # V_25 + + #xxspltiw 8, 26 # for power9 and above + vspltisw 8, 13 + vadduwm 8, 8, 8 + xxlor 8, 32+8, 32+8 # V_26 store at vs8 + + vspltisw 9, 1 + vsubuwm 10, 8, 9 # 25 + vslw 9, 9, 10 + xxlor 7, 32+9, 32+9 # V_25 syore at vs7 + +.align 4 +#__Len2: + # + # 1. len = 2, start = 0, 4, 8, 12,...244, 248, 252 + # Update zetas vectors, each vector has 2 zetas + addi 14, 4, IZETA_NTT_OFFSET127 + li 7, 4 + li 15, 4 + mtctr 15 + li 5, 0 +intt_ppc__Loop2: + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + xxlor 10, 32+4, 32+4 + xxlor 11, 32+9, 32+9 + xxlor 12, 32+13, 32+13 + xxlor 13, 32+17, 32+17 + Set_mont_consts + Load_next_4zetas + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 + Write_Len2_4C 32+13, 32+18, 32+23, 32+28 + + addi 5, 5, 64 + + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + xxlor 10, 32+4, 32+4 + xxlor 11, 32+9, 32+9 + xxlor 12, 32+13, 32+13 + xxlor 13, 32+17, 32+17 + Set_mont_consts + Load_next_4zetas + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 + Write_Len2_4C 32+13, 32+18, 32+23, 32+28 + addi 5, 5, 64 + bdnz intt_ppc__Loop2 + +.align 4 +#__Len4: + # + # 2. len = 4, start = 0, 8, 16, 24,...232, 240, 248 + addi 14, 4, IZETA_NTT_OFFSET63 + li 5, 0 + li 7, 8 + li 15, 4 # loops + mtctr 15 +intt_ppc__Loop4: + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + xxlor 10, 32+4, 32+4 + xxlor 11, 32+9, 32+9 + xxlor 12, 32+13, 32+13 + xxlor 13, 32+17, 32+17 + Set_mont_consts + Load_next_4zetas + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 + Write_Len4_4C 32+13, 32+18, 32+23, 32+28 + addi 5, 5, 64 + + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + xxlor 10, 32+4, 32+4 + xxlor 11, 32+9, 32+9 + xxlor 12, 32+13, 32+13 + xxlor 13, 32+17, 32+17 + Set_mont_consts + Load_next_4zetas + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 + Write_Len4_4C 32+13, 32+18, 32+23, 32+28 + addi 5, 5, 64 + bdnz intt_ppc__Loop4 + +.align 4 +#__Len8: + # 3. len = 8, start = 0, 16, 32, 48,...208, 224, 240 + #addi 14, 14, 512 + li 7, 16 + li 5, 0 + + Load_4Coeffs 5, 32, 32 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + Load_next_4zetas + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + li 5, 128 + + Load_4Coeffs 5, 32, 32 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + Load_next_4zetas + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + li 5, 256 + + Load_4Coeffs 5, 32, 32 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + Load_next_4zetas + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + li 5, 384 + + Load_4Coeffs 5, 32, 32 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + Load_next_4zetas + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + +.align 4 +#__Len16: + # + # 4. 
len = 16, start = 0, 32, 64,,...160, 192, 224 + #addi 14, 14, 768 + li 5, 0 + li 7, 32 + + Load_4Coeffs 5, 64, 64 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + Load_next_4zetas + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + li 5, 16 + Load_4Coeffs 5, 64, 64 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + addi 14, 14, -64 + Load_next_4zetas + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + + li 5, 256 + Load_4Coeffs 5, 64, 64 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + Load_next_4zetas + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + + li 5, 272 + Load_4Coeffs 5, 64, 64 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + addi 14, 14, -64 + Load_next_4zetas + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + +.align 4 +#__Len32: + # + # 5. len = 32, start = 0, 64, 128, 192 + #addi 14, 14, 896 + li 5, 0 + li 7, 64 + + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + li 5, 128 + + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + li 5, 256 + + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + li 5, 384 + + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + +.align 4 +#__Len64: + # + # 6. len = 64, start = 0, 128 + #addi 14, 14, 960 + li 5, 0 + li 7, 128 + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + li 5, 64 + + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + lxv 32+10, -16(14) + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + li 5, 256 + + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + li 5, 320 + + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + lxv 32+10, -16(14) + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + +.align 4 +#__Len128: + # 7. 
len = 128, start = 0 + # + #addi 14, 14, 992 + li 5, 0 # start + li 7, 256 # len * 2 + + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + lvx V_ZETA, 0, 14 + xxlor 9, 32+10, 32+10 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + li 5, 64 + + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + xxlor 32+10, 9, 9 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + li 5, 128 + + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + xxlor 32+10, 9, 9 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + li 5, 192 + + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + xxlor 32+10, 9, 9 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + +.align 4 + # + # Montgomery reduce loops with constant 1441 + # + addi 14, 4, C1441_OFFSET + lvx V1441, 0, 14 + + Reload_4coeffs + MREDUCE_4X V1441, V1441, V1441, V1441, 6, 7, 8, 9 + Reload_4coeffs + MREDUCE_4X V1441, V1441, V1441, V1441, 13, 18, 23, 28 + MWrite_8X 32+6, 32+7, 32+8, 32+9, 32+13, 32+18, 32+23, 32+28 + + Reload_4coeffs + MREDUCE_4X V1441, V1441, V1441, V1441, 6, 7, 8, 9 + Reload_4coeffs + MREDUCE_4X V1441, V1441, V1441, V1441, 13, 18, 23, 28 + MWrite_8X 32+6, 32+7, 32+8, 32+9, 32+13, 32+18, 32+23, 32+28 + + Reload_4coeffs + MREDUCE_4X V1441, V1441, V1441, V1441, 6, 7, 8, 9 + Reload_4coeffs + MREDUCE_4X V1441, V1441, V1441, V1441, 13, 18, 23, 28 + MWrite_8X 32+6, 32+7, 32+8, 32+9, 32+13, 32+18, 32+23, 32+28 + + Reload_4coeffs + MREDUCE_4X V1441, V1441, V1441, V1441, 6, 7, 8, 9 + Reload_4coeffs + MREDUCE_4X V1441, V1441, V1441, V1441, 13, 18, 23, 28 + MWrite_8X 32+6, 32+7, 32+8, 32+9, 32+13, 32+18, 32+23, 32+28 + + lxv 32+20, 128(1) + lxv 32+21, 144(1) + lxv 32+22, 160(1) + lxv 32+23, 176(1) + lxv 32+24, 192(1) + lxv 32+25, 208(1) + lxv 32+26, 224(1) + lxv 32+27, 240(1) + lxv 32+28, 256(1) + lxv 32+29, 272(1) + lxv 32+30, 288(1) + lxv 32+31, 304(1) + ld 14, 56(1) + ld 15, 64(1) + ld 16, 72(1) + ld 16, 72(1) + ld 17, 80(1) + ld 18, 88(1) + ld 19, 96(1) + ld 20, 104(1) + ld 21, 112(1) + + mtlr 0 + addi 1, 1, 352 + blr + +/* To facilitate single-compilation-unit (SCU) builds, undefine all macros. + * Don't modify by hand -- this is auto-generated by scripts/autogen. */ +#undef V20159 +#undef V_25 +#undef V_26 +#undef V_MKQ +#undef V_QINV +#undef V_NMKQ +#undef V_Z0 +#undef V_Z1 +#undef V_Z2 +#undef V_Z3 +#undef V_ZETA +#undef V1441 + +#endif /* MLK_ARITH_BACKEND_PPC64LE_DEFAULT && \ + !MLK_CONFIG_MULTILEVEL_NO_SHARED */ + +/* To facilitate single-compilation-unit (SCU) builds, undefine all macros. + * Don't modify by hand -- this is auto-generated by scripts/autogen. */ +#undef V20159 +#undef V_25 +#undef V_26 +#undef V_MKQ +#undef V_QINV +#undef V_NMKQ +#undef V_Z0 +#undef V_Z1 +#undef V_Z2 +#undef V_Z3 +#undef V_ZETA +#undef V1441 diff --git a/mlkem/src/native/ppc64le/src/ntt_ppc.S b/mlkem/src/native/ppc64le/src/ntt_ppc.S new file mode 100644 index 000000000..e9a8df81f --- /dev/null +++ b/mlkem/src/native/ppc64le/src/ntt_ppc.S @@ -0,0 +1,412 @@ +/* + * Copyright (c) The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +# +# Copyright 2025- IBM Corp. 
+# +#=================================================================================== +# Written by Danny Tsen +# + +#include "../../../common.h" +#if defined(MLK_ARITH_BACKEND_PPC64LE_DEFAULT) && \ + !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED) + +#include "consts.h" + +#define V_QINV 2 +#define V_NMKQ 5 +#define V_Z0 7 +#define V_Z1 8 +#define V_Z2 9 +#define V_Z3 10 +#define V_ZETA 10 + +.machine "any" +.text + +# +# montgomery_reduce +# t = a * QINV +# t = (a - (int32_t)t*_MLKEM_Q) >> 16 +# +#----------------------------------- +# MREDUCE_4X(start, _vz0, _vz1, _vz2, _vz3) +# +.macro MREDUCE_4X start next step _vz0 _vz1 _vz2 _vz3 + mr 9, \start + add 10, 7, 9 # J + len*2 + addi 16, 9, \next + addi 17, 10, \step + addi 18, 16, \next + addi 19, 17, \step + addi 20, 18, \next + addi 21, 19, \step + lxvd2x 32+13, 3, 10 # r[j+len] + lxvd2x 32+18, 3, 17 # r[j+len] + lxvd2x 32+23, 3, 19 # r[j+len] + lxvd2x 32+28, 3, 21 # r[j+len] + xxpermdi 32+13, 32+13, 32+13, 2 + xxpermdi 32+18, 32+18, 32+18, 2 + xxpermdi 32+23, 32+23, 32+23, 2 + xxpermdi 32+28, 32+28, 32+28, 2 + + # fqmul = zeta * coefficient + # Modular multification bond by 2^16 * q in abs value + vmladduhm 15, 13, \_vz0, 3 + vmladduhm 20, 18, \_vz1, 3 + vmladduhm 25, 23, \_vz2, 3 + vmladduhm 30, 28, \_vz3, 3 + + # Signed multiply-high-round; outputs are bound by 2^15 * q in abs value + vmhraddshs 14, 13, \_vz0, 3 + vmhraddshs 19, 18, \_vz1, 3 + vmhraddshs 24, 23, \_vz2, 3 + vmhraddshs 29, 28, \_vz3, 3 + + vmladduhm 15, 15, V_QINV, 3 + vmladduhm 20, 20, V_QINV, 3 + vmladduhm 25, 25, V_QINV, 3 + vmladduhm 30, 30, V_QINV, 3 + + vmhraddshs 15, 15, V_NMKQ, 14 + vmhraddshs 20, 20, V_NMKQ, 19 + vmhraddshs 25, 25, V_NMKQ, 24 + vmhraddshs 30, 30, V_NMKQ, 29 + + vsrah 13, 15, 4 # >> 1 + vsrah 18, 20, 4 # >> 1 + vsrah 23, 25, 4 # >> 1 + vsrah 28, 30, 4 # >> 1 + + lxvd2x 32+12, 3, 9 # r[j] + lxvd2x 32+17, 3, 16 # r[j] + lxvd2x 32+22, 3, 18 # r[j] + lxvd2x 32+27, 3, 20 # r[j] + xxpermdi 32+12, 32+12, 32+12, 2 + xxpermdi 32+17, 32+17, 32+17, 2 + xxpermdi 32+22, 32+22, 32+22, 2 + xxpermdi 32+27, 32+27, 32+27, 2 + + # Since the result of the Montgomery multiplication is bounded + # by q in absolute value. 
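+ # The add/sub below therefore grows each coefficient by at most q per layer.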
+ # Finally to complete the final update of the results with add/sub + vsubuhm 16, 12, 13 # r - t + vadduhm 15, 13, 12 # r + t + vsubuhm 21, 17, 18 # r - t + vadduhm 20, 18, 17 # r + t + vsubuhm 26, 22, 23 # r - t + vadduhm 25, 23, 22 # r + t + vsubuhm 31, 27, 28 # r - t + vadduhm 30, 28, 27 # r + t +.endm + +.macro Write_One + stxvx 32+15, 3, 9 + stxvx 32+16, 3, 10 + stxvx 32+20, 3, 16 + stxvx 32+21, 3, 17 + stxvx 32+25, 3, 18 + stxvx 32+26, 3, 19 + stxvx 32+30, 3, 20 + stxvx 32+31, 3, 21 +.endm + +.macro Write_Two + xxpermdi 32+17, 32+16, 32+15, 3 + xxpermdi 32+22, 32+21, 32+20, 3 + xxpermdi 32+27, 32+26, 32+25, 3 + xxpermdi 32+29, 32+31, 32+30, 3 + + stxvx 32+17, 3, 9 + stxvx 32+22, 3, 16 + stxvx 32+27, 3, 18 + stxvx 32+29, 3, 20 +.endm + +.macro Write_Three + xxmrglw 32+14, 32+16, 32+15 + xxmrghw 32+13, 32+16, 32+15 + xxpermdi 32+17, 32+13, 32+14, 3 + xxmrglw 32+19, 32+21, 32+20 + xxmrghw 32+18, 32+21, 32+20 + xxpermdi 32+22, 32+18, 32+19, 3 + xxmrglw 32+14, 32+26, 32+25 + xxmrghw 32+13, 32+26, 32+25 + xxpermdi 32+27, 32+13, 32+14, 3 + xxmrglw 32+24, 32+31, 32+30 + xxmrghw 32+23, 32+31, 32+30 + xxpermdi 32+29, 32+23, 32+24, 3 + stxvx 32+17, 3, 9 + stxvx 32+22, 3, 16 + stxvx 32+27, 3, 18 + stxvx 32+29, 3, 20 +.endm + +.macro Load_next_4zetas + lxv 32+V_Z0, 0(14) + lxv 32+V_Z1, 16(14) + lxv 32+V_Z2, 32(14) + lxv 32+V_Z3, 48(14) + addi 14, 14, 64 +.endm + +# +# mlk_ntt_ppc(int16_t *r) +# +.global MLK_ASM_NAMESPACE(ntt_ppc) +.align 4 +MLK_ASM_FN_SYMBOL(ntt_ppc) + + stdu 1, -352(1) + mflr 0 + std 14, 56(1) + std 15, 64(1) + std 16, 72(1) + std 17, 80(1) + std 18, 88(1) + std 19, 96(1) + std 20, 104(1) + std 21, 112(1) + stxv 32+20, 128(1) + stxv 32+21, 144(1) + stxv 32+22, 160(1) + stxv 32+23, 176(1) + stxv 32+24, 192(1) + stxv 32+25, 208(1) + stxv 32+26, 224(1) + stxv 32+27, 240(1) + stxv 32+28, 256(1) + stxv 32+29, 272(1) + stxv 32+30, 288(1) + stxv 32+31, 304(1) + + # get MLKEM_Q + lvx V_NMKQ,0,4 + + # zetas array + addi 14, 4, ZETA_NTT_OFFSET + + vxor 3, 3, 3 + vspltish 4, 1 + + lxv 32+V_QINV, QINV_OFFSET(4) + +.align 4 +#__Len128: + # + # Compute coefficients of the NTT based on the following loop. + # for (len = 128; len ≥ 2; len = len/2) + # + # 1. len = 128, start = 0 + # + li 5, 0 # start + li 7, 256 # len * 2 + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + + MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + Write_One + li 5, 64 + MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + Write_One + li 5, 128 + MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + Write_One + li 5, 192 + MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + Write_One + +.align 4 +#__Len64: + # + # 2. len = 64, start = 0, 128 + # k += 2 + li 5, 0 + li 7, 128 + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + Write_One + li 5, 64 + MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + Write_One + li 5, 256 + + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + Write_One + li 5, 320 + MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + Write_One + +.align 4 +#__Len32: + # + # 3. 
len = 32, start = 0, 64, 128, 192 + # k += 4 + li 5, 0 + li 7, 64 + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + Write_One + #li 5, 64 + li 5, 128 + + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + Write_One + #li 5, 128 + li 5, 256 + + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + Write_One + #li 5, 192 + li 5, 384 + + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + Write_One + +.align 4 +#__Len16: + # + # 4. len = 16, start = 0, 32, 64,,...160, 192, 224 + # k += 8 + li 5, 0 + li 7, 32 + Load_next_4zetas + MREDUCE_4X 5, 64, 64, V_Z0, V_Z1, V_Z2, V_Z3 + Write_One + li 5, 16 + MREDUCE_4X 5, 64, 64, V_Z0, V_Z1, V_Z2, V_Z3 + Write_One + + Load_next_4zetas + li 5, 256 + MREDUCE_4X 5, 64, 64, V_Z0, V_Z1, V_Z2, V_Z3 + Write_One + li 5, 272 + MREDUCE_4X 5, 64, 64, V_Z0, V_Z1, V_Z2, V_Z3 + Write_One + +.align 4 +#__Len8: + # + # 5. len = 8, start = 0, 16, 32, 48,...208, 224, 240 + # k += 16 + li 5, 0 + li 7, 16 + Load_next_4zetas + MREDUCE_4X 5, 32, 32, V_Z0, V_Z1, V_Z2, V_Z3 + Write_One + li 5, 128 + + Load_next_4zetas + MREDUCE_4X 5, 32, 32, V_Z0, V_Z1, V_Z2, V_Z3 + Write_One + li 5, 256 + + Load_next_4zetas + MREDUCE_4X 5, 32, 32, V_Z0, V_Z1, V_Z2, V_Z3 + Write_One + li 5, 384 + + Load_next_4zetas + MREDUCE_4X 5, 32, 32, V_Z0, V_Z1, V_Z2, V_Z3 + Write_One + + # + # 6. len = 4, start = 0, 8, 16, 24,...232, 240, 248 + # k += 32 + li 15, 4 # loops + mtctr 15 + li 5, 0 + li 7, 8 +.align 4 +ntt_ppc__Len4: + Load_next_4zetas + MREDUCE_4X 5, 16, 16, V_Z0, V_Z1, V_Z2, V_Z3 + Write_Two + addi 5, 5, 64 + + Load_next_4zetas + MREDUCE_4X 5, 16, 16, V_Z0, V_Z1, V_Z2, V_Z3 + Write_Two + addi 5, 5, 64 + + bdnz ntt_ppc__Len4 + + # + # 7. len = 2, start = 0, 4, 8, 12,...244, 248, 252 + # k += 64 + # Update zetas vectors, each vector has 2 zetas + + addi 14, 4, ZETA_NTT_OFFSET64 + + li 15, 4 + mtctr 15 + li 5, 0 + li 7, 4 +.align 4 +ntt_ppc__Len2: + Load_next_4zetas + MREDUCE_4X 5, 16, 16, V_Z0, V_Z1, V_Z2, V_Z3 + Write_Three + addi 5, 5, 64 + + Load_next_4zetas + MREDUCE_4X 5, 16, 16, V_Z0, V_Z1, V_Z2, V_Z3 + Write_Three + addi 5, 5, 64 + + bdnz ntt_ppc__Len2 + + lxv 32+20, 128(1) + lxv 32+21, 144(1) + lxv 32+22, 160(1) + lxv 32+23, 176(1) + lxv 32+24, 192(1) + lxv 32+25, 208(1) + lxv 32+26, 224(1) + lxv 32+27, 240(1) + lxv 32+28, 256(1) + lxv 32+29, 272(1) + lxv 32+30, 288(1) + lxv 32+31, 304(1) + ld 14, 56(1) + ld 15, 64(1) + ld 16, 72(1) + ld 16, 72(1) + ld 17, 80(1) + ld 18, 88(1) + ld 19, 96(1) + ld 20, 104(1) + ld 21, 112(1) + + mtlr 0 + addi 1, 1, 352 + blr + +/* To facilitate single-compilation-unit (SCU) builds, undefine all macros. + * Don't modify by hand -- this is auto-generated by scripts/autogen. */ +#undef V_QINV +#undef V_NMKQ +#undef V_ZETA + +#endif /* MLK_ARITH_BACKEND_PPC64LE_DEFAULT && \ + !MLK_CONFIG_MULTILEVEL_NO_SHARED */ + +/* To facilitate single-compilation-unit (SCU) builds, undefine all macros. + * Don't modify by hand -- this is auto-generated by scripts/autogen. */ +#undef V_QINV +#undef V_NMKQ +#undef V_ZETA diff --git a/mlkem/src/native/ppc64le/src/poly_tomont.S b/mlkem/src/native/ppc64le/src/poly_tomont.S new file mode 100644 index 000000000..eb770a631 --- /dev/null +++ b/mlkem/src/native/ppc64le/src/poly_tomont.S @@ -0,0 +1,165 @@ +/* + * Copyright (c) The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +# +# Copyright 2025- IBM Corp. 
+# +#=================================================================================== +# Written by Danny Tsen +# + +# Poly_tomont: Inplace conversion of all coefficients of a polynomial +# from normal domain to Montgomery domain +# +# Arguments:*r: pointer to input/output polynomial +# + +#include "../../../common.h" +#if defined(MLK_ARITH_BACKEND_PPC64LE_DEFAULT) && \ + !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED) + +#include "consts.h" + +#define V1353 0 +#define V_QINV 2 +#define V_NMKQ 5 + +.machine "any" +.text + +# +# montgomery_reduce +# t = a * QINV +# t = (a - (int32_t)t*_MLKEM_Q) >> 16 +# +#----------------------------------- +# MREDUCE_4X(_v0, _v1, _v2, _v3) +# +.macro MREDUCE_4X _v0 _v1 _v2 _v3 + lxvd2x 32+13, 0, 3 + addi 3, 3, 16 + lxvd2x 32+18, 0, 3 + addi 3, 3, 16 + lxvd2x 32+23, 0, 3 + addi 3, 3, 16 + lxvd2x 32+7, 0, 3 + addi 3, 3, 16 + + vmladduhm 15, 13, V1353, 3 + vmladduhm 20, 18, V1353, 3 + vmladduhm 25, 23, V1353, 3 + vmladduhm 9, 7, V1353, 3 + + vmhraddshs 14, 13, V1353, 3 + vmhraddshs 19, 18, V1353, 3 + vmhraddshs 24, 23, V1353, 3 + vmhraddshs 8, 7, V1353, 3 + + vmladduhm 15, 15, V_QINV, 3 + vmladduhm 20, 20, V_QINV, 3 + vmladduhm 25, 25, V_QINV, 3 + vmladduhm 9, 9, V_QINV, 3 + + vmhraddshs 15, 15, V_NMKQ, 14 + vmhraddshs 20, 20, V_NMKQ, 19 + vmhraddshs 25, 25, V_NMKQ, 24 + vmhraddshs 9, 9, V_NMKQ, 8 + + vsrah \_v0, 15, 4 # >> 1 + vsrah \_v1, 20, 4 # >> 1 + vsrah \_v2, 25, 4 # >> 1 + vsrah \_v3, 9, 4 # >> 1 +.endm + +.macro Write_8X + stxvd2x 32+27, 4, 3 + stxvd2x 32+28, 5, 3 + stxvd2x 32+29, 6, 3 + stxvd2x 32+30, 7, 3 + stxvd2x 32+13, 8, 3 + stxvd2x 32+18, 9, 3 + stxvd2x 32+23, 10, 3 + stxvd2x 32+7, 11, 3 +.endm + +.align 4 +.globl MLK_ASM_NAMESPACE(poly_tomont_ppc) +MLK_ASM_FN_SYMBOL(poly_tomont_ppc) + stdu 1, -320(1) + mflr 0 + + stxv 32+20, 128(1) + stxv 32+21, 144(1) + stxv 32+22, 160(1) + stxv 32+23, 176(1) + stxv 32+24, 192(1) + stxv 32+25, 208(1) + stxv 32+26, 224(1) + stxv 32+27, 240(1) + stxv 32+28, 256(1) + stxv 32+29, 272(1) + stxv 32+30, 288(1) + + lxv 32+V_NMKQ, NQ_OFFSET(4) + lxv 32+V_QINV, QINV_OFFSET(4) + lxv 32+V1353, C1353_OFFSET(4) + + vxor 3, 3, 3 + vspltish 4, 1 + + li 4, -128 + li 5, -112 + li 6, -96 + li 7, -80 + li 8, -64 + li 9, -48 + li 10, -32 + li 11, -16 + + MREDUCE_4X 27, 28, 29, 30 + MREDUCE_4X 13, 18, 23, 7 + Write_8X + + MREDUCE_4X 27, 28, 29, 30 + MREDUCE_4X 13, 18, 23, 7 + Write_8X + + MREDUCE_4X 27, 28, 29, 30 + MREDUCE_4X 13, 18, 23, 7 + Write_8X + + MREDUCE_4X 27, 28, 29, 30 + MREDUCE_4X 13, 18, 23, 7 + Write_8X + + lxv 32+20, 128(1) + lxv 32+21, 144(1) + lxv 32+22, 160(1) + lxv 32+23, 176(1) + lxv 32+24, 192(1) + lxv 32+25, 208(1) + lxv 32+26, 224(1) + lxv 32+27, 240(1) + lxv 32+28, 256(1) + lxv 32+29, 272(1) + lxv 32+30, 288(1) + mtlr 0 + addi 1, 1, 320 + blr + +/* To facilitate single-compilation-unit (SCU) builds, undefine all macros. + * Don't modify by hand -- this is auto-generated by scripts/autogen. */ +#undef V1353 +#undef V_QINV +#undef V_NMKQ + +#endif /* MLK_ARITH_BACKEND_PPC64LE_DEFAULT && \ + !MLK_CONFIG_MULTILEVEL_NO_SHARED */ + +/* To facilitate single-compilation-unit (SCU) builds, undefine all macros. + * Don't modify by hand -- this is auto-generated by scripts/autogen. 
*/ +#undef V1353 +#undef V_QINV +#undef V_NMKQ diff --git a/mlkem/src/native/ppc64le/src/reduce.S b/mlkem/src/native/ppc64le/src/reduce.S new file mode 100644 index 000000000..558410955 --- /dev/null +++ b/mlkem/src/native/ppc64le/src/reduce.S @@ -0,0 +1,228 @@ +/* + * Copyright (c) The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +# +# Copyright 2025- IBM Corp. +# +#=================================================================================== +# Written by Danny Tsen +# + +# +# poly_reduce: Applies Barrett reduction to all coefficients of a polynomial +# for details of the Barrett reduction +# +# Arguments: *r: pointer to input/output polynomial +# + +#include "../../../common.h" +#if defined(MLK_ARITH_BACKEND_PPC64LE_DEFAULT) && \ + !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED) + +#include "consts.h" + +# Barrett reduce constatnts +#define V20159 0 +#define V_25 1 +#define V_26 2 +#define V_MKQ 3 + +.machine "any" +.text + +.macro BREDUCE_4X _v0 _v1 _v2 _v3 + lxvd2x 32+8, 0, 3 + lxvd2x 32+12, 14, 3 + lxvd2x 32+16, 15, 3 + lxvd2x 32+20, 16, 3 + addi 3, 3, 64 + vmulosh 6, 8, V20159 + vmulesh 5, 8, V20159 + vmulosh 11, 12, V20159 + vmulesh 10, 12, V20159 + vmulosh 15, 16, V20159 + vmulesh 14, 16, V20159 + vmulosh 19, 20, V20159 + vmulesh 18, 20, V20159 + xxmrglw 32+4, 32+5, 32+6 + xxmrghw 32+5, 32+5, 32+6 + xxmrglw 32+9, 32+10, 32+11 + xxmrghw 32+10, 32+10, 32+11 + xxmrglw 32+13, 32+14, 32+15 + xxmrghw 32+14, 32+14, 32+15 + xxmrglw 32+17, 32+18, 32+19 + xxmrghw 32+18, 32+18, 32+19 + vadduwm 4, 4, V_25 + vadduwm 5, 5, V_25 + vadduwm 9, 9, V_25 + vadduwm 10, 10, V_25 + vadduwm 13, 13, V_25 + vadduwm 14, 14, V_25 + vadduwm 17, 17, V_25 + vadduwm 18, 18, V_25 + vsraw 4, 4, V_26 + vsraw 5, 5, V_26 + vsraw 9, 9, V_26 + vsraw 10, 10, V_26 + vsraw 13, 13, V_26 + vsraw 14, 14, V_26 + vsraw 17, 17, V_26 + vsraw 18, 18, V_26 + vpkuwum 4, 5, 4 + vsubuhm 4, 7, 4 + vpkuwum 9, 10, 9 + vsubuhm 9, 7, 9 + vpkuwum 13, 14, 13 + vsubuhm 13, 7, 13 + vpkuwum 17, 18, 17 + vsubuhm 17, 7, 17 + vmladduhm \_v0, 4, V_MKQ, 8 + vmladduhm \_v1, 9, V_MKQ, 12 + vmladduhm \_v2, 13, V_MKQ, 16 + vmladduhm \_v3, 17, V_MKQ, 20 +.endm + +.macro Write_8X + stxvd2x 32+21, 4, 3 + stxvd2x 32+22, 5, 3 + stxvd2x 32+23, 6, 3 + stxvd2x 32+24, 7, 3 + stxvd2x 32+4, 8, 3 + stxvd2x 32+9, 9, 3 + stxvd2x 32+13, 10, 3 + stxvd2x 32+17, 11, 3 +.endm + +# +# Conditional addition to get unsigned canonical representative +# +.macro To_unsigned_16 + lxv 32+12, 0(3) + lxv 32+13, 16(3) + lxv 32+14, 32(3) + lxv 32+15, 48(3) + addi 3, 3, 64 + vsrh 1, 12, 10 + vsrh 0, 13, 10 + vsrh 3, 14, 10 + vsrh 2, 15, 10 + vadduhm 7, 12, 11 + vadduhm 8, 13, 11 + vadduhm 5, 14, 11 + vadduhm 6, 15, 11 + vcmpequh 1, 1, 9 + vcmpequh 0, 0, 9 + vcmpequh 3, 3, 9 + vcmpequh 2, 2, 9 + xxsel 32+1, 32+7,32+12, 32+1 + xxsel 32+0, 32+8,32+13, 32+0 + xxsel 32+3, 32+5,32+14, 32+3 + xxsel 32+2, 32+6,32+15, 32+2 + stxv 32+3, -32(3) + stxv 32+2, -16(3) + stxv 32+1, -64(3) + stxv 32+0, -48(3) +.endm + +.align 4 +.globl MLK_ASM_NAMESPACE(reduce_ppc) +MLK_ASM_FN_SYMBOL(reduce_ppc) + stdu 1, -224(1) + mflr 0 + std 14, 96(1) + std 15, 104(1) + std 16, 112(1) + stxv 32+20, 128(1) + stxv 32+21, 144(1) + stxv 32+22, 160(1) + stxv 32+23, 176(1) + stxv 32+24, 192(1) + + vxor 7, 7, 7 + + lxv 32+V_MKQ, Q_OFFSET(4) + lxv 32+V20159, C20159_OFFSET(4) + + vspltisw V_26, 13 + vadduwm V_26, V_26, V_26 + vspltisw 4, 1 + vsubuwm 5, V_26, 4 + vslw V_25, 4, 5 + + li 4, -128 + li 5, -112 + li 6, -96 + li 7, -80 + li 8, -64 + li 9, -48 + li 10, -32 + li 11, -16 + 
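+ # r3-relative load offsets (16/32/48) for the 2nd..4th lxvd2x in BREDUCE_4X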
+ li 14, 16 + li 15, 32 + li 16, 48 + + BREDUCE_4X 21, 22, 23, 24 + BREDUCE_4X 4, 9, 13, 17 + Write_8X + + BREDUCE_4X 21, 22, 23, 24 + BREDUCE_4X 4, 9, 13, 17 + Write_8X + + BREDUCE_4X 21, 22, 23, 24 + BREDUCE_4X 4, 9, 13, 17 + Write_8X + + BREDUCE_4X 21, 22, 23, 24 + BREDUCE_4X 4, 9, 13, 17 + Write_8X + + # + # To unsigned canonical + # +.align 4 + addi 3, 3, -512 + xxspltib 32+9 ,0 + vspltish 10, 15 + vmr 11, V_MKQ + + To_unsigned_16 + To_unsigned_16 + To_unsigned_16 + To_unsigned_16 + To_unsigned_16 + To_unsigned_16 + To_unsigned_16 + To_unsigned_16 + + ld 14, 96(1) + ld 15, 104(1) + ld 16, 112(1) + lxv 32+20, 128(1) + lxv 32+21, 144(1) + lxv 32+22, 160(1) + lxv 32+23, 176(1) + lxv 32+24, 192(1) + mtlr 0 + addi 1, 1, 224 + blr + +/* To facilitate single-compilation-unit (SCU) builds, undefine all macros. + * Don't modify by hand -- this is auto-generated by scripts/autogen. */ +#undef V20159 +#undef V_25 +#undef V_26 +#undef V_MKQ + +#endif /* MLK_ARITH_BACKEND_PPC64LE_DEFAULT && \ + !MLK_CONFIG_MULTILEVEL_NO_SHARED */ + +/* To facilitate single-compilation-unit (SCU) builds, undefine all macros. + * Don't modify by hand -- this is auto-generated by scripts/autogen. */ +#undef V20159 +#undef V_25 +#undef V_26 +#undef V_MKQ diff --git a/test/mk/components.mk b/test/mk/components.mk index cdcc3eb5d..88158f703 100644 --- a/test/mk/components.mk +++ b/test/mk/components.mk @@ -8,6 +8,7 @@ endif SOURCES += $(wildcard mlkem/src/*.c) ifeq ($(OPT),1) SOURCES += $(wildcard mlkem/src/native/aarch64/src/*.[csS]) $(wildcard mlkem/src/native/x86_64/src/*.[csS]) + SOURCES += $(wildcard mlkem/src/native/ppc64le/src/*.[csS]) CFLAGS += -DMLK_CONFIG_USE_NATIVE_BACKEND_ARITH -DMLK_CONFIG_USE_NATIVE_BACKEND_FIPS202 endif From 7eef7dd94e3e1848101d620395a6dbb28b22747a Mon Sep 17 00:00:00 2001 From: Hanno Becker Date: Thu, 11 Sep 2025 09:03:15 +0100 Subject: [PATCH 2/6] autogen: Prepare for rv64 and ppc64le backends This commit prepares scripts/autogen and scripts/cfify for the work-in-progress addition of riscv64 and ppc64le backends. Specifically, simpasm needs to be invoked with the right cross compiler for those architectures, and scripts/cfify needs to accept riscv64 and ppc64le architecture parameters. 
Signed-off-by: Hanno Becker --- scripts/autogen | 112 ++++++++++++++++++++++++++++-------------------- scripts/cfify | 15 ++++++- scripts/simpasm | 10 ++++- 3 files changed, 88 insertions(+), 49 deletions(-) diff --git a/scripts/autogen b/scripts/autogen index 819f00def..74c68b650 100755 --- a/scripts/autogen +++ b/scripts/autogen @@ -1431,6 +1431,10 @@ def gen_monolithic_source_file(dry_run=False): for c in filter(native_arith_x86_64, c_sources): yield f'#include "{c}"' yield "#endif" + yield "#if defined(MLK_SYS_PPC64LE)" + for c in filter(native_arith_ppc64le, c_sources): + yield f'#include "{c}"' + yield "#endif" yield "#endif" yield "" yield "#if defined(MLK_CONFIG_USE_NATIVE_BACKEND_FIPS202)" @@ -1766,6 +1770,10 @@ def update_via_simpasm( source_arch = "aarch64" elif "x86_64" in infile_full: source_arch = "x86_64" + elif "ppc64le" in infile_full: + source_arch = "ppc64le" + elif "riscv64" in infile_full: + source_arch = "riscv64" else: raise Exception(f"Could not detect architecture of source file {infile_full}.") # Check native architecture @@ -1775,7 +1783,14 @@ def update_via_simpasm( native_arch = "x86_64" if native_arch != source_arch: - cross_prefix = f"{source_arch}-unknown-linux-gnu-" + arch_to_cross_prefix = { + "aarch64": "aarch64-unknown-linux-gnu-", + "x86_64": "x86_64-unknown-linux-gnu-", + "ppc64le": "powerpc64le-unknown-linux-gnu-", + "riscv64": "riscv64-unknown-linux-gnu-", + } + + cross_prefix = arch_to_cross_prefix[source_arch] cross_gcc = cross_prefix + "gcc" # Check if cross-compiler is present if shutil.which(cross_gcc) is None: @@ -1788,13 +1803,12 @@ def update_via_simpasm( with tempfile.NamedTemporaryFile(suffix=".S") as tmp: try: # Determine architecture from filename - arch = "aarch64" if "aarch64" in infile_full else "x86_64" cmd = [ "./scripts/simpasm", "--objdump=llvm-objdump", "--cfify", - "--arch=" + arch, + "--arch=" + source_arch, "-i", infile_full, "-o", @@ -2058,49 +2072,55 @@ def synchronize_backends( ), ) - synchronize_backend( - f"dev/aarch64_{ty}/src", - "mlkem/src/native/aarch64/src", - dry_run=dry_run, - delete=delete, - force_cross=force_cross, - no_simplify=no_simplify, - cflags="-Imlkem/src/native/aarch64/src", - ) - synchronize_backend( - "dev/fips202/aarch64/src", - "mlkem/src/fips202/native/aarch64/src", - dry_run=dry_run, - delete=delete, - force_cross=force_cross, - no_simplify=no_simplify, - cflags="-Imlkem/src/fips202/native/aarch64/src -march=armv8.4-a+sha3", - ) - synchronize_backend( - "dev/fips202/aarch64", - "mlkem/src/fips202/native/aarch64", - dry_run=dry_run, - delete=delete, - force_cross=force_cross, - no_simplify=no_simplify, - cflags="-Imlkem/src/fips202/native/aarch64 -march=armv8.4-a+sha3", - ) - synchronize_backend( - "dev/x86_64/src", - "mlkem/src/native/x86_64/src", - dry_run=dry_run, - delete=delete, - force_cross=force_cross, - no_simplify=no_simplify, - # Turn off control-flow protection (CET) explicitly. Newer versions of - # clang turn it on by default and insert endbr64 instructions at every - # global symbol. - # We insert endbr64 instruction manually via the MLK_ASM_FN_SYMBOL - # macro. - # This leads to duplicate endbr64 instructions causing a failure when - # comparing the object code before and after simplification. 
- cflags="-Imlkem/src/native/x86_64/src/ -mavx2 -mbmi2 -msse4 -fcf-protection=none", - ) + # Triples of + # - input backend directory under dev/ + # - output backend directory under mlkem/ + # - cflags + worklist = [ + ( + f"dev/aarch64_{ty}/src", + "mlkem/src/native/aarch64/src", + "-Imlkem/src/native/aarch64/src", + ), + ( + "dev/fips202/aarch64/src", + "mlkem/src/fips202/native/aarch64/src", + "-Imlkem/src/fips202/native/aarch64/src -march=armv8.4-a+sha3", + ), + ( + "dev/fips202/aarch64", + "mlkem/src/fips202/native/aarch64", + "-Imlkem/src/fips202/native/aarch64 -march=armv8.4-a+sha3", + ), + ( + "dev/x86_64/src", + "mlkem/src/native/x86_64/src", + # Turn off control-flow protection (CET) explicitly. Newer versions of + # clang turn it on by default and insert endbr64 instructions at every + # global symbol. + # We insert endbr64 instruction manually via the MLK_ASM_FN_SYMBOL + # macro. + # This leads to duplicate endbr64 instructions causing a failure when + # comparing the object code before and after simplification. + "-Imlkem/src/native/x86_64/src/ -mavx2 -mbmi2 -msse4 -fcf-protection=none", + ), + ( + "dev/ppc64le/src", + "mlkem/src/native/ppc64le/src", + "-Imlkem/src/native/ppc64le/src -mvsx", + ), + ] + + for in_dir, out_dir, cflags in worklist: + synchronize_backend( + in_dir, + out_dir, + dry_run=dry_run, + delete=delete, + force_cross=force_cross, + no_simplify=no_simplify, + cflags=cflags, + ) def adjust_header_guard_for_filename(content, header_file): diff --git a/scripts/cfify b/scripts/cfify index a08d23707..fca0381fd 100755 --- a/scripts/cfify +++ b/scripts/cfify @@ -226,6 +226,19 @@ def add_cfi_directives(text, arch): i += 1 continue + elif arch == "riscv64": + # No special handling of riscv64 for now + pass + elif arch == "ppc64le": + # ppc64le: blr -> .cfi_endproc after blr + match = re.match(r"(\s*)blr\s*$", line, re.IGNORECASE) + if match: + indent = match.group(1) + result.append(line) + result.append(f"{indent}.cfi_endproc") + i += 1 + continue + result.append(line) i += 1 @@ -246,7 +259,7 @@ def main(): ) parser.add_argument( "--arch", - choices=["aarch64", "x86_64"], + choices=["aarch64", "x86_64", "riscv64", "ppc64le"], default="aarch64", help="Target architecture (default: aarch64)", ) diff --git a/scripts/simpasm b/scripts/simpasm index 5afa6bd9a..5a02221d6 100755 --- a/scripts/simpasm +++ b/scripts/simpasm @@ -246,7 +246,7 @@ def simplify(logger, args, asm_input, asm_output=None): logger.debug(f"Using raw global symbol {sym} going forward ...") cmd = [args.objdump, "--disassemble", tmp_objfile0] - if platform.system() == "Darwin": + if platform.system() == "Darwin" and args.arch == "aarch64": cmd += ["--triple=aarch64"] logger.debug(f"Disassembling temporary object file {tmp_objfile0} ...") @@ -255,6 +255,12 @@ def simplify(logger, args, asm_input, asm_output=None): logger.debug("Patching up disassembly ...") simplified = patchup_disasm(disasm, cfify=args.cfify) + # On ppc64le we're using 16 byte alignment + if args.arch == "ppc64le": + align = 16 + else: + align = 4 + autogen_header = [ "", "/*", @@ -264,7 +270,7 @@ def simplify(logger, args, asm_input, asm_output=None): "", "", ".text", - ".balign 4", + f".balign {align}", ] if args.preserve_preprocessor_directives is False: From 94988a18177bf85309a5d0cbed8a0487d97b1c59 Mon Sep 17 00:00:00 2001 From: Hanno Becker Date: Sat, 20 Sep 2025 05:17:22 +0100 Subject: [PATCH 3/6] ppc64le: `untabify` assembly Signed-off-by: Hanno Becker --- dev/ppc64le/src/intt_ppc.S | 1018 ++++++++++---------- 
dev/ppc64le/src/ntt_ppc.S | 636 ++++++------ dev/ppc64le/src/poly_tomont.S | 204 ++-- dev/ppc64le/src/reduce.S | 322 +++---- mlkem/src/native/ppc64le/src/intt_ppc.S | 1018 ++++++++++---------- mlkem/src/native/ppc64le/src/ntt_ppc.S | 636 ++++++------ mlkem/src/native/ppc64le/src/poly_tomont.S | 204 ++-- mlkem/src/native/ppc64le/src/reduce.S | 322 +++---- 8 files changed, 2180 insertions(+), 2180 deletions(-) diff --git a/dev/ppc64le/src/intt_ppc.S b/dev/ppc64le/src/intt_ppc.S index 1f4b48e42..608fc9295 100644 --- a/dev/ppc64le/src/intt_ppc.S +++ b/dev/ppc64le/src/intt_ppc.S @@ -37,177 +37,177 @@ #define V1441 10 .macro Load_4Coeffs start next step - mr 9, \start # j - add 10, 7, 9 # J + len*2 - addi 16, 9, \next - addi 17, 10, \step - addi 18, 16, \next - addi 19, 17, \step - addi 20, 18, \next - addi 21, 19, \step - lxvd2x 32+8, 3, 10 # r[j+len] - lxvd2x 32+12, 3, 17 # r[j+len] - lxvd2x 32+16, 3, 19 # r[j+len] - lxvd2x 32+20, 3, 21 # r[j+len] - xxpermdi 32+8, 32+8, 32+8, 2 - xxpermdi 32+12, 32+12, 32+12, 2 - xxpermdi 32+16, 32+16, 32+16, 2 - xxpermdi 32+20, 32+20, 32+20, 2 - - lxvd2x 32+21, 3, 9 - lxvd2x 32+22, 3, 16 - lxvd2x 32+23, 3, 18 - lxvd2x 32+24, 3, 20 - xxpermdi 32+21, 32+21, 32+21, 2 - xxpermdi 32+22, 32+22, 32+22, 2 - xxpermdi 32+23, 32+23, 32+23, 2 - xxpermdi 32+24, 32+24, 32+24, 2 - - vsubuhm 25, 8, 21 # r[j+len] - t - vsubuhm 26, 12, 22 # r[j+len] - t - vsubuhm 30, 16, 23 # r[j+len] - t - vsubuhm 31, 20, 24 # r[j+len] - t - vadduhm 8, 8, 21 # r[j+len] + t - vadduhm 12, 12, 22 # r[j+len] + t - vadduhm 16, 16, 23 # r[j+len] + t - vadduhm 20, 20, 24 # r[j+len] + t + mr 9, \start # j + add 10, 7, 9 # J + len*2 + addi 16, 9, \next + addi 17, 10, \step + addi 18, 16, \next + addi 19, 17, \step + addi 20, 18, \next + addi 21, 19, \step + lxvd2x 32+8, 3, 10 # r[j+len] + lxvd2x 32+12, 3, 17 # r[j+len] + lxvd2x 32+16, 3, 19 # r[j+len] + lxvd2x 32+20, 3, 21 # r[j+len] + xxpermdi 32+8, 32+8, 32+8, 2 + xxpermdi 32+12, 32+12, 32+12, 2 + xxpermdi 32+16, 32+16, 32+16, 2 + xxpermdi 32+20, 32+20, 32+20, 2 + + lxvd2x 32+21, 3, 9 + lxvd2x 32+22, 3, 16 + lxvd2x 32+23, 3, 18 + lxvd2x 32+24, 3, 20 + xxpermdi 32+21, 32+21, 32+21, 2 + xxpermdi 32+22, 32+22, 32+22, 2 + xxpermdi 32+23, 32+23, 32+23, 2 + xxpermdi 32+24, 32+24, 32+24, 2 + + vsubuhm 25, 8, 21 # r[j+len] - t + vsubuhm 26, 12, 22 # r[j+len] - t + vsubuhm 30, 16, 23 # r[j+len] - t + vsubuhm 31, 20, 24 # r[j+len] - t + vadduhm 8, 8, 21 # r[j+len] + t + vadduhm 12, 12, 22 # r[j+len] + t + vadduhm 16, 16, 23 # r[j+len] + t + vadduhm 20, 20, 24 # r[j+len] + t .endm .macro BREDUCE_4X _v0 _v1 _v2 _v3 - vxor 7, 7, 7 - xxlor 32+3, 6, 6 # V_MKQ - xxlor 32+1, 7, 7 # V_25 - xxlor 32+2, 8, 8 # V_26 - # Multify Odd/Even signed halfword; - # Results word bound by 2^32 in abs value. 
- vmulosh 6, 8, V20159 - vmulesh 5, 8, V20159 - vmulosh 11, 12, V20159 - vmulesh 10, 12, V20159 - vmulosh 15, 16, V20159 - vmulesh 14, 16, V20159 - vmulosh 19, 20, V20159 - vmulesh 18, 20, V20159 - xxmrglw 32+4, 32+5, 32+6 - xxmrghw 32+5, 32+5, 32+6 - xxmrglw 32+9, 32+10, 32+11 - xxmrghw 32+10, 32+10, 32+11 - xxmrglw 32+13, 32+14, 32+15 - xxmrghw 32+14, 32+14, 32+15 - xxmrglw 32+17, 32+18, 32+19 - xxmrghw 32+18, 32+18, 32+19 - vadduwm 4, 4, V_25 - vadduwm 5, 5, V_25 - vadduwm 9, 9, V_25 - vadduwm 10, 10, V_25 - vadduwm 13, 13, V_25 - vadduwm 14, 14, V_25 - vadduwm 17, 17, V_25 - vadduwm 18, 18, V_25 - # Right shift and pack lower halfword, - # results bond to 2^16 in abs value - vsraw 4, 4, V_26 - vsraw 5, 5, V_26 - vsraw 9, 9, V_26 - vsraw 10, 10, V_26 - vsraw 13, 13, V_26 - vsraw 14, 14, V_26 - vsraw 17, 17, V_26 - vsraw 18, 18, V_26 - vpkuwum 4, 5, 4 - vsubuhm 4, 7, 4 - vpkuwum 9, 10, 9 - vsubuhm 9, 7, 9 - vpkuwum 13, 14, 13 - vsubuhm 13, 7, 13 - vpkuwum 17, 18, 17 - vsubuhm 17, 7, 17 - # Modulo multify-Low unsigned halfword; - # results bond to 2^16 * q in abs value. - vmladduhm \_v0, 4, V_MKQ, 8 - vmladduhm \_v1, 9, V_MKQ, 12 - vmladduhm \_v2, 13, V_MKQ, 16 - vmladduhm \_v3, 17, V_MKQ, 20 + vxor 7, 7, 7 + xxlor 32+3, 6, 6 # V_MKQ + xxlor 32+1, 7, 7 # V_25 + xxlor 32+2, 8, 8 # V_26 + # Multify Odd/Even signed halfword; + # Results word bound by 2^32 in abs value. + vmulosh 6, 8, V20159 + vmulesh 5, 8, V20159 + vmulosh 11, 12, V20159 + vmulesh 10, 12, V20159 + vmulosh 15, 16, V20159 + vmulesh 14, 16, V20159 + vmulosh 19, 20, V20159 + vmulesh 18, 20, V20159 + xxmrglw 32+4, 32+5, 32+6 + xxmrghw 32+5, 32+5, 32+6 + xxmrglw 32+9, 32+10, 32+11 + xxmrghw 32+10, 32+10, 32+11 + xxmrglw 32+13, 32+14, 32+15 + xxmrghw 32+14, 32+14, 32+15 + xxmrglw 32+17, 32+18, 32+19 + xxmrghw 32+18, 32+18, 32+19 + vadduwm 4, 4, V_25 + vadduwm 5, 5, V_25 + vadduwm 9, 9, V_25 + vadduwm 10, 10, V_25 + vadduwm 13, 13, V_25 + vadduwm 14, 14, V_25 + vadduwm 17, 17, V_25 + vadduwm 18, 18, V_25 + # Right shift and pack lower halfword, + # results bond to 2^16 in abs value + vsraw 4, 4, V_26 + vsraw 5, 5, V_26 + vsraw 9, 9, V_26 + vsraw 10, 10, V_26 + vsraw 13, 13, V_26 + vsraw 14, 14, V_26 + vsraw 17, 17, V_26 + vsraw 18, 18, V_26 + vpkuwum 4, 5, 4 + vsubuhm 4, 7, 4 + vpkuwum 9, 10, 9 + vsubuhm 9, 7, 9 + vpkuwum 13, 14, 13 + vsubuhm 13, 7, 13 + vpkuwum 17, 18, 17 + vsubuhm 17, 7, 17 + # Modulo multify-Low unsigned halfword; + # results bond to 2^16 * q in abs value. 
+ vmladduhm \_v0, 4, V_MKQ, 8 + vmladduhm \_v1, 9, V_MKQ, 12 + vmladduhm \_v2, 13, V_MKQ, 16 + vmladduhm \_v3, 17, V_MKQ, 20 .endm #----------------------------------- # MREDUCE_4X(len, start, _vz0, _vz1, _vz2, _vz3) # .macro MREDUCE_4X _vz0 _vz1 _vz2 _vz3 _vo0 _vo1 _vo2 _vo3 - # Modular multification bond by 2^16 * q in abs value - vmladduhm 15, 25, \_vz0, 3 - vmladduhm 20, 26, \_vz1, 3 - vmladduhm 27, 30, \_vz2, 3 - vmladduhm 28, 31, \_vz3, 3 - - # Signed multiply-high-round; outputs are bound by 2^15 * q in abs value - vmhraddshs 14, 25, \_vz0, 3 - vmhraddshs 19, 26, \_vz1, 3 - vmhraddshs 24, 30, \_vz2, 3 - vmhraddshs 29, 31, \_vz3, 3 - - vmladduhm 15, 15, V_QINV, 3 - vmladduhm 20, 20, V_QINV, 3 - vmladduhm 25, 27, V_QINV, 3 - vmladduhm 30, 28, V_QINV, 3 - - vmhraddshs 15, 15, V_NMKQ, 14 - vmhraddshs 20, 20, V_NMKQ, 19 - vmhraddshs 25, 25, V_NMKQ, 24 - vmhraddshs 30, 30, V_NMKQ, 29 - - vsrah \_vo0, 15, 4 # >> 1 - vsrah \_vo1, 20, 4 # >> 1 - vsrah \_vo2, 25, 4 # >> 1 - vsrah \_vo3, 30, 4 # >> 1 + # Modular multification bond by 2^16 * q in abs value + vmladduhm 15, 25, \_vz0, 3 + vmladduhm 20, 26, \_vz1, 3 + vmladduhm 27, 30, \_vz2, 3 + vmladduhm 28, 31, \_vz3, 3 + + # Signed multiply-high-round; outputs are bound by 2^15 * q in abs value + vmhraddshs 14, 25, \_vz0, 3 + vmhraddshs 19, 26, \_vz1, 3 + vmhraddshs 24, 30, \_vz2, 3 + vmhraddshs 29, 31, \_vz3, 3 + + vmladduhm 15, 15, V_QINV, 3 + vmladduhm 20, 20, V_QINV, 3 + vmladduhm 25, 27, V_QINV, 3 + vmladduhm 30, 28, V_QINV, 3 + + vmhraddshs 15, 15, V_NMKQ, 14 + vmhraddshs 20, 20, V_NMKQ, 19 + vmhraddshs 25, 25, V_NMKQ, 24 + vmhraddshs 30, 30, V_NMKQ, 29 + + vsrah \_vo0, 15, 4 # >> 1 + vsrah \_vo1, 20, 4 # >> 1 + vsrah \_vo2, 25, 4 # >> 1 + vsrah \_vo3, 30, 4 # >> 1 .endm .macro Set_mont_consts - xxlor 32+5, 0, 0 # V_NMKQ - xxlor 32+2, 2, 2 # V_QINV - xxlor 32+3, 3, 3 # 0 - xxlor 32+4, 4, 4 # 1 + xxlor 32+5, 0, 0 # V_NMKQ + xxlor 32+2, 2, 2 # V_QINV + xxlor 32+3, 3, 3 # 0 + xxlor 32+4, 4, 4 # 1 .endm .macro Load_next_4zetas - lxv 32+V_Z0, 0(14) - lxv 32+V_Z1, 16(14) - lxv 32+V_Z2, 32(14) - lxv 32+V_Z3, 48(14) - addi 14, 14, 64 + lxv 32+V_Z0, 0(14) + lxv 32+V_Z1, 16(14) + lxv 32+V_Z2, 32(14) + lxv 32+V_Z3, 48(14) + addi 14, 14, 64 .endm .macro Write_B4C _vs0 _vs1 _vs2 _vs3 - stxvx \_vs0, 3, 9 - stxvx \_vs1, 3, 16 - stxvx \_vs2, 3, 18 - stxvx \_vs3, 3, 20 + stxvx \_vs0, 3, 9 + stxvx \_vs1, 3, 16 + stxvx \_vs2, 3, 18 + stxvx \_vs3, 3, 20 .endm .macro Write_M4C _vs0 _vs1 _vs2 _vs3 - stxvx \_vs0, 3, 10 - stxvx \_vs1, 3, 17 - stxvx \_vs2, 3, 19 - stxvx \_vs3, 3, 21 + stxvx \_vs0, 3, 10 + stxvx \_vs1, 3, 17 + stxvx \_vs2, 3, 19 + stxvx \_vs3, 3, 21 .endm .macro Reload_4coeffs - lxv 32+25, 0(3) - lxv 32+26, 16(3) - lxv 32+30, 32(3) - lxv 32+31, 48(3) - addi 3, 3, 64 + lxv 32+25, 0(3) + lxv 32+26, 16(3) + lxv 32+30, 32(3) + lxv 32+31, 48(3) + addi 3, 3, 64 .endm .macro MWrite_8X _vs0 _vs1 _vs2 _vs3 _vs4 _vs5 _vs6 _vs7 - stxv \_vs0, -128(3) - stxv \_vs1, -112(3) - stxv \_vs2, -96(3) - stxv \_vs3, -80(3) - stxv \_vs4, -64(3) - stxv \_vs5, -48(3) - stxv \_vs6, -32(3) - stxv \_vs7, -16(3) + stxv \_vs0, -128(3) + stxv \_vs1, -112(3) + stxv \_vs2, -96(3) + stxv \_vs3, -80(3) + stxv \_vs4, -64(3) + stxv \_vs5, -48(3) + stxv \_vs6, -32(3) + stxv \_vs7, -16(3) .endm .macro Write_Len2_4C _vs0 _vs1 _vs2 _vs3 @@ -230,10 +230,10 @@ .endm .macro Write_Len4_4C _vs0 _vs1 _vs2 _vs3 - xxpermdi 10, 10, \_vs0, 3 - xxpermdi 11, 11, \_vs1, 3 - xxpermdi 12, 12, \_vs2, 3 - xxpermdi 13, 13, \_vs3, 3 + xxpermdi 10, 10, \_vs0, 3 + xxpermdi 11, 11, \_vs1, 3 + xxpermdi 12, 
12, \_vs2, 3 + xxpermdi 13, 13, \_vs3, 3 stxvd2x 10, 3, 9 stxvd2x 11, 3, 16 stxvd2x 12, 3, 18 @@ -253,404 +253,404 @@ .align 4 MLK_ASM_FN_SYMBOL(intt_ppc) - stdu 1, -352(1) - mflr 0 - std 14, 56(1) - std 15, 64(1) - std 16, 72(1) - std 17, 80(1) - std 18, 88(1) - std 19, 96(1) - std 20, 104(1) - std 21, 112(1) - stxv 32+20, 128(1) - stxv 32+21, 144(1) - stxv 32+22, 160(1) - stxv 32+23, 176(1) - stxv 32+24, 192(1) - stxv 32+25, 208(1) - stxv 32+26, 224(1) - stxv 32+27, 240(1) - stxv 32+28, 256(1) - stxv 32+29, 272(1) - stxv 32+30, 288(1) - stxv 32+31, 304(1) - - # init vectors and constants - # Setup for Montgomery reduce - lxv 0, 0(4) - - lxv 32+V_QINV, QINV_OFFSET(4) # QINV - xxlxor 32+3, 32+3, 32+3 - vspltish 4, 1 - xxlor 2, 32+2, 32+2 # QINV - xxlor 3, 32+3, 32+3 # 0 - xxlor 4, 32+4, 32+4 # 1 - - # Setup for Barrett reduce - lxv 6, Q_OFFSET(4) # V_MKQ - lxv 32+V20159, C20159_OFFSET(4) # V20159 - lxv 7, 0(4) # V_25 - - #xxspltiw 8, 26 # for power9 and above - vspltisw 8, 13 - vadduwm 8, 8, 8 - xxlor 8, 32+8, 32+8 # V_26 store at vs8 - - vspltisw 9, 1 - vsubuwm 10, 8, 9 # 25 - vslw 9, 9, 10 - xxlor 7, 32+9, 32+9 # V_25 syore at vs7 + stdu 1, -352(1) + mflr 0 + std 14, 56(1) + std 15, 64(1) + std 16, 72(1) + std 17, 80(1) + std 18, 88(1) + std 19, 96(1) + std 20, 104(1) + std 21, 112(1) + stxv 32+20, 128(1) + stxv 32+21, 144(1) + stxv 32+22, 160(1) + stxv 32+23, 176(1) + stxv 32+24, 192(1) + stxv 32+25, 208(1) + stxv 32+26, 224(1) + stxv 32+27, 240(1) + stxv 32+28, 256(1) + stxv 32+29, 272(1) + stxv 32+30, 288(1) + stxv 32+31, 304(1) + + # init vectors and constants + # Setup for Montgomery reduce + lxv 0, 0(4) + + lxv 32+V_QINV, QINV_OFFSET(4) # QINV + xxlxor 32+3, 32+3, 32+3 + vspltish 4, 1 + xxlor 2, 32+2, 32+2 # QINV + xxlor 3, 32+3, 32+3 # 0 + xxlor 4, 32+4, 32+4 # 1 + + # Setup for Barrett reduce + lxv 6, Q_OFFSET(4) # V_MKQ + lxv 32+V20159, C20159_OFFSET(4) # V20159 + lxv 7, 0(4) # V_25 + + #xxspltiw 8, 26 # for power9 and above + vspltisw 8, 13 + vadduwm 8, 8, 8 + xxlor 8, 32+8, 32+8 # V_26 store at vs8 + + vspltisw 9, 1 + vsubuwm 10, 8, 9 # 25 + vslw 9, 9, 10 + xxlor 7, 32+9, 32+9 # V_25 syore at vs7 .align 4 #__Len2: - # - # 1. len = 2, start = 0, 4, 8, 12,...244, 248, 252 - # Update zetas vectors, each vector has 2 zetas - addi 14, 4, IZETA_NTT_OFFSET127 - li 7, 4 - li 15, 4 - mtctr 15 - li 5, 0 + # + # 1. 
len = 2, start = 0, 4, 8, 12,...244, 248, 252 + # Update zetas vectors, each vector has 2 zetas + addi 14, 4, IZETA_NTT_OFFSET127 + li 7, 4 + li 15, 4 + mtctr 15 + li 5, 0 intt_ppc__Loop2: - Load_4Coeffs 5, 16, 16 - BREDUCE_4X 4, 9, 13, 17 - xxlor 10, 32+4, 32+4 - xxlor 11, 32+9, 32+9 - xxlor 12, 32+13, 32+13 - xxlor 13, 32+17, 32+17 - Set_mont_consts - Load_next_4zetas - MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 - Write_Len2_4C 32+13, 32+18, 32+23, 32+28 - - addi 5, 5, 64 - - Load_4Coeffs 5, 16, 16 - BREDUCE_4X 4, 9, 13, 17 - xxlor 10, 32+4, 32+4 - xxlor 11, 32+9, 32+9 - xxlor 12, 32+13, 32+13 - xxlor 13, 32+17, 32+17 - Set_mont_consts - Load_next_4zetas - MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 - Write_Len2_4C 32+13, 32+18, 32+23, 32+28 - addi 5, 5, 64 - bdnz intt_ppc__Loop2 + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + xxlor 10, 32+4, 32+4 + xxlor 11, 32+9, 32+9 + xxlor 12, 32+13, 32+13 + xxlor 13, 32+17, 32+17 + Set_mont_consts + Load_next_4zetas + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 + Write_Len2_4C 32+13, 32+18, 32+23, 32+28 + + addi 5, 5, 64 + + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + xxlor 10, 32+4, 32+4 + xxlor 11, 32+9, 32+9 + xxlor 12, 32+13, 32+13 + xxlor 13, 32+17, 32+17 + Set_mont_consts + Load_next_4zetas + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 + Write_Len2_4C 32+13, 32+18, 32+23, 32+28 + addi 5, 5, 64 + bdnz intt_ppc__Loop2 .align 4 #__Len4: - # - # 2. len = 4, start = 0, 8, 16, 24,...232, 240, 248 - addi 14, 4, IZETA_NTT_OFFSET63 - li 5, 0 - li 7, 8 - li 15, 4 # loops - mtctr 15 + # + # 2. len = 4, start = 0, 8, 16, 24,...232, 240, 248 + addi 14, 4, IZETA_NTT_OFFSET63 + li 5, 0 + li 7, 8 + li 15, 4 # loops + mtctr 15 intt_ppc__Loop4: - Load_4Coeffs 5, 16, 16 - BREDUCE_4X 4, 9, 13, 17 - xxlor 10, 32+4, 32+4 - xxlor 11, 32+9, 32+9 - xxlor 12, 32+13, 32+13 - xxlor 13, 32+17, 32+17 - Set_mont_consts - Load_next_4zetas - MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 - Write_Len4_4C 32+13, 32+18, 32+23, 32+28 - addi 5, 5, 64 - - Load_4Coeffs 5, 16, 16 - BREDUCE_4X 4, 9, 13, 17 - xxlor 10, 32+4, 32+4 - xxlor 11, 32+9, 32+9 - xxlor 12, 32+13, 32+13 - xxlor 13, 32+17, 32+17 - Set_mont_consts - Load_next_4zetas - MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 - Write_Len4_4C 32+13, 32+18, 32+23, 32+28 - addi 5, 5, 64 - bdnz intt_ppc__Loop4 + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + xxlor 10, 32+4, 32+4 + xxlor 11, 32+9, 32+9 + xxlor 12, 32+13, 32+13 + xxlor 13, 32+17, 32+17 + Set_mont_consts + Load_next_4zetas + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 + Write_Len4_4C 32+13, 32+18, 32+23, 32+28 + addi 5, 5, 64 + + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + xxlor 10, 32+4, 32+4 + xxlor 11, 32+9, 32+9 + xxlor 12, 32+13, 32+13 + xxlor 13, 32+17, 32+17 + Set_mont_consts + Load_next_4zetas + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 + Write_Len4_4C 32+13, 32+18, 32+23, 32+28 + addi 5, 5, 64 + bdnz intt_ppc__Loop4 .align 4 #__Len8: - # 3. 
len = 8, start = 0, 16, 32, 48,...208, 224, 240 - #addi 14, 14, 512 - li 7, 16 - li 5, 0 - - Load_4Coeffs 5, 32, 32 - BREDUCE_4X 4, 9, 13, 17 - Write_B4C 32+4, 32+9, 32+13, 32+17 - Set_mont_consts - Load_next_4zetas - MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 - Write_M4C 32+13, 32+18, 32+23, 32+28 - li 5, 128 - - Load_4Coeffs 5, 32, 32 - BREDUCE_4X 4, 9, 13, 17 - Write_B4C 32+4, 32+9, 32+13, 32+17 - Set_mont_consts - Load_next_4zetas - MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 - Write_M4C 32+13, 32+18, 32+23, 32+28 - li 5, 256 - - Load_4Coeffs 5, 32, 32 - BREDUCE_4X 4, 9, 13, 17 - Write_B4C 32+4, 32+9, 32+13, 32+17 - Set_mont_consts - Load_next_4zetas - MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 - Write_M4C 32+13, 32+18, 32+23, 32+28 - li 5, 384 - - Load_4Coeffs 5, 32, 32 - BREDUCE_4X 4, 9, 13, 17 - Write_B4C 32+4, 32+9, 32+13, 32+17 - Set_mont_consts - Load_next_4zetas - MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 - Write_M4C 32+13, 32+18, 32+23, 32+28 + # 3. len = 8, start = 0, 16, 32, 48,...208, 224, 240 + #addi 14, 14, 512 + li 7, 16 + li 5, 0 + + Load_4Coeffs 5, 32, 32 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + Load_next_4zetas + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + li 5, 128 + + Load_4Coeffs 5, 32, 32 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + Load_next_4zetas + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + li 5, 256 + + Load_4Coeffs 5, 32, 32 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + Load_next_4zetas + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + li 5, 384 + + Load_4Coeffs 5, 32, 32 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + Load_next_4zetas + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 .align 4 #__Len16: - # - # 4. len = 16, start = 0, 32, 64,,...160, 192, 224 - #addi 14, 14, 768 - li 5, 0 - li 7, 32 - - Load_4Coeffs 5, 64, 64 - BREDUCE_4X 4, 9, 13, 17 - Write_B4C 32+4, 32+9, 32+13, 32+17 - Set_mont_consts - Load_next_4zetas - MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 - Write_M4C 32+13, 32+18, 32+23, 32+28 + # + # 4. 
len = 16, start = 0, 32, 64,,...160, 192, 224 + #addi 14, 14, 768 + li 5, 0 + li 7, 32 + + Load_4Coeffs 5, 64, 64 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + Load_next_4zetas + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 li 5, 16 - Load_4Coeffs 5, 64, 64 - BREDUCE_4X 4, 9, 13, 17 - Write_B4C 32+4, 32+9, 32+13, 32+17 - Set_mont_consts - addi 14, 14, -64 - Load_next_4zetas - MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 - Write_M4C 32+13, 32+18, 32+23, 32+28 + Load_4Coeffs 5, 64, 64 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + addi 14, 14, -64 + Load_next_4zetas + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 li 5, 256 - Load_4Coeffs 5, 64, 64 - BREDUCE_4X 4, 9, 13, 17 - Write_B4C 32+4, 32+9, 32+13, 32+17 - Set_mont_consts - Load_next_4zetas - MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 - Write_M4C 32+13, 32+18, 32+23, 32+28 + Load_4Coeffs 5, 64, 64 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + Load_next_4zetas + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 li 5, 272 - Load_4Coeffs 5, 64, 64 - BREDUCE_4X 4, 9, 13, 17 - Write_B4C 32+4, 32+9, 32+13, 32+17 - Set_mont_consts - addi 14, 14, -64 - Load_next_4zetas + Load_4Coeffs 5, 64, 64 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + addi 14, 14, -64 + Load_next_4zetas MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 - Write_M4C 32+13, 32+18, 32+23, 32+28 + Write_M4C 32+13, 32+18, 32+23, 32+28 .align 4 #__Len32: # # 5. len = 32, start = 0, 64, 128, 192 - #addi 14, 14, 896 - li 5, 0 - li 7, 64 - - Load_4Coeffs 5, 16, 16 - BREDUCE_4X 4, 9, 13, 17 - Write_B4C 32+4, 32+9, 32+13, 32+17 - Set_mont_consts - lvx V_ZETA, 0, 14 + #addi 14, 14, 896 + li 5, 0 + li 7, 64 + + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + lvx V_ZETA, 0, 14 addi 14, 14, 16 - MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 - Write_M4C 32+13, 32+18, 32+23, 32+28 - li 5, 128 - - Load_4Coeffs 5, 16, 16 - BREDUCE_4X 4, 9, 13, 17 - Write_B4C 32+4, 32+9, 32+13, 32+17 - Set_mont_consts - lvx V_ZETA, 0, 14 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + li 5, 128 + + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + lvx V_ZETA, 0, 14 addi 14, 14, 16 - MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 - Write_M4C 32+13, 32+18, 32+23, 32+28 - li 5, 256 - - Load_4Coeffs 5, 16, 16 - BREDUCE_4X 4, 9, 13, 17 - Write_B4C 32+4, 32+9, 32+13, 32+17 - Set_mont_consts - lvx V_ZETA, 0, 14 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + li 5, 256 + + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + lvx V_ZETA, 0, 14 addi 14, 14, 16 - MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 - Write_M4C 32+13, 32+18, 32+23, 32+28 - li 5, 384 - - Load_4Coeffs 5, 16, 16 - BREDUCE_4X 4, 9, 13, 17 - Write_B4C 32+4, 32+9, 32+13, 32+17 - Set_mont_consts - lvx V_ZETA, 0, 14 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + li 5, 384 + + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + lvx V_ZETA, 0, 14 addi 14, 14, 16 - 
MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 - Write_M4C 32+13, 32+18, 32+23, 32+28 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 .align 4 #__Len64: - # - # 6. len = 64, start = 0, 128 - #addi 14, 14, 960 - li 5, 0 - li 7, 128 - Load_4Coeffs 5, 16, 16 - BREDUCE_4X 4, 9, 13, 17 - Write_B4C 32+4, 32+9, 32+13, 32+17 - Set_mont_consts - lvx V_ZETA, 0, 14 - addi 14, 14, 16 - MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 - Write_M4C 32+13, 32+18, 32+23, 32+28 - li 5, 64 - - Load_4Coeffs 5, 16, 16 - BREDUCE_4X 4, 9, 13, 17 - Write_B4C 32+4, 32+9, 32+13, 32+17 - Set_mont_consts - lxv 32+10, -16(14) - MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 - Write_M4C 32+13, 32+18, 32+23, 32+28 - li 5, 256 - - Load_4Coeffs 5, 16, 16 - BREDUCE_4X 4, 9, 13, 17 - Write_B4C 32+4, 32+9, 32+13, 32+17 - Set_mont_consts - lvx V_ZETA, 0, 14 - addi 14, 14, 16 - MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 - Write_M4C 32+13, 32+18, 32+23, 32+28 - li 5, 320 - - Load_4Coeffs 5, 16, 16 - BREDUCE_4X 4, 9, 13, 17 - Write_B4C 32+4, 32+9, 32+13, 32+17 - Set_mont_consts - lxv 32+10, -16(14) - MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 - Write_M4C 32+13, 32+18, 32+23, 32+28 + # + # 6. len = 64, start = 0, 128 + #addi 14, 14, 960 + li 5, 0 + li 7, 128 + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + li 5, 64 + + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + lxv 32+10, -16(14) + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + li 5, 256 + + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + li 5, 320 + + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + lxv 32+10, -16(14) + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 .align 4 #__Len128: - # 7. len = 128, start = 0 - # - #addi 14, 14, 992 - li 5, 0 # start - li 7, 256 # len * 2 - - Load_4Coeffs 5, 16, 16 - BREDUCE_4X 4, 9, 13, 17 - Write_B4C 32+4, 32+9, 32+13, 32+17 - Set_mont_consts - lvx V_ZETA, 0, 14 - xxlor 9, 32+10, 32+10 - MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 - Write_M4C 32+13, 32+18, 32+23, 32+28 - li 5, 64 - - Load_4Coeffs 5, 16, 16 - BREDUCE_4X 4, 9, 13, 17 - Write_B4C 32+4, 32+9, 32+13, 32+17 - Set_mont_consts - xxlor 32+10, 9, 9 - MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 - Write_M4C 32+13, 32+18, 32+23, 32+28 - li 5, 128 - - Load_4Coeffs 5, 16, 16 - BREDUCE_4X 4, 9, 13, 17 - Write_B4C 32+4, 32+9, 32+13, 32+17 - Set_mont_consts - xxlor 32+10, 9, 9 - MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 - Write_M4C 32+13, 32+18, 32+23, 32+28 - li 5, 192 - - Load_4Coeffs 5, 16, 16 - BREDUCE_4X 4, 9, 13, 17 - Write_B4C 32+4, 32+9, 32+13, 32+17 - Set_mont_consts - xxlor 32+10, 9, 9 - MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 - Write_M4C 32+13, 32+18, 32+23, 32+28 + # 7. 
len = 128, start = 0 + # + #addi 14, 14, 992 + li 5, 0 # start + li 7, 256 # len * 2 + + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + lvx V_ZETA, 0, 14 + xxlor 9, 32+10, 32+10 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + li 5, 64 + + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + xxlor 32+10, 9, 9 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + li 5, 128 + + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + xxlor 32+10, 9, 9 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + li 5, 192 + + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + xxlor 32+10, 9, 9 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 .align 4 - # - # Montgomery reduce loops with constant 1441 - # - addi 14, 4, C1441_OFFSET - lvx V1441, 0, 14 - - Reload_4coeffs - MREDUCE_4X V1441, V1441, V1441, V1441, 6, 7, 8, 9 - Reload_4coeffs - MREDUCE_4X V1441, V1441, V1441, V1441, 13, 18, 23, 28 - MWrite_8X 32+6, 32+7, 32+8, 32+9, 32+13, 32+18, 32+23, 32+28 - - Reload_4coeffs - MREDUCE_4X V1441, V1441, V1441, V1441, 6, 7, 8, 9 - Reload_4coeffs - MREDUCE_4X V1441, V1441, V1441, V1441, 13, 18, 23, 28 - MWrite_8X 32+6, 32+7, 32+8, 32+9, 32+13, 32+18, 32+23, 32+28 - - Reload_4coeffs - MREDUCE_4X V1441, V1441, V1441, V1441, 6, 7, 8, 9 - Reload_4coeffs - MREDUCE_4X V1441, V1441, V1441, V1441, 13, 18, 23, 28 - MWrite_8X 32+6, 32+7, 32+8, 32+9, 32+13, 32+18, 32+23, 32+28 - - Reload_4coeffs - MREDUCE_4X V1441, V1441, V1441, V1441, 6, 7, 8, 9 - Reload_4coeffs - MREDUCE_4X V1441, V1441, V1441, V1441, 13, 18, 23, 28 - MWrite_8X 32+6, 32+7, 32+8, 32+9, 32+13, 32+18, 32+23, 32+28 - - lxv 32+20, 128(1) - lxv 32+21, 144(1) - lxv 32+22, 160(1) - lxv 32+23, 176(1) - lxv 32+24, 192(1) - lxv 32+25, 208(1) - lxv 32+26, 224(1) - lxv 32+27, 240(1) - lxv 32+28, 256(1) - lxv 32+29, 272(1) - lxv 32+30, 288(1) - lxv 32+31, 304(1) - ld 14, 56(1) - ld 15, 64(1) - ld 16, 72(1) - ld 16, 72(1) - ld 17, 80(1) - ld 18, 88(1) - ld 19, 96(1) - ld 20, 104(1) - ld 21, 112(1) - - mtlr 0 - addi 1, 1, 352 - blr + # + # Montgomery reduce loops with constant 1441 + # + addi 14, 4, C1441_OFFSET + lvx V1441, 0, 14 + + Reload_4coeffs + MREDUCE_4X V1441, V1441, V1441, V1441, 6, 7, 8, 9 + Reload_4coeffs + MREDUCE_4X V1441, V1441, V1441, V1441, 13, 18, 23, 28 + MWrite_8X 32+6, 32+7, 32+8, 32+9, 32+13, 32+18, 32+23, 32+28 + + Reload_4coeffs + MREDUCE_4X V1441, V1441, V1441, V1441, 6, 7, 8, 9 + Reload_4coeffs + MREDUCE_4X V1441, V1441, V1441, V1441, 13, 18, 23, 28 + MWrite_8X 32+6, 32+7, 32+8, 32+9, 32+13, 32+18, 32+23, 32+28 + + Reload_4coeffs + MREDUCE_4X V1441, V1441, V1441, V1441, 6, 7, 8, 9 + Reload_4coeffs + MREDUCE_4X V1441, V1441, V1441, V1441, 13, 18, 23, 28 + MWrite_8X 32+6, 32+7, 32+8, 32+9, 32+13, 32+18, 32+23, 32+28 + + Reload_4coeffs + MREDUCE_4X V1441, V1441, V1441, V1441, 6, 7, 8, 9 + Reload_4coeffs + MREDUCE_4X V1441, V1441, V1441, V1441, 13, 18, 23, 28 + MWrite_8X 32+6, 32+7, 32+8, 32+9, 32+13, 32+18, 32+23, 32+28 + + lxv 32+20, 128(1) + lxv 32+21, 144(1) + lxv 32+22, 160(1) + lxv 32+23, 176(1) + lxv 32+24, 192(1) + lxv 32+25, 208(1) + lxv 32+26, 224(1) + lxv 32+27, 240(1) + lxv 32+28, 256(1) + lxv 32+29, 272(1) + lxv 32+30, 288(1) 
+ lxv 32+31, 304(1) + ld 14, 56(1) + ld 15, 64(1) + ld 16, 72(1) + ld 16, 72(1) + ld 17, 80(1) + ld 18, 88(1) + ld 19, 96(1) + ld 20, 104(1) + ld 21, 112(1) + + mtlr 0 + addi 1, 1, 352 + blr /* To facilitate single-compilation-unit (SCU) builds, undefine all macros. * Don't modify by hand -- this is auto-generated by scripts/autogen. */ diff --git a/dev/ppc64le/src/ntt_ppc.S b/dev/ppc64le/src/ntt_ppc.S index 5bc1c34b8..33413d956 100644 --- a/dev/ppc64le/src/ntt_ppc.S +++ b/dev/ppc64le/src/ntt_ppc.S @@ -19,10 +19,10 @@ #define V_QINV 2 #define V_NMKQ 5 -#define V_Z0 7 -#define V_Z1 8 -#define V_Z2 9 -#define V_Z3 10 +#define V_Z0 7 +#define V_Z1 8 +#define V_Z2 9 +#define V_Z3 10 #define V_ZETA 10 .machine "any" @@ -37,121 +37,121 @@ # MREDUCE_4X(start, _vz0, _vz1, _vz2, _vz3) # .macro MREDUCE_4X start next step _vz0 _vz1 _vz2 _vz3 - mr 9, \start - add 10, 7, 9 # J + len*2 - addi 16, 9, \next - addi 17, 10, \step - addi 18, 16, \next - addi 19, 17, \step - addi 20, 18, \next - addi 21, 19, \step - lxvd2x 32+13, 3, 10 # r[j+len] - lxvd2x 32+18, 3, 17 # r[j+len] - lxvd2x 32+23, 3, 19 # r[j+len] - lxvd2x 32+28, 3, 21 # r[j+len] - xxpermdi 32+13, 32+13, 32+13, 2 - xxpermdi 32+18, 32+18, 32+18, 2 - xxpermdi 32+23, 32+23, 32+23, 2 - xxpermdi 32+28, 32+28, 32+28, 2 - - # fqmul = zeta * coefficient - # Modular multification bond by 2^16 * q in abs value - vmladduhm 15, 13, \_vz0, 3 - vmladduhm 20, 18, \_vz1, 3 - vmladduhm 25, 23, \_vz2, 3 - vmladduhm 30, 28, \_vz3, 3 - - # Signed multiply-high-round; outputs are bound by 2^15 * q in abs value - vmhraddshs 14, 13, \_vz0, 3 - vmhraddshs 19, 18, \_vz1, 3 - vmhraddshs 24, 23, \_vz2, 3 - vmhraddshs 29, 28, \_vz3, 3 - - vmladduhm 15, 15, V_QINV, 3 - vmladduhm 20, 20, V_QINV, 3 - vmladduhm 25, 25, V_QINV, 3 - vmladduhm 30, 30, V_QINV, 3 - - vmhraddshs 15, 15, V_NMKQ, 14 - vmhraddshs 20, 20, V_NMKQ, 19 - vmhraddshs 25, 25, V_NMKQ, 24 - vmhraddshs 30, 30, V_NMKQ, 29 - - vsrah 13, 15, 4 # >> 1 - vsrah 18, 20, 4 # >> 1 - vsrah 23, 25, 4 # >> 1 - vsrah 28, 30, 4 # >> 1 - - lxvd2x 32+12, 3, 9 # r[j] - lxvd2x 32+17, 3, 16 # r[j] - lxvd2x 32+22, 3, 18 # r[j] - lxvd2x 32+27, 3, 20 # r[j] - xxpermdi 32+12, 32+12, 32+12, 2 - xxpermdi 32+17, 32+17, 32+17, 2 - xxpermdi 32+22, 32+22, 32+22, 2 - xxpermdi 32+27, 32+27, 32+27, 2 - - # Since the result of the Montgomery multiplication is bounded - # by q in absolute value. 
- # Finally to complete the final update of the results with add/sub - vsubuhm 16, 12, 13 # r - t - vadduhm 15, 13, 12 # r + t - vsubuhm 21, 17, 18 # r - t - vadduhm 20, 18, 17 # r + t - vsubuhm 26, 22, 23 # r - t - vadduhm 25, 23, 22 # r + t - vsubuhm 31, 27, 28 # r - t - vadduhm 30, 28, 27 # r + t + mr 9, \start + add 10, 7, 9 # J + len*2 + addi 16, 9, \next + addi 17, 10, \step + addi 18, 16, \next + addi 19, 17, \step + addi 20, 18, \next + addi 21, 19, \step + lxvd2x 32+13, 3, 10 # r[j+len] + lxvd2x 32+18, 3, 17 # r[j+len] + lxvd2x 32+23, 3, 19 # r[j+len] + lxvd2x 32+28, 3, 21 # r[j+len] + xxpermdi 32+13, 32+13, 32+13, 2 + xxpermdi 32+18, 32+18, 32+18, 2 + xxpermdi 32+23, 32+23, 32+23, 2 + xxpermdi 32+28, 32+28, 32+28, 2 + + # fqmul = zeta * coefficient + # Modular multification bond by 2^16 * q in abs value + vmladduhm 15, 13, \_vz0, 3 + vmladduhm 20, 18, \_vz1, 3 + vmladduhm 25, 23, \_vz2, 3 + vmladduhm 30, 28, \_vz3, 3 + + # Signed multiply-high-round; outputs are bound by 2^15 * q in abs value + vmhraddshs 14, 13, \_vz0, 3 + vmhraddshs 19, 18, \_vz1, 3 + vmhraddshs 24, 23, \_vz2, 3 + vmhraddshs 29, 28, \_vz3, 3 + + vmladduhm 15, 15, V_QINV, 3 + vmladduhm 20, 20, V_QINV, 3 + vmladduhm 25, 25, V_QINV, 3 + vmladduhm 30, 30, V_QINV, 3 + + vmhraddshs 15, 15, V_NMKQ, 14 + vmhraddshs 20, 20, V_NMKQ, 19 + vmhraddshs 25, 25, V_NMKQ, 24 + vmhraddshs 30, 30, V_NMKQ, 29 + + vsrah 13, 15, 4 # >> 1 + vsrah 18, 20, 4 # >> 1 + vsrah 23, 25, 4 # >> 1 + vsrah 28, 30, 4 # >> 1 + + lxvd2x 32+12, 3, 9 # r[j] + lxvd2x 32+17, 3, 16 # r[j] + lxvd2x 32+22, 3, 18 # r[j] + lxvd2x 32+27, 3, 20 # r[j] + xxpermdi 32+12, 32+12, 32+12, 2 + xxpermdi 32+17, 32+17, 32+17, 2 + xxpermdi 32+22, 32+22, 32+22, 2 + xxpermdi 32+27, 32+27, 32+27, 2 + + # Since the result of the Montgomery multiplication is bounded + # by q in absolute value. 
+ # Finally to complete the final update of the results with add/sub + vsubuhm 16, 12, 13 # r - t + vadduhm 15, 13, 12 # r + t + vsubuhm 21, 17, 18 # r - t + vadduhm 20, 18, 17 # r + t + vsubuhm 26, 22, 23 # r - t + vadduhm 25, 23, 22 # r + t + vsubuhm 31, 27, 28 # r - t + vadduhm 30, 28, 27 # r + t .endm .macro Write_One - stxvx 32+15, 3, 9 - stxvx 32+16, 3, 10 - stxvx 32+20, 3, 16 - stxvx 32+21, 3, 17 - stxvx 32+25, 3, 18 - stxvx 32+26, 3, 19 - stxvx 32+30, 3, 20 - stxvx 32+31, 3, 21 + stxvx 32+15, 3, 9 + stxvx 32+16, 3, 10 + stxvx 32+20, 3, 16 + stxvx 32+21, 3, 17 + stxvx 32+25, 3, 18 + stxvx 32+26, 3, 19 + stxvx 32+30, 3, 20 + stxvx 32+31, 3, 21 .endm .macro Write_Two - xxpermdi 32+17, 32+16, 32+15, 3 - xxpermdi 32+22, 32+21, 32+20, 3 - xxpermdi 32+27, 32+26, 32+25, 3 - xxpermdi 32+29, 32+31, 32+30, 3 - - stxvx 32+17, 3, 9 - stxvx 32+22, 3, 16 - stxvx 32+27, 3, 18 - stxvx 32+29, 3, 20 + xxpermdi 32+17, 32+16, 32+15, 3 + xxpermdi 32+22, 32+21, 32+20, 3 + xxpermdi 32+27, 32+26, 32+25, 3 + xxpermdi 32+29, 32+31, 32+30, 3 + + stxvx 32+17, 3, 9 + stxvx 32+22, 3, 16 + stxvx 32+27, 3, 18 + stxvx 32+29, 3, 20 .endm .macro Write_Three - xxmrglw 32+14, 32+16, 32+15 - xxmrghw 32+13, 32+16, 32+15 - xxpermdi 32+17, 32+13, 32+14, 3 - xxmrglw 32+19, 32+21, 32+20 - xxmrghw 32+18, 32+21, 32+20 - xxpermdi 32+22, 32+18, 32+19, 3 - xxmrglw 32+14, 32+26, 32+25 - xxmrghw 32+13, 32+26, 32+25 - xxpermdi 32+27, 32+13, 32+14, 3 - xxmrglw 32+24, 32+31, 32+30 - xxmrghw 32+23, 32+31, 32+30 - xxpermdi 32+29, 32+23, 32+24, 3 - stxvx 32+17, 3, 9 - stxvx 32+22, 3, 16 - stxvx 32+27, 3, 18 - stxvx 32+29, 3, 20 + xxmrglw 32+14, 32+16, 32+15 + xxmrghw 32+13, 32+16, 32+15 + xxpermdi 32+17, 32+13, 32+14, 3 + xxmrglw 32+19, 32+21, 32+20 + xxmrghw 32+18, 32+21, 32+20 + xxpermdi 32+22, 32+18, 32+19, 3 + xxmrglw 32+14, 32+26, 32+25 + xxmrghw 32+13, 32+26, 32+25 + xxpermdi 32+27, 32+13, 32+14, 3 + xxmrglw 32+24, 32+31, 32+30 + xxmrghw 32+23, 32+31, 32+30 + xxpermdi 32+29, 32+23, 32+24, 3 + stxvx 32+17, 3, 9 + stxvx 32+22, 3, 16 + stxvx 32+27, 3, 18 + stxvx 32+29, 3, 20 .endm .macro Load_next_4zetas - lxv 32+V_Z0, 0(14) - lxv 32+V_Z1, 16(14) - lxv 32+V_Z2, 32(14) - lxv 32+V_Z3, 48(14) - addi 14, 14, 64 + lxv 32+V_Z0, 0(14) + lxv 32+V_Z1, 16(14) + lxv 32+V_Z2, 32(14) + lxv 32+V_Z3, 48(14) + addi 14, 14, 64 .endm # @@ -161,241 +161,241 @@ .align 4 MLK_ASM_FN_SYMBOL(ntt_ppc) - stdu 1, -352(1) - mflr 0 - std 14, 56(1) - std 15, 64(1) - std 16, 72(1) - std 17, 80(1) - std 18, 88(1) - std 19, 96(1) - std 20, 104(1) - std 21, 112(1) - stxv 32+20, 128(1) - stxv 32+21, 144(1) - stxv 32+22, 160(1) - stxv 32+23, 176(1) - stxv 32+24, 192(1) - stxv 32+25, 208(1) - stxv 32+26, 224(1) - stxv 32+27, 240(1) - stxv 32+28, 256(1) - stxv 32+29, 272(1) - stxv 32+30, 288(1) - stxv 32+31, 304(1) - - # get MLKEM_Q - lvx V_NMKQ,0,4 - - # zetas array - addi 14, 4, ZETA_NTT_OFFSET - - vxor 3, 3, 3 - vspltish 4, 1 - - lxv 32+V_QINV, QINV_OFFSET(4) + stdu 1, -352(1) + mflr 0 + std 14, 56(1) + std 15, 64(1) + std 16, 72(1) + std 17, 80(1) + std 18, 88(1) + std 19, 96(1) + std 20, 104(1) + std 21, 112(1) + stxv 32+20, 128(1) + stxv 32+21, 144(1) + stxv 32+22, 160(1) + stxv 32+23, 176(1) + stxv 32+24, 192(1) + stxv 32+25, 208(1) + stxv 32+26, 224(1) + stxv 32+27, 240(1) + stxv 32+28, 256(1) + stxv 32+29, 272(1) + stxv 32+30, 288(1) + stxv 32+31, 304(1) + + # get MLKEM_Q + lvx V_NMKQ,0,4 + + # zetas array + addi 14, 4, ZETA_NTT_OFFSET + + vxor 3, 3, 3 + vspltish 4, 1 + + lxv 32+V_QINV, QINV_OFFSET(4) .align 4 #__Len128: - # - # Compute coefficients of the NTT 
based on the following loop. - # for (len = 128; len ≥ 2; len = len/2) - # - # 1. len = 128, start = 0 - # - li 5, 0 # start - li 7, 256 # len * 2 - lvx V_ZETA, 0, 14 - addi 14, 14, 16 - - MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA - Write_One - li 5, 64 - MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA - Write_One - li 5, 128 - MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA - Write_One - li 5, 192 - MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA - Write_One + # + # Compute coefficients of the NTT based on the following loop. + # for (len = 128; len ≥ 2; len = len/2) + # + # 1. len = 128, start = 0 + # + li 5, 0 # start + li 7, 256 # len * 2 + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + + MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + Write_One + li 5, 64 + MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + Write_One + li 5, 128 + MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + Write_One + li 5, 192 + MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + Write_One .align 4 #__Len64: - # - # 2. len = 64, start = 0, 128 - # k += 2 - li 5, 0 - li 7, 128 - lvx V_ZETA, 0, 14 - addi 14, 14, 16 - MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA - Write_One - li 5, 64 - MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA - Write_One - li 5, 256 - - lvx V_ZETA, 0, 14 - addi 14, 14, 16 - MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA - Write_One - li 5, 320 - MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA - Write_One + # + # 2. len = 64, start = 0, 128 + # k += 2 + li 5, 0 + li 7, 128 + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + Write_One + li 5, 64 + MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + Write_One + li 5, 256 + + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + Write_One + li 5, 320 + MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + Write_One .align 4 #__Len32: - # - # 3. len = 32, start = 0, 64, 128, 192 - # k += 4 - li 5, 0 - li 7, 64 - lvx V_ZETA, 0, 14 - addi 14, 14, 16 - MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA - Write_One - #li 5, 64 - li 5, 128 - - lvx V_ZETA, 0, 14 - addi 14, 14, 16 - MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA - Write_One - #li 5, 128 - li 5, 256 - - lvx V_ZETA, 0, 14 - addi 14, 14, 16 - MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA - Write_One - #li 5, 192 - li 5, 384 - - lvx V_ZETA, 0, 14 - addi 14, 14, 16 - MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA - Write_One + # + # 3. len = 32, start = 0, 64, 128, 192 + # k += 4 + li 5, 0 + li 7, 64 + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + Write_One + #li 5, 64 + li 5, 128 + + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + Write_One + #li 5, 128 + li 5, 256 + + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + Write_One + #li 5, 192 + li 5, 384 + + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + Write_One .align 4 #__Len16: - # - # 4. len = 16, start = 0, 32, 64,,...160, 192, 224 - # k += 8 - li 5, 0 - li 7, 32 - Load_next_4zetas - MREDUCE_4X 5, 64, 64, V_Z0, V_Z1, V_Z2, V_Z3 - Write_One - li 5, 16 - MREDUCE_4X 5, 64, 64, V_Z0, V_Z1, V_Z2, V_Z3 - Write_One - - Load_next_4zetas - li 5, 256 - MREDUCE_4X 5, 64, 64, V_Z0, V_Z1, V_Z2, V_Z3 - Write_One - li 5, 272 - MREDUCE_4X 5, 64, 64, V_Z0, V_Z1, V_Z2, V_Z3 - Write_One + # + # 4. 
len = 16, start = 0, 32, 64,,...160, 192, 224 + # k += 8 + li 5, 0 + li 7, 32 + Load_next_4zetas + MREDUCE_4X 5, 64, 64, V_Z0, V_Z1, V_Z2, V_Z3 + Write_One + li 5, 16 + MREDUCE_4X 5, 64, 64, V_Z0, V_Z1, V_Z2, V_Z3 + Write_One + + Load_next_4zetas + li 5, 256 + MREDUCE_4X 5, 64, 64, V_Z0, V_Z1, V_Z2, V_Z3 + Write_One + li 5, 272 + MREDUCE_4X 5, 64, 64, V_Z0, V_Z1, V_Z2, V_Z3 + Write_One .align 4 #__Len8: - # - # 5. len = 8, start = 0, 16, 32, 48,...208, 224, 240 - # k += 16 - li 5, 0 - li 7, 16 - Load_next_4zetas - MREDUCE_4X 5, 32, 32, V_Z0, V_Z1, V_Z2, V_Z3 - Write_One - li 5, 128 - - Load_next_4zetas - MREDUCE_4X 5, 32, 32, V_Z0, V_Z1, V_Z2, V_Z3 - Write_One - li 5, 256 - - Load_next_4zetas - MREDUCE_4X 5, 32, 32, V_Z0, V_Z1, V_Z2, V_Z3 - Write_One - li 5, 384 - - Load_next_4zetas - MREDUCE_4X 5, 32, 32, V_Z0, V_Z1, V_Z2, V_Z3 - Write_One - - # - # 6. len = 4, start = 0, 8, 16, 24,...232, 240, 248 - # k += 32 - li 15, 4 # loops - mtctr 15 - li 5, 0 - li 7, 8 + # + # 5. len = 8, start = 0, 16, 32, 48,...208, 224, 240 + # k += 16 + li 5, 0 + li 7, 16 + Load_next_4zetas + MREDUCE_4X 5, 32, 32, V_Z0, V_Z1, V_Z2, V_Z3 + Write_One + li 5, 128 + + Load_next_4zetas + MREDUCE_4X 5, 32, 32, V_Z0, V_Z1, V_Z2, V_Z3 + Write_One + li 5, 256 + + Load_next_4zetas + MREDUCE_4X 5, 32, 32, V_Z0, V_Z1, V_Z2, V_Z3 + Write_One + li 5, 384 + + Load_next_4zetas + MREDUCE_4X 5, 32, 32, V_Z0, V_Z1, V_Z2, V_Z3 + Write_One + + # + # 6. len = 4, start = 0, 8, 16, 24,...232, 240, 248 + # k += 32 + li 15, 4 # loops + mtctr 15 + li 5, 0 + li 7, 8 .align 4 ntt_ppc__Len4: - Load_next_4zetas - MREDUCE_4X 5, 16, 16, V_Z0, V_Z1, V_Z2, V_Z3 - Write_Two - addi 5, 5, 64 + Load_next_4zetas + MREDUCE_4X 5, 16, 16, V_Z0, V_Z1, V_Z2, V_Z3 + Write_Two + addi 5, 5, 64 - Load_next_4zetas - MREDUCE_4X 5, 16, 16, V_Z0, V_Z1, V_Z2, V_Z3 - Write_Two - addi 5, 5, 64 + Load_next_4zetas + MREDUCE_4X 5, 16, 16, V_Z0, V_Z1, V_Z2, V_Z3 + Write_Two + addi 5, 5, 64 - bdnz ntt_ppc__Len4 + bdnz ntt_ppc__Len4 - # - # 7. len = 2, start = 0, 4, 8, 12,...244, 248, 252 - # k += 64 - # Update zetas vectors, each vector has 2 zetas + # + # 7. 
len = 2, start = 0, 4, 8, 12,...244, 248, 252 + # k += 64 + # Update zetas vectors, each vector has 2 zetas - addi 14, 4, ZETA_NTT_OFFSET64 + addi 14, 4, ZETA_NTT_OFFSET64 - li 15, 4 - mtctr 15 - li 5, 0 - li 7, 4 + li 15, 4 + mtctr 15 + li 5, 0 + li 7, 4 .align 4 ntt_ppc__Len2: - Load_next_4zetas - MREDUCE_4X 5, 16, 16, V_Z0, V_Z1, V_Z2, V_Z3 - Write_Three - addi 5, 5, 64 - - Load_next_4zetas - MREDUCE_4X 5, 16, 16, V_Z0, V_Z1, V_Z2, V_Z3 - Write_Three - addi 5, 5, 64 - - bdnz ntt_ppc__Len2 - - lxv 32+20, 128(1) - lxv 32+21, 144(1) - lxv 32+22, 160(1) - lxv 32+23, 176(1) - lxv 32+24, 192(1) - lxv 32+25, 208(1) - lxv 32+26, 224(1) - lxv 32+27, 240(1) - lxv 32+28, 256(1) - lxv 32+29, 272(1) - lxv 32+30, 288(1) - lxv 32+31, 304(1) - ld 14, 56(1) - ld 15, 64(1) - ld 16, 72(1) - ld 16, 72(1) - ld 17, 80(1) - ld 18, 88(1) - ld 19, 96(1) - ld 20, 104(1) - ld 21, 112(1) - - mtlr 0 - addi 1, 1, 352 - blr + Load_next_4zetas + MREDUCE_4X 5, 16, 16, V_Z0, V_Z1, V_Z2, V_Z3 + Write_Three + addi 5, 5, 64 + + Load_next_4zetas + MREDUCE_4X 5, 16, 16, V_Z0, V_Z1, V_Z2, V_Z3 + Write_Three + addi 5, 5, 64 + + bdnz ntt_ppc__Len2 + + lxv 32+20, 128(1) + lxv 32+21, 144(1) + lxv 32+22, 160(1) + lxv 32+23, 176(1) + lxv 32+24, 192(1) + lxv 32+25, 208(1) + lxv 32+26, 224(1) + lxv 32+27, 240(1) + lxv 32+28, 256(1) + lxv 32+29, 272(1) + lxv 32+30, 288(1) + lxv 32+31, 304(1) + ld 14, 56(1) + ld 15, 64(1) + ld 16, 72(1) + ld 16, 72(1) + ld 17, 80(1) + ld 18, 88(1) + ld 19, 96(1) + ld 20, 104(1) + ld 21, 112(1) + + mtlr 0 + addi 1, 1, 352 + blr /* To facilitate single-compilation-unit (SCU) builds, undefine all macros. * Don't modify by hand -- this is auto-generated by scripts/autogen. */ diff --git a/dev/ppc64le/src/poly_tomont.S b/dev/ppc64le/src/poly_tomont.S index b7b010aaf..66f5aba81 100644 --- a/dev/ppc64le/src/poly_tomont.S +++ b/dev/ppc64le/src/poly_tomont.S @@ -39,116 +39,116 @@ # MREDUCE_4X(_v0, _v1, _v2, _v3) # .macro MREDUCE_4X _v0 _v1 _v2 _v3 - lxvd2x 32+13, 0, 3 - addi 3, 3, 16 - lxvd2x 32+18, 0, 3 - addi 3, 3, 16 - lxvd2x 32+23, 0, 3 - addi 3, 3, 16 - lxvd2x 32+7, 0, 3 - addi 3, 3, 16 - - vmladduhm 15, 13, V1353, 3 - vmladduhm 20, 18, V1353, 3 - vmladduhm 25, 23, V1353, 3 - vmladduhm 9, 7, V1353, 3 - - vmhraddshs 14, 13, V1353, 3 - vmhraddshs 19, 18, V1353, 3 - vmhraddshs 24, 23, V1353, 3 - vmhraddshs 8, 7, V1353, 3 - - vmladduhm 15, 15, V_QINV, 3 - vmladduhm 20, 20, V_QINV, 3 - vmladduhm 25, 25, V_QINV, 3 - vmladduhm 9, 9, V_QINV, 3 - - vmhraddshs 15, 15, V_NMKQ, 14 - vmhraddshs 20, 20, V_NMKQ, 19 - vmhraddshs 25, 25, V_NMKQ, 24 - vmhraddshs 9, 9, V_NMKQ, 8 - - vsrah \_v0, 15, 4 # >> 1 - vsrah \_v1, 20, 4 # >> 1 - vsrah \_v2, 25, 4 # >> 1 - vsrah \_v3, 9, 4 # >> 1 + lxvd2x 32+13, 0, 3 + addi 3, 3, 16 + lxvd2x 32+18, 0, 3 + addi 3, 3, 16 + lxvd2x 32+23, 0, 3 + addi 3, 3, 16 + lxvd2x 32+7, 0, 3 + addi 3, 3, 16 + + vmladduhm 15, 13, V1353, 3 + vmladduhm 20, 18, V1353, 3 + vmladduhm 25, 23, V1353, 3 + vmladduhm 9, 7, V1353, 3 + + vmhraddshs 14, 13, V1353, 3 + vmhraddshs 19, 18, V1353, 3 + vmhraddshs 24, 23, V1353, 3 + vmhraddshs 8, 7, V1353, 3 + + vmladduhm 15, 15, V_QINV, 3 + vmladduhm 20, 20, V_QINV, 3 + vmladduhm 25, 25, V_QINV, 3 + vmladduhm 9, 9, V_QINV, 3 + + vmhraddshs 15, 15, V_NMKQ, 14 + vmhraddshs 20, 20, V_NMKQ, 19 + vmhraddshs 25, 25, V_NMKQ, 24 + vmhraddshs 9, 9, V_NMKQ, 8 + + vsrah \_v0, 15, 4 # >> 1 + vsrah \_v1, 20, 4 # >> 1 + vsrah \_v2, 25, 4 # >> 1 + vsrah \_v3, 9, 4 # >> 1 .endm .macro Write_8X - stxvd2x 32+27, 4, 3 - stxvd2x 32+28, 5, 3 - stxvd2x 32+29, 6, 3 - stxvd2x 32+30, 7, 3 - 
stxvd2x 32+13, 8, 3 - stxvd2x 32+18, 9, 3 - stxvd2x 32+23, 10, 3 - stxvd2x 32+7, 11, 3 + stxvd2x 32+27, 4, 3 + stxvd2x 32+28, 5, 3 + stxvd2x 32+29, 6, 3 + stxvd2x 32+30, 7, 3 + stxvd2x 32+13, 8, 3 + stxvd2x 32+18, 9, 3 + stxvd2x 32+23, 10, 3 + stxvd2x 32+7, 11, 3 .endm .align 4 .globl MLK_ASM_NAMESPACE(poly_tomont_ppc) MLK_ASM_FN_SYMBOL(poly_tomont_ppc) - stdu 1, -320(1) - mflr 0 - - stxv 32+20, 128(1) - stxv 32+21, 144(1) - stxv 32+22, 160(1) - stxv 32+23, 176(1) - stxv 32+24, 192(1) - stxv 32+25, 208(1) - stxv 32+26, 224(1) - stxv 32+27, 240(1) - stxv 32+28, 256(1) - stxv 32+29, 272(1) - stxv 32+30, 288(1) - - lxv 32+V_NMKQ, NQ_OFFSET(4) - lxv 32+V_QINV, QINV_OFFSET(4) - lxv 32+V1353, C1353_OFFSET(4) - - vxor 3, 3, 3 - vspltish 4, 1 - - li 4, -128 - li 5, -112 - li 6, -96 - li 7, -80 - li 8, -64 - li 9, -48 - li 10, -32 - li 11, -16 - - MREDUCE_4X 27, 28, 29, 30 - MREDUCE_4X 13, 18, 23, 7 - Write_8X - - MREDUCE_4X 27, 28, 29, 30 - MREDUCE_4X 13, 18, 23, 7 - Write_8X - - MREDUCE_4X 27, 28, 29, 30 - MREDUCE_4X 13, 18, 23, 7 - Write_8X - - MREDUCE_4X 27, 28, 29, 30 - MREDUCE_4X 13, 18, 23, 7 - Write_8X - - lxv 32+20, 128(1) - lxv 32+21, 144(1) - lxv 32+22, 160(1) - lxv 32+23, 176(1) - lxv 32+24, 192(1) - lxv 32+25, 208(1) - lxv 32+26, 224(1) - lxv 32+27, 240(1) - lxv 32+28, 256(1) - lxv 32+29, 272(1) - lxv 32+30, 288(1) - mtlr 0 - addi 1, 1, 320 - blr + stdu 1, -320(1) + mflr 0 + + stxv 32+20, 128(1) + stxv 32+21, 144(1) + stxv 32+22, 160(1) + stxv 32+23, 176(1) + stxv 32+24, 192(1) + stxv 32+25, 208(1) + stxv 32+26, 224(1) + stxv 32+27, 240(1) + stxv 32+28, 256(1) + stxv 32+29, 272(1) + stxv 32+30, 288(1) + + lxv 32+V_NMKQ, NQ_OFFSET(4) + lxv 32+V_QINV, QINV_OFFSET(4) + lxv 32+V1353, C1353_OFFSET(4) + + vxor 3, 3, 3 + vspltish 4, 1 + + li 4, -128 + li 5, -112 + li 6, -96 + li 7, -80 + li 8, -64 + li 9, -48 + li 10, -32 + li 11, -16 + + MREDUCE_4X 27, 28, 29, 30 + MREDUCE_4X 13, 18, 23, 7 + Write_8X + + MREDUCE_4X 27, 28, 29, 30 + MREDUCE_4X 13, 18, 23, 7 + Write_8X + + MREDUCE_4X 27, 28, 29, 30 + MREDUCE_4X 13, 18, 23, 7 + Write_8X + + MREDUCE_4X 27, 28, 29, 30 + MREDUCE_4X 13, 18, 23, 7 + Write_8X + + lxv 32+20, 128(1) + lxv 32+21, 144(1) + lxv 32+22, 160(1) + lxv 32+23, 176(1) + lxv 32+24, 192(1) + lxv 32+25, 208(1) + lxv 32+26, 224(1) + lxv 32+27, 240(1) + lxv 32+28, 256(1) + lxv 32+29, 272(1) + lxv 32+30, 288(1) + mtlr 0 + addi 1, 1, 320 + blr /* To facilitate single-compilation-unit (SCU) builds, undefine all macros. * Don't modify by hand -- this is auto-generated by scripts/autogen. 
*/ diff --git a/dev/ppc64le/src/reduce.S b/dev/ppc64le/src/reduce.S index dfb634392..de95d5c3f 100644 --- a/dev/ppc64le/src/reduce.S +++ b/dev/ppc64le/src/reduce.S @@ -34,182 +34,182 @@ .text .macro BREDUCE_4X _v0 _v1 _v2 _v3 - lxvd2x 32+8, 0, 3 - lxvd2x 32+12, 14, 3 - lxvd2x 32+16, 15, 3 - lxvd2x 32+20, 16, 3 - addi 3, 3, 64 - vmulosh 6, 8, V20159 - vmulesh 5, 8, V20159 - vmulosh 11, 12, V20159 - vmulesh 10, 12, V20159 - vmulosh 15, 16, V20159 - vmulesh 14, 16, V20159 - vmulosh 19, 20, V20159 - vmulesh 18, 20, V20159 - xxmrglw 32+4, 32+5, 32+6 - xxmrghw 32+5, 32+5, 32+6 - xxmrglw 32+9, 32+10, 32+11 - xxmrghw 32+10, 32+10, 32+11 - xxmrglw 32+13, 32+14, 32+15 - xxmrghw 32+14, 32+14, 32+15 - xxmrglw 32+17, 32+18, 32+19 - xxmrghw 32+18, 32+18, 32+19 - vadduwm 4, 4, V_25 - vadduwm 5, 5, V_25 - vadduwm 9, 9, V_25 - vadduwm 10, 10, V_25 - vadduwm 13, 13, V_25 - vadduwm 14, 14, V_25 - vadduwm 17, 17, V_25 - vadduwm 18, 18, V_25 - vsraw 4, 4, V_26 - vsraw 5, 5, V_26 - vsraw 9, 9, V_26 - vsraw 10, 10, V_26 - vsraw 13, 13, V_26 - vsraw 14, 14, V_26 - vsraw 17, 17, V_26 - vsraw 18, 18, V_26 - vpkuwum 4, 5, 4 - vsubuhm 4, 7, 4 - vpkuwum 9, 10, 9 - vsubuhm 9, 7, 9 - vpkuwum 13, 14, 13 - vsubuhm 13, 7, 13 - vpkuwum 17, 18, 17 - vsubuhm 17, 7, 17 - vmladduhm \_v0, 4, V_MKQ, 8 - vmladduhm \_v1, 9, V_MKQ, 12 - vmladduhm \_v2, 13, V_MKQ, 16 - vmladduhm \_v3, 17, V_MKQ, 20 + lxvd2x 32+8, 0, 3 + lxvd2x 32+12, 14, 3 + lxvd2x 32+16, 15, 3 + lxvd2x 32+20, 16, 3 + addi 3, 3, 64 + vmulosh 6, 8, V20159 + vmulesh 5, 8, V20159 + vmulosh 11, 12, V20159 + vmulesh 10, 12, V20159 + vmulosh 15, 16, V20159 + vmulesh 14, 16, V20159 + vmulosh 19, 20, V20159 + vmulesh 18, 20, V20159 + xxmrglw 32+4, 32+5, 32+6 + xxmrghw 32+5, 32+5, 32+6 + xxmrglw 32+9, 32+10, 32+11 + xxmrghw 32+10, 32+10, 32+11 + xxmrglw 32+13, 32+14, 32+15 + xxmrghw 32+14, 32+14, 32+15 + xxmrglw 32+17, 32+18, 32+19 + xxmrghw 32+18, 32+18, 32+19 + vadduwm 4, 4, V_25 + vadduwm 5, 5, V_25 + vadduwm 9, 9, V_25 + vadduwm 10, 10, V_25 + vadduwm 13, 13, V_25 + vadduwm 14, 14, V_25 + vadduwm 17, 17, V_25 + vadduwm 18, 18, V_25 + vsraw 4, 4, V_26 + vsraw 5, 5, V_26 + vsraw 9, 9, V_26 + vsraw 10, 10, V_26 + vsraw 13, 13, V_26 + vsraw 14, 14, V_26 + vsraw 17, 17, V_26 + vsraw 18, 18, V_26 + vpkuwum 4, 5, 4 + vsubuhm 4, 7, 4 + vpkuwum 9, 10, 9 + vsubuhm 9, 7, 9 + vpkuwum 13, 14, 13 + vsubuhm 13, 7, 13 + vpkuwum 17, 18, 17 + vsubuhm 17, 7, 17 + vmladduhm \_v0, 4, V_MKQ, 8 + vmladduhm \_v1, 9, V_MKQ, 12 + vmladduhm \_v2, 13, V_MKQ, 16 + vmladduhm \_v3, 17, V_MKQ, 20 .endm .macro Write_8X - stxvd2x 32+21, 4, 3 - stxvd2x 32+22, 5, 3 - stxvd2x 32+23, 6, 3 - stxvd2x 32+24, 7, 3 - stxvd2x 32+4, 8, 3 - stxvd2x 32+9, 9, 3 - stxvd2x 32+13, 10, 3 - stxvd2x 32+17, 11, 3 + stxvd2x 32+21, 4, 3 + stxvd2x 32+22, 5, 3 + stxvd2x 32+23, 6, 3 + stxvd2x 32+24, 7, 3 + stxvd2x 32+4, 8, 3 + stxvd2x 32+9, 9, 3 + stxvd2x 32+13, 10, 3 + stxvd2x 32+17, 11, 3 .endm # # Conditional addition to get unsigned canonical representative # .macro To_unsigned_16 - lxv 32+12, 0(3) - lxv 32+13, 16(3) - lxv 32+14, 32(3) - lxv 32+15, 48(3) - addi 3, 3, 64 - vsrh 1, 12, 10 - vsrh 0, 13, 10 - vsrh 3, 14, 10 - vsrh 2, 15, 10 - vadduhm 7, 12, 11 - vadduhm 8, 13, 11 - vadduhm 5, 14, 11 - vadduhm 6, 15, 11 - vcmpequh 1, 1, 9 - vcmpequh 0, 0, 9 - vcmpequh 3, 3, 9 - vcmpequh 2, 2, 9 - xxsel 32+1, 32+7,32+12, 32+1 - xxsel 32+0, 32+8,32+13, 32+0 - xxsel 32+3, 32+5,32+14, 32+3 - xxsel 32+2, 32+6,32+15, 32+2 - stxv 32+3, -32(3) - stxv 32+2, -16(3) - stxv 32+1, -64(3) - stxv 32+0, -48(3) + lxv 32+12, 0(3) + lxv 32+13, 16(3) 
+ lxv 32+14, 32(3) + lxv 32+15, 48(3) + addi 3, 3, 64 + vsrh 1, 12, 10 + vsrh 0, 13, 10 + vsrh 3, 14, 10 + vsrh 2, 15, 10 + vadduhm 7, 12, 11 + vadduhm 8, 13, 11 + vadduhm 5, 14, 11 + vadduhm 6, 15, 11 + vcmpequh 1, 1, 9 + vcmpequh 0, 0, 9 + vcmpequh 3, 3, 9 + vcmpequh 2, 2, 9 + xxsel 32+1, 32+7,32+12, 32+1 + xxsel 32+0, 32+8,32+13, 32+0 + xxsel 32+3, 32+5,32+14, 32+3 + xxsel 32+2, 32+6,32+15, 32+2 + stxv 32+3, -32(3) + stxv 32+2, -16(3) + stxv 32+1, -64(3) + stxv 32+0, -48(3) .endm .align 4 .globl MLK_ASM_NAMESPACE(reduce_ppc) MLK_ASM_FN_SYMBOL(reduce_ppc) - stdu 1, -224(1) - mflr 0 - std 14, 96(1) - std 15, 104(1) - std 16, 112(1) - stxv 32+20, 128(1) - stxv 32+21, 144(1) - stxv 32+22, 160(1) - stxv 32+23, 176(1) - stxv 32+24, 192(1) - - vxor 7, 7, 7 - - lxv 32+V_MKQ, Q_OFFSET(4) - lxv 32+V20159, C20159_OFFSET(4) - - vspltisw V_26, 13 - vadduwm V_26, V_26, V_26 - vspltisw 4, 1 - vsubuwm 5, V_26, 4 - vslw V_25, 4, 5 - - li 4, -128 - li 5, -112 - li 6, -96 - li 7, -80 - li 8, -64 - li 9, -48 - li 10, -32 - li 11, -16 - - li 14, 16 - li 15, 32 - li 16, 48 - - BREDUCE_4X 21, 22, 23, 24 - BREDUCE_4X 4, 9, 13, 17 - Write_8X - - BREDUCE_4X 21, 22, 23, 24 - BREDUCE_4X 4, 9, 13, 17 - Write_8X - - BREDUCE_4X 21, 22, 23, 24 - BREDUCE_4X 4, 9, 13, 17 - Write_8X - - BREDUCE_4X 21, 22, 23, 24 - BREDUCE_4X 4, 9, 13, 17 - Write_8X - - # - # To unsigned canonical - # + stdu 1, -224(1) + mflr 0 + std 14, 96(1) + std 15, 104(1) + std 16, 112(1) + stxv 32+20, 128(1) + stxv 32+21, 144(1) + stxv 32+22, 160(1) + stxv 32+23, 176(1) + stxv 32+24, 192(1) + + vxor 7, 7, 7 + + lxv 32+V_MKQ, Q_OFFSET(4) + lxv 32+V20159, C20159_OFFSET(4) + + vspltisw V_26, 13 + vadduwm V_26, V_26, V_26 + vspltisw 4, 1 + vsubuwm 5, V_26, 4 + vslw V_25, 4, 5 + + li 4, -128 + li 5, -112 + li 6, -96 + li 7, -80 + li 8, -64 + li 9, -48 + li 10, -32 + li 11, -16 + + li 14, 16 + li 15, 32 + li 16, 48 + + BREDUCE_4X 21, 22, 23, 24 + BREDUCE_4X 4, 9, 13, 17 + Write_8X + + BREDUCE_4X 21, 22, 23, 24 + BREDUCE_4X 4, 9, 13, 17 + Write_8X + + BREDUCE_4X 21, 22, 23, 24 + BREDUCE_4X 4, 9, 13, 17 + Write_8X + + BREDUCE_4X 21, 22, 23, 24 + BREDUCE_4X 4, 9, 13, 17 + Write_8X + + # + # To unsigned canonical + # .align 4 - addi 3, 3, -512 - xxspltib 32+9 ,0 - vspltish 10, 15 - vmr 11, V_MKQ - - To_unsigned_16 - To_unsigned_16 - To_unsigned_16 - To_unsigned_16 - To_unsigned_16 - To_unsigned_16 - To_unsigned_16 - To_unsigned_16 - - ld 14, 96(1) - ld 15, 104(1) - ld 16, 112(1) - lxv 32+20, 128(1) - lxv 32+21, 144(1) - lxv 32+22, 160(1) - lxv 32+23, 176(1) - lxv 32+24, 192(1) - mtlr 0 - addi 1, 1, 224 - blr + addi 3, 3, -512 + xxspltib 32+9 ,0 + vspltish 10, 15 + vmr 11, V_MKQ + + To_unsigned_16 + To_unsigned_16 + To_unsigned_16 + To_unsigned_16 + To_unsigned_16 + To_unsigned_16 + To_unsigned_16 + To_unsigned_16 + + ld 14, 96(1) + ld 15, 104(1) + ld 16, 112(1) + lxv 32+20, 128(1) + lxv 32+21, 144(1) + lxv 32+22, 160(1) + lxv 32+23, 176(1) + lxv 32+24, 192(1) + mtlr 0 + addi 1, 1, 224 + blr /* To facilitate single-compilation-unit (SCU) builds, undefine all macros. * Don't modify by hand -- this is auto-generated by scripts/autogen. 
*/ diff --git a/mlkem/src/native/ppc64le/src/intt_ppc.S b/mlkem/src/native/ppc64le/src/intt_ppc.S index 1a4975ba0..2883a7bdf 100644 --- a/mlkem/src/native/ppc64le/src/intt_ppc.S +++ b/mlkem/src/native/ppc64le/src/intt_ppc.S @@ -36,177 +36,177 @@ #define V1441 10 .macro Load_4Coeffs start next step - mr 9, \start # j - add 10, 7, 9 # J + len*2 - addi 16, 9, \next - addi 17, 10, \step - addi 18, 16, \next - addi 19, 17, \step - addi 20, 18, \next - addi 21, 19, \step - lxvd2x 32+8, 3, 10 # r[j+len] - lxvd2x 32+12, 3, 17 # r[j+len] - lxvd2x 32+16, 3, 19 # r[j+len] - lxvd2x 32+20, 3, 21 # r[j+len] - xxpermdi 32+8, 32+8, 32+8, 2 - xxpermdi 32+12, 32+12, 32+12, 2 - xxpermdi 32+16, 32+16, 32+16, 2 - xxpermdi 32+20, 32+20, 32+20, 2 - - lxvd2x 32+21, 3, 9 - lxvd2x 32+22, 3, 16 - lxvd2x 32+23, 3, 18 - lxvd2x 32+24, 3, 20 - xxpermdi 32+21, 32+21, 32+21, 2 - xxpermdi 32+22, 32+22, 32+22, 2 - xxpermdi 32+23, 32+23, 32+23, 2 - xxpermdi 32+24, 32+24, 32+24, 2 - - vsubuhm 25, 8, 21 # r[j+len] - t - vsubuhm 26, 12, 22 # r[j+len] - t - vsubuhm 30, 16, 23 # r[j+len] - t - vsubuhm 31, 20, 24 # r[j+len] - t - vadduhm 8, 8, 21 # r[j+len] + t - vadduhm 12, 12, 22 # r[j+len] + t - vadduhm 16, 16, 23 # r[j+len] + t - vadduhm 20, 20, 24 # r[j+len] + t + mr 9, \start # j + add 10, 7, 9 # J + len*2 + addi 16, 9, \next + addi 17, 10, \step + addi 18, 16, \next + addi 19, 17, \step + addi 20, 18, \next + addi 21, 19, \step + lxvd2x 32+8, 3, 10 # r[j+len] + lxvd2x 32+12, 3, 17 # r[j+len] + lxvd2x 32+16, 3, 19 # r[j+len] + lxvd2x 32+20, 3, 21 # r[j+len] + xxpermdi 32+8, 32+8, 32+8, 2 + xxpermdi 32+12, 32+12, 32+12, 2 + xxpermdi 32+16, 32+16, 32+16, 2 + xxpermdi 32+20, 32+20, 32+20, 2 + + lxvd2x 32+21, 3, 9 + lxvd2x 32+22, 3, 16 + lxvd2x 32+23, 3, 18 + lxvd2x 32+24, 3, 20 + xxpermdi 32+21, 32+21, 32+21, 2 + xxpermdi 32+22, 32+22, 32+22, 2 + xxpermdi 32+23, 32+23, 32+23, 2 + xxpermdi 32+24, 32+24, 32+24, 2 + + vsubuhm 25, 8, 21 # r[j+len] - t + vsubuhm 26, 12, 22 # r[j+len] - t + vsubuhm 30, 16, 23 # r[j+len] - t + vsubuhm 31, 20, 24 # r[j+len] - t + vadduhm 8, 8, 21 # r[j+len] + t + vadduhm 12, 12, 22 # r[j+len] + t + vadduhm 16, 16, 23 # r[j+len] + t + vadduhm 20, 20, 24 # r[j+len] + t .endm .macro BREDUCE_4X _v0 _v1 _v2 _v3 - vxor 7, 7, 7 - xxlor 32+3, 6, 6 # V_MKQ - xxlor 32+1, 7, 7 # V_25 - xxlor 32+2, 8, 8 # V_26 - # Multify Odd/Even signed halfword; - # Results word bound by 2^32 in abs value. - vmulosh 6, 8, V20159 - vmulesh 5, 8, V20159 - vmulosh 11, 12, V20159 - vmulesh 10, 12, V20159 - vmulosh 15, 16, V20159 - vmulesh 14, 16, V20159 - vmulosh 19, 20, V20159 - vmulesh 18, 20, V20159 - xxmrglw 32+4, 32+5, 32+6 - xxmrghw 32+5, 32+5, 32+6 - xxmrglw 32+9, 32+10, 32+11 - xxmrghw 32+10, 32+10, 32+11 - xxmrglw 32+13, 32+14, 32+15 - xxmrghw 32+14, 32+14, 32+15 - xxmrglw 32+17, 32+18, 32+19 - xxmrghw 32+18, 32+18, 32+19 - vadduwm 4, 4, V_25 - vadduwm 5, 5, V_25 - vadduwm 9, 9, V_25 - vadduwm 10, 10, V_25 - vadduwm 13, 13, V_25 - vadduwm 14, 14, V_25 - vadduwm 17, 17, V_25 - vadduwm 18, 18, V_25 - # Right shift and pack lower halfword, - # results bond to 2^16 in abs value - vsraw 4, 4, V_26 - vsraw 5, 5, V_26 - vsraw 9, 9, V_26 - vsraw 10, 10, V_26 - vsraw 13, 13, V_26 - vsraw 14, 14, V_26 - vsraw 17, 17, V_26 - vsraw 18, 18, V_26 - vpkuwum 4, 5, 4 - vsubuhm 4, 7, 4 - vpkuwum 9, 10, 9 - vsubuhm 9, 7, 9 - vpkuwum 13, 14, 13 - vsubuhm 13, 7, 13 - vpkuwum 17, 18, 17 - vsubuhm 17, 7, 17 - # Modulo multify-Low unsigned halfword; - # results bond to 2^16 * q in abs value. 
- vmladduhm \_v0, 4, V_MKQ, 8 - vmladduhm \_v1, 9, V_MKQ, 12 - vmladduhm \_v2, 13, V_MKQ, 16 - vmladduhm \_v3, 17, V_MKQ, 20 + vxor 7, 7, 7 + xxlor 32+3, 6, 6 # V_MKQ + xxlor 32+1, 7, 7 # V_25 + xxlor 32+2, 8, 8 # V_26 + # Multify Odd/Even signed halfword; + # Results word bound by 2^32 in abs value. + vmulosh 6, 8, V20159 + vmulesh 5, 8, V20159 + vmulosh 11, 12, V20159 + vmulesh 10, 12, V20159 + vmulosh 15, 16, V20159 + vmulesh 14, 16, V20159 + vmulosh 19, 20, V20159 + vmulesh 18, 20, V20159 + xxmrglw 32+4, 32+5, 32+6 + xxmrghw 32+5, 32+5, 32+6 + xxmrglw 32+9, 32+10, 32+11 + xxmrghw 32+10, 32+10, 32+11 + xxmrglw 32+13, 32+14, 32+15 + xxmrghw 32+14, 32+14, 32+15 + xxmrglw 32+17, 32+18, 32+19 + xxmrghw 32+18, 32+18, 32+19 + vadduwm 4, 4, V_25 + vadduwm 5, 5, V_25 + vadduwm 9, 9, V_25 + vadduwm 10, 10, V_25 + vadduwm 13, 13, V_25 + vadduwm 14, 14, V_25 + vadduwm 17, 17, V_25 + vadduwm 18, 18, V_25 + # Right shift and pack lower halfword, + # results bond to 2^16 in abs value + vsraw 4, 4, V_26 + vsraw 5, 5, V_26 + vsraw 9, 9, V_26 + vsraw 10, 10, V_26 + vsraw 13, 13, V_26 + vsraw 14, 14, V_26 + vsraw 17, 17, V_26 + vsraw 18, 18, V_26 + vpkuwum 4, 5, 4 + vsubuhm 4, 7, 4 + vpkuwum 9, 10, 9 + vsubuhm 9, 7, 9 + vpkuwum 13, 14, 13 + vsubuhm 13, 7, 13 + vpkuwum 17, 18, 17 + vsubuhm 17, 7, 17 + # Modulo multify-Low unsigned halfword; + # results bond to 2^16 * q in abs value. + vmladduhm \_v0, 4, V_MKQ, 8 + vmladduhm \_v1, 9, V_MKQ, 12 + vmladduhm \_v2, 13, V_MKQ, 16 + vmladduhm \_v3, 17, V_MKQ, 20 .endm #----------------------------------- # MREDUCE_4X(len, start, _vz0, _vz1, _vz2, _vz3) # .macro MREDUCE_4X _vz0 _vz1 _vz2 _vz3 _vo0 _vo1 _vo2 _vo3 - # Modular multification bond by 2^16 * q in abs value - vmladduhm 15, 25, \_vz0, 3 - vmladduhm 20, 26, \_vz1, 3 - vmladduhm 27, 30, \_vz2, 3 - vmladduhm 28, 31, \_vz3, 3 - - # Signed multiply-high-round; outputs are bound by 2^15 * q in abs value - vmhraddshs 14, 25, \_vz0, 3 - vmhraddshs 19, 26, \_vz1, 3 - vmhraddshs 24, 30, \_vz2, 3 - vmhraddshs 29, 31, \_vz3, 3 - - vmladduhm 15, 15, V_QINV, 3 - vmladduhm 20, 20, V_QINV, 3 - vmladduhm 25, 27, V_QINV, 3 - vmladduhm 30, 28, V_QINV, 3 - - vmhraddshs 15, 15, V_NMKQ, 14 - vmhraddshs 20, 20, V_NMKQ, 19 - vmhraddshs 25, 25, V_NMKQ, 24 - vmhraddshs 30, 30, V_NMKQ, 29 - - vsrah \_vo0, 15, 4 # >> 1 - vsrah \_vo1, 20, 4 # >> 1 - vsrah \_vo2, 25, 4 # >> 1 - vsrah \_vo3, 30, 4 # >> 1 + # Modular multification bond by 2^16 * q in abs value + vmladduhm 15, 25, \_vz0, 3 + vmladduhm 20, 26, \_vz1, 3 + vmladduhm 27, 30, \_vz2, 3 + vmladduhm 28, 31, \_vz3, 3 + + # Signed multiply-high-round; outputs are bound by 2^15 * q in abs value + vmhraddshs 14, 25, \_vz0, 3 + vmhraddshs 19, 26, \_vz1, 3 + vmhraddshs 24, 30, \_vz2, 3 + vmhraddshs 29, 31, \_vz3, 3 + + vmladduhm 15, 15, V_QINV, 3 + vmladduhm 20, 20, V_QINV, 3 + vmladduhm 25, 27, V_QINV, 3 + vmladduhm 30, 28, V_QINV, 3 + + vmhraddshs 15, 15, V_NMKQ, 14 + vmhraddshs 20, 20, V_NMKQ, 19 + vmhraddshs 25, 25, V_NMKQ, 24 + vmhraddshs 30, 30, V_NMKQ, 29 + + vsrah \_vo0, 15, 4 # >> 1 + vsrah \_vo1, 20, 4 # >> 1 + vsrah \_vo2, 25, 4 # >> 1 + vsrah \_vo3, 30, 4 # >> 1 .endm .macro Set_mont_consts - xxlor 32+5, 0, 0 # V_NMKQ - xxlor 32+2, 2, 2 # V_QINV - xxlor 32+3, 3, 3 # 0 - xxlor 32+4, 4, 4 # 1 + xxlor 32+5, 0, 0 # V_NMKQ + xxlor 32+2, 2, 2 # V_QINV + xxlor 32+3, 3, 3 # 0 + xxlor 32+4, 4, 4 # 1 .endm .macro Load_next_4zetas - lxv 32+V_Z0, 0(14) - lxv 32+V_Z1, 16(14) - lxv 32+V_Z2, 32(14) - lxv 32+V_Z3, 48(14) - addi 14, 14, 64 + lxv 32+V_Z0, 0(14) + lxv 32+V_Z1, 16(14) + lxv 
32+V_Z2, 32(14) + lxv 32+V_Z3, 48(14) + addi 14, 14, 64 .endm .macro Write_B4C _vs0 _vs1 _vs2 _vs3 - stxvx \_vs0, 3, 9 - stxvx \_vs1, 3, 16 - stxvx \_vs2, 3, 18 - stxvx \_vs3, 3, 20 + stxvx \_vs0, 3, 9 + stxvx \_vs1, 3, 16 + stxvx \_vs2, 3, 18 + stxvx \_vs3, 3, 20 .endm .macro Write_M4C _vs0 _vs1 _vs2 _vs3 - stxvx \_vs0, 3, 10 - stxvx \_vs1, 3, 17 - stxvx \_vs2, 3, 19 - stxvx \_vs3, 3, 21 + stxvx \_vs0, 3, 10 + stxvx \_vs1, 3, 17 + stxvx \_vs2, 3, 19 + stxvx \_vs3, 3, 21 .endm .macro Reload_4coeffs - lxv 32+25, 0(3) - lxv 32+26, 16(3) - lxv 32+30, 32(3) - lxv 32+31, 48(3) - addi 3, 3, 64 + lxv 32+25, 0(3) + lxv 32+26, 16(3) + lxv 32+30, 32(3) + lxv 32+31, 48(3) + addi 3, 3, 64 .endm .macro MWrite_8X _vs0 _vs1 _vs2 _vs3 _vs4 _vs5 _vs6 _vs7 - stxv \_vs0, -128(3) - stxv \_vs1, -112(3) - stxv \_vs2, -96(3) - stxv \_vs3, -80(3) - stxv \_vs4, -64(3) - stxv \_vs5, -48(3) - stxv \_vs6, -32(3) - stxv \_vs7, -16(3) + stxv \_vs0, -128(3) + stxv \_vs1, -112(3) + stxv \_vs2, -96(3) + stxv \_vs3, -80(3) + stxv \_vs4, -64(3) + stxv \_vs5, -48(3) + stxv \_vs6, -32(3) + stxv \_vs7, -16(3) .endm .macro Write_Len2_4C _vs0 _vs1 _vs2 _vs3 @@ -229,10 +229,10 @@ .endm .macro Write_Len4_4C _vs0 _vs1 _vs2 _vs3 - xxpermdi 10, 10, \_vs0, 3 - xxpermdi 11, 11, \_vs1, 3 - xxpermdi 12, 12, \_vs2, 3 - xxpermdi 13, 13, \_vs3, 3 + xxpermdi 10, 10, \_vs0, 3 + xxpermdi 11, 11, \_vs1, 3 + xxpermdi 12, 12, \_vs2, 3 + xxpermdi 13, 13, \_vs3, 3 stxvd2x 10, 3, 9 stxvd2x 11, 3, 16 stxvd2x 12, 3, 18 @@ -252,404 +252,404 @@ .align 4 MLK_ASM_FN_SYMBOL(intt_ppc) - stdu 1, -352(1) - mflr 0 - std 14, 56(1) - std 15, 64(1) - std 16, 72(1) - std 17, 80(1) - std 18, 88(1) - std 19, 96(1) - std 20, 104(1) - std 21, 112(1) - stxv 32+20, 128(1) - stxv 32+21, 144(1) - stxv 32+22, 160(1) - stxv 32+23, 176(1) - stxv 32+24, 192(1) - stxv 32+25, 208(1) - stxv 32+26, 224(1) - stxv 32+27, 240(1) - stxv 32+28, 256(1) - stxv 32+29, 272(1) - stxv 32+30, 288(1) - stxv 32+31, 304(1) - - # init vectors and constants - # Setup for Montgomery reduce - lxv 0, 0(4) - - lxv 32+V_QINV, QINV_OFFSET(4) # QINV - xxlxor 32+3, 32+3, 32+3 - vspltish 4, 1 - xxlor 2, 32+2, 32+2 # QINV - xxlor 3, 32+3, 32+3 # 0 - xxlor 4, 32+4, 32+4 # 1 - - # Setup for Barrett reduce - lxv 6, Q_OFFSET(4) # V_MKQ - lxv 32+V20159, C20159_OFFSET(4) # V20159 - lxv 7, 0(4) # V_25 - - #xxspltiw 8, 26 # for power9 and above - vspltisw 8, 13 - vadduwm 8, 8, 8 - xxlor 8, 32+8, 32+8 # V_26 store at vs8 - - vspltisw 9, 1 - vsubuwm 10, 8, 9 # 25 - vslw 9, 9, 10 - xxlor 7, 32+9, 32+9 # V_25 syore at vs7 + stdu 1, -352(1) + mflr 0 + std 14, 56(1) + std 15, 64(1) + std 16, 72(1) + std 17, 80(1) + std 18, 88(1) + std 19, 96(1) + std 20, 104(1) + std 21, 112(1) + stxv 32+20, 128(1) + stxv 32+21, 144(1) + stxv 32+22, 160(1) + stxv 32+23, 176(1) + stxv 32+24, 192(1) + stxv 32+25, 208(1) + stxv 32+26, 224(1) + stxv 32+27, 240(1) + stxv 32+28, 256(1) + stxv 32+29, 272(1) + stxv 32+30, 288(1) + stxv 32+31, 304(1) + + # init vectors and constants + # Setup for Montgomery reduce + lxv 0, 0(4) + + lxv 32+V_QINV, QINV_OFFSET(4) # QINV + xxlxor 32+3, 32+3, 32+3 + vspltish 4, 1 + xxlor 2, 32+2, 32+2 # QINV + xxlor 3, 32+3, 32+3 # 0 + xxlor 4, 32+4, 32+4 # 1 + + # Setup for Barrett reduce + lxv 6, Q_OFFSET(4) # V_MKQ + lxv 32+V20159, C20159_OFFSET(4) # V20159 + lxv 7, 0(4) # V_25 + + #xxspltiw 8, 26 # for power9 and above + vspltisw 8, 13 + vadduwm 8, 8, 8 + xxlor 8, 32+8, 32+8 # V_26 store at vs8 + + vspltisw 9, 1 + vsubuwm 10, 8, 9 # 25 + vslw 9, 9, 10 + xxlor 7, 32+9, 32+9 # V_25 syore at vs7 .align 4 #__Len2: - 
# - # 1. len = 2, start = 0, 4, 8, 12,...244, 248, 252 - # Update zetas vectors, each vector has 2 zetas - addi 14, 4, IZETA_NTT_OFFSET127 - li 7, 4 - li 15, 4 - mtctr 15 - li 5, 0 + # + # 1. len = 2, start = 0, 4, 8, 12,...244, 248, 252 + # Update zetas vectors, each vector has 2 zetas + addi 14, 4, IZETA_NTT_OFFSET127 + li 7, 4 + li 15, 4 + mtctr 15 + li 5, 0 intt_ppc__Loop2: - Load_4Coeffs 5, 16, 16 - BREDUCE_4X 4, 9, 13, 17 - xxlor 10, 32+4, 32+4 - xxlor 11, 32+9, 32+9 - xxlor 12, 32+13, 32+13 - xxlor 13, 32+17, 32+17 - Set_mont_consts - Load_next_4zetas - MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 - Write_Len2_4C 32+13, 32+18, 32+23, 32+28 - - addi 5, 5, 64 - - Load_4Coeffs 5, 16, 16 - BREDUCE_4X 4, 9, 13, 17 - xxlor 10, 32+4, 32+4 - xxlor 11, 32+9, 32+9 - xxlor 12, 32+13, 32+13 - xxlor 13, 32+17, 32+17 - Set_mont_consts - Load_next_4zetas - MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 - Write_Len2_4C 32+13, 32+18, 32+23, 32+28 - addi 5, 5, 64 - bdnz intt_ppc__Loop2 + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + xxlor 10, 32+4, 32+4 + xxlor 11, 32+9, 32+9 + xxlor 12, 32+13, 32+13 + xxlor 13, 32+17, 32+17 + Set_mont_consts + Load_next_4zetas + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 + Write_Len2_4C 32+13, 32+18, 32+23, 32+28 + + addi 5, 5, 64 + + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + xxlor 10, 32+4, 32+4 + xxlor 11, 32+9, 32+9 + xxlor 12, 32+13, 32+13 + xxlor 13, 32+17, 32+17 + Set_mont_consts + Load_next_4zetas + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 + Write_Len2_4C 32+13, 32+18, 32+23, 32+28 + addi 5, 5, 64 + bdnz intt_ppc__Loop2 .align 4 #__Len4: - # - # 2. len = 4, start = 0, 8, 16, 24,...232, 240, 248 - addi 14, 4, IZETA_NTT_OFFSET63 - li 5, 0 - li 7, 8 - li 15, 4 # loops - mtctr 15 + # + # 2. len = 4, start = 0, 8, 16, 24,...232, 240, 248 + addi 14, 4, IZETA_NTT_OFFSET63 + li 5, 0 + li 7, 8 + li 15, 4 # loops + mtctr 15 intt_ppc__Loop4: - Load_4Coeffs 5, 16, 16 - BREDUCE_4X 4, 9, 13, 17 - xxlor 10, 32+4, 32+4 - xxlor 11, 32+9, 32+9 - xxlor 12, 32+13, 32+13 - xxlor 13, 32+17, 32+17 - Set_mont_consts - Load_next_4zetas - MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 - Write_Len4_4C 32+13, 32+18, 32+23, 32+28 - addi 5, 5, 64 - - Load_4Coeffs 5, 16, 16 - BREDUCE_4X 4, 9, 13, 17 - xxlor 10, 32+4, 32+4 - xxlor 11, 32+9, 32+9 - xxlor 12, 32+13, 32+13 - xxlor 13, 32+17, 32+17 - Set_mont_consts - Load_next_4zetas - MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 - Write_Len4_4C 32+13, 32+18, 32+23, 32+28 - addi 5, 5, 64 - bdnz intt_ppc__Loop4 + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + xxlor 10, 32+4, 32+4 + xxlor 11, 32+9, 32+9 + xxlor 12, 32+13, 32+13 + xxlor 13, 32+17, 32+17 + Set_mont_consts + Load_next_4zetas + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 + Write_Len4_4C 32+13, 32+18, 32+23, 32+28 + addi 5, 5, 64 + + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + xxlor 10, 32+4, 32+4 + xxlor 11, 32+9, 32+9 + xxlor 12, 32+13, 32+13 + xxlor 13, 32+17, 32+17 + Set_mont_consts + Load_next_4zetas + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 + Write_Len4_4C 32+13, 32+18, 32+23, 32+28 + addi 5, 5, 64 + bdnz intt_ppc__Loop4 .align 4 #__Len8: - # 3. 
len = 8, start = 0, 16, 32, 48,...208, 224, 240 - #addi 14, 14, 512 - li 7, 16 - li 5, 0 - - Load_4Coeffs 5, 32, 32 - BREDUCE_4X 4, 9, 13, 17 - Write_B4C 32+4, 32+9, 32+13, 32+17 - Set_mont_consts - Load_next_4zetas - MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 - Write_M4C 32+13, 32+18, 32+23, 32+28 - li 5, 128 - - Load_4Coeffs 5, 32, 32 - BREDUCE_4X 4, 9, 13, 17 - Write_B4C 32+4, 32+9, 32+13, 32+17 - Set_mont_consts - Load_next_4zetas - MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 - Write_M4C 32+13, 32+18, 32+23, 32+28 - li 5, 256 - - Load_4Coeffs 5, 32, 32 - BREDUCE_4X 4, 9, 13, 17 - Write_B4C 32+4, 32+9, 32+13, 32+17 - Set_mont_consts - Load_next_4zetas - MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 - Write_M4C 32+13, 32+18, 32+23, 32+28 - li 5, 384 - - Load_4Coeffs 5, 32, 32 - BREDUCE_4X 4, 9, 13, 17 - Write_B4C 32+4, 32+9, 32+13, 32+17 - Set_mont_consts - Load_next_4zetas - MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 - Write_M4C 32+13, 32+18, 32+23, 32+28 + # 3. len = 8, start = 0, 16, 32, 48,...208, 224, 240 + #addi 14, 14, 512 + li 7, 16 + li 5, 0 + + Load_4Coeffs 5, 32, 32 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + Load_next_4zetas + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + li 5, 128 + + Load_4Coeffs 5, 32, 32 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + Load_next_4zetas + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + li 5, 256 + + Load_4Coeffs 5, 32, 32 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + Load_next_4zetas + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + li 5, 384 + + Load_4Coeffs 5, 32, 32 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + Load_next_4zetas + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 .align 4 #__Len16: - # - # 4. len = 16, start = 0, 32, 64,,...160, 192, 224 - #addi 14, 14, 768 - li 5, 0 - li 7, 32 - - Load_4Coeffs 5, 64, 64 - BREDUCE_4X 4, 9, 13, 17 - Write_B4C 32+4, 32+9, 32+13, 32+17 - Set_mont_consts - Load_next_4zetas - MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 - Write_M4C 32+13, 32+18, 32+23, 32+28 + # + # 4. 
len = 16, start = 0, 32, 64,,...160, 192, 224 + #addi 14, 14, 768 + li 5, 0 + li 7, 32 + + Load_4Coeffs 5, 64, 64 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + Load_next_4zetas + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 li 5, 16 - Load_4Coeffs 5, 64, 64 - BREDUCE_4X 4, 9, 13, 17 - Write_B4C 32+4, 32+9, 32+13, 32+17 - Set_mont_consts - addi 14, 14, -64 - Load_next_4zetas - MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 - Write_M4C 32+13, 32+18, 32+23, 32+28 + Load_4Coeffs 5, 64, 64 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + addi 14, 14, -64 + Load_next_4zetas + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 li 5, 256 - Load_4Coeffs 5, 64, 64 - BREDUCE_4X 4, 9, 13, 17 - Write_B4C 32+4, 32+9, 32+13, 32+17 - Set_mont_consts - Load_next_4zetas - MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 - Write_M4C 32+13, 32+18, 32+23, 32+28 + Load_4Coeffs 5, 64, 64 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + Load_next_4zetas + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 li 5, 272 - Load_4Coeffs 5, 64, 64 - BREDUCE_4X 4, 9, 13, 17 - Write_B4C 32+4, 32+9, 32+13, 32+17 - Set_mont_consts - addi 14, 14, -64 - Load_next_4zetas + Load_4Coeffs 5, 64, 64 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + addi 14, 14, -64 + Load_next_4zetas MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 - Write_M4C 32+13, 32+18, 32+23, 32+28 + Write_M4C 32+13, 32+18, 32+23, 32+28 .align 4 #__Len32: # # 5. len = 32, start = 0, 64, 128, 192 - #addi 14, 14, 896 - li 5, 0 - li 7, 64 - - Load_4Coeffs 5, 16, 16 - BREDUCE_4X 4, 9, 13, 17 - Write_B4C 32+4, 32+9, 32+13, 32+17 - Set_mont_consts - lvx V_ZETA, 0, 14 + #addi 14, 14, 896 + li 5, 0 + li 7, 64 + + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + lvx V_ZETA, 0, 14 addi 14, 14, 16 - MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 - Write_M4C 32+13, 32+18, 32+23, 32+28 - li 5, 128 - - Load_4Coeffs 5, 16, 16 - BREDUCE_4X 4, 9, 13, 17 - Write_B4C 32+4, 32+9, 32+13, 32+17 - Set_mont_consts - lvx V_ZETA, 0, 14 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + li 5, 128 + + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + lvx V_ZETA, 0, 14 addi 14, 14, 16 - MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 - Write_M4C 32+13, 32+18, 32+23, 32+28 - li 5, 256 - - Load_4Coeffs 5, 16, 16 - BREDUCE_4X 4, 9, 13, 17 - Write_B4C 32+4, 32+9, 32+13, 32+17 - Set_mont_consts - lvx V_ZETA, 0, 14 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + li 5, 256 + + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + lvx V_ZETA, 0, 14 addi 14, 14, 16 - MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 - Write_M4C 32+13, 32+18, 32+23, 32+28 - li 5, 384 - - Load_4Coeffs 5, 16, 16 - BREDUCE_4X 4, 9, 13, 17 - Write_B4C 32+4, 32+9, 32+13, 32+17 - Set_mont_consts - lvx V_ZETA, 0, 14 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + li 5, 384 + + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + lvx V_ZETA, 0, 14 addi 14, 14, 16 - 
MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 - Write_M4C 32+13, 32+18, 32+23, 32+28 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 .align 4 #__Len64: - # - # 6. len = 64, start = 0, 128 - #addi 14, 14, 960 - li 5, 0 - li 7, 128 - Load_4Coeffs 5, 16, 16 - BREDUCE_4X 4, 9, 13, 17 - Write_B4C 32+4, 32+9, 32+13, 32+17 - Set_mont_consts - lvx V_ZETA, 0, 14 - addi 14, 14, 16 - MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 - Write_M4C 32+13, 32+18, 32+23, 32+28 - li 5, 64 - - Load_4Coeffs 5, 16, 16 - BREDUCE_4X 4, 9, 13, 17 - Write_B4C 32+4, 32+9, 32+13, 32+17 - Set_mont_consts - lxv 32+10, -16(14) - MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 - Write_M4C 32+13, 32+18, 32+23, 32+28 - li 5, 256 - - Load_4Coeffs 5, 16, 16 - BREDUCE_4X 4, 9, 13, 17 - Write_B4C 32+4, 32+9, 32+13, 32+17 - Set_mont_consts - lvx V_ZETA, 0, 14 - addi 14, 14, 16 - MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 - Write_M4C 32+13, 32+18, 32+23, 32+28 - li 5, 320 - - Load_4Coeffs 5, 16, 16 - BREDUCE_4X 4, 9, 13, 17 - Write_B4C 32+4, 32+9, 32+13, 32+17 - Set_mont_consts - lxv 32+10, -16(14) - MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 - Write_M4C 32+13, 32+18, 32+23, 32+28 + # + # 6. len = 64, start = 0, 128 + #addi 14, 14, 960 + li 5, 0 + li 7, 128 + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + li 5, 64 + + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + lxv 32+10, -16(14) + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + li 5, 256 + + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + li 5, 320 + + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + lxv 32+10, -16(14) + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 .align 4 #__Len128: - # 7. len = 128, start = 0 - # - #addi 14, 14, 992 - li 5, 0 # start - li 7, 256 # len * 2 - - Load_4Coeffs 5, 16, 16 - BREDUCE_4X 4, 9, 13, 17 - Write_B4C 32+4, 32+9, 32+13, 32+17 - Set_mont_consts - lvx V_ZETA, 0, 14 - xxlor 9, 32+10, 32+10 - MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 - Write_M4C 32+13, 32+18, 32+23, 32+28 - li 5, 64 - - Load_4Coeffs 5, 16, 16 - BREDUCE_4X 4, 9, 13, 17 - Write_B4C 32+4, 32+9, 32+13, 32+17 - Set_mont_consts - xxlor 32+10, 9, 9 - MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 - Write_M4C 32+13, 32+18, 32+23, 32+28 - li 5, 128 - - Load_4Coeffs 5, 16, 16 - BREDUCE_4X 4, 9, 13, 17 - Write_B4C 32+4, 32+9, 32+13, 32+17 - Set_mont_consts - xxlor 32+10, 9, 9 - MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 - Write_M4C 32+13, 32+18, 32+23, 32+28 - li 5, 192 - - Load_4Coeffs 5, 16, 16 - BREDUCE_4X 4, 9, 13, 17 - Write_B4C 32+4, 32+9, 32+13, 32+17 - Set_mont_consts - xxlor 32+10, 9, 9 - MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 - Write_M4C 32+13, 32+18, 32+23, 32+28 + # 7. 
len = 128, start = 0 + # + #addi 14, 14, 992 + li 5, 0 # start + li 7, 256 # len * 2 + + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + lvx V_ZETA, 0, 14 + xxlor 9, 32+10, 32+10 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + li 5, 64 + + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + xxlor 32+10, 9, 9 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + li 5, 128 + + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + xxlor 32+10, 9, 9 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + li 5, 192 + + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + xxlor 32+10, 9, 9 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 .align 4 - # - # Montgomery reduce loops with constant 1441 - # - addi 14, 4, C1441_OFFSET - lvx V1441, 0, 14 - - Reload_4coeffs - MREDUCE_4X V1441, V1441, V1441, V1441, 6, 7, 8, 9 - Reload_4coeffs - MREDUCE_4X V1441, V1441, V1441, V1441, 13, 18, 23, 28 - MWrite_8X 32+6, 32+7, 32+8, 32+9, 32+13, 32+18, 32+23, 32+28 - - Reload_4coeffs - MREDUCE_4X V1441, V1441, V1441, V1441, 6, 7, 8, 9 - Reload_4coeffs - MREDUCE_4X V1441, V1441, V1441, V1441, 13, 18, 23, 28 - MWrite_8X 32+6, 32+7, 32+8, 32+9, 32+13, 32+18, 32+23, 32+28 - - Reload_4coeffs - MREDUCE_4X V1441, V1441, V1441, V1441, 6, 7, 8, 9 - Reload_4coeffs - MREDUCE_4X V1441, V1441, V1441, V1441, 13, 18, 23, 28 - MWrite_8X 32+6, 32+7, 32+8, 32+9, 32+13, 32+18, 32+23, 32+28 - - Reload_4coeffs - MREDUCE_4X V1441, V1441, V1441, V1441, 6, 7, 8, 9 - Reload_4coeffs - MREDUCE_4X V1441, V1441, V1441, V1441, 13, 18, 23, 28 - MWrite_8X 32+6, 32+7, 32+8, 32+9, 32+13, 32+18, 32+23, 32+28 - - lxv 32+20, 128(1) - lxv 32+21, 144(1) - lxv 32+22, 160(1) - lxv 32+23, 176(1) - lxv 32+24, 192(1) - lxv 32+25, 208(1) - lxv 32+26, 224(1) - lxv 32+27, 240(1) - lxv 32+28, 256(1) - lxv 32+29, 272(1) - lxv 32+30, 288(1) - lxv 32+31, 304(1) - ld 14, 56(1) - ld 15, 64(1) - ld 16, 72(1) - ld 16, 72(1) - ld 17, 80(1) - ld 18, 88(1) - ld 19, 96(1) - ld 20, 104(1) - ld 21, 112(1) - - mtlr 0 - addi 1, 1, 352 - blr + # + # Montgomery reduce loops with constant 1441 + # + addi 14, 4, C1441_OFFSET + lvx V1441, 0, 14 + + Reload_4coeffs + MREDUCE_4X V1441, V1441, V1441, V1441, 6, 7, 8, 9 + Reload_4coeffs + MREDUCE_4X V1441, V1441, V1441, V1441, 13, 18, 23, 28 + MWrite_8X 32+6, 32+7, 32+8, 32+9, 32+13, 32+18, 32+23, 32+28 + + Reload_4coeffs + MREDUCE_4X V1441, V1441, V1441, V1441, 6, 7, 8, 9 + Reload_4coeffs + MREDUCE_4X V1441, V1441, V1441, V1441, 13, 18, 23, 28 + MWrite_8X 32+6, 32+7, 32+8, 32+9, 32+13, 32+18, 32+23, 32+28 + + Reload_4coeffs + MREDUCE_4X V1441, V1441, V1441, V1441, 6, 7, 8, 9 + Reload_4coeffs + MREDUCE_4X V1441, V1441, V1441, V1441, 13, 18, 23, 28 + MWrite_8X 32+6, 32+7, 32+8, 32+9, 32+13, 32+18, 32+23, 32+28 + + Reload_4coeffs + MREDUCE_4X V1441, V1441, V1441, V1441, 6, 7, 8, 9 + Reload_4coeffs + MREDUCE_4X V1441, V1441, V1441, V1441, 13, 18, 23, 28 + MWrite_8X 32+6, 32+7, 32+8, 32+9, 32+13, 32+18, 32+23, 32+28 + + lxv 32+20, 128(1) + lxv 32+21, 144(1) + lxv 32+22, 160(1) + lxv 32+23, 176(1) + lxv 32+24, 192(1) + lxv 32+25, 208(1) + lxv 32+26, 224(1) + lxv 32+27, 240(1) + lxv 32+28, 256(1) + lxv 32+29, 272(1) + lxv 32+30, 288(1) 
+ lxv 32+31, 304(1) + ld 14, 56(1) + ld 15, 64(1) + ld 16, 72(1) + ld 16, 72(1) + ld 17, 80(1) + ld 18, 88(1) + ld 19, 96(1) + ld 20, 104(1) + ld 21, 112(1) + + mtlr 0 + addi 1, 1, 352 + blr /* To facilitate single-compilation-unit (SCU) builds, undefine all macros. * Don't modify by hand -- this is auto-generated by scripts/autogen. */ diff --git a/mlkem/src/native/ppc64le/src/ntt_ppc.S b/mlkem/src/native/ppc64le/src/ntt_ppc.S index e9a8df81f..834f5091f 100644 --- a/mlkem/src/native/ppc64le/src/ntt_ppc.S +++ b/mlkem/src/native/ppc64le/src/ntt_ppc.S @@ -18,10 +18,10 @@ #define V_QINV 2 #define V_NMKQ 5 -#define V_Z0 7 -#define V_Z1 8 -#define V_Z2 9 -#define V_Z3 10 +#define V_Z0 7 +#define V_Z1 8 +#define V_Z2 9 +#define V_Z3 10 #define V_ZETA 10 .machine "any" @@ -36,121 +36,121 @@ # MREDUCE_4X(start, _vz0, _vz1, _vz2, _vz3) # .macro MREDUCE_4X start next step _vz0 _vz1 _vz2 _vz3 - mr 9, \start - add 10, 7, 9 # J + len*2 - addi 16, 9, \next - addi 17, 10, \step - addi 18, 16, \next - addi 19, 17, \step - addi 20, 18, \next - addi 21, 19, \step - lxvd2x 32+13, 3, 10 # r[j+len] - lxvd2x 32+18, 3, 17 # r[j+len] - lxvd2x 32+23, 3, 19 # r[j+len] - lxvd2x 32+28, 3, 21 # r[j+len] - xxpermdi 32+13, 32+13, 32+13, 2 - xxpermdi 32+18, 32+18, 32+18, 2 - xxpermdi 32+23, 32+23, 32+23, 2 - xxpermdi 32+28, 32+28, 32+28, 2 - - # fqmul = zeta * coefficient - # Modular multification bond by 2^16 * q in abs value - vmladduhm 15, 13, \_vz0, 3 - vmladduhm 20, 18, \_vz1, 3 - vmladduhm 25, 23, \_vz2, 3 - vmladduhm 30, 28, \_vz3, 3 - - # Signed multiply-high-round; outputs are bound by 2^15 * q in abs value - vmhraddshs 14, 13, \_vz0, 3 - vmhraddshs 19, 18, \_vz1, 3 - vmhraddshs 24, 23, \_vz2, 3 - vmhraddshs 29, 28, \_vz3, 3 - - vmladduhm 15, 15, V_QINV, 3 - vmladduhm 20, 20, V_QINV, 3 - vmladduhm 25, 25, V_QINV, 3 - vmladduhm 30, 30, V_QINV, 3 - - vmhraddshs 15, 15, V_NMKQ, 14 - vmhraddshs 20, 20, V_NMKQ, 19 - vmhraddshs 25, 25, V_NMKQ, 24 - vmhraddshs 30, 30, V_NMKQ, 29 - - vsrah 13, 15, 4 # >> 1 - vsrah 18, 20, 4 # >> 1 - vsrah 23, 25, 4 # >> 1 - vsrah 28, 30, 4 # >> 1 - - lxvd2x 32+12, 3, 9 # r[j] - lxvd2x 32+17, 3, 16 # r[j] - lxvd2x 32+22, 3, 18 # r[j] - lxvd2x 32+27, 3, 20 # r[j] - xxpermdi 32+12, 32+12, 32+12, 2 - xxpermdi 32+17, 32+17, 32+17, 2 - xxpermdi 32+22, 32+22, 32+22, 2 - xxpermdi 32+27, 32+27, 32+27, 2 - - # Since the result of the Montgomery multiplication is bounded - # by q in absolute value. 
- # Finally to complete the final update of the results with add/sub - vsubuhm 16, 12, 13 # r - t - vadduhm 15, 13, 12 # r + t - vsubuhm 21, 17, 18 # r - t - vadduhm 20, 18, 17 # r + t - vsubuhm 26, 22, 23 # r - t - vadduhm 25, 23, 22 # r + t - vsubuhm 31, 27, 28 # r - t - vadduhm 30, 28, 27 # r + t + mr 9, \start + add 10, 7, 9 # J + len*2 + addi 16, 9, \next + addi 17, 10, \step + addi 18, 16, \next + addi 19, 17, \step + addi 20, 18, \next + addi 21, 19, \step + lxvd2x 32+13, 3, 10 # r[j+len] + lxvd2x 32+18, 3, 17 # r[j+len] + lxvd2x 32+23, 3, 19 # r[j+len] + lxvd2x 32+28, 3, 21 # r[j+len] + xxpermdi 32+13, 32+13, 32+13, 2 + xxpermdi 32+18, 32+18, 32+18, 2 + xxpermdi 32+23, 32+23, 32+23, 2 + xxpermdi 32+28, 32+28, 32+28, 2 + + # fqmul = zeta * coefficient + # Modular multification bond by 2^16 * q in abs value + vmladduhm 15, 13, \_vz0, 3 + vmladduhm 20, 18, \_vz1, 3 + vmladduhm 25, 23, \_vz2, 3 + vmladduhm 30, 28, \_vz3, 3 + + # Signed multiply-high-round; outputs are bound by 2^15 * q in abs value + vmhraddshs 14, 13, \_vz0, 3 + vmhraddshs 19, 18, \_vz1, 3 + vmhraddshs 24, 23, \_vz2, 3 + vmhraddshs 29, 28, \_vz3, 3 + + vmladduhm 15, 15, V_QINV, 3 + vmladduhm 20, 20, V_QINV, 3 + vmladduhm 25, 25, V_QINV, 3 + vmladduhm 30, 30, V_QINV, 3 + + vmhraddshs 15, 15, V_NMKQ, 14 + vmhraddshs 20, 20, V_NMKQ, 19 + vmhraddshs 25, 25, V_NMKQ, 24 + vmhraddshs 30, 30, V_NMKQ, 29 + + vsrah 13, 15, 4 # >> 1 + vsrah 18, 20, 4 # >> 1 + vsrah 23, 25, 4 # >> 1 + vsrah 28, 30, 4 # >> 1 + + lxvd2x 32+12, 3, 9 # r[j] + lxvd2x 32+17, 3, 16 # r[j] + lxvd2x 32+22, 3, 18 # r[j] + lxvd2x 32+27, 3, 20 # r[j] + xxpermdi 32+12, 32+12, 32+12, 2 + xxpermdi 32+17, 32+17, 32+17, 2 + xxpermdi 32+22, 32+22, 32+22, 2 + xxpermdi 32+27, 32+27, 32+27, 2 + + # Since the result of the Montgomery multiplication is bounded + # by q in absolute value. 
+ # Finally to complete the final update of the results with add/sub + vsubuhm 16, 12, 13 # r - t + vadduhm 15, 13, 12 # r + t + vsubuhm 21, 17, 18 # r - t + vadduhm 20, 18, 17 # r + t + vsubuhm 26, 22, 23 # r - t + vadduhm 25, 23, 22 # r + t + vsubuhm 31, 27, 28 # r - t + vadduhm 30, 28, 27 # r + t .endm .macro Write_One - stxvx 32+15, 3, 9 - stxvx 32+16, 3, 10 - stxvx 32+20, 3, 16 - stxvx 32+21, 3, 17 - stxvx 32+25, 3, 18 - stxvx 32+26, 3, 19 - stxvx 32+30, 3, 20 - stxvx 32+31, 3, 21 + stxvx 32+15, 3, 9 + stxvx 32+16, 3, 10 + stxvx 32+20, 3, 16 + stxvx 32+21, 3, 17 + stxvx 32+25, 3, 18 + stxvx 32+26, 3, 19 + stxvx 32+30, 3, 20 + stxvx 32+31, 3, 21 .endm .macro Write_Two - xxpermdi 32+17, 32+16, 32+15, 3 - xxpermdi 32+22, 32+21, 32+20, 3 - xxpermdi 32+27, 32+26, 32+25, 3 - xxpermdi 32+29, 32+31, 32+30, 3 - - stxvx 32+17, 3, 9 - stxvx 32+22, 3, 16 - stxvx 32+27, 3, 18 - stxvx 32+29, 3, 20 + xxpermdi 32+17, 32+16, 32+15, 3 + xxpermdi 32+22, 32+21, 32+20, 3 + xxpermdi 32+27, 32+26, 32+25, 3 + xxpermdi 32+29, 32+31, 32+30, 3 + + stxvx 32+17, 3, 9 + stxvx 32+22, 3, 16 + stxvx 32+27, 3, 18 + stxvx 32+29, 3, 20 .endm .macro Write_Three - xxmrglw 32+14, 32+16, 32+15 - xxmrghw 32+13, 32+16, 32+15 - xxpermdi 32+17, 32+13, 32+14, 3 - xxmrglw 32+19, 32+21, 32+20 - xxmrghw 32+18, 32+21, 32+20 - xxpermdi 32+22, 32+18, 32+19, 3 - xxmrglw 32+14, 32+26, 32+25 - xxmrghw 32+13, 32+26, 32+25 - xxpermdi 32+27, 32+13, 32+14, 3 - xxmrglw 32+24, 32+31, 32+30 - xxmrghw 32+23, 32+31, 32+30 - xxpermdi 32+29, 32+23, 32+24, 3 - stxvx 32+17, 3, 9 - stxvx 32+22, 3, 16 - stxvx 32+27, 3, 18 - stxvx 32+29, 3, 20 + xxmrglw 32+14, 32+16, 32+15 + xxmrghw 32+13, 32+16, 32+15 + xxpermdi 32+17, 32+13, 32+14, 3 + xxmrglw 32+19, 32+21, 32+20 + xxmrghw 32+18, 32+21, 32+20 + xxpermdi 32+22, 32+18, 32+19, 3 + xxmrglw 32+14, 32+26, 32+25 + xxmrghw 32+13, 32+26, 32+25 + xxpermdi 32+27, 32+13, 32+14, 3 + xxmrglw 32+24, 32+31, 32+30 + xxmrghw 32+23, 32+31, 32+30 + xxpermdi 32+29, 32+23, 32+24, 3 + stxvx 32+17, 3, 9 + stxvx 32+22, 3, 16 + stxvx 32+27, 3, 18 + stxvx 32+29, 3, 20 .endm .macro Load_next_4zetas - lxv 32+V_Z0, 0(14) - lxv 32+V_Z1, 16(14) - lxv 32+V_Z2, 32(14) - lxv 32+V_Z3, 48(14) - addi 14, 14, 64 + lxv 32+V_Z0, 0(14) + lxv 32+V_Z1, 16(14) + lxv 32+V_Z2, 32(14) + lxv 32+V_Z3, 48(14) + addi 14, 14, 64 .endm # @@ -160,241 +160,241 @@ .align 4 MLK_ASM_FN_SYMBOL(ntt_ppc) - stdu 1, -352(1) - mflr 0 - std 14, 56(1) - std 15, 64(1) - std 16, 72(1) - std 17, 80(1) - std 18, 88(1) - std 19, 96(1) - std 20, 104(1) - std 21, 112(1) - stxv 32+20, 128(1) - stxv 32+21, 144(1) - stxv 32+22, 160(1) - stxv 32+23, 176(1) - stxv 32+24, 192(1) - stxv 32+25, 208(1) - stxv 32+26, 224(1) - stxv 32+27, 240(1) - stxv 32+28, 256(1) - stxv 32+29, 272(1) - stxv 32+30, 288(1) - stxv 32+31, 304(1) - - # get MLKEM_Q - lvx V_NMKQ,0,4 - - # zetas array - addi 14, 4, ZETA_NTT_OFFSET - - vxor 3, 3, 3 - vspltish 4, 1 - - lxv 32+V_QINV, QINV_OFFSET(4) + stdu 1, -352(1) + mflr 0 + std 14, 56(1) + std 15, 64(1) + std 16, 72(1) + std 17, 80(1) + std 18, 88(1) + std 19, 96(1) + std 20, 104(1) + std 21, 112(1) + stxv 32+20, 128(1) + stxv 32+21, 144(1) + stxv 32+22, 160(1) + stxv 32+23, 176(1) + stxv 32+24, 192(1) + stxv 32+25, 208(1) + stxv 32+26, 224(1) + stxv 32+27, 240(1) + stxv 32+28, 256(1) + stxv 32+29, 272(1) + stxv 32+30, 288(1) + stxv 32+31, 304(1) + + # get MLKEM_Q + lvx V_NMKQ,0,4 + + # zetas array + addi 14, 4, ZETA_NTT_OFFSET + + vxor 3, 3, 3 + vspltish 4, 1 + + lxv 32+V_QINV, QINV_OFFSET(4) .align 4 #__Len128: - # - # Compute coefficients of the NTT 
based on the following loop. - # for (len = 128; len ≥ 2; len = len/2) - # - # 1. len = 128, start = 0 - # - li 5, 0 # start - li 7, 256 # len * 2 - lvx V_ZETA, 0, 14 - addi 14, 14, 16 - - MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA - Write_One - li 5, 64 - MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA - Write_One - li 5, 128 - MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA - Write_One - li 5, 192 - MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA - Write_One + # + # Compute coefficients of the NTT based on the following loop. + # for (len = 128; len ≥ 2; len = len/2) + # + # 1. len = 128, start = 0 + # + li 5, 0 # start + li 7, 256 # len * 2 + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + + MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + Write_One + li 5, 64 + MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + Write_One + li 5, 128 + MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + Write_One + li 5, 192 + MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + Write_One .align 4 #__Len64: - # - # 2. len = 64, start = 0, 128 - # k += 2 - li 5, 0 - li 7, 128 - lvx V_ZETA, 0, 14 - addi 14, 14, 16 - MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA - Write_One - li 5, 64 - MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA - Write_One - li 5, 256 - - lvx V_ZETA, 0, 14 - addi 14, 14, 16 - MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA - Write_One - li 5, 320 - MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA - Write_One + # + # 2. len = 64, start = 0, 128 + # k += 2 + li 5, 0 + li 7, 128 + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + Write_One + li 5, 64 + MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + Write_One + li 5, 256 + + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + Write_One + li 5, 320 + MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + Write_One .align 4 #__Len32: - # - # 3. len = 32, start = 0, 64, 128, 192 - # k += 4 - li 5, 0 - li 7, 64 - lvx V_ZETA, 0, 14 - addi 14, 14, 16 - MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA - Write_One - #li 5, 64 - li 5, 128 - - lvx V_ZETA, 0, 14 - addi 14, 14, 16 - MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA - Write_One - #li 5, 128 - li 5, 256 - - lvx V_ZETA, 0, 14 - addi 14, 14, 16 - MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA - Write_One - #li 5, 192 - li 5, 384 - - lvx V_ZETA, 0, 14 - addi 14, 14, 16 - MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA - Write_One + # + # 3. len = 32, start = 0, 64, 128, 192 + # k += 4 + li 5, 0 + li 7, 64 + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + Write_One + #li 5, 64 + li 5, 128 + + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + Write_One + #li 5, 128 + li 5, 256 + + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + Write_One + #li 5, 192 + li 5, 384 + + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + Write_One .align 4 #__Len16: - # - # 4. len = 16, start = 0, 32, 64,,...160, 192, 224 - # k += 8 - li 5, 0 - li 7, 32 - Load_next_4zetas - MREDUCE_4X 5, 64, 64, V_Z0, V_Z1, V_Z2, V_Z3 - Write_One - li 5, 16 - MREDUCE_4X 5, 64, 64, V_Z0, V_Z1, V_Z2, V_Z3 - Write_One - - Load_next_4zetas - li 5, 256 - MREDUCE_4X 5, 64, 64, V_Z0, V_Z1, V_Z2, V_Z3 - Write_One - li 5, 272 - MREDUCE_4X 5, 64, 64, V_Z0, V_Z1, V_Z2, V_Z3 - Write_One + # + # 4. 
len = 16, start = 0, 32, 64,,...160, 192, 224 + # k += 8 + li 5, 0 + li 7, 32 + Load_next_4zetas + MREDUCE_4X 5, 64, 64, V_Z0, V_Z1, V_Z2, V_Z3 + Write_One + li 5, 16 + MREDUCE_4X 5, 64, 64, V_Z0, V_Z1, V_Z2, V_Z3 + Write_One + + Load_next_4zetas + li 5, 256 + MREDUCE_4X 5, 64, 64, V_Z0, V_Z1, V_Z2, V_Z3 + Write_One + li 5, 272 + MREDUCE_4X 5, 64, 64, V_Z0, V_Z1, V_Z2, V_Z3 + Write_One .align 4 #__Len8: - # - # 5. len = 8, start = 0, 16, 32, 48,...208, 224, 240 - # k += 16 - li 5, 0 - li 7, 16 - Load_next_4zetas - MREDUCE_4X 5, 32, 32, V_Z0, V_Z1, V_Z2, V_Z3 - Write_One - li 5, 128 - - Load_next_4zetas - MREDUCE_4X 5, 32, 32, V_Z0, V_Z1, V_Z2, V_Z3 - Write_One - li 5, 256 - - Load_next_4zetas - MREDUCE_4X 5, 32, 32, V_Z0, V_Z1, V_Z2, V_Z3 - Write_One - li 5, 384 - - Load_next_4zetas - MREDUCE_4X 5, 32, 32, V_Z0, V_Z1, V_Z2, V_Z3 - Write_One - - # - # 6. len = 4, start = 0, 8, 16, 24,...232, 240, 248 - # k += 32 - li 15, 4 # loops - mtctr 15 - li 5, 0 - li 7, 8 + # + # 5. len = 8, start = 0, 16, 32, 48,...208, 224, 240 + # k += 16 + li 5, 0 + li 7, 16 + Load_next_4zetas + MREDUCE_4X 5, 32, 32, V_Z0, V_Z1, V_Z2, V_Z3 + Write_One + li 5, 128 + + Load_next_4zetas + MREDUCE_4X 5, 32, 32, V_Z0, V_Z1, V_Z2, V_Z3 + Write_One + li 5, 256 + + Load_next_4zetas + MREDUCE_4X 5, 32, 32, V_Z0, V_Z1, V_Z2, V_Z3 + Write_One + li 5, 384 + + Load_next_4zetas + MREDUCE_4X 5, 32, 32, V_Z0, V_Z1, V_Z2, V_Z3 + Write_One + + # + # 6. len = 4, start = 0, 8, 16, 24,...232, 240, 248 + # k += 32 + li 15, 4 # loops + mtctr 15 + li 5, 0 + li 7, 8 .align 4 ntt_ppc__Len4: - Load_next_4zetas - MREDUCE_4X 5, 16, 16, V_Z0, V_Z1, V_Z2, V_Z3 - Write_Two - addi 5, 5, 64 + Load_next_4zetas + MREDUCE_4X 5, 16, 16, V_Z0, V_Z1, V_Z2, V_Z3 + Write_Two + addi 5, 5, 64 - Load_next_4zetas - MREDUCE_4X 5, 16, 16, V_Z0, V_Z1, V_Z2, V_Z3 - Write_Two - addi 5, 5, 64 + Load_next_4zetas + MREDUCE_4X 5, 16, 16, V_Z0, V_Z1, V_Z2, V_Z3 + Write_Two + addi 5, 5, 64 - bdnz ntt_ppc__Len4 + bdnz ntt_ppc__Len4 - # - # 7. len = 2, start = 0, 4, 8, 12,...244, 248, 252 - # k += 64 - # Update zetas vectors, each vector has 2 zetas + # + # 7. 
len = 2, start = 0, 4, 8, 12,...244, 248, 252 + # k += 64 + # Update zetas vectors, each vector has 2 zetas - addi 14, 4, ZETA_NTT_OFFSET64 + addi 14, 4, ZETA_NTT_OFFSET64 - li 15, 4 - mtctr 15 - li 5, 0 - li 7, 4 + li 15, 4 + mtctr 15 + li 5, 0 + li 7, 4 .align 4 ntt_ppc__Len2: - Load_next_4zetas - MREDUCE_4X 5, 16, 16, V_Z0, V_Z1, V_Z2, V_Z3 - Write_Three - addi 5, 5, 64 - - Load_next_4zetas - MREDUCE_4X 5, 16, 16, V_Z0, V_Z1, V_Z2, V_Z3 - Write_Three - addi 5, 5, 64 - - bdnz ntt_ppc__Len2 - - lxv 32+20, 128(1) - lxv 32+21, 144(1) - lxv 32+22, 160(1) - lxv 32+23, 176(1) - lxv 32+24, 192(1) - lxv 32+25, 208(1) - lxv 32+26, 224(1) - lxv 32+27, 240(1) - lxv 32+28, 256(1) - lxv 32+29, 272(1) - lxv 32+30, 288(1) - lxv 32+31, 304(1) - ld 14, 56(1) - ld 15, 64(1) - ld 16, 72(1) - ld 16, 72(1) - ld 17, 80(1) - ld 18, 88(1) - ld 19, 96(1) - ld 20, 104(1) - ld 21, 112(1) - - mtlr 0 - addi 1, 1, 352 - blr + Load_next_4zetas + MREDUCE_4X 5, 16, 16, V_Z0, V_Z1, V_Z2, V_Z3 + Write_Three + addi 5, 5, 64 + + Load_next_4zetas + MREDUCE_4X 5, 16, 16, V_Z0, V_Z1, V_Z2, V_Z3 + Write_Three + addi 5, 5, 64 + + bdnz ntt_ppc__Len2 + + lxv 32+20, 128(1) + lxv 32+21, 144(1) + lxv 32+22, 160(1) + lxv 32+23, 176(1) + lxv 32+24, 192(1) + lxv 32+25, 208(1) + lxv 32+26, 224(1) + lxv 32+27, 240(1) + lxv 32+28, 256(1) + lxv 32+29, 272(1) + lxv 32+30, 288(1) + lxv 32+31, 304(1) + ld 14, 56(1) + ld 15, 64(1) + ld 16, 72(1) + ld 16, 72(1) + ld 17, 80(1) + ld 18, 88(1) + ld 19, 96(1) + ld 20, 104(1) + ld 21, 112(1) + + mtlr 0 + addi 1, 1, 352 + blr /* To facilitate single-compilation-unit (SCU) builds, undefine all macros. * Don't modify by hand -- this is auto-generated by scripts/autogen. */ diff --git a/mlkem/src/native/ppc64le/src/poly_tomont.S b/mlkem/src/native/ppc64le/src/poly_tomont.S index eb770a631..c664702db 100644 --- a/mlkem/src/native/ppc64le/src/poly_tomont.S +++ b/mlkem/src/native/ppc64le/src/poly_tomont.S @@ -38,116 +38,116 @@ # MREDUCE_4X(_v0, _v1, _v2, _v3) # .macro MREDUCE_4X _v0 _v1 _v2 _v3 - lxvd2x 32+13, 0, 3 - addi 3, 3, 16 - lxvd2x 32+18, 0, 3 - addi 3, 3, 16 - lxvd2x 32+23, 0, 3 - addi 3, 3, 16 - lxvd2x 32+7, 0, 3 - addi 3, 3, 16 - - vmladduhm 15, 13, V1353, 3 - vmladduhm 20, 18, V1353, 3 - vmladduhm 25, 23, V1353, 3 - vmladduhm 9, 7, V1353, 3 - - vmhraddshs 14, 13, V1353, 3 - vmhraddshs 19, 18, V1353, 3 - vmhraddshs 24, 23, V1353, 3 - vmhraddshs 8, 7, V1353, 3 - - vmladduhm 15, 15, V_QINV, 3 - vmladduhm 20, 20, V_QINV, 3 - vmladduhm 25, 25, V_QINV, 3 - vmladduhm 9, 9, V_QINV, 3 - - vmhraddshs 15, 15, V_NMKQ, 14 - vmhraddshs 20, 20, V_NMKQ, 19 - vmhraddshs 25, 25, V_NMKQ, 24 - vmhraddshs 9, 9, V_NMKQ, 8 - - vsrah \_v0, 15, 4 # >> 1 - vsrah \_v1, 20, 4 # >> 1 - vsrah \_v2, 25, 4 # >> 1 - vsrah \_v3, 9, 4 # >> 1 + lxvd2x 32+13, 0, 3 + addi 3, 3, 16 + lxvd2x 32+18, 0, 3 + addi 3, 3, 16 + lxvd2x 32+23, 0, 3 + addi 3, 3, 16 + lxvd2x 32+7, 0, 3 + addi 3, 3, 16 + + vmladduhm 15, 13, V1353, 3 + vmladduhm 20, 18, V1353, 3 + vmladduhm 25, 23, V1353, 3 + vmladduhm 9, 7, V1353, 3 + + vmhraddshs 14, 13, V1353, 3 + vmhraddshs 19, 18, V1353, 3 + vmhraddshs 24, 23, V1353, 3 + vmhraddshs 8, 7, V1353, 3 + + vmladduhm 15, 15, V_QINV, 3 + vmladduhm 20, 20, V_QINV, 3 + vmladduhm 25, 25, V_QINV, 3 + vmladduhm 9, 9, V_QINV, 3 + + vmhraddshs 15, 15, V_NMKQ, 14 + vmhraddshs 20, 20, V_NMKQ, 19 + vmhraddshs 25, 25, V_NMKQ, 24 + vmhraddshs 9, 9, V_NMKQ, 8 + + vsrah \_v0, 15, 4 # >> 1 + vsrah \_v1, 20, 4 # >> 1 + vsrah \_v2, 25, 4 # >> 1 + vsrah \_v3, 9, 4 # >> 1 .endm .macro Write_8X - stxvd2x 32+27, 4, 3 - stxvd2x 32+28, 5, 
3 - stxvd2x 32+29, 6, 3 - stxvd2x 32+30, 7, 3 - stxvd2x 32+13, 8, 3 - stxvd2x 32+18, 9, 3 - stxvd2x 32+23, 10, 3 - stxvd2x 32+7, 11, 3 + stxvd2x 32+27, 4, 3 + stxvd2x 32+28, 5, 3 + stxvd2x 32+29, 6, 3 + stxvd2x 32+30, 7, 3 + stxvd2x 32+13, 8, 3 + stxvd2x 32+18, 9, 3 + stxvd2x 32+23, 10, 3 + stxvd2x 32+7, 11, 3 .endm .align 4 .globl MLK_ASM_NAMESPACE(poly_tomont_ppc) MLK_ASM_FN_SYMBOL(poly_tomont_ppc) - stdu 1, -320(1) - mflr 0 - - stxv 32+20, 128(1) - stxv 32+21, 144(1) - stxv 32+22, 160(1) - stxv 32+23, 176(1) - stxv 32+24, 192(1) - stxv 32+25, 208(1) - stxv 32+26, 224(1) - stxv 32+27, 240(1) - stxv 32+28, 256(1) - stxv 32+29, 272(1) - stxv 32+30, 288(1) - - lxv 32+V_NMKQ, NQ_OFFSET(4) - lxv 32+V_QINV, QINV_OFFSET(4) - lxv 32+V1353, C1353_OFFSET(4) - - vxor 3, 3, 3 - vspltish 4, 1 - - li 4, -128 - li 5, -112 - li 6, -96 - li 7, -80 - li 8, -64 - li 9, -48 - li 10, -32 - li 11, -16 - - MREDUCE_4X 27, 28, 29, 30 - MREDUCE_4X 13, 18, 23, 7 - Write_8X - - MREDUCE_4X 27, 28, 29, 30 - MREDUCE_4X 13, 18, 23, 7 - Write_8X - - MREDUCE_4X 27, 28, 29, 30 - MREDUCE_4X 13, 18, 23, 7 - Write_8X - - MREDUCE_4X 27, 28, 29, 30 - MREDUCE_4X 13, 18, 23, 7 - Write_8X - - lxv 32+20, 128(1) - lxv 32+21, 144(1) - lxv 32+22, 160(1) - lxv 32+23, 176(1) - lxv 32+24, 192(1) - lxv 32+25, 208(1) - lxv 32+26, 224(1) - lxv 32+27, 240(1) - lxv 32+28, 256(1) - lxv 32+29, 272(1) - lxv 32+30, 288(1) - mtlr 0 - addi 1, 1, 320 - blr + stdu 1, -320(1) + mflr 0 + + stxv 32+20, 128(1) + stxv 32+21, 144(1) + stxv 32+22, 160(1) + stxv 32+23, 176(1) + stxv 32+24, 192(1) + stxv 32+25, 208(1) + stxv 32+26, 224(1) + stxv 32+27, 240(1) + stxv 32+28, 256(1) + stxv 32+29, 272(1) + stxv 32+30, 288(1) + + lxv 32+V_NMKQ, NQ_OFFSET(4) + lxv 32+V_QINV, QINV_OFFSET(4) + lxv 32+V1353, C1353_OFFSET(4) + + vxor 3, 3, 3 + vspltish 4, 1 + + li 4, -128 + li 5, -112 + li 6, -96 + li 7, -80 + li 8, -64 + li 9, -48 + li 10, -32 + li 11, -16 + + MREDUCE_4X 27, 28, 29, 30 + MREDUCE_4X 13, 18, 23, 7 + Write_8X + + MREDUCE_4X 27, 28, 29, 30 + MREDUCE_4X 13, 18, 23, 7 + Write_8X + + MREDUCE_4X 27, 28, 29, 30 + MREDUCE_4X 13, 18, 23, 7 + Write_8X + + MREDUCE_4X 27, 28, 29, 30 + MREDUCE_4X 13, 18, 23, 7 + Write_8X + + lxv 32+20, 128(1) + lxv 32+21, 144(1) + lxv 32+22, 160(1) + lxv 32+23, 176(1) + lxv 32+24, 192(1) + lxv 32+25, 208(1) + lxv 32+26, 224(1) + lxv 32+27, 240(1) + lxv 32+28, 256(1) + lxv 32+29, 272(1) + lxv 32+30, 288(1) + mtlr 0 + addi 1, 1, 320 + blr /* To facilitate single-compilation-unit (SCU) builds, undefine all macros. * Don't modify by hand -- this is auto-generated by scripts/autogen. 
*/ diff --git a/mlkem/src/native/ppc64le/src/reduce.S b/mlkem/src/native/ppc64le/src/reduce.S index 558410955..f6860f33b 100644 --- a/mlkem/src/native/ppc64le/src/reduce.S +++ b/mlkem/src/native/ppc64le/src/reduce.S @@ -33,182 +33,182 @@ .text .macro BREDUCE_4X _v0 _v1 _v2 _v3 - lxvd2x 32+8, 0, 3 - lxvd2x 32+12, 14, 3 - lxvd2x 32+16, 15, 3 - lxvd2x 32+20, 16, 3 - addi 3, 3, 64 - vmulosh 6, 8, V20159 - vmulesh 5, 8, V20159 - vmulosh 11, 12, V20159 - vmulesh 10, 12, V20159 - vmulosh 15, 16, V20159 - vmulesh 14, 16, V20159 - vmulosh 19, 20, V20159 - vmulesh 18, 20, V20159 - xxmrglw 32+4, 32+5, 32+6 - xxmrghw 32+5, 32+5, 32+6 - xxmrglw 32+9, 32+10, 32+11 - xxmrghw 32+10, 32+10, 32+11 - xxmrglw 32+13, 32+14, 32+15 - xxmrghw 32+14, 32+14, 32+15 - xxmrglw 32+17, 32+18, 32+19 - xxmrghw 32+18, 32+18, 32+19 - vadduwm 4, 4, V_25 - vadduwm 5, 5, V_25 - vadduwm 9, 9, V_25 - vadduwm 10, 10, V_25 - vadduwm 13, 13, V_25 - vadduwm 14, 14, V_25 - vadduwm 17, 17, V_25 - vadduwm 18, 18, V_25 - vsraw 4, 4, V_26 - vsraw 5, 5, V_26 - vsraw 9, 9, V_26 - vsraw 10, 10, V_26 - vsraw 13, 13, V_26 - vsraw 14, 14, V_26 - vsraw 17, 17, V_26 - vsraw 18, 18, V_26 - vpkuwum 4, 5, 4 - vsubuhm 4, 7, 4 - vpkuwum 9, 10, 9 - vsubuhm 9, 7, 9 - vpkuwum 13, 14, 13 - vsubuhm 13, 7, 13 - vpkuwum 17, 18, 17 - vsubuhm 17, 7, 17 - vmladduhm \_v0, 4, V_MKQ, 8 - vmladduhm \_v1, 9, V_MKQ, 12 - vmladduhm \_v2, 13, V_MKQ, 16 - vmladduhm \_v3, 17, V_MKQ, 20 + lxvd2x 32+8, 0, 3 + lxvd2x 32+12, 14, 3 + lxvd2x 32+16, 15, 3 + lxvd2x 32+20, 16, 3 + addi 3, 3, 64 + vmulosh 6, 8, V20159 + vmulesh 5, 8, V20159 + vmulosh 11, 12, V20159 + vmulesh 10, 12, V20159 + vmulosh 15, 16, V20159 + vmulesh 14, 16, V20159 + vmulosh 19, 20, V20159 + vmulesh 18, 20, V20159 + xxmrglw 32+4, 32+5, 32+6 + xxmrghw 32+5, 32+5, 32+6 + xxmrglw 32+9, 32+10, 32+11 + xxmrghw 32+10, 32+10, 32+11 + xxmrglw 32+13, 32+14, 32+15 + xxmrghw 32+14, 32+14, 32+15 + xxmrglw 32+17, 32+18, 32+19 + xxmrghw 32+18, 32+18, 32+19 + vadduwm 4, 4, V_25 + vadduwm 5, 5, V_25 + vadduwm 9, 9, V_25 + vadduwm 10, 10, V_25 + vadduwm 13, 13, V_25 + vadduwm 14, 14, V_25 + vadduwm 17, 17, V_25 + vadduwm 18, 18, V_25 + vsraw 4, 4, V_26 + vsraw 5, 5, V_26 + vsraw 9, 9, V_26 + vsraw 10, 10, V_26 + vsraw 13, 13, V_26 + vsraw 14, 14, V_26 + vsraw 17, 17, V_26 + vsraw 18, 18, V_26 + vpkuwum 4, 5, 4 + vsubuhm 4, 7, 4 + vpkuwum 9, 10, 9 + vsubuhm 9, 7, 9 + vpkuwum 13, 14, 13 + vsubuhm 13, 7, 13 + vpkuwum 17, 18, 17 + vsubuhm 17, 7, 17 + vmladduhm \_v0, 4, V_MKQ, 8 + vmladduhm \_v1, 9, V_MKQ, 12 + vmladduhm \_v2, 13, V_MKQ, 16 + vmladduhm \_v3, 17, V_MKQ, 20 .endm .macro Write_8X - stxvd2x 32+21, 4, 3 - stxvd2x 32+22, 5, 3 - stxvd2x 32+23, 6, 3 - stxvd2x 32+24, 7, 3 - stxvd2x 32+4, 8, 3 - stxvd2x 32+9, 9, 3 - stxvd2x 32+13, 10, 3 - stxvd2x 32+17, 11, 3 + stxvd2x 32+21, 4, 3 + stxvd2x 32+22, 5, 3 + stxvd2x 32+23, 6, 3 + stxvd2x 32+24, 7, 3 + stxvd2x 32+4, 8, 3 + stxvd2x 32+9, 9, 3 + stxvd2x 32+13, 10, 3 + stxvd2x 32+17, 11, 3 .endm # # Conditional addition to get unsigned canonical representative # .macro To_unsigned_16 - lxv 32+12, 0(3) - lxv 32+13, 16(3) - lxv 32+14, 32(3) - lxv 32+15, 48(3) - addi 3, 3, 64 - vsrh 1, 12, 10 - vsrh 0, 13, 10 - vsrh 3, 14, 10 - vsrh 2, 15, 10 - vadduhm 7, 12, 11 - vadduhm 8, 13, 11 - vadduhm 5, 14, 11 - vadduhm 6, 15, 11 - vcmpequh 1, 1, 9 - vcmpequh 0, 0, 9 - vcmpequh 3, 3, 9 - vcmpequh 2, 2, 9 - xxsel 32+1, 32+7,32+12, 32+1 - xxsel 32+0, 32+8,32+13, 32+0 - xxsel 32+3, 32+5,32+14, 32+3 - xxsel 32+2, 32+6,32+15, 32+2 - stxv 32+3, -32(3) - stxv 32+2, -16(3) - stxv 32+1, -64(3) - 
stxv 32+0, -48(3) + lxv 32+12, 0(3) + lxv 32+13, 16(3) + lxv 32+14, 32(3) + lxv 32+15, 48(3) + addi 3, 3, 64 + vsrh 1, 12, 10 + vsrh 0, 13, 10 + vsrh 3, 14, 10 + vsrh 2, 15, 10 + vadduhm 7, 12, 11 + vadduhm 8, 13, 11 + vadduhm 5, 14, 11 + vadduhm 6, 15, 11 + vcmpequh 1, 1, 9 + vcmpequh 0, 0, 9 + vcmpequh 3, 3, 9 + vcmpequh 2, 2, 9 + xxsel 32+1, 32+7,32+12, 32+1 + xxsel 32+0, 32+8,32+13, 32+0 + xxsel 32+3, 32+5,32+14, 32+3 + xxsel 32+2, 32+6,32+15, 32+2 + stxv 32+3, -32(3) + stxv 32+2, -16(3) + stxv 32+1, -64(3) + stxv 32+0, -48(3) .endm .align 4 .globl MLK_ASM_NAMESPACE(reduce_ppc) MLK_ASM_FN_SYMBOL(reduce_ppc) - stdu 1, -224(1) - mflr 0 - std 14, 96(1) - std 15, 104(1) - std 16, 112(1) - stxv 32+20, 128(1) - stxv 32+21, 144(1) - stxv 32+22, 160(1) - stxv 32+23, 176(1) - stxv 32+24, 192(1) - - vxor 7, 7, 7 - - lxv 32+V_MKQ, Q_OFFSET(4) - lxv 32+V20159, C20159_OFFSET(4) - - vspltisw V_26, 13 - vadduwm V_26, V_26, V_26 - vspltisw 4, 1 - vsubuwm 5, V_26, 4 - vslw V_25, 4, 5 - - li 4, -128 - li 5, -112 - li 6, -96 - li 7, -80 - li 8, -64 - li 9, -48 - li 10, -32 - li 11, -16 - - li 14, 16 - li 15, 32 - li 16, 48 - - BREDUCE_4X 21, 22, 23, 24 - BREDUCE_4X 4, 9, 13, 17 - Write_8X - - BREDUCE_4X 21, 22, 23, 24 - BREDUCE_4X 4, 9, 13, 17 - Write_8X - - BREDUCE_4X 21, 22, 23, 24 - BREDUCE_4X 4, 9, 13, 17 - Write_8X - - BREDUCE_4X 21, 22, 23, 24 - BREDUCE_4X 4, 9, 13, 17 - Write_8X - - # - # To unsigned canonical - # + stdu 1, -224(1) + mflr 0 + std 14, 96(1) + std 15, 104(1) + std 16, 112(1) + stxv 32+20, 128(1) + stxv 32+21, 144(1) + stxv 32+22, 160(1) + stxv 32+23, 176(1) + stxv 32+24, 192(1) + + vxor 7, 7, 7 + + lxv 32+V_MKQ, Q_OFFSET(4) + lxv 32+V20159, C20159_OFFSET(4) + + vspltisw V_26, 13 + vadduwm V_26, V_26, V_26 + vspltisw 4, 1 + vsubuwm 5, V_26, 4 + vslw V_25, 4, 5 + + li 4, -128 + li 5, -112 + li 6, -96 + li 7, -80 + li 8, -64 + li 9, -48 + li 10, -32 + li 11, -16 + + li 14, 16 + li 15, 32 + li 16, 48 + + BREDUCE_4X 21, 22, 23, 24 + BREDUCE_4X 4, 9, 13, 17 + Write_8X + + BREDUCE_4X 21, 22, 23, 24 + BREDUCE_4X 4, 9, 13, 17 + Write_8X + + BREDUCE_4X 21, 22, 23, 24 + BREDUCE_4X 4, 9, 13, 17 + Write_8X + + BREDUCE_4X 21, 22, 23, 24 + BREDUCE_4X 4, 9, 13, 17 + Write_8X + + # + # To unsigned canonical + # .align 4 - addi 3, 3, -512 - xxspltib 32+9 ,0 - vspltish 10, 15 - vmr 11, V_MKQ - - To_unsigned_16 - To_unsigned_16 - To_unsigned_16 - To_unsigned_16 - To_unsigned_16 - To_unsigned_16 - To_unsigned_16 - To_unsigned_16 - - ld 14, 96(1) - ld 15, 104(1) - ld 16, 112(1) - lxv 32+20, 128(1) - lxv 32+21, 144(1) - lxv 32+22, 160(1) - lxv 32+23, 176(1) - lxv 32+24, 192(1) - mtlr 0 - addi 1, 1, 224 - blr + addi 3, 3, -512 + xxspltib 32+9 ,0 + vspltish 10, 15 + vmr 11, V_MKQ + + To_unsigned_16 + To_unsigned_16 + To_unsigned_16 + To_unsigned_16 + To_unsigned_16 + To_unsigned_16 + To_unsigned_16 + To_unsigned_16 + + ld 14, 96(1) + ld 15, 104(1) + ld 16, 112(1) + lxv 32+20, 128(1) + lxv 32+21, 144(1) + lxv 32+22, 160(1) + lxv 32+23, 176(1) + lxv 32+24, 192(1) + mtlr 0 + addi 1, 1, 224 + blr /* To facilitate single-compilation-unit (SCU) builds, undefine all macros. * Don't modify by hand -- this is auto-generated by scripts/autogen. 
*/ From 7f188acd447f5a781e24c6acf56e18bebd3edcf3 Mon Sep 17 00:00:00 2001 From: Hanno Becker Date: Sat, 20 Sep 2025 08:23:09 +0100 Subject: [PATCH 4/6] Various minor adjustments to PPC64 backend setup Signed-off-by: Hanno Becker --- dev/ppc64le/meta.h | 2 +- dev/ppc64le/src/consts.c | 202 ++++++++++---------------------- dev/ppc64le/src/consts.h | 3 + dev/ppc64le/src/consts_intt.inc | 90 ++++++++++++++ dev/ppc64le/src/consts_ntt.inc | 45 +++++++ dev/ppc64le/src/intt_ppc.S | 164 ++++++++++++-------------- dev/ppc64le/src/ntt_ppc.S | 160 ++++++++++++------------- dev/ppc64le/src/poly_tomont.S | 58 +++++---- dev/ppc64le/src/reduce.S | 52 ++++---- 9 files changed, 402 insertions(+), 374 deletions(-) create mode 100644 dev/ppc64le/src/consts_intt.inc create mode 100644 dev/ppc64le/src/consts_ntt.inc diff --git a/dev/ppc64le/meta.h b/dev/ppc64le/meta.h index 34f8cbec6..8fec0c2ad 100644 --- a/dev/ppc64le/meta.h +++ b/dev/ppc64le/meta.h @@ -8,7 +8,7 @@ /* Identifier for this backend so that source and assembly files * in the build can be appropriately guarded. */ -#define MLK_ARITH_BACKEND_PPC64LE_DEFAULT +#define MLK_ARITH_BACKEND_PPC64LE #define MLK_ARITH_BACKEND_NAME PPC64LE_DEFAULT diff --git a/dev/ppc64le/src/consts.c b/dev/ppc64le/src/consts.c index 4c2fbdf61..c9c869a60 100644 --- a/dev/ppc64le/src/consts.c +++ b/dev/ppc64le/src/consts.c @@ -5,151 +5,73 @@ #include "../../../common.h" -#if defined(MLK_ARITH_BACKEND_PPC64LE_DEFAULT) && \ +#if defined(MLK_ARITH_BACKEND_PPC64LE) && \ !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED) #include "consts.h" -MLK_ALIGN const int16_t mlk_ppc_qdata[1568] = { +MLK_ALIGN const int16_t mlk_ppc_qdata[] = { /* -Q */ - -3329, -3329, -3329, -3329, -3329, -3329, -3329, -3329, + /* check-magic: -3329 == -1 * MLKEM_Q */ + -3329, + -3329, + -3329, + -3329, + -3329, + -3329, + -3329, + -3329, /* QINV */ - -3327, -3327, -3327, -3327, -3327, -3327, -3327, -3327, + /* check-magic: -3327 == pow(MLKEM_Q,-1,2^16) */ + -3327, + -3327, + -3327, + -3327, + -3327, + -3327, + -3327, + -3327, /* Q */ - 3329, 3329, 3329, 3329, 3329, 3329, 3329, 3329, - /* const 20159 for reduce.S and intt */ - 20159, 20159, 20159, 20159, 20159, 20159, 20159, 20159, - /* const 1441 for intt */ - 1441, 1441, 1441, 1441, 1441, 1441, 1441, 1441, - /* for poly_tomont.S */ - 1353, 1353, 1353, 1353, 1353, 1353, 1353, 1353, - /* zetas */ - /* For ntt Len=128, offset 96 */ - -758, -758, -758, -758, -758, -758, -758, -758, -359, -359, -359, -359, - -359, -359, -359, -359, -1517, -1517, -1517, -1517, -1517, -1517, -1517, - -1517, 1493, 1493, 1493, 1493, 1493, 1493, 1493, 1493, 1422, 1422, 1422, - 1422, 1422, 1422, 1422, 1422, 287, 287, 287, 287, 287, 287, 287, 287, 202, - 202, 202, 202, 202, 202, 202, 202, -171, -171, -171, -171, -171, -171, -171, - -171, 622, 622, 622, 622, 622, 622, 622, 622, 1577, 1577, 1577, 1577, 1577, - 1577, 1577, 1577, 182, 182, 182, 182, 182, 182, 182, 182, 962, 962, 962, - 962, 962, 962, 962, 962, -1202, -1202, -1202, -1202, -1202, -1202, -1202, - -1202, -1474, -1474, -1474, -1474, -1474, -1474, -1474, -1474, 1468, 1468, - 1468, 1468, 1468, 1468, 1468, 1468, 573, 573, 573, 573, 573, 573, 573, 573, - -1325, -1325, -1325, -1325, -1325, -1325, -1325, -1325, 264, 264, 264, 264, - 264, 264, 264, 264, 383, 383, 383, 383, 383, 383, 383, 383, -829, -829, - -829, -829, -829, -829, -829, -829, 1458, 1458, 1458, 1458, 1458, 1458, - 1458, 1458, -1602, -1602, -1602, -1602, -1602, -1602, -1602, -1602, -130, - -130, -130, -130, -130, -130, -130, -130, -681, -681, -681, -681, -681, - -681, -681, -681, 
1017, 1017, 1017, 1017, 1017, 1017, 1017, 1017, 732, 732, - 732, 732, 732, 732, 732, 732, 608, 608, 608, 608, 608, 608, 608, 608, -1542, - -1542, -1542, -1542, -1542, -1542, -1542, -1542, 411, 411, 411, 411, 411, - 411, 411, 411, -205, -205, -205, -205, -205, -205, -205, -205, -1571, -1571, - -1571, -1571, -1571, -1571, -1571, -1571, 1223, 1223, 1223, 1223, 1223, - 1223, 1223, 1223, 652, 652, 652, 652, 652, 652, 652, 652, -552, -552, -552, - -552, -552, -552, -552, -552, 1015, 1015, 1015, 1015, 1015, 1015, 1015, - 1015, -1293, -1293, -1293, -1293, -1293, -1293, -1293, -1293, 1491, 1491, - 1491, 1491, 1491, 1491, 1491, 1491, -282, -282, -282, -282, -282, -282, - -282, -282, -1544, -1544, -1544, -1544, -1544, -1544, -1544, -1544, 516, - 516, 516, 516, 516, 516, 516, 516, -8, -8, -8, -8, -8, -8, -8, -8, -320, - -320, -320, -320, -320, -320, -320, -320, -666, -666, -666, -666, -666, - -666, -666, -666, -1618, -1618, -1618, -1618, -1618, -1618, -1618, -1618, - -1162, -1162, -1162, -1162, -1162, -1162, -1162, -1162, 126, 126, 126, 126, - 126, 126, 126, 126, 1469, 1469, 1469, 1469, 1469, 1469, 1469, 1469, -853, - -853, -853, -853, -853, -853, -853, -853, -90, -90, -90, -90, -90, -90, -90, - -90, -271, -271, -271, -271, -271, -271, -271, -271, 830, 830, 830, 830, - 830, 830, 830, 830, 107, 107, 107, 107, 107, 107, 107, 107, -1421, -1421, - -1421, -1421, -1421, -1421, -1421, -1421, -247, -247, -247, -247, -247, - -247, -247, -247, -951, -951, -951, -951, -951, -951, -951, -951, -398, - -398, -398, -398, -398, -398, -398, -398, 961, 961, 961, 961, 961, 961, 961, - 961, -1508, -1508, -1508, -1508, -1508, -1508, -1508, -1508, -725, -725, - -725, -725, -725, -725, -725, -725, 448, 448, 448, 448, 448, 448, 448, 448, - -1065, -1065, -1065, -1065, -1065, -1065, -1065, -1065, 677, 677, 677, 677, - 677, 677, 677, 677, -1275, -1275, -1275, -1275, -1275, -1275, -1275, -1275, - /* For intt Len=2, offset IZETA_NTT_OFFSET64 */ - -1103, -1103, -1103, -1103, 430, 430, 430, 430, 555, 555, 555, 555, 843, - 843, 843, 843, -1251, -1251, -1251, -1251, 871, 871, 871, 871, 1550, 1550, - 1550, 1550, 105, 105, 105, 105, 422, 422, 422, 422, 587, 587, 587, 587, 177, - 177, 177, 177, -235, -235, -235, -235, -291, -291, -291, -291, -460, -460, - -460, -460, 1574, 1574, 1574, 1574, 1653, 1653, 1653, 1653, -246, -246, - -246, -246, 778, 778, 778, 778, 1159, 1159, 1159, 1159, -147, -147, -147, - -147, -777, -777, -777, -777, 1483, 1483, 1483, 1483, -602, -602, -602, - -602, 1119, 1119, 1119, 1119, -1590, -1590, -1590, -1590, 644, 644, 644, - 644, -872, -872, -872, -872, 349, 349, 349, 349, 418, 418, 418, 418, 329, - 329, 329, 329, -156, -156, -156, -156, -75, -75, -75, -75, 817, 817, 817, - 817, 1097, 1097, 1097, 1097, 603, 603, 603, 603, 610, 610, 610, 610, 1322, - 1322, 1322, 1322, -1285, -1285, -1285, -1285, -1465, -1465, -1465, -1465, - 384, 384, 384, 384, -1215, -1215, -1215, -1215, -136, -136, -136, -136, - 1218, 1218, 1218, 1218, -1335, -1335, -1335, -1335, -874, -874, -874, -874, - 220, 220, 220, 220, -1187, -1187, -1187, -1187, -1659, -1659, -1659, -1659, - -1185, -1185, -1185, -1185, -1530, -1530, -1530, -1530, -1278, -1278, -1278, - -1278, 794, 794, 794, 794, -1510, -1510, -1510, -1510, -854, -854, -854, - -854, -870, -870, -870, -870, 478, 478, 478, 478, -108, -108, -108, -108, - -308, -308, -308, -308, 996, 996, 996, 996, 991, 991, 991, 991, 958, 958, - 958, 958, -1460, -1460, -1460, -1460, 1522, 1522, 1522, 1522, 1628, 1628, - 1628, 1628, - /* For intt Len=2, offset IZETA_NTT_OFFSET127 */ - 1628, 1628, 1628, 1628, 
1522, 1522, 1522, 1522, -1460, -1460, -1460, -1460, - 958, 958, 958, 958, 991, 991, 991, 991, 996, 996, 996, 996, -308, -308, - -308, -308, -108, -108, -108, -108, 478, 478, 478, 478, -870, -870, -870, - -870, -854, -854, -854, -854, -1510, -1510, -1510, -1510, 794, 794, 794, - 794, -1278, -1278, -1278, -1278, -1530, -1530, -1530, -1530, -1185, -1185, - -1185, -1185, -1659, -1659, -1659, -1659, -1187, -1187, -1187, -1187, 220, - 220, 220, 220, -874, -874, -874, -874, -1335, -1335, -1335, -1335, 1218, - 1218, 1218, 1218, -136, -136, -136, -136, -1215, -1215, -1215, -1215, 384, - 384, 384, 384, -1465, -1465, -1465, -1465, -1285, -1285, -1285, -1285, 1322, - 1322, 1322, 1322, 610, 610, 610, 610, 603, 603, 603, 603, 1097, 1097, 1097, - 1097, 817, 817, 817, 817, -75, -75, -75, -75, -156, -156, -156, -156, 329, - 329, 329, 329, 418, 418, 418, 418, 349, 349, 349, 349, -872, -872, -872, - -872, 644, 644, 644, 644, -1590, -1590, -1590, -1590, 1119, 1119, 1119, - 1119, -602, -602, -602, -602, 1483, 1483, 1483, 1483, -777, -777, -777, - -777, -147, -147, -147, -147, 1159, 1159, 1159, 1159, 778, 778, 778, 778, - -246, -246, -246, -246, 1653, 1653, 1653, 1653, 1574, 1574, 1574, 1574, - -460, -460, -460, -460, -291, -291, -291, -291, -235, -235, -235, -235, 177, - 177, 177, 177, 587, 587, 587, 587, 422, 422, 422, 422, 105, 105, 105, 105, - 1550, 1550, 1550, 1550, 871, 871, 871, 871, -1251, -1251, -1251, -1251, 843, - 843, 843, 843, 555, 555, 555, 555, 430, 430, 430, 430, -1103, -1103, -1103, - -1103, - /* For intt Len=4 and others, offset IZETA_NTT_OFFSET63 */ - -1275, -1275, -1275, -1275, -1275, -1275, -1275, -1275, 677, 677, 677, 677, - 677, 677, 677, 677, -1065, -1065, -1065, -1065, -1065, -1065, -1065, -1065, - 448, 448, 448, 448, 448, 448, 448, 448, -725, -725, -725, -725, -725, -725, - -725, -725, -1508, -1508, -1508, -1508, -1508, -1508, -1508, -1508, 961, - 961, 961, 961, 961, 961, 961, 961, -398, -398, -398, -398, -398, -398, -398, - -398, -951, -951, -951, -951, -951, -951, -951, -951, -247, -247, -247, - -247, -247, -247, -247, -247, -1421, -1421, -1421, -1421, -1421, -1421, - -1421, -1421, 107, 107, 107, 107, 107, 107, 107, 107, 830, 830, 830, 830, - 830, 830, 830, 830, -271, -271, -271, -271, -271, -271, -271, -271, -90, - -90, -90, -90, -90, -90, -90, -90, -853, -853, -853, -853, -853, -853, -853, - -853, 1469, 1469, 1469, 1469, 1469, 1469, 1469, 1469, 126, 126, 126, 126, - 126, 126, 126, 126, -1162, -1162, -1162, -1162, -1162, -1162, -1162, -1162, - -1618, -1618, -1618, -1618, -1618, -1618, -1618, -1618, -666, -666, -666, - -666, -666, -666, -666, -666, -320, -320, -320, -320, -320, -320, -320, - -320, -8, -8, -8, -8, -8, -8, -8, -8, 516, 516, 516, 516, 516, 516, 516, - 516, -1544, -1544, -1544, -1544, -1544, -1544, -1544, -1544, -282, -282, - -282, -282, -282, -282, -282, -282, 1491, 1491, 1491, 1491, 1491, 1491, - 1491, 1491, -1293, -1293, -1293, -1293, -1293, -1293, -1293, -1293, 1015, - 1015, 1015, 1015, 1015, 1015, 1015, 1015, -552, -552, -552, -552, -552, - -552, -552, -552, 652, 652, 652, 652, 652, 652, 652, 652, 1223, 1223, 1223, - 1223, 1223, 1223, 1223, 1223, -1571, -1571, -1571, -1571, -1571, -1571, - -1571, -1571, -205, -205, -205, -205, -205, -205, -205, -205, 411, 411, 411, - 411, 411, 411, 411, 411, -1542, -1542, -1542, -1542, -1542, -1542, -1542, - -1542, 608, 608, 608, 608, 608, 608, 608, 608, 732, 732, 732, 732, 732, 732, - 732, 732, 1017, 1017, 1017, 1017, 1017, 1017, 1017, 1017, -681, -681, -681, - -681, -681, -681, -681, -681, -130, -130, -130, -130, -130, -130, -130, 
- -130, -1602, -1602, -1602, -1602, -1602, -1602, -1602, -1602, 1458, 1458, - 1458, 1458, 1458, 1458, 1458, 1458, -829, -829, -829, -829, -829, -829, - -829, -829, 383, 383, 383, 383, 383, 383, 383, 383, 264, 264, 264, 264, 264, - 264, 264, 264, -1325, -1325, -1325, -1325, -1325, -1325, -1325, -1325, 573, - 573, 573, 573, 573, 573, 573, 573, 1468, 1468, 1468, 1468, 1468, 1468, 1468, - 1468, -1474, -1474, -1474, -1474, -1474, -1474, -1474, -1474, -1202, -1202, - -1202, -1202, -1202, -1202, -1202, -1202, 962, 962, 962, 962, 962, 962, 962, - 962, 182, 182, 182, 182, 182, 182, 182, 182, 1577, 1577, 1577, 1577, 1577, - 1577, 1577, 1577, 622, 622, 622, 622, 622, 622, 622, 622, -171, -171, -171, - -171, -171, -171, -171, -171, 202, 202, 202, 202, 202, 202, 202, 202, 287, - 287, 287, 287, 287, 287, 287, 287, 1422, 1422, 1422, 1422, 1422, 1422, 1422, - 1422, 1493, 1493, 1493, 1493, 1493, 1493, 1493, 1493, -1517, -1517, -1517, - -1517, -1517, -1517, -1517, -1517, -359, -359, -359, -359, -359, -359, -359, - -359, -758, -758, -758, -758, -758, -758, -758, -758}; + 3329, + 3329, + 3329, + 3329, + 3329, + 3329, + 3329, + 3329, + /* check-magic: 20159 == round(2^26 / MLKEM_Q) */ + 20159, + 20159, + 20159, + 20159, + 20159, + 20159, + 20159, + 20159, + /* check-magic: 1441 == pow(2,32-7,MLKEM_Q) */ + 1441, + 1441, + 1441, + 1441, + 1441, + 1441, + 1441, + 1441, + /* check-magic: 1353 == pow(2, 32, MLKEM_Q) */ + 1353, + 1353, + 1353, + 1353, + 1353, + 1353, + 1353, + 1353, +/* zetas for NTT */ +#include "consts_ntt.inc" + , +/* zetas for invNTT */ +#include "consts_intt.inc" +}; -#endif /* MLK_ARITH_BACKEND_PPC64LE_DEFAULT && \ - !MLK_CONFIG_MULTILEVEL_NO_SHARED */ +#endif /* MLK_ARITH_BACKEND_PPC64LE && !MLK_CONFIG_MULTILEVEL_NO_SHARED */ diff --git a/dev/ppc64le/src/consts.h b/dev/ppc64le/src/consts.h index d424601ac..59de765cf 100644 --- a/dev/ppc64le/src/consts.h +++ b/dev/ppc64le/src/consts.h @@ -7,6 +7,8 @@ #define MLK_DEV_PPC64LE_SRC_CONSTS_H #include "../../../common.h" +/* Offsets into the constant table */ +/* check-magic: off */ #define NQ_OFFSET 0 #define QINV_OFFSET 16 #define Q_OFFSET 32 @@ -17,6 +19,7 @@ #define ZETA_NTT_OFFSET64 1104 #define IZETA_NTT_OFFSET127 1616 #define IZETA_NTT_OFFSET63 2128 +/* check-magic: on */ #ifndef __ASSEMBLER__ #define mlk_ppc_qdata MLK_NAMESPACE(ppc_qdata) diff --git a/dev/ppc64le/src/consts_intt.inc b/dev/ppc64le/src/consts_intt.inc new file mode 100644 index 000000000..7cd95fcd0 --- /dev/null +++ b/dev/ppc64le/src/consts_intt.inc @@ -0,0 +1,90 @@ +/* + * Copyright (c) The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + + /* For intt Len=2, offset IZETA_NTT_OFFSET64 */ + -1103, -1103, -1103, -1103, 430, 430, 430, 430, 555, 555, 555, 555, 843, + 843, 843, 843, -1251, -1251, -1251, -1251, 871, 871, 871, 871, 1550, 1550, + 1550, 1550, 105, 105, 105, 105, 422, 422, 422, 422, 587, 587, 587, 587, 177, + 177, 177, 177, -235, -235, -235, -235, -291, -291, -291, -291, -460, -460, + -460, -460, 1574, 1574, 1574, 1574, 1653, 1653, 1653, 1653, -246, -246, + -246, -246, 778, 778, 778, 778, 1159, 1159, 1159, 1159, -147, -147, -147, + -147, -777, -777, -777, -777, 1483, 1483, 1483, 1483, -602, -602, -602, + -602, 1119, 1119, 1119, 1119, -1590, -1590, -1590, -1590, 644, 644, 644, + 644, -872, -872, -872, -872, 349, 349, 349, 349, 418, 418, 418, 418, 329, + 329, 329, 329, -156, -156, -156, -156, -75, -75, -75, -75, 817, 817, 817, + 817, 1097, 1097, 1097, 1097, 603, 603, 603, 603, 610, 610, 610, 610, 1322, + 1322, 1322, 1322, -1285, 
-1285, -1285, -1285, -1465, -1465, -1465, -1465, + 384, 384, 384, 384, -1215, -1215, -1215, -1215, -136, -136, -136, -136, + 1218, 1218, 1218, 1218, -1335, -1335, -1335, -1335, -874, -874, -874, -874, + 220, 220, 220, 220, -1187, -1187, -1187, -1187, -1659, -1659, -1659, -1659, + -1185, -1185, -1185, -1185, -1530, -1530, -1530, -1530, -1278, -1278, -1278, + -1278, 794, 794, 794, 794, -1510, -1510, -1510, -1510, -854, -854, -854, + -854, -870, -870, -870, -870, 478, 478, 478, 478, -108, -108, -108, -108, + -308, -308, -308, -308, 996, 996, 996, 996, 991, 991, 991, 991, 958, 958, + 958, 958, -1460, -1460, -1460, -1460, 1522, 1522, 1522, 1522, 1628, 1628, + 1628, 1628, + /* For intt Len=2, offset IZETA_NTT_OFFSET127 */ + 1628, 1628, 1628, 1628, 1522, 1522, 1522, 1522, -1460, -1460, -1460, -1460, + 958, 958, 958, 958, 991, 991, 991, 991, 996, 996, 996, 996, -308, -308, + -308, -308, -108, -108, -108, -108, 478, 478, 478, 478, -870, -870, -870, + -870, -854, -854, -854, -854, -1510, -1510, -1510, -1510, 794, 794, 794, + 794, -1278, -1278, -1278, -1278, -1530, -1530, -1530, -1530, -1185, -1185, + -1185, -1185, -1659, -1659, -1659, -1659, -1187, -1187, -1187, -1187, 220, + 220, 220, 220, -874, -874, -874, -874, -1335, -1335, -1335, -1335, 1218, + 1218, 1218, 1218, -136, -136, -136, -136, -1215, -1215, -1215, -1215, 384, + 384, 384, 384, -1465, -1465, -1465, -1465, -1285, -1285, -1285, -1285, 1322, + 1322, 1322, 1322, 610, 610, 610, 610, 603, 603, 603, 603, 1097, 1097, 1097, + 1097, 817, 817, 817, 817, -75, -75, -75, -75, -156, -156, -156, -156, 329, + 329, 329, 329, 418, 418, 418, 418, 349, 349, 349, 349, -872, -872, -872, + -872, 644, 644, 644, 644, -1590, -1590, -1590, -1590, 1119, 1119, 1119, + 1119, -602, -602, -602, -602, 1483, 1483, 1483, 1483, -777, -777, -777, + -777, -147, -147, -147, -147, 1159, 1159, 1159, 1159, 778, 778, 778, 778, + -246, -246, -246, -246, 1653, 1653, 1653, 1653, 1574, 1574, 1574, 1574, + -460, -460, -460, -460, -291, -291, -291, -291, -235, -235, -235, -235, 177, + 177, 177, 177, 587, 587, 587, 587, 422, 422, 422, 422, 105, 105, 105, 105, + 1550, 1550, 1550, 1550, 871, 871, 871, 871, -1251, -1251, -1251, -1251, 843, + 843, 843, 843, 555, 555, 555, 555, 430, 430, 430, 430, -1103, -1103, -1103, + -1103, + /* For intt Len=4 and others, offset IZETA_NTT_OFFSET63 */ + -1275, -1275, -1275, -1275, -1275, -1275, -1275, -1275, 677, 677, 677, 677, + 677, 677, 677, 677, -1065, -1065, -1065, -1065, -1065, -1065, -1065, -1065, + 448, 448, 448, 448, 448, 448, 448, 448, -725, -725, -725, -725, -725, -725, + -725, -725, -1508, -1508, -1508, -1508, -1508, -1508, -1508, -1508, 961, + 961, 961, 961, 961, 961, 961, 961, -398, -398, -398, -398, -398, -398, -398, + -398, -951, -951, -951, -951, -951, -951, -951, -951, -247, -247, -247, + -247, -247, -247, -247, -247, -1421, -1421, -1421, -1421, -1421, -1421, + -1421, -1421, 107, 107, 107, 107, 107, 107, 107, 107, 830, 830, 830, 830, + 830, 830, 830, 830, -271, -271, -271, -271, -271, -271, -271, -271, -90, + -90, -90, -90, -90, -90, -90, -90, -853, -853, -853, -853, -853, -853, -853, + -853, 1469, 1469, 1469, 1469, 1469, 1469, 1469, 1469, 126, 126, 126, 126, + 126, 126, 126, 126, -1162, -1162, -1162, -1162, -1162, -1162, -1162, -1162, + -1618, -1618, -1618, -1618, -1618, -1618, -1618, -1618, -666, -666, -666, + -666, -666, -666, -666, -666, -320, -320, -320, -320, -320, -320, -320, + -320, -8, -8, -8, -8, -8, -8, -8, -8, 516, 516, 516, 516, 516, 516, 516, + 516, -1544, -1544, -1544, -1544, -1544, -1544, -1544, -1544, -282, -282, + -282, 
-282, -282, -282, -282, -282, 1491, 1491, 1491, 1491, 1491, 1491, + 1491, 1491, -1293, -1293, -1293, -1293, -1293, -1293, -1293, -1293, 1015, + 1015, 1015, 1015, 1015, 1015, 1015, 1015, -552, -552, -552, -552, -552, + -552, -552, -552, 652, 652, 652, 652, 652, 652, 652, 652, 1223, 1223, 1223, + 1223, 1223, 1223, 1223, 1223, -1571, -1571, -1571, -1571, -1571, -1571, + -1571, -1571, -205, -205, -205, -205, -205, -205, -205, -205, 411, 411, 411, + 411, 411, 411, 411, 411, -1542, -1542, -1542, -1542, -1542, -1542, -1542, + -1542, 608, 608, 608, 608, 608, 608, 608, 608, 732, 732, 732, 732, 732, 732, + 732, 732, 1017, 1017, 1017, 1017, 1017, 1017, 1017, 1017, -681, -681, -681, + -681, -681, -681, -681, -681, -130, -130, -130, -130, -130, -130, -130, + -130, -1602, -1602, -1602, -1602, -1602, -1602, -1602, -1602, 1458, 1458, + 1458, 1458, 1458, 1458, 1458, 1458, -829, -829, -829, -829, -829, -829, + -829, -829, 383, 383, 383, 383, 383, 383, 383, 383, 264, 264, 264, 264, 264, + 264, 264, 264, -1325, -1325, -1325, -1325, -1325, -1325, -1325, -1325, 573, + 573, 573, 573, 573, 573, 573, 573, 1468, 1468, 1468, 1468, 1468, 1468, 1468, + 1468, -1474, -1474, -1474, -1474, -1474, -1474, -1474, -1474, -1202, -1202, + -1202, -1202, -1202, -1202, -1202, -1202, 962, 962, 962, 962, 962, 962, 962, + 962, 182, 182, 182, 182, 182, 182, 182, 182, 1577, 1577, 1577, 1577, 1577, + 1577, 1577, 1577, 622, 622, 622, 622, 622, 622, 622, 622, -171, -171, -171, + -171, -171, -171, -171, -171, 202, 202, 202, 202, 202, 202, 202, 202, 287, + 287, 287, 287, 287, 287, 287, 287, 1422, 1422, 1422, 1422, 1422, 1422, 1422, + 1422, 1493, 1493, 1493, 1493, 1493, 1493, 1493, 1493, -1517, -1517, -1517, + -1517, -1517, -1517, -1517, -1517, -359, -359, -359, -359, -359, -359, -359, + -359, -758, -758, -758, -758, -758, -758, -758, -758 diff --git a/dev/ppc64le/src/consts_ntt.inc b/dev/ppc64le/src/consts_ntt.inc new file mode 100644 index 000000000..bfb64e722 --- /dev/null +++ b/dev/ppc64le/src/consts_ntt.inc @@ -0,0 +1,45 @@ +/* + * Copyright (c) The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + /* For ntt Len=128, offset 96 */ + -758, -758, -758, -758, -758, -758, -758, -758, -359, -359, -359, -359, + -359, -359, -359, -359, -1517, -1517, -1517, -1517, -1517, -1517, -1517, + -1517, 1493, 1493, 1493, 1493, 1493, 1493, 1493, 1493, 1422, 1422, 1422, + 1422, 1422, 1422, 1422, 1422, 287, 287, 287, 287, 287, 287, 287, 287, 202, + 202, 202, 202, 202, 202, 202, 202, -171, -171, -171, -171, -171, -171, -171, + -171, 622, 622, 622, 622, 622, 622, 622, 622, 1577, 1577, 1577, 1577, 1577, + 1577, 1577, 1577, 182, 182, 182, 182, 182, 182, 182, 182, 962, 962, 962, + 962, 962, 962, 962, 962, -1202, -1202, -1202, -1202, -1202, -1202, -1202, + -1202, -1474, -1474, -1474, -1474, -1474, -1474, -1474, -1474, 1468, 1468, + 1468, 1468, 1468, 1468, 1468, 1468, 573, 573, 573, 573, 573, 573, 573, 573, + -1325, -1325, -1325, -1325, -1325, -1325, -1325, -1325, 264, 264, 264, 264, + 264, 264, 264, 264, 383, 383, 383, 383, 383, 383, 383, 383, -829, -829, + -829, -829, -829, -829, -829, -829, 1458, 1458, 1458, 1458, 1458, 1458, + 1458, 1458, -1602, -1602, -1602, -1602, -1602, -1602, -1602, -1602, -130, + -130, -130, -130, -130, -130, -130, -130, -681, -681, -681, -681, -681, + -681, -681, -681, 1017, 1017, 1017, 1017, 1017, 1017, 1017, 1017, 732, 732, + 732, 732, 732, 732, 732, 732, 608, 608, 608, 608, 608, 608, 608, 608, -1542, + -1542, -1542, -1542, -1542, -1542, -1542, -1542, 411, 411, 411, 411, 411, + 411, 411, 411, 
-205, -205, -205, -205, -205, -205, -205, -205, -1571, -1571, + -1571, -1571, -1571, -1571, -1571, -1571, 1223, 1223, 1223, 1223, 1223, + 1223, 1223, 1223, 652, 652, 652, 652, 652, 652, 652, 652, -552, -552, -552, + -552, -552, -552, -552, -552, 1015, 1015, 1015, 1015, 1015, 1015, 1015, + 1015, -1293, -1293, -1293, -1293, -1293, -1293, -1293, -1293, 1491, 1491, + 1491, 1491, 1491, 1491, 1491, 1491, -282, -282, -282, -282, -282, -282, + -282, -282, -1544, -1544, -1544, -1544, -1544, -1544, -1544, -1544, 516, + 516, 516, 516, 516, 516, 516, 516, -8, -8, -8, -8, -8, -8, -8, -8, -320, + -320, -320, -320, -320, -320, -320, -320, -666, -666, -666, -666, -666, + -666, -666, -666, -1618, -1618, -1618, -1618, -1618, -1618, -1618, -1618, + -1162, -1162, -1162, -1162, -1162, -1162, -1162, -1162, 126, 126, 126, 126, + 126, 126, 126, 126, 1469, 1469, 1469, 1469, 1469, 1469, 1469, 1469, -853, + -853, -853, -853, -853, -853, -853, -853, -90, -90, -90, -90, -90, -90, -90, + -90, -271, -271, -271, -271, -271, -271, -271, -271, 830, 830, 830, 830, + 830, 830, 830, 830, 107, 107, 107, 107, 107, 107, 107, 107, -1421, -1421, + -1421, -1421, -1421, -1421, -1421, -1421, -247, -247, -247, -247, -247, + -247, -247, -247, -951, -951, -951, -951, -951, -951, -951, -951, -398, + -398, -398, -398, -398, -398, -398, -398, 961, 961, 961, 961, 961, 961, 961, + 961, -1508, -1508, -1508, -1508, -1508, -1508, -1508, -1508, -725, -725, + -725, -725, -725, -725, -725, -725, 448, 448, 448, 448, 448, 448, 448, 448, + -1065, -1065, -1065, -1065, -1065, -1065, -1065, -1065, 677, 677, 677, 677, + 677, 677, 677, 677, -1275, -1275, -1275, -1275, -1275, -1275, -1275, -1275 diff --git a/dev/ppc64le/src/intt_ppc.S b/dev/ppc64le/src/intt_ppc.S index 608fc9295..3d056c850 100644 --- a/dev/ppc64le/src/intt_ppc.S +++ b/dev/ppc64le/src/intt_ppc.S @@ -3,30 +3,28 @@ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT */ -# -# Copyright 2025- IBM Corp. -# -#=================================================================================== -# Written by Danny Tsen -# +/* + * Copyright 2025- IBM Corp. 
+ * + *=================================================================================== + * Written by Danny Tsen + * + */ #include "../../../common.h" -#if defined(MLK_ARITH_BACKEND_PPC64LE_DEFAULT) && \ +#if defined(MLK_ARITH_BACKEND_PPC64LE) && \ !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED) /* simpasm: header-end */ #include "consts.h" -.machine "any" -.text - -# Barrett reduce constatnts +// Barrett reduce constants #define V20159 0 #define V_25 1 #define V_26 2 #define V_MKQ 3 -# Montgomery reduce constatnts +// Montgomery reduce constants #define V_QINV 2 #define V_NMKQ 5 #define V_Z0 7 @@ -37,18 +35,18 @@ #define V1441 10 .macro Load_4Coeffs start next step - mr 9, \start # j - add 10, 7, 9 # J + len*2 + mr 9, \start // j + add 10, 7, 9 // J + len*2 addi 16, 9, \next addi 17, 10, \step addi 18, 16, \next addi 19, 17, \step addi 20, 18, \next addi 21, 19, \step - lxvd2x 32+8, 3, 10 # r[j+len] - lxvd2x 32+12, 3, 17 # r[j+len] - lxvd2x 32+16, 3, 19 # r[j+len] - lxvd2x 32+20, 3, 21 # r[j+len] + lxvd2x 32+8, 3, 10 // r[j+len] + lxvd2x 32+12, 3, 17 // r[j+len] + lxvd2x 32+16, 3, 19 // r[j+len] + lxvd2x 32+20, 3, 21 // r[j+len] xxpermdi 32+8, 32+8, 32+8, 2 xxpermdi 32+12, 32+12, 32+12, 2 xxpermdi 32+16, 32+16, 32+16, 2 @@ -63,14 +61,14 @@ xxpermdi 32+23, 32+23, 32+23, 2 xxpermdi 32+24, 32+24, 32+24, 2 - vsubuhm 25, 8, 21 # r[j+len] - t - vsubuhm 26, 12, 22 # r[j+len] - t - vsubuhm 30, 16, 23 # r[j+len] - t - vsubuhm 31, 20, 24 # r[j+len] - t - vadduhm 8, 8, 21 # r[j+len] + t - vadduhm 12, 12, 22 # r[j+len] + t - vadduhm 16, 16, 23 # r[j+len] + t - vadduhm 20, 20, 24 # r[j+len] + t + vsubuhm 25, 8, 21 // r[j+len] - t + vsubuhm 26, 12, 22 // r[j+len] - t + vsubuhm 30, 16, 23 // r[j+len] - t + vsubuhm 31, 20, 24 // r[j+len] - t + vadduhm 8, 8, 21 // r[j+len] + t + vadduhm 12, 12, 22 // r[j+len] + t + vadduhm 16, 16, 23 // r[j+len] + t + vadduhm 20, 20, 24 // r[j+len] + t .endm .macro BREDUCE_4X _v0 _v1 _v2 _v3 @@ -78,8 +76,8 @@ xxlor 32+3, 6, 6 # V_MKQ xxlor 32+1, 7, 7 # V_25 xxlor 32+2, 8, 8 # V_26 - # Multify Odd/Even signed halfword; - # Results word bound by 2^32 in abs value. + // Multiply Odd/Even signed halfword; + // Results word bound by 2^32 in abs value. vmulosh 6, 8, V20159 vmulesh 5, 8, V20159 vmulosh 11, 12, V20159 @@ -104,8 +102,8 @@ vadduwm 14, 14, V_25 vadduwm 17, 17, V_25 vadduwm 18, 18, V_25 - # Right shift and pack lower halfword, - # results bond to 2^16 in abs value + // Right shift and pack lower halfword, + // results bond to 2^16 in abs value vsraw 4, 4, V_26 vsraw 5, 5, V_26 vsraw 9, 9, V_26 @@ -122,25 +120,25 @@ vsubuhm 13, 7, 13 vpkuwum 17, 18, 17 vsubuhm 17, 7, 17 - # Modulo multify-Low unsigned halfword; - # results bond to 2^16 * q in abs value. + // Modulo multify-Low unsigned halfword; + // results bond to 2^16 * q in abs value. 
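
For reference, the lane-wise arithmetic that BREDUCE_4X performs is the usual scalar Barrett reduction for ML-KEM, using q = 3329 and the table constant 20159 = round(2^26 / q) documented by the check-magic annotations earlier in this series. A minimal C sketch of the per-coefficient computation; the function name is illustrative rather than part of the backend's API, and the vector code reaches the same result via odd/even 32-bit halfword products (vmulosh/vmulesh) instead of a single 32-bit multiply:

    #include <stdint.h>

    /* Scalar model of one 16-bit lane of BREDUCE_4X: estimate the quotient
     * a/q with the precomputed constant, then subtract q times that
     * estimate. The result is a centered representative of a mod 3329. */
    static int16_t barrett_reduce_sketch(int16_t a)
    {
      const int32_t v = 20159;                        /* round(2^26 / 3329) */
      int32_t t = (v * (int32_t)a + (1 << 25)) >> 26; /* rounded quotient   */
      return (int16_t)(a - t * 3329);
    }

reduce.S further below pairs this with a conditional addition (r += (r >> 15) & q) to map the centered result into the unsigned canonical range [0, q).
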
vmladduhm \_v0, 4, V_MKQ, 8 vmladduhm \_v1, 9, V_MKQ, 12 vmladduhm \_v2, 13, V_MKQ, 16 vmladduhm \_v3, 17, V_MKQ, 20 .endm -#----------------------------------- -# MREDUCE_4X(len, start, _vz0, _vz1, _vz2, _vz3) -# +//----------------------------------- +// MREDUCE_4X(len, start, _vz0, _vz1, _vz2, _vz3) +// .macro MREDUCE_4X _vz0 _vz1 _vz2 _vz3 _vo0 _vo1 _vo2 _vo3 - # Modular multification bond by 2^16 * q in abs value + // Modular multiplication bond by 2^16 * q in abs value vmladduhm 15, 25, \_vz0, 3 vmladduhm 20, 26, \_vz1, 3 vmladduhm 27, 30, \_vz2, 3 vmladduhm 28, 31, \_vz3, 3 - # Signed multiply-high-round; outputs are bound by 2^15 * q in abs value + // Signed multiply-high-round; outputs are bound by 2^15 * q in abs value vmhraddshs 14, 25, \_vz0, 3 vmhraddshs 19, 26, \_vz1, 3 vmhraddshs 24, 30, \_vz2, 3 @@ -156,17 +154,17 @@ vmhraddshs 25, 25, V_NMKQ, 24 vmhraddshs 30, 30, V_NMKQ, 29 - vsrah \_vo0, 15, 4 # >> 1 - vsrah \_vo1, 20, 4 # >> 1 - vsrah \_vo2, 25, 4 # >> 1 - vsrah \_vo3, 30, 4 # >> 1 + vsrah \_vo0, 15, 4 // >> 1 + vsrah \_vo1, 20, 4 // >> 1 + vsrah \_vo2, 25, 4 // >> 1 + vsrah \_vo3, 30, 4 // >> 1 .endm .macro Set_mont_consts - xxlor 32+5, 0, 0 # V_NMKQ - xxlor 32+2, 2, 2 # V_QINV - xxlor 32+3, 3, 3 # 0 - xxlor 32+4, 4, 4 # 1 + xxlor 32+5, 0, 0 // V_NMKQ + xxlor 32+2, 2, 2 // V_QINV + xxlor 32+3, 3, 3 // 0 + xxlor 32+4, 4, 4 // 1 .endm .macro Load_next_4zetas @@ -240,17 +238,18 @@ stxvd2x 13, 3, 20 .endm -# intt -# t = r[j]; -# r[j] = barrett_reduce(t + r[j + len]); -# r[j + len] = r[j + len] - t; -# r[j + len] = fqmul(zeta, r[j + len]); +// intt +// t = r[j]; +// r[j] = barrett_reduce(t + r[j + len]); +// r[j + len] = r[j + len] - t; +// r[j + len] = fqmul(zeta, r[j + len]); -# -# mlk_intt_ppc(r) -# +// +// mlk_intt_ppc(r) +// +.text .global MLK_ASM_NAMESPACE(intt_ppc) -.align 4 +.balign 16 MLK_ASM_FN_SYMBOL(intt_ppc) stdu 1, -352(1) @@ -276,8 +275,8 @@ MLK_ASM_FN_SYMBOL(intt_ppc) stxv 32+30, 288(1) stxv 32+31, 304(1) - # init vectors and constants - # Setup for Montgomery reduce + // init vectors and constants + // Setup for Montgomery reduce lxv 0, 0(4) lxv 32+V_QINV, QINV_OFFSET(4) # QINV @@ -287,12 +286,12 @@ MLK_ASM_FN_SYMBOL(intt_ppc) xxlor 3, 32+3, 32+3 # 0 xxlor 4, 32+4, 32+4 # 1 - # Setup for Barrett reduce + // Setup for Barrett reduce lxv 6, Q_OFFSET(4) # V_MKQ lxv 32+V20159, C20159_OFFSET(4) # V20159 lxv 7, 0(4) # V_25 - #xxspltiw 8, 26 # for power9 and above + // xxspltiw 8, 26 # for power9 and above vspltisw 8, 13 vadduwm 8, 8, 8 xxlor 8, 32+8, 32+8 # V_26 store at vs8 @@ -303,10 +302,9 @@ MLK_ASM_FN_SYMBOL(intt_ppc) xxlor 7, 32+9, 32+9 # V_25 syore at vs7 .align 4 -#__Len2: - # - # 1. len = 2, start = 0, 4, 8, 12,...244, 248, 252 - # Update zetas vectors, each vector has 2 zetas + // + // 1. len = 2, start = 0, 4, 8, 12,...244, 248, 252 + // Update zetas vectors, each vector has 2 zetas addi 14, 4, IZETA_NTT_OFFSET127 li 7, 4 li 15, 4 @@ -340,9 +338,7 @@ intt_ppc__Loop2: bdnz intt_ppc__Loop2 .align 4 -#__Len4: - # - # 2. len = 4, start = 0, 8, 16, 24,...232, 240, 248 + // 2. len = 4, start = 0, 8, 16, 24,...232, 240, 248 addi 14, 4, IZETA_NTT_OFFSET63 li 5, 0 li 7, 8 @@ -375,9 +371,8 @@ intt_ppc__Loop4: bdnz intt_ppc__Loop4 .align 4 -#__Len8: - # 3. len = 8, start = 0, 16, 32, 48,...208, 224, 240 - #addi 14, 14, 512 + // 3. len = 8, start = 0, 16, 32, 48,...208, 224, 240 + // addi 14, 14, 512 li 7, 16 li 5, 0 @@ -417,10 +412,8 @@ intt_ppc__Loop4: Write_M4C 32+13, 32+18, 32+23, 32+28 .align 4 -#__Len16: - # - # 4. 
len = 16, start = 0, 32, 64,,...160, 192, 224 - #addi 14, 14, 768 + // 4. len = 16, start = 0, 32, 64,,...160, 192, 224 + // addi 14, 14, 768 li 5, 0 li 7, 32 @@ -461,10 +454,8 @@ intt_ppc__Loop4: Write_M4C 32+13, 32+18, 32+23, 32+28 .align 4 -#__Len32: - # - # 5. len = 32, start = 0, 64, 128, 192 - #addi 14, 14, 896 + // 5. len = 32, start = 0, 64, 128, 192 + // addi 14, 14, 896 li 5, 0 li 7, 64 @@ -508,10 +499,8 @@ intt_ppc__Loop4: Write_M4C 32+13, 32+18, 32+23, 32+28 .align 4 -#__Len64: - # - # 6. len = 64, start = 0, 128 - #addi 14, 14, 960 + // 6. len = 64, start = 0, 128 + // addi 14, 14, 960 li 5, 0 li 7, 128 Load_4Coeffs 5, 16, 16 @@ -552,10 +541,8 @@ intt_ppc__Loop4: Write_M4C 32+13, 32+18, 32+23, 32+28 .align 4 -#__Len128: - # 7. len = 128, start = 0 - # - #addi 14, 14, 992 + // 7. len = 128, start = 0 + // addi 14, 14, 992 li 5, 0 # start li 7, 256 # len * 2 @@ -596,9 +583,9 @@ intt_ppc__Loop4: Write_M4C 32+13, 32+18, 32+23, 32+28 .align 4 - # - # Montgomery reduce loops with constant 1441 - # + // + // Montgomery reduce loops with constant 1441 + // addi 14, 4, C1441_OFFSET lvx V1441, 0, 14 @@ -668,5 +655,4 @@ intt_ppc__Loop4: #undef V1441 /* simpasm: footer-start */ -#endif /* MLK_ARITH_BACKEND_PPC64LE_DEFAULT && \ - !MLK_CONFIG_MULTILEVEL_NO_SHARED */ +#endif /* MLK_ARITH_BACKEND_PPC64LE && !MLK_CONFIG_MULTILEVEL_NO_SHARED */ diff --git a/dev/ppc64le/src/ntt_ppc.S b/dev/ppc64le/src/ntt_ppc.S index 33413d956..1f6519ff4 100644 --- a/dev/ppc64le/src/ntt_ppc.S +++ b/dev/ppc64le/src/ntt_ppc.S @@ -3,15 +3,16 @@ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT */ -# -# Copyright 2025- IBM Corp. -# -#=================================================================================== -# Written by Danny Tsen -# +/* + * Copyright 2025- IBM Corp. 
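
Looking back at intt_ppc.S as a whole, every layer above implements the Gentleman-Sande butterfly that its comments summarize: t = r[j]; r[j] = barrett_reduce(t + r[j+len]); r[j+len] = r[j+len] - t; r[j+len] = fqmul(zeta, r[j+len]). A scalar C model of one such layer, assuming barrett_reduce_sketch from the sketch next to BREDUCE_4X and fqmul_sketch as sketched after the forward-NTT macro further below; the names and the zeta indexing are illustrative and do not mirror the replicated layout of mlk_ppc_qdata:

    /* Scalar model of one inverse-NTT layer over blocks of length 2*len. */
    static int16_t barrett_reduce_sketch(int16_t a);      /* see above  */
    static int16_t fqmul_sketch(int16_t a, int16_t b);    /* see below  */

    static void invntt_layer_sketch(int16_t r[256], unsigned len,
                                    const int16_t *zetas, unsigned *k)
    {
      unsigned start, j;
      for (start = 0; start < 256; start += 2 * len)
      {
        int16_t zeta = zetas[(*k)++];
        for (j = start; j < start + len; j++)
        {
          int16_t t = r[j];
          r[j] = barrett_reduce_sketch(t + r[j + len]);
          r[j + len] = r[j + len] - t;
          r[j + len] = fqmul_sketch(zeta, r[j + len]);
        }
      }
    }

The full inverse transform runs this for len = 2, 4, ..., 128; the trailing loop in the assembly then multiplies every coefficient by the table constant 1441 = 2^25 mod q (via the same Montgomery multiplication) to fold in the 1/128 scaling together with the Montgomery factor.
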
+ * + *=================================================================================== + * Written by Danny Tsen + * + */ #include "../../../common.h" -#if defined(MLK_ARITH_BACKEND_PPC64LE_DEFAULT) && \ +#if defined(MLK_ARITH_BACKEND_PPC64LE) && \ !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED) /* simpasm: header-end */ @@ -25,43 +26,39 @@ #define V_Z3 10 #define V_ZETA 10 -.machine "any" -.text +// montgomery_reduce +// t = a * QINV +// t = (a - (int32_t)t*_MLKEM_Q) >> 16 +// +//----------------------------------- +// MREDUCE_4X(start, _vz0, _vz1, _vz2, _vz3) -# -# montgomery_reduce -# t = a * QINV -# t = (a - (int32_t)t*_MLKEM_Q) >> 16 -# -#----------------------------------- -# MREDUCE_4X(start, _vz0, _vz1, _vz2, _vz3) -# .macro MREDUCE_4X start next step _vz0 _vz1 _vz2 _vz3 mr 9, \start - add 10, 7, 9 # J + len*2 + add 10, 7, 9 // J + len*2 addi 16, 9, \next addi 17, 10, \step addi 18, 16, \next addi 19, 17, \step addi 20, 18, \next addi 21, 19, \step - lxvd2x 32+13, 3, 10 # r[j+len] - lxvd2x 32+18, 3, 17 # r[j+len] - lxvd2x 32+23, 3, 19 # r[j+len] - lxvd2x 32+28, 3, 21 # r[j+len] + lxvd2x 32+13, 3, 10 // r[j+len] + lxvd2x 32+18, 3, 17 // r[j+len] + lxvd2x 32+23, 3, 19 // r[j+len] + lxvd2x 32+28, 3, 21 // r[j+len] xxpermdi 32+13, 32+13, 32+13, 2 xxpermdi 32+18, 32+18, 32+18, 2 xxpermdi 32+23, 32+23, 32+23, 2 xxpermdi 32+28, 32+28, 32+28, 2 - # fqmul = zeta * coefficient - # Modular multification bond by 2^16 * q in abs value + // fqmul = zeta * coefficient + // Modular multification bond by 2^16 * q in abs value vmladduhm 15, 13, \_vz0, 3 vmladduhm 20, 18, \_vz1, 3 vmladduhm 25, 23, \_vz2, 3 vmladduhm 30, 28, \_vz3, 3 - # Signed multiply-high-round; outputs are bound by 2^15 * q in abs value + // Signed multiply-high-round; outputs are bound by 2^15 * q in abs value vmhraddshs 14, 13, \_vz0, 3 vmhraddshs 19, 18, \_vz1, 3 vmhraddshs 24, 23, \_vz2, 3 @@ -77,31 +74,31 @@ vmhraddshs 25, 25, V_NMKQ, 24 vmhraddshs 30, 30, V_NMKQ, 29 - vsrah 13, 15, 4 # >> 1 - vsrah 18, 20, 4 # >> 1 - vsrah 23, 25, 4 # >> 1 - vsrah 28, 30, 4 # >> 1 + vsrah 13, 15, 4 // >> 1 + vsrah 18, 20, 4 // >> 1 + vsrah 23, 25, 4 // >> 1 + vsrah 28, 30, 4 // >> 1 - lxvd2x 32+12, 3, 9 # r[j] - lxvd2x 32+17, 3, 16 # r[j] - lxvd2x 32+22, 3, 18 # r[j] - lxvd2x 32+27, 3, 20 # r[j] + lxvd2x 32+12, 3, 9 // r[j] + lxvd2x 32+17, 3, 16 // r[j] + lxvd2x 32+22, 3, 18 // r[j] + lxvd2x 32+27, 3, 20 // r[j] xxpermdi 32+12, 32+12, 32+12, 2 xxpermdi 32+17, 32+17, 32+17, 2 xxpermdi 32+22, 32+22, 32+22, 2 xxpermdi 32+27, 32+27, 32+27, 2 - # Since the result of the Montgomery multiplication is bounded - # by q in absolute value. - # Finally to complete the final update of the results with add/sub - vsubuhm 16, 12, 13 # r - t - vadduhm 15, 13, 12 # r + t - vsubuhm 21, 17, 18 # r - t - vadduhm 20, 18, 17 # r + t - vsubuhm 26, 22, 23 # r - t - vadduhm 25, 23, 22 # r + t - vsubuhm 31, 27, 28 # r - t - vadduhm 30, 28, 27 # r + t + // Since the result of the Montgomery multiplication is bounded + // by q in absolute value. 
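
The montgomery_reduce recipe quoted in the comment above (t = a * QINV; t = (a - (int32_t)t*MLKEM_Q) >> 16), combined with the preceding coefficient-by-zeta multiplication, is what the reference code calls fqmul. A minimal scalar C sketch, using q = 3329 and QINV = -3327 from the constant table; the name is illustrative, and the vector code obtains the same per-lane result with rounded multiply-high instructions (vmhraddshs) rather than this exact instruction sequence:

    #include <stdint.h>

    /* Scalar model of the per-lane fqmul in MREDUCE_4X: multiply two
     * coefficients and Montgomery-reduce the 32-bit product. The result is
     * congruent to a*b*2^-16 mod 3329 and bounded by q in absolute value. */
    static int16_t fqmul_sketch(int16_t a, int16_t b)
    {
      int32_t prod = (int32_t)a * b;
      int16_t t = (int16_t)((int16_t)prod * -3327); /* prod * QINV mod 2^16 */
      return (int16_t)((prod - (int32_t)t * 3329) >> 16);
    }
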
+ // Finally to complete the final update of the results with add/sub + vsubuhm 16, 12, 13 // r - t + vadduhm 15, 13, 12 // r + t + vsubuhm 21, 17, 18 // r - t + vadduhm 20, 18, 17 // r + t + vsubuhm 26, 22, 23 // r - t + vadduhm 25, 23, 22 // r + t + vsubuhm 31, 27, 28 // r - t + vadduhm 30, 28, 27 // r + t .endm .macro Write_One @@ -154,11 +151,12 @@ addi 14, 14, 64 .endm -# -# mlk_ntt_ppc(int16_t *r) -# +// +// mlk_ntt_ppc(int16_t *r) +// +.text .global MLK_ASM_NAMESPACE(ntt_ppc) -.align 4 +.balign 16 MLK_ASM_FN_SYMBOL(ntt_ppc) stdu 1, -352(1) @@ -184,10 +182,10 @@ MLK_ASM_FN_SYMBOL(ntt_ppc) stxv 32+30, 288(1) stxv 32+31, 304(1) - # get MLKEM_Q + // get MLKEM_Q lvx V_NMKQ,0,4 - # zetas array + // zetas array addi 14, 4, ZETA_NTT_OFFSET vxor 3, 3, 3 @@ -196,13 +194,12 @@ MLK_ASM_FN_SYMBOL(ntt_ppc) lxv 32+V_QINV, QINV_OFFSET(4) .align 4 -#__Len128: - # - # Compute coefficients of the NTT based on the following loop. - # for (len = 128; len ≥ 2; len = len/2) - # - # 1. len = 128, start = 0 - # + // + // Compute coefficients of the NTT based on the following loop. + // for (len = 128; len ≥ 2; len = len/2) + // + // 1. len = 128, start = 0 + // li 5, 0 # start li 7, 256 # len * 2 lvx V_ZETA, 0, 14 @@ -221,10 +218,8 @@ MLK_ASM_FN_SYMBOL(ntt_ppc) Write_One .align 4 -#__Len64: - # - # 2. len = 64, start = 0, 128 - # k += 2 + // 2. len = 64, start = 0, 128 + // k += 2 li 5, 0 li 7, 128 lvx V_ZETA, 0, 14 @@ -245,31 +240,29 @@ MLK_ASM_FN_SYMBOL(ntt_ppc) Write_One .align 4 -#__Len32: - # - # 3. len = 32, start = 0, 64, 128, 192 - # k += 4 + // 3. len = 32, start = 0, 64, 128, 192 + // k += 4 li 5, 0 li 7, 64 lvx V_ZETA, 0, 14 addi 14, 14, 16 MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA Write_One - #li 5, 64 + // li 5, 64 li 5, 128 lvx V_ZETA, 0, 14 addi 14, 14, 16 MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA Write_One - #li 5, 128 + // li 5, 128 li 5, 256 lvx V_ZETA, 0, 14 addi 14, 14, 16 MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA Write_One - #li 5, 192 + // li 5, 192 li 5, 384 lvx V_ZETA, 0, 14 @@ -278,10 +271,8 @@ MLK_ASM_FN_SYMBOL(ntt_ppc) Write_One .align 4 -#__Len16: - # - # 4. len = 16, start = 0, 32, 64,,...160, 192, 224 - # k += 8 + // 4. len = 16, start = 0, 32, 64,,...160, 192, 224 + // k += 8 li 5, 0 li 7, 32 Load_next_4zetas @@ -300,10 +291,8 @@ MLK_ASM_FN_SYMBOL(ntt_ppc) Write_One .align 4 -#__Len8: - # - # 5. len = 8, start = 0, 16, 32, 48,...208, 224, 240 - # k += 16 + // 5. len = 8, start = 0, 16, 32, 48,...208, 224, 240 + // k += 16 li 5, 0 li 7, 16 Load_next_4zetas @@ -325,9 +314,8 @@ MLK_ASM_FN_SYMBOL(ntt_ppc) MREDUCE_4X 5, 32, 32, V_Z0, V_Z1, V_Z2, V_Z3 Write_One - # - # 6. len = 4, start = 0, 8, 16, 24,...232, 240, 248 - # k += 32 + // 6. len = 4, start = 0, 8, 16, 24,...232, 240, 248 + // k += 32 li 15, 4 # loops mtctr 15 li 5, 0 @@ -346,10 +334,9 @@ ntt_ppc__Len4: bdnz ntt_ppc__Len4 - # - # 7. len = 2, start = 0, 4, 8, 12,...244, 248, 252 - # k += 64 - # Update zetas vectors, each vector has 2 zetas + // 7. len = 2, start = 0, 4, 8, 12,...244, 248, 252 + // k += 64 + // Update zetas vectors, each vector has 2 zetas addi 14, 4, ZETA_NTT_OFFSET64 @@ -401,8 +388,11 @@ ntt_ppc__Len2: * Don't modify by hand -- this is auto-generated by scripts/autogen. 
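
The layer structure described above (for (len = 128; len >= 2; len = len/2), with butterflies r[j+len] = r[j] - t and r[j] = r[j] + t) is the standard in-place Cooley-Tukey NTT. A scalar C model of one layer, reusing fqmul_sketch from the previous sketch; the zeta indexing is again illustrative:

    /* Scalar model of one forward-NTT layer: Cooley-Tukey butterflies over
     * blocks of length 2*len, as computed vector-wise by MREDUCE_4X above. */
    static int16_t fqmul_sketch(int16_t a, int16_t b);  /* sketched earlier */

    static void ntt_layer_sketch(int16_t r[256], unsigned len,
                                 const int16_t *zetas, unsigned *k)
    {
      unsigned start, j;
      for (start = 0; start < 256; start += 2 * len)
      {
        int16_t zeta = zetas[(*k)++];
        for (j = start; j < start + len; j++)
        {
          int16_t t = fqmul_sketch(zeta, r[j + len]);
          r[j + len] = (int16_t)(r[j] - t);
          r[j] = (int16_t)(r[j] + t);
        }
      }
    }

The assembly unrolls the seven layers (len = 128 down to 2) explicitly and processes four vectors of eight coefficients per macro invocation, which is why each section reloads zetas and strides through the coefficient array in 32-element chunks.
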
*/ #undef V_QINV #undef V_NMKQ +#undef V_Z0 +#undef V_Z1 +#undef V_Z2 +#undef V_Z3 #undef V_ZETA /* simpasm: footer-start */ -#endif /* MLK_ARITH_BACKEND_PPC64LE_DEFAULT && \ - !MLK_CONFIG_MULTILEVEL_NO_SHARED */ +#endif /* MLK_ARITH_BACKEND_PPC64LE && !MLK_CONFIG_MULTILEVEL_NO_SHARED */ diff --git a/dev/ppc64le/src/poly_tomont.S b/dev/ppc64le/src/poly_tomont.S index 66f5aba81..a8628f43c 100644 --- a/dev/ppc64le/src/poly_tomont.S +++ b/dev/ppc64le/src/poly_tomont.S @@ -3,21 +3,21 @@ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT */ -# -# Copyright 2025- IBM Corp. -# -#=================================================================================== -# Written by Danny Tsen -# - -# Poly_tomont: Inplace conversion of all coefficients of a polynomial -# from normal domain to Montgomery domain -# -# Arguments:*r: pointer to input/output polynomial -# +/* + * Copyright 2025- IBM Corp. + * + *=================================================================================== + * Written by Danny Tsen + * + */ + +// Poly_tomont: Inplace conversion of all coefficients of a polynomial +// from normal domain to Montgomery domain +// +// Arguments:*r: pointer to input/output polynomial #include "../../../common.h" -#if defined(MLK_ARITH_BACKEND_PPC64LE_DEFAULT) && \ +#if defined(MLK_ARITH_BACKEND_PPC64LE) && \ !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED) /* simpasm: header-end */ @@ -27,17 +27,13 @@ #define V_QINV 2 #define V_NMKQ 5 -.machine "any" -.text +// montgomery_reduce +// t = a * QINV +// t = (a - (int32_t)t*_MLKEM_Q) >> 16 +// +//----------------------------------- +// MREDUCE_4X(_v0, _v1, _v2, _v3) -# -# montgomery_reduce -# t = a * QINV -# t = (a - (int32_t)t*_MLKEM_Q) >> 16 -# -#----------------------------------- -# MREDUCE_4X(_v0, _v1, _v2, _v3) -# .macro MREDUCE_4X _v0 _v1 _v2 _v3 lxvd2x 32+13, 0, 3 addi 3, 3, 16 @@ -68,10 +64,10 @@ vmhraddshs 25, 25, V_NMKQ, 24 vmhraddshs 9, 9, V_NMKQ, 8 - vsrah \_v0, 15, 4 # >> 1 - vsrah \_v1, 20, 4 # >> 1 - vsrah \_v2, 25, 4 # >> 1 - vsrah \_v3, 9, 4 # >> 1 + vsrah \_v0, 15, 4 // >> 1 + vsrah \_v1, 20, 4 // >> 1 + vsrah \_v2, 25, 4 // >> 1 + vsrah \_v3, 9, 4 // >> 1 .endm .macro Write_8X @@ -85,8 +81,9 @@ stxvd2x 32+7, 11, 3 .endm -.align 4 -.globl MLK_ASM_NAMESPACE(poly_tomont_ppc) +.text +.global MLK_ASM_NAMESPACE(poly_tomont_ppc) +.balign 16 MLK_ASM_FN_SYMBOL(poly_tomont_ppc) stdu 1, -320(1) mflr 0 @@ -157,5 +154,4 @@ MLK_ASM_FN_SYMBOL(poly_tomont_ppc) #undef V_NMKQ /* simpasm: footer-start */ -#endif /* MLK_ARITH_BACKEND_PPC64LE_DEFAULT && \ - !MLK_CONFIG_MULTILEVEL_NO_SHARED */ +#endif /* MLK_ARITH_BACKEND_PPC64LE && !MLK_CONFIG_MULTILEVEL_NO_SHARED */ diff --git a/dev/ppc64le/src/reduce.S b/dev/ppc64le/src/reduce.S index de95d5c3f..2ca8a5e35 100644 --- a/dev/ppc64le/src/reduce.S +++ b/dev/ppc64le/src/reduce.S @@ -3,36 +3,32 @@ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT */ -# -# Copyright 2025- IBM Corp. -# -#=================================================================================== -# Written by Danny Tsen -# - -# -# poly_reduce: Applies Barrett reduction to all coefficients of a polynomial -# for details of the Barrett reduction -# -# Arguments: *r: pointer to input/output polynomial -# +/* + * Copyright 2025- IBM Corp. 
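
Stepping back to poly_tomont.S just above: as its header comment says, it converts all coefficients of a polynomial to the Montgomery domain in place. With the table constant 1353 = 2^32 mod q (see the check-magic annotations in consts.c), this is one Montgomery multiplication per coefficient; a scalar sketch with illustrative names:

    /* Scalar model of poly_tomont: multiplying by 2^32 mod q and then
     * Montgomery-reducing (a factor of 2^-16) leaves each coefficient
     * scaled by 2^16, i.e. in Montgomery form. */
    static int16_t fqmul_sketch(int16_t a, int16_t b);  /* sketched earlier */

    static void poly_tomont_sketch(int16_t r[256])
    {
      const int16_t f = 1353; /* check-magic: 2^32 mod 3329 */
      unsigned i;
      for (i = 0; i < 256; i++)
      {
        r[i] = fqmul_sketch(r[i], f);
      }
    }
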
+ * + *=================================================================================== + * Written by Danny Tsen + * + */ #include "../../../common.h" -#if defined(MLK_ARITH_BACKEND_PPC64LE_DEFAULT) && \ +#if defined(MLK_ARITH_BACKEND_PPC64LE) && \ !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED) /* simpasm: header-end */ #include "consts.h" -# Barrett reduce constatnts +// poly_reduce: Applies Barrett reduction to all coefficients of a polynomial +// for details of the Barrett reduction +// +// Arguments: *r: pointer to input/output polynomial + +// Barrett reduce constatnts #define V20159 0 #define V_25 1 #define V_26 2 #define V_MKQ 3 -.machine "any" -.text - .macro BREDUCE_4X _v0 _v1 _v2 _v3 lxvd2x 32+8, 0, 3 lxvd2x 32+12, 14, 3 @@ -96,9 +92,9 @@ stxvd2x 32+17, 11, 3 .endm -# -# Conditional addition to get unsigned canonical representative -# +// +// Conditional addition to get unsigned canonical representative +// .macro To_unsigned_16 lxv 32+12, 0(3) lxv 32+13, 16(3) @@ -127,8 +123,9 @@ stxv 32+0, -48(3) .endm -.align 4 -.globl MLK_ASM_NAMESPACE(reduce_ppc) +.text +.global MLK_ASM_NAMESPACE(reduce_ppc) +.balign 16 MLK_ASM_FN_SYMBOL(reduce_ppc) stdu 1, -224(1) mflr 0 @@ -181,10 +178,10 @@ MLK_ASM_FN_SYMBOL(reduce_ppc) BREDUCE_4X 4, 9, 13, 17 Write_8X - # - # To unsigned canonical - # .align 4 + // + // To unsigned canonical + // addi 3, 3, -512 xxspltib 32+9 ,0 vspltish 10, 15 @@ -219,5 +216,4 @@ MLK_ASM_FN_SYMBOL(reduce_ppc) #undef V_MKQ /* simpasm: footer-start */ -#endif /* MLK_ARITH_BACKEND_PPC64LE_DEFAULT && \ - !MLK_CONFIG_MULTILEVEL_NO_SHARED */ +#endif /* MLK_ARITH_BACKEND_PPC64LE && !MLK_CONFIG_MULTILEVEL_NO_SHARED */ From 63f7d521cb930d88538d653c4ca9d73b87db6568 Mon Sep 17 00:00:00 2001 From: Hanno Becker Date: Fri, 12 Sep 2025 21:06:40 +0100 Subject: [PATCH 5/6] Add CI test for PPC64LE backend Signed-off-by: Hanno Becker --- .github/actions/multi-functest/action.yml | 2 +- .github/workflows/ci.yml | 18 ++++++++++++++---- 2 files changed, 15 insertions(+), 5 deletions(-) diff --git a/.github/actions/multi-functest/action.yml b/.github/actions/multi-functest/action.yml index 252931918..64ccfebd1 100644 --- a/.github/actions/multi-functest/action.yml +++ b/.github/actions/multi-functest/action.yml @@ -119,7 +119,7 @@ runs: nix-verbose: ${{ inputs.nix-verbose }} gh_token: ${{ inputs.gh_token }} custom_shell: ${{ inputs.custom_shell }} - cflags: "${{ inputs.cflags }} -DMLK_FORCE_PPC64LE" + cflags: "${{ inputs.cflags }} -DMLK_FORCE_PPC64LE -mvsx" cross_prefix: powerpc64le-unknown-linux-gnu- exec_wrapper: qemu-ppc64le opt: ${{ inputs.opt }} diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 9f6f6fd39..b51ab25e2 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -134,16 +134,15 @@ jobs: runs-on: ${{ matrix.target.runner }} steps: - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 - - name: build + test + - name: build + test (no-opt) uses: ./.github/actions/multi-functest with: nix-shell: ${{ matrix.target.nix_shell }} nix-cache: ${{ matrix.target.mode == 'native' && 'false' || 'true' }} gh_token: ${{ secrets.GITHUB_TOKEN }} compile_mode: ${{ matrix.target.mode }} - # There is no native code yet on PPC64LE, R-V or AArch64_be, so no point running opt tests - opt: ${{ (matrix.target.arch != 'ppc64le' && matrix.target.arch != 'riscv64' && matrix.target.arch != 'riscv32' && matrix.target.arch != 'aarch64_be') && 'all' || 'no_opt' }} - - name: build + test (+debug+memsan+ubsan) + opt: 'no_opt' + - name: build + test 
(+debug+memsan+ubsan, native) uses: ./.github/actions/multi-functest if: ${{ matrix.target.mode == 'native' }} with: @@ -151,6 +150,17 @@ jobs: compile_mode: native cflags: "-DMLKEM_DEBUG -fsanitize=address -fsanitize=undefined -fno-sanitize-recover=all" check_namespace: 'false' + - name: build + test (+debug, cross, opt) + uses: ./.github/actions/multi-functest + # There is no native code yet on riscv64, riscv32 or AArch64_be, so no point running opt tests + if: ${{ matrix.target.mode != 'native' && (matrix.target.arch != 'riscv64' && matrix.target.arch != 'riscv32' && matrix.target.arch != 'aarch64_be') }} + with: + nix-shell: ${{ matrix.target.nix_shell }} + nix-cache: ${{ matrix.target.mode == 'native' && 'false' || 'true' }} + gh_token: ${{ secrets.GITHUB_TOKEN }} + compile_mode: ${{ matrix.target.mode }} + cflags: "-DMLKEM_DEBUG" + opt: 'opt' backend_tests: name: AArch64 FIPS202 backends (${{ matrix.backend }}) strategy: From fbdf60e5837300d5efc740de302d04ecd841a133 Mon Sep 17 00:00:00 2001 From: Hanno Becker Date: Thu, 11 Sep 2025 09:50:51 +0100 Subject: [PATCH 6/6] Rerun autogen to update mlkem/* sources for PPC64Le Signed-off-by: Hanno Becker --- dev/ppc64le/src/arith_native_ppc64le.h | 4 +- mlkem/mlkem_native.S | 8 +- mlkem/mlkem_native.c | 5 +- mlkem/src/native/ppc64le/meta.h | 2 +- .../native/ppc64le/src/arith_native_ppc64le.h | 4 +- mlkem/src/native/ppc64le/src/consts.c | 202 +- mlkem/src/native/ppc64le/src/consts.h | 3 + mlkem/src/native/ppc64le/src/consts_intt.inc | 90 + mlkem/src/native/ppc64le/src/consts_ntt.inc | 45 + mlkem/src/native/ppc64le/src/intt_ppc.S | 3831 ++++++++++++++--- mlkem/src/native/ppc64le/src/ntt_ppc.S | 2033 +++++++-- mlkem/src/native/ppc64le/src/poly_tomont.S | 462 +- mlkem/src/native/ppc64le/src/reduce.S | 869 +++- scripts/autogen | 12 + 14 files changed, 6024 insertions(+), 1546 deletions(-) create mode 100644 mlkem/src/native/ppc64le/src/consts_intt.inc create mode 100644 mlkem/src/native/ppc64le/src/consts_ntt.inc diff --git a/dev/ppc64le/src/arith_native_ppc64le.h b/dev/ppc64le/src/arith_native_ppc64le.h index 1c7534668..282b3566c 100644 --- a/dev/ppc64le/src/arith_native_ppc64le.h +++ b/dev/ppc64le/src/arith_native_ppc64le.h @@ -1,6 +1,6 @@ /* - * Copyright (c) 2024-2025 The mlkem-native project authors - * SPDX-License-Identifier: Apache-2.0 + * Copyright (c) The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT */ #ifndef MLK_DEV_PPC64LE_SRC_ARITH_NATIVE_PPC64LE_H #define MLK_DEV_PPC64LE_SRC_ARITH_NATIVE_PPC64LE_H diff --git a/mlkem/mlkem_native.S b/mlkem/mlkem_native.S index 6f2a8b221..46bcb9772 100644 --- a/mlkem/mlkem_native.S +++ b/mlkem/mlkem_native.S @@ -85,6 +85,12 @@ #include "mlkem/src/native/x86_64/src/rej_uniform_asm.S" #include "mlkem/src/native/x86_64/src/tomont.S" #endif /* MLK_SYS_X86_64 */ +#if defined(MLK_SYS_PPC64LE) +#include "mlkem/src/native/ppc64le/src/intt_ppc.S" +#include "mlkem/src/native/ppc64le/src/ntt_ppc.S" +#include "mlkem/src/native/ppc64le/src/poly_tomont.S" +#include "mlkem/src/native/ppc64le/src/reduce.S" +#endif /* MLK_SYS_PPC64LE */ #endif /* MLK_CONFIG_USE_NATIVE_BACKEND_ARITH */ #if defined(MLK_CONFIG_USE_NATIVE_BACKEND_FIPS202) @@ -460,7 +466,7 @@ #undef MLK_NATIVE_META_H /* mlkem/src/native/ppc64le/meta.h */ #undef MLK_ARITH_BACKEND_NAME -#undef MLK_ARITH_BACKEND_PPC64LE_DEFAULT +#undef MLK_ARITH_BACKEND_PPC64LE #undef MLK_NATIVE_PPC64LE_META_H #undef MLK_USE_NATIVE_INTT #undef MLK_USE_NATIVE_NTT diff --git a/mlkem/mlkem_native.c b/mlkem/mlkem_native.c index 
74903ed1d..0ee436f33 100644 --- a/mlkem/mlkem_native.c +++ b/mlkem/mlkem_native.c @@ -84,6 +84,9 @@ #include "src/native/x86_64/src/consts.c" #include "src/native/x86_64/src/rej_uniform_table.c" #endif +#if defined(MLK_SYS_PPC64LE) +#include "src/native/ppc64le/src/consts.c" +#endif #endif /* MLK_CONFIG_USE_NATIVE_BACKEND_ARITH */ #if defined(MLK_CONFIG_USE_NATIVE_BACKEND_FIPS202) @@ -447,7 +450,7 @@ #undef MLK_NATIVE_META_H /* mlkem/src/native/ppc64le/meta.h */ #undef MLK_ARITH_BACKEND_NAME -#undef MLK_ARITH_BACKEND_PPC64LE_DEFAULT +#undef MLK_ARITH_BACKEND_PPC64LE #undef MLK_NATIVE_PPC64LE_META_H #undef MLK_USE_NATIVE_INTT #undef MLK_USE_NATIVE_NTT diff --git a/mlkem/src/native/ppc64le/meta.h b/mlkem/src/native/ppc64le/meta.h index 54b3ddd9c..c5694f9c2 100644 --- a/mlkem/src/native/ppc64le/meta.h +++ b/mlkem/src/native/ppc64le/meta.h @@ -8,7 +8,7 @@ /* Identifier for this backend so that source and assembly files * in the build can be appropriately guarded. */ -#define MLK_ARITH_BACKEND_PPC64LE_DEFAULT +#define MLK_ARITH_BACKEND_PPC64LE #define MLK_ARITH_BACKEND_NAME PPC64LE_DEFAULT diff --git a/mlkem/src/native/ppc64le/src/arith_native_ppc64le.h b/mlkem/src/native/ppc64le/src/arith_native_ppc64le.h index dbcee3e3e..7ab3226c4 100644 --- a/mlkem/src/native/ppc64le/src/arith_native_ppc64le.h +++ b/mlkem/src/native/ppc64le/src/arith_native_ppc64le.h @@ -1,6 +1,6 @@ /* - * Copyright (c) 2024-2025 The mlkem-native project authors - * SPDX-License-Identifier: Apache-2.0 + * Copyright (c) The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT */ #ifndef MLK_NATIVE_PPC64LE_SRC_ARITH_NATIVE_PPC64LE_H #define MLK_NATIVE_PPC64LE_SRC_ARITH_NATIVE_PPC64LE_H diff --git a/mlkem/src/native/ppc64le/src/consts.c b/mlkem/src/native/ppc64le/src/consts.c index 4c2fbdf61..c9c869a60 100644 --- a/mlkem/src/native/ppc64le/src/consts.c +++ b/mlkem/src/native/ppc64le/src/consts.c @@ -5,151 +5,73 @@ #include "../../../common.h" -#if defined(MLK_ARITH_BACKEND_PPC64LE_DEFAULT) && \ +#if defined(MLK_ARITH_BACKEND_PPC64LE) && \ !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED) #include "consts.h" -MLK_ALIGN const int16_t mlk_ppc_qdata[1568] = { +MLK_ALIGN const int16_t mlk_ppc_qdata[] = { /* -Q */ - -3329, -3329, -3329, -3329, -3329, -3329, -3329, -3329, + /* check-magic: -3329 == -1 * MLKEM_Q */ + -3329, + -3329, + -3329, + -3329, + -3329, + -3329, + -3329, + -3329, /* QINV */ - -3327, -3327, -3327, -3327, -3327, -3327, -3327, -3327, + /* check-magic: -3327 == pow(MLKEM_Q,-1,2^16) */ + -3327, + -3327, + -3327, + -3327, + -3327, + -3327, + -3327, + -3327, /* Q */ - 3329, 3329, 3329, 3329, 3329, 3329, 3329, 3329, - /* const 20159 for reduce.S and intt */ - 20159, 20159, 20159, 20159, 20159, 20159, 20159, 20159, - /* const 1441 for intt */ - 1441, 1441, 1441, 1441, 1441, 1441, 1441, 1441, - /* for poly_tomont.S */ - 1353, 1353, 1353, 1353, 1353, 1353, 1353, 1353, - /* zetas */ - /* For ntt Len=128, offset 96 */ - -758, -758, -758, -758, -758, -758, -758, -758, -359, -359, -359, -359, - -359, -359, -359, -359, -1517, -1517, -1517, -1517, -1517, -1517, -1517, - -1517, 1493, 1493, 1493, 1493, 1493, 1493, 1493, 1493, 1422, 1422, 1422, - 1422, 1422, 1422, 1422, 1422, 287, 287, 287, 287, 287, 287, 287, 287, 202, - 202, 202, 202, 202, 202, 202, 202, -171, -171, -171, -171, -171, -171, -171, - -171, 622, 622, 622, 622, 622, 622, 622, 622, 1577, 1577, 1577, 1577, 1577, - 1577, 1577, 1577, 182, 182, 182, 182, 182, 182, 182, 182, 962, 962, 962, - 962, 962, 962, 962, 962, -1202, -1202, -1202, -1202, -1202, 
-1202, -1202, - -1202, -1474, -1474, -1474, -1474, -1474, -1474, -1474, -1474, 1468, 1468, - 1468, 1468, 1468, 1468, 1468, 1468, 573, 573, 573, 573, 573, 573, 573, 573, - -1325, -1325, -1325, -1325, -1325, -1325, -1325, -1325, 264, 264, 264, 264, - 264, 264, 264, 264, 383, 383, 383, 383, 383, 383, 383, 383, -829, -829, - -829, -829, -829, -829, -829, -829, 1458, 1458, 1458, 1458, 1458, 1458, - 1458, 1458, -1602, -1602, -1602, -1602, -1602, -1602, -1602, -1602, -130, - -130, -130, -130, -130, -130, -130, -130, -681, -681, -681, -681, -681, - -681, -681, -681, 1017, 1017, 1017, 1017, 1017, 1017, 1017, 1017, 732, 732, - 732, 732, 732, 732, 732, 732, 608, 608, 608, 608, 608, 608, 608, 608, -1542, - -1542, -1542, -1542, -1542, -1542, -1542, -1542, 411, 411, 411, 411, 411, - 411, 411, 411, -205, -205, -205, -205, -205, -205, -205, -205, -1571, -1571, - -1571, -1571, -1571, -1571, -1571, -1571, 1223, 1223, 1223, 1223, 1223, - 1223, 1223, 1223, 652, 652, 652, 652, 652, 652, 652, 652, -552, -552, -552, - -552, -552, -552, -552, -552, 1015, 1015, 1015, 1015, 1015, 1015, 1015, - 1015, -1293, -1293, -1293, -1293, -1293, -1293, -1293, -1293, 1491, 1491, - 1491, 1491, 1491, 1491, 1491, 1491, -282, -282, -282, -282, -282, -282, - -282, -282, -1544, -1544, -1544, -1544, -1544, -1544, -1544, -1544, 516, - 516, 516, 516, 516, 516, 516, 516, -8, -8, -8, -8, -8, -8, -8, -8, -320, - -320, -320, -320, -320, -320, -320, -320, -666, -666, -666, -666, -666, - -666, -666, -666, -1618, -1618, -1618, -1618, -1618, -1618, -1618, -1618, - -1162, -1162, -1162, -1162, -1162, -1162, -1162, -1162, 126, 126, 126, 126, - 126, 126, 126, 126, 1469, 1469, 1469, 1469, 1469, 1469, 1469, 1469, -853, - -853, -853, -853, -853, -853, -853, -853, -90, -90, -90, -90, -90, -90, -90, - -90, -271, -271, -271, -271, -271, -271, -271, -271, 830, 830, 830, 830, - 830, 830, 830, 830, 107, 107, 107, 107, 107, 107, 107, 107, -1421, -1421, - -1421, -1421, -1421, -1421, -1421, -1421, -247, -247, -247, -247, -247, - -247, -247, -247, -951, -951, -951, -951, -951, -951, -951, -951, -398, - -398, -398, -398, -398, -398, -398, -398, 961, 961, 961, 961, 961, 961, 961, - 961, -1508, -1508, -1508, -1508, -1508, -1508, -1508, -1508, -725, -725, - -725, -725, -725, -725, -725, -725, 448, 448, 448, 448, 448, 448, 448, 448, - -1065, -1065, -1065, -1065, -1065, -1065, -1065, -1065, 677, 677, 677, 677, - 677, 677, 677, 677, -1275, -1275, -1275, -1275, -1275, -1275, -1275, -1275, - /* For intt Len=2, offset IZETA_NTT_OFFSET64 */ - -1103, -1103, -1103, -1103, 430, 430, 430, 430, 555, 555, 555, 555, 843, - 843, 843, 843, -1251, -1251, -1251, -1251, 871, 871, 871, 871, 1550, 1550, - 1550, 1550, 105, 105, 105, 105, 422, 422, 422, 422, 587, 587, 587, 587, 177, - 177, 177, 177, -235, -235, -235, -235, -291, -291, -291, -291, -460, -460, - -460, -460, 1574, 1574, 1574, 1574, 1653, 1653, 1653, 1653, -246, -246, - -246, -246, 778, 778, 778, 778, 1159, 1159, 1159, 1159, -147, -147, -147, - -147, -777, -777, -777, -777, 1483, 1483, 1483, 1483, -602, -602, -602, - -602, 1119, 1119, 1119, 1119, -1590, -1590, -1590, -1590, 644, 644, 644, - 644, -872, -872, -872, -872, 349, 349, 349, 349, 418, 418, 418, 418, 329, - 329, 329, 329, -156, -156, -156, -156, -75, -75, -75, -75, 817, 817, 817, - 817, 1097, 1097, 1097, 1097, 603, 603, 603, 603, 610, 610, 610, 610, 1322, - 1322, 1322, 1322, -1285, -1285, -1285, -1285, -1465, -1465, -1465, -1465, - 384, 384, 384, 384, -1215, -1215, -1215, -1215, -136, -136, -136, -136, - 1218, 1218, 1218, 1218, -1335, -1335, -1335, -1335, -874, -874, 
-874, -874, - 220, 220, 220, 220, -1187, -1187, -1187, -1187, -1659, -1659, -1659, -1659, - -1185, -1185, -1185, -1185, -1530, -1530, -1530, -1530, -1278, -1278, -1278, - -1278, 794, 794, 794, 794, -1510, -1510, -1510, -1510, -854, -854, -854, - -854, -870, -870, -870, -870, 478, 478, 478, 478, -108, -108, -108, -108, - -308, -308, -308, -308, 996, 996, 996, 996, 991, 991, 991, 991, 958, 958, - 958, 958, -1460, -1460, -1460, -1460, 1522, 1522, 1522, 1522, 1628, 1628, - 1628, 1628, - /* For intt Len=2, offset IZETA_NTT_OFFSET127 */ - 1628, 1628, 1628, 1628, 1522, 1522, 1522, 1522, -1460, -1460, -1460, -1460, - 958, 958, 958, 958, 991, 991, 991, 991, 996, 996, 996, 996, -308, -308, - -308, -308, -108, -108, -108, -108, 478, 478, 478, 478, -870, -870, -870, - -870, -854, -854, -854, -854, -1510, -1510, -1510, -1510, 794, 794, 794, - 794, -1278, -1278, -1278, -1278, -1530, -1530, -1530, -1530, -1185, -1185, - -1185, -1185, -1659, -1659, -1659, -1659, -1187, -1187, -1187, -1187, 220, - 220, 220, 220, -874, -874, -874, -874, -1335, -1335, -1335, -1335, 1218, - 1218, 1218, 1218, -136, -136, -136, -136, -1215, -1215, -1215, -1215, 384, - 384, 384, 384, -1465, -1465, -1465, -1465, -1285, -1285, -1285, -1285, 1322, - 1322, 1322, 1322, 610, 610, 610, 610, 603, 603, 603, 603, 1097, 1097, 1097, - 1097, 817, 817, 817, 817, -75, -75, -75, -75, -156, -156, -156, -156, 329, - 329, 329, 329, 418, 418, 418, 418, 349, 349, 349, 349, -872, -872, -872, - -872, 644, 644, 644, 644, -1590, -1590, -1590, -1590, 1119, 1119, 1119, - 1119, -602, -602, -602, -602, 1483, 1483, 1483, 1483, -777, -777, -777, - -777, -147, -147, -147, -147, 1159, 1159, 1159, 1159, 778, 778, 778, 778, - -246, -246, -246, -246, 1653, 1653, 1653, 1653, 1574, 1574, 1574, 1574, - -460, -460, -460, -460, -291, -291, -291, -291, -235, -235, -235, -235, 177, - 177, 177, 177, 587, 587, 587, 587, 422, 422, 422, 422, 105, 105, 105, 105, - 1550, 1550, 1550, 1550, 871, 871, 871, 871, -1251, -1251, -1251, -1251, 843, - 843, 843, 843, 555, 555, 555, 555, 430, 430, 430, 430, -1103, -1103, -1103, - -1103, - /* For intt Len=4 and others, offset IZETA_NTT_OFFSET63 */ - -1275, -1275, -1275, -1275, -1275, -1275, -1275, -1275, 677, 677, 677, 677, - 677, 677, 677, 677, -1065, -1065, -1065, -1065, -1065, -1065, -1065, -1065, - 448, 448, 448, 448, 448, 448, 448, 448, -725, -725, -725, -725, -725, -725, - -725, -725, -1508, -1508, -1508, -1508, -1508, -1508, -1508, -1508, 961, - 961, 961, 961, 961, 961, 961, 961, -398, -398, -398, -398, -398, -398, -398, - -398, -951, -951, -951, -951, -951, -951, -951, -951, -247, -247, -247, - -247, -247, -247, -247, -247, -1421, -1421, -1421, -1421, -1421, -1421, - -1421, -1421, 107, 107, 107, 107, 107, 107, 107, 107, 830, 830, 830, 830, - 830, 830, 830, 830, -271, -271, -271, -271, -271, -271, -271, -271, -90, - -90, -90, -90, -90, -90, -90, -90, -853, -853, -853, -853, -853, -853, -853, - -853, 1469, 1469, 1469, 1469, 1469, 1469, 1469, 1469, 126, 126, 126, 126, - 126, 126, 126, 126, -1162, -1162, -1162, -1162, -1162, -1162, -1162, -1162, - -1618, -1618, -1618, -1618, -1618, -1618, -1618, -1618, -666, -666, -666, - -666, -666, -666, -666, -666, -320, -320, -320, -320, -320, -320, -320, - -320, -8, -8, -8, -8, -8, -8, -8, -8, 516, 516, 516, 516, 516, 516, 516, - 516, -1544, -1544, -1544, -1544, -1544, -1544, -1544, -1544, -282, -282, - -282, -282, -282, -282, -282, -282, 1491, 1491, 1491, 1491, 1491, 1491, - 1491, 1491, -1293, -1293, -1293, -1293, -1293, -1293, -1293, -1293, 1015, - 1015, 1015, 1015, 1015, 1015, 1015, 1015, 
-552, -552, -552, -552, -552, - -552, -552, -552, 652, 652, 652, 652, 652, 652, 652, 652, 1223, 1223, 1223, - 1223, 1223, 1223, 1223, 1223, -1571, -1571, -1571, -1571, -1571, -1571, - -1571, -1571, -205, -205, -205, -205, -205, -205, -205, -205, 411, 411, 411, - 411, 411, 411, 411, 411, -1542, -1542, -1542, -1542, -1542, -1542, -1542, - -1542, 608, 608, 608, 608, 608, 608, 608, 608, 732, 732, 732, 732, 732, 732, - 732, 732, 1017, 1017, 1017, 1017, 1017, 1017, 1017, 1017, -681, -681, -681, - -681, -681, -681, -681, -681, -130, -130, -130, -130, -130, -130, -130, - -130, -1602, -1602, -1602, -1602, -1602, -1602, -1602, -1602, 1458, 1458, - 1458, 1458, 1458, 1458, 1458, 1458, -829, -829, -829, -829, -829, -829, - -829, -829, 383, 383, 383, 383, 383, 383, 383, 383, 264, 264, 264, 264, 264, - 264, 264, 264, -1325, -1325, -1325, -1325, -1325, -1325, -1325, -1325, 573, - 573, 573, 573, 573, 573, 573, 573, 1468, 1468, 1468, 1468, 1468, 1468, 1468, - 1468, -1474, -1474, -1474, -1474, -1474, -1474, -1474, -1474, -1202, -1202, - -1202, -1202, -1202, -1202, -1202, -1202, 962, 962, 962, 962, 962, 962, 962, - 962, 182, 182, 182, 182, 182, 182, 182, 182, 1577, 1577, 1577, 1577, 1577, - 1577, 1577, 1577, 622, 622, 622, 622, 622, 622, 622, 622, -171, -171, -171, - -171, -171, -171, -171, -171, 202, 202, 202, 202, 202, 202, 202, 202, 287, - 287, 287, 287, 287, 287, 287, 287, 1422, 1422, 1422, 1422, 1422, 1422, 1422, - 1422, 1493, 1493, 1493, 1493, 1493, 1493, 1493, 1493, -1517, -1517, -1517, - -1517, -1517, -1517, -1517, -1517, -359, -359, -359, -359, -359, -359, -359, - -359, -758, -758, -758, -758, -758, -758, -758, -758}; + 3329, + 3329, + 3329, + 3329, + 3329, + 3329, + 3329, + 3329, + /* check-magic: 20159 == round(2^26 / MLKEM_Q) */ + 20159, + 20159, + 20159, + 20159, + 20159, + 20159, + 20159, + 20159, + /* check-magic: 1441 == pow(2,32-7,MLKEM_Q) */ + 1441, + 1441, + 1441, + 1441, + 1441, + 1441, + 1441, + 1441, + /* check-magic: 1353 == pow(2, 32, MLKEM_Q) */ + 1353, + 1353, + 1353, + 1353, + 1353, + 1353, + 1353, + 1353, +/* zetas for NTT */ +#include "consts_ntt.inc" + , +/* zetas for invNTT */ +#include "consts_intt.inc" +}; -#endif /* MLK_ARITH_BACKEND_PPC64LE_DEFAULT && \ - !MLK_CONFIG_MULTILEVEL_NO_SHARED */ +#endif /* MLK_ARITH_BACKEND_PPC64LE && !MLK_CONFIG_MULTILEVEL_NO_SHARED */ diff --git a/mlkem/src/native/ppc64le/src/consts.h b/mlkem/src/native/ppc64le/src/consts.h index 49f519d0c..38246248d 100644 --- a/mlkem/src/native/ppc64le/src/consts.h +++ b/mlkem/src/native/ppc64le/src/consts.h @@ -7,6 +7,8 @@ #define MLK_NATIVE_PPC64LE_SRC_CONSTS_H #include "../../../common.h" +/* Offsets into the constant table */ +/* check-magic: off */ #define NQ_OFFSET 0 #define QINV_OFFSET 16 #define Q_OFFSET 32 @@ -17,6 +19,7 @@ #define ZETA_NTT_OFFSET64 1104 #define IZETA_NTT_OFFSET127 1616 #define IZETA_NTT_OFFSET63 2128 +/* check-magic: on */ #ifndef __ASSEMBLER__ #define mlk_ppc_qdata MLK_NAMESPACE(ppc_qdata) diff --git a/mlkem/src/native/ppc64le/src/consts_intt.inc b/mlkem/src/native/ppc64le/src/consts_intt.inc new file mode 100644 index 000000000..7cd95fcd0 --- /dev/null +++ b/mlkem/src/native/ppc64le/src/consts_intt.inc @@ -0,0 +1,90 @@ +/* + * Copyright (c) The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + + /* For intt Len=2, offset IZETA_NTT_OFFSET64 */ + -1103, -1103, -1103, -1103, 430, 430, 430, 430, 555, 555, 555, 555, 843, + 843, 843, 843, -1251, -1251, -1251, -1251, 871, 871, 871, 871, 1550, 1550, + 1550, 1550, 105, 105, 105, 105, 422, 422, 422, 
422, 587, 587, 587, 587, 177, + 177, 177, 177, -235, -235, -235, -235, -291, -291, -291, -291, -460, -460, + -460, -460, 1574, 1574, 1574, 1574, 1653, 1653, 1653, 1653, -246, -246, + -246, -246, 778, 778, 778, 778, 1159, 1159, 1159, 1159, -147, -147, -147, + -147, -777, -777, -777, -777, 1483, 1483, 1483, 1483, -602, -602, -602, + -602, 1119, 1119, 1119, 1119, -1590, -1590, -1590, -1590, 644, 644, 644, + 644, -872, -872, -872, -872, 349, 349, 349, 349, 418, 418, 418, 418, 329, + 329, 329, 329, -156, -156, -156, -156, -75, -75, -75, -75, 817, 817, 817, + 817, 1097, 1097, 1097, 1097, 603, 603, 603, 603, 610, 610, 610, 610, 1322, + 1322, 1322, 1322, -1285, -1285, -1285, -1285, -1465, -1465, -1465, -1465, + 384, 384, 384, 384, -1215, -1215, -1215, -1215, -136, -136, -136, -136, + 1218, 1218, 1218, 1218, -1335, -1335, -1335, -1335, -874, -874, -874, -874, + 220, 220, 220, 220, -1187, -1187, -1187, -1187, -1659, -1659, -1659, -1659, + -1185, -1185, -1185, -1185, -1530, -1530, -1530, -1530, -1278, -1278, -1278, + -1278, 794, 794, 794, 794, -1510, -1510, -1510, -1510, -854, -854, -854, + -854, -870, -870, -870, -870, 478, 478, 478, 478, -108, -108, -108, -108, + -308, -308, -308, -308, 996, 996, 996, 996, 991, 991, 991, 991, 958, 958, + 958, 958, -1460, -1460, -1460, -1460, 1522, 1522, 1522, 1522, 1628, 1628, + 1628, 1628, + /* For intt Len=2, offset IZETA_NTT_OFFSET127 */ + 1628, 1628, 1628, 1628, 1522, 1522, 1522, 1522, -1460, -1460, -1460, -1460, + 958, 958, 958, 958, 991, 991, 991, 991, 996, 996, 996, 996, -308, -308, + -308, -308, -108, -108, -108, -108, 478, 478, 478, 478, -870, -870, -870, + -870, -854, -854, -854, -854, -1510, -1510, -1510, -1510, 794, 794, 794, + 794, -1278, -1278, -1278, -1278, -1530, -1530, -1530, -1530, -1185, -1185, + -1185, -1185, -1659, -1659, -1659, -1659, -1187, -1187, -1187, -1187, 220, + 220, 220, 220, -874, -874, -874, -874, -1335, -1335, -1335, -1335, 1218, + 1218, 1218, 1218, -136, -136, -136, -136, -1215, -1215, -1215, -1215, 384, + 384, 384, 384, -1465, -1465, -1465, -1465, -1285, -1285, -1285, -1285, 1322, + 1322, 1322, 1322, 610, 610, 610, 610, 603, 603, 603, 603, 1097, 1097, 1097, + 1097, 817, 817, 817, 817, -75, -75, -75, -75, -156, -156, -156, -156, 329, + 329, 329, 329, 418, 418, 418, 418, 349, 349, 349, 349, -872, -872, -872, + -872, 644, 644, 644, 644, -1590, -1590, -1590, -1590, 1119, 1119, 1119, + 1119, -602, -602, -602, -602, 1483, 1483, 1483, 1483, -777, -777, -777, + -777, -147, -147, -147, -147, 1159, 1159, 1159, 1159, 778, 778, 778, 778, + -246, -246, -246, -246, 1653, 1653, 1653, 1653, 1574, 1574, 1574, 1574, + -460, -460, -460, -460, -291, -291, -291, -291, -235, -235, -235, -235, 177, + 177, 177, 177, 587, 587, 587, 587, 422, 422, 422, 422, 105, 105, 105, 105, + 1550, 1550, 1550, 1550, 871, 871, 871, 871, -1251, -1251, -1251, -1251, 843, + 843, 843, 843, 555, 555, 555, 555, 430, 430, 430, 430, -1103, -1103, -1103, + -1103, + /* For intt Len=4 and others, offset IZETA_NTT_OFFSET63 */ + -1275, -1275, -1275, -1275, -1275, -1275, -1275, -1275, 677, 677, 677, 677, + 677, 677, 677, 677, -1065, -1065, -1065, -1065, -1065, -1065, -1065, -1065, + 448, 448, 448, 448, 448, 448, 448, 448, -725, -725, -725, -725, -725, -725, + -725, -725, -1508, -1508, -1508, -1508, -1508, -1508, -1508, -1508, 961, + 961, 961, 961, 961, 961, 961, 961, -398, -398, -398, -398, -398, -398, -398, + -398, -951, -951, -951, -951, -951, -951, -951, -951, -247, -247, -247, + -247, -247, -247, -247, -247, -1421, -1421, -1421, -1421, -1421, -1421, + -1421, -1421, 107, 107, 107, 
107, 107, 107, 107, 107, 830, 830, 830, 830, + 830, 830, 830, 830, -271, -271, -271, -271, -271, -271, -271, -271, -90, + -90, -90, -90, -90, -90, -90, -90, -853, -853, -853, -853, -853, -853, -853, + -853, 1469, 1469, 1469, 1469, 1469, 1469, 1469, 1469, 126, 126, 126, 126, + 126, 126, 126, 126, -1162, -1162, -1162, -1162, -1162, -1162, -1162, -1162, + -1618, -1618, -1618, -1618, -1618, -1618, -1618, -1618, -666, -666, -666, + -666, -666, -666, -666, -666, -320, -320, -320, -320, -320, -320, -320, + -320, -8, -8, -8, -8, -8, -8, -8, -8, 516, 516, 516, 516, 516, 516, 516, + 516, -1544, -1544, -1544, -1544, -1544, -1544, -1544, -1544, -282, -282, + -282, -282, -282, -282, -282, -282, 1491, 1491, 1491, 1491, 1491, 1491, + 1491, 1491, -1293, -1293, -1293, -1293, -1293, -1293, -1293, -1293, 1015, + 1015, 1015, 1015, 1015, 1015, 1015, 1015, -552, -552, -552, -552, -552, + -552, -552, -552, 652, 652, 652, 652, 652, 652, 652, 652, 1223, 1223, 1223, + 1223, 1223, 1223, 1223, 1223, -1571, -1571, -1571, -1571, -1571, -1571, + -1571, -1571, -205, -205, -205, -205, -205, -205, -205, -205, 411, 411, 411, + 411, 411, 411, 411, 411, -1542, -1542, -1542, -1542, -1542, -1542, -1542, + -1542, 608, 608, 608, 608, 608, 608, 608, 608, 732, 732, 732, 732, 732, 732, + 732, 732, 1017, 1017, 1017, 1017, 1017, 1017, 1017, 1017, -681, -681, -681, + -681, -681, -681, -681, -681, -130, -130, -130, -130, -130, -130, -130, + -130, -1602, -1602, -1602, -1602, -1602, -1602, -1602, -1602, 1458, 1458, + 1458, 1458, 1458, 1458, 1458, 1458, -829, -829, -829, -829, -829, -829, + -829, -829, 383, 383, 383, 383, 383, 383, 383, 383, 264, 264, 264, 264, 264, + 264, 264, 264, -1325, -1325, -1325, -1325, -1325, -1325, -1325, -1325, 573, + 573, 573, 573, 573, 573, 573, 573, 1468, 1468, 1468, 1468, 1468, 1468, 1468, + 1468, -1474, -1474, -1474, -1474, -1474, -1474, -1474, -1474, -1202, -1202, + -1202, -1202, -1202, -1202, -1202, -1202, 962, 962, 962, 962, 962, 962, 962, + 962, 182, 182, 182, 182, 182, 182, 182, 182, 1577, 1577, 1577, 1577, 1577, + 1577, 1577, 1577, 622, 622, 622, 622, 622, 622, 622, 622, -171, -171, -171, + -171, -171, -171, -171, -171, 202, 202, 202, 202, 202, 202, 202, 202, 287, + 287, 287, 287, 287, 287, 287, 287, 1422, 1422, 1422, 1422, 1422, 1422, 1422, + 1422, 1493, 1493, 1493, 1493, 1493, 1493, 1493, 1493, -1517, -1517, -1517, + -1517, -1517, -1517, -1517, -1517, -359, -359, -359, -359, -359, -359, -359, + -359, -758, -758, -758, -758, -758, -758, -758, -758 diff --git a/mlkem/src/native/ppc64le/src/consts_ntt.inc b/mlkem/src/native/ppc64le/src/consts_ntt.inc new file mode 100644 index 000000000..bfb64e722 --- /dev/null +++ b/mlkem/src/native/ppc64le/src/consts_ntt.inc @@ -0,0 +1,45 @@ +/* + * Copyright (c) The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + /* For ntt Len=128, offset 96 */ + -758, -758, -758, -758, -758, -758, -758, -758, -359, -359, -359, -359, + -359, -359, -359, -359, -1517, -1517, -1517, -1517, -1517, -1517, -1517, + -1517, 1493, 1493, 1493, 1493, 1493, 1493, 1493, 1493, 1422, 1422, 1422, + 1422, 1422, 1422, 1422, 1422, 287, 287, 287, 287, 287, 287, 287, 287, 202, + 202, 202, 202, 202, 202, 202, 202, -171, -171, -171, -171, -171, -171, -171, + -171, 622, 622, 622, 622, 622, 622, 622, 622, 1577, 1577, 1577, 1577, 1577, + 1577, 1577, 1577, 182, 182, 182, 182, 182, 182, 182, 182, 962, 962, 962, + 962, 962, 962, 962, 962, -1202, -1202, -1202, -1202, -1202, -1202, -1202, + -1202, -1474, -1474, -1474, -1474, -1474, -1474, -1474, -1474, 1468, 1468, + 1468, 
1468, 1468, 1468, 1468, 1468, 573, 573, 573, 573, 573, 573, 573, 573, + -1325, -1325, -1325, -1325, -1325, -1325, -1325, -1325, 264, 264, 264, 264, + 264, 264, 264, 264, 383, 383, 383, 383, 383, 383, 383, 383, -829, -829, + -829, -829, -829, -829, -829, -829, 1458, 1458, 1458, 1458, 1458, 1458, + 1458, 1458, -1602, -1602, -1602, -1602, -1602, -1602, -1602, -1602, -130, + -130, -130, -130, -130, -130, -130, -130, -681, -681, -681, -681, -681, + -681, -681, -681, 1017, 1017, 1017, 1017, 1017, 1017, 1017, 1017, 732, 732, + 732, 732, 732, 732, 732, 732, 608, 608, 608, 608, 608, 608, 608, 608, -1542, + -1542, -1542, -1542, -1542, -1542, -1542, -1542, 411, 411, 411, 411, 411, + 411, 411, 411, -205, -205, -205, -205, -205, -205, -205, -205, -1571, -1571, + -1571, -1571, -1571, -1571, -1571, -1571, 1223, 1223, 1223, 1223, 1223, + 1223, 1223, 1223, 652, 652, 652, 652, 652, 652, 652, 652, -552, -552, -552, + -552, -552, -552, -552, -552, 1015, 1015, 1015, 1015, 1015, 1015, 1015, + 1015, -1293, -1293, -1293, -1293, -1293, -1293, -1293, -1293, 1491, 1491, + 1491, 1491, 1491, 1491, 1491, 1491, -282, -282, -282, -282, -282, -282, + -282, -282, -1544, -1544, -1544, -1544, -1544, -1544, -1544, -1544, 516, + 516, 516, 516, 516, 516, 516, 516, -8, -8, -8, -8, -8, -8, -8, -8, -320, + -320, -320, -320, -320, -320, -320, -320, -666, -666, -666, -666, -666, + -666, -666, -666, -1618, -1618, -1618, -1618, -1618, -1618, -1618, -1618, + -1162, -1162, -1162, -1162, -1162, -1162, -1162, -1162, 126, 126, 126, 126, + 126, 126, 126, 126, 1469, 1469, 1469, 1469, 1469, 1469, 1469, 1469, -853, + -853, -853, -853, -853, -853, -853, -853, -90, -90, -90, -90, -90, -90, -90, + -90, -271, -271, -271, -271, -271, -271, -271, -271, 830, 830, 830, 830, + 830, 830, 830, 830, 107, 107, 107, 107, 107, 107, 107, 107, -1421, -1421, + -1421, -1421, -1421, -1421, -1421, -1421, -247, -247, -247, -247, -247, + -247, -247, -247, -951, -951, -951, -951, -951, -951, -951, -951, -398, + -398, -398, -398, -398, -398, -398, -398, 961, 961, 961, 961, 961, 961, 961, + 961, -1508, -1508, -1508, -1508, -1508, -1508, -1508, -1508, -725, -725, + -725, -725, -725, -725, -725, -725, 448, 448, 448, 448, 448, 448, 448, 448, + -1065, -1065, -1065, -1065, -1065, -1065, -1065, -1065, 677, 677, 677, 677, + 677, 677, 677, 677, -1275, -1275, -1275, -1275, -1275, -1275, -1275, -1275 diff --git a/mlkem/src/native/ppc64le/src/intt_ppc.S b/mlkem/src/native/ppc64le/src/intt_ppc.S index 2883a7bdf..de8b634a4 100644 --- a/mlkem/src/native/ppc64le/src/intt_ppc.S +++ b/mlkem/src/native/ppc64le/src/intt_ppc.S @@ -3,116 +3,145 @@ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT */ -# -# Copyright 2025- IBM Corp. -# -#=================================================================================== -# Written by Danny Tsen -# +/* + * Copyright 2025- IBM Corp. 
+ * + *=================================================================================== + * Written by Danny Tsen + * + */ #include "../../../common.h" -#if defined(MLK_ARITH_BACKEND_PPC64LE_DEFAULT) && \ +#if defined(MLK_ARITH_BACKEND_PPC64LE) && \ !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED) -#include "consts.h" - -.machine "any" -.text - -# Barrett reduce constatnts -#define V20159 0 -#define V_25 1 -#define V_26 2 -#define V_MKQ 3 - -# Montgomery reduce constatnts -#define V_QINV 2 -#define V_NMKQ 5 -#define V_Z0 7 -#define V_Z1 8 -#define V_Z2 9 -#define V_Z3 10 -#define V_ZETA 10 -#define V1441 10 - -.macro Load_4Coeffs start next step - mr 9, \start # j - add 10, 7, 9 # J + len*2 - addi 16, 9, \next - addi 17, 10, \step - addi 18, 16, \next - addi 19, 17, \step - addi 20, 18, \next - addi 21, 19, \step - lxvd2x 32+8, 3, 10 # r[j+len] - lxvd2x 32+12, 3, 17 # r[j+len] - lxvd2x 32+16, 3, 19 # r[j+len] - lxvd2x 32+20, 3, 21 # r[j+len] - xxpermdi 32+8, 32+8, 32+8, 2 - xxpermdi 32+12, 32+12, 32+12, 2 - xxpermdi 32+16, 32+16, 32+16, 2 - xxpermdi 32+20, 32+20, 32+20, 2 +/* + * WARNING: This file is auto-derived from the mlkem-native source file + * dev/ppc64le/src/intt_ppc.S using scripts/simpasm. Do not modify it directly. + */ - lxvd2x 32+21, 3, 9 - lxvd2x 32+22, 3, 16 - lxvd2x 32+23, 3, 18 - lxvd2x 32+24, 3, 20 - xxpermdi 32+21, 32+21, 32+21, 2 - xxpermdi 32+22, 32+22, 32+22, 2 - xxpermdi 32+23, 32+23, 32+23, 2 - xxpermdi 32+24, 32+24, 32+24, 2 - vsubuhm 25, 8, 21 # r[j+len] - t - vsubuhm 26, 12, 22 # r[j+len] - t - vsubuhm 30, 16, 23 # r[j+len] - t - vsubuhm 31, 20, 24 # r[j+len] - t - vadduhm 8, 8, 21 # r[j+len] + t - vadduhm 12, 12, 22 # r[j+len] + t - vadduhm 16, 16, 23 # r[j+len] + t - vadduhm 20, 20, 24 # r[j+len] + t -.endm +.text +.balign 16 +.global MLK_ASM_NAMESPACE(intt_ppc) +MLK_ASM_FN_SYMBOL(intt_ppc) -.macro BREDUCE_4X _v0 _v1 _v2 _v3 - vxor 7, 7, 7 - xxlor 32+3, 6, 6 # V_MKQ - xxlor 32+1, 7, 7 # V_25 - xxlor 32+2, 8, 8 # V_26 - # Multify Odd/Even signed halfword; - # Results word bound by 2^32 in abs value. 
- vmulosh 6, 8, V20159 - vmulesh 5, 8, V20159 - vmulosh 11, 12, V20159 - vmulesh 10, 12, V20159 - vmulosh 15, 16, V20159 - vmulesh 14, 16, V20159 - vmulosh 19, 20, V20159 - vmulesh 18, 20, V20159 - xxmrglw 32+4, 32+5, 32+6 - xxmrghw 32+5, 32+5, 32+6 - xxmrglw 32+9, 32+10, 32+11 - xxmrghw 32+10, 32+10, 32+11 - xxmrglw 32+13, 32+14, 32+15 - xxmrghw 32+14, 32+14, 32+15 - xxmrglw 32+17, 32+18, 32+19 - xxmrghw 32+18, 32+18, 32+19 - vadduwm 4, 4, V_25 - vadduwm 5, 5, V_25 - vadduwm 9, 9, V_25 - vadduwm 10, 10, V_25 - vadduwm 13, 13, V_25 - vadduwm 14, 14, V_25 - vadduwm 17, 17, V_25 - vadduwm 18, 18, V_25 - # Right shift and pack lower halfword, - # results bond to 2^16 in abs value - vsraw 4, 4, V_26 - vsraw 5, 5, V_26 - vsraw 9, 9, V_26 - vsraw 10, 10, V_26 - vsraw 13, 13, V_26 - vsraw 14, 14, V_26 - vsraw 17, 17, V_26 - vsraw 18, 18, V_26 + .cfi_startproc + stdu 1, -352(1) + mflr 0 + std 14, 56(1) + std 15, 64(1) + std 16, 72(1) + std 17, 80(1) + std 18, 88(1) + std 19, 96(1) + std 20, 104(1) + std 21, 112(1) + stxv 52, 128(1) + stxv 53, 144(1) + stxv 54, 160(1) + stxv 55, 176(1) + stxv 56, 192(1) + stxv 57, 208(1) + stxv 58, 224(1) + stxv 59, 240(1) + stxv 60, 256(1) + stxv 61, 272(1) + stxv 62, 288(1) + stxv 63, 304(1) + lxv 0, 0(4) + lxv 34, 16(4) + xxlxor 35, 35, 35 + vspltish 4, 1 + xxlor 2, 34, 34 + xxlor 3, 35, 35 + xxlor 4, 36, 36 + lxv 6, 32(4) + lxv 32, 48(4) + lxv 7, 0(4) + vspltisw 8, 13 + vadduwm 8, 8, 8 + xxlor 8, 40, 40 + vspltisw 9, 1 + vsubuwm 10, 8, 9 + vslw 9, 9, 10 + xxlor 7, 41, 41 + ori 2, 2, 0 + addi 14, 4, 1616 + li 7, 4 + li 15, 4 + mtctr 15 + li 5, 0 + +Lintt_ppc__Loop2: + mr 9, 5 + add 10, 7, 9 + addi 16, 9, 16 + addi 17, 10, 16 + addi 18, 16, 16 + addi 19, 17, 16 + addi 20, 18, 16 + addi 21, 19, 16 + lxvd2x 40, 3, 10 + lxvd2x 44, 3, 17 + lxvd2x 48, 3, 19 + lxvd2x 52, 3, 21 + xxswapd 40, 40 + xxswapd 44, 44 + xxswapd 48, 48 + xxswapd 52, 52 + lxvd2x 53, 3, 9 + lxvd2x 54, 3, 16 + lxvd2x 55, 3, 18 + lxvd2x 56, 3, 20 + xxswapd 53, 53 + xxswapd 54, 54 + xxswapd 55, 55 + xxswapd 56, 56 + vsubuhm 25, 8, 21 + vsubuhm 26, 12, 22 + vsubuhm 30, 16, 23 + vsubuhm 31, 20, 24 + vadduhm 8, 8, 21 + vadduhm 12, 12, 22 + vadduhm 16, 16, 23 + vadduhm 20, 20, 24 + vxor 7, 7, 7 + xxlor 35, 6, 6 + xxlor 33, 7, 7 + xxlor 34, 8, 8 + vmulosh 6, 8, 0 + vmulesh 5, 8, 0 + vmulosh 11, 12, 0 + vmulesh 10, 12, 0 + vmulosh 15, 16, 0 + vmulesh 14, 16, 0 + vmulosh 19, 20, 0 + vmulesh 18, 20, 0 + xxmrglw 36, 37, 38 + xxmrghw 37, 37, 38 + xxmrglw 41, 42, 43 + xxmrghw 42, 42, 43 + xxmrglw 45, 46, 47 + xxmrghw 46, 46, 47 + xxmrglw 49, 50, 51 + xxmrghw 50, 50, 51 + vadduwm 4, 4, 1 + vadduwm 5, 5, 1 + vadduwm 9, 9, 1 + vadduwm 10, 10, 1 + vadduwm 13, 13, 1 + vadduwm 14, 14, 1 + vadduwm 17, 17, 1 + vadduwm 18, 18, 1 + vsraw 4, 4, 2 + vsraw 5, 5, 2 + vsraw 9, 9, 2 + vsraw 10, 10, 2 + vsraw 13, 13, 2 + vsraw 14, 14, 2 + vsraw 17, 17, 2 + vsraw 18, 18, 2 vpkuwum 4, 5, 4 vsubuhm 4, 7, 4 vpkuwum 9, 10, 9 @@ -121,565 +150,3041 @@ vsubuhm 13, 7, 13 vpkuwum 17, 18, 17 vsubuhm 17, 7, 17 - # Modulo multify-Low unsigned halfword; - # results bond to 2^16 * q in abs value. 
- vmladduhm \_v0, 4, V_MKQ, 8 - vmladduhm \_v1, 9, V_MKQ, 12 - vmladduhm \_v2, 13, V_MKQ, 16 - vmladduhm \_v3, 17, V_MKQ, 20 -.endm - -#----------------------------------- -# MREDUCE_4X(len, start, _vz0, _vz1, _vz2, _vz3) -# -.macro MREDUCE_4X _vz0 _vz1 _vz2 _vz3 _vo0 _vo1 _vo2 _vo3 - # Modular multification bond by 2^16 * q in abs value - vmladduhm 15, 25, \_vz0, 3 - vmladduhm 20, 26, \_vz1, 3 - vmladduhm 27, 30, \_vz2, 3 - vmladduhm 28, 31, \_vz3, 3 - - # Signed multiply-high-round; outputs are bound by 2^15 * q in abs value - vmhraddshs 14, 25, \_vz0, 3 - vmhraddshs 19, 26, \_vz1, 3 - vmhraddshs 24, 30, \_vz2, 3 - vmhraddshs 29, 31, \_vz3, 3 - - vmladduhm 15, 15, V_QINV, 3 - vmladduhm 20, 20, V_QINV, 3 - vmladduhm 25, 27, V_QINV, 3 - vmladduhm 30, 28, V_QINV, 3 - - vmhraddshs 15, 15, V_NMKQ, 14 - vmhraddshs 20, 20, V_NMKQ, 19 - vmhraddshs 25, 25, V_NMKQ, 24 - vmhraddshs 30, 30, V_NMKQ, 29 - - vsrah \_vo0, 15, 4 # >> 1 - vsrah \_vo1, 20, 4 # >> 1 - vsrah \_vo2, 25, 4 # >> 1 - vsrah \_vo3, 30, 4 # >> 1 -.endm - -.macro Set_mont_consts - xxlor 32+5, 0, 0 # V_NMKQ - xxlor 32+2, 2, 2 # V_QINV - xxlor 32+3, 3, 3 # 0 - xxlor 32+4, 4, 4 # 1 -.endm - -.macro Load_next_4zetas - lxv 32+V_Z0, 0(14) - lxv 32+V_Z1, 16(14) - lxv 32+V_Z2, 32(14) - lxv 32+V_Z3, 48(14) - addi 14, 14, 64 -.endm - -.macro Write_B4C _vs0 _vs1 _vs2 _vs3 - stxvx \_vs0, 3, 9 - stxvx \_vs1, 3, 16 - stxvx \_vs2, 3, 18 - stxvx \_vs3, 3, 20 -.endm - -.macro Write_M4C _vs0 _vs1 _vs2 _vs3 - stxvx \_vs0, 3, 10 - stxvx \_vs1, 3, 17 - stxvx \_vs2, 3, 19 - stxvx \_vs3, 3, 21 -.endm - -.macro Reload_4coeffs - lxv 32+25, 0(3) - lxv 32+26, 16(3) - lxv 32+30, 32(3) - lxv 32+31, 48(3) - addi 3, 3, 64 -.endm - -.macro MWrite_8X _vs0 _vs1 _vs2 _vs3 _vs4 _vs5 _vs6 _vs7 - stxv \_vs0, -128(3) - stxv \_vs1, -112(3) - stxv \_vs2, -96(3) - stxv \_vs3, -80(3) - stxv \_vs4, -64(3) - stxv \_vs5, -48(3) - stxv \_vs6, -32(3) - stxv \_vs7, -16(3) -.endm - -.macro Write_Len2_4C _vs0 _vs1 _vs2 _vs3 - xxmrglw 32+12, \_vs0, 10 - xxmrghw 32+11, \_vs0, 10 - xxpermdi 10, 32+12, 32+11, 3 - xxmrglw 32+16, \_vs1, 11 - xxmrghw 32+15, \_vs1, 11 - xxpermdi 11, 32+16, 32+15, 3 - xxmrglw 32+12, \_vs2, 12 - xxmrghw 32+11, \_vs2, 12 - xxpermdi 12, 32+12, 32+11, 3 - xxmrglw 32+16, \_vs3, 13 - xxmrghw 32+15, \_vs3, 13 - xxpermdi 13, 32+16, 32+15, 3 - stxvd2x 10, 3, 9 - stxvd2x 11, 3, 16 - stxvd2x 12, 3, 18 - stxvd2x 13, 3, 20 -.endm - -.macro Write_Len4_4C _vs0 _vs1 _vs2 _vs3 - xxpermdi 10, 10, \_vs0, 3 - xxpermdi 11, 11, \_vs1, 3 - xxpermdi 12, 12, \_vs2, 3 - xxpermdi 13, 13, \_vs3, 3 - stxvd2x 10, 3, 9 - stxvd2x 11, 3, 16 - stxvd2x 12, 3, 18 - stxvd2x 13, 3, 20 -.endm - -# intt -# t = r[j]; -# r[j] = barrett_reduce(t + r[j + len]); -# r[j + len] = r[j + len] - t; -# r[j + len] = fqmul(zeta, r[j + len]); - -# -# mlk_intt_ppc(r) -# -.global MLK_ASM_NAMESPACE(intt_ppc) -.align 4 -MLK_ASM_FN_SYMBOL(intt_ppc) - - stdu 1, -352(1) - mflr 0 - std 14, 56(1) - std 15, 64(1) - std 16, 72(1) - std 17, 80(1) - std 18, 88(1) - std 19, 96(1) - std 20, 104(1) - std 21, 112(1) - stxv 32+20, 128(1) - stxv 32+21, 144(1) - stxv 32+22, 160(1) - stxv 32+23, 176(1) - stxv 32+24, 192(1) - stxv 32+25, 208(1) - stxv 32+26, 224(1) - stxv 32+27, 240(1) - stxv 32+28, 256(1) - stxv 32+29, 272(1) - stxv 32+30, 288(1) - stxv 32+31, 304(1) - - # init vectors and constants - # Setup for Montgomery reduce - lxv 0, 0(4) - - lxv 32+V_QINV, QINV_OFFSET(4) # QINV - xxlxor 32+3, 32+3, 32+3 - vspltish 4, 1 - xxlor 2, 32+2, 32+2 # QINV - xxlor 3, 32+3, 32+3 # 0 - xxlor 4, 32+4, 32+4 # 1 - - # Setup for Barrett 
reduce - lxv 6, Q_OFFSET(4) # V_MKQ - lxv 32+V20159, C20159_OFFSET(4) # V20159 - lxv 7, 0(4) # V_25 - - #xxspltiw 8, 26 # for power9 and above - vspltisw 8, 13 - vadduwm 8, 8, 8 - xxlor 8, 32+8, 32+8 # V_26 store at vs8 - - vspltisw 9, 1 - vsubuwm 10, 8, 9 # 25 - vslw 9, 9, 10 - xxlor 7, 32+9, 32+9 # V_25 syore at vs7 - -.align 4 -#__Len2: - # - # 1. len = 2, start = 0, 4, 8, 12,...244, 248, 252 - # Update zetas vectors, each vector has 2 zetas - addi 14, 4, IZETA_NTT_OFFSET127 - li 7, 4 - li 15, 4 - mtctr 15 - li 5, 0 -intt_ppc__Loop2: - Load_4Coeffs 5, 16, 16 - BREDUCE_4X 4, 9, 13, 17 - xxlor 10, 32+4, 32+4 - xxlor 11, 32+9, 32+9 - xxlor 12, 32+13, 32+13 - xxlor 13, 32+17, 32+17 - Set_mont_consts - Load_next_4zetas - MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 - Write_Len2_4C 32+13, 32+18, 32+23, 32+28 - - addi 5, 5, 64 - - Load_4Coeffs 5, 16, 16 - BREDUCE_4X 4, 9, 13, 17 - xxlor 10, 32+4, 32+4 - xxlor 11, 32+9, 32+9 - xxlor 12, 32+13, 32+13 - xxlor 13, 32+17, 32+17 - Set_mont_consts - Load_next_4zetas - MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 - Write_Len2_4C 32+13, 32+18, 32+23, 32+28 - addi 5, 5, 64 - bdnz intt_ppc__Loop2 - -.align 4 -#__Len4: - # - # 2. len = 4, start = 0, 8, 16, 24,...232, 240, 248 - addi 14, 4, IZETA_NTT_OFFSET63 - li 5, 0 - li 7, 8 - li 15, 4 # loops - mtctr 15 -intt_ppc__Loop4: - Load_4Coeffs 5, 16, 16 - BREDUCE_4X 4, 9, 13, 17 - xxlor 10, 32+4, 32+4 - xxlor 11, 32+9, 32+9 - xxlor 12, 32+13, 32+13 - xxlor 13, 32+17, 32+17 - Set_mont_consts - Load_next_4zetas - MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 - Write_Len4_4C 32+13, 32+18, 32+23, 32+28 - addi 5, 5, 64 - - Load_4Coeffs 5, 16, 16 - BREDUCE_4X 4, 9, 13, 17 - xxlor 10, 32+4, 32+4 - xxlor 11, 32+9, 32+9 - xxlor 12, 32+13, 32+13 - xxlor 13, 32+17, 32+17 - Set_mont_consts - Load_next_4zetas - MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 - Write_Len4_4C 32+13, 32+18, 32+23, 32+28 - addi 5, 5, 64 - bdnz intt_ppc__Loop4 - -.align 4 -#__Len8: - # 3. len = 8, start = 0, 16, 32, 48,...208, 224, 240 - #addi 14, 14, 512 - li 7, 16 - li 5, 0 - - Load_4Coeffs 5, 32, 32 - BREDUCE_4X 4, 9, 13, 17 - Write_B4C 32+4, 32+9, 32+13, 32+17 - Set_mont_consts - Load_next_4zetas - MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 - Write_M4C 32+13, 32+18, 32+23, 32+28 - li 5, 128 - - Load_4Coeffs 5, 32, 32 - BREDUCE_4X 4, 9, 13, 17 - Write_B4C 32+4, 32+9, 32+13, 32+17 - Set_mont_consts - Load_next_4zetas - MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 - Write_M4C 32+13, 32+18, 32+23, 32+28 - li 5, 256 - - Load_4Coeffs 5, 32, 32 - BREDUCE_4X 4, 9, 13, 17 - Write_B4C 32+4, 32+9, 32+13, 32+17 - Set_mont_consts - Load_next_4zetas - MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 - Write_M4C 32+13, 32+18, 32+23, 32+28 - li 5, 384 - - Load_4Coeffs 5, 32, 32 - BREDUCE_4X 4, 9, 13, 17 - Write_B4C 32+4, 32+9, 32+13, 32+17 - Set_mont_consts - Load_next_4zetas - MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 - Write_M4C 32+13, 32+18, 32+23, 32+28 - -.align 4 -#__Len16: - # - # 4. 
len = 16, start = 0, 32, 64,,...160, 192, 224 - #addi 14, 14, 768 - li 5, 0 - li 7, 32 - - Load_4Coeffs 5, 64, 64 - BREDUCE_4X 4, 9, 13, 17 - Write_B4C 32+4, 32+9, 32+13, 32+17 - Set_mont_consts - Load_next_4zetas - MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 - Write_M4C 32+13, 32+18, 32+23, 32+28 - li 5, 16 - Load_4Coeffs 5, 64, 64 - BREDUCE_4X 4, 9, 13, 17 - Write_B4C 32+4, 32+9, 32+13, 32+17 - Set_mont_consts - addi 14, 14, -64 - Load_next_4zetas - MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 - Write_M4C 32+13, 32+18, 32+23, 32+28 - - li 5, 256 - Load_4Coeffs 5, 64, 64 - BREDUCE_4X 4, 9, 13, 17 - Write_B4C 32+4, 32+9, 32+13, 32+17 - Set_mont_consts - Load_next_4zetas - MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 - Write_M4C 32+13, 32+18, 32+23, 32+28 - - li 5, 272 - Load_4Coeffs 5, 64, 64 - BREDUCE_4X 4, 9, 13, 17 - Write_B4C 32+4, 32+9, 32+13, 32+17 - Set_mont_consts - addi 14, 14, -64 - Load_next_4zetas - MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 - Write_M4C 32+13, 32+18, 32+23, 32+28 - -.align 4 -#__Len32: - # - # 5. len = 32, start = 0, 64, 128, 192 - #addi 14, 14, 896 - li 5, 0 - li 7, 64 - - Load_4Coeffs 5, 16, 16 - BREDUCE_4X 4, 9, 13, 17 - Write_B4C 32+4, 32+9, 32+13, 32+17 - Set_mont_consts - lvx V_ZETA, 0, 14 - addi 14, 14, 16 - MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 - Write_M4C 32+13, 32+18, 32+23, 32+28 - li 5, 128 - - Load_4Coeffs 5, 16, 16 - BREDUCE_4X 4, 9, 13, 17 - Write_B4C 32+4, 32+9, 32+13, 32+17 - Set_mont_consts - lvx V_ZETA, 0, 14 - addi 14, 14, 16 - MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 - Write_M4C 32+13, 32+18, 32+23, 32+28 - li 5, 256 - - Load_4Coeffs 5, 16, 16 - BREDUCE_4X 4, 9, 13, 17 - Write_B4C 32+4, 32+9, 32+13, 32+17 - Set_mont_consts - lvx V_ZETA, 0, 14 - addi 14, 14, 16 - MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 - Write_M4C 32+13, 32+18, 32+23, 32+28 - li 5, 384 - - Load_4Coeffs 5, 16, 16 - BREDUCE_4X 4, 9, 13, 17 - Write_B4C 32+4, 32+9, 32+13, 32+17 - Set_mont_consts - lvx V_ZETA, 0, 14 - addi 14, 14, 16 - MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 - Write_M4C 32+13, 32+18, 32+23, 32+28 - -.align 4 -#__Len64: - # - # 6. len = 64, start = 0, 128 - #addi 14, 14, 960 - li 5, 0 - li 7, 128 - Load_4Coeffs 5, 16, 16 - BREDUCE_4X 4, 9, 13, 17 - Write_B4C 32+4, 32+9, 32+13, 32+17 - Set_mont_consts - lvx V_ZETA, 0, 14 - addi 14, 14, 16 - MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 - Write_M4C 32+13, 32+18, 32+23, 32+28 - li 5, 64 - - Load_4Coeffs 5, 16, 16 - BREDUCE_4X 4, 9, 13, 17 - Write_B4C 32+4, 32+9, 32+13, 32+17 - Set_mont_consts - lxv 32+10, -16(14) - MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 - Write_M4C 32+13, 32+18, 32+23, 32+28 - li 5, 256 - - Load_4Coeffs 5, 16, 16 - BREDUCE_4X 4, 9, 13, 17 - Write_B4C 32+4, 32+9, 32+13, 32+17 - Set_mont_consts - lvx V_ZETA, 0, 14 - addi 14, 14, 16 - MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 - Write_M4C 32+13, 32+18, 32+23, 32+28 - li 5, 320 - - Load_4Coeffs 5, 16, 16 - BREDUCE_4X 4, 9, 13, 17 - Write_B4C 32+4, 32+9, 32+13, 32+17 - Set_mont_consts - lxv 32+10, -16(14) - MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 - Write_M4C 32+13, 32+18, 32+23, 32+28 - -.align 4 -#__Len128: - # 7. 
len = 128, start = 0 - # - #addi 14, 14, 992 - li 5, 0 # start - li 7, 256 # len * 2 - - Load_4Coeffs 5, 16, 16 - BREDUCE_4X 4, 9, 13, 17 - Write_B4C 32+4, 32+9, 32+13, 32+17 - Set_mont_consts - lvx V_ZETA, 0, 14 - xxlor 9, 32+10, 32+10 - MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 - Write_M4C 32+13, 32+18, 32+23, 32+28 - li 5, 64 - - Load_4Coeffs 5, 16, 16 - BREDUCE_4X 4, 9, 13, 17 - Write_B4C 32+4, 32+9, 32+13, 32+17 - Set_mont_consts - xxlor 32+10, 9, 9 - MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 - Write_M4C 32+13, 32+18, 32+23, 32+28 - li 5, 128 - - Load_4Coeffs 5, 16, 16 - BREDUCE_4X 4, 9, 13, 17 - Write_B4C 32+4, 32+9, 32+13, 32+17 - Set_mont_consts - xxlor 32+10, 9, 9 - MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 - Write_M4C 32+13, 32+18, 32+23, 32+28 - li 5, 192 - - Load_4Coeffs 5, 16, 16 - BREDUCE_4X 4, 9, 13, 17 - Write_B4C 32+4, 32+9, 32+13, 32+17 - Set_mont_consts - xxlor 32+10, 9, 9 - MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 - Write_M4C 32+13, 32+18, 32+23, 32+28 - -.align 4 - # - # Montgomery reduce loops with constant 1441 - # - addi 14, 4, C1441_OFFSET - lvx V1441, 0, 14 - - Reload_4coeffs - MREDUCE_4X V1441, V1441, V1441, V1441, 6, 7, 8, 9 - Reload_4coeffs - MREDUCE_4X V1441, V1441, V1441, V1441, 13, 18, 23, 28 - MWrite_8X 32+6, 32+7, 32+8, 32+9, 32+13, 32+18, 32+23, 32+28 - - Reload_4coeffs - MREDUCE_4X V1441, V1441, V1441, V1441, 6, 7, 8, 9 - Reload_4coeffs - MREDUCE_4X V1441, V1441, V1441, V1441, 13, 18, 23, 28 - MWrite_8X 32+6, 32+7, 32+8, 32+9, 32+13, 32+18, 32+23, 32+28 - - Reload_4coeffs - MREDUCE_4X V1441, V1441, V1441, V1441, 6, 7, 8, 9 - Reload_4coeffs - MREDUCE_4X V1441, V1441, V1441, V1441, 13, 18, 23, 28 - MWrite_8X 32+6, 32+7, 32+8, 32+9, 32+13, 32+18, 32+23, 32+28 - - Reload_4coeffs - MREDUCE_4X V1441, V1441, V1441, V1441, 6, 7, 8, 9 - Reload_4coeffs - MREDUCE_4X V1441, V1441, V1441, V1441, 13, 18, 23, 28 - MWrite_8X 32+6, 32+7, 32+8, 32+9, 32+13, 32+18, 32+23, 32+28 - - lxv 32+20, 128(1) - lxv 32+21, 144(1) - lxv 32+22, 160(1) - lxv 32+23, 176(1) - lxv 32+24, 192(1) - lxv 32+25, 208(1) - lxv 32+26, 224(1) - lxv 32+27, 240(1) - lxv 32+28, 256(1) - lxv 32+29, 272(1) - lxv 32+30, 288(1) - lxv 32+31, 304(1) - ld 14, 56(1) - ld 15, 64(1) - ld 16, 72(1) - ld 16, 72(1) - ld 17, 80(1) - ld 18, 88(1) - ld 19, 96(1) - ld 20, 104(1) - ld 21, 112(1) - - mtlr 0 - addi 1, 1, 352 + vmladduhm 4, 4, 3, 8 + vmladduhm 9, 9, 3, 12 + vmladduhm 13, 13, 3, 16 + vmladduhm 17, 17, 3, 20 + xxlor 10, 36, 36 + xxlor 11, 41, 41 + xxlor 12, 45, 45 + xxlor 13, 49, 49 + xxlor 37, 0, 0 + xxlor 34, 2, 2 + xxlor 35, 3, 3 + xxlor 36, 4, 4 + lxv 39, 0(14) + lxv 40, 16(14) + lxv 41, 32(14) + lxv 42, 48(14) + addi 14, 14, 64 + vmladduhm 15, 25, 7, 3 + vmladduhm 20, 26, 8, 3 + vmladduhm 27, 30, 9, 3 + vmladduhm 28, 31, 10, 3 + vmhraddshs 14, 25, 7, 3 + vmhraddshs 19, 26, 8, 3 + vmhraddshs 24, 30, 9, 3 + vmhraddshs 29, 31, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 27, 2, 3 + vmladduhm 30, 28, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 + xxmrglw 44, 45, 10 + xxmrghw 43, 45, 10 + xxmrgld 10, 44, 43 + xxmrglw 48, 50, 11 + xxmrghw 47, 50, 11 + xxmrgld 11, 48, 47 + xxmrglw 44, 55, 12 + xxmrghw 43, 55, 12 + xxmrgld 12, 44, 43 + xxmrglw 48, 60, 13 + xxmrghw 47, 60, 13 + xxmrgld 13, 48, 47 + stxvd2x 10, 3, 9 + stxvd2x 11, 3, 16 + stxvd2x 12, 3, 18 + stxvd2x 13, 3, 20 + addi 5, 
5, 64 + mr 9, 5 + add 10, 7, 9 + addi 16, 9, 16 + addi 17, 10, 16 + addi 18, 16, 16 + addi 19, 17, 16 + addi 20, 18, 16 + addi 21, 19, 16 + lxvd2x 40, 3, 10 + lxvd2x 44, 3, 17 + lxvd2x 48, 3, 19 + lxvd2x 52, 3, 21 + xxswapd 40, 40 + xxswapd 44, 44 + xxswapd 48, 48 + xxswapd 52, 52 + lxvd2x 53, 3, 9 + lxvd2x 54, 3, 16 + lxvd2x 55, 3, 18 + lxvd2x 56, 3, 20 + xxswapd 53, 53 + xxswapd 54, 54 + xxswapd 55, 55 + xxswapd 56, 56 + vsubuhm 25, 8, 21 + vsubuhm 26, 12, 22 + vsubuhm 30, 16, 23 + vsubuhm 31, 20, 24 + vadduhm 8, 8, 21 + vadduhm 12, 12, 22 + vadduhm 16, 16, 23 + vadduhm 20, 20, 24 + vxor 7, 7, 7 + xxlor 35, 6, 6 + xxlor 33, 7, 7 + xxlor 34, 8, 8 + vmulosh 6, 8, 0 + vmulesh 5, 8, 0 + vmulosh 11, 12, 0 + vmulesh 10, 12, 0 + vmulosh 15, 16, 0 + vmulesh 14, 16, 0 + vmulosh 19, 20, 0 + vmulesh 18, 20, 0 + xxmrglw 36, 37, 38 + xxmrghw 37, 37, 38 + xxmrglw 41, 42, 43 + xxmrghw 42, 42, 43 + xxmrglw 45, 46, 47 + xxmrghw 46, 46, 47 + xxmrglw 49, 50, 51 + xxmrghw 50, 50, 51 + vadduwm 4, 4, 1 + vadduwm 5, 5, 1 + vadduwm 9, 9, 1 + vadduwm 10, 10, 1 + vadduwm 13, 13, 1 + vadduwm 14, 14, 1 + vadduwm 17, 17, 1 + vadduwm 18, 18, 1 + vsraw 4, 4, 2 + vsraw 5, 5, 2 + vsraw 9, 9, 2 + vsraw 10, 10, 2 + vsraw 13, 13, 2 + vsraw 14, 14, 2 + vsraw 17, 17, 2 + vsraw 18, 18, 2 + vpkuwum 4, 5, 4 + vsubuhm 4, 7, 4 + vpkuwum 9, 10, 9 + vsubuhm 9, 7, 9 + vpkuwum 13, 14, 13 + vsubuhm 13, 7, 13 + vpkuwum 17, 18, 17 + vsubuhm 17, 7, 17 + vmladduhm 4, 4, 3, 8 + vmladduhm 9, 9, 3, 12 + vmladduhm 13, 13, 3, 16 + vmladduhm 17, 17, 3, 20 + xxlor 10, 36, 36 + xxlor 11, 41, 41 + xxlor 12, 45, 45 + xxlor 13, 49, 49 + xxlor 37, 0, 0 + xxlor 34, 2, 2 + xxlor 35, 3, 3 + xxlor 36, 4, 4 + lxv 39, 0(14) + lxv 40, 16(14) + lxv 41, 32(14) + lxv 42, 48(14) + addi 14, 14, 64 + vmladduhm 15, 25, 7, 3 + vmladduhm 20, 26, 8, 3 + vmladduhm 27, 30, 9, 3 + vmladduhm 28, 31, 10, 3 + vmhraddshs 14, 25, 7, 3 + vmhraddshs 19, 26, 8, 3 + vmhraddshs 24, 30, 9, 3 + vmhraddshs 29, 31, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 27, 2, 3 + vmladduhm 30, 28, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 + xxmrglw 44, 45, 10 + xxmrghw 43, 45, 10 + xxmrgld 10, 44, 43 + xxmrglw 48, 50, 11 + xxmrghw 47, 50, 11 + xxmrgld 11, 48, 47 + xxmrglw 44, 55, 12 + xxmrghw 43, 55, 12 + xxmrgld 12, 44, 43 + xxmrglw 48, 60, 13 + xxmrghw 47, 60, 13 + xxmrgld 13, 48, 47 + stxvd2x 10, 3, 9 + stxvd2x 11, 3, 16 + stxvd2x 12, 3, 18 + stxvd2x 13, 3, 20 + addi 5, 5, 64 + bdnz Lintt_ppc__Loop2 + nop + ori 2, 2, 0 + addi 14, 4, 2128 + li 5, 0 + li 7, 8 + li 15, 4 + mtctr 15 + +Lintt_ppc__Loop4: + mr 9, 5 + add 10, 7, 9 + addi 16, 9, 16 + addi 17, 10, 16 + addi 18, 16, 16 + addi 19, 17, 16 + addi 20, 18, 16 + addi 21, 19, 16 + lxvd2x 40, 3, 10 + lxvd2x 44, 3, 17 + lxvd2x 48, 3, 19 + lxvd2x 52, 3, 21 + xxswapd 40, 40 + xxswapd 44, 44 + xxswapd 48, 48 + xxswapd 52, 52 + lxvd2x 53, 3, 9 + lxvd2x 54, 3, 16 + lxvd2x 55, 3, 18 + lxvd2x 56, 3, 20 + xxswapd 53, 53 + xxswapd 54, 54 + xxswapd 55, 55 + xxswapd 56, 56 + vsubuhm 25, 8, 21 + vsubuhm 26, 12, 22 + vsubuhm 30, 16, 23 + vsubuhm 31, 20, 24 + vadduhm 8, 8, 21 + vadduhm 12, 12, 22 + vadduhm 16, 16, 23 + vadduhm 20, 20, 24 + vxor 7, 7, 7 + xxlor 35, 6, 6 + xxlor 33, 7, 7 + xxlor 34, 8, 8 + vmulosh 6, 8, 0 + vmulesh 5, 8, 0 + vmulosh 11, 12, 0 + vmulesh 10, 12, 0 + vmulosh 15, 16, 0 + vmulesh 14, 16, 0 + vmulosh 19, 20, 0 + vmulesh 18, 20, 0 + xxmrglw 36, 37, 38 + xxmrghw 37, 
37, 38 + xxmrglw 41, 42, 43 + xxmrghw 42, 42, 43 + xxmrglw 45, 46, 47 + xxmrghw 46, 46, 47 + xxmrglw 49, 50, 51 + xxmrghw 50, 50, 51 + vadduwm 4, 4, 1 + vadduwm 5, 5, 1 + vadduwm 9, 9, 1 + vadduwm 10, 10, 1 + vadduwm 13, 13, 1 + vadduwm 14, 14, 1 + vadduwm 17, 17, 1 + vadduwm 18, 18, 1 + vsraw 4, 4, 2 + vsraw 5, 5, 2 + vsraw 9, 9, 2 + vsraw 10, 10, 2 + vsraw 13, 13, 2 + vsraw 14, 14, 2 + vsraw 17, 17, 2 + vsraw 18, 18, 2 + vpkuwum 4, 5, 4 + vsubuhm 4, 7, 4 + vpkuwum 9, 10, 9 + vsubuhm 9, 7, 9 + vpkuwum 13, 14, 13 + vsubuhm 13, 7, 13 + vpkuwum 17, 18, 17 + vsubuhm 17, 7, 17 + vmladduhm 4, 4, 3, 8 + vmladduhm 9, 9, 3, 12 + vmladduhm 13, 13, 3, 16 + vmladduhm 17, 17, 3, 20 + xxlor 10, 36, 36 + xxlor 11, 41, 41 + xxlor 12, 45, 45 + xxlor 13, 49, 49 + xxlor 37, 0, 0 + xxlor 34, 2, 2 + xxlor 35, 3, 3 + xxlor 36, 4, 4 + lxv 39, 0(14) + lxv 40, 16(14) + lxv 41, 32(14) + lxv 42, 48(14) + addi 14, 14, 64 + vmladduhm 15, 25, 7, 3 + vmladduhm 20, 26, 8, 3 + vmladduhm 27, 30, 9, 3 + vmladduhm 28, 31, 10, 3 + vmhraddshs 14, 25, 7, 3 + vmhraddshs 19, 26, 8, 3 + vmhraddshs 24, 30, 9, 3 + vmhraddshs 29, 31, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 27, 2, 3 + vmladduhm 30, 28, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 + xxmrgld 10, 10, 45 + xxmrgld 11, 11, 50 + xxmrgld 12, 12, 55 + xxmrgld 13, 13, 60 + stxvd2x 10, 3, 9 + stxvd2x 11, 3, 16 + stxvd2x 12, 3, 18 + stxvd2x 13, 3, 20 + addi 5, 5, 64 + mr 9, 5 + add 10, 7, 9 + addi 16, 9, 16 + addi 17, 10, 16 + addi 18, 16, 16 + addi 19, 17, 16 + addi 20, 18, 16 + addi 21, 19, 16 + lxvd2x 40, 3, 10 + lxvd2x 44, 3, 17 + lxvd2x 48, 3, 19 + lxvd2x 52, 3, 21 + xxswapd 40, 40 + xxswapd 44, 44 + xxswapd 48, 48 + xxswapd 52, 52 + lxvd2x 53, 3, 9 + lxvd2x 54, 3, 16 + lxvd2x 55, 3, 18 + lxvd2x 56, 3, 20 + xxswapd 53, 53 + xxswapd 54, 54 + xxswapd 55, 55 + xxswapd 56, 56 + vsubuhm 25, 8, 21 + vsubuhm 26, 12, 22 + vsubuhm 30, 16, 23 + vsubuhm 31, 20, 24 + vadduhm 8, 8, 21 + vadduhm 12, 12, 22 + vadduhm 16, 16, 23 + vadduhm 20, 20, 24 + vxor 7, 7, 7 + xxlor 35, 6, 6 + xxlor 33, 7, 7 + xxlor 34, 8, 8 + vmulosh 6, 8, 0 + vmulesh 5, 8, 0 + vmulosh 11, 12, 0 + vmulesh 10, 12, 0 + vmulosh 15, 16, 0 + vmulesh 14, 16, 0 + vmulosh 19, 20, 0 + vmulesh 18, 20, 0 + xxmrglw 36, 37, 38 + xxmrghw 37, 37, 38 + xxmrglw 41, 42, 43 + xxmrghw 42, 42, 43 + xxmrglw 45, 46, 47 + xxmrghw 46, 46, 47 + xxmrglw 49, 50, 51 + xxmrghw 50, 50, 51 + vadduwm 4, 4, 1 + vadduwm 5, 5, 1 + vadduwm 9, 9, 1 + vadduwm 10, 10, 1 + vadduwm 13, 13, 1 + vadduwm 14, 14, 1 + vadduwm 17, 17, 1 + vadduwm 18, 18, 1 + vsraw 4, 4, 2 + vsraw 5, 5, 2 + vsraw 9, 9, 2 + vsraw 10, 10, 2 + vsraw 13, 13, 2 + vsraw 14, 14, 2 + vsraw 17, 17, 2 + vsraw 18, 18, 2 + vpkuwum 4, 5, 4 + vsubuhm 4, 7, 4 + vpkuwum 9, 10, 9 + vsubuhm 9, 7, 9 + vpkuwum 13, 14, 13 + vsubuhm 13, 7, 13 + vpkuwum 17, 18, 17 + vsubuhm 17, 7, 17 + vmladduhm 4, 4, 3, 8 + vmladduhm 9, 9, 3, 12 + vmladduhm 13, 13, 3, 16 + vmladduhm 17, 17, 3, 20 + xxlor 10, 36, 36 + xxlor 11, 41, 41 + xxlor 12, 45, 45 + xxlor 13, 49, 49 + xxlor 37, 0, 0 + xxlor 34, 2, 2 + xxlor 35, 3, 3 + xxlor 36, 4, 4 + lxv 39, 0(14) + lxv 40, 16(14) + lxv 41, 32(14) + lxv 42, 48(14) + addi 14, 14, 64 + vmladduhm 15, 25, 7, 3 + vmladduhm 20, 26, 8, 3 + vmladduhm 27, 30, 9, 3 + vmladduhm 28, 31, 10, 3 + vmhraddshs 14, 25, 7, 3 + vmhraddshs 19, 26, 8, 3 + vmhraddshs 24, 30, 9, 3 + vmhraddshs 29, 31, 10, 3 + vmladduhm 15, 15, 2, 3 + 
vmladduhm 20, 20, 2, 3 + vmladduhm 25, 27, 2, 3 + vmladduhm 30, 28, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 + xxmrgld 10, 10, 45 + xxmrgld 11, 11, 50 + xxmrgld 12, 12, 55 + xxmrgld 13, 13, 60 + stxvd2x 10, 3, 9 + stxvd2x 11, 3, 16 + stxvd2x 12, 3, 18 + stxvd2x 13, 3, 20 + addi 5, 5, 64 + bdnz Lintt_ppc__Loop4 + nop + ori 2, 2, 0 + li 7, 16 + li 5, 0 + mr 9, 5 + add 10, 7, 9 + addi 16, 9, 32 + addi 17, 10, 32 + addi 18, 16, 32 + addi 19, 17, 32 + addi 20, 18, 32 + addi 21, 19, 32 + lxvd2x 40, 3, 10 + lxvd2x 44, 3, 17 + lxvd2x 48, 3, 19 + lxvd2x 52, 3, 21 + xxswapd 40, 40 + xxswapd 44, 44 + xxswapd 48, 48 + xxswapd 52, 52 + lxvd2x 53, 3, 9 + lxvd2x 54, 3, 16 + lxvd2x 55, 3, 18 + lxvd2x 56, 3, 20 + xxswapd 53, 53 + xxswapd 54, 54 + xxswapd 55, 55 + xxswapd 56, 56 + vsubuhm 25, 8, 21 + vsubuhm 26, 12, 22 + vsubuhm 30, 16, 23 + vsubuhm 31, 20, 24 + vadduhm 8, 8, 21 + vadduhm 12, 12, 22 + vadduhm 16, 16, 23 + vadduhm 20, 20, 24 + vxor 7, 7, 7 + xxlor 35, 6, 6 + xxlor 33, 7, 7 + xxlor 34, 8, 8 + vmulosh 6, 8, 0 + vmulesh 5, 8, 0 + vmulosh 11, 12, 0 + vmulesh 10, 12, 0 + vmulosh 15, 16, 0 + vmulesh 14, 16, 0 + vmulosh 19, 20, 0 + vmulesh 18, 20, 0 + xxmrglw 36, 37, 38 + xxmrghw 37, 37, 38 + xxmrglw 41, 42, 43 + xxmrghw 42, 42, 43 + xxmrglw 45, 46, 47 + xxmrghw 46, 46, 47 + xxmrglw 49, 50, 51 + xxmrghw 50, 50, 51 + vadduwm 4, 4, 1 + vadduwm 5, 5, 1 + vadduwm 9, 9, 1 + vadduwm 10, 10, 1 + vadduwm 13, 13, 1 + vadduwm 14, 14, 1 + vadduwm 17, 17, 1 + vadduwm 18, 18, 1 + vsraw 4, 4, 2 + vsraw 5, 5, 2 + vsraw 9, 9, 2 + vsraw 10, 10, 2 + vsraw 13, 13, 2 + vsraw 14, 14, 2 + vsraw 17, 17, 2 + vsraw 18, 18, 2 + vpkuwum 4, 5, 4 + vsubuhm 4, 7, 4 + vpkuwum 9, 10, 9 + vsubuhm 9, 7, 9 + vpkuwum 13, 14, 13 + vsubuhm 13, 7, 13 + vpkuwum 17, 18, 17 + vsubuhm 17, 7, 17 + vmladduhm 4, 4, 3, 8 + vmladduhm 9, 9, 3, 12 + vmladduhm 13, 13, 3, 16 + vmladduhm 17, 17, 3, 20 + stxvx 36, 3, 9 + stxvx 41, 3, 16 + stxvx 45, 3, 18 + stxvx 49, 3, 20 + xxlor 37, 0, 0 + xxlor 34, 2, 2 + xxlor 35, 3, 3 + xxlor 36, 4, 4 + lxv 39, 0(14) + lxv 40, 16(14) + lxv 41, 32(14) + lxv 42, 48(14) + addi 14, 14, 64 + vmladduhm 15, 25, 7, 3 + vmladduhm 20, 26, 8, 3 + vmladduhm 27, 30, 9, 3 + vmladduhm 28, 31, 10, 3 + vmhraddshs 14, 25, 7, 3 + vmhraddshs 19, 26, 8, 3 + vmhraddshs 24, 30, 9, 3 + vmhraddshs 29, 31, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 27, 2, 3 + vmladduhm 30, 28, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 + stxvx 45, 3, 10 + stxvx 50, 3, 17 + stxvx 55, 3, 19 + stxvx 60, 3, 21 + li 5, 128 + mr 9, 5 + add 10, 7, 9 + addi 16, 9, 32 + addi 17, 10, 32 + addi 18, 16, 32 + addi 19, 17, 32 + addi 20, 18, 32 + addi 21, 19, 32 + lxvd2x 40, 3, 10 + lxvd2x 44, 3, 17 + lxvd2x 48, 3, 19 + lxvd2x 52, 3, 21 + xxswapd 40, 40 + xxswapd 44, 44 + xxswapd 48, 48 + xxswapd 52, 52 + lxvd2x 53, 3, 9 + lxvd2x 54, 3, 16 + lxvd2x 55, 3, 18 + lxvd2x 56, 3, 20 + xxswapd 53, 53 + xxswapd 54, 54 + xxswapd 55, 55 + xxswapd 56, 56 + vsubuhm 25, 8, 21 + vsubuhm 26, 12, 22 + vsubuhm 30, 16, 23 + vsubuhm 31, 20, 24 + vadduhm 8, 8, 21 + vadduhm 12, 12, 22 + vadduhm 16, 16, 23 + vadduhm 20, 20, 24 + vxor 7, 7, 7 + xxlor 35, 6, 6 + xxlor 33, 7, 7 + xxlor 34, 8, 8 + vmulosh 6, 8, 0 + vmulesh 5, 8, 0 + vmulosh 11, 12, 0 + vmulesh 10, 12, 0 + vmulosh 15, 16, 
0 + vmulesh 14, 16, 0 + vmulosh 19, 20, 0 + vmulesh 18, 20, 0 + xxmrglw 36, 37, 38 + xxmrghw 37, 37, 38 + xxmrglw 41, 42, 43 + xxmrghw 42, 42, 43 + xxmrglw 45, 46, 47 + xxmrghw 46, 46, 47 + xxmrglw 49, 50, 51 + xxmrghw 50, 50, 51 + vadduwm 4, 4, 1 + vadduwm 5, 5, 1 + vadduwm 9, 9, 1 + vadduwm 10, 10, 1 + vadduwm 13, 13, 1 + vadduwm 14, 14, 1 + vadduwm 17, 17, 1 + vadduwm 18, 18, 1 + vsraw 4, 4, 2 + vsraw 5, 5, 2 + vsraw 9, 9, 2 + vsraw 10, 10, 2 + vsraw 13, 13, 2 + vsraw 14, 14, 2 + vsraw 17, 17, 2 + vsraw 18, 18, 2 + vpkuwum 4, 5, 4 + vsubuhm 4, 7, 4 + vpkuwum 9, 10, 9 + vsubuhm 9, 7, 9 + vpkuwum 13, 14, 13 + vsubuhm 13, 7, 13 + vpkuwum 17, 18, 17 + vsubuhm 17, 7, 17 + vmladduhm 4, 4, 3, 8 + vmladduhm 9, 9, 3, 12 + vmladduhm 13, 13, 3, 16 + vmladduhm 17, 17, 3, 20 + stxvx 36, 3, 9 + stxvx 41, 3, 16 + stxvx 45, 3, 18 + stxvx 49, 3, 20 + xxlor 37, 0, 0 + xxlor 34, 2, 2 + xxlor 35, 3, 3 + xxlor 36, 4, 4 + lxv 39, 0(14) + lxv 40, 16(14) + lxv 41, 32(14) + lxv 42, 48(14) + addi 14, 14, 64 + vmladduhm 15, 25, 7, 3 + vmladduhm 20, 26, 8, 3 + vmladduhm 27, 30, 9, 3 + vmladduhm 28, 31, 10, 3 + vmhraddshs 14, 25, 7, 3 + vmhraddshs 19, 26, 8, 3 + vmhraddshs 24, 30, 9, 3 + vmhraddshs 29, 31, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 27, 2, 3 + vmladduhm 30, 28, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 + stxvx 45, 3, 10 + stxvx 50, 3, 17 + stxvx 55, 3, 19 + stxvx 60, 3, 21 + li 5, 256 + mr 9, 5 + add 10, 7, 9 + addi 16, 9, 32 + addi 17, 10, 32 + addi 18, 16, 32 + addi 19, 17, 32 + addi 20, 18, 32 + addi 21, 19, 32 + lxvd2x 40, 3, 10 + lxvd2x 44, 3, 17 + lxvd2x 48, 3, 19 + lxvd2x 52, 3, 21 + xxswapd 40, 40 + xxswapd 44, 44 + xxswapd 48, 48 + xxswapd 52, 52 + lxvd2x 53, 3, 9 + lxvd2x 54, 3, 16 + lxvd2x 55, 3, 18 + lxvd2x 56, 3, 20 + xxswapd 53, 53 + xxswapd 54, 54 + xxswapd 55, 55 + xxswapd 56, 56 + vsubuhm 25, 8, 21 + vsubuhm 26, 12, 22 + vsubuhm 30, 16, 23 + vsubuhm 31, 20, 24 + vadduhm 8, 8, 21 + vadduhm 12, 12, 22 + vadduhm 16, 16, 23 + vadduhm 20, 20, 24 + vxor 7, 7, 7 + xxlor 35, 6, 6 + xxlor 33, 7, 7 + xxlor 34, 8, 8 + vmulosh 6, 8, 0 + vmulesh 5, 8, 0 + vmulosh 11, 12, 0 + vmulesh 10, 12, 0 + vmulosh 15, 16, 0 + vmulesh 14, 16, 0 + vmulosh 19, 20, 0 + vmulesh 18, 20, 0 + xxmrglw 36, 37, 38 + xxmrghw 37, 37, 38 + xxmrglw 41, 42, 43 + xxmrghw 42, 42, 43 + xxmrglw 45, 46, 47 + xxmrghw 46, 46, 47 + xxmrglw 49, 50, 51 + xxmrghw 50, 50, 51 + vadduwm 4, 4, 1 + vadduwm 5, 5, 1 + vadduwm 9, 9, 1 + vadduwm 10, 10, 1 + vadduwm 13, 13, 1 + vadduwm 14, 14, 1 + vadduwm 17, 17, 1 + vadduwm 18, 18, 1 + vsraw 4, 4, 2 + vsraw 5, 5, 2 + vsraw 9, 9, 2 + vsraw 10, 10, 2 + vsraw 13, 13, 2 + vsraw 14, 14, 2 + vsraw 17, 17, 2 + vsraw 18, 18, 2 + vpkuwum 4, 5, 4 + vsubuhm 4, 7, 4 + vpkuwum 9, 10, 9 + vsubuhm 9, 7, 9 + vpkuwum 13, 14, 13 + vsubuhm 13, 7, 13 + vpkuwum 17, 18, 17 + vsubuhm 17, 7, 17 + vmladduhm 4, 4, 3, 8 + vmladduhm 9, 9, 3, 12 + vmladduhm 13, 13, 3, 16 + vmladduhm 17, 17, 3, 20 + stxvx 36, 3, 9 + stxvx 41, 3, 16 + stxvx 45, 3, 18 + stxvx 49, 3, 20 + xxlor 37, 0, 0 + xxlor 34, 2, 2 + xxlor 35, 3, 3 + xxlor 36, 4, 4 + lxv 39, 0(14) + lxv 40, 16(14) + lxv 41, 32(14) + lxv 42, 48(14) + addi 14, 14, 64 + vmladduhm 15, 25, 7, 3 + vmladduhm 20, 26, 8, 3 + vmladduhm 27, 30, 9, 3 + vmladduhm 28, 31, 10, 3 + vmhraddshs 14, 25, 7, 3 + vmhraddshs 19, 26, 8, 3 + vmhraddshs 24, 30, 9, 3 + vmhraddshs 29, 31, 10, 3 + vmladduhm 15, 15, 2, 3 + 
vmladduhm 20, 20, 2, 3 + vmladduhm 25, 27, 2, 3 + vmladduhm 30, 28, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 + stxvx 45, 3, 10 + stxvx 50, 3, 17 + stxvx 55, 3, 19 + stxvx 60, 3, 21 + li 5, 384 + mr 9, 5 + add 10, 7, 9 + addi 16, 9, 32 + addi 17, 10, 32 + addi 18, 16, 32 + addi 19, 17, 32 + addi 20, 18, 32 + addi 21, 19, 32 + lxvd2x 40, 3, 10 + lxvd2x 44, 3, 17 + lxvd2x 48, 3, 19 + lxvd2x 52, 3, 21 + xxswapd 40, 40 + xxswapd 44, 44 + xxswapd 48, 48 + xxswapd 52, 52 + lxvd2x 53, 3, 9 + lxvd2x 54, 3, 16 + lxvd2x 55, 3, 18 + lxvd2x 56, 3, 20 + xxswapd 53, 53 + xxswapd 54, 54 + xxswapd 55, 55 + xxswapd 56, 56 + vsubuhm 25, 8, 21 + vsubuhm 26, 12, 22 + vsubuhm 30, 16, 23 + vsubuhm 31, 20, 24 + vadduhm 8, 8, 21 + vadduhm 12, 12, 22 + vadduhm 16, 16, 23 + vadduhm 20, 20, 24 + vxor 7, 7, 7 + xxlor 35, 6, 6 + xxlor 33, 7, 7 + xxlor 34, 8, 8 + vmulosh 6, 8, 0 + vmulesh 5, 8, 0 + vmulosh 11, 12, 0 + vmulesh 10, 12, 0 + vmulosh 15, 16, 0 + vmulesh 14, 16, 0 + vmulosh 19, 20, 0 + vmulesh 18, 20, 0 + xxmrglw 36, 37, 38 + xxmrghw 37, 37, 38 + xxmrglw 41, 42, 43 + xxmrghw 42, 42, 43 + xxmrglw 45, 46, 47 + xxmrghw 46, 46, 47 + xxmrglw 49, 50, 51 + xxmrghw 50, 50, 51 + vadduwm 4, 4, 1 + vadduwm 5, 5, 1 + vadduwm 9, 9, 1 + vadduwm 10, 10, 1 + vadduwm 13, 13, 1 + vadduwm 14, 14, 1 + vadduwm 17, 17, 1 + vadduwm 18, 18, 1 + vsraw 4, 4, 2 + vsraw 5, 5, 2 + vsraw 9, 9, 2 + vsraw 10, 10, 2 + vsraw 13, 13, 2 + vsraw 14, 14, 2 + vsraw 17, 17, 2 + vsraw 18, 18, 2 + vpkuwum 4, 5, 4 + vsubuhm 4, 7, 4 + vpkuwum 9, 10, 9 + vsubuhm 9, 7, 9 + vpkuwum 13, 14, 13 + vsubuhm 13, 7, 13 + vpkuwum 17, 18, 17 + vsubuhm 17, 7, 17 + vmladduhm 4, 4, 3, 8 + vmladduhm 9, 9, 3, 12 + vmladduhm 13, 13, 3, 16 + vmladduhm 17, 17, 3, 20 + stxvx 36, 3, 9 + stxvx 41, 3, 16 + stxvx 45, 3, 18 + stxvx 49, 3, 20 + xxlor 37, 0, 0 + xxlor 34, 2, 2 + xxlor 35, 3, 3 + xxlor 36, 4, 4 + lxv 39, 0(14) + lxv 40, 16(14) + lxv 41, 32(14) + lxv 42, 48(14) + addi 14, 14, 64 + vmladduhm 15, 25, 7, 3 + vmladduhm 20, 26, 8, 3 + vmladduhm 27, 30, 9, 3 + vmladduhm 28, 31, 10, 3 + vmhraddshs 14, 25, 7, 3 + vmhraddshs 19, 26, 8, 3 + vmhraddshs 24, 30, 9, 3 + vmhraddshs 29, 31, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 27, 2, 3 + vmladduhm 30, 28, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 + stxvx 45, 3, 10 + stxvx 50, 3, 17 + stxvx 55, 3, 19 + stxvx 60, 3, 21 + nop + nop + ori 2, 2, 0 + li 5, 0 + li 7, 32 + mr 9, 5 + add 10, 7, 9 + addi 16, 9, 64 + addi 17, 10, 64 + addi 18, 16, 64 + addi 19, 17, 64 + addi 20, 18, 64 + addi 21, 19, 64 + lxvd2x 40, 3, 10 + lxvd2x 44, 3, 17 + lxvd2x 48, 3, 19 + lxvd2x 52, 3, 21 + xxswapd 40, 40 + xxswapd 44, 44 + xxswapd 48, 48 + xxswapd 52, 52 + lxvd2x 53, 3, 9 + lxvd2x 54, 3, 16 + lxvd2x 55, 3, 18 + lxvd2x 56, 3, 20 + xxswapd 53, 53 + xxswapd 54, 54 + xxswapd 55, 55 + xxswapd 56, 56 + vsubuhm 25, 8, 21 + vsubuhm 26, 12, 22 + vsubuhm 30, 16, 23 + vsubuhm 31, 20, 24 + vadduhm 8, 8, 21 + vadduhm 12, 12, 22 + vadduhm 16, 16, 23 + vadduhm 20, 20, 24 + vxor 7, 7, 7 + xxlor 35, 6, 6 + xxlor 33, 7, 7 + xxlor 34, 8, 8 + vmulosh 6, 8, 0 + vmulesh 5, 8, 0 + vmulosh 11, 12, 0 + vmulesh 10, 12, 0 + vmulosh 15, 16, 0 + vmulesh 14, 16, 0 + vmulosh 19, 20, 0 + vmulesh 18, 20, 0 + xxmrglw 36, 37, 38 + xxmrghw 37, 37, 38 + xxmrglw 41, 42, 43 
+ xxmrghw 42, 42, 43 + xxmrglw 45, 46, 47 + xxmrghw 46, 46, 47 + xxmrglw 49, 50, 51 + xxmrghw 50, 50, 51 + vadduwm 4, 4, 1 + vadduwm 5, 5, 1 + vadduwm 9, 9, 1 + vadduwm 10, 10, 1 + vadduwm 13, 13, 1 + vadduwm 14, 14, 1 + vadduwm 17, 17, 1 + vadduwm 18, 18, 1 + vsraw 4, 4, 2 + vsraw 5, 5, 2 + vsraw 9, 9, 2 + vsraw 10, 10, 2 + vsraw 13, 13, 2 + vsraw 14, 14, 2 + vsraw 17, 17, 2 + vsraw 18, 18, 2 + vpkuwum 4, 5, 4 + vsubuhm 4, 7, 4 + vpkuwum 9, 10, 9 + vsubuhm 9, 7, 9 + vpkuwum 13, 14, 13 + vsubuhm 13, 7, 13 + vpkuwum 17, 18, 17 + vsubuhm 17, 7, 17 + vmladduhm 4, 4, 3, 8 + vmladduhm 9, 9, 3, 12 + vmladduhm 13, 13, 3, 16 + vmladduhm 17, 17, 3, 20 + stxvx 36, 3, 9 + stxvx 41, 3, 16 + stxvx 45, 3, 18 + stxvx 49, 3, 20 + xxlor 37, 0, 0 + xxlor 34, 2, 2 + xxlor 35, 3, 3 + xxlor 36, 4, 4 + lxv 39, 0(14) + lxv 40, 16(14) + lxv 41, 32(14) + lxv 42, 48(14) + addi 14, 14, 64 + vmladduhm 15, 25, 7, 3 + vmladduhm 20, 26, 8, 3 + vmladduhm 27, 30, 9, 3 + vmladduhm 28, 31, 10, 3 + vmhraddshs 14, 25, 7, 3 + vmhraddshs 19, 26, 8, 3 + vmhraddshs 24, 30, 9, 3 + vmhraddshs 29, 31, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 27, 2, 3 + vmladduhm 30, 28, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 + stxvx 45, 3, 10 + stxvx 50, 3, 17 + stxvx 55, 3, 19 + stxvx 60, 3, 21 + li 5, 16 + mr 9, 5 + add 10, 7, 9 + addi 16, 9, 64 + addi 17, 10, 64 + addi 18, 16, 64 + addi 19, 17, 64 + addi 20, 18, 64 + addi 21, 19, 64 + lxvd2x 40, 3, 10 + lxvd2x 44, 3, 17 + lxvd2x 48, 3, 19 + lxvd2x 52, 3, 21 + xxswapd 40, 40 + xxswapd 44, 44 + xxswapd 48, 48 + xxswapd 52, 52 + lxvd2x 53, 3, 9 + lxvd2x 54, 3, 16 + lxvd2x 55, 3, 18 + lxvd2x 56, 3, 20 + xxswapd 53, 53 + xxswapd 54, 54 + xxswapd 55, 55 + xxswapd 56, 56 + vsubuhm 25, 8, 21 + vsubuhm 26, 12, 22 + vsubuhm 30, 16, 23 + vsubuhm 31, 20, 24 + vadduhm 8, 8, 21 + vadduhm 12, 12, 22 + vadduhm 16, 16, 23 + vadduhm 20, 20, 24 + vxor 7, 7, 7 + xxlor 35, 6, 6 + xxlor 33, 7, 7 + xxlor 34, 8, 8 + vmulosh 6, 8, 0 + vmulesh 5, 8, 0 + vmulosh 11, 12, 0 + vmulesh 10, 12, 0 + vmulosh 15, 16, 0 + vmulesh 14, 16, 0 + vmulosh 19, 20, 0 + vmulesh 18, 20, 0 + xxmrglw 36, 37, 38 + xxmrghw 37, 37, 38 + xxmrglw 41, 42, 43 + xxmrghw 42, 42, 43 + xxmrglw 45, 46, 47 + xxmrghw 46, 46, 47 + xxmrglw 49, 50, 51 + xxmrghw 50, 50, 51 + vadduwm 4, 4, 1 + vadduwm 5, 5, 1 + vadduwm 9, 9, 1 + vadduwm 10, 10, 1 + vadduwm 13, 13, 1 + vadduwm 14, 14, 1 + vadduwm 17, 17, 1 + vadduwm 18, 18, 1 + vsraw 4, 4, 2 + vsraw 5, 5, 2 + vsraw 9, 9, 2 + vsraw 10, 10, 2 + vsraw 13, 13, 2 + vsraw 14, 14, 2 + vsraw 17, 17, 2 + vsraw 18, 18, 2 + vpkuwum 4, 5, 4 + vsubuhm 4, 7, 4 + vpkuwum 9, 10, 9 + vsubuhm 9, 7, 9 + vpkuwum 13, 14, 13 + vsubuhm 13, 7, 13 + vpkuwum 17, 18, 17 + vsubuhm 17, 7, 17 + vmladduhm 4, 4, 3, 8 + vmladduhm 9, 9, 3, 12 + vmladduhm 13, 13, 3, 16 + vmladduhm 17, 17, 3, 20 + stxvx 36, 3, 9 + stxvx 41, 3, 16 + stxvx 45, 3, 18 + stxvx 49, 3, 20 + xxlor 37, 0, 0 + xxlor 34, 2, 2 + xxlor 35, 3, 3 + xxlor 36, 4, 4 + addi 14, 14, -64 + lxv 39, 0(14) + lxv 40, 16(14) + lxv 41, 32(14) + lxv 42, 48(14) + addi 14, 14, 64 + vmladduhm 15, 25, 7, 3 + vmladduhm 20, 26, 8, 3 + vmladduhm 27, 30, 9, 3 + vmladduhm 28, 31, 10, 3 + vmhraddshs 14, 25, 7, 3 + vmhraddshs 19, 26, 8, 3 + vmhraddshs 24, 30, 9, 3 + vmhraddshs 29, 31, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 27, 2, 3 + vmladduhm 30, 28, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 
20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 + stxvx 45, 3, 10 + stxvx 50, 3, 17 + stxvx 55, 3, 19 + stxvx 60, 3, 21 + li 5, 256 + mr 9, 5 + add 10, 7, 9 + addi 16, 9, 64 + addi 17, 10, 64 + addi 18, 16, 64 + addi 19, 17, 64 + addi 20, 18, 64 + addi 21, 19, 64 + lxvd2x 40, 3, 10 + lxvd2x 44, 3, 17 + lxvd2x 48, 3, 19 + lxvd2x 52, 3, 21 + xxswapd 40, 40 + xxswapd 44, 44 + xxswapd 48, 48 + xxswapd 52, 52 + lxvd2x 53, 3, 9 + lxvd2x 54, 3, 16 + lxvd2x 55, 3, 18 + lxvd2x 56, 3, 20 + xxswapd 53, 53 + xxswapd 54, 54 + xxswapd 55, 55 + xxswapd 56, 56 + vsubuhm 25, 8, 21 + vsubuhm 26, 12, 22 + vsubuhm 30, 16, 23 + vsubuhm 31, 20, 24 + vadduhm 8, 8, 21 + vadduhm 12, 12, 22 + vadduhm 16, 16, 23 + vadduhm 20, 20, 24 + vxor 7, 7, 7 + xxlor 35, 6, 6 + xxlor 33, 7, 7 + xxlor 34, 8, 8 + vmulosh 6, 8, 0 + vmulesh 5, 8, 0 + vmulosh 11, 12, 0 + vmulesh 10, 12, 0 + vmulosh 15, 16, 0 + vmulesh 14, 16, 0 + vmulosh 19, 20, 0 + vmulesh 18, 20, 0 + xxmrglw 36, 37, 38 + xxmrghw 37, 37, 38 + xxmrglw 41, 42, 43 + xxmrghw 42, 42, 43 + xxmrglw 45, 46, 47 + xxmrghw 46, 46, 47 + xxmrglw 49, 50, 51 + xxmrghw 50, 50, 51 + vadduwm 4, 4, 1 + vadduwm 5, 5, 1 + vadduwm 9, 9, 1 + vadduwm 10, 10, 1 + vadduwm 13, 13, 1 + vadduwm 14, 14, 1 + vadduwm 17, 17, 1 + vadduwm 18, 18, 1 + vsraw 4, 4, 2 + vsraw 5, 5, 2 + vsraw 9, 9, 2 + vsraw 10, 10, 2 + vsraw 13, 13, 2 + vsraw 14, 14, 2 + vsraw 17, 17, 2 + vsraw 18, 18, 2 + vpkuwum 4, 5, 4 + vsubuhm 4, 7, 4 + vpkuwum 9, 10, 9 + vsubuhm 9, 7, 9 + vpkuwum 13, 14, 13 + vsubuhm 13, 7, 13 + vpkuwum 17, 18, 17 + vsubuhm 17, 7, 17 + vmladduhm 4, 4, 3, 8 + vmladduhm 9, 9, 3, 12 + vmladduhm 13, 13, 3, 16 + vmladduhm 17, 17, 3, 20 + stxvx 36, 3, 9 + stxvx 41, 3, 16 + stxvx 45, 3, 18 + stxvx 49, 3, 20 + xxlor 37, 0, 0 + xxlor 34, 2, 2 + xxlor 35, 3, 3 + xxlor 36, 4, 4 + lxv 39, 0(14) + lxv 40, 16(14) + lxv 41, 32(14) + lxv 42, 48(14) + addi 14, 14, 64 + vmladduhm 15, 25, 7, 3 + vmladduhm 20, 26, 8, 3 + vmladduhm 27, 30, 9, 3 + vmladduhm 28, 31, 10, 3 + vmhraddshs 14, 25, 7, 3 + vmhraddshs 19, 26, 8, 3 + vmhraddshs 24, 30, 9, 3 + vmhraddshs 29, 31, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 27, 2, 3 + vmladduhm 30, 28, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 + stxvx 45, 3, 10 + stxvx 50, 3, 17 + stxvx 55, 3, 19 + stxvx 60, 3, 21 + li 5, 272 + mr 9, 5 + add 10, 7, 9 + addi 16, 9, 64 + addi 17, 10, 64 + addi 18, 16, 64 + addi 19, 17, 64 + addi 20, 18, 64 + addi 21, 19, 64 + lxvd2x 40, 3, 10 + lxvd2x 44, 3, 17 + lxvd2x 48, 3, 19 + lxvd2x 52, 3, 21 + xxswapd 40, 40 + xxswapd 44, 44 + xxswapd 48, 48 + xxswapd 52, 52 + lxvd2x 53, 3, 9 + lxvd2x 54, 3, 16 + lxvd2x 55, 3, 18 + lxvd2x 56, 3, 20 + xxswapd 53, 53 + xxswapd 54, 54 + xxswapd 55, 55 + xxswapd 56, 56 + vsubuhm 25, 8, 21 + vsubuhm 26, 12, 22 + vsubuhm 30, 16, 23 + vsubuhm 31, 20, 24 + vadduhm 8, 8, 21 + vadduhm 12, 12, 22 + vadduhm 16, 16, 23 + vadduhm 20, 20, 24 + vxor 7, 7, 7 + xxlor 35, 6, 6 + xxlor 33, 7, 7 + xxlor 34, 8, 8 + vmulosh 6, 8, 0 + vmulesh 5, 8, 0 + vmulosh 11, 12, 0 + vmulesh 10, 12, 0 + vmulosh 15, 16, 0 + vmulesh 14, 16, 0 + vmulosh 19, 20, 0 + vmulesh 18, 20, 0 + xxmrglw 36, 37, 38 + xxmrghw 37, 37, 38 + xxmrglw 41, 42, 43 + xxmrghw 42, 42, 43 + xxmrglw 45, 46, 47 + xxmrghw 46, 46, 47 + xxmrglw 49, 50, 51 + xxmrghw 50, 50, 51 + vadduwm 4, 4, 1 + vadduwm 5, 5, 1 + 
vadduwm 9, 9, 1 + vadduwm 10, 10, 1 + vadduwm 13, 13, 1 + vadduwm 14, 14, 1 + vadduwm 17, 17, 1 + vadduwm 18, 18, 1 + vsraw 4, 4, 2 + vsraw 5, 5, 2 + vsraw 9, 9, 2 + vsraw 10, 10, 2 + vsraw 13, 13, 2 + vsraw 14, 14, 2 + vsraw 17, 17, 2 + vsraw 18, 18, 2 + vpkuwum 4, 5, 4 + vsubuhm 4, 7, 4 + vpkuwum 9, 10, 9 + vsubuhm 9, 7, 9 + vpkuwum 13, 14, 13 + vsubuhm 13, 7, 13 + vpkuwum 17, 18, 17 + vsubuhm 17, 7, 17 + vmladduhm 4, 4, 3, 8 + vmladduhm 9, 9, 3, 12 + vmladduhm 13, 13, 3, 16 + vmladduhm 17, 17, 3, 20 + stxvx 36, 3, 9 + stxvx 41, 3, 16 + stxvx 45, 3, 18 + stxvx 49, 3, 20 + xxlor 37, 0, 0 + xxlor 34, 2, 2 + xxlor 35, 3, 3 + xxlor 36, 4, 4 + addi 14, 14, -64 + lxv 39, 0(14) + lxv 40, 16(14) + lxv 41, 32(14) + lxv 42, 48(14) + addi 14, 14, 64 + vmladduhm 15, 25, 7, 3 + vmladduhm 20, 26, 8, 3 + vmladduhm 27, 30, 9, 3 + vmladduhm 28, 31, 10, 3 + vmhraddshs 14, 25, 7, 3 + vmhraddshs 19, 26, 8, 3 + vmhraddshs 24, 30, 9, 3 + vmhraddshs 29, 31, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 27, 2, 3 + vmladduhm 30, 28, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 + stxvx 45, 3, 10 + stxvx 50, 3, 17 + stxvx 55, 3, 19 + stxvx 60, 3, 21 + ori 2, 2, 0 + li 5, 0 + li 7, 64 + mr 9, 5 + add 10, 7, 9 + addi 16, 9, 16 + addi 17, 10, 16 + addi 18, 16, 16 + addi 19, 17, 16 + addi 20, 18, 16 + addi 21, 19, 16 + lxvd2x 40, 3, 10 + lxvd2x 44, 3, 17 + lxvd2x 48, 3, 19 + lxvd2x 52, 3, 21 + xxswapd 40, 40 + xxswapd 44, 44 + xxswapd 48, 48 + xxswapd 52, 52 + lxvd2x 53, 3, 9 + lxvd2x 54, 3, 16 + lxvd2x 55, 3, 18 + lxvd2x 56, 3, 20 + xxswapd 53, 53 + xxswapd 54, 54 + xxswapd 55, 55 + xxswapd 56, 56 + vsubuhm 25, 8, 21 + vsubuhm 26, 12, 22 + vsubuhm 30, 16, 23 + vsubuhm 31, 20, 24 + vadduhm 8, 8, 21 + vadduhm 12, 12, 22 + vadduhm 16, 16, 23 + vadduhm 20, 20, 24 + vxor 7, 7, 7 + xxlor 35, 6, 6 + xxlor 33, 7, 7 + xxlor 34, 8, 8 + vmulosh 6, 8, 0 + vmulesh 5, 8, 0 + vmulosh 11, 12, 0 + vmulesh 10, 12, 0 + vmulosh 15, 16, 0 + vmulesh 14, 16, 0 + vmulosh 19, 20, 0 + vmulesh 18, 20, 0 + xxmrglw 36, 37, 38 + xxmrghw 37, 37, 38 + xxmrglw 41, 42, 43 + xxmrghw 42, 42, 43 + xxmrglw 45, 46, 47 + xxmrghw 46, 46, 47 + xxmrglw 49, 50, 51 + xxmrghw 50, 50, 51 + vadduwm 4, 4, 1 + vadduwm 5, 5, 1 + vadduwm 9, 9, 1 + vadduwm 10, 10, 1 + vadduwm 13, 13, 1 + vadduwm 14, 14, 1 + vadduwm 17, 17, 1 + vadduwm 18, 18, 1 + vsraw 4, 4, 2 + vsraw 5, 5, 2 + vsraw 9, 9, 2 + vsraw 10, 10, 2 + vsraw 13, 13, 2 + vsraw 14, 14, 2 + vsraw 17, 17, 2 + vsraw 18, 18, 2 + vpkuwum 4, 5, 4 + vsubuhm 4, 7, 4 + vpkuwum 9, 10, 9 + vsubuhm 9, 7, 9 + vpkuwum 13, 14, 13 + vsubuhm 13, 7, 13 + vpkuwum 17, 18, 17 + vsubuhm 17, 7, 17 + vmladduhm 4, 4, 3, 8 + vmladduhm 9, 9, 3, 12 + vmladduhm 13, 13, 3, 16 + vmladduhm 17, 17, 3, 20 + stxvx 36, 3, 9 + stxvx 41, 3, 16 + stxvx 45, 3, 18 + stxvx 49, 3, 20 + xxlor 37, 0, 0 + xxlor 34, 2, 2 + xxlor 35, 3, 3 + xxlor 36, 4, 4 + lvx 10, 0, 14 + addi 14, 14, 16 + vmladduhm 15, 25, 10, 3 + vmladduhm 20, 26, 10, 3 + vmladduhm 27, 30, 10, 3 + vmladduhm 28, 31, 10, 3 + vmhraddshs 14, 25, 10, 3 + vmhraddshs 19, 26, 10, 3 + vmhraddshs 24, 30, 10, 3 + vmhraddshs 29, 31, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 27, 2, 3 + vmladduhm 30, 28, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 + stxvx 45, 3, 10 + stxvx 
50, 3, 17 + stxvx 55, 3, 19 + stxvx 60, 3, 21 + li 5, 128 + mr 9, 5 + add 10, 7, 9 + addi 16, 9, 16 + addi 17, 10, 16 + addi 18, 16, 16 + addi 19, 17, 16 + addi 20, 18, 16 + addi 21, 19, 16 + lxvd2x 40, 3, 10 + lxvd2x 44, 3, 17 + lxvd2x 48, 3, 19 + lxvd2x 52, 3, 21 + xxswapd 40, 40 + xxswapd 44, 44 + xxswapd 48, 48 + xxswapd 52, 52 + lxvd2x 53, 3, 9 + lxvd2x 54, 3, 16 + lxvd2x 55, 3, 18 + lxvd2x 56, 3, 20 + xxswapd 53, 53 + xxswapd 54, 54 + xxswapd 55, 55 + xxswapd 56, 56 + vsubuhm 25, 8, 21 + vsubuhm 26, 12, 22 + vsubuhm 30, 16, 23 + vsubuhm 31, 20, 24 + vadduhm 8, 8, 21 + vadduhm 12, 12, 22 + vadduhm 16, 16, 23 + vadduhm 20, 20, 24 + vxor 7, 7, 7 + xxlor 35, 6, 6 + xxlor 33, 7, 7 + xxlor 34, 8, 8 + vmulosh 6, 8, 0 + vmulesh 5, 8, 0 + vmulosh 11, 12, 0 + vmulesh 10, 12, 0 + vmulosh 15, 16, 0 + vmulesh 14, 16, 0 + vmulosh 19, 20, 0 + vmulesh 18, 20, 0 + xxmrglw 36, 37, 38 + xxmrghw 37, 37, 38 + xxmrglw 41, 42, 43 + xxmrghw 42, 42, 43 + xxmrglw 45, 46, 47 + xxmrghw 46, 46, 47 + xxmrglw 49, 50, 51 + xxmrghw 50, 50, 51 + vadduwm 4, 4, 1 + vadduwm 5, 5, 1 + vadduwm 9, 9, 1 + vadduwm 10, 10, 1 + vadduwm 13, 13, 1 + vadduwm 14, 14, 1 + vadduwm 17, 17, 1 + vadduwm 18, 18, 1 + vsraw 4, 4, 2 + vsraw 5, 5, 2 + vsraw 9, 9, 2 + vsraw 10, 10, 2 + vsraw 13, 13, 2 + vsraw 14, 14, 2 + vsraw 17, 17, 2 + vsraw 18, 18, 2 + vpkuwum 4, 5, 4 + vsubuhm 4, 7, 4 + vpkuwum 9, 10, 9 + vsubuhm 9, 7, 9 + vpkuwum 13, 14, 13 + vsubuhm 13, 7, 13 + vpkuwum 17, 18, 17 + vsubuhm 17, 7, 17 + vmladduhm 4, 4, 3, 8 + vmladduhm 9, 9, 3, 12 + vmladduhm 13, 13, 3, 16 + vmladduhm 17, 17, 3, 20 + stxvx 36, 3, 9 + stxvx 41, 3, 16 + stxvx 45, 3, 18 + stxvx 49, 3, 20 + xxlor 37, 0, 0 + xxlor 34, 2, 2 + xxlor 35, 3, 3 + xxlor 36, 4, 4 + lvx 10, 0, 14 + addi 14, 14, 16 + vmladduhm 15, 25, 10, 3 + vmladduhm 20, 26, 10, 3 + vmladduhm 27, 30, 10, 3 + vmladduhm 28, 31, 10, 3 + vmhraddshs 14, 25, 10, 3 + vmhraddshs 19, 26, 10, 3 + vmhraddshs 24, 30, 10, 3 + vmhraddshs 29, 31, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 27, 2, 3 + vmladduhm 30, 28, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 + stxvx 45, 3, 10 + stxvx 50, 3, 17 + stxvx 55, 3, 19 + stxvx 60, 3, 21 + li 5, 256 + mr 9, 5 + add 10, 7, 9 + addi 16, 9, 16 + addi 17, 10, 16 + addi 18, 16, 16 + addi 19, 17, 16 + addi 20, 18, 16 + addi 21, 19, 16 + lxvd2x 40, 3, 10 + lxvd2x 44, 3, 17 + lxvd2x 48, 3, 19 + lxvd2x 52, 3, 21 + xxswapd 40, 40 + xxswapd 44, 44 + xxswapd 48, 48 + xxswapd 52, 52 + lxvd2x 53, 3, 9 + lxvd2x 54, 3, 16 + lxvd2x 55, 3, 18 + lxvd2x 56, 3, 20 + xxswapd 53, 53 + xxswapd 54, 54 + xxswapd 55, 55 + xxswapd 56, 56 + vsubuhm 25, 8, 21 + vsubuhm 26, 12, 22 + vsubuhm 30, 16, 23 + vsubuhm 31, 20, 24 + vadduhm 8, 8, 21 + vadduhm 12, 12, 22 + vadduhm 16, 16, 23 + vadduhm 20, 20, 24 + vxor 7, 7, 7 + xxlor 35, 6, 6 + xxlor 33, 7, 7 + xxlor 34, 8, 8 + vmulosh 6, 8, 0 + vmulesh 5, 8, 0 + vmulosh 11, 12, 0 + vmulesh 10, 12, 0 + vmulosh 15, 16, 0 + vmulesh 14, 16, 0 + vmulosh 19, 20, 0 + vmulesh 18, 20, 0 + xxmrglw 36, 37, 38 + xxmrghw 37, 37, 38 + xxmrglw 41, 42, 43 + xxmrghw 42, 42, 43 + xxmrglw 45, 46, 47 + xxmrghw 46, 46, 47 + xxmrglw 49, 50, 51 + xxmrghw 50, 50, 51 + vadduwm 4, 4, 1 + vadduwm 5, 5, 1 + vadduwm 9, 9, 1 + vadduwm 10, 10, 1 + vadduwm 13, 13, 1 + vadduwm 14, 14, 1 + vadduwm 17, 17, 1 + vadduwm 18, 18, 1 + vsraw 4, 4, 2 + vsraw 5, 5, 2 + vsraw 9, 9, 2 + vsraw 10, 10, 2 + vsraw 13, 13, 2 + vsraw 14, 14, 
2 + vsraw 17, 17, 2 + vsraw 18, 18, 2 + vpkuwum 4, 5, 4 + vsubuhm 4, 7, 4 + vpkuwum 9, 10, 9 + vsubuhm 9, 7, 9 + vpkuwum 13, 14, 13 + vsubuhm 13, 7, 13 + vpkuwum 17, 18, 17 + vsubuhm 17, 7, 17 + vmladduhm 4, 4, 3, 8 + vmladduhm 9, 9, 3, 12 + vmladduhm 13, 13, 3, 16 + vmladduhm 17, 17, 3, 20 + stxvx 36, 3, 9 + stxvx 41, 3, 16 + stxvx 45, 3, 18 + stxvx 49, 3, 20 + xxlor 37, 0, 0 + xxlor 34, 2, 2 + xxlor 35, 3, 3 + xxlor 36, 4, 4 + lvx 10, 0, 14 + addi 14, 14, 16 + vmladduhm 15, 25, 10, 3 + vmladduhm 20, 26, 10, 3 + vmladduhm 27, 30, 10, 3 + vmladduhm 28, 31, 10, 3 + vmhraddshs 14, 25, 10, 3 + vmhraddshs 19, 26, 10, 3 + vmhraddshs 24, 30, 10, 3 + vmhraddshs 29, 31, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 27, 2, 3 + vmladduhm 30, 28, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 + stxvx 45, 3, 10 + stxvx 50, 3, 17 + stxvx 55, 3, 19 + stxvx 60, 3, 21 + li 5, 384 + mr 9, 5 + add 10, 7, 9 + addi 16, 9, 16 + addi 17, 10, 16 + addi 18, 16, 16 + addi 19, 17, 16 + addi 20, 18, 16 + addi 21, 19, 16 + lxvd2x 40, 3, 10 + lxvd2x 44, 3, 17 + lxvd2x 48, 3, 19 + lxvd2x 52, 3, 21 + xxswapd 40, 40 + xxswapd 44, 44 + xxswapd 48, 48 + xxswapd 52, 52 + lxvd2x 53, 3, 9 + lxvd2x 54, 3, 16 + lxvd2x 55, 3, 18 + lxvd2x 56, 3, 20 + xxswapd 53, 53 + xxswapd 54, 54 + xxswapd 55, 55 + xxswapd 56, 56 + vsubuhm 25, 8, 21 + vsubuhm 26, 12, 22 + vsubuhm 30, 16, 23 + vsubuhm 31, 20, 24 + vadduhm 8, 8, 21 + vadduhm 12, 12, 22 + vadduhm 16, 16, 23 + vadduhm 20, 20, 24 + vxor 7, 7, 7 + xxlor 35, 6, 6 + xxlor 33, 7, 7 + xxlor 34, 8, 8 + vmulosh 6, 8, 0 + vmulesh 5, 8, 0 + vmulosh 11, 12, 0 + vmulesh 10, 12, 0 + vmulosh 15, 16, 0 + vmulesh 14, 16, 0 + vmulosh 19, 20, 0 + vmulesh 18, 20, 0 + xxmrglw 36, 37, 38 + xxmrghw 37, 37, 38 + xxmrglw 41, 42, 43 + xxmrghw 42, 42, 43 + xxmrglw 45, 46, 47 + xxmrghw 46, 46, 47 + xxmrglw 49, 50, 51 + xxmrghw 50, 50, 51 + vadduwm 4, 4, 1 + vadduwm 5, 5, 1 + vadduwm 9, 9, 1 + vadduwm 10, 10, 1 + vadduwm 13, 13, 1 + vadduwm 14, 14, 1 + vadduwm 17, 17, 1 + vadduwm 18, 18, 1 + vsraw 4, 4, 2 + vsraw 5, 5, 2 + vsraw 9, 9, 2 + vsraw 10, 10, 2 + vsraw 13, 13, 2 + vsraw 14, 14, 2 + vsraw 17, 17, 2 + vsraw 18, 18, 2 + vpkuwum 4, 5, 4 + vsubuhm 4, 7, 4 + vpkuwum 9, 10, 9 + vsubuhm 9, 7, 9 + vpkuwum 13, 14, 13 + vsubuhm 13, 7, 13 + vpkuwum 17, 18, 17 + vsubuhm 17, 7, 17 + vmladduhm 4, 4, 3, 8 + vmladduhm 9, 9, 3, 12 + vmladduhm 13, 13, 3, 16 + vmladduhm 17, 17, 3, 20 + stxvx 36, 3, 9 + stxvx 41, 3, 16 + stxvx 45, 3, 18 + stxvx 49, 3, 20 + xxlor 37, 0, 0 + xxlor 34, 2, 2 + xxlor 35, 3, 3 + xxlor 36, 4, 4 + lvx 10, 0, 14 + addi 14, 14, 16 + vmladduhm 15, 25, 10, 3 + vmladduhm 20, 26, 10, 3 + vmladduhm 27, 30, 10, 3 + vmladduhm 28, 31, 10, 3 + vmhraddshs 14, 25, 10, 3 + vmhraddshs 19, 26, 10, 3 + vmhraddshs 24, 30, 10, 3 + vmhraddshs 29, 31, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 27, 2, 3 + vmladduhm 30, 28, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 + stxvx 45, 3, 10 + stxvx 50, 3, 17 + stxvx 55, 3, 19 + stxvx 60, 3, 21 + nop + nop + ori 2, 2, 0 + li 5, 0 + li 7, 128 + mr 9, 5 + add 10, 7, 9 + addi 16, 9, 16 + addi 17, 10, 16 + addi 18, 16, 16 + addi 19, 17, 16 + addi 20, 18, 16 + addi 21, 19, 16 + lxvd2x 40, 3, 10 + lxvd2x 44, 3, 17 + lxvd2x 48, 3, 19 + lxvd2x 52, 3, 21 
+ xxswapd 40, 40 + xxswapd 44, 44 + xxswapd 48, 48 + xxswapd 52, 52 + lxvd2x 53, 3, 9 + lxvd2x 54, 3, 16 + lxvd2x 55, 3, 18 + lxvd2x 56, 3, 20 + xxswapd 53, 53 + xxswapd 54, 54 + xxswapd 55, 55 + xxswapd 56, 56 + vsubuhm 25, 8, 21 + vsubuhm 26, 12, 22 + vsubuhm 30, 16, 23 + vsubuhm 31, 20, 24 + vadduhm 8, 8, 21 + vadduhm 12, 12, 22 + vadduhm 16, 16, 23 + vadduhm 20, 20, 24 + vxor 7, 7, 7 + xxlor 35, 6, 6 + xxlor 33, 7, 7 + xxlor 34, 8, 8 + vmulosh 6, 8, 0 + vmulesh 5, 8, 0 + vmulosh 11, 12, 0 + vmulesh 10, 12, 0 + vmulosh 15, 16, 0 + vmulesh 14, 16, 0 + vmulosh 19, 20, 0 + vmulesh 18, 20, 0 + xxmrglw 36, 37, 38 + xxmrghw 37, 37, 38 + xxmrglw 41, 42, 43 + xxmrghw 42, 42, 43 + xxmrglw 45, 46, 47 + xxmrghw 46, 46, 47 + xxmrglw 49, 50, 51 + xxmrghw 50, 50, 51 + vadduwm 4, 4, 1 + vadduwm 5, 5, 1 + vadduwm 9, 9, 1 + vadduwm 10, 10, 1 + vadduwm 13, 13, 1 + vadduwm 14, 14, 1 + vadduwm 17, 17, 1 + vadduwm 18, 18, 1 + vsraw 4, 4, 2 + vsraw 5, 5, 2 + vsraw 9, 9, 2 + vsraw 10, 10, 2 + vsraw 13, 13, 2 + vsraw 14, 14, 2 + vsraw 17, 17, 2 + vsraw 18, 18, 2 + vpkuwum 4, 5, 4 + vsubuhm 4, 7, 4 + vpkuwum 9, 10, 9 + vsubuhm 9, 7, 9 + vpkuwum 13, 14, 13 + vsubuhm 13, 7, 13 + vpkuwum 17, 18, 17 + vsubuhm 17, 7, 17 + vmladduhm 4, 4, 3, 8 + vmladduhm 9, 9, 3, 12 + vmladduhm 13, 13, 3, 16 + vmladduhm 17, 17, 3, 20 + stxvx 36, 3, 9 + stxvx 41, 3, 16 + stxvx 45, 3, 18 + stxvx 49, 3, 20 + xxlor 37, 0, 0 + xxlor 34, 2, 2 + xxlor 35, 3, 3 + xxlor 36, 4, 4 + lvx 10, 0, 14 + addi 14, 14, 16 + vmladduhm 15, 25, 10, 3 + vmladduhm 20, 26, 10, 3 + vmladduhm 27, 30, 10, 3 + vmladduhm 28, 31, 10, 3 + vmhraddshs 14, 25, 10, 3 + vmhraddshs 19, 26, 10, 3 + vmhraddshs 24, 30, 10, 3 + vmhraddshs 29, 31, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 27, 2, 3 + vmladduhm 30, 28, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 + stxvx 45, 3, 10 + stxvx 50, 3, 17 + stxvx 55, 3, 19 + stxvx 60, 3, 21 + li 5, 64 + mr 9, 5 + add 10, 7, 9 + addi 16, 9, 16 + addi 17, 10, 16 + addi 18, 16, 16 + addi 19, 17, 16 + addi 20, 18, 16 + addi 21, 19, 16 + lxvd2x 40, 3, 10 + lxvd2x 44, 3, 17 + lxvd2x 48, 3, 19 + lxvd2x 52, 3, 21 + xxswapd 40, 40 + xxswapd 44, 44 + xxswapd 48, 48 + xxswapd 52, 52 + lxvd2x 53, 3, 9 + lxvd2x 54, 3, 16 + lxvd2x 55, 3, 18 + lxvd2x 56, 3, 20 + xxswapd 53, 53 + xxswapd 54, 54 + xxswapd 55, 55 + xxswapd 56, 56 + vsubuhm 25, 8, 21 + vsubuhm 26, 12, 22 + vsubuhm 30, 16, 23 + vsubuhm 31, 20, 24 + vadduhm 8, 8, 21 + vadduhm 12, 12, 22 + vadduhm 16, 16, 23 + vadduhm 20, 20, 24 + vxor 7, 7, 7 + xxlor 35, 6, 6 + xxlor 33, 7, 7 + xxlor 34, 8, 8 + vmulosh 6, 8, 0 + vmulesh 5, 8, 0 + vmulosh 11, 12, 0 + vmulesh 10, 12, 0 + vmulosh 15, 16, 0 + vmulesh 14, 16, 0 + vmulosh 19, 20, 0 + vmulesh 18, 20, 0 + xxmrglw 36, 37, 38 + xxmrghw 37, 37, 38 + xxmrglw 41, 42, 43 + xxmrghw 42, 42, 43 + xxmrglw 45, 46, 47 + xxmrghw 46, 46, 47 + xxmrglw 49, 50, 51 + xxmrghw 50, 50, 51 + vadduwm 4, 4, 1 + vadduwm 5, 5, 1 + vadduwm 9, 9, 1 + vadduwm 10, 10, 1 + vadduwm 13, 13, 1 + vadduwm 14, 14, 1 + vadduwm 17, 17, 1 + vadduwm 18, 18, 1 + vsraw 4, 4, 2 + vsraw 5, 5, 2 + vsraw 9, 9, 2 + vsraw 10, 10, 2 + vsraw 13, 13, 2 + vsraw 14, 14, 2 + vsraw 17, 17, 2 + vsraw 18, 18, 2 + vpkuwum 4, 5, 4 + vsubuhm 4, 7, 4 + vpkuwum 9, 10, 9 + vsubuhm 9, 7, 9 + vpkuwum 13, 14, 13 + vsubuhm 13, 7, 13 + vpkuwum 17, 18, 17 + vsubuhm 17, 7, 17 + vmladduhm 4, 4, 3, 8 + vmladduhm 9, 9, 3, 12 + vmladduhm 13, 13, 3, 16 
+ vmladduhm 17, 17, 3, 20 + stxvx 36, 3, 9 + stxvx 41, 3, 16 + stxvx 45, 3, 18 + stxvx 49, 3, 20 + xxlor 37, 0, 0 + xxlor 34, 2, 2 + xxlor 35, 3, 3 + xxlor 36, 4, 4 + lxv 42, -16(14) + vmladduhm 15, 25, 10, 3 + vmladduhm 20, 26, 10, 3 + vmladduhm 27, 30, 10, 3 + vmladduhm 28, 31, 10, 3 + vmhraddshs 14, 25, 10, 3 + vmhraddshs 19, 26, 10, 3 + vmhraddshs 24, 30, 10, 3 + vmhraddshs 29, 31, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 27, 2, 3 + vmladduhm 30, 28, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 + stxvx 45, 3, 10 + stxvx 50, 3, 17 + stxvx 55, 3, 19 + stxvx 60, 3, 21 + li 5, 256 + mr 9, 5 + add 10, 7, 9 + addi 16, 9, 16 + addi 17, 10, 16 + addi 18, 16, 16 + addi 19, 17, 16 + addi 20, 18, 16 + addi 21, 19, 16 + lxvd2x 40, 3, 10 + lxvd2x 44, 3, 17 + lxvd2x 48, 3, 19 + lxvd2x 52, 3, 21 + xxswapd 40, 40 + xxswapd 44, 44 + xxswapd 48, 48 + xxswapd 52, 52 + lxvd2x 53, 3, 9 + lxvd2x 54, 3, 16 + lxvd2x 55, 3, 18 + lxvd2x 56, 3, 20 + xxswapd 53, 53 + xxswapd 54, 54 + xxswapd 55, 55 + xxswapd 56, 56 + vsubuhm 25, 8, 21 + vsubuhm 26, 12, 22 + vsubuhm 30, 16, 23 + vsubuhm 31, 20, 24 + vadduhm 8, 8, 21 + vadduhm 12, 12, 22 + vadduhm 16, 16, 23 + vadduhm 20, 20, 24 + vxor 7, 7, 7 + xxlor 35, 6, 6 + xxlor 33, 7, 7 + xxlor 34, 8, 8 + vmulosh 6, 8, 0 + vmulesh 5, 8, 0 + vmulosh 11, 12, 0 + vmulesh 10, 12, 0 + vmulosh 15, 16, 0 + vmulesh 14, 16, 0 + vmulosh 19, 20, 0 + vmulesh 18, 20, 0 + xxmrglw 36, 37, 38 + xxmrghw 37, 37, 38 + xxmrglw 41, 42, 43 + xxmrghw 42, 42, 43 + xxmrglw 45, 46, 47 + xxmrghw 46, 46, 47 + xxmrglw 49, 50, 51 + xxmrghw 50, 50, 51 + vadduwm 4, 4, 1 + vadduwm 5, 5, 1 + vadduwm 9, 9, 1 + vadduwm 10, 10, 1 + vadduwm 13, 13, 1 + vadduwm 14, 14, 1 + vadduwm 17, 17, 1 + vadduwm 18, 18, 1 + vsraw 4, 4, 2 + vsraw 5, 5, 2 + vsraw 9, 9, 2 + vsraw 10, 10, 2 + vsraw 13, 13, 2 + vsraw 14, 14, 2 + vsraw 17, 17, 2 + vsraw 18, 18, 2 + vpkuwum 4, 5, 4 + vsubuhm 4, 7, 4 + vpkuwum 9, 10, 9 + vsubuhm 9, 7, 9 + vpkuwum 13, 14, 13 + vsubuhm 13, 7, 13 + vpkuwum 17, 18, 17 + vsubuhm 17, 7, 17 + vmladduhm 4, 4, 3, 8 + vmladduhm 9, 9, 3, 12 + vmladduhm 13, 13, 3, 16 + vmladduhm 17, 17, 3, 20 + stxvx 36, 3, 9 + stxvx 41, 3, 16 + stxvx 45, 3, 18 + stxvx 49, 3, 20 + xxlor 37, 0, 0 + xxlor 34, 2, 2 + xxlor 35, 3, 3 + xxlor 36, 4, 4 + lvx 10, 0, 14 + addi 14, 14, 16 + vmladduhm 15, 25, 10, 3 + vmladduhm 20, 26, 10, 3 + vmladduhm 27, 30, 10, 3 + vmladduhm 28, 31, 10, 3 + vmhraddshs 14, 25, 10, 3 + vmhraddshs 19, 26, 10, 3 + vmhraddshs 24, 30, 10, 3 + vmhraddshs 29, 31, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 27, 2, 3 + vmladduhm 30, 28, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 + stxvx 45, 3, 10 + stxvx 50, 3, 17 + stxvx 55, 3, 19 + stxvx 60, 3, 21 + li 5, 320 + mr 9, 5 + add 10, 7, 9 + addi 16, 9, 16 + addi 17, 10, 16 + addi 18, 16, 16 + addi 19, 17, 16 + addi 20, 18, 16 + addi 21, 19, 16 + lxvd2x 40, 3, 10 + lxvd2x 44, 3, 17 + lxvd2x 48, 3, 19 + lxvd2x 52, 3, 21 + xxswapd 40, 40 + xxswapd 44, 44 + xxswapd 48, 48 + xxswapd 52, 52 + lxvd2x 53, 3, 9 + lxvd2x 54, 3, 16 + lxvd2x 55, 3, 18 + lxvd2x 56, 3, 20 + xxswapd 53, 53 + xxswapd 54, 54 + xxswapd 55, 55 + xxswapd 56, 56 + vsubuhm 25, 8, 21 + vsubuhm 26, 12, 22 + vsubuhm 30, 16, 23 + vsubuhm 31, 20, 24 + vadduhm 8, 8, 21 + 
vadduhm 12, 12, 22 + vadduhm 16, 16, 23 + vadduhm 20, 20, 24 + vxor 7, 7, 7 + xxlor 35, 6, 6 + xxlor 33, 7, 7 + xxlor 34, 8, 8 + vmulosh 6, 8, 0 + vmulesh 5, 8, 0 + vmulosh 11, 12, 0 + vmulesh 10, 12, 0 + vmulosh 15, 16, 0 + vmulesh 14, 16, 0 + vmulosh 19, 20, 0 + vmulesh 18, 20, 0 + xxmrglw 36, 37, 38 + xxmrghw 37, 37, 38 + xxmrglw 41, 42, 43 + xxmrghw 42, 42, 43 + xxmrglw 45, 46, 47 + xxmrghw 46, 46, 47 + xxmrglw 49, 50, 51 + xxmrghw 50, 50, 51 + vadduwm 4, 4, 1 + vadduwm 5, 5, 1 + vadduwm 9, 9, 1 + vadduwm 10, 10, 1 + vadduwm 13, 13, 1 + vadduwm 14, 14, 1 + vadduwm 17, 17, 1 + vadduwm 18, 18, 1 + vsraw 4, 4, 2 + vsraw 5, 5, 2 + vsraw 9, 9, 2 + vsraw 10, 10, 2 + vsraw 13, 13, 2 + vsraw 14, 14, 2 + vsraw 17, 17, 2 + vsraw 18, 18, 2 + vpkuwum 4, 5, 4 + vsubuhm 4, 7, 4 + vpkuwum 9, 10, 9 + vsubuhm 9, 7, 9 + vpkuwum 13, 14, 13 + vsubuhm 13, 7, 13 + vpkuwum 17, 18, 17 + vsubuhm 17, 7, 17 + vmladduhm 4, 4, 3, 8 + vmladduhm 9, 9, 3, 12 + vmladduhm 13, 13, 3, 16 + vmladduhm 17, 17, 3, 20 + stxvx 36, 3, 9 + stxvx 41, 3, 16 + stxvx 45, 3, 18 + stxvx 49, 3, 20 + xxlor 37, 0, 0 + xxlor 34, 2, 2 + xxlor 35, 3, 3 + xxlor 36, 4, 4 + lxv 42, -16(14) + vmladduhm 15, 25, 10, 3 + vmladduhm 20, 26, 10, 3 + vmladduhm 27, 30, 10, 3 + vmladduhm 28, 31, 10, 3 + vmhraddshs 14, 25, 10, 3 + vmhraddshs 19, 26, 10, 3 + vmhraddshs 24, 30, 10, 3 + vmhraddshs 29, 31, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 27, 2, 3 + vmladduhm 30, 28, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 + stxvx 45, 3, 10 + stxvx 50, 3, 17 + stxvx 55, 3, 19 + stxvx 60, 3, 21 + ori 2, 2, 0 + li 5, 0 + li 7, 256 + mr 9, 5 + add 10, 7, 9 + addi 16, 9, 16 + addi 17, 10, 16 + addi 18, 16, 16 + addi 19, 17, 16 + addi 20, 18, 16 + addi 21, 19, 16 + lxvd2x 40, 3, 10 + lxvd2x 44, 3, 17 + lxvd2x 48, 3, 19 + lxvd2x 52, 3, 21 + xxswapd 40, 40 + xxswapd 44, 44 + xxswapd 48, 48 + xxswapd 52, 52 + lxvd2x 53, 3, 9 + lxvd2x 54, 3, 16 + lxvd2x 55, 3, 18 + lxvd2x 56, 3, 20 + xxswapd 53, 53 + xxswapd 54, 54 + xxswapd 55, 55 + xxswapd 56, 56 + vsubuhm 25, 8, 21 + vsubuhm 26, 12, 22 + vsubuhm 30, 16, 23 + vsubuhm 31, 20, 24 + vadduhm 8, 8, 21 + vadduhm 12, 12, 22 + vadduhm 16, 16, 23 + vadduhm 20, 20, 24 + vxor 7, 7, 7 + xxlor 35, 6, 6 + xxlor 33, 7, 7 + xxlor 34, 8, 8 + vmulosh 6, 8, 0 + vmulesh 5, 8, 0 + vmulosh 11, 12, 0 + vmulesh 10, 12, 0 + vmulosh 15, 16, 0 + vmulesh 14, 16, 0 + vmulosh 19, 20, 0 + vmulesh 18, 20, 0 + xxmrglw 36, 37, 38 + xxmrghw 37, 37, 38 + xxmrglw 41, 42, 43 + xxmrghw 42, 42, 43 + xxmrglw 45, 46, 47 + xxmrghw 46, 46, 47 + xxmrglw 49, 50, 51 + xxmrghw 50, 50, 51 + vadduwm 4, 4, 1 + vadduwm 5, 5, 1 + vadduwm 9, 9, 1 + vadduwm 10, 10, 1 + vadduwm 13, 13, 1 + vadduwm 14, 14, 1 + vadduwm 17, 17, 1 + vadduwm 18, 18, 1 + vsraw 4, 4, 2 + vsraw 5, 5, 2 + vsraw 9, 9, 2 + vsraw 10, 10, 2 + vsraw 13, 13, 2 + vsraw 14, 14, 2 + vsraw 17, 17, 2 + vsraw 18, 18, 2 + vpkuwum 4, 5, 4 + vsubuhm 4, 7, 4 + vpkuwum 9, 10, 9 + vsubuhm 9, 7, 9 + vpkuwum 13, 14, 13 + vsubuhm 13, 7, 13 + vpkuwum 17, 18, 17 + vsubuhm 17, 7, 17 + vmladduhm 4, 4, 3, 8 + vmladduhm 9, 9, 3, 12 + vmladduhm 13, 13, 3, 16 + vmladduhm 17, 17, 3, 20 + stxvx 36, 3, 9 + stxvx 41, 3, 16 + stxvx 45, 3, 18 + stxvx 49, 3, 20 + xxlor 37, 0, 0 + xxlor 34, 2, 2 + xxlor 35, 3, 3 + xxlor 36, 4, 4 + lvx 10, 0, 14 + xxlor 9, 42, 42 + vmladduhm 15, 25, 10, 3 + vmladduhm 20, 26, 10, 3 + vmladduhm 27, 30, 10, 3 + vmladduhm 28, 31, 10, 3 + 
vmhraddshs 14, 25, 10, 3 + vmhraddshs 19, 26, 10, 3 + vmhraddshs 24, 30, 10, 3 + vmhraddshs 29, 31, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 27, 2, 3 + vmladduhm 30, 28, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 + stxvx 45, 3, 10 + stxvx 50, 3, 17 + stxvx 55, 3, 19 + stxvx 60, 3, 21 + li 5, 64 + mr 9, 5 + add 10, 7, 9 + addi 16, 9, 16 + addi 17, 10, 16 + addi 18, 16, 16 + addi 19, 17, 16 + addi 20, 18, 16 + addi 21, 19, 16 + lxvd2x 40, 3, 10 + lxvd2x 44, 3, 17 + lxvd2x 48, 3, 19 + lxvd2x 52, 3, 21 + xxswapd 40, 40 + xxswapd 44, 44 + xxswapd 48, 48 + xxswapd 52, 52 + lxvd2x 53, 3, 9 + lxvd2x 54, 3, 16 + lxvd2x 55, 3, 18 + lxvd2x 56, 3, 20 + xxswapd 53, 53 + xxswapd 54, 54 + xxswapd 55, 55 + xxswapd 56, 56 + vsubuhm 25, 8, 21 + vsubuhm 26, 12, 22 + vsubuhm 30, 16, 23 + vsubuhm 31, 20, 24 + vadduhm 8, 8, 21 + vadduhm 12, 12, 22 + vadduhm 16, 16, 23 + vadduhm 20, 20, 24 + vxor 7, 7, 7 + xxlor 35, 6, 6 + xxlor 33, 7, 7 + xxlor 34, 8, 8 + vmulosh 6, 8, 0 + vmulesh 5, 8, 0 + vmulosh 11, 12, 0 + vmulesh 10, 12, 0 + vmulosh 15, 16, 0 + vmulesh 14, 16, 0 + vmulosh 19, 20, 0 + vmulesh 18, 20, 0 + xxmrglw 36, 37, 38 + xxmrghw 37, 37, 38 + xxmrglw 41, 42, 43 + xxmrghw 42, 42, 43 + xxmrglw 45, 46, 47 + xxmrghw 46, 46, 47 + xxmrglw 49, 50, 51 + xxmrghw 50, 50, 51 + vadduwm 4, 4, 1 + vadduwm 5, 5, 1 + vadduwm 9, 9, 1 + vadduwm 10, 10, 1 + vadduwm 13, 13, 1 + vadduwm 14, 14, 1 + vadduwm 17, 17, 1 + vadduwm 18, 18, 1 + vsraw 4, 4, 2 + vsraw 5, 5, 2 + vsraw 9, 9, 2 + vsraw 10, 10, 2 + vsraw 13, 13, 2 + vsraw 14, 14, 2 + vsraw 17, 17, 2 + vsraw 18, 18, 2 + vpkuwum 4, 5, 4 + vsubuhm 4, 7, 4 + vpkuwum 9, 10, 9 + vsubuhm 9, 7, 9 + vpkuwum 13, 14, 13 + vsubuhm 13, 7, 13 + vpkuwum 17, 18, 17 + vsubuhm 17, 7, 17 + vmladduhm 4, 4, 3, 8 + vmladduhm 9, 9, 3, 12 + vmladduhm 13, 13, 3, 16 + vmladduhm 17, 17, 3, 20 + stxvx 36, 3, 9 + stxvx 41, 3, 16 + stxvx 45, 3, 18 + stxvx 49, 3, 20 + xxlor 37, 0, 0 + xxlor 34, 2, 2 + xxlor 35, 3, 3 + xxlor 36, 4, 4 + xxlor 42, 9, 9 + vmladduhm 15, 25, 10, 3 + vmladduhm 20, 26, 10, 3 + vmladduhm 27, 30, 10, 3 + vmladduhm 28, 31, 10, 3 + vmhraddshs 14, 25, 10, 3 + vmhraddshs 19, 26, 10, 3 + vmhraddshs 24, 30, 10, 3 + vmhraddshs 29, 31, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 27, 2, 3 + vmladduhm 30, 28, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 + stxvx 45, 3, 10 + stxvx 50, 3, 17 + stxvx 55, 3, 19 + stxvx 60, 3, 21 + li 5, 128 + mr 9, 5 + add 10, 7, 9 + addi 16, 9, 16 + addi 17, 10, 16 + addi 18, 16, 16 + addi 19, 17, 16 + addi 20, 18, 16 + addi 21, 19, 16 + lxvd2x 40, 3, 10 + lxvd2x 44, 3, 17 + lxvd2x 48, 3, 19 + lxvd2x 52, 3, 21 + xxswapd 40, 40 + xxswapd 44, 44 + xxswapd 48, 48 + xxswapd 52, 52 + lxvd2x 53, 3, 9 + lxvd2x 54, 3, 16 + lxvd2x 55, 3, 18 + lxvd2x 56, 3, 20 + xxswapd 53, 53 + xxswapd 54, 54 + xxswapd 55, 55 + xxswapd 56, 56 + vsubuhm 25, 8, 21 + vsubuhm 26, 12, 22 + vsubuhm 30, 16, 23 + vsubuhm 31, 20, 24 + vadduhm 8, 8, 21 + vadduhm 12, 12, 22 + vadduhm 16, 16, 23 + vadduhm 20, 20, 24 + vxor 7, 7, 7 + xxlor 35, 6, 6 + xxlor 33, 7, 7 + xxlor 34, 8, 8 + vmulosh 6, 8, 0 + vmulesh 5, 8, 0 + vmulosh 11, 12, 0 + vmulesh 10, 12, 0 + vmulosh 15, 16, 0 + vmulesh 14, 16, 0 + vmulosh 19, 20, 0 + vmulesh 18, 20, 0 + xxmrglw 36, 37, 38 + 
xxmrghw 37, 37, 38 + xxmrglw 41, 42, 43 + xxmrghw 42, 42, 43 + xxmrglw 45, 46, 47 + xxmrghw 46, 46, 47 + xxmrglw 49, 50, 51 + xxmrghw 50, 50, 51 + vadduwm 4, 4, 1 + vadduwm 5, 5, 1 + vadduwm 9, 9, 1 + vadduwm 10, 10, 1 + vadduwm 13, 13, 1 + vadduwm 14, 14, 1 + vadduwm 17, 17, 1 + vadduwm 18, 18, 1 + vsraw 4, 4, 2 + vsraw 5, 5, 2 + vsraw 9, 9, 2 + vsraw 10, 10, 2 + vsraw 13, 13, 2 + vsraw 14, 14, 2 + vsraw 17, 17, 2 + vsraw 18, 18, 2 + vpkuwum 4, 5, 4 + vsubuhm 4, 7, 4 + vpkuwum 9, 10, 9 + vsubuhm 9, 7, 9 + vpkuwum 13, 14, 13 + vsubuhm 13, 7, 13 + vpkuwum 17, 18, 17 + vsubuhm 17, 7, 17 + vmladduhm 4, 4, 3, 8 + vmladduhm 9, 9, 3, 12 + vmladduhm 13, 13, 3, 16 + vmladduhm 17, 17, 3, 20 + stxvx 36, 3, 9 + stxvx 41, 3, 16 + stxvx 45, 3, 18 + stxvx 49, 3, 20 + xxlor 37, 0, 0 + xxlor 34, 2, 2 + xxlor 35, 3, 3 + xxlor 36, 4, 4 + xxlor 42, 9, 9 + vmladduhm 15, 25, 10, 3 + vmladduhm 20, 26, 10, 3 + vmladduhm 27, 30, 10, 3 + vmladduhm 28, 31, 10, 3 + vmhraddshs 14, 25, 10, 3 + vmhraddshs 19, 26, 10, 3 + vmhraddshs 24, 30, 10, 3 + vmhraddshs 29, 31, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 27, 2, 3 + vmladduhm 30, 28, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 + stxvx 45, 3, 10 + stxvx 50, 3, 17 + stxvx 55, 3, 19 + stxvx 60, 3, 21 + li 5, 192 + mr 9, 5 + add 10, 7, 9 + addi 16, 9, 16 + addi 17, 10, 16 + addi 18, 16, 16 + addi 19, 17, 16 + addi 20, 18, 16 + addi 21, 19, 16 + lxvd2x 40, 3, 10 + lxvd2x 44, 3, 17 + lxvd2x 48, 3, 19 + lxvd2x 52, 3, 21 + xxswapd 40, 40 + xxswapd 44, 44 + xxswapd 48, 48 + xxswapd 52, 52 + lxvd2x 53, 3, 9 + lxvd2x 54, 3, 16 + lxvd2x 55, 3, 18 + lxvd2x 56, 3, 20 + xxswapd 53, 53 + xxswapd 54, 54 + xxswapd 55, 55 + xxswapd 56, 56 + vsubuhm 25, 8, 21 + vsubuhm 26, 12, 22 + vsubuhm 30, 16, 23 + vsubuhm 31, 20, 24 + vadduhm 8, 8, 21 + vadduhm 12, 12, 22 + vadduhm 16, 16, 23 + vadduhm 20, 20, 24 + vxor 7, 7, 7 + xxlor 35, 6, 6 + xxlor 33, 7, 7 + xxlor 34, 8, 8 + vmulosh 6, 8, 0 + vmulesh 5, 8, 0 + vmulosh 11, 12, 0 + vmulesh 10, 12, 0 + vmulosh 15, 16, 0 + vmulesh 14, 16, 0 + vmulosh 19, 20, 0 + vmulesh 18, 20, 0 + xxmrglw 36, 37, 38 + xxmrghw 37, 37, 38 + xxmrglw 41, 42, 43 + xxmrghw 42, 42, 43 + xxmrglw 45, 46, 47 + xxmrghw 46, 46, 47 + xxmrglw 49, 50, 51 + xxmrghw 50, 50, 51 + vadduwm 4, 4, 1 + vadduwm 5, 5, 1 + vadduwm 9, 9, 1 + vadduwm 10, 10, 1 + vadduwm 13, 13, 1 + vadduwm 14, 14, 1 + vadduwm 17, 17, 1 + vadduwm 18, 18, 1 + vsraw 4, 4, 2 + vsraw 5, 5, 2 + vsraw 9, 9, 2 + vsraw 10, 10, 2 + vsraw 13, 13, 2 + vsraw 14, 14, 2 + vsraw 17, 17, 2 + vsraw 18, 18, 2 + vpkuwum 4, 5, 4 + vsubuhm 4, 7, 4 + vpkuwum 9, 10, 9 + vsubuhm 9, 7, 9 + vpkuwum 13, 14, 13 + vsubuhm 13, 7, 13 + vpkuwum 17, 18, 17 + vsubuhm 17, 7, 17 + vmladduhm 4, 4, 3, 8 + vmladduhm 9, 9, 3, 12 + vmladduhm 13, 13, 3, 16 + vmladduhm 17, 17, 3, 20 + stxvx 36, 3, 9 + stxvx 41, 3, 16 + stxvx 45, 3, 18 + stxvx 49, 3, 20 + xxlor 37, 0, 0 + xxlor 34, 2, 2 + xxlor 35, 3, 3 + xxlor 36, 4, 4 + xxlor 42, 9, 9 + vmladduhm 15, 25, 10, 3 + vmladduhm 20, 26, 10, 3 + vmladduhm 27, 30, 10, 3 + vmladduhm 28, 31, 10, 3 + vmhraddshs 14, 25, 10, 3 + vmhraddshs 19, 26, 10, 3 + vmhraddshs 24, 30, 10, 3 + vmhraddshs 29, 31, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 27, 2, 3 + vmladduhm 30, 28, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 18, 20, 4 
+ vsrah 23, 25, 4 + vsrah 28, 30, 4 + stxvx 45, 3, 10 + stxvx 50, 3, 17 + stxvx 55, 3, 19 + stxvx 60, 3, 21 + nop + ori 2, 2, 0 + addi 14, 4, 64 + lvx 10, 0, 14 + lxv 57, 0(3) + lxv 58, 16(3) + lxv 62, 32(3) + lxv 63, 48(3) + addi 3, 3, 64 + vmladduhm 15, 25, 10, 3 + vmladduhm 20, 26, 10, 3 + vmladduhm 27, 30, 10, 3 + vmladduhm 28, 31, 10, 3 + vmhraddshs 14, 25, 10, 3 + vmhraddshs 19, 26, 10, 3 + vmhraddshs 24, 30, 10, 3 + vmhraddshs 29, 31, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 27, 2, 3 + vmladduhm 30, 28, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 6, 15, 4 + vsrah 7, 20, 4 + vsrah 8, 25, 4 + vsrah 9, 30, 4 + lxv 57, 0(3) + lxv 58, 16(3) + lxv 62, 32(3) + lxv 63, 48(3) + addi 3, 3, 64 + vmladduhm 15, 25, 10, 3 + vmladduhm 20, 26, 10, 3 + vmladduhm 27, 30, 10, 3 + vmladduhm 28, 31, 10, 3 + vmhraddshs 14, 25, 10, 3 + vmhraddshs 19, 26, 10, 3 + vmhraddshs 24, 30, 10, 3 + vmhraddshs 29, 31, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 27, 2, 3 + vmladduhm 30, 28, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 + stxv 38, -128(3) + stxv 39, -112(3) + stxv 40, -96(3) + stxv 41, -80(3) + stxv 45, -64(3) + stxv 50, -48(3) + stxv 55, -32(3) + stxv 60, -16(3) + lxv 57, 0(3) + lxv 58, 16(3) + lxv 62, 32(3) + lxv 63, 48(3) + addi 3, 3, 64 + vmladduhm 15, 25, 10, 3 + vmladduhm 20, 26, 10, 3 + vmladduhm 27, 30, 10, 3 + vmladduhm 28, 31, 10, 3 + vmhraddshs 14, 25, 10, 3 + vmhraddshs 19, 26, 10, 3 + vmhraddshs 24, 30, 10, 3 + vmhraddshs 29, 31, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 27, 2, 3 + vmladduhm 30, 28, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 6, 15, 4 + vsrah 7, 20, 4 + vsrah 8, 25, 4 + vsrah 9, 30, 4 + lxv 57, 0(3) + lxv 58, 16(3) + lxv 62, 32(3) + lxv 63, 48(3) + addi 3, 3, 64 + vmladduhm 15, 25, 10, 3 + vmladduhm 20, 26, 10, 3 + vmladduhm 27, 30, 10, 3 + vmladduhm 28, 31, 10, 3 + vmhraddshs 14, 25, 10, 3 + vmhraddshs 19, 26, 10, 3 + vmhraddshs 24, 30, 10, 3 + vmhraddshs 29, 31, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 27, 2, 3 + vmladduhm 30, 28, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 + stxv 38, -128(3) + stxv 39, -112(3) + stxv 40, -96(3) + stxv 41, -80(3) + stxv 45, -64(3) + stxv 50, -48(3) + stxv 55, -32(3) + stxv 60, -16(3) + lxv 57, 0(3) + lxv 58, 16(3) + lxv 62, 32(3) + lxv 63, 48(3) + addi 3, 3, 64 + vmladduhm 15, 25, 10, 3 + vmladduhm 20, 26, 10, 3 + vmladduhm 27, 30, 10, 3 + vmladduhm 28, 31, 10, 3 + vmhraddshs 14, 25, 10, 3 + vmhraddshs 19, 26, 10, 3 + vmhraddshs 24, 30, 10, 3 + vmhraddshs 29, 31, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 27, 2, 3 + vmladduhm 30, 28, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 6, 15, 4 + vsrah 7, 20, 4 + vsrah 8, 25, 4 + vsrah 9, 30, 4 + lxv 57, 0(3) + lxv 58, 16(3) + lxv 62, 32(3) + lxv 63, 48(3) + addi 3, 3, 64 + vmladduhm 15, 25, 10, 3 + vmladduhm 20, 26, 10, 3 + vmladduhm 27, 30, 10, 3 + vmladduhm 28, 31, 10, 3 + vmhraddshs 14, 25, 10, 3 + vmhraddshs 19, 26, 10, 3 + vmhraddshs 24, 30, 
10, 3 + vmhraddshs 29, 31, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 27, 2, 3 + vmladduhm 30, 28, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 + stxv 38, -128(3) + stxv 39, -112(3) + stxv 40, -96(3) + stxv 41, -80(3) + stxv 45, -64(3) + stxv 50, -48(3) + stxv 55, -32(3) + stxv 60, -16(3) + lxv 57, 0(3) + lxv 58, 16(3) + lxv 62, 32(3) + lxv 63, 48(3) + addi 3, 3, 64 + vmladduhm 15, 25, 10, 3 + vmladduhm 20, 26, 10, 3 + vmladduhm 27, 30, 10, 3 + vmladduhm 28, 31, 10, 3 + vmhraddshs 14, 25, 10, 3 + vmhraddshs 19, 26, 10, 3 + vmhraddshs 24, 30, 10, 3 + vmhraddshs 29, 31, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 27, 2, 3 + vmladduhm 30, 28, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 6, 15, 4 + vsrah 7, 20, 4 + vsrah 8, 25, 4 + vsrah 9, 30, 4 + lxv 57, 0(3) + lxv 58, 16(3) + lxv 62, 32(3) + lxv 63, 48(3) + addi 3, 3, 64 + vmladduhm 15, 25, 10, 3 + vmladduhm 20, 26, 10, 3 + vmladduhm 27, 30, 10, 3 + vmladduhm 28, 31, 10, 3 + vmhraddshs 14, 25, 10, 3 + vmhraddshs 19, 26, 10, 3 + vmhraddshs 24, 30, 10, 3 + vmhraddshs 29, 31, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 27, 2, 3 + vmladduhm 30, 28, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 + stxv 38, -128(3) + stxv 39, -112(3) + stxv 40, -96(3) + stxv 41, -80(3) + stxv 45, -64(3) + stxv 50, -48(3) + stxv 55, -32(3) + stxv 60, -16(3) + lxv 52, 128(1) + lxv 53, 144(1) + lxv 54, 160(1) + lxv 55, 176(1) + lxv 56, 192(1) + lxv 57, 208(1) + lxv 58, 224(1) + lxv 59, 240(1) + lxv 60, 256(1) + lxv 61, 272(1) + lxv 62, 288(1) + lxv 63, 304(1) + ld 14, 56(1) + ld 15, 64(1) + ld 16, 72(1) + ld 16, 72(1) + ld 17, 80(1) + ld 18, 88(1) + ld 19, 96(1) + ld 20, 104(1) + ld 21, 112(1) + mtlr 0 + addi 1, 1, 352 blr + .cfi_endproc -/* To facilitate single-compilation-unit (SCU) builds, undefine all macros. - * Don't modify by hand -- this is auto-generated by scripts/autogen. */ -#undef V20159 -#undef V_25 -#undef V_26 -#undef V_MKQ -#undef V_QINV -#undef V_NMKQ -#undef V_Z0 -#undef V_Z1 -#undef V_Z2 -#undef V_Z3 -#undef V_ZETA -#undef V1441 - -#endif /* MLK_ARITH_BACKEND_PPC64LE_DEFAULT && \ - !MLK_CONFIG_MULTILEVEL_NO_SHARED */ - -/* To facilitate single-compilation-unit (SCU) builds, undefine all macros. - * Don't modify by hand -- this is auto-generated by scripts/autogen. */ -#undef V20159 -#undef V_25 -#undef V_26 -#undef V_MKQ -#undef V_QINV -#undef V_NMKQ -#undef V_Z0 -#undef V_Z1 -#undef V_Z2 -#undef V_Z3 -#undef V_ZETA -#undef V1441 +#endif /* MLK_ARITH_BACKEND_PPC64LE && !MLK_CONFIG_MULTILEVEL_NO_SHARED */ diff --git a/mlkem/src/native/ppc64le/src/ntt_ppc.S b/mlkem/src/native/ppc64le/src/ntt_ppc.S index 834f5091f..dd0aef877 100644 --- a/mlkem/src/native/ppc64le/src/ntt_ppc.S +++ b/mlkem/src/native/ppc64le/src/ntt_ppc.S @@ -3,410 +3,1659 @@ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT */ -# -# Copyright 2025- IBM Corp. -# -#=================================================================================== -# Written by Danny Tsen -# +/* + * Copyright 2025- IBM Corp. 
+ * + *=================================================================================== + * Written by Danny Tsen + * + */ #include "../../../common.h" -#if defined(MLK_ARITH_BACKEND_PPC64LE_DEFAULT) && \ +#if defined(MLK_ARITH_BACKEND_PPC64LE) && \ !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED) -#include "consts.h" +/* + * WARNING: This file is auto-derived from the mlkem-native source file + * dev/ppc64le/src/ntt_ppc.S using scripts/simpasm. Do not modify it directly. + */ -#define V_QINV 2 -#define V_NMKQ 5 -#define V_Z0 7 -#define V_Z1 8 -#define V_Z2 9 -#define V_Z3 10 -#define V_ZETA 10 -.machine "any" .text - -# -# montgomery_reduce -# t = a * QINV -# t = (a - (int32_t)t*_MLKEM_Q) >> 16 -# -#----------------------------------- -# MREDUCE_4X(start, _vz0, _vz1, _vz2, _vz3) -# -.macro MREDUCE_4X start next step _vz0 _vz1 _vz2 _vz3 - mr 9, \start - add 10, 7, 9 # J + len*2 - addi 16, 9, \next - addi 17, 10, \step - addi 18, 16, \next - addi 19, 17, \step - addi 20, 18, \next - addi 21, 19, \step - lxvd2x 32+13, 3, 10 # r[j+len] - lxvd2x 32+18, 3, 17 # r[j+len] - lxvd2x 32+23, 3, 19 # r[j+len] - lxvd2x 32+28, 3, 21 # r[j+len] - xxpermdi 32+13, 32+13, 32+13, 2 - xxpermdi 32+18, 32+18, 32+18, 2 - xxpermdi 32+23, 32+23, 32+23, 2 - xxpermdi 32+28, 32+28, 32+28, 2 - - # fqmul = zeta * coefficient - # Modular multification bond by 2^16 * q in abs value - vmladduhm 15, 13, \_vz0, 3 - vmladduhm 20, 18, \_vz1, 3 - vmladduhm 25, 23, \_vz2, 3 - vmladduhm 30, 28, \_vz3, 3 - - # Signed multiply-high-round; outputs are bound by 2^15 * q in abs value - vmhraddshs 14, 13, \_vz0, 3 - vmhraddshs 19, 18, \_vz1, 3 - vmhraddshs 24, 23, \_vz2, 3 - vmhraddshs 29, 28, \_vz3, 3 - - vmladduhm 15, 15, V_QINV, 3 - vmladduhm 20, 20, V_QINV, 3 - vmladduhm 25, 25, V_QINV, 3 - vmladduhm 30, 30, V_QINV, 3 - - vmhraddshs 15, 15, V_NMKQ, 14 - vmhraddshs 20, 20, V_NMKQ, 19 - vmhraddshs 25, 25, V_NMKQ, 24 - vmhraddshs 30, 30, V_NMKQ, 29 - - vsrah 13, 15, 4 # >> 1 - vsrah 18, 20, 4 # >> 1 - vsrah 23, 25, 4 # >> 1 - vsrah 28, 30, 4 # >> 1 - - lxvd2x 32+12, 3, 9 # r[j] - lxvd2x 32+17, 3, 16 # r[j] - lxvd2x 32+22, 3, 18 # r[j] - lxvd2x 32+27, 3, 20 # r[j] - xxpermdi 32+12, 32+12, 32+12, 2 - xxpermdi 32+17, 32+17, 32+17, 2 - xxpermdi 32+22, 32+22, 32+22, 2 - xxpermdi 32+27, 32+27, 32+27, 2 - - # Since the result of the Montgomery multiplication is bounded - # by q in absolute value. 
- # Finally to complete the final update of the results with add/sub - vsubuhm 16, 12, 13 # r - t - vadduhm 15, 13, 12 # r + t - vsubuhm 21, 17, 18 # r - t - vadduhm 20, 18, 17 # r + t - vsubuhm 26, 22, 23 # r - t - vadduhm 25, 23, 22 # r + t - vsubuhm 31, 27, 28 # r - t - vadduhm 30, 28, 27 # r + t -.endm - -.macro Write_One - stxvx 32+15, 3, 9 - stxvx 32+16, 3, 10 - stxvx 32+20, 3, 16 - stxvx 32+21, 3, 17 - stxvx 32+25, 3, 18 - stxvx 32+26, 3, 19 - stxvx 32+30, 3, 20 - stxvx 32+31, 3, 21 -.endm - -.macro Write_Two - xxpermdi 32+17, 32+16, 32+15, 3 - xxpermdi 32+22, 32+21, 32+20, 3 - xxpermdi 32+27, 32+26, 32+25, 3 - xxpermdi 32+29, 32+31, 32+30, 3 - - stxvx 32+17, 3, 9 - stxvx 32+22, 3, 16 - stxvx 32+27, 3, 18 - stxvx 32+29, 3, 20 -.endm - -.macro Write_Three - xxmrglw 32+14, 32+16, 32+15 - xxmrghw 32+13, 32+16, 32+15 - xxpermdi 32+17, 32+13, 32+14, 3 - xxmrglw 32+19, 32+21, 32+20 - xxmrghw 32+18, 32+21, 32+20 - xxpermdi 32+22, 32+18, 32+19, 3 - xxmrglw 32+14, 32+26, 32+25 - xxmrghw 32+13, 32+26, 32+25 - xxpermdi 32+27, 32+13, 32+14, 3 - xxmrglw 32+24, 32+31, 32+30 - xxmrghw 32+23, 32+31, 32+30 - xxpermdi 32+29, 32+23, 32+24, 3 - stxvx 32+17, 3, 9 - stxvx 32+22, 3, 16 - stxvx 32+27, 3, 18 - stxvx 32+29, 3, 20 -.endm - -.macro Load_next_4zetas - lxv 32+V_Z0, 0(14) - lxv 32+V_Z1, 16(14) - lxv 32+V_Z2, 32(14) - lxv 32+V_Z3, 48(14) - addi 14, 14, 64 -.endm - -# -# mlk_ntt_ppc(int16_t *r) -# +.balign 16 .global MLK_ASM_NAMESPACE(ntt_ppc) -.align 4 MLK_ASM_FN_SYMBOL(ntt_ppc) - stdu 1, -352(1) - mflr 0 - std 14, 56(1) - std 15, 64(1) - std 16, 72(1) - std 17, 80(1) - std 18, 88(1) - std 19, 96(1) - std 20, 104(1) - std 21, 112(1) - stxv 32+20, 128(1) - stxv 32+21, 144(1) - stxv 32+22, 160(1) - stxv 32+23, 176(1) - stxv 32+24, 192(1) - stxv 32+25, 208(1) - stxv 32+26, 224(1) - stxv 32+27, 240(1) - stxv 32+28, 256(1) - stxv 32+29, 272(1) - stxv 32+30, 288(1) - stxv 32+31, 304(1) - - # get MLKEM_Q - lvx V_NMKQ,0,4 - - # zetas array - addi 14, 4, ZETA_NTT_OFFSET - - vxor 3, 3, 3 + .cfi_startproc + stdu 1, -352(1) + mflr 0 + std 14, 56(1) + std 15, 64(1) + std 16, 72(1) + std 17, 80(1) + std 18, 88(1) + std 19, 96(1) + std 20, 104(1) + std 21, 112(1) + stxv 52, 128(1) + stxv 53, 144(1) + stxv 54, 160(1) + stxv 55, 176(1) + stxv 56, 192(1) + stxv 57, 208(1) + stxv 58, 224(1) + stxv 59, 240(1) + stxv 60, 256(1) + stxv 61, 272(1) + stxv 62, 288(1) + stxv 63, 304(1) + lvx 5, 0, 4 + addi 14, 4, 96 + vxor 3, 3, 3 vspltish 4, 1 + lxv 34, 16(4) + ori 2, 2, 0 + li 5, 0 + li 7, 256 + lvx 10, 0, 14 + addi 14, 14, 16 + mr 9, 5 + add 10, 7, 9 + addi 16, 9, 16 + addi 17, 10, 16 + addi 18, 16, 16 + addi 19, 17, 16 + addi 20, 18, 16 + addi 21, 19, 16 + lxvd2x 45, 3, 10 + lxvd2x 50, 3, 17 + lxvd2x 55, 3, 19 + lxvd2x 60, 3, 21 + xxswapd 45, 45 + xxswapd 50, 50 + xxswapd 55, 55 + xxswapd 60, 60 + vmladduhm 15, 13, 10, 3 + vmladduhm 20, 18, 10, 3 + vmladduhm 25, 23, 10, 3 + vmladduhm 30, 28, 10, 3 + vmhraddshs 14, 13, 10, 3 + vmhraddshs 19, 18, 10, 3 + vmhraddshs 24, 23, 10, 3 + vmhraddshs 29, 28, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 25, 2, 3 + vmladduhm 30, 30, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 + lxvd2x 44, 3, 9 + lxvd2x 49, 3, 16 + lxvd2x 54, 3, 18 + lxvd2x 59, 3, 20 + xxswapd 44, 44 + xxswapd 49, 49 + xxswapd 54, 54 + xxswapd 59, 59 + vsubuhm 16, 12, 13 + vadduhm 15, 13, 12 + vsubuhm 21, 17, 18 + vadduhm 20, 18, 17 + vsubuhm 26, 22, 
23 + vadduhm 25, 23, 22 + vsubuhm 31, 27, 28 + vadduhm 30, 28, 27 + stxvx 47, 3, 9 + stxvx 48, 3, 10 + stxvx 52, 3, 16 + stxvx 53, 3, 17 + stxvx 57, 3, 18 + stxvx 58, 3, 19 + stxvx 62, 3, 20 + stxvx 63, 3, 21 + li 5, 64 + mr 9, 5 + add 10, 7, 9 + addi 16, 9, 16 + addi 17, 10, 16 + addi 18, 16, 16 + addi 19, 17, 16 + addi 20, 18, 16 + addi 21, 19, 16 + lxvd2x 45, 3, 10 + lxvd2x 50, 3, 17 + lxvd2x 55, 3, 19 + lxvd2x 60, 3, 21 + xxswapd 45, 45 + xxswapd 50, 50 + xxswapd 55, 55 + xxswapd 60, 60 + vmladduhm 15, 13, 10, 3 + vmladduhm 20, 18, 10, 3 + vmladduhm 25, 23, 10, 3 + vmladduhm 30, 28, 10, 3 + vmhraddshs 14, 13, 10, 3 + vmhraddshs 19, 18, 10, 3 + vmhraddshs 24, 23, 10, 3 + vmhraddshs 29, 28, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 25, 2, 3 + vmladduhm 30, 30, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 + lxvd2x 44, 3, 9 + lxvd2x 49, 3, 16 + lxvd2x 54, 3, 18 + lxvd2x 59, 3, 20 + xxswapd 44, 44 + xxswapd 49, 49 + xxswapd 54, 54 + xxswapd 59, 59 + vsubuhm 16, 12, 13 + vadduhm 15, 13, 12 + vsubuhm 21, 17, 18 + vadduhm 20, 18, 17 + vsubuhm 26, 22, 23 + vadduhm 25, 23, 22 + vsubuhm 31, 27, 28 + vadduhm 30, 28, 27 + stxvx 47, 3, 9 + stxvx 48, 3, 10 + stxvx 52, 3, 16 + stxvx 53, 3, 17 + stxvx 57, 3, 18 + stxvx 58, 3, 19 + stxvx 62, 3, 20 + stxvx 63, 3, 21 + li 5, 128 + mr 9, 5 + add 10, 7, 9 + addi 16, 9, 16 + addi 17, 10, 16 + addi 18, 16, 16 + addi 19, 17, 16 + addi 20, 18, 16 + addi 21, 19, 16 + lxvd2x 45, 3, 10 + lxvd2x 50, 3, 17 + lxvd2x 55, 3, 19 + lxvd2x 60, 3, 21 + xxswapd 45, 45 + xxswapd 50, 50 + xxswapd 55, 55 + xxswapd 60, 60 + vmladduhm 15, 13, 10, 3 + vmladduhm 20, 18, 10, 3 + vmladduhm 25, 23, 10, 3 + vmladduhm 30, 28, 10, 3 + vmhraddshs 14, 13, 10, 3 + vmhraddshs 19, 18, 10, 3 + vmhraddshs 24, 23, 10, 3 + vmhraddshs 29, 28, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 25, 2, 3 + vmladduhm 30, 30, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 + lxvd2x 44, 3, 9 + lxvd2x 49, 3, 16 + lxvd2x 54, 3, 18 + lxvd2x 59, 3, 20 + xxswapd 44, 44 + xxswapd 49, 49 + xxswapd 54, 54 + xxswapd 59, 59 + vsubuhm 16, 12, 13 + vadduhm 15, 13, 12 + vsubuhm 21, 17, 18 + vadduhm 20, 18, 17 + vsubuhm 26, 22, 23 + vadduhm 25, 23, 22 + vsubuhm 31, 27, 28 + vadduhm 30, 28, 27 + stxvx 47, 3, 9 + stxvx 48, 3, 10 + stxvx 52, 3, 16 + stxvx 53, 3, 17 + stxvx 57, 3, 18 + stxvx 58, 3, 19 + stxvx 62, 3, 20 + stxvx 63, 3, 21 + li 5, 192 + mr 9, 5 + add 10, 7, 9 + addi 16, 9, 16 + addi 17, 10, 16 + addi 18, 16, 16 + addi 19, 17, 16 + addi 20, 18, 16 + addi 21, 19, 16 + lxvd2x 45, 3, 10 + lxvd2x 50, 3, 17 + lxvd2x 55, 3, 19 + lxvd2x 60, 3, 21 + xxswapd 45, 45 + xxswapd 50, 50 + xxswapd 55, 55 + xxswapd 60, 60 + vmladduhm 15, 13, 10, 3 + vmladduhm 20, 18, 10, 3 + vmladduhm 25, 23, 10, 3 + vmladduhm 30, 28, 10, 3 + vmhraddshs 14, 13, 10, 3 + vmhraddshs 19, 18, 10, 3 + vmhraddshs 24, 23, 10, 3 + vmhraddshs 29, 28, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 25, 2, 3 + vmladduhm 30, 30, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 + lxvd2x 44, 3, 9 + lxvd2x 49, 3, 16 + lxvd2x 54, 3, 18 + lxvd2x 59, 3, 20 + xxswapd 44, 44 + 
xxswapd 49, 49 + xxswapd 54, 54 + xxswapd 59, 59 + vsubuhm 16, 12, 13 + vadduhm 15, 13, 12 + vsubuhm 21, 17, 18 + vadduhm 20, 18, 17 + vsubuhm 26, 22, 23 + vadduhm 25, 23, 22 + vsubuhm 31, 27, 28 + vadduhm 30, 28, 27 + stxvx 47, 3, 9 + stxvx 48, 3, 10 + stxvx 52, 3, 16 + stxvx 53, 3, 17 + stxvx 57, 3, 18 + stxvx 58, 3, 19 + stxvx 62, 3, 20 + stxvx 63, 3, 21 + ori 2, 2, 0 + li 5, 0 + li 7, 128 + lvx 10, 0, 14 + addi 14, 14, 16 + mr 9, 5 + add 10, 7, 9 + addi 16, 9, 16 + addi 17, 10, 16 + addi 18, 16, 16 + addi 19, 17, 16 + addi 20, 18, 16 + addi 21, 19, 16 + lxvd2x 45, 3, 10 + lxvd2x 50, 3, 17 + lxvd2x 55, 3, 19 + lxvd2x 60, 3, 21 + xxswapd 45, 45 + xxswapd 50, 50 + xxswapd 55, 55 + xxswapd 60, 60 + vmladduhm 15, 13, 10, 3 + vmladduhm 20, 18, 10, 3 + vmladduhm 25, 23, 10, 3 + vmladduhm 30, 28, 10, 3 + vmhraddshs 14, 13, 10, 3 + vmhraddshs 19, 18, 10, 3 + vmhraddshs 24, 23, 10, 3 + vmhraddshs 29, 28, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 25, 2, 3 + vmladduhm 30, 30, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 + lxvd2x 44, 3, 9 + lxvd2x 49, 3, 16 + lxvd2x 54, 3, 18 + lxvd2x 59, 3, 20 + xxswapd 44, 44 + xxswapd 49, 49 + xxswapd 54, 54 + xxswapd 59, 59 + vsubuhm 16, 12, 13 + vadduhm 15, 13, 12 + vsubuhm 21, 17, 18 + vadduhm 20, 18, 17 + vsubuhm 26, 22, 23 + vadduhm 25, 23, 22 + vsubuhm 31, 27, 28 + vadduhm 30, 28, 27 + stxvx 47, 3, 9 + stxvx 48, 3, 10 + stxvx 52, 3, 16 + stxvx 53, 3, 17 + stxvx 57, 3, 18 + stxvx 58, 3, 19 + stxvx 62, 3, 20 + stxvx 63, 3, 21 + li 5, 64 + mr 9, 5 + add 10, 7, 9 + addi 16, 9, 16 + addi 17, 10, 16 + addi 18, 16, 16 + addi 19, 17, 16 + addi 20, 18, 16 + addi 21, 19, 16 + lxvd2x 45, 3, 10 + lxvd2x 50, 3, 17 + lxvd2x 55, 3, 19 + lxvd2x 60, 3, 21 + xxswapd 45, 45 + xxswapd 50, 50 + xxswapd 55, 55 + xxswapd 60, 60 + vmladduhm 15, 13, 10, 3 + vmladduhm 20, 18, 10, 3 + vmladduhm 25, 23, 10, 3 + vmladduhm 30, 28, 10, 3 + vmhraddshs 14, 13, 10, 3 + vmhraddshs 19, 18, 10, 3 + vmhraddshs 24, 23, 10, 3 + vmhraddshs 29, 28, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 25, 2, 3 + vmladduhm 30, 30, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 + lxvd2x 44, 3, 9 + lxvd2x 49, 3, 16 + lxvd2x 54, 3, 18 + lxvd2x 59, 3, 20 + xxswapd 44, 44 + xxswapd 49, 49 + xxswapd 54, 54 + xxswapd 59, 59 + vsubuhm 16, 12, 13 + vadduhm 15, 13, 12 + vsubuhm 21, 17, 18 + vadduhm 20, 18, 17 + vsubuhm 26, 22, 23 + vadduhm 25, 23, 22 + vsubuhm 31, 27, 28 + vadduhm 30, 28, 27 + stxvx 47, 3, 9 + stxvx 48, 3, 10 + stxvx 52, 3, 16 + stxvx 53, 3, 17 + stxvx 57, 3, 18 + stxvx 58, 3, 19 + stxvx 62, 3, 20 + stxvx 63, 3, 21 + li 5, 256 + lvx 10, 0, 14 + addi 14, 14, 16 + mr 9, 5 + add 10, 7, 9 + addi 16, 9, 16 + addi 17, 10, 16 + addi 18, 16, 16 + addi 19, 17, 16 + addi 20, 18, 16 + addi 21, 19, 16 + lxvd2x 45, 3, 10 + lxvd2x 50, 3, 17 + lxvd2x 55, 3, 19 + lxvd2x 60, 3, 21 + xxswapd 45, 45 + xxswapd 50, 50 + xxswapd 55, 55 + xxswapd 60, 60 + vmladduhm 15, 13, 10, 3 + vmladduhm 20, 18, 10, 3 + vmladduhm 25, 23, 10, 3 + vmladduhm 30, 28, 10, 3 + vmhraddshs 14, 13, 10, 3 + vmhraddshs 19, 18, 10, 3 + vmhraddshs 24, 23, 10, 3 + vmhraddshs 29, 28, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 25, 2, 3 + vmladduhm 30, 30, 2, 3 + vmhraddshs 15, 15, 5, 14 + 
vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 + lxvd2x 44, 3, 9 + lxvd2x 49, 3, 16 + lxvd2x 54, 3, 18 + lxvd2x 59, 3, 20 + xxswapd 44, 44 + xxswapd 49, 49 + xxswapd 54, 54 + xxswapd 59, 59 + vsubuhm 16, 12, 13 + vadduhm 15, 13, 12 + vsubuhm 21, 17, 18 + vadduhm 20, 18, 17 + vsubuhm 26, 22, 23 + vadduhm 25, 23, 22 + vsubuhm 31, 27, 28 + vadduhm 30, 28, 27 + stxvx 47, 3, 9 + stxvx 48, 3, 10 + stxvx 52, 3, 16 + stxvx 53, 3, 17 + stxvx 57, 3, 18 + stxvx 58, 3, 19 + stxvx 62, 3, 20 + stxvx 63, 3, 21 + li 5, 320 + mr 9, 5 + add 10, 7, 9 + addi 16, 9, 16 + addi 17, 10, 16 + addi 18, 16, 16 + addi 19, 17, 16 + addi 20, 18, 16 + addi 21, 19, 16 + lxvd2x 45, 3, 10 + lxvd2x 50, 3, 17 + lxvd2x 55, 3, 19 + lxvd2x 60, 3, 21 + xxswapd 45, 45 + xxswapd 50, 50 + xxswapd 55, 55 + xxswapd 60, 60 + vmladduhm 15, 13, 10, 3 + vmladduhm 20, 18, 10, 3 + vmladduhm 25, 23, 10, 3 + vmladduhm 30, 28, 10, 3 + vmhraddshs 14, 13, 10, 3 + vmhraddshs 19, 18, 10, 3 + vmhraddshs 24, 23, 10, 3 + vmhraddshs 29, 28, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 25, 2, 3 + vmladduhm 30, 30, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 + lxvd2x 44, 3, 9 + lxvd2x 49, 3, 16 + lxvd2x 54, 3, 18 + lxvd2x 59, 3, 20 + xxswapd 44, 44 + xxswapd 49, 49 + xxswapd 54, 54 + xxswapd 59, 59 + vsubuhm 16, 12, 13 + vadduhm 15, 13, 12 + vsubuhm 21, 17, 18 + vadduhm 20, 18, 17 + vsubuhm 26, 22, 23 + vadduhm 25, 23, 22 + vsubuhm 31, 27, 28 + vadduhm 30, 28, 27 + stxvx 47, 3, 9 + stxvx 48, 3, 10 + stxvx 52, 3, 16 + stxvx 53, 3, 17 + stxvx 57, 3, 18 + stxvx 58, 3, 19 + stxvx 62, 3, 20 + stxvx 63, 3, 21 + nop + nop + ori 2, 2, 0 + li 5, 0 + li 7, 64 + lvx 10, 0, 14 + addi 14, 14, 16 + mr 9, 5 + add 10, 7, 9 + addi 16, 9, 16 + addi 17, 10, 16 + addi 18, 16, 16 + addi 19, 17, 16 + addi 20, 18, 16 + addi 21, 19, 16 + lxvd2x 45, 3, 10 + lxvd2x 50, 3, 17 + lxvd2x 55, 3, 19 + lxvd2x 60, 3, 21 + xxswapd 45, 45 + xxswapd 50, 50 + xxswapd 55, 55 + xxswapd 60, 60 + vmladduhm 15, 13, 10, 3 + vmladduhm 20, 18, 10, 3 + vmladduhm 25, 23, 10, 3 + vmladduhm 30, 28, 10, 3 + vmhraddshs 14, 13, 10, 3 + vmhraddshs 19, 18, 10, 3 + vmhraddshs 24, 23, 10, 3 + vmhraddshs 29, 28, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 25, 2, 3 + vmladduhm 30, 30, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 + lxvd2x 44, 3, 9 + lxvd2x 49, 3, 16 + lxvd2x 54, 3, 18 + lxvd2x 59, 3, 20 + xxswapd 44, 44 + xxswapd 49, 49 + xxswapd 54, 54 + xxswapd 59, 59 + vsubuhm 16, 12, 13 + vadduhm 15, 13, 12 + vsubuhm 21, 17, 18 + vadduhm 20, 18, 17 + vsubuhm 26, 22, 23 + vadduhm 25, 23, 22 + vsubuhm 31, 27, 28 + vadduhm 30, 28, 27 + stxvx 47, 3, 9 + stxvx 48, 3, 10 + stxvx 52, 3, 16 + stxvx 53, 3, 17 + stxvx 57, 3, 18 + stxvx 58, 3, 19 + stxvx 62, 3, 20 + stxvx 63, 3, 21 + li 5, 128 + lvx 10, 0, 14 + addi 14, 14, 16 + mr 9, 5 + add 10, 7, 9 + addi 16, 9, 16 + addi 17, 10, 16 + addi 18, 16, 16 + addi 19, 17, 16 + addi 20, 18, 16 + addi 21, 19, 16 + lxvd2x 45, 3, 10 + lxvd2x 50, 3, 17 + lxvd2x 55, 3, 19 + lxvd2x 60, 3, 21 + xxswapd 45, 45 + xxswapd 50, 50 + xxswapd 55, 55 + xxswapd 60, 60 + vmladduhm 15, 13, 10, 3 + vmladduhm 20, 18, 10, 3 + vmladduhm 25, 23, 10, 3 + vmladduhm 
30, 28, 10, 3 + vmhraddshs 14, 13, 10, 3 + vmhraddshs 19, 18, 10, 3 + vmhraddshs 24, 23, 10, 3 + vmhraddshs 29, 28, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 25, 2, 3 + vmladduhm 30, 30, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 + lxvd2x 44, 3, 9 + lxvd2x 49, 3, 16 + lxvd2x 54, 3, 18 + lxvd2x 59, 3, 20 + xxswapd 44, 44 + xxswapd 49, 49 + xxswapd 54, 54 + xxswapd 59, 59 + vsubuhm 16, 12, 13 + vadduhm 15, 13, 12 + vsubuhm 21, 17, 18 + vadduhm 20, 18, 17 + vsubuhm 26, 22, 23 + vadduhm 25, 23, 22 + vsubuhm 31, 27, 28 + vadduhm 30, 28, 27 + stxvx 47, 3, 9 + stxvx 48, 3, 10 + stxvx 52, 3, 16 + stxvx 53, 3, 17 + stxvx 57, 3, 18 + stxvx 58, 3, 19 + stxvx 62, 3, 20 + stxvx 63, 3, 21 + li 5, 256 + lvx 10, 0, 14 + addi 14, 14, 16 + mr 9, 5 + add 10, 7, 9 + addi 16, 9, 16 + addi 17, 10, 16 + addi 18, 16, 16 + addi 19, 17, 16 + addi 20, 18, 16 + addi 21, 19, 16 + lxvd2x 45, 3, 10 + lxvd2x 50, 3, 17 + lxvd2x 55, 3, 19 + lxvd2x 60, 3, 21 + xxswapd 45, 45 + xxswapd 50, 50 + xxswapd 55, 55 + xxswapd 60, 60 + vmladduhm 15, 13, 10, 3 + vmladduhm 20, 18, 10, 3 + vmladduhm 25, 23, 10, 3 + vmladduhm 30, 28, 10, 3 + vmhraddshs 14, 13, 10, 3 + vmhraddshs 19, 18, 10, 3 + vmhraddshs 24, 23, 10, 3 + vmhraddshs 29, 28, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 25, 2, 3 + vmladduhm 30, 30, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 + lxvd2x 44, 3, 9 + lxvd2x 49, 3, 16 + lxvd2x 54, 3, 18 + lxvd2x 59, 3, 20 + xxswapd 44, 44 + xxswapd 49, 49 + xxswapd 54, 54 + xxswapd 59, 59 + vsubuhm 16, 12, 13 + vadduhm 15, 13, 12 + vsubuhm 21, 17, 18 + vadduhm 20, 18, 17 + vsubuhm 26, 22, 23 + vadduhm 25, 23, 22 + vsubuhm 31, 27, 28 + vadduhm 30, 28, 27 + stxvx 47, 3, 9 + stxvx 48, 3, 10 + stxvx 52, 3, 16 + stxvx 53, 3, 17 + stxvx 57, 3, 18 + stxvx 58, 3, 19 + stxvx 62, 3, 20 + stxvx 63, 3, 21 + li 5, 384 + lvx 10, 0, 14 + addi 14, 14, 16 + mr 9, 5 + add 10, 7, 9 + addi 16, 9, 16 + addi 17, 10, 16 + addi 18, 16, 16 + addi 19, 17, 16 + addi 20, 18, 16 + addi 21, 19, 16 + lxvd2x 45, 3, 10 + lxvd2x 50, 3, 17 + lxvd2x 55, 3, 19 + lxvd2x 60, 3, 21 + xxswapd 45, 45 + xxswapd 50, 50 + xxswapd 55, 55 + xxswapd 60, 60 + vmladduhm 15, 13, 10, 3 + vmladduhm 20, 18, 10, 3 + vmladduhm 25, 23, 10, 3 + vmladduhm 30, 28, 10, 3 + vmhraddshs 14, 13, 10, 3 + vmhraddshs 19, 18, 10, 3 + vmhraddshs 24, 23, 10, 3 + vmhraddshs 29, 28, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 25, 2, 3 + vmladduhm 30, 30, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 + lxvd2x 44, 3, 9 + lxvd2x 49, 3, 16 + lxvd2x 54, 3, 18 + lxvd2x 59, 3, 20 + xxswapd 44, 44 + xxswapd 49, 49 + xxswapd 54, 54 + xxswapd 59, 59 + vsubuhm 16, 12, 13 + vadduhm 15, 13, 12 + vsubuhm 21, 17, 18 + vadduhm 20, 18, 17 + vsubuhm 26, 22, 23 + vadduhm 25, 23, 22 + vsubuhm 31, 27, 28 + vadduhm 30, 28, 27 + stxvx 47, 3, 9 + stxvx 48, 3, 10 + stxvx 52, 3, 16 + stxvx 53, 3, 17 + stxvx 57, 3, 18 + stxvx 58, 3, 19 + stxvx 62, 3, 20 + stxvx 63, 3, 21 + nop + nop + ori 2, 2, 0 + li 5, 0 + li 7, 32 + lxv 39, 0(14) + lxv 40, 16(14) + lxv 41, 32(14) + lxv 42, 48(14) + addi 14, 14, 64 + mr 9, 5 + add 10, 7, 9 + 
addi 16, 9, 64 + addi 17, 10, 64 + addi 18, 16, 64 + addi 19, 17, 64 + addi 20, 18, 64 + addi 21, 19, 64 + lxvd2x 45, 3, 10 + lxvd2x 50, 3, 17 + lxvd2x 55, 3, 19 + lxvd2x 60, 3, 21 + xxswapd 45, 45 + xxswapd 50, 50 + xxswapd 55, 55 + xxswapd 60, 60 + vmladduhm 15, 13, 7, 3 + vmladduhm 20, 18, 8, 3 + vmladduhm 25, 23, 9, 3 + vmladduhm 30, 28, 10, 3 + vmhraddshs 14, 13, 7, 3 + vmhraddshs 19, 18, 8, 3 + vmhraddshs 24, 23, 9, 3 + vmhraddshs 29, 28, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 25, 2, 3 + vmladduhm 30, 30, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 + lxvd2x 44, 3, 9 + lxvd2x 49, 3, 16 + lxvd2x 54, 3, 18 + lxvd2x 59, 3, 20 + xxswapd 44, 44 + xxswapd 49, 49 + xxswapd 54, 54 + xxswapd 59, 59 + vsubuhm 16, 12, 13 + vadduhm 15, 13, 12 + vsubuhm 21, 17, 18 + vadduhm 20, 18, 17 + vsubuhm 26, 22, 23 + vadduhm 25, 23, 22 + vsubuhm 31, 27, 28 + vadduhm 30, 28, 27 + stxvx 47, 3, 9 + stxvx 48, 3, 10 + stxvx 52, 3, 16 + stxvx 53, 3, 17 + stxvx 57, 3, 18 + stxvx 58, 3, 19 + stxvx 62, 3, 20 + stxvx 63, 3, 21 + li 5, 16 + mr 9, 5 + add 10, 7, 9 + addi 16, 9, 64 + addi 17, 10, 64 + addi 18, 16, 64 + addi 19, 17, 64 + addi 20, 18, 64 + addi 21, 19, 64 + lxvd2x 45, 3, 10 + lxvd2x 50, 3, 17 + lxvd2x 55, 3, 19 + lxvd2x 60, 3, 21 + xxswapd 45, 45 + xxswapd 50, 50 + xxswapd 55, 55 + xxswapd 60, 60 + vmladduhm 15, 13, 7, 3 + vmladduhm 20, 18, 8, 3 + vmladduhm 25, 23, 9, 3 + vmladduhm 30, 28, 10, 3 + vmhraddshs 14, 13, 7, 3 + vmhraddshs 19, 18, 8, 3 + vmhraddshs 24, 23, 9, 3 + vmhraddshs 29, 28, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 25, 2, 3 + vmladduhm 30, 30, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 + lxvd2x 44, 3, 9 + lxvd2x 49, 3, 16 + lxvd2x 54, 3, 18 + lxvd2x 59, 3, 20 + xxswapd 44, 44 + xxswapd 49, 49 + xxswapd 54, 54 + xxswapd 59, 59 + vsubuhm 16, 12, 13 + vadduhm 15, 13, 12 + vsubuhm 21, 17, 18 + vadduhm 20, 18, 17 + vsubuhm 26, 22, 23 + vadduhm 25, 23, 22 + vsubuhm 31, 27, 28 + vadduhm 30, 28, 27 + stxvx 47, 3, 9 + stxvx 48, 3, 10 + stxvx 52, 3, 16 + stxvx 53, 3, 17 + stxvx 57, 3, 18 + stxvx 58, 3, 19 + stxvx 62, 3, 20 + stxvx 63, 3, 21 + lxv 39, 0(14) + lxv 40, 16(14) + lxv 41, 32(14) + lxv 42, 48(14) + addi 14, 14, 64 + li 5, 256 + mr 9, 5 + add 10, 7, 9 + addi 16, 9, 64 + addi 17, 10, 64 + addi 18, 16, 64 + addi 19, 17, 64 + addi 20, 18, 64 + addi 21, 19, 64 + lxvd2x 45, 3, 10 + lxvd2x 50, 3, 17 + lxvd2x 55, 3, 19 + lxvd2x 60, 3, 21 + xxswapd 45, 45 + xxswapd 50, 50 + xxswapd 55, 55 + xxswapd 60, 60 + vmladduhm 15, 13, 7, 3 + vmladduhm 20, 18, 8, 3 + vmladduhm 25, 23, 9, 3 + vmladduhm 30, 28, 10, 3 + vmhraddshs 14, 13, 7, 3 + vmhraddshs 19, 18, 8, 3 + vmhraddshs 24, 23, 9, 3 + vmhraddshs 29, 28, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 25, 2, 3 + vmladduhm 30, 30, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 + lxvd2x 44, 3, 9 + lxvd2x 49, 3, 16 + lxvd2x 54, 3, 18 + lxvd2x 59, 3, 20 + xxswapd 44, 44 + xxswapd 49, 49 + xxswapd 54, 54 + xxswapd 59, 59 + vsubuhm 16, 12, 13 + vadduhm 15, 13, 12 + vsubuhm 21, 17, 18 + vadduhm 20, 18, 17 + vsubuhm 26, 22, 23 + vadduhm 25, 23, 22 + vsubuhm 
31, 27, 28 + vadduhm 30, 28, 27 + stxvx 47, 3, 9 + stxvx 48, 3, 10 + stxvx 52, 3, 16 + stxvx 53, 3, 17 + stxvx 57, 3, 18 + stxvx 58, 3, 19 + stxvx 62, 3, 20 + stxvx 63, 3, 21 + li 5, 272 + mr 9, 5 + add 10, 7, 9 + addi 16, 9, 64 + addi 17, 10, 64 + addi 18, 16, 64 + addi 19, 17, 64 + addi 20, 18, 64 + addi 21, 19, 64 + lxvd2x 45, 3, 10 + lxvd2x 50, 3, 17 + lxvd2x 55, 3, 19 + lxvd2x 60, 3, 21 + xxswapd 45, 45 + xxswapd 50, 50 + xxswapd 55, 55 + xxswapd 60, 60 + vmladduhm 15, 13, 7, 3 + vmladduhm 20, 18, 8, 3 + vmladduhm 25, 23, 9, 3 + vmladduhm 30, 28, 10, 3 + vmhraddshs 14, 13, 7, 3 + vmhraddshs 19, 18, 8, 3 + vmhraddshs 24, 23, 9, 3 + vmhraddshs 29, 28, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 25, 2, 3 + vmladduhm 30, 30, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 + lxvd2x 44, 3, 9 + lxvd2x 49, 3, 16 + lxvd2x 54, 3, 18 + lxvd2x 59, 3, 20 + xxswapd 44, 44 + xxswapd 49, 49 + xxswapd 54, 54 + xxswapd 59, 59 + vsubuhm 16, 12, 13 + vadduhm 15, 13, 12 + vsubuhm 21, 17, 18 + vadduhm 20, 18, 17 + vsubuhm 26, 22, 23 + vadduhm 25, 23, 22 + vsubuhm 31, 27, 28 + vadduhm 30, 28, 27 + stxvx 47, 3, 9 + stxvx 48, 3, 10 + stxvx 52, 3, 16 + stxvx 53, 3, 17 + stxvx 57, 3, 18 + stxvx 58, 3, 19 + stxvx 62, 3, 20 + stxvx 63, 3, 21 + ori 2, 2, 0 + li 5, 0 + li 7, 16 + lxv 39, 0(14) + lxv 40, 16(14) + lxv 41, 32(14) + lxv 42, 48(14) + addi 14, 14, 64 + mr 9, 5 + add 10, 7, 9 + addi 16, 9, 32 + addi 17, 10, 32 + addi 18, 16, 32 + addi 19, 17, 32 + addi 20, 18, 32 + addi 21, 19, 32 + lxvd2x 45, 3, 10 + lxvd2x 50, 3, 17 + lxvd2x 55, 3, 19 + lxvd2x 60, 3, 21 + xxswapd 45, 45 + xxswapd 50, 50 + xxswapd 55, 55 + xxswapd 60, 60 + vmladduhm 15, 13, 7, 3 + vmladduhm 20, 18, 8, 3 + vmladduhm 25, 23, 9, 3 + vmladduhm 30, 28, 10, 3 + vmhraddshs 14, 13, 7, 3 + vmhraddshs 19, 18, 8, 3 + vmhraddshs 24, 23, 9, 3 + vmhraddshs 29, 28, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 25, 2, 3 + vmladduhm 30, 30, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 + lxvd2x 44, 3, 9 + lxvd2x 49, 3, 16 + lxvd2x 54, 3, 18 + lxvd2x 59, 3, 20 + xxswapd 44, 44 + xxswapd 49, 49 + xxswapd 54, 54 + xxswapd 59, 59 + vsubuhm 16, 12, 13 + vadduhm 15, 13, 12 + vsubuhm 21, 17, 18 + vadduhm 20, 18, 17 + vsubuhm 26, 22, 23 + vadduhm 25, 23, 22 + vsubuhm 31, 27, 28 + vadduhm 30, 28, 27 + stxvx 47, 3, 9 + stxvx 48, 3, 10 + stxvx 52, 3, 16 + stxvx 53, 3, 17 + stxvx 57, 3, 18 + stxvx 58, 3, 19 + stxvx 62, 3, 20 + stxvx 63, 3, 21 + li 5, 128 + lxv 39, 0(14) + lxv 40, 16(14) + lxv 41, 32(14) + lxv 42, 48(14) + addi 14, 14, 64 + mr 9, 5 + add 10, 7, 9 + addi 16, 9, 32 + addi 17, 10, 32 + addi 18, 16, 32 + addi 19, 17, 32 + addi 20, 18, 32 + addi 21, 19, 32 + lxvd2x 45, 3, 10 + lxvd2x 50, 3, 17 + lxvd2x 55, 3, 19 + lxvd2x 60, 3, 21 + xxswapd 45, 45 + xxswapd 50, 50 + xxswapd 55, 55 + xxswapd 60, 60 + vmladduhm 15, 13, 7, 3 + vmladduhm 20, 18, 8, 3 + vmladduhm 25, 23, 9, 3 + vmladduhm 30, 28, 10, 3 + vmhraddshs 14, 13, 7, 3 + vmhraddshs 19, 18, 8, 3 + vmhraddshs 24, 23, 9, 3 + vmhraddshs 29, 28, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 25, 2, 3 + vmladduhm 30, 30, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 
18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 + lxvd2x 44, 3, 9 + lxvd2x 49, 3, 16 + lxvd2x 54, 3, 18 + lxvd2x 59, 3, 20 + xxswapd 44, 44 + xxswapd 49, 49 + xxswapd 54, 54 + xxswapd 59, 59 + vsubuhm 16, 12, 13 + vadduhm 15, 13, 12 + vsubuhm 21, 17, 18 + vadduhm 20, 18, 17 + vsubuhm 26, 22, 23 + vadduhm 25, 23, 22 + vsubuhm 31, 27, 28 + vadduhm 30, 28, 27 + stxvx 47, 3, 9 + stxvx 48, 3, 10 + stxvx 52, 3, 16 + stxvx 53, 3, 17 + stxvx 57, 3, 18 + stxvx 58, 3, 19 + stxvx 62, 3, 20 + stxvx 63, 3, 21 + li 5, 256 + lxv 39, 0(14) + lxv 40, 16(14) + lxv 41, 32(14) + lxv 42, 48(14) + addi 14, 14, 64 + mr 9, 5 + add 10, 7, 9 + addi 16, 9, 32 + addi 17, 10, 32 + addi 18, 16, 32 + addi 19, 17, 32 + addi 20, 18, 32 + addi 21, 19, 32 + lxvd2x 45, 3, 10 + lxvd2x 50, 3, 17 + lxvd2x 55, 3, 19 + lxvd2x 60, 3, 21 + xxswapd 45, 45 + xxswapd 50, 50 + xxswapd 55, 55 + xxswapd 60, 60 + vmladduhm 15, 13, 7, 3 + vmladduhm 20, 18, 8, 3 + vmladduhm 25, 23, 9, 3 + vmladduhm 30, 28, 10, 3 + vmhraddshs 14, 13, 7, 3 + vmhraddshs 19, 18, 8, 3 + vmhraddshs 24, 23, 9, 3 + vmhraddshs 29, 28, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 25, 2, 3 + vmladduhm 30, 30, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 + lxvd2x 44, 3, 9 + lxvd2x 49, 3, 16 + lxvd2x 54, 3, 18 + lxvd2x 59, 3, 20 + xxswapd 44, 44 + xxswapd 49, 49 + xxswapd 54, 54 + xxswapd 59, 59 + vsubuhm 16, 12, 13 + vadduhm 15, 13, 12 + vsubuhm 21, 17, 18 + vadduhm 20, 18, 17 + vsubuhm 26, 22, 23 + vadduhm 25, 23, 22 + vsubuhm 31, 27, 28 + vadduhm 30, 28, 27 + stxvx 47, 3, 9 + stxvx 48, 3, 10 + stxvx 52, 3, 16 + stxvx 53, 3, 17 + stxvx 57, 3, 18 + stxvx 58, 3, 19 + stxvx 62, 3, 20 + stxvx 63, 3, 21 + li 5, 384 + lxv 39, 0(14) + lxv 40, 16(14) + lxv 41, 32(14) + lxv 42, 48(14) + addi 14, 14, 64 + mr 9, 5 + add 10, 7, 9 + addi 16, 9, 32 + addi 17, 10, 32 + addi 18, 16, 32 + addi 19, 17, 32 + addi 20, 18, 32 + addi 21, 19, 32 + lxvd2x 45, 3, 10 + lxvd2x 50, 3, 17 + lxvd2x 55, 3, 19 + lxvd2x 60, 3, 21 + xxswapd 45, 45 + xxswapd 50, 50 + xxswapd 55, 55 + xxswapd 60, 60 + vmladduhm 15, 13, 7, 3 + vmladduhm 20, 18, 8, 3 + vmladduhm 25, 23, 9, 3 + vmladduhm 30, 28, 10, 3 + vmhraddshs 14, 13, 7, 3 + vmhraddshs 19, 18, 8, 3 + vmhraddshs 24, 23, 9, 3 + vmhraddshs 29, 28, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 25, 2, 3 + vmladduhm 30, 30, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 + lxvd2x 44, 3, 9 + lxvd2x 49, 3, 16 + lxvd2x 54, 3, 18 + lxvd2x 59, 3, 20 + xxswapd 44, 44 + xxswapd 49, 49 + xxswapd 54, 54 + xxswapd 59, 59 + vsubuhm 16, 12, 13 + vadduhm 15, 13, 12 + vsubuhm 21, 17, 18 + vadduhm 20, 18, 17 + vsubuhm 26, 22, 23 + vadduhm 25, 23, 22 + vsubuhm 31, 27, 28 + vadduhm 30, 28, 27 + stxvx 47, 3, 9 + stxvx 48, 3, 10 + stxvx 52, 3, 16 + stxvx 53, 3, 17 + stxvx 57, 3, 18 + stxvx 58, 3, 19 + stxvx 62, 3, 20 + stxvx 63, 3, 21 + li 15, 4 + mtctr 15 + li 5, 0 + li 7, 8 + nop + nop + ori 2, 2, 0 - lxv 32+V_QINV, QINV_OFFSET(4) - -.align 4 -#__Len128: - # - # Compute coefficients of the NTT based on the following loop. - # for (len = 128; len ≥ 2; len = len/2) - # - # 1. 
len = 128, start = 0 - # - li 5, 0 # start - li 7, 256 # len * 2 - lvx V_ZETA, 0, 14 - addi 14, 14, 16 - - MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA - Write_One - li 5, 64 - MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA - Write_One - li 5, 128 - MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA - Write_One - li 5, 192 - MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA - Write_One +Lntt_ppc__Len4: + lxv 39, 0(14) + lxv 40, 16(14) + lxv 41, 32(14) + lxv 42, 48(14) + addi 14, 14, 64 + mr 9, 5 + add 10, 7, 9 + addi 16, 9, 16 + addi 17, 10, 16 + addi 18, 16, 16 + addi 19, 17, 16 + addi 20, 18, 16 + addi 21, 19, 16 + lxvd2x 45, 3, 10 + lxvd2x 50, 3, 17 + lxvd2x 55, 3, 19 + lxvd2x 60, 3, 21 + xxswapd 45, 45 + xxswapd 50, 50 + xxswapd 55, 55 + xxswapd 60, 60 + vmladduhm 15, 13, 7, 3 + vmladduhm 20, 18, 8, 3 + vmladduhm 25, 23, 9, 3 + vmladduhm 30, 28, 10, 3 + vmhraddshs 14, 13, 7, 3 + vmhraddshs 19, 18, 8, 3 + vmhraddshs 24, 23, 9, 3 + vmhraddshs 29, 28, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 25, 2, 3 + vmladduhm 30, 30, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 + lxvd2x 44, 3, 9 + lxvd2x 49, 3, 16 + lxvd2x 54, 3, 18 + lxvd2x 59, 3, 20 + xxswapd 44, 44 + xxswapd 49, 49 + xxswapd 54, 54 + xxswapd 59, 59 + vsubuhm 16, 12, 13 + vadduhm 15, 13, 12 + vsubuhm 21, 17, 18 + vadduhm 20, 18, 17 + vsubuhm 26, 22, 23 + vadduhm 25, 23, 22 + vsubuhm 31, 27, 28 + vadduhm 30, 28, 27 + xxmrgld 49, 48, 47 + xxmrgld 54, 53, 52 + xxmrgld 59, 58, 57 + xxmrgld 61, 63, 62 + stxvx 49, 3, 9 + stxvx 54, 3, 16 + stxvx 59, 3, 18 + stxvx 61, 3, 20 + addi 5, 5, 64 + lxv 39, 0(14) + lxv 40, 16(14) + lxv 41, 32(14) + lxv 42, 48(14) + addi 14, 14, 64 + mr 9, 5 + add 10, 7, 9 + addi 16, 9, 16 + addi 17, 10, 16 + addi 18, 16, 16 + addi 19, 17, 16 + addi 20, 18, 16 + addi 21, 19, 16 + lxvd2x 45, 3, 10 + lxvd2x 50, 3, 17 + lxvd2x 55, 3, 19 + lxvd2x 60, 3, 21 + xxswapd 45, 45 + xxswapd 50, 50 + xxswapd 55, 55 + xxswapd 60, 60 + vmladduhm 15, 13, 7, 3 + vmladduhm 20, 18, 8, 3 + vmladduhm 25, 23, 9, 3 + vmladduhm 30, 28, 10, 3 + vmhraddshs 14, 13, 7, 3 + vmhraddshs 19, 18, 8, 3 + vmhraddshs 24, 23, 9, 3 + vmhraddshs 29, 28, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 25, 2, 3 + vmladduhm 30, 30, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 + lxvd2x 44, 3, 9 + lxvd2x 49, 3, 16 + lxvd2x 54, 3, 18 + lxvd2x 59, 3, 20 + xxswapd 44, 44 + xxswapd 49, 49 + xxswapd 54, 54 + xxswapd 59, 59 + vsubuhm 16, 12, 13 + vadduhm 15, 13, 12 + vsubuhm 21, 17, 18 + vadduhm 20, 18, 17 + vsubuhm 26, 22, 23 + vadduhm 25, 23, 22 + vsubuhm 31, 27, 28 + vadduhm 30, 28, 27 + xxmrgld 49, 48, 47 + xxmrgld 54, 53, 52 + xxmrgld 59, 58, 57 + xxmrgld 61, 63, 62 + stxvx 49, 3, 9 + stxvx 54, 3, 16 + stxvx 59, 3, 18 + stxvx 61, 3, 20 + addi 5, 5, 64 + bdnz Lntt_ppc__Len4 + addi 14, 4, 1104 + li 15, 4 + mtctr 15 + li 5, 0 + li 7, 4 + nop + ori 2, 2, 0 -.align 4 -#__Len64: - # - # 2. 
len = 64, start = 0, 128 - # k += 2 - li 5, 0 - li 7, 128 - lvx V_ZETA, 0, 14 - addi 14, 14, 16 - MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA - Write_One - li 5, 64 - MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA - Write_One - li 5, 256 - - lvx V_ZETA, 0, 14 - addi 14, 14, 16 - MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA - Write_One - li 5, 320 - MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA - Write_One - -.align 4 -#__Len32: - # - # 3. len = 32, start = 0, 64, 128, 192 - # k += 4 - li 5, 0 - li 7, 64 - lvx V_ZETA, 0, 14 - addi 14, 14, 16 - MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA - Write_One - #li 5, 64 - li 5, 128 - - lvx V_ZETA, 0, 14 - addi 14, 14, 16 - MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA - Write_One - #li 5, 128 - li 5, 256 - - lvx V_ZETA, 0, 14 - addi 14, 14, 16 - MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA - Write_One - #li 5, 192 - li 5, 384 - - lvx V_ZETA, 0, 14 - addi 14, 14, 16 - MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA - Write_One - -.align 4 -#__Len16: - # - # 4. len = 16, start = 0, 32, 64,,...160, 192, 224 - # k += 8 - li 5, 0 - li 7, 32 - Load_next_4zetas - MREDUCE_4X 5, 64, 64, V_Z0, V_Z1, V_Z2, V_Z3 - Write_One - li 5, 16 - MREDUCE_4X 5, 64, 64, V_Z0, V_Z1, V_Z2, V_Z3 - Write_One - - Load_next_4zetas - li 5, 256 - MREDUCE_4X 5, 64, 64, V_Z0, V_Z1, V_Z2, V_Z3 - Write_One - li 5, 272 - MREDUCE_4X 5, 64, 64, V_Z0, V_Z1, V_Z2, V_Z3 - Write_One - -.align 4 -#__Len8: - # - # 5. len = 8, start = 0, 16, 32, 48,...208, 224, 240 - # k += 16 - li 5, 0 - li 7, 16 - Load_next_4zetas - MREDUCE_4X 5, 32, 32, V_Z0, V_Z1, V_Z2, V_Z3 - Write_One - li 5, 128 - - Load_next_4zetas - MREDUCE_4X 5, 32, 32, V_Z0, V_Z1, V_Z2, V_Z3 - Write_One - li 5, 256 - - Load_next_4zetas - MREDUCE_4X 5, 32, 32, V_Z0, V_Z1, V_Z2, V_Z3 - Write_One - li 5, 384 - - Load_next_4zetas - MREDUCE_4X 5, 32, 32, V_Z0, V_Z1, V_Z2, V_Z3 - Write_One - - # - # 6. len = 4, start = 0, 8, 16, 24,...232, 240, 248 - # k += 32 - li 15, 4 # loops - mtctr 15 - li 5, 0 - li 7, 8 -.align 4 -ntt_ppc__Len4: - Load_next_4zetas - MREDUCE_4X 5, 16, 16, V_Z0, V_Z1, V_Z2, V_Z3 - Write_Two - addi 5, 5, 64 - - Load_next_4zetas - MREDUCE_4X 5, 16, 16, V_Z0, V_Z1, V_Z2, V_Z3 - Write_Two - addi 5, 5, 64 - - bdnz ntt_ppc__Len4 - - # - # 7. 
len = 2, start = 0, 4, 8, 12,...244, 248, 252 - # k += 64 - # Update zetas vectors, each vector has 2 zetas - - addi 14, 4, ZETA_NTT_OFFSET64 - - li 15, 4 - mtctr 15 - li 5, 0 - li 7, 4 -.align 4 -ntt_ppc__Len2: - Load_next_4zetas - MREDUCE_4X 5, 16, 16, V_Z0, V_Z1, V_Z2, V_Z3 - Write_Three - addi 5, 5, 64 - - Load_next_4zetas - MREDUCE_4X 5, 16, 16, V_Z0, V_Z1, V_Z2, V_Z3 - Write_Three - addi 5, 5, 64 - - bdnz ntt_ppc__Len2 - - lxv 32+20, 128(1) - lxv 32+21, 144(1) - lxv 32+22, 160(1) - lxv 32+23, 176(1) - lxv 32+24, 192(1) - lxv 32+25, 208(1) - lxv 32+26, 224(1) - lxv 32+27, 240(1) - lxv 32+28, 256(1) - lxv 32+29, 272(1) - lxv 32+30, 288(1) - lxv 32+31, 304(1) - ld 14, 56(1) - ld 15, 64(1) - ld 16, 72(1) - ld 16, 72(1) - ld 17, 80(1) - ld 18, 88(1) - ld 19, 96(1) - ld 20, 104(1) - ld 21, 112(1) - - mtlr 0 - addi 1, 1, 352 +Lntt_ppc__Len2: + lxv 39, 0(14) + lxv 40, 16(14) + lxv 41, 32(14) + lxv 42, 48(14) + addi 14, 14, 64 + mr 9, 5 + add 10, 7, 9 + addi 16, 9, 16 + addi 17, 10, 16 + addi 18, 16, 16 + addi 19, 17, 16 + addi 20, 18, 16 + addi 21, 19, 16 + lxvd2x 45, 3, 10 + lxvd2x 50, 3, 17 + lxvd2x 55, 3, 19 + lxvd2x 60, 3, 21 + xxswapd 45, 45 + xxswapd 50, 50 + xxswapd 55, 55 + xxswapd 60, 60 + vmladduhm 15, 13, 7, 3 + vmladduhm 20, 18, 8, 3 + vmladduhm 25, 23, 9, 3 + vmladduhm 30, 28, 10, 3 + vmhraddshs 14, 13, 7, 3 + vmhraddshs 19, 18, 8, 3 + vmhraddshs 24, 23, 9, 3 + vmhraddshs 29, 28, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 25, 2, 3 + vmladduhm 30, 30, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 + lxvd2x 44, 3, 9 + lxvd2x 49, 3, 16 + lxvd2x 54, 3, 18 + lxvd2x 59, 3, 20 + xxswapd 44, 44 + xxswapd 49, 49 + xxswapd 54, 54 + xxswapd 59, 59 + vsubuhm 16, 12, 13 + vadduhm 15, 13, 12 + vsubuhm 21, 17, 18 + vadduhm 20, 18, 17 + vsubuhm 26, 22, 23 + vadduhm 25, 23, 22 + vsubuhm 31, 27, 28 + vadduhm 30, 28, 27 + xxmrglw 46, 48, 47 + xxmrghw 45, 48, 47 + xxmrgld 49, 45, 46 + xxmrglw 51, 53, 52 + xxmrghw 50, 53, 52 + xxmrgld 54, 50, 51 + xxmrglw 46, 58, 57 + xxmrghw 45, 58, 57 + xxmrgld 59, 45, 46 + xxmrglw 56, 63, 62 + xxmrghw 55, 63, 62 + xxmrgld 61, 55, 56 + stxvx 49, 3, 9 + stxvx 54, 3, 16 + stxvx 59, 3, 18 + stxvx 61, 3, 20 + addi 5, 5, 64 + lxv 39, 0(14) + lxv 40, 16(14) + lxv 41, 32(14) + lxv 42, 48(14) + addi 14, 14, 64 + mr 9, 5 + add 10, 7, 9 + addi 16, 9, 16 + addi 17, 10, 16 + addi 18, 16, 16 + addi 19, 17, 16 + addi 20, 18, 16 + addi 21, 19, 16 + lxvd2x 45, 3, 10 + lxvd2x 50, 3, 17 + lxvd2x 55, 3, 19 + lxvd2x 60, 3, 21 + xxswapd 45, 45 + xxswapd 50, 50 + xxswapd 55, 55 + xxswapd 60, 60 + vmladduhm 15, 13, 7, 3 + vmladduhm 20, 18, 8, 3 + vmladduhm 25, 23, 9, 3 + vmladduhm 30, 28, 10, 3 + vmhraddshs 14, 13, 7, 3 + vmhraddshs 19, 18, 8, 3 + vmhraddshs 24, 23, 9, 3 + vmhraddshs 29, 28, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 25, 2, 3 + vmladduhm 30, 30, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 + lxvd2x 44, 3, 9 + lxvd2x 49, 3, 16 + lxvd2x 54, 3, 18 + lxvd2x 59, 3, 20 + xxswapd 44, 44 + xxswapd 49, 49 + xxswapd 54, 54 + xxswapd 59, 59 + vsubuhm 16, 12, 13 + vadduhm 15, 13, 12 + vsubuhm 21, 17, 18 + vadduhm 20, 18, 17 + vsubuhm 26, 22, 23 + vadduhm 25, 23, 22 + vsubuhm 31, 27, 28 + vadduhm 30, 28, 27 + xxmrglw 46, 48, 47 + xxmrghw 45, 48, 47 + 
xxmrgld 49, 45, 46 + xxmrglw 51, 53, 52 + xxmrghw 50, 53, 52 + xxmrgld 54, 50, 51 + xxmrglw 46, 58, 57 + xxmrghw 45, 58, 57 + xxmrgld 59, 45, 46 + xxmrglw 56, 63, 62 + xxmrghw 55, 63, 62 + xxmrgld 61, 55, 56 + stxvx 49, 3, 9 + stxvx 54, 3, 16 + stxvx 59, 3, 18 + stxvx 61, 3, 20 + addi 5, 5, 64 + bdnz Lntt_ppc__Len2 + lxv 52, 128(1) + lxv 53, 144(1) + lxv 54, 160(1) + lxv 55, 176(1) + lxv 56, 192(1) + lxv 57, 208(1) + lxv 58, 224(1) + lxv 59, 240(1) + lxv 60, 256(1) + lxv 61, 272(1) + lxv 62, 288(1) + lxv 63, 304(1) + ld 14, 56(1) + ld 15, 64(1) + ld 16, 72(1) + ld 16, 72(1) + ld 17, 80(1) + ld 18, 88(1) + ld 19, 96(1) + ld 20, 104(1) + ld 21, 112(1) + mtlr 0 + addi 1, 1, 352 blr + .cfi_endproc -/* To facilitate single-compilation-unit (SCU) builds, undefine all macros. - * Don't modify by hand -- this is auto-generated by scripts/autogen. */ -#undef V_QINV -#undef V_NMKQ -#undef V_ZETA - -#endif /* MLK_ARITH_BACKEND_PPC64LE_DEFAULT && \ - !MLK_CONFIG_MULTILEVEL_NO_SHARED */ - -/* To facilitate single-compilation-unit (SCU) builds, undefine all macros. - * Don't modify by hand -- this is auto-generated by scripts/autogen. */ -#undef V_QINV -#undef V_NMKQ -#undef V_ZETA +#endif /* MLK_ARITH_BACKEND_PPC64LE && !MLK_CONFIG_MULTILEVEL_NO_SHARED */ diff --git a/mlkem/src/native/ppc64le/src/poly_tomont.S b/mlkem/src/native/ppc64le/src/poly_tomont.S index c664702db..344579c4d 100644 --- a/mlkem/src/native/ppc64le/src/poly_tomont.S +++ b/mlkem/src/native/ppc64le/src/poly_tomont.S @@ -3,163 +3,331 @@ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT */ -# -# Copyright 2025- IBM Corp. -# -#=================================================================================== -# Written by Danny Tsen -# +/* + * Copyright 2025- IBM Corp. + * + *=================================================================================== + * Written by Danny Tsen + * + */ -# Poly_tomont: Inplace conversion of all coefficients of a polynomial -# from normal domain to Montgomery domain -# -# Arguments:*r: pointer to input/output polynomial -# +// Poly_tomont: Inplace conversion of all coefficients of a polynomial +// from normal domain to Montgomery domain +// +// Arguments:*r: pointer to input/output polynomial #include "../../../common.h" -#if defined(MLK_ARITH_BACKEND_PPC64LE_DEFAULT) && \ +#if defined(MLK_ARITH_BACKEND_PPC64LE) && \ !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED) -#include "consts.h" +/* + * WARNING: This file is auto-derived from the mlkem-native source file + * dev/ppc64le/src/poly_tomont.S using scripts/simpasm. Do not modify it directly. 
+ */ -#define V1353 0 -#define V_QINV 2 -#define V_NMKQ 5 -.machine "any" .text - -# -# montgomery_reduce -# t = a * QINV -# t = (a - (int32_t)t*_MLKEM_Q) >> 16 -# -#----------------------------------- -# MREDUCE_4X(_v0, _v1, _v2, _v3) -# -.macro MREDUCE_4X _v0 _v1 _v2 _v3 - lxvd2x 32+13, 0, 3 - addi 3, 3, 16 - lxvd2x 32+18, 0, 3 - addi 3, 3, 16 - lxvd2x 32+23, 0, 3 - addi 3, 3, 16 - lxvd2x 32+7, 0, 3 - addi 3, 3, 16 - - vmladduhm 15, 13, V1353, 3 - vmladduhm 20, 18, V1353, 3 - vmladduhm 25, 23, V1353, 3 - vmladduhm 9, 7, V1353, 3 - - vmhraddshs 14, 13, V1353, 3 - vmhraddshs 19, 18, V1353, 3 - vmhraddshs 24, 23, V1353, 3 - vmhraddshs 8, 7, V1353, 3 - - vmladduhm 15, 15, V_QINV, 3 - vmladduhm 20, 20, V_QINV, 3 - vmladduhm 25, 25, V_QINV, 3 - vmladduhm 9, 9, V_QINV, 3 - - vmhraddshs 15, 15, V_NMKQ, 14 - vmhraddshs 20, 20, V_NMKQ, 19 - vmhraddshs 25, 25, V_NMKQ, 24 - vmhraddshs 9, 9, V_NMKQ, 8 - - vsrah \_v0, 15, 4 # >> 1 - vsrah \_v1, 20, 4 # >> 1 - vsrah \_v2, 25, 4 # >> 1 - vsrah \_v3, 9, 4 # >> 1 -.endm - -.macro Write_8X - stxvd2x 32+27, 4, 3 - stxvd2x 32+28, 5, 3 - stxvd2x 32+29, 6, 3 - stxvd2x 32+30, 7, 3 - stxvd2x 32+13, 8, 3 - stxvd2x 32+18, 9, 3 - stxvd2x 32+23, 10, 3 - stxvd2x 32+7, 11, 3 -.endm - -.align 4 -.globl MLK_ASM_NAMESPACE(poly_tomont_ppc) +.balign 16 +.global MLK_ASM_NAMESPACE(poly_tomont_ppc) MLK_ASM_FN_SYMBOL(poly_tomont_ppc) - stdu 1, -320(1) - mflr 0 - - stxv 32+20, 128(1) - stxv 32+21, 144(1) - stxv 32+22, 160(1) - stxv 32+23, 176(1) - stxv 32+24, 192(1) - stxv 32+25, 208(1) - stxv 32+26, 224(1) - stxv 32+27, 240(1) - stxv 32+28, 256(1) - stxv 32+29, 272(1) - stxv 32+30, 288(1) - - lxv 32+V_NMKQ, NQ_OFFSET(4) - lxv 32+V_QINV, QINV_OFFSET(4) - lxv 32+V1353, C1353_OFFSET(4) - vxor 3, 3, 3 + .cfi_startproc + stdu 1, -320(1) + mflr 0 + stxv 52, 128(1) + stxv 53, 144(1) + stxv 54, 160(1) + stxv 55, 176(1) + stxv 56, 192(1) + stxv 57, 208(1) + stxv 58, 224(1) + stxv 59, 240(1) + stxv 60, 256(1) + stxv 61, 272(1) + stxv 62, 288(1) + lxv 37, 0(4) + lxv 34, 16(4) + lxv 32, 80(4) + vxor 3, 3, 3 vspltish 4, 1 - - li 4, -128 - li 5, -112 - li 6, -96 - li 7, -80 - li 8, -64 - li 9, -48 - li 10, -32 - li 11, -16 - - MREDUCE_4X 27, 28, 29, 30 - MREDUCE_4X 13, 18, 23, 7 - Write_8X - - MREDUCE_4X 27, 28, 29, 30 - MREDUCE_4X 13, 18, 23, 7 - Write_8X - - MREDUCE_4X 27, 28, 29, 30 - MREDUCE_4X 13, 18, 23, 7 - Write_8X - - MREDUCE_4X 27, 28, 29, 30 - MREDUCE_4X 13, 18, 23, 7 - Write_8X - - lxv 32+20, 128(1) - lxv 32+21, 144(1) - lxv 32+22, 160(1) - lxv 32+23, 176(1) - lxv 32+24, 192(1) - lxv 32+25, 208(1) - lxv 32+26, 224(1) - lxv 32+27, 240(1) - lxv 32+28, 256(1) - lxv 32+29, 272(1) - lxv 32+30, 288(1) - mtlr 0 - addi 1, 1, 320 + li 4, -128 + li 5, -112 + li 6, -96 + li 7, -80 + li 8, -64 + li 9, -48 + li 10, -32 + li 11, -16 + lxvd2x 45, 0, 3 + addi 3, 3, 16 + lxvd2x 50, 0, 3 + addi 3, 3, 16 + lxvd2x 55, 0, 3 + addi 3, 3, 16 + lxvd2x 39, 0, 3 + addi 3, 3, 16 + vmladduhm 15, 13, 0, 3 + vmladduhm 20, 18, 0, 3 + vmladduhm 25, 23, 0, 3 + vmladduhm 9, 7, 0, 3 + vmhraddshs 14, 13, 0, 3 + vmhraddshs 19, 18, 0, 3 + vmhraddshs 24, 23, 0, 3 + vmhraddshs 8, 7, 0, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 25, 2, 3 + vmladduhm 9, 9, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 9, 9, 5, 8 + vsrah 27, 15, 4 + vsrah 28, 20, 4 + vsrah 29, 25, 4 + vsrah 30, 9, 4 + lxvd2x 45, 0, 3 + addi 3, 3, 16 + lxvd2x 50, 0, 3 + addi 3, 3, 16 + lxvd2x 55, 0, 3 + addi 3, 3, 16 + lxvd2x 39, 0, 3 + addi 3, 3, 16 + vmladduhm 15, 13, 0, 3 + 
vmladduhm 20, 18, 0, 3 + vmladduhm 25, 23, 0, 3 + vmladduhm 9, 7, 0, 3 + vmhraddshs 14, 13, 0, 3 + vmhraddshs 19, 18, 0, 3 + vmhraddshs 24, 23, 0, 3 + vmhraddshs 8, 7, 0, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 25, 2, 3 + vmladduhm 9, 9, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 9, 9, 5, 8 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 7, 9, 4 + stxvd2x 59, 4, 3 + stxvd2x 60, 5, 3 + stxvd2x 61, 6, 3 + stxvd2x 62, 7, 3 + stxvd2x 45, 8, 3 + stxvd2x 50, 9, 3 + stxvd2x 55, 10, 3 + stxvd2x 39, 11, 3 + lxvd2x 45, 0, 3 + addi 3, 3, 16 + lxvd2x 50, 0, 3 + addi 3, 3, 16 + lxvd2x 55, 0, 3 + addi 3, 3, 16 + lxvd2x 39, 0, 3 + addi 3, 3, 16 + vmladduhm 15, 13, 0, 3 + vmladduhm 20, 18, 0, 3 + vmladduhm 25, 23, 0, 3 + vmladduhm 9, 7, 0, 3 + vmhraddshs 14, 13, 0, 3 + vmhraddshs 19, 18, 0, 3 + vmhraddshs 24, 23, 0, 3 + vmhraddshs 8, 7, 0, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 25, 2, 3 + vmladduhm 9, 9, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 9, 9, 5, 8 + vsrah 27, 15, 4 + vsrah 28, 20, 4 + vsrah 29, 25, 4 + vsrah 30, 9, 4 + lxvd2x 45, 0, 3 + addi 3, 3, 16 + lxvd2x 50, 0, 3 + addi 3, 3, 16 + lxvd2x 55, 0, 3 + addi 3, 3, 16 + lxvd2x 39, 0, 3 + addi 3, 3, 16 + vmladduhm 15, 13, 0, 3 + vmladduhm 20, 18, 0, 3 + vmladduhm 25, 23, 0, 3 + vmladduhm 9, 7, 0, 3 + vmhraddshs 14, 13, 0, 3 + vmhraddshs 19, 18, 0, 3 + vmhraddshs 24, 23, 0, 3 + vmhraddshs 8, 7, 0, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 25, 2, 3 + vmladduhm 9, 9, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 9, 9, 5, 8 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 7, 9, 4 + stxvd2x 59, 4, 3 + stxvd2x 60, 5, 3 + stxvd2x 61, 6, 3 + stxvd2x 62, 7, 3 + stxvd2x 45, 8, 3 + stxvd2x 50, 9, 3 + stxvd2x 55, 10, 3 + stxvd2x 39, 11, 3 + lxvd2x 45, 0, 3 + addi 3, 3, 16 + lxvd2x 50, 0, 3 + addi 3, 3, 16 + lxvd2x 55, 0, 3 + addi 3, 3, 16 + lxvd2x 39, 0, 3 + addi 3, 3, 16 + vmladduhm 15, 13, 0, 3 + vmladduhm 20, 18, 0, 3 + vmladduhm 25, 23, 0, 3 + vmladduhm 9, 7, 0, 3 + vmhraddshs 14, 13, 0, 3 + vmhraddshs 19, 18, 0, 3 + vmhraddshs 24, 23, 0, 3 + vmhraddshs 8, 7, 0, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 25, 2, 3 + vmladduhm 9, 9, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 9, 9, 5, 8 + vsrah 27, 15, 4 + vsrah 28, 20, 4 + vsrah 29, 25, 4 + vsrah 30, 9, 4 + lxvd2x 45, 0, 3 + addi 3, 3, 16 + lxvd2x 50, 0, 3 + addi 3, 3, 16 + lxvd2x 55, 0, 3 + addi 3, 3, 16 + lxvd2x 39, 0, 3 + addi 3, 3, 16 + vmladduhm 15, 13, 0, 3 + vmladduhm 20, 18, 0, 3 + vmladduhm 25, 23, 0, 3 + vmladduhm 9, 7, 0, 3 + vmhraddshs 14, 13, 0, 3 + vmhraddshs 19, 18, 0, 3 + vmhraddshs 24, 23, 0, 3 + vmhraddshs 8, 7, 0, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 25, 2, 3 + vmladduhm 9, 9, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 9, 9, 5, 8 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 7, 9, 4 + stxvd2x 59, 4, 3 + stxvd2x 60, 5, 3 + stxvd2x 61, 6, 3 + stxvd2x 62, 7, 3 + stxvd2x 45, 8, 3 + stxvd2x 50, 9, 3 + stxvd2x 55, 10, 3 + stxvd2x 39, 11, 3 + lxvd2x 45, 0, 3 + addi 3, 3, 16 + lxvd2x 50, 0, 3 + addi 3, 3, 16 + lxvd2x 55, 0, 3 + addi 3, 3, 16 + lxvd2x 39, 0, 3 + addi 3, 3, 16 + vmladduhm 15, 13, 0, 3 + vmladduhm 20, 18, 0, 3 + vmladduhm 25, 23, 0, 
3 + vmladduhm 9, 7, 0, 3 + vmhraddshs 14, 13, 0, 3 + vmhraddshs 19, 18, 0, 3 + vmhraddshs 24, 23, 0, 3 + vmhraddshs 8, 7, 0, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 25, 2, 3 + vmladduhm 9, 9, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 9, 9, 5, 8 + vsrah 27, 15, 4 + vsrah 28, 20, 4 + vsrah 29, 25, 4 + vsrah 30, 9, 4 + lxvd2x 45, 0, 3 + addi 3, 3, 16 + lxvd2x 50, 0, 3 + addi 3, 3, 16 + lxvd2x 55, 0, 3 + addi 3, 3, 16 + lxvd2x 39, 0, 3 + addi 3, 3, 16 + vmladduhm 15, 13, 0, 3 + vmladduhm 20, 18, 0, 3 + vmladduhm 25, 23, 0, 3 + vmladduhm 9, 7, 0, 3 + vmhraddshs 14, 13, 0, 3 + vmhraddshs 19, 18, 0, 3 + vmhraddshs 24, 23, 0, 3 + vmhraddshs 8, 7, 0, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 25, 2, 3 + vmladduhm 9, 9, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 9, 9, 5, 8 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 7, 9, 4 + stxvd2x 59, 4, 3 + stxvd2x 60, 5, 3 + stxvd2x 61, 6, 3 + stxvd2x 62, 7, 3 + stxvd2x 45, 8, 3 + stxvd2x 50, 9, 3 + stxvd2x 55, 10, 3 + stxvd2x 39, 11, 3 + lxv 52, 128(1) + lxv 53, 144(1) + lxv 54, 160(1) + lxv 55, 176(1) + lxv 56, 192(1) + lxv 57, 208(1) + lxv 58, 224(1) + lxv 59, 240(1) + lxv 60, 256(1) + lxv 61, 272(1) + lxv 62, 288(1) + mtlr 0 + addi 1, 1, 320 blr + .cfi_endproc -/* To facilitate single-compilation-unit (SCU) builds, undefine all macros. - * Don't modify by hand -- this is auto-generated by scripts/autogen. */ -#undef V1353 -#undef V_QINV -#undef V_NMKQ - -#endif /* MLK_ARITH_BACKEND_PPC64LE_DEFAULT && \ - !MLK_CONFIG_MULTILEVEL_NO_SHARED */ - -/* To facilitate single-compilation-unit (SCU) builds, undefine all macros. - * Don't modify by hand -- this is auto-generated by scripts/autogen. */ -#undef V1353 -#undef V_QINV -#undef V_NMKQ +#endif /* MLK_ARITH_BACKEND_PPC64LE && !MLK_CONFIG_MULTILEVEL_NO_SHARED */ diff --git a/mlkem/src/native/ppc64le/src/reduce.S b/mlkem/src/native/ppc64le/src/reduce.S index f6860f33b..0cf7783a4 100644 --- a/mlkem/src/native/ppc64le/src/reduce.S +++ b/mlkem/src/native/ppc64le/src/reduce.S @@ -3,73 +3,96 @@ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT */ -# -# Copyright 2025- IBM Corp. -# -#=================================================================================== -# Written by Danny Tsen -# - -# -# poly_reduce: Applies Barrett reduction to all coefficients of a polynomial -# for details of the Barrett reduction -# -# Arguments: *r: pointer to input/output polynomial -# +/* + * Copyright 2025- IBM Corp. + * + *=================================================================================== + * Written by Danny Tsen + * + */ #include "../../../common.h" -#if defined(MLK_ARITH_BACKEND_PPC64LE_DEFAULT) && \ +#if defined(MLK_ARITH_BACKEND_PPC64LE) && \ !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED) -#include "consts.h" +/* + * WARNING: This file is auto-derived from the mlkem-native source file + * dev/ppc64le/src/reduce.S using scripts/simpasm. Do not modify it directly. 
+ */ -# Barrett reduce constatnts -#define V20159 0 -#define V_25 1 -#define V_26 2 -#define V_MKQ 3 -.machine "any" .text +.balign 16 +.global MLK_ASM_NAMESPACE(reduce_ppc) +MLK_ASM_FN_SYMBOL(reduce_ppc) -.macro BREDUCE_4X _v0 _v1 _v2 _v3 - lxvd2x 32+8, 0, 3 - lxvd2x 32+12, 14, 3 - lxvd2x 32+16, 15, 3 - lxvd2x 32+20, 16, 3 - addi 3, 3, 64 - vmulosh 6, 8, V20159 - vmulesh 5, 8, V20159 - vmulosh 11, 12, V20159 - vmulesh 10, 12, V20159 - vmulosh 15, 16, V20159 - vmulesh 14, 16, V20159 - vmulosh 19, 20, V20159 - vmulesh 18, 20, V20159 - xxmrglw 32+4, 32+5, 32+6 - xxmrghw 32+5, 32+5, 32+6 - xxmrglw 32+9, 32+10, 32+11 - xxmrghw 32+10, 32+10, 32+11 - xxmrglw 32+13, 32+14, 32+15 - xxmrghw 32+14, 32+14, 32+15 - xxmrglw 32+17, 32+18, 32+19 - xxmrghw 32+18, 32+18, 32+19 - vadduwm 4, 4, V_25 - vadduwm 5, 5, V_25 - vadduwm 9, 9, V_25 - vadduwm 10, 10, V_25 - vadduwm 13, 13, V_25 - vadduwm 14, 14, V_25 - vadduwm 17, 17, V_25 - vadduwm 18, 18, V_25 - vsraw 4, 4, V_26 - vsraw 5, 5, V_26 - vsraw 9, 9, V_26 - vsraw 10, 10, V_26 - vsraw 13, 13, V_26 - vsraw 14, 14, V_26 - vsraw 17, 17, V_26 - vsraw 18, 18, V_26 + .cfi_startproc + stdu 1, -224(1) + mflr 0 + std 14, 96(1) + std 15, 104(1) + std 16, 112(1) + stxv 52, 128(1) + stxv 53, 144(1) + stxv 54, 160(1) + stxv 55, 176(1) + stxv 56, 192(1) + vxor 7, 7, 7 + lxv 35, 32(4) + lxv 32, 48(4) + vspltisw 2, 13 + vadduwm 2, 2, 2 + vspltisw 4, 1 + vsubuwm 5, 2, 4 + vslw 1, 4, 5 + li 4, -128 + li 5, -112 + li 6, -96 + li 7, -80 + li 8, -64 + li 9, -48 + li 10, -32 + li 11, -16 + li 14, 16 + li 15, 32 + li 16, 48 + lxvd2x 40, 0, 3 + lxvd2x 44, 14, 3 + lxvd2x 48, 15, 3 + lxvd2x 52, 16, 3 + addi 3, 3, 64 + vmulosh 6, 8, 0 + vmulesh 5, 8, 0 + vmulosh 11, 12, 0 + vmulesh 10, 12, 0 + vmulosh 15, 16, 0 + vmulesh 14, 16, 0 + vmulosh 19, 20, 0 + vmulesh 18, 20, 0 + xxmrglw 36, 37, 38 + xxmrghw 37, 37, 38 + xxmrglw 41, 42, 43 + xxmrghw 42, 42, 43 + xxmrglw 45, 46, 47 + xxmrghw 46, 46, 47 + xxmrglw 49, 50, 51 + xxmrghw 50, 50, 51 + vadduwm 4, 4, 1 + vadduwm 5, 5, 1 + vadduwm 9, 9, 1 + vadduwm 10, 10, 1 + vadduwm 13, 13, 1 + vadduwm 14, 14, 1 + vadduwm 17, 17, 1 + vadduwm 18, 18, 1 + vsraw 4, 4, 2 + vsraw 5, 5, 2 + vsraw 9, 9, 2 + vsraw 10, 10, 2 + vsraw 13, 13, 2 + vsraw 14, 14, 2 + vsraw 17, 17, 2 + vsraw 18, 18, 2 vpkuwum 4, 5, 4 vsubuhm 4, 7, 4 vpkuwum 9, 10, 9 @@ -78,36 +101,401 @@ vsubuhm 13, 7, 13 vpkuwum 17, 18, 17 vsubuhm 17, 7, 17 - vmladduhm \_v0, 4, V_MKQ, 8 - vmladduhm \_v1, 9, V_MKQ, 12 - vmladduhm \_v2, 13, V_MKQ, 16 - vmladduhm \_v3, 17, V_MKQ, 20 -.endm - -.macro Write_8X - stxvd2x 32+21, 4, 3 - stxvd2x 32+22, 5, 3 - stxvd2x 32+23, 6, 3 - stxvd2x 32+24, 7, 3 - stxvd2x 32+4, 8, 3 - stxvd2x 32+9, 9, 3 - stxvd2x 32+13, 10, 3 - stxvd2x 32+17, 11, 3 -.endm - -# -# Conditional addition to get unsigned canonical representative -# -.macro To_unsigned_16 - lxv 32+12, 0(3) - lxv 32+13, 16(3) - lxv 32+14, 32(3) - lxv 32+15, 48(3) - addi 3, 3, 64 - vsrh 1, 12, 10 - vsrh 0, 13, 10 - vsrh 3, 14, 10 - vsrh 2, 15, 10 + vmladduhm 21, 4, 3, 8 + vmladduhm 22, 9, 3, 12 + vmladduhm 23, 13, 3, 16 + vmladduhm 24, 17, 3, 20 + lxvd2x 40, 0, 3 + lxvd2x 44, 14, 3 + lxvd2x 48, 15, 3 + lxvd2x 52, 16, 3 + addi 3, 3, 64 + vmulosh 6, 8, 0 + vmulesh 5, 8, 0 + vmulosh 11, 12, 0 + vmulesh 10, 12, 0 + vmulosh 15, 16, 0 + vmulesh 14, 16, 0 + vmulosh 19, 20, 0 + vmulesh 18, 20, 0 + xxmrglw 36, 37, 38 + xxmrghw 37, 37, 38 + xxmrglw 41, 42, 43 + xxmrghw 42, 42, 43 + xxmrglw 45, 46, 47 + xxmrghw 46, 46, 47 + xxmrglw 49, 50, 51 + xxmrghw 50, 50, 51 + vadduwm 4, 4, 1 + vadduwm 5, 5, 1 + vadduwm 9, 9, 1 
+ vadduwm 10, 10, 1 + vadduwm 13, 13, 1 + vadduwm 14, 14, 1 + vadduwm 17, 17, 1 + vadduwm 18, 18, 1 + vsraw 4, 4, 2 + vsraw 5, 5, 2 + vsraw 9, 9, 2 + vsraw 10, 10, 2 + vsraw 13, 13, 2 + vsraw 14, 14, 2 + vsraw 17, 17, 2 + vsraw 18, 18, 2 + vpkuwum 4, 5, 4 + vsubuhm 4, 7, 4 + vpkuwum 9, 10, 9 + vsubuhm 9, 7, 9 + vpkuwum 13, 14, 13 + vsubuhm 13, 7, 13 + vpkuwum 17, 18, 17 + vsubuhm 17, 7, 17 + vmladduhm 4, 4, 3, 8 + vmladduhm 9, 9, 3, 12 + vmladduhm 13, 13, 3, 16 + vmladduhm 17, 17, 3, 20 + stxvd2x 53, 4, 3 + stxvd2x 54, 5, 3 + stxvd2x 55, 6, 3 + stxvd2x 56, 7, 3 + stxvd2x 36, 8, 3 + stxvd2x 41, 9, 3 + stxvd2x 45, 10, 3 + stxvd2x 49, 11, 3 + lxvd2x 40, 0, 3 + lxvd2x 44, 14, 3 + lxvd2x 48, 15, 3 + lxvd2x 52, 16, 3 + addi 3, 3, 64 + vmulosh 6, 8, 0 + vmulesh 5, 8, 0 + vmulosh 11, 12, 0 + vmulesh 10, 12, 0 + vmulosh 15, 16, 0 + vmulesh 14, 16, 0 + vmulosh 19, 20, 0 + vmulesh 18, 20, 0 + xxmrglw 36, 37, 38 + xxmrghw 37, 37, 38 + xxmrglw 41, 42, 43 + xxmrghw 42, 42, 43 + xxmrglw 45, 46, 47 + xxmrghw 46, 46, 47 + xxmrglw 49, 50, 51 + xxmrghw 50, 50, 51 + vadduwm 4, 4, 1 + vadduwm 5, 5, 1 + vadduwm 9, 9, 1 + vadduwm 10, 10, 1 + vadduwm 13, 13, 1 + vadduwm 14, 14, 1 + vadduwm 17, 17, 1 + vadduwm 18, 18, 1 + vsraw 4, 4, 2 + vsraw 5, 5, 2 + vsraw 9, 9, 2 + vsraw 10, 10, 2 + vsraw 13, 13, 2 + vsraw 14, 14, 2 + vsraw 17, 17, 2 + vsraw 18, 18, 2 + vpkuwum 4, 5, 4 + vsubuhm 4, 7, 4 + vpkuwum 9, 10, 9 + vsubuhm 9, 7, 9 + vpkuwum 13, 14, 13 + vsubuhm 13, 7, 13 + vpkuwum 17, 18, 17 + vsubuhm 17, 7, 17 + vmladduhm 21, 4, 3, 8 + vmladduhm 22, 9, 3, 12 + vmladduhm 23, 13, 3, 16 + vmladduhm 24, 17, 3, 20 + lxvd2x 40, 0, 3 + lxvd2x 44, 14, 3 + lxvd2x 48, 15, 3 + lxvd2x 52, 16, 3 + addi 3, 3, 64 + vmulosh 6, 8, 0 + vmulesh 5, 8, 0 + vmulosh 11, 12, 0 + vmulesh 10, 12, 0 + vmulosh 15, 16, 0 + vmulesh 14, 16, 0 + vmulosh 19, 20, 0 + vmulesh 18, 20, 0 + xxmrglw 36, 37, 38 + xxmrghw 37, 37, 38 + xxmrglw 41, 42, 43 + xxmrghw 42, 42, 43 + xxmrglw 45, 46, 47 + xxmrghw 46, 46, 47 + xxmrglw 49, 50, 51 + xxmrghw 50, 50, 51 + vadduwm 4, 4, 1 + vadduwm 5, 5, 1 + vadduwm 9, 9, 1 + vadduwm 10, 10, 1 + vadduwm 13, 13, 1 + vadduwm 14, 14, 1 + vadduwm 17, 17, 1 + vadduwm 18, 18, 1 + vsraw 4, 4, 2 + vsraw 5, 5, 2 + vsraw 9, 9, 2 + vsraw 10, 10, 2 + vsraw 13, 13, 2 + vsraw 14, 14, 2 + vsraw 17, 17, 2 + vsraw 18, 18, 2 + vpkuwum 4, 5, 4 + vsubuhm 4, 7, 4 + vpkuwum 9, 10, 9 + vsubuhm 9, 7, 9 + vpkuwum 13, 14, 13 + vsubuhm 13, 7, 13 + vpkuwum 17, 18, 17 + vsubuhm 17, 7, 17 + vmladduhm 4, 4, 3, 8 + vmladduhm 9, 9, 3, 12 + vmladduhm 13, 13, 3, 16 + vmladduhm 17, 17, 3, 20 + stxvd2x 53, 4, 3 + stxvd2x 54, 5, 3 + stxvd2x 55, 6, 3 + stxvd2x 56, 7, 3 + stxvd2x 36, 8, 3 + stxvd2x 41, 9, 3 + stxvd2x 45, 10, 3 + stxvd2x 49, 11, 3 + lxvd2x 40, 0, 3 + lxvd2x 44, 14, 3 + lxvd2x 48, 15, 3 + lxvd2x 52, 16, 3 + addi 3, 3, 64 + vmulosh 6, 8, 0 + vmulesh 5, 8, 0 + vmulosh 11, 12, 0 + vmulesh 10, 12, 0 + vmulosh 15, 16, 0 + vmulesh 14, 16, 0 + vmulosh 19, 20, 0 + vmulesh 18, 20, 0 + xxmrglw 36, 37, 38 + xxmrghw 37, 37, 38 + xxmrglw 41, 42, 43 + xxmrghw 42, 42, 43 + xxmrglw 45, 46, 47 + xxmrghw 46, 46, 47 + xxmrglw 49, 50, 51 + xxmrghw 50, 50, 51 + vadduwm 4, 4, 1 + vadduwm 5, 5, 1 + vadduwm 9, 9, 1 + vadduwm 10, 10, 1 + vadduwm 13, 13, 1 + vadduwm 14, 14, 1 + vadduwm 17, 17, 1 + vadduwm 18, 18, 1 + vsraw 4, 4, 2 + vsraw 5, 5, 2 + vsraw 9, 9, 2 + vsraw 10, 10, 2 + vsraw 13, 13, 2 + vsraw 14, 14, 2 + vsraw 17, 17, 2 + vsraw 18, 18, 2 + vpkuwum 4, 5, 4 + vsubuhm 4, 7, 4 + vpkuwum 9, 10, 9 + vsubuhm 9, 7, 9 + vpkuwum 13, 14, 13 + vsubuhm 13, 7, 13 + vpkuwum 
17, 18, 17 + vsubuhm 17, 7, 17 + vmladduhm 21, 4, 3, 8 + vmladduhm 22, 9, 3, 12 + vmladduhm 23, 13, 3, 16 + vmladduhm 24, 17, 3, 20 + lxvd2x 40, 0, 3 + lxvd2x 44, 14, 3 + lxvd2x 48, 15, 3 + lxvd2x 52, 16, 3 + addi 3, 3, 64 + vmulosh 6, 8, 0 + vmulesh 5, 8, 0 + vmulosh 11, 12, 0 + vmulesh 10, 12, 0 + vmulosh 15, 16, 0 + vmulesh 14, 16, 0 + vmulosh 19, 20, 0 + vmulesh 18, 20, 0 + xxmrglw 36, 37, 38 + xxmrghw 37, 37, 38 + xxmrglw 41, 42, 43 + xxmrghw 42, 42, 43 + xxmrglw 45, 46, 47 + xxmrghw 46, 46, 47 + xxmrglw 49, 50, 51 + xxmrghw 50, 50, 51 + vadduwm 4, 4, 1 + vadduwm 5, 5, 1 + vadduwm 9, 9, 1 + vadduwm 10, 10, 1 + vadduwm 13, 13, 1 + vadduwm 14, 14, 1 + vadduwm 17, 17, 1 + vadduwm 18, 18, 1 + vsraw 4, 4, 2 + vsraw 5, 5, 2 + vsraw 9, 9, 2 + vsraw 10, 10, 2 + vsraw 13, 13, 2 + vsraw 14, 14, 2 + vsraw 17, 17, 2 + vsraw 18, 18, 2 + vpkuwum 4, 5, 4 + vsubuhm 4, 7, 4 + vpkuwum 9, 10, 9 + vsubuhm 9, 7, 9 + vpkuwum 13, 14, 13 + vsubuhm 13, 7, 13 + vpkuwum 17, 18, 17 + vsubuhm 17, 7, 17 + vmladduhm 4, 4, 3, 8 + vmladduhm 9, 9, 3, 12 + vmladduhm 13, 13, 3, 16 + vmladduhm 17, 17, 3, 20 + stxvd2x 53, 4, 3 + stxvd2x 54, 5, 3 + stxvd2x 55, 6, 3 + stxvd2x 56, 7, 3 + stxvd2x 36, 8, 3 + stxvd2x 41, 9, 3 + stxvd2x 45, 10, 3 + stxvd2x 49, 11, 3 + lxvd2x 40, 0, 3 + lxvd2x 44, 14, 3 + lxvd2x 48, 15, 3 + lxvd2x 52, 16, 3 + addi 3, 3, 64 + vmulosh 6, 8, 0 + vmulesh 5, 8, 0 + vmulosh 11, 12, 0 + vmulesh 10, 12, 0 + vmulosh 15, 16, 0 + vmulesh 14, 16, 0 + vmulosh 19, 20, 0 + vmulesh 18, 20, 0 + xxmrglw 36, 37, 38 + xxmrghw 37, 37, 38 + xxmrglw 41, 42, 43 + xxmrghw 42, 42, 43 + xxmrglw 45, 46, 47 + xxmrghw 46, 46, 47 + xxmrglw 49, 50, 51 + xxmrghw 50, 50, 51 + vadduwm 4, 4, 1 + vadduwm 5, 5, 1 + vadduwm 9, 9, 1 + vadduwm 10, 10, 1 + vadduwm 13, 13, 1 + vadduwm 14, 14, 1 + vadduwm 17, 17, 1 + vadduwm 18, 18, 1 + vsraw 4, 4, 2 + vsraw 5, 5, 2 + vsraw 9, 9, 2 + vsraw 10, 10, 2 + vsraw 13, 13, 2 + vsraw 14, 14, 2 + vsraw 17, 17, 2 + vsraw 18, 18, 2 + vpkuwum 4, 5, 4 + vsubuhm 4, 7, 4 + vpkuwum 9, 10, 9 + vsubuhm 9, 7, 9 + vpkuwum 13, 14, 13 + vsubuhm 13, 7, 13 + vpkuwum 17, 18, 17 + vsubuhm 17, 7, 17 + vmladduhm 21, 4, 3, 8 + vmladduhm 22, 9, 3, 12 + vmladduhm 23, 13, 3, 16 + vmladduhm 24, 17, 3, 20 + lxvd2x 40, 0, 3 + lxvd2x 44, 14, 3 + lxvd2x 48, 15, 3 + lxvd2x 52, 16, 3 + addi 3, 3, 64 + vmulosh 6, 8, 0 + vmulesh 5, 8, 0 + vmulosh 11, 12, 0 + vmulesh 10, 12, 0 + vmulosh 15, 16, 0 + vmulesh 14, 16, 0 + vmulosh 19, 20, 0 + vmulesh 18, 20, 0 + xxmrglw 36, 37, 38 + xxmrghw 37, 37, 38 + xxmrglw 41, 42, 43 + xxmrghw 42, 42, 43 + xxmrglw 45, 46, 47 + xxmrghw 46, 46, 47 + xxmrglw 49, 50, 51 + xxmrghw 50, 50, 51 + vadduwm 4, 4, 1 + vadduwm 5, 5, 1 + vadduwm 9, 9, 1 + vadduwm 10, 10, 1 + vadduwm 13, 13, 1 + vadduwm 14, 14, 1 + vadduwm 17, 17, 1 + vadduwm 18, 18, 1 + vsraw 4, 4, 2 + vsraw 5, 5, 2 + vsraw 9, 9, 2 + vsraw 10, 10, 2 + vsraw 13, 13, 2 + vsraw 14, 14, 2 + vsraw 17, 17, 2 + vsraw 18, 18, 2 + vpkuwum 4, 5, 4 + vsubuhm 4, 7, 4 + vpkuwum 9, 10, 9 + vsubuhm 9, 7, 9 + vpkuwum 13, 14, 13 + vsubuhm 13, 7, 13 + vpkuwum 17, 18, 17 + vsubuhm 17, 7, 17 + vmladduhm 4, 4, 3, 8 + vmladduhm 9, 9, 3, 12 + vmladduhm 13, 13, 3, 16 + vmladduhm 17, 17, 3, 20 + stxvd2x 53, 4, 3 + stxvd2x 54, 5, 3 + stxvd2x 55, 6, 3 + stxvd2x 56, 7, 3 + stxvd2x 36, 8, 3 + stxvd2x 41, 9, 3 + stxvd2x 45, 10, 3 + stxvd2x 49, 11, 3 + nop + nop + ori 2, 2, 0 + addi 3, 3, -512 + xxspltib 41, 0 + vspltish 10, 15 + vmr 11, 3 + lxv 44, 0(3) + lxv 45, 16(3) + lxv 46, 32(3) + lxv 47, 48(3) + addi 3, 3, 64 + vsrh 1, 12, 10 + vsrh 0, 13, 10 + vsrh 3, 14, 10 + vsrh 
2, 15, 10 vadduhm 7, 12, 11 vadduhm 8, 13, 11 vadduhm 5, 14, 11 @@ -116,113 +504,200 @@ vcmpequh 0, 0, 9 vcmpequh 3, 3, 9 vcmpequh 2, 2, 9 - xxsel 32+1, 32+7,32+12, 32+1 - xxsel 32+0, 32+8,32+13, 32+0 - xxsel 32+3, 32+5,32+14, 32+3 - xxsel 32+2, 32+6,32+15, 32+2 - stxv 32+3, -32(3) - stxv 32+2, -16(3) - stxv 32+1, -64(3) - stxv 32+0, -48(3) -.endm - -.align 4 -.globl MLK_ASM_NAMESPACE(reduce_ppc) -MLK_ASM_FN_SYMBOL(reduce_ppc) - stdu 1, -224(1) - mflr 0 - std 14, 96(1) - std 15, 104(1) - std 16, 112(1) - stxv 32+20, 128(1) - stxv 32+21, 144(1) - stxv 32+22, 160(1) - stxv 32+23, 176(1) - stxv 32+24, 192(1) - - vxor 7, 7, 7 - - lxv 32+V_MKQ, Q_OFFSET(4) - lxv 32+V20159, C20159_OFFSET(4) - - vspltisw V_26, 13 - vadduwm V_26, V_26, V_26 - vspltisw 4, 1 - vsubuwm 5, V_26, 4 - vslw V_25, 4, 5 - - li 4, -128 - li 5, -112 - li 6, -96 - li 7, -80 - li 8, -64 - li 9, -48 - li 10, -32 - li 11, -16 - - li 14, 16 - li 15, 32 - li 16, 48 - - BREDUCE_4X 21, 22, 23, 24 - BREDUCE_4X 4, 9, 13, 17 - Write_8X - - BREDUCE_4X 21, 22, 23, 24 - BREDUCE_4X 4, 9, 13, 17 - Write_8X - - BREDUCE_4X 21, 22, 23, 24 - BREDUCE_4X 4, 9, 13, 17 - Write_8X - - BREDUCE_4X 21, 22, 23, 24 - BREDUCE_4X 4, 9, 13, 17 - Write_8X - - # - # To unsigned canonical - # -.align 4 - addi 3, 3, -512 - xxspltib 32+9 ,0 - vspltish 10, 15 - vmr 11, V_MKQ - - To_unsigned_16 - To_unsigned_16 - To_unsigned_16 - To_unsigned_16 - To_unsigned_16 - To_unsigned_16 - To_unsigned_16 - To_unsigned_16 - - ld 14, 96(1) - ld 15, 104(1) - ld 16, 112(1) - lxv 32+20, 128(1) - lxv 32+21, 144(1) - lxv 32+22, 160(1) - lxv 32+23, 176(1) - lxv 32+24, 192(1) - mtlr 0 - addi 1, 1, 224 + xxsel 33, 39, 44, 33 + xxsel 32, 40, 45, 32 + xxsel 35, 37, 46, 35 + xxsel 34, 38, 47, 34 + stxv 35, -32(3) + stxv 34, -16(3) + stxv 33, -64(3) + stxv 32, -48(3) + lxv 44, 0(3) + lxv 45, 16(3) + lxv 46, 32(3) + lxv 47, 48(3) + addi 3, 3, 64 + vsrh 1, 12, 10 + vsrh 0, 13, 10 + vsrh 3, 14, 10 + vsrh 2, 15, 10 + vadduhm 7, 12, 11 + vadduhm 8, 13, 11 + vadduhm 5, 14, 11 + vadduhm 6, 15, 11 + vcmpequh 1, 1, 9 + vcmpequh 0, 0, 9 + vcmpequh 3, 3, 9 + vcmpequh 2, 2, 9 + xxsel 33, 39, 44, 33 + xxsel 32, 40, 45, 32 + xxsel 35, 37, 46, 35 + xxsel 34, 38, 47, 34 + stxv 35, -32(3) + stxv 34, -16(3) + stxv 33, -64(3) + stxv 32, -48(3) + lxv 44, 0(3) + lxv 45, 16(3) + lxv 46, 32(3) + lxv 47, 48(3) + addi 3, 3, 64 + vsrh 1, 12, 10 + vsrh 0, 13, 10 + vsrh 3, 14, 10 + vsrh 2, 15, 10 + vadduhm 7, 12, 11 + vadduhm 8, 13, 11 + vadduhm 5, 14, 11 + vadduhm 6, 15, 11 + vcmpequh 1, 1, 9 + vcmpequh 0, 0, 9 + vcmpequh 3, 3, 9 + vcmpequh 2, 2, 9 + xxsel 33, 39, 44, 33 + xxsel 32, 40, 45, 32 + xxsel 35, 37, 46, 35 + xxsel 34, 38, 47, 34 + stxv 35, -32(3) + stxv 34, -16(3) + stxv 33, -64(3) + stxv 32, -48(3) + lxv 44, 0(3) + lxv 45, 16(3) + lxv 46, 32(3) + lxv 47, 48(3) + addi 3, 3, 64 + vsrh 1, 12, 10 + vsrh 0, 13, 10 + vsrh 3, 14, 10 + vsrh 2, 15, 10 + vadduhm 7, 12, 11 + vadduhm 8, 13, 11 + vadduhm 5, 14, 11 + vadduhm 6, 15, 11 + vcmpequh 1, 1, 9 + vcmpequh 0, 0, 9 + vcmpequh 3, 3, 9 + vcmpequh 2, 2, 9 + xxsel 33, 39, 44, 33 + xxsel 32, 40, 45, 32 + xxsel 35, 37, 46, 35 + xxsel 34, 38, 47, 34 + stxv 35, -32(3) + stxv 34, -16(3) + stxv 33, -64(3) + stxv 32, -48(3) + lxv 44, 0(3) + lxv 45, 16(3) + lxv 46, 32(3) + lxv 47, 48(3) + addi 3, 3, 64 + vsrh 1, 12, 10 + vsrh 0, 13, 10 + vsrh 3, 14, 10 + vsrh 2, 15, 10 + vadduhm 7, 12, 11 + vadduhm 8, 13, 11 + vadduhm 5, 14, 11 + vadduhm 6, 15, 11 + vcmpequh 1, 1, 9 + vcmpequh 0, 0, 9 + vcmpequh 3, 3, 9 + vcmpequh 2, 2, 9 + xxsel 33, 39, 44, 33 + xxsel 32, 40, 45, 32 + 
xxsel 35, 37, 46, 35 + xxsel 34, 38, 47, 34 + stxv 35, -32(3) + stxv 34, -16(3) + stxv 33, -64(3) + stxv 32, -48(3) + lxv 44, 0(3) + lxv 45, 16(3) + lxv 46, 32(3) + lxv 47, 48(3) + addi 3, 3, 64 + vsrh 1, 12, 10 + vsrh 0, 13, 10 + vsrh 3, 14, 10 + vsrh 2, 15, 10 + vadduhm 7, 12, 11 + vadduhm 8, 13, 11 + vadduhm 5, 14, 11 + vadduhm 6, 15, 11 + vcmpequh 1, 1, 9 + vcmpequh 0, 0, 9 + vcmpequh 3, 3, 9 + vcmpequh 2, 2, 9 + xxsel 33, 39, 44, 33 + xxsel 32, 40, 45, 32 + xxsel 35, 37, 46, 35 + xxsel 34, 38, 47, 34 + stxv 35, -32(3) + stxv 34, -16(3) + stxv 33, -64(3) + stxv 32, -48(3) + lxv 44, 0(3) + lxv 45, 16(3) + lxv 46, 32(3) + lxv 47, 48(3) + addi 3, 3, 64 + vsrh 1, 12, 10 + vsrh 0, 13, 10 + vsrh 3, 14, 10 + vsrh 2, 15, 10 + vadduhm 7, 12, 11 + vadduhm 8, 13, 11 + vadduhm 5, 14, 11 + vadduhm 6, 15, 11 + vcmpequh 1, 1, 9 + vcmpequh 0, 0, 9 + vcmpequh 3, 3, 9 + vcmpequh 2, 2, 9 + xxsel 33, 39, 44, 33 + xxsel 32, 40, 45, 32 + xxsel 35, 37, 46, 35 + xxsel 34, 38, 47, 34 + stxv 35, -32(3) + stxv 34, -16(3) + stxv 33, -64(3) + stxv 32, -48(3) + lxv 44, 0(3) + lxv 45, 16(3) + lxv 46, 32(3) + lxv 47, 48(3) + addi 3, 3, 64 + vsrh 1, 12, 10 + vsrh 0, 13, 10 + vsrh 3, 14, 10 + vsrh 2, 15, 10 + vadduhm 7, 12, 11 + vadduhm 8, 13, 11 + vadduhm 5, 14, 11 + vadduhm 6, 15, 11 + vcmpequh 1, 1, 9 + vcmpequh 0, 0, 9 + vcmpequh 3, 3, 9 + vcmpequh 2, 2, 9 + xxsel 33, 39, 44, 33 + xxsel 32, 40, 45, 32 + xxsel 35, 37, 46, 35 + xxsel 34, 38, 47, 34 + stxv 35, -32(3) + stxv 34, -16(3) + stxv 33, -64(3) + stxv 32, -48(3) + ld 14, 96(1) + ld 15, 104(1) + ld 16, 112(1) + lxv 52, 128(1) + lxv 53, 144(1) + lxv 54, 160(1) + lxv 55, 176(1) + lxv 56, 192(1) + mtlr 0 + addi 1, 1, 224 blr + .cfi_endproc -/* To facilitate single-compilation-unit (SCU) builds, undefine all macros. - * Don't modify by hand -- this is auto-generated by scripts/autogen. */ -#undef V20159 -#undef V_25 -#undef V_26 -#undef V_MKQ - -#endif /* MLK_ARITH_BACKEND_PPC64LE_DEFAULT && \ - !MLK_CONFIG_MULTILEVEL_NO_SHARED */ - -/* To facilitate single-compilation-unit (SCU) builds, undefine all macros. - * Don't modify by hand -- this is auto-generated by scripts/autogen. */ -#undef V20159 -#undef V_25 -#undef V_26 -#undef V_MKQ +#endif /* MLK_ARITH_BACKEND_PPC64LE && !MLK_CONFIG_MULTILEVEL_NO_SHARED */ diff --git a/scripts/autogen b/scripts/autogen index 74c68b650..d80d0b724 100755 --- a/scripts/autogen +++ b/scripts/autogen @@ -1220,6 +1220,10 @@ def x86_64(c): return "/x86_64/" in c +def ppc64le(c): + return "/ppc64le/" in c + + def native_fips202(c): return native(c) and fips202(c) @@ -1252,6 +1256,10 @@ def native_arith_x86_64(c): return native_arith(c) and x86_64(c) +def native_arith_ppc64le(c): + return native_arith(c) and ppc64le(c) + + def native_arith_core(c): return ( native_arith(c) and not native_arith_x86_64(c) and not native_arith_aarch64(c) @@ -1519,6 +1527,10 @@ def gen_monolithic_asm_file(dry_run=False): for c in filter(native_arith_x86_64, asm_sources): yield f'#include "{c}"' yield "#endif" + yield "#if defined(MLK_SYS_PPC64LE)" + for c in filter(native_arith_ppc64le, asm_sources): + yield f'#include "{c}"' + yield "#endif" yield "#endif" yield "" yield "#if defined(MLK_CONFIG_USE_NATIVE_BACKEND_FIPS202)"
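
Note (not part of the patch): the constants referenced in the assembly above (QINV, the 20159 multiplier, the `>> 26` Barrett shift, and the final conditional add) correspond to the standard scalar ML-KEM reductions. The following minimal C sketch shows the arithmetic the unrolled ppc64le vector code implements; the helper names and signatures are illustrative only and mirror the conventional ML-KEM reference code, they do not appear in this patch.

    #include <stdint.h>

    #define MLKEM_Q 3329
    #define QINV 62209 /* q^-1 mod 2^16 (== -3327 as a signed 16-bit value) */

    /* Montgomery reduction, as in the poly_tomont comment:
     *   t = a * QINV (low 16 bits)
     *   result = (a - (int32_t)t * MLKEM_Q) >> 16, i.e. a * 2^-16 mod q. */
    static int16_t mont_reduce(int32_t a)
    {
      int16_t t = (int16_t)a * QINV;
      return (int16_t)((a - (int32_t)t * MLKEM_Q) >> 16);
    }

    /* Barrett reduction with v = round(2^26 / q) = 20159, the constant loaded
     * from the consts table; returns a small representative of r mod q. */
    static int16_t barrett_reduce(int16_t r)
    {
      const int32_t v = ((1 << 26) + MLKEM_Q / 2) / MLKEM_Q; /* = 20159 */
      int16_t t = (int16_t)((v * r + (1 << 25)) >> 26);
      return (int16_t)(r - t * MLKEM_Q);
    }

    /* Conditional add of q to obtain the unsigned canonical value in [0, q),
     * matching the To_unsigned_16 step at the end of reduce_ppc. */
    static uint16_t to_unsigned_canonical(int16_t r)
    {
      return (uint16_t)(r + ((r >> 15) & MLKEM_Q));
    }

In the NTT/inverse-NTT kernels, mont_reduce is what each zeta multiplication performs before the add/subtract butterfly (vsubuhm/vadduhm pairs); reduce_ppc applies barrett_reduce to all 256 coefficients and then maps them to the canonical range.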