diff --git a/BIBLIOGRAPHY.md b/BIBLIOGRAPHY.md index 37cdda80e..03ce6198c 100644 --- a/BIBLIOGRAPHY.md +++ b/BIBLIOGRAPHY.md @@ -29,6 +29,7 @@ source code and documentation. - [examples/basic_deterministic/mlkem_native/custom_no_randomized_config.h](examples/basic_deterministic/mlkem_native/custom_no_randomized_config.h) - [integration/liboqs/config_aarch64.h](integration/liboqs/config_aarch64.h) - [integration/liboqs/config_c.h](integration/liboqs/config_c.h) + - [integration/liboqs/config_ppc64le.h](integration/liboqs/config_ppc64le.h) - [integration/liboqs/config_x86_64.h](integration/liboqs/config_x86_64.h) - [mlkem/src/config.h](mlkem/src/config.h) - [mlkem/src/kem.c](mlkem/src/kem.c) diff --git a/dev/ppc64le/README.md b/dev/ppc64le/README.md new file mode 100644 index 000000000..5125a40ea --- /dev/null +++ b/dev/ppc64le/README.md @@ -0,0 +1,6 @@ +[//]: # (SPDX-License-Identifier: CC-BY-4.0) + +# ppc64le backend (little endian) + +This directory contains a native backend for little endian POWER 8 (ppc64le) and above systems. + diff --git a/dev/ppc64le/meta.h b/dev/ppc64le/meta.h new file mode 100644 index 000000000..34f8cbec6 --- /dev/null +++ b/dev/ppc64le/meta.h @@ -0,0 +1,53 @@ +/* + * Copyright (c) The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +#ifndef MLK_DEV_PPC64LE_META_H +#define MLK_DEV_PPC64LE_META_H + +/* Identifier for this backend so that source and assembly files + * in the build can be appropriately guarded. */ +#define MLK_ARITH_BACKEND_PPC64LE_DEFAULT + +#define MLK_ARITH_BACKEND_NAME PPC64LE_DEFAULT + +/* Set of primitives that this backend replaces */ +#define MLK_USE_NATIVE_NTT +#define MLK_USE_NATIVE_INTT +#define MLK_USE_NATIVE_POLY_REDUCE +#define MLK_USE_NATIVE_POLY_TOMONT + +#if !defined(__ASSEMBLER__) +#include <stdint.h> +#include "../../common.h" +#include "../../params.h" +#include "../api.h" +#include "src/arith_native_ppc64le.h" + +static MLK_INLINE int mlk_ntt_native(int16_t data[MLKEM_N]) +{ + mlk_ntt_ppc(data, mlk_ppc_qdata); + return MLK_NATIVE_FUNC_SUCCESS; +} + +static MLK_INLINE int mlk_intt_native(int16_t data[MLKEM_N]) +{ + mlk_intt_ppc(data, mlk_ppc_qdata); + return MLK_NATIVE_FUNC_SUCCESS; +} + +static MLK_INLINE int mlk_poly_reduce_native(int16_t data[MLKEM_N]) +{ + mlk_reduce_ppc(data, mlk_ppc_qdata); + return MLK_NATIVE_FUNC_SUCCESS; +} + +static MLK_INLINE int mlk_poly_tomont_native(int16_t data[MLKEM_N]) +{ + mlk_poly_tomont_ppc(data, mlk_ppc_qdata); + return MLK_NATIVE_FUNC_SUCCESS; +} +#endif /* !__ASSEMBLER__ */ + +#endif /* !MLK_DEV_PPC64LE_META_H */ diff --git a/dev/ppc64le/src/arith_native_ppc64le.h b/dev/ppc64le/src/arith_native_ppc64le.h new file mode 100644 index 000000000..1c7534668 --- /dev/null +++ b/dev/ppc64le/src/arith_native_ppc64le.h @@ -0,0 +1,24 @@ +/* + * Copyright (c) 2024-2025 The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 + */ +#ifndef MLK_DEV_PPC64LE_SRC_ARITH_NATIVE_PPC64LE_H +#define MLK_DEV_PPC64LE_SRC_ARITH_NATIVE_PPC64LE_H + +#include <stdint.h> +#include "../../../common.h" +#include "consts.h" + +#define mlk_ntt_ppc MLK_NAMESPACE(ntt_ppc) +void mlk_ntt_ppc(int16_t *, const int16_t *); + +#define mlk_intt_ppc MLK_NAMESPACE(intt_ppc) +void mlk_intt_ppc(int16_t *, const int16_t *); + +#define mlk_reduce_ppc MLK_NAMESPACE(reduce_ppc) +void mlk_reduce_ppc(int16_t *r, const int16_t *); + +#define mlk_poly_tomont_ppc MLK_NAMESPACE(poly_tomont_ppc) +void mlk_poly_tomont_ppc(int16_t *, const int16_t *); + +#endif /* !MLK_DEV_PPC64LE_SRC_ARITH_NATIVE_PPC64LE_H */
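The four mlk_*_native() wrappers above are the complete surface the C frontend sees; each forwards the coefficient array together with the shared constant table mlk_ppc_qdata. A minimal usage sketch, illustrative only and not part of the patch: it assumes MLKEM_N == 256, the usual mlkem-native build configuration (MLK_NAMESPACE etc.), and that the backend headers are reachable on the include path.

```c
#include <stdint.h>
#include "arith_native_ppc64le.h" /* include path depends on your build setup */

/* In-place forward NTT of one polynomial; the second argument is the
 * constant table holding -q, qinv, the Barrett constant and the zetas. */
static void example_forward_ntt(int16_t coeffs[256])
{
  mlk_ntt_ppc(coeffs, mlk_ppc_qdata);
}
```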
diff --git a/dev/ppc64le/src/consts.c b/dev/ppc64le/src/consts.c new file mode 100644 index 000000000..fa0f7097f --- /dev/null +++ b/dev/ppc64le/src/consts.c @@ -0,0 +1,132 @@ +/* + * Copyright (c) The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +#include +#include +#include +#include + +#include "../../../common.h" + +#if defined(MLK_ARITH_BACKEND_PPC64LE_DEFAULT) && \ + !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED) + +#include "consts.h" + +MLK_ALIGN const int16_t mlk_ppc_qdata[1072] = { + /* -Q */ + -3329, -3329, -3329, -3329, -3329, -3329, -3329, -3329, + /* QINV */ + -3327, -3327, -3327, -3327, -3327, -3327, -3327, -3327, + /* Q */ + 3329, 3329, 3329, 3329, 3329, 3329, 3329, 3329, + /* const 20159 for reduce.S and intt */ + 20159, 20159, 20159, 20159, 20159, 20159, 20159, 20159, + /* const 1441 for intt */ + 1441, 1441, 1441, 1441, 1441, 1441, 1441, 1441, + /* for poly_tomont.S */ + 1353, 1353, 1353, 1353, 1353, 1353, 1353, 1353, + /* zetas */ + /* For ntt Len=128, offset 96 */ + -758, -758, -758, -758, -758, -758, -758, -758, -359, -359, -359, -359, + -359, -359, -359, -359, -1517, -1517, -1517, -1517, -1517, -1517, -1517, + -1517, 1493, 1493, 1493, 1493, 1493, 1493, 1493, 1493, 1422, 1422, 1422, + 1422, 1422, 1422, 1422, 1422, 287, 287, 287, 287, 287, 287, 287, 287, 202, + 202, 202, 202, 202, 202, 202, 202, -171, -171, -171, -171, -171, -171, -171, + -171, 622, 622, 622, 622, 622, 622, 622, 622, 1577, 1577, 1577, 1577, 1577, + 1577, 1577, 1577, 182, 182, 182, 182, 182, 182, 182, 182, 962, 962, 962, + 962, 962, 962, 962, 962, -1202, -1202, -1202, -1202, -1202, -1202, -1202, + -1202, -1474, -1474, -1474, -1474, -1474, -1474, -1474, -1474, 1468, 1468, + 1468, 1468, 1468, 1468, 1468, 1468, 573, 573, 573, 573, 573, 573, 573, 573, + -1325, -1325, -1325, -1325, -1325, -1325, -1325, -1325, 264, 264, 264, 264, + 264, 264, 264, 264, 383, 383, 383, 383, 383, 383, 383, 383, -829, -829, + -829, -829, -829, -829, -829, -829, 1458, 1458, 1458, 1458, 1458, 1458, + 1458, 1458, -1602, -1602, -1602, -1602, -1602, -1602, -1602, -1602, -130, + -130, -130, -130, -130, -130, -130, -130, -681, -681, -681, -681, -681, + -681, -681, -681, 1017, 1017, 1017, 1017, 1017, 1017, 1017, 1017, 732, 732, + 732, 732, 732, 732, 732, 732, 608, 608, 608, 608, 608, 608, 608, 608, -1542, + -1542, -1542, -1542, -1542, -1542, -1542, -1542, 411, 411, 411, 411, 411, + 411, 411, 411, -205, -205, -205, -205, -205, -205, -205, -205, -1571, -1571, + -1571, -1571, -1571, -1571, -1571, -1571, + /* For Len=4 */ + 1223, 1223, 1223, 1223, 652, 652, 652, 652, -552, -552, -552, -552, 1015, + 1015, 1015, 1015, -1293, -1293, -1293, -1293, 1491, 1491, 1491, 1491, -282, + -282, -282, -282, -1544, -1544, -1544, -1544, 516, 516, 516, 516, -8, -8, + -8, -8, -320, -320, -320, -320, -666, -666, -666, -666, -1618, -1618, -1618, + -1618, -1162, -1162, -1162, -1162, 126, 126, 126, 126, 1469, 1469, 1469, + 1469, -853, -853, -853, -853, -90, -90, -90, -90, -271, -271, -271, -271, + 830, 830, 830, 830, 107, 107, 107, 107, -1421, -1421, -1421, -1421, -247, + -247, -247, -247, -951, -951, -951, -951, -398, -398, -398, -398, 961, 961, + 961, 961, -1508, -1508, -1508, -1508, -725, -725, -725, -725, 448, 448, 448, + 448, -1065, -1065, -1065, -1065, 677, 677, 677, 677, -1275, -1275, -1275, + -1275, + /* + * For ntt Len=2 + * reorder zeta array, (1, 2, 3, 4) -> (3, 1, 4, 2) + * Transpose z[0], z[1], z[2], z[3] + * -> z[3], z[3], z[1], z[1], z[4], z[4], z[2], z[2] + */ + 555, 555, -1103, -1103, 843, 843, 430, 430, 
1550, 1550, -1251, -1251, 105, + 105, 871, 871, 177, 177, 422, 422, -235, -235, 587, 587, 1574, 1574, -291, + -291, 1653, 1653, -460, -460, 1159, 1159, -246, -246, -147, -147, 778, 778, + -602, -602, -777, -777, 1119, 1119, 1483, 1483, -872, -872, -1590, -1590, + 349, 349, 644, 644, -156, -156, 418, 418, -75, -75, 329, 329, 603, 603, 817, + 817, 610, 610, 1097, 1097, -1465, -1465, 1322, 1322, 384, 384, -1285, -1285, + 1218, 1218, -1215, -1215, -1335, -1335, -136, -136, -1187, -1187, -874, + -874, -1659, -1659, 220, 220, -1278, -1278, -1185, -1185, 794, 794, -1530, + -1530, -870, -870, -1510, -1510, 478, 478, -854, -854, 996, 996, -108, -108, + 991, 991, -308, -308, 1522, 1522, 958, 958, 1628, 1628, -1460, -1460, + /* + * For intt Len=2, offset IZETA_NTT_OFFSET127 + * reorder zeta array, (1, 2, 3, 4) -> (3, 1, 4, 2) + * Transpose z[0], z[1], z[2], z[3] + * -> z[3], z[3], z[1], z[1], z[4], z[4], z[2], z[2] + */ + -1460, -1460, 1628, 1628, 958, 958, 1522, 1522, -308, -308, 991, 991, -108, + -108, 996, 996, -854, -854, 478, 478, -1510, -1510, -870, -870, -1530, + -1530, 794, 794, -1185, -1185, -1278, -1278, 220, 220, -1659, -1659, -874, + -874, -1187, -1187, -136, -136, -1335, -1335, -1215, -1215, 1218, 1218, + -1285, -1285, 384, 384, 1322, 1322, -1465, -1465, 1097, 1097, 610, 610, 817, + 817, 603, 603, 329, 329, -75, -75, 418, 418, -156, -156, 644, 644, 349, 349, + -1590, -1590, -872, -872, 1483, 1483, 1119, 1119, -777, -777, -602, -602, + 778, 778, -147, -147, -246, -246, 1159, 1159, -460, -460, 1653, 1653, -291, + -291, 1574, 1574, 587, 587, -235, -235, 422, 422, 177, 177, 871, 871, 105, + 105, -1251, -1251, 1550, 1550, 430, 430, 843, 843, -1103, -1103, 555, 555, + /* For intt Len=4 */ + -1275, -1275, -1275, -1275, 677, 677, 677, 677, -1065, -1065, -1065, -1065, + 448, 448, 448, 448, -725, -725, -725, -725, -1508, -1508, -1508, -1508, 961, + 961, 961, 961, -398, -398, -398, -398, -951, -951, -951, -951, -247, -247, + -247, -247, -1421, -1421, -1421, -1421, 107, 107, 107, 107, 830, 830, 830, + 830, -271, -271, -271, -271, -90, -90, -90, -90, -853, -853, -853, -853, + 1469, 1469, 1469, 1469, 126, 126, 126, 126, -1162, -1162, -1162, -1162, + -1618, -1618, -1618, -1618, -666, -666, -666, -666, -320, -320, -320, -320, + -8, -8, -8, -8, 516, 516, 516, 516, -1544, -1544, -1544, -1544, -282, -282, + -282, -282, 1491, 1491, 1491, 1491, -1293, -1293, -1293, -1293, 1015, 1015, + 1015, 1015, -552, -552, -552, -552, 652, 652, 652, 652, 1223, 1223, 1223, + 1223, + /* For intt Len=8 and others */ + -1571, -1571, -1571, -1571, -1571, -1571, -1571, -1571, -205, -205, -205, + -205, -205, -205, -205, -205, 411, 411, 411, 411, 411, 411, 411, 411, -1542, + -1542, -1542, -1542, -1542, -1542, -1542, -1542, 608, 608, 608, 608, 608, + 608, 608, 608, 732, 732, 732, 732, 732, 732, 732, 732, 1017, 1017, 1017, + 1017, 1017, 1017, 1017, 1017, -681, -681, -681, -681, -681, -681, -681, + -681, -130, -130, -130, -130, -130, -130, -130, -130, -1602, -1602, -1602, + -1602, -1602, -1602, -1602, -1602, 1458, 1458, 1458, 1458, 1458, 1458, 1458, + 1458, -829, -829, -829, -829, -829, -829, -829, -829, 383, 383, 383, 383, + 383, 383, 383, 383, 264, 264, 264, 264, 264, 264, 264, 264, -1325, -1325, + -1325, -1325, -1325, -1325, -1325, -1325, 573, 573, 573, 573, 573, 573, 573, + 573, 1468, 1468, 1468, 1468, 1468, 1468, 1468, 1468, -1474, -1474, -1474, + -1474, -1474, -1474, -1474, -1474, -1202, -1202, -1202, -1202, -1202, -1202, + -1202, -1202, 962, 962, 962, 962, 962, 962, 962, 962, 182, 182, 182, 182, + 182, 182, 182, 182, 1577, 
1577, 1577, 1577, 1577, 1577, 1577, 1577, 622, + 622, 622, 622, 622, 622, 622, 622, -171, -171, -171, -171, -171, -171, -171, + -171, 202, 202, 202, 202, 202, 202, 202, 202, 287, 287, 287, 287, 287, 287, + 287, 287, 1422, 1422, 1422, 1422, 1422, 1422, 1422, 1422, 1493, 1493, 1493, + 1493, 1493, 1493, 1493, 1493, -1517, -1517, -1517, -1517, -1517, -1517, + -1517, -1517, -359, -359, -359, -359, -359, -359, -359, -359, -758, -758, + -758, -758, -758, -758, -758, -758}; + +#endif /* MLK_ARITH_BACKEND_PPC64LE_DEFAULT && \ + !MLK_CONFIG_MULTILEVEL_NO_SHARED */ diff --git a/dev/ppc64le/src/consts.h b/dev/ppc64le/src/consts.h new file mode 100644 index 000000000..b5e66983f --- /dev/null +++ b/dev/ppc64le/src/consts.h @@ -0,0 +1,24 @@ +/* + * Copyright (c) The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +#ifndef MLK_DEV_PPC64LE_SRC_CONSTS_H +#define MLK_DEV_PPC64LE_SRC_CONSTS_H +#include "../../../common.h" + +#define NQ_OFFSET 0 +#define QINV_OFFSET 16 +#define Q_OFFSET 32 +#define C20159_OFFSET 48 +#define C1441_OFFSET 64 +#define C1353_OFFSET 80 +#define ZETA_NTT_OFFSET 96 +#define ZETA_INTT_OFFSET 1104 + +#ifndef __ASSEMBLER__ +#define mlk_ppc_qdata MLK_NAMESPACE(ppc_qdata) +extern const int16_t mlk_ppc_qdata[]; +#endif + +#endif /* !MLK_DEV_PPC64LE_SRC_CONSTS_H */ diff --git a/dev/ppc64le/src/intt_ppc.S b/dev/ppc64le/src/intt_ppc.S new file mode 100644 index 000000000..5c7b3dba6 --- /dev/null +++ b/dev/ppc64le/src/intt_ppc.S @@ -0,0 +1,667 @@ +/* + * Copyright (c) The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +# +# Copyright 2025- IBM Corp. +# +#=================================================================================== +# Written by Danny Tsen +# + +#include "../../../common.h" +#if defined(MLK_ARITH_BACKEND_PPC64LE_DEFAULT) && \ + !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED) +/* simpasm: header-end */ + +#include "consts.h" + +.machine "any" +.text + +# Barrett reduce constatnts +#define V20159 0 +#define V_25 1 +#define V_26 2 +#define V_MKQ 3 + +# Montgomery reduce constatnts +#define V_QINV 2 +#define V_NMKQ 5 +#define V_Z0 7 +#define V_Z1 8 +#define V_Z2 9 +#define V_Z3 10 +#define V_ZETA 10 +#define V1441 10 + +.macro Compute_4Coeffs + vsubuhm 25, 8, 21 # r[j+len] - t + vsubuhm 26, 12, 22 # r[j+len] - t + vsubuhm 30, 16, 23 # r[j+len] - t + vsubuhm 31, 20, 24 # r[j+len] - t + vadduhm 8, 8, 21 # r[j+len] + t + vadduhm 12, 12, 22 # r[j+len] + t + vadduhm 16, 16, 23 # r[j+len] + t + vadduhm 20, 20, 24 # r[j+len] + t +.endm + +.macro Load_4Coeffs start next step + mr 9, \start # j + add 10, 7, 9 # J + len*2 + addi 16, 9, \next + addi 17, 10, \step + addi 18, 16, \next + addi 19, 17, \step + addi 20, 18, \next + addi 21, 19, \step + lxvd2x 32+8, 3, 10 # r[j+len] + lxvd2x 32+12, 3, 17 # r[j+len] + lxvd2x 32+16, 3, 19 # r[j+len] + lxvd2x 32+20, 3, 21 # r[j+len] + xxpermdi 32+8, 32+8, 32+8, 2 + xxpermdi 32+12, 32+12, 32+12, 2 + xxpermdi 32+16, 32+16, 32+16, 2 + xxpermdi 32+20, 32+20, 32+20, 2 + + lxvd2x 32+21, 3, 9 + lxvd2x 32+22, 3, 16 + lxvd2x 32+23, 3, 18 + lxvd2x 32+24, 3, 20 + xxpermdi 32+21, 32+21, 32+21, 2 + xxpermdi 32+22, 32+22, 32+22, 2 + xxpermdi 32+23, 32+23, 32+23, 2 + xxpermdi 32+24, 32+24, 32+24, 2 + + Compute_4Coeffs +.endm + +# +# Load Coeffients and setup vectors +# aj0, aj1, ajlen2, ajlen3, aj4, aj5, ajlen6, ajlen7 +# aj8, aj9, ajlen10, ajlen11, aj12, aj13, ajlen14, ajlen15 +# +# a[j]= aj0, aj1, aj8, aj9, aj4, aj5, aj12, aj13 +# a[j+len]= ajlen2, ajlen3, ajlen10, ajlen11, 
ajlen6, ajlen7, ajlen14, ajlen15 +# +.macro Load_L24Coeffs + lxv 32+25, 0(5) # a[j], r[j+len] + lxv 32+26, 16(5) # a[j], r[j+len] + vmrgew 8, 25, 26 + vmrgow 21, 25, 26 + lxv 32+25, 32(5) # a[j], r[j+len] + lxv 32+26, 48(5) # a[j], r[j+len] + vmrgew 12, 25, 26 + vmrgow 22, 25, 26 + lxv 32+25, 64(5) # a[j], r[j+len] + lxv 32+26, 80(5) # a[j], r[j+len] + vmrgew 16, 25, 26 + vmrgow 23, 25, 26 + lxv 32+25, 96(5) # a[j], r[j+len] + lxv 32+26, 112(5) # a[j], r[j+len] + vmrgew 20, 25, 26 + vmrgow 24, 25, 26 +.endm + +# +# Permute +# rj0, rj1, rj2, rj3, rjlen4, rjlen5, rjlen6, rjlen7 +# rj8, rj9, rj10, rj11, rjlen12, rjlen13, rjlen14, rjlen15 +# +# to +# rjlen4 - rjlen7, rjlen12 - rjlen15 +# rj0 - rj4, rj8 - rj11 +# +.macro Load_L44Coeffs + lxv 10, 0(5) # rj0, rj1, rj2, rj3, + # rjlen4, rjlen5, rjlen6, rjlen7 + lxv 11, 16(5) # rj8, rj9, rj10, rj11 + # rjlen12, rjlen13, rjlen14, rjlen15 + xxpermdi 32+8, 11, 10, 0 # rjlen4 - rjlen7, rjlen12 - rjlen15 + xxpermdi 32+21, 11, 10, 3 # rj0 - rj4, rj8 - rj11 + lxv 10, 32(5) + lxv 11, 48(5) + xxpermdi 32+12, 11, 10, 0 + xxpermdi 32+22, 11, 10, 3 + lxv 10, 64(5) + lxv 11, 80(5) + xxpermdi 32+16, 11, 10, 0 + xxpermdi 32+23, 11, 10, 3 + lxv 10, 96(5) + lxv 11, 112(5) + xxpermdi 32+20, 11, 10, 0 + xxpermdi 32+24, 11, 10, 3 +.endm + +.macro BREDUCE_4X _v0 _v1 _v2 _v3 + vxor 7, 7, 7 + xxlor 32+3, 6, 6 # V_MKQ + xxlor 32+1, 7, 7 # V_25 + xxlor 32+2, 8, 8 # V_26 + # Multify Odd/Even signed halfword; + # Results word bound by 2^32 in abs value. + vmulosh 6, 8, V20159 + vmulesh 5, 8, V20159 + vmulosh 11, 12, V20159 + vmulesh 10, 12, V20159 + vmulosh 15, 16, V20159 + vmulesh 14, 16, V20159 + vmulosh 19, 20, V20159 + vmulesh 18, 20, V20159 + xxmrglw 32+4, 32+5, 32+6 + xxmrghw 32+5, 32+5, 32+6 + xxmrglw 32+9, 32+10, 32+11 + xxmrghw 32+10, 32+10, 32+11 + xxmrglw 32+13, 32+14, 32+15 + xxmrghw 32+14, 32+14, 32+15 + xxmrglw 32+17, 32+18, 32+19 + xxmrghw 32+18, 32+18, 32+19 + vadduwm 4, 4, V_25 + vadduwm 5, 5, V_25 + vadduwm 9, 9, V_25 + vadduwm 10, 10, V_25 + vadduwm 13, 13, V_25 + vadduwm 14, 14, V_25 + vadduwm 17, 17, V_25 + vadduwm 18, 18, V_25 + # Right shift and pack lower halfword, + # results bond to 2^16 in abs value + vsraw 4, 4, V_26 + vsraw 5, 5, V_26 + vsraw 9, 9, V_26 + vsraw 10, 10, V_26 + vsraw 13, 13, V_26 + vsraw 14, 14, V_26 + vsraw 17, 17, V_26 + vsraw 18, 18, V_26 + vpkuwum 4, 5, 4 + vsubuhm 4, 7, 4 + vpkuwum 9, 10, 9 + vsubuhm 9, 7, 9 + vpkuwum 13, 14, 13 + vsubuhm 13, 7, 13 + vpkuwum 17, 18, 17 + vsubuhm 17, 7, 17 + # Modulo multify-Low unsigned halfword; + # results bond to 2^16 * q in abs value. 
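+  # The multiply-low below folds the quotient back in, lane by lane:
+  # _v0..._v3 = a - t*q, computed as (-t)*q + a, i.e. a small
+  # representative of a modulo q.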
+ vmladduhm \_v0, 4, V_MKQ, 8 + vmladduhm \_v1, 9, V_MKQ, 12 + vmladduhm \_v2, 13, V_MKQ, 16 + vmladduhm \_v3, 17, V_MKQ, 20 +.endm + +#----------------------------------- +# MREDUCE_4X(_vz0, _vz1, _vz2, _vz3, _vo0, _vo1, _vo2, _vo3) +# +.macro MREDUCE_4X _vz0 _vz1 _vz2 _vz3 _vo0 _vo1 _vo2 _vo3 + # Modular multification bond by 2^16 * q in abs value + vmladduhm 15, 25, \_vz0, 3 + vmladduhm 20, 26, \_vz1, 3 + vmladduhm 27, 30, \_vz2, 3 + vmladduhm 28, 31, \_vz3, 3 + + # Signed multiply-high-round; outputs are bound by 2^15 * q in abs value + vmhraddshs 14, 25, \_vz0, 3 + vmhraddshs 19, 26, \_vz1, 3 + vmhraddshs 24, 30, \_vz2, 3 + vmhraddshs 29, 31, \_vz3, 3 + + vmladduhm 15, 15, V_QINV, 3 + vmladduhm 20, 20, V_QINV, 3 + vmladduhm 25, 27, V_QINV, 3 + vmladduhm 30, 28, V_QINV, 3 + + vmhraddshs 15, 15, V_NMKQ, 14 + vmhraddshs 20, 20, V_NMKQ, 19 + vmhraddshs 25, 25, V_NMKQ, 24 + vmhraddshs 30, 30, V_NMKQ, 29 + + vsrah \_vo0, 15, 4 # >> 1 + vsrah \_vo1, 20, 4 # >> 1 + vsrah \_vo2, 25, 4 # >> 1 + vsrah \_vo3, 30, 4 # >> 1 +.endm + +.macro Set_mont_consts + xxlor 32+5, 0, 0 # V_NMKQ + xxlor 32+2, 2, 2 # V_QINV + xxlor 32+3, 3, 3 # 0 + xxlor 32+4, 4, 4 # 1 +.endm + +.macro Load_next_4zetas + lxv 32+V_Z0, 0(14) + lxv 32+V_Z1, 16(14) + lxv 32+V_Z2, 32(14) + lxv 32+V_Z3, 48(14) + addi 14, 14, 64 +.endm + +.macro Write_B4C _vs0 _vs1 _vs2 _vs3 + stxvx \_vs0, 3, 9 + stxvx \_vs1, 3, 16 + stxvx \_vs2, 3, 18 + stxvx \_vs3, 3, 20 +.endm + +.macro Write_M4C _vs0 _vs1 _vs2 _vs3 + stxvx \_vs0, 3, 10 + stxvx \_vs1, 3, 17 + stxvx \_vs2, 3, 19 + stxvx \_vs3, 3, 21 +.endm + +.macro Reload_4coeffs + lxv 32+25, 0(3) + lxv 32+26, 16(3) + lxv 32+30, 32(3) + lxv 32+31, 48(3) + addi 3, 3, 64 +.endm + +.macro MWrite_8X _vs0 _vs1 _vs2 _vs3 _vs4 _vs5 _vs6 _vs7 + stxv \_vs0, -128(3) + stxv \_vs1, -112(3) + stxv \_vs2, -96(3) + stxv \_vs3, -80(3) + stxv \_vs4, -64(3) + stxv \_vs5, -48(3) + stxv \_vs6, -32(3) + stxv \_vs7, -16(3) +.endm + +.macro PermWriteL44 + xxlor 32+14, 10, 10 + xxlor 32+19, 11, 11 + xxlor 32+24, 12, 12 + xxlor 32+29, 13, 13 + xxpermdi 32+10, 32+13, 32+14, 3 + xxpermdi 32+11, 32+13, 32+14, 0 + xxpermdi 32+12, 32+18, 32+19, 3 + xxpermdi 32+13, 32+18, 32+19, 0 + xxpermdi 32+14, 32+23, 32+24, 3 + xxpermdi 32+15, 32+23, 32+24, 0 + xxpermdi 32+16, 32+28, 32+29, 3 + xxpermdi 32+17, 32+28, 32+29, 0 + stxv 32+10, 0(5) + stxv 32+11, 16(5) + stxv 32+12, 32(5) + stxv 32+13, 48(5) + stxv 32+14, 64(5) + stxv 32+15, 80(5) + stxv 32+16, 96(5) + stxv 32+17, 112(5) +.endm + +.macro PermWriteL24 + xxlor 32+14, 10, 10 + xxlor 32+19, 11, 11 + xxlor 32+24, 12, 12 + xxlor 32+29, 13, 13 + vmrgew 10, 13, 14 + vmrgow 11, 13, 14 + vmrgew 12, 18, 19 + vmrgow 13, 18, 19 + vmrgew 14, 23, 24 + vmrgow 15, 23, 24 + vmrgew 16, 28, 29 + vmrgow 17, 28, 29 + stxv 32+10, 0(5) + stxv 32+11, 16(5) + stxv 32+12, 32(5) + stxv 32+13, 48(5) + stxv 32+14, 64(5) + stxv 32+15, 80(5) + stxv 32+16, 96(5) + stxv 32+17, 112(5) +.endm + +.macro INTT_REDUCE_L24 + Load_L24Coeffs + Compute_4Coeffs + BREDUCE_4X 4, 9, 13, 17 + xxlor 10, 32+4, 32+4 + xxlor 11, 32+9, 32+9 + xxlor 12, 32+13, 32+13 + xxlor 13, 32+17, 32+17 + Set_mont_consts + Load_next_4zetas + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 + PermWriteL24 +.endm + +.macro INTT_REDUCE_L44 + Load_L44Coeffs + Compute_4Coeffs + BREDUCE_4X 4, 9, 13, 17 + xxlor 10, 32+4, 32+4 + xxlor 11, 32+9, 32+9 + xxlor 12, 32+13, 32+13 + xxlor 13, 32+17, 32+17 + Set_mont_consts + Load_next_4zetas + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 + PermWriteL44 +.endm + +.macro INTT_REDUCE_4X start next step + 
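+  # One inverse-NTT layer step on four vector pairs: Barrett-reduce the
+  # sums t + r[j+len] back into r[j], and Montgomery-multiply the
+  # differences r[j+len] - t by the next four zetas into r[j+len].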
Load_4Coeffs \start, \next, \step + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + Load_next_4zetas + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 +.endm + +# intt +# t = r[j]; +# r[j] = barrett_reduce(t + r[j + len]); +# r[j + len] = r[j + len] - t; +# r[j + len] = fqmul(zeta, r[j + len]); + +# +# mlk_intt_ppc(r) +# +.global MLK_ASM_NAMESPACE(intt_ppc) +.align 4 +MLK_ASM_FN_SYMBOL(intt_ppc) + + stdu 1, -352(1) + mflr 0 + std 14, 56(1) + std 15, 64(1) + std 16, 72(1) + std 17, 80(1) + std 18, 88(1) + std 19, 96(1) + std 20, 104(1) + std 21, 112(1) + stxv 32+20, 128(1) + stxv 32+21, 144(1) + stxv 32+22, 160(1) + stxv 32+23, 176(1) + stxv 32+24, 192(1) + stxv 32+25, 208(1) + stxv 32+26, 224(1) + stxv 32+27, 240(1) + stxv 32+28, 256(1) + stxv 32+29, 272(1) + stxv 32+30, 288(1) + stxv 32+31, 304(1) + + # init vectors and constants + # Setup for Montgomery reduce + lxv 0, 0(4) + + lxv 32+V_QINV, QINV_OFFSET(4) # QINV + xxlxor 32+3, 32+3, 32+3 + vspltish 4, 1 + xxlor 2, 32+2, 32+2 # QINV + xxlor 3, 32+3, 32+3 # 0 + xxlor 4, 32+4, 32+4 # 1 + + # Setup for Barrett reduce + lxv 6, Q_OFFSET(4) # V_MKQ + lxv 32+V20159, C20159_OFFSET(4) # V20159 + lxv 7, 0(4) # V_25 + + vspltisw 8, 13 + vadduwm 8, 8, 8 + xxlor 8, 32+8, 32+8 # V_26 store at vs8 + + vspltisw 9, 1 + vsubuwm 10, 8, 9 # 25 + vslw 9, 9, 10 + xxlor 7, 32+9, 32+9 # V_25 syore at vs7 + + # + # Montgomery reduce loops with constant 1441 + # + addi 14, 4, C1441_OFFSET + lvx V1441, 0, 14 + li 15, 4 # loops + mtctr 15 + + Set_mont_consts +intt_ppc__Loopf: + Reload_4coeffs + MREDUCE_4X V1441, V1441, V1441, V1441, 6, 7, 8, 9 + Reload_4coeffs + MREDUCE_4X V1441, V1441, V1441, V1441, 13, 18, 23, 28 + MWrite_8X 32+6, 32+7, 32+8, 32+9, 32+13, 32+18, 32+23, 32+28 + bdnz intt_ppc__Loopf + + addi 3, 3, -512 + +.align 4 + # + # 1. len = 2, start = 0, 4, 8, 12,...244, 248, 252 + # Update zetas vectors, each vector has 2 zetas + addi 14, 4, ZETA_INTT_OFFSET + li 7, 4 + li 15, 4 + mtctr 15 + mr 5, 3 +intt_ppc__Loop2: + INTT_REDUCE_L24 + addi 5, 5, 128 + bdnz intt_ppc__Loop2 + +.align 4 + # + # 2. len = 4, start = 0, 8, 16, 24,...232, 240, 248 + mr 5, 3 + li 7, 8 + li 15, 4 # loops + mtctr 15 +intt_ppc__Loop4: + INTT_REDUCE_L44 + addi 5, 5, 128 + bdnz intt_ppc__Loop4 + +.align 4 + # 3. len = 8, start = 0, 16, 32, 48,...208, 224, 240 + li 7, 16 + li 5, 0 + li 15, 4 # loops + mtctr 15 + +intt_ppc__Loop8: + INTT_REDUCE_4X 5, 32, 32 + addi 5, 5, 128 + bdnz intt_ppc__Loop8 + +.align 4 + # + # 4. len = 16, start = 0, 32, 64,,...160, 192, 224 + li 5, 0 + li 7, 32 + + INTT_REDUCE_4X 5, 64, 64 + + li 5, 16 + addi 14, 14, -64 + INTT_REDUCE_4X 5, 64, 64 + + li 5, 256 + INTT_REDUCE_4X 5, 64, 64 + + li 5, 272 + addi 14, 14, -64 + INTT_REDUCE_4X 5, 64, 64 + +.align 4 + # + # 5. 
len = 32, start = 0, 64, 128, 192 + li 5, 0 + li 7, 64 + + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + li 5, 128 + + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + li 5, 256 + + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + li 5, 384 + + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + +.align 4 + # + # 6. len = 64, start = 0, 128 + li 5, 0 + li 7, 128 + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + li 5, 64 + + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + lxv 32+10, -16(14) + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + li 5, 256 + + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + li 5, 320 + + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + lxv 32+10, -16(14) + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + +.align 4 + # 7. 
len = 128, start = 0 + # + li 5, 0 # start + li 7, 256 # len * 2 + + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + lvx V_ZETA, 0, 14 + xxlor 9, 32+10, 32+10 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + li 5, 64 + + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + xxlor 32+10, 9, 9 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + li 5, 128 + + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + xxlor 32+10, 9, 9 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + li 5, 192 + + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + xxlor 32+10, 9, 9 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + + lxv 32+20, 128(1) + lxv 32+21, 144(1) + lxv 32+22, 160(1) + lxv 32+23, 176(1) + lxv 32+24, 192(1) + lxv 32+25, 208(1) + lxv 32+26, 224(1) + lxv 32+27, 240(1) + lxv 32+28, 256(1) + lxv 32+29, 272(1) + lxv 32+30, 288(1) + lxv 32+31, 304(1) + ld 14, 56(1) + ld 15, 64(1) + ld 16, 72(1) + ld 16, 72(1) + ld 17, 80(1) + ld 18, 88(1) + ld 19, 96(1) + ld 20, 104(1) + ld 21, 112(1) + + mtlr 0 + addi 1, 1, 352 + blr + +/* To facilitate single-compilation-unit (SCU) builds, undefine all macros. + * Don't modify by hand -- this is auto-generated by scripts/autogen. */ +#undef V20159 +#undef V_25 +#undef V_26 +#undef V_MKQ +#undef V_QINV +#undef V_NMKQ +#undef V_Z0 +#undef V_Z1 +#undef V_Z2 +#undef V_Z3 +#undef V_ZETA +#undef V1441 + +/* simpasm: footer-start */ +#endif /* MLK_ARITH_BACKEND_PPC64LE_DEFAULT && \ + !MLK_CONFIG_MULTILEVEL_NO_SHARED */ diff --git a/dev/ppc64le/src/ntt_ppc.S b/dev/ppc64le/src/ntt_ppc.S new file mode 100644 index 000000000..435e5bb52 --- /dev/null +++ b/dev/ppc64le/src/ntt_ppc.S @@ -0,0 +1,470 @@ +/* + * Copyright (c) The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +# +# Copyright 2025- IBM Corp. 
+# +#=================================================================================== +# Written by Danny Tsen +# + +#include "../../../common.h" +#if defined(MLK_ARITH_BACKEND_PPC64LE_DEFAULT) && \ + !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED) +/* simpasm: header-end */ + +#include "consts.h" + +#define V_QINV 2 +#define V_NMKQ 5 +#define V_Z0 7 +#define V_Z1 8 +#define V_Z2 9 +#define V_Z3 10 +#define V_ZETA 10 + +.machine "any" +.text + +.macro Load_4Coeffs start next step + mr 9, \start + add 10, 7, 9 # J + len*2 + addi 16, 9, \next + addi 17, 10, \step + addi 18, 16, \next + addi 19, 17, \step + addi 20, 18, \next + addi 21, 19, \step + lxvd2x 32+13, 3, 10 # r[j+len] + lxvd2x 32+18, 3, 17 # r[j+len] + lxvd2x 32+23, 3, 19 # r[j+len] + lxvd2x 32+28, 3, 21 # r[j+len] + xxpermdi 32+13, 32+13, 32+13, 2 + xxpermdi 32+18, 32+18, 32+18, 2 + xxpermdi 32+23, 32+23, 32+23, 2 + xxpermdi 32+28, 32+28, 32+28, 2 +.endm + +# +# Load Coeffients and setup vectors +# aj0, aj1, ajlen2, ajlen3, aj4, aj5, ajlen6, ajlen7 +# aj8, aj9, ajlen10, ajlen11, aj12, aj13, ajlen14, ajlen15 +# +# a[j]= aj0, aj1, aj8, aj9, aj4, aj5, aj12, aj13 +# a[j+len]= ajlen2, ajlen3, ajlen10, ajlen11, ajlen6, ajlen7, ajlen14, ajlen15 +# +.macro Load_L24Coeffs + lxv 32+25, 0(5) # a[j], r[j+len] + lxv 32+26, 16(5) # a[j], r[j+len] + vmrgew 13, 25, 26 + vmrgow 12, 25, 26 + lxv 32+25, 32(5) # a[j], r[j+len] + lxv 32+26, 48(5) # a[j], r[j+len] + vmrgew 18, 25, 26 + vmrgow 17, 25, 26 + lxv 32+25, 64(5) # a[j], r[j+len] + lxv 32+26, 80(5) # a[j], r[j+len] + vmrgew 23, 25, 26 + vmrgow 22, 25, 26 + lxv 32+25, 96(5) # a[j], r[j+len] + lxv 32+26, 112(5) # a[j], r[j+len] + vmrgew 28, 25, 26 + vmrgow 27, 25, 26 +.endm + +# +# Permute +# rj0, rj1, rj2, rj3, rjlen4, rjlen5, rjlen6, rjlen7 +# rj8, rj9, rj10, rj11, rjlen12, rjlen13, rjlen14, rjlen15 +# +# to +# rjlen4 - rjlen7, rjlen12 - rjlen15 +# rj0 - rj4, rj8 - rj11 +# +.macro Load_L44Coeffs + lxv 1, 0(5) # rj0, rj1, rj2, rj3, + # rjlen4, rjlen5, rjlen6, rjlen7 + lxv 2, 16(5) # rj8, rj9, rj10, rj11 + # rjlen12, rjlen13, rjlen14, rjlen15 + xxpermdi 32+13, 2, 1, 0 # rjlen4 - rjlen7, rjlen12 - rjlen15 + xxpermdi 32+12, 2, 1, 3 # rj0 - rj4, rj8 - rj11 + lxv 3, 32(5) + lxv 4, 48(5) + xxpermdi 32+18, 4, 3, 0 + xxpermdi 32+17, 4, 3, 3 + lxv 1, 64(5) + lxv 2, 80(5) + xxpermdi 32+23, 2, 1, 0 + xxpermdi 32+22, 2, 1, 3 + lxv 3, 96(5) + lxv 4, 112(5) + xxpermdi 32+28, 4, 3, 0 + xxpermdi 32+27, 4, 3, 3 +.endm + +# +# montgomery_reduce +# t = a * QINV +# t = (a - (int32_t)t*_MLKEM_Q) >> 16 +# +#----------------------------------- +# MREDUCE_4X(_vz0, _vz1, _vz2, _vz3) +# +.macro MREDUCE_4X _vz0 _vz1 _vz2 _vz3 + # fqmul = zeta * coefficient + # Modular multification bond by 2^16 * q in abs value + vmladduhm 15, 13, \_vz0, 3 + vmladduhm 20, 18, \_vz1, 3 + vmladduhm 25, 23, \_vz2, 3 + vmladduhm 30, 28, \_vz3, 3 + + # Signed multiply-high-round; outputs are bound by 2^15 * q in abs value + vmhraddshs 14, 13, \_vz0, 3 + vmhraddshs 19, 18, \_vz1, 3 + vmhraddshs 24, 23, \_vz2, 3 + vmhraddshs 29, 28, \_vz3, 3 + + vmladduhm 15, 15, V_QINV, 3 + vmladduhm 20, 20, V_QINV, 3 + vmladduhm 25, 25, V_QINV, 3 + vmladduhm 30, 30, V_QINV, 3 + + vmhraddshs 15, 15, V_NMKQ, 14 + vmhraddshs 20, 20, V_NMKQ, 19 + vmhraddshs 25, 25, V_NMKQ, 24 + vmhraddshs 30, 30, V_NMKQ, 29 + + vsrah 13, 15, 4 # >> 1 + vsrah 18, 20, 4 # >> 1 + vsrah 23, 25, 4 # >> 1 + vsrah 28, 30, 4 # >> 1 + +.endm + +.macro Load_4Aj + lxvd2x 32+12, 3, 9 # r[j] + lxvd2x 32+17, 3, 16 # r[j] + lxvd2x 32+22, 3, 18 # r[j] + lxvd2x 32+27, 3, 20 # r[j] + xxpermdi 32+12, 
32+12, 32+12, 2 + xxpermdi 32+17, 32+17, 32+17, 2 + xxpermdi 32+22, 32+22, 32+22, 2 + xxpermdi 32+27, 32+27, 32+27, 2 +.endm + +.macro Compute_4Coeffs + # Since the result of the Montgomery multiplication is bounded + # by q in absolute value. + # Finally to complete the final update of the results with add/sub + vsubuhm 16, 12, 13 # r - t + vadduhm 15, 13, 12 # r + t + vsubuhm 21, 17, 18 # r - t + vadduhm 20, 18, 17 # r + t + vsubuhm 26, 22, 23 # r - t + vadduhm 25, 23, 22 # r + t + vsubuhm 31, 27, 28 # r - t + vadduhm 30, 28, 27 # r + t +.endm + +.macro NTT_MREDUCE_4X start next step _vz0 _vz1 _vz2 _vz3 + Load_4Coeffs \start, \next, \step + MREDUCE_4x \_vz0, \_vz1, \_vz2, \_vz3 + Load_4Aj + Compute_4Coeffs +.endm + +.macro Write_One + stxvx 32+15, 3, 9 + stxvx 32+16, 3, 10 + stxvx 32+20, 3, 16 + stxvx 32+21, 3, 17 + stxvx 32+25, 3, 18 + stxvx 32+26, 3, 19 + stxvx 32+30, 3, 20 + stxvx 32+31, 3, 21 +.endm + +.macro PermWriteL44 + Compute_4Coeffs + xxpermdi 0, 32+16, 32+15, 3 + xxpermdi 1, 32+16, 32+15, 0 + xxpermdi 2, 32+21, 32+20, 3 + xxpermdi 3, 32+21, 32+20, 0 + xxpermdi 4, 32+26, 32+25, 3 + xxpermdi 5, 32+26, 32+25, 0 + xxpermdi 6, 32+31, 32+30, 3 + xxpermdi 7, 32+31, 32+30, 0 + stxv 0, 0(5) + stxv 1, 16(5) + stxv 2, 32(5) + stxv 3, 48(5) + stxv 4, 64(5) + stxv 5, 80(5) + stxv 6, 96(5) + stxv 7, 112(5) +.endm + +.macro PermWriteL24 + Compute_4Coeffs + vmrgew 10, 16, 15 + vmrgow 11, 16, 15 + vmrgew 12, 21, 20 + vmrgow 13, 21, 20 + vmrgew 14, 26, 25 + vmrgow 15, 26, 25 + vmrgew 16, 31, 30 + vmrgow 17, 31, 30 + stxv 32+10, 0(5) + stxv 32+11, 16(5) + stxv 32+12, 32(5) + stxv 32+13, 48(5) + stxv 32+14, 64(5) + stxv 32+15, 80(5) + stxv 32+16, 96(5) + stxv 32+17, 112(5) +.endm + +.macro Load_next_4zetas + lxv 32+V_Z0, 0(14) + lxv 32+V_Z1, 16(14) + lxv 32+V_Z2, 32(14) + lxv 32+V_Z3, 48(14) + addi 14, 14, 64 +.endm + +# +# mlk_ntt_ppc(int16_t *r) +# +.global MLK_ASM_NAMESPACE(ntt_ppc) +.align 4 +MLK_ASM_FN_SYMBOL(ntt_ppc) + + stdu 1, -352(1) + mflr 0 + std 14, 56(1) + std 15, 64(1) + std 16, 72(1) + std 17, 80(1) + std 18, 88(1) + std 19, 96(1) + std 20, 104(1) + std 21, 112(1) + stxv 32+20, 128(1) + stxv 32+21, 144(1) + stxv 32+22, 160(1) + stxv 32+23, 176(1) + stxv 32+24, 192(1) + stxv 32+25, 208(1) + stxv 32+26, 224(1) + stxv 32+27, 240(1) + stxv 32+28, 256(1) + stxv 32+29, 272(1) + stxv 32+30, 288(1) + stxv 32+31, 304(1) + + # get MLKEM_Q + lvx V_NMKQ,0,4 + + # zetas array + addi 14, 4, ZETA_NTT_OFFSET + + vxor 3, 3, 3 + vspltish 4, 1 + + lxv 32+V_QINV, QINV_OFFSET(4) + +.align 4 + # + # Compute coefficients of the NTT based on the following loop. + # for (len = 128; len ≥ 2; len = len/2) + # + # 1. len = 128, start = 0 + # + li 5, 0 # start + li 7, 256 # len * 2 + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + + NTT_MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + Write_One + li 5, 64 + NTT_MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + Write_One + li 5, 128 + NTT_MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + Write_One + li 5, 192 + NTT_MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + Write_One + +.align 4 + # + # 2. 
len = 64, start = 0, 128 + # k += 2 + li 5, 0 + li 7, 128 + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + NTT_MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + Write_One + li 5, 64 + NTT_MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + Write_One + li 5, 256 + + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + NTT_MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + Write_One + li 5, 320 + NTT_MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + Write_One + +.align 4 + # + # 3. len = 32, start = 0, 64, 128, 192 + # k += 4 + li 5, 0 + li 7, 64 + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + NTT_MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + Write_One + li 5, 128 + + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + NTT_MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + Write_One + li 5, 256 + + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + NTT_MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + Write_One + li 5, 384 + + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + NTT_MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + Write_One + +.align 4 + # + # 4. len = 16, start = 0, 32, 64,,...160, 192, 224 + # k += 8 + li 5, 0 + li 7, 32 + Load_next_4zetas + NTT_MREDUCE_4X 5, 64, 64, V_Z0, V_Z1, V_Z2, V_Z3 + Write_One + li 5, 16 + NTT_MREDUCE_4X 5, 64, 64, V_Z0, V_Z1, V_Z2, V_Z3 + Write_One + + Load_next_4zetas + li 5, 256 + NTT_MREDUCE_4X 5, 64, 64, V_Z0, V_Z1, V_Z2, V_Z3 + Write_One + li 5, 272 + NTT_MREDUCE_4X 5, 64, 64, V_Z0, V_Z1, V_Z2, V_Z3 + Write_One + +.align 4 + # + # 5. len = 8, start = 0, 16, 32, 48,...208, 224, 240 + # k += 16 + li 5, 0 + li 7, 16 + Load_next_4zetas + NTT_MREDUCE_4X 5, 32, 32, V_Z0, V_Z1, V_Z2, V_Z3 + Write_One + li 5, 128 + + Load_next_4zetas + NTT_MREDUCE_4X 5, 32, 32, V_Z0, V_Z1, V_Z2, V_Z3 + Write_One + li 5, 256 + + Load_next_4zetas + NTT_MREDUCE_4X 5, 32, 32, V_Z0, V_Z1, V_Z2, V_Z3 + Write_One + li 5, 384 + + Load_next_4zetas + NTT_MREDUCE_4X 5, 32, 32, V_Z0, V_Z1, V_Z2, V_Z3 + Write_One + + # + # 6. len = 4, start = 0, 8, 16, 24,...232, 240, 248 + # k += 32 + li 15, 4 # loops + mtctr 15 + mr 5, 3 + li 7, 8 +.align 4 +ntt_ppc__Len4: + Load_next_4zetas + Load_L44Coeffs + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3 + PermWriteL44 + addi 5, 5, 128 + + bdnz ntt_ppc__Len4 + + # + # 7. len = 2, start = 0, 4, 8, 12,...244, 248, 252 + # k += 64 + # Update zetas vectors, each vector has 2 zetas + + li 15, 4 + mtctr 15 + mr 5, 3 + li 7, 4 +.align 4 +ntt_ppc__Len2: + Load_next_4zetas + Load_L24Coeffs + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3 + PermWriteL24 + addi 5, 5, 128 + + bdnz ntt_ppc__Len2 + + lxv 32+20, 128(1) + lxv 32+21, 144(1) + lxv 32+22, 160(1) + lxv 32+23, 176(1) + lxv 32+24, 192(1) + lxv 32+25, 208(1) + lxv 32+26, 224(1) + lxv 32+27, 240(1) + lxv 32+28, 256(1) + lxv 32+29, 272(1) + lxv 32+30, 288(1) + lxv 32+31, 304(1) + ld 14, 56(1) + ld 15, 64(1) + ld 16, 72(1) + ld 16, 72(1) + ld 17, 80(1) + ld 18, 88(1) + ld 19, 96(1) + ld 20, 104(1) + ld 21, 112(1) + + mtlr 0 + addi 1, 1, 352 + blr + +/* To facilitate single-compilation-unit (SCU) builds, undefine all macros. + * Don't modify by hand -- this is auto-generated by scripts/autogen. 
*/ +#undef V_QINV +#undef V_NMKQ +#undef V_ZETA + +/* simpasm: footer-start */ +#endif /* MLK_ARITH_BACKEND_PPC64LE_DEFAULT && \ + !MLK_CONFIG_MULTILEVEL_NO_SHARED */ diff --git a/dev/ppc64le/src/poly_tomont.S b/dev/ppc64le/src/poly_tomont.S new file mode 100644 index 000000000..b7b010aaf --- /dev/null +++ b/dev/ppc64le/src/poly_tomont.S @@ -0,0 +1,161 @@ +/* + * Copyright (c) The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +# +# Copyright 2025- IBM Corp. +# +#=================================================================================== +# Written by Danny Tsen +# + +# Poly_tomont: Inplace conversion of all coefficients of a polynomial +# from normal domain to Montgomery domain +# +# Arguments:*r: pointer to input/output polynomial +# + +#include "../../../common.h" +#if defined(MLK_ARITH_BACKEND_PPC64LE_DEFAULT) && \ + !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED) +/* simpasm: header-end */ + +#include "consts.h" + +#define V1353 0 +#define V_QINV 2 +#define V_NMKQ 5 + +.machine "any" +.text + +# +# montgomery_reduce +# t = a * QINV +# t = (a - (int32_t)t*_MLKEM_Q) >> 16 +# +#----------------------------------- +# MREDUCE_4X(_v0, _v1, _v2, _v3) +# +.macro MREDUCE_4X _v0 _v1 _v2 _v3 + lxvd2x 32+13, 0, 3 + addi 3, 3, 16 + lxvd2x 32+18, 0, 3 + addi 3, 3, 16 + lxvd2x 32+23, 0, 3 + addi 3, 3, 16 + lxvd2x 32+7, 0, 3 + addi 3, 3, 16 + + vmladduhm 15, 13, V1353, 3 + vmladduhm 20, 18, V1353, 3 + vmladduhm 25, 23, V1353, 3 + vmladduhm 9, 7, V1353, 3 + + vmhraddshs 14, 13, V1353, 3 + vmhraddshs 19, 18, V1353, 3 + vmhraddshs 24, 23, V1353, 3 + vmhraddshs 8, 7, V1353, 3 + + vmladduhm 15, 15, V_QINV, 3 + vmladduhm 20, 20, V_QINV, 3 + vmladduhm 25, 25, V_QINV, 3 + vmladduhm 9, 9, V_QINV, 3 + + vmhraddshs 15, 15, V_NMKQ, 14 + vmhraddshs 20, 20, V_NMKQ, 19 + vmhraddshs 25, 25, V_NMKQ, 24 + vmhraddshs 9, 9, V_NMKQ, 8 + + vsrah \_v0, 15, 4 # >> 1 + vsrah \_v1, 20, 4 # >> 1 + vsrah \_v2, 25, 4 # >> 1 + vsrah \_v3, 9, 4 # >> 1 +.endm + +.macro Write_8X + stxvd2x 32+27, 4, 3 + stxvd2x 32+28, 5, 3 + stxvd2x 32+29, 6, 3 + stxvd2x 32+30, 7, 3 + stxvd2x 32+13, 8, 3 + stxvd2x 32+18, 9, 3 + stxvd2x 32+23, 10, 3 + stxvd2x 32+7, 11, 3 +.endm + +.align 4 +.globl MLK_ASM_NAMESPACE(poly_tomont_ppc) +MLK_ASM_FN_SYMBOL(poly_tomont_ppc) + stdu 1, -320(1) + mflr 0 + + stxv 32+20, 128(1) + stxv 32+21, 144(1) + stxv 32+22, 160(1) + stxv 32+23, 176(1) + stxv 32+24, 192(1) + stxv 32+25, 208(1) + stxv 32+26, 224(1) + stxv 32+27, 240(1) + stxv 32+28, 256(1) + stxv 32+29, 272(1) + stxv 32+30, 288(1) + + lxv 32+V_NMKQ, NQ_OFFSET(4) + lxv 32+V_QINV, QINV_OFFSET(4) + lxv 32+V1353, C1353_OFFSET(4) + + vxor 3, 3, 3 + vspltish 4, 1 + + li 4, -128 + li 5, -112 + li 6, -96 + li 7, -80 + li 8, -64 + li 9, -48 + li 10, -32 + li 11, -16 + + MREDUCE_4X 27, 28, 29, 30 + MREDUCE_4X 13, 18, 23, 7 + Write_8X + + MREDUCE_4X 27, 28, 29, 30 + MREDUCE_4X 13, 18, 23, 7 + Write_8X + + MREDUCE_4X 27, 28, 29, 30 + MREDUCE_4X 13, 18, 23, 7 + Write_8X + + MREDUCE_4X 27, 28, 29, 30 + MREDUCE_4X 13, 18, 23, 7 + Write_8X + + lxv 32+20, 128(1) + lxv 32+21, 144(1) + lxv 32+22, 160(1) + lxv 32+23, 176(1) + lxv 32+24, 192(1) + lxv 32+25, 208(1) + lxv 32+26, 224(1) + lxv 32+27, 240(1) + lxv 32+28, 256(1) + lxv 32+29, 272(1) + lxv 32+30, 288(1) + mtlr 0 + addi 1, 1, 320 + blr + +/* To facilitate single-compilation-unit (SCU) builds, undefine all macros. + * Don't modify by hand -- this is auto-generated by scripts/autogen. 
*/ +#undef V1353 +#undef V_QINV +#undef V_NMKQ + +/* simpasm: footer-start */ +#endif /* MLK_ARITH_BACKEND_PPC64LE_DEFAULT && \ + !MLK_CONFIG_MULTILEVEL_NO_SHARED */ diff --git a/dev/ppc64le/src/reduce.S b/dev/ppc64le/src/reduce.S new file mode 100644 index 000000000..603e0d38b --- /dev/null +++ b/dev/ppc64le/src/reduce.S @@ -0,0 +1,223 @@ +/* + * Copyright (c) The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +# +# Copyright 2025- IBM Corp. +# +#=================================================================================== +# Written by Danny Tsen +# + +# +# poly_reduce: Applies Barrett reduction to all coefficients of a polynomial +# for details of the Barrett reduction +# +# Arguments: *r: pointer to input/output polynomial +# + +#include "../../../common.h" +#if defined(MLK_ARITH_BACKEND_PPC64LE_DEFAULT) && \ + !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED) +/* simpasm: header-end */ + +#include "consts.h" + +# Barrett reduce constatnts +#define V20159 0 +#define V_25 1 +#define V_26 2 +#define V_MKQ 3 + +.machine "any" +.text + +.macro BREDUCE_4X _v0 _v1 _v2 _v3 + lxvd2x 32+8, 0, 3 + lxvd2x 32+12, 14, 3 + lxvd2x 32+16, 15, 3 + lxvd2x 32+20, 16, 3 + addi 3, 3, 64 + vmulosh 6, 8, V20159 + vmulesh 5, 8, V20159 + vmulosh 11, 12, V20159 + vmulesh 10, 12, V20159 + vmulosh 15, 16, V20159 + vmulesh 14, 16, V20159 + vmulosh 19, 20, V20159 + vmulesh 18, 20, V20159 + xxmrglw 32+4, 32+5, 32+6 + xxmrghw 32+5, 32+5, 32+6 + xxmrglw 32+9, 32+10, 32+11 + xxmrghw 32+10, 32+10, 32+11 + xxmrglw 32+13, 32+14, 32+15 + xxmrghw 32+14, 32+14, 32+15 + xxmrglw 32+17, 32+18, 32+19 + xxmrghw 32+18, 32+18, 32+19 + vadduwm 4, 4, V_25 + vadduwm 5, 5, V_25 + vadduwm 9, 9, V_25 + vadduwm 10, 10, V_25 + vadduwm 13, 13, V_25 + vadduwm 14, 14, V_25 + vadduwm 17, 17, V_25 + vadduwm 18, 18, V_25 + vsraw 4, 4, V_26 + vsraw 5, 5, V_26 + vsraw 9, 9, V_26 + vsraw 10, 10, V_26 + vsraw 13, 13, V_26 + vsraw 14, 14, V_26 + vsraw 17, 17, V_26 + vsraw 18, 18, V_26 + vpkuwum 4, 5, 4 + vsubuhm 4, 7, 4 + vpkuwum 9, 10, 9 + vsubuhm 9, 7, 9 + vpkuwum 13, 14, 13 + vsubuhm 13, 7, 13 + vpkuwum 17, 18, 17 + vsubuhm 17, 7, 17 + vmladduhm \_v0, 4, V_MKQ, 8 + vmladduhm \_v1, 9, V_MKQ, 12 + vmladduhm \_v2, 13, V_MKQ, 16 + vmladduhm \_v3, 17, V_MKQ, 20 +.endm + +.macro Write_8X + stxvd2x 32+21, 4, 3 + stxvd2x 32+22, 5, 3 + stxvd2x 32+23, 6, 3 + stxvd2x 32+24, 7, 3 + stxvd2x 32+4, 8, 3 + stxvd2x 32+9, 9, 3 + stxvd2x 32+13, 10, 3 + stxvd2x 32+17, 11, 3 +.endm + +# +# Conditional addition to get unsigned canonical representative +# +.macro To_unsigned_16 + lxv 32+12, 0(3) + lxv 32+13, 16(3) + lxv 32+14, 32(3) + lxv 32+15, 48(3) + addi 3, 3, 64 + vsrh 1, 12, 10 + vsrh 0, 13, 10 + vsrh 3, 14, 10 + vsrh 2, 15, 10 + vadduhm 7, 12, 11 + vadduhm 8, 13, 11 + vadduhm 5, 14, 11 + vadduhm 6, 15, 11 + vcmpequh 1, 1, 9 + vcmpequh 0, 0, 9 + vcmpequh 3, 3, 9 + vcmpequh 2, 2, 9 + xxsel 32+1, 32+7,32+12, 32+1 + xxsel 32+0, 32+8,32+13, 32+0 + xxsel 32+3, 32+5,32+14, 32+3 + xxsel 32+2, 32+6,32+15, 32+2 + stxv 32+3, -32(3) + stxv 32+2, -16(3) + stxv 32+1, -64(3) + stxv 32+0, -48(3) +.endm + +.align 4 +.globl MLK_ASM_NAMESPACE(reduce_ppc) +MLK_ASM_FN_SYMBOL(reduce_ppc) + stdu 1, -224(1) + mflr 0 + std 14, 96(1) + std 15, 104(1) + std 16, 112(1) + stxv 32+20, 128(1) + stxv 32+21, 144(1) + stxv 32+22, 160(1) + stxv 32+23, 176(1) + stxv 32+24, 192(1) + + vxor 7, 7, 7 + + lxv 32+V_MKQ, Q_OFFSET(4) + lxv 32+V20159, C20159_OFFSET(4) + + vspltisw V_26, 13 + vadduwm V_26, V_26, V_26 + vspltisw 4, 1 + vsubuwm 5, V_26, 4 + vslw V_25, 
4, 5 + + li 4, -128 + li 5, -112 + li 6, -96 + li 7, -80 + li 8, -64 + li 9, -48 + li 10, -32 + li 11, -16 + + li 14, 16 + li 15, 32 + li 16, 48 + + BREDUCE_4X 21, 22, 23, 24 + BREDUCE_4X 4, 9, 13, 17 + Write_8X + + BREDUCE_4X 21, 22, 23, 24 + BREDUCE_4X 4, 9, 13, 17 + Write_8X + + BREDUCE_4X 21, 22, 23, 24 + BREDUCE_4X 4, 9, 13, 17 + Write_8X + + BREDUCE_4X 21, 22, 23, 24 + BREDUCE_4X 4, 9, 13, 17 + Write_8X + + # + # To unsigned canonical + # +.align 4 + addi 3, 3, -512 + vxor 9, 9, 9 + vspltish 10, 15 + vmr 11, V_MKQ + + To_unsigned_16 + To_unsigned_16 + To_unsigned_16 + To_unsigned_16 + To_unsigned_16 + To_unsigned_16 + To_unsigned_16 + To_unsigned_16 + + ld 14, 96(1) + ld 15, 104(1) + ld 16, 112(1) + lxv 32+20, 128(1) + lxv 32+21, 144(1) + lxv 32+22, 160(1) + lxv 32+23, 176(1) + lxv 32+24, 192(1) + mtlr 0 + addi 1, 1, 224 + blr + +/* To facilitate single-compilation-unit (SCU) builds, undefine all macros. + * Don't modify by hand -- this is auto-generated by scripts/autogen. */ +#undef V20159 +#undef V_25 +#undef V_26 +#undef V_MKQ + +/* simpasm: footer-start */ +#endif /* MLK_ARITH_BACKEND_PPC64LE_DEFAULT && \ + !MLK_CONFIG_MULTILEVEL_NO_SHARED */ diff --git a/integration/liboqs/ML-KEM-1024_META.yml b/integration/liboqs/ML-KEM-1024_META.yml index 7d8e50d4c..9c7fe672a 100644 --- a/integration/liboqs/ML-KEM-1024_META.yml +++ b/integration/liboqs/ML-KEM-1024_META.yml @@ -89,3 +89,22 @@ implementations: - Darwin required_flags: - asimd +- name: ppc64le + version: FIPS203 + folder_name: . + compile_opts: -DMLK_CONFIG_PARAMETER_SET=1024 -DMLK_CONFIG_FILE="../../integration/liboqs/config_ppc64le.h" + signature_keypair: PQCP_MLKEM_NATIVE_MLKEM1024_PPC64LE_keypair + signature_keypair_derand: PQCP_MLKEM_NATIVE_MLKEM1024_PPC64LE_keypair_derand + signature_enc: PQCP_MLKEM_NATIVE_MLKEM1024_PPC64LE_enc + signature_dec: PQCP_MLKEM_NATIVE_MLKEM1024_PPC64LE_dec + sources: integration/liboqs/config_ppc64le.h integration/liboqs/fips202_glue.h integration/liboqs/fips202x4_glue.h + mlkem/src/cbmc.h mlkem/src/common.h mlkem/src/compress.c mlkem/src/compress.h + mlkem/src/debug.c mlkem/src/debug.h mlkem/src/indcpa.c mlkem/src/indcpa.h mlkem/src/kem.c + mlkem/src/kem.h mlkem/src/native/api.h mlkem/src/native/meta.h mlkem/src/params.h + mlkem/src/poly.c mlkem/src/poly.h mlkem/src/poly_k.c mlkem/src/poly_k.h mlkem/src/randombytes.h + mlkem/src/sampling.c mlkem/src/sampling.h mlkem/src/symmetric.h mlkem/src/sys.h + mlkem/src/verify.c mlkem/src/verify.h mlkem/src/zetas.inc mlkem/src/native/ppc64le + supported_platforms: + - architecture: ppc64le + operating_systems: + - Linux diff --git a/integration/liboqs/ML-KEM-512_META.yml b/integration/liboqs/ML-KEM-512_META.yml index aa88537d3..f46dbfdbf 100644 --- a/integration/liboqs/ML-KEM-512_META.yml +++ b/integration/liboqs/ML-KEM-512_META.yml @@ -89,3 +89,22 @@ implementations: - Darwin required_flags: - asimd +- name: ppc64le + version: FIPS203 + folder_name: . 
+ compile_opts: -DMLK_CONFIG_PARAMETER_SET=512 -DMLK_CONFIG_FILE="../../integration/liboqs/config_ppc64le.h" + signature_keypair: PQCP_MLKEM_NATIVE_MLKEM512_PPC64LE_keypair + signature_keypair_derand: PQCP_MLKEM_NATIVE_MLKEM512_PPC64LE_keypair_derand + signature_enc: PQCP_MLKEM_NATIVE_MLKEM512_PPC64LE_enc + signature_dec: PQCP_MLKEM_NATIVE_MLKEM512_PPC64LE_dec + sources: integration/liboqs/config_ppc64le.h integration/liboqs/fips202_glue.h integration/liboqs/fips202x4_glue.h + mlkem/src/cbmc.h mlkem/src/common.h mlkem/src/compress.c mlkem/src/compress.h + mlkem/src/debug.c mlkem/src/debug.h mlkem/src/indcpa.c mlkem/src/indcpa.h mlkem/src/kem.c + mlkem/src/kem.h mlkem/src/native/api.h mlkem/src/native/meta.h mlkem/src/params.h + mlkem/src/poly.c mlkem/src/poly.h mlkem/src/poly_k.c mlkem/src/poly_k.h mlkem/src/randombytes.h + mlkem/src/sampling.c mlkem/src/sampling.h mlkem/src/symmetric.h mlkem/src/sys.h + mlkem/src/verify.c mlkem/src/verify.h mlkem/src/zetas.inc mlkem/src/native/ppc64le + supported_platforms: + - architecture: ppc64le + operating_systems: + - Linux diff --git a/integration/liboqs/ML-KEM-768_META.yml b/integration/liboqs/ML-KEM-768_META.yml index 254d67478..1b01c4d42 100644 --- a/integration/liboqs/ML-KEM-768_META.yml +++ b/integration/liboqs/ML-KEM-768_META.yml @@ -89,3 +89,22 @@ implementations: - Darwin required_flags: - asimd +- name: ppc64le + version: FIPS203 + folder_name: . + compile_opts: -DMLK_CONFIG_PARAMETER_SET=768 -DMLK_CONFIG_FILE="../../integration/liboqs/config_ppc64le.h" + signature_keypair: PQCP_MLKEM_NATIVE_MLKEM768_PPC64LE_keypair + signature_keypair_derand: PQCP_MLKEM_NATIVE_MLKEM768_PPC64LE_keypair_derand + signature_enc: PQCP_MLKEM_NATIVE_MLKEM768_PPC64LE_enc + signature_dec: PQCP_MLKEM_NATIVE_MLKEM768_PPC64LE_dec + sources: integration/liboqs/config_ppc64le.h integration/liboqs/fips202_glue.h integration/liboqs/fips202x4_glue.h + mlkem/src/cbmc.h mlkem/src/common.h mlkem/src/compress.c mlkem/src/compress.h + mlkem/src/debug.c mlkem/src/debug.h mlkem/src/indcpa.c mlkem/src/indcpa.h mlkem/src/kem.c + mlkem/src/kem.h mlkem/src/native/api.h mlkem/src/native/meta.h mlkem/src/params.h + mlkem/src/poly.c mlkem/src/poly.h mlkem/src/poly_k.c mlkem/src/poly_k.h mlkem/src/randombytes.h + mlkem/src/sampling.c mlkem/src/sampling.h mlkem/src/symmetric.h mlkem/src/sys.h + mlkem/src/verify.c mlkem/src/verify.h mlkem/src/zetas.inc mlkem/src/native/ppc64le + supported_platforms: + - architecture: ppc64le + operating_systems: + - Linux diff --git a/integration/liboqs/config_ppc64le.h b/integration/liboqs/config_ppc64le.h new file mode 100644 index 000000000..2fa1cdbcf --- /dev/null +++ b/integration/liboqs/config_ppc64le.h @@ -0,0 +1,266 @@ +/* + * Copyright (c) The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +/* References + * ========== + * + * - [FIPS140_3_IG] + * Implementation Guidance for FIPS 140-3 and the Cryptographic Module + * Validation Program National Institute of Standards and Technology + * https://csrc.nist.gov/projects/cryptographic-module-validation-program/fips-140-3-ig-announcements + */ + +#ifndef MLK_INTEGRATION_LIBOQS_CONFIG_PPC64LE_H +#define MLK_INTEGRATION_LIBOQS_CONFIG_PPC64LE_H + +/****************************************************************************** + * Name: MLK_CONFIG_PARAMETER_SET + * + * Description: Specifies the parameter set for ML-KEM + * - MLK_CONFIG_PARAMETER_SET=512 corresponds to ML-KEM-512 + * - MLK_CONFIG_PARAMETER_SET=768 corresponds to ML-KEM-768 + * -
MLK_CONFIG_PARAMETER_SET=1024 corresponds to ML-KEM-1024 + * + * This can also be set using CFLAGS. + * + *****************************************************************************/ +#ifndef MLK_CONFIG_PARAMETER_SET +#define MLK_CONFIG_PARAMETER_SET \ + 768 /* Change this for different security strengths */ +#endif + +/****************************************************************************** + * Name: MLK_CONFIG_NAMESPACE_PREFIX + * + * Description: The prefix to use to namespace global symbols from mlkem/. + * + * In a multi-level build (that is, if either + * - MLK_CONFIG_MULTILEVEL_WITH_SHARED, or + * - MLK_CONFIG_MULTILEVEL_NO_SHARED, + * are set, level-dependent symbols will additionally be prefixed + * with the parameter set (512/768/1024). + * + * This can also be set using CFLAGS. + * + *****************************************************************************/ +#if MLK_CONFIG_PARAMETER_SET == 512 +#define MLK_CONFIG_NAMESPACE_PREFIX PQCP_MLKEM_NATIVE_MLKEM512_PPC64LE +#elif MLK_CONFIG_PARAMETER_SET == 768 +#define MLK_CONFIG_NAMESPACE_PREFIX PQCP_MLKEM_NATIVE_MLKEM768_PPC64LE +#elif MLK_CONFIG_PARAMETER_SET == 1024 +#define MLK_CONFIG_NAMESPACE_PREFIX PQCP_MLKEM_NATIVE_MLKEM1024_PPC64LE +#endif + +/****************************************************************************** + * Name: MLK_CONFIG_USE_NATIVE_BACKEND_ARITH + * + * Description: Determines whether an native arithmetic backend should be used. + * + * The arithmetic backend covers performance critical functions + * such as the number-theoretic transform (NTT). + * + * If this option is unset, the C backend will be used. + * + * If this option is set, the arithmetic backend to be use is + * determined by MLK_CONFIG_ARITH_BACKEND_FILE: If the latter is + * unset, the default backend for your the target architecture + * will be used. If set, it must be the name of a backend metadata + * file. + * + * This can also be set using CFLAGS. + * + *****************************************************************************/ +#define MLK_CONFIG_USE_NATIVE_BACKEND_ARITH + +/****************************************************************************** + * Name: MLK_CONFIG_ARITH_BACKEND_FILE + * + * Description: The arithmetic backend to use. + * + * If MLK_CONFIG_USE_NATIVE_BACKEND_ARITH is unset, this option + * is ignored. + * + * If MLK_CONFIG_USE_NATIVE_BACKEND_ARITH is set, this option must + * either be undefined or the filename of an arithmetic backend. + * If unset, the default backend will be used. + * + * This can be set using CFLAGS. + * + *****************************************************************************/ +#define MLK_CONFIG_ARITH_BACKEND_FILE "native/meta.h" + +/****************************************************************************** + * Name: MLK_CONFIG_FIPS202_CUSTOM_HEADER + * + * Description: Custom header to use for FIPS-202 + * + * This should only be set if you intend to use a custom + * FIPS-202 implementation, different from the one shipped + * with mlkem-native. + * + * If set, it must be the name of a file serving as the + * replacement for mlkem/fips202/fips202.h, and exposing + * the same API (see FIPS202.md). 
+ * + *****************************************************************************/ +/* +#define MLK_CONFIG_FIPS202_CUSTOM_HEADER \ + "../../integration/liboqs/fips202_glue.h" +*/ + +/****************************************************************************** + * Name: MLK_CONFIG_FIPS202X4_CUSTOM_HEADER + * + * Description: Custom header to use for FIPS-202-X4 + * + * This should only be set if you intend to use a custom + * FIPS-202 implementation, different from the one shipped + * with mlkem-native. + * + * If set, it must be the name of a file serving as the + * replacement for mlkem/fips202/fips202x4.h, and exposing + * the same API (see FIPS202.md). + * + *****************************************************************************/ +/* +#define MLK_CONFIG_FIPS202X4_CUSTOM_HEADER \ + "../../integration/liboqs/fips202x4_glue.h" +*/ + +/****************************************************************************** + * Name: MLK_CONFIG_CUSTOM_ZEROIZE + * + * Description: In compliance with FIPS 203 Section 3.3, mlkem-native zeroizes + * intermediate stack buffers before returning from function calls. + * + * Set this option and define `mlk_zeroize` if you want to + * use a custom method to zeroize intermediate stack buffers. + * The default implementation uses SecureZeroMemory on Windows + * and a memset + compiler barrier otherwise. If neither of those + * is available on the target platform, compilation will fail, + * and you will need to use MLK_CONFIG_CUSTOM_ZEROIZE to provide + * a custom implementation of `mlk_zeroize()`. + * + * WARNING: + * The explicit stack zeroization conducted by mlkem-native + * reduces the likelihood of data leaking on the stack, but + * does not eliminate it! The C standard makes no guarantee about + * where a compiler allocates structures and whether/where it makes + * copies of them. Also, in addition to entire structures, there + * may also be potentially exploitable leakage of individual values + * on the stack. + * + * If you need bullet-proof zeroization of the stack, you need to + * consider additional measures instead of of what this feature + * provides. In this case, you can set mlk_zeroize to a no-op. + * + *****************************************************************************/ +/* #define MLK_CONFIG_CUSTOM_ZEROIZE + #if !defined(__ASSEMBLER__) + #include + #include "sys.h" + static MLK_INLINE void mlk_zeroize(void *ptr, size_t len) + { + ... your implementation ... + } + #endif +*/ + +/****************************************************************************** + * Name: MLK_CONFIG_CUSTOM_RANDOMBYTES + * + * Description: mlkem-native does not provide a secure randombytes + * implementation. Such an implementation has to provided by the + * consumer. + * + * If this option is not set, mlkem-native expects a function + * void randombytes(uint8_t *out, size_t outlen). + * + * Set this option and define `mlk_randombytes` if you want to + * use a custom method to sample randombytes with a different name + * or signature. 
+ * + *****************************************************************************/ +#define MLK_CONFIG_CUSTOM_RANDOMBYTES +#if !defined(__ASSEMBLER__) +#include +#include +#include "../../mlkem/src/sys.h" +static MLK_INLINE void mlk_randombytes(uint8_t *ptr, size_t len) +{ + OQS_randombytes(ptr, len); +} +#endif /* !__ASSEMBLER__ */ + +/****************************************************************************** + * Name: MLK_CONFIG_NO_ASM + * + * Description: If this option is set, mlkem-native will be built without + * use of native code or inline assembly. + * + * By default, inline assembly is used to implement value barriers. + * Without inline assembly, mlkem-native will use a global volatile + * 'opt blocker' instead; see verify.h. + * + * Inline assembly is also used to implement a secure zeroization + * function on non-Windows platforms. If this option is set and + * the target platform is not Windows, you MUST set + * MLK_CONFIG_CUSTOM_ZEROIZE and provide a custom zeroization + * function. + * + * If this option is set, MLK_CONFIG_USE_NATIVE_BACKEND_FIPS202 + * and MLK_CONFIG_USE_NATIVE_BACKEND_ARITH will be ignored, and no + * native backends will be used. + * + *****************************************************************************/ +/* #define MLK_CONFIG_NO_ASM */ + +/****************************************************************************** + * Name: MLK_CONFIG_KEYGEN_PCT + * + * Description: Compliance with @[FIPS140_3_IG, p.87] requires a + * Pairwise Consistency Test (PCT) to be carried out on a freshly + * generated keypair before it can be exported. + * + * Set this option if such a check should be implemented. + * In this case, crypto_kem_keypair_derand and crypto_kem_keypair + * will return a non-zero error code if the PCT failed. + * + * NOTE: This feature will drastically lower the performance of + * key generation. + * + *****************************************************************************/ +/* #define MLK_CONFIG_KEYGEN_PCT */ + +/****************************************************************************** + * Name: MLK_CONFIG_KEYGEN_PCT_BREAKAGE_TEST + * + * Description: If this option is set, the user must provide a runtime + * function `static inline int mlk_break_pct() { ... }` to + * indicate whether the PCT should be made to fail. + * + * This option only has an effect if MLK_CONFIG_KEYGEN_PCT is set. + * + *****************************************************************************/ +/* #define MLK_CONFIG_KEYGEN_PCT_BREAKAGE_TEST + #if !defined(__ASSEMBLER__) + #include "sys.h" + static MLK_INLINE int mlk_break_pct(void) + { + ... return 0/1 depending on whether PCT should be broken ... + } + #endif +*/ + +/* Enable valgrind-based assertions in mlkem-native through macro + * from libOQS.
*/ +#if !defined(__ASSEMBLER__) +#include +#if defined(OQS_ENABLE_TEST_CONSTANT_TIME) +#define MLK_CONFIG_CT_TESTING_ENABLED +#endif +#endif /* !__ASSEMBLER__ */ + +#endif /* !MLK_INTEGRATION_LIBOQS_CONFIG_PPC64LE_H */ diff --git a/mlkem/mlkem_native.S b/mlkem/mlkem_native.S index fcdf182eb..7f957afa7 100644 --- a/mlkem/mlkem_native.S +++ b/mlkem/mlkem_native.S @@ -460,6 +460,33 @@ #undef MLK_NTT_BOUND /* mlkem/src/native/meta.h */ #undef MLK_NATIVE_META_H +/* mlkem/src/native/ppc64le/meta.h */ +#undef MLK_ARITH_BACKEND_NAME +#undef MLK_ARITH_BACKEND_PPC64LE_DEFAULT +#undef MLK_NATIVE_PPC64LE_META_H +#undef MLK_USE_NATIVE_INTT +#undef MLK_USE_NATIVE_NTT +#undef MLK_USE_NATIVE_POLY_REDUCE +#undef MLK_USE_NATIVE_POLY_TOMONT +/* mlkem/src/native/ppc64le/src/arith_native_ppc64le.h */ +#undef MLK_NATIVE_PPC64LE_SRC_ARITH_NATIVE_PPC64LE_H +#undef mlk_intt_ppc +#undef mlk_ntt_ppc +#undef mlk_poly_tomont_ppc +#undef mlk_reduce_ppc +/* mlkem/src/native/ppc64le/src/consts.h */ +#undef C1353_OFFSET +#undef C1441_OFFSET +#undef C20159_OFFSET +#undef IZETA_NTT_OFFSET127 +#undef IZETA_NTT_OFFSET63 +#undef MLK_NATIVE_PPC64LE_SRC_CONSTS_H +#undef NQ_OFFSET +#undef QINV_OFFSET +#undef Q_OFFSET +#undef ZETA_NTT_OFFSET +#undef ZETA_NTT_OFFSET64 +#undef mlk_ppc_qdata #if defined(MLK_SYS_AARCH64) /* * Undefine macros from native code (Arith, AArch64) diff --git a/mlkem/mlkem_native.c b/mlkem/mlkem_native.c index d846f9f55..b76617518 100644 --- a/mlkem/mlkem_native.c +++ b/mlkem/mlkem_native.c @@ -447,6 +447,33 @@ #undef MLK_NTT_BOUND /* mlkem/src/native/meta.h */ #undef MLK_NATIVE_META_H +/* mlkem/src/native/ppc64le/meta.h */ +#undef MLK_ARITH_BACKEND_NAME +#undef MLK_ARITH_BACKEND_PPC64LE_DEFAULT +#undef MLK_NATIVE_PPC64LE_META_H +#undef MLK_USE_NATIVE_INTT +#undef MLK_USE_NATIVE_NTT +#undef MLK_USE_NATIVE_POLY_REDUCE +#undef MLK_USE_NATIVE_POLY_TOMONT +/* mlkem/src/native/ppc64le/src/arith_native_ppc64le.h */ +#undef MLK_NATIVE_PPC64LE_SRC_ARITH_NATIVE_PPC64LE_H +#undef mlk_intt_ppc +#undef mlk_ntt_ppc +#undef mlk_poly_tomont_ppc +#undef mlk_reduce_ppc +/* mlkem/src/native/ppc64le/src/consts.h */ +#undef C1353_OFFSET +#undef C1441_OFFSET +#undef C20159_OFFSET +#undef IZETA_NTT_OFFSET127 +#undef IZETA_NTT_OFFSET63 +#undef MLK_NATIVE_PPC64LE_SRC_CONSTS_H +#undef NQ_OFFSET +#undef QINV_OFFSET +#undef Q_OFFSET +#undef ZETA_NTT_OFFSET +#undef ZETA_NTT_OFFSET64 +#undef mlk_ppc_qdata #if defined(MLK_SYS_AARCH64) /* * Undefine macros from native code (Arith, AArch64) diff --git a/mlkem/src/native/meta.h b/mlkem/src/native/meta.h index f2b9b848b..e39188323 100644 --- a/mlkem/src/native/meta.h +++ b/mlkem/src/native/meta.h @@ -18,4 +18,8 @@ #include "x86_64/meta.h" #endif +#ifdef MLK_SYS_PPC64LE +#include "ppc64le/meta.h" +#endif + #endif /* !MLK_NATIVE_META_H */ diff --git a/mlkem/src/native/ppc64le/README.md b/mlkem/src/native/ppc64le/README.md new file mode 100644 index 000000000..5125a40ea --- /dev/null +++ b/mlkem/src/native/ppc64le/README.md @@ -0,0 +1,6 @@ +[//]: # (SPDX-License-Identifier: CC-BY-4.0) + +# ppc64le backend (little endian) + +This directory contains a native backend for little endian POWER 8 (ppc64le) and above systems. 
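+
+The backend replaces the NTT, the inverse NTT, the Barrett reduction of a full
+polynomial (`poly_reduce`) and the conversion to Montgomery domain
+(`poly_tomont`); see `meta.h` for the `MLK_USE_NATIVE_*` hooks it sets. As a
+rough illustration (not a complete configuration; see
+`integration/liboqs/config_ppc64le.h` in this patch for a full example), a
+consumer config selects this backend via:
+
+```c
+/* Illustrative sketch only */
+#define MLK_CONFIG_USE_NATIVE_BACKEND_ARITH
+#define MLK_CONFIG_ARITH_BACKEND_FILE "native/meta.h"
+```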
+ diff --git a/mlkem/src/native/ppc64le/meta.h b/mlkem/src/native/ppc64le/meta.h new file mode 100644 index 000000000..54b3ddd9c --- /dev/null +++ b/mlkem/src/native/ppc64le/meta.h @@ -0,0 +1,53 @@ +/* + * Copyright (c) The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +#ifndef MLK_NATIVE_PPC64LE_META_H +#define MLK_NATIVE_PPC64LE_META_H + +/* Identifier for this backend so that source and assembly files + * in the build can be appropriately guarded. */ +#define MLK_ARITH_BACKEND_PPC64LE_DEFAULT + +#define MLK_ARITH_BACKEND_NAME PPC64LE_DEFAULT + +/* Set of primitives that this backend replaces */ +#define MLK_USE_NATIVE_NTT +#define MLK_USE_NATIVE_INTT +#define MLK_USE_NATIVE_POLY_REDUCE +#define MLK_USE_NATIVE_POLY_TOMONT + +#if !defined(__ASSEMBLER__) +#include +#include "../../common.h" +#include "../../params.h" +#include "../api.h" +#include "src/arith_native_ppc64le.h" + +static MLK_INLINE int mlk_ntt_native(int16_t data[MLKEM_N]) +{ + mlk_ntt_ppc(data, mlk_ppc_qdata); + return MLK_NATIVE_FUNC_SUCCESS; +} + +static MLK_INLINE int mlk_intt_native(int16_t data[MLKEM_N]) +{ + mlk_intt_ppc(data, mlk_ppc_qdata); + return MLK_NATIVE_FUNC_SUCCESS; +} + +static MLK_INLINE int mlk_poly_reduce_native(int16_t data[MLKEM_N]) +{ + mlk_reduce_ppc(data, mlk_ppc_qdata); + return MLK_NATIVE_FUNC_SUCCESS; +} + +static MLK_INLINE int mlk_poly_tomont_native(int16_t data[MLKEM_N]) +{ + mlk_poly_tomont_ppc(data, mlk_ppc_qdata); + return MLK_NATIVE_FUNC_SUCCESS; +} +#endif /* !__ASSEMBLER__ */ + +#endif /* !MLK_NATIVE_PPC64LE_META_H */ diff --git a/mlkem/src/native/ppc64le/src/arith_native_ppc64le.h b/mlkem/src/native/ppc64le/src/arith_native_ppc64le.h new file mode 100644 index 000000000..dbcee3e3e --- /dev/null +++ b/mlkem/src/native/ppc64le/src/arith_native_ppc64le.h @@ -0,0 +1,24 @@ +/* + * Copyright (c) 2024-2025 The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 + */ +#ifndef MLK_NATIVE_PPC64LE_SRC_ARITH_NATIVE_PPC64LE_H +#define MLK_NATIVE_PPC64LE_SRC_ARITH_NATIVE_PPC64LE_H + +#include +#include "../../../common.h" +#include "consts.h" + +#define mlk_ntt_ppc MLK_NAMESPACE(ntt_ppc) +void mlk_ntt_ppc(int16_t *, const int16_t *); + +#define mlk_intt_ppc MLK_NAMESPACE(intt_ppc) +void mlk_intt_ppc(int16_t *, const int16_t *); + +#define mlk_reduce_ppc MLK_NAMESPACE(reduce_ppc) +void mlk_reduce_ppc(int16_t *r, const int16_t *); + +#define mlk_poly_tomont_ppc MLK_NAMESPACE(poly_tomont_ppc) +void mlk_poly_tomont_ppc(int16_t *, const int16_t *); + +#endif /* !MLK_NATIVE_PPC64LE_SRC_ARITH_NATIVE_PPC64LE_H */ diff --git a/mlkem/src/native/ppc64le/src/consts.c b/mlkem/src/native/ppc64le/src/consts.c new file mode 100644 index 000000000..fa0f7097f --- /dev/null +++ b/mlkem/src/native/ppc64le/src/consts.c @@ -0,0 +1,132 @@ +/* + * Copyright (c) The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +#include +#include +#include +#include + +#include "../../../common.h" + +#if defined(MLK_ARITH_BACKEND_PPC64LE_DEFAULT) && \ + !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED) + +#include "consts.h" + +MLK_ALIGN const int16_t mlk_ppc_qdata[1072] = { + /* -Q */ + -3329, -3329, -3329, -3329, -3329, -3329, -3329, -3329, + /* QINV */ + -3327, -3327, -3327, -3327, -3327, -3327, -3327, -3327, + /* Q */ + 3329, 3329, 3329, 3329, 3329, 3329, 3329, 3329, + /* const 20159 for reduce.S and intt */ + 20159, 20159, 20159, 20159, 20159, 20159, 20159, 20159, + /* const 1441 for intt */ + 1441, 1441, 1441, 1441, 1441, 1441, 1441, 1441, 
+ /* for poly_tomont.S */ + 1353, 1353, 1353, 1353, 1353, 1353, 1353, 1353, + /* zetas */ + /* For ntt Len=128, offset 96 */ + -758, -758, -758, -758, -758, -758, -758, -758, -359, -359, -359, -359, + -359, -359, -359, -359, -1517, -1517, -1517, -1517, -1517, -1517, -1517, + -1517, 1493, 1493, 1493, 1493, 1493, 1493, 1493, 1493, 1422, 1422, 1422, + 1422, 1422, 1422, 1422, 1422, 287, 287, 287, 287, 287, 287, 287, 287, 202, + 202, 202, 202, 202, 202, 202, 202, -171, -171, -171, -171, -171, -171, -171, + -171, 622, 622, 622, 622, 622, 622, 622, 622, 1577, 1577, 1577, 1577, 1577, + 1577, 1577, 1577, 182, 182, 182, 182, 182, 182, 182, 182, 962, 962, 962, + 962, 962, 962, 962, 962, -1202, -1202, -1202, -1202, -1202, -1202, -1202, + -1202, -1474, -1474, -1474, -1474, -1474, -1474, -1474, -1474, 1468, 1468, + 1468, 1468, 1468, 1468, 1468, 1468, 573, 573, 573, 573, 573, 573, 573, 573, + -1325, -1325, -1325, -1325, -1325, -1325, -1325, -1325, 264, 264, 264, 264, + 264, 264, 264, 264, 383, 383, 383, 383, 383, 383, 383, 383, -829, -829, + -829, -829, -829, -829, -829, -829, 1458, 1458, 1458, 1458, 1458, 1458, + 1458, 1458, -1602, -1602, -1602, -1602, -1602, -1602, -1602, -1602, -130, + -130, -130, -130, -130, -130, -130, -130, -681, -681, -681, -681, -681, + -681, -681, -681, 1017, 1017, 1017, 1017, 1017, 1017, 1017, 1017, 732, 732, + 732, 732, 732, 732, 732, 732, 608, 608, 608, 608, 608, 608, 608, 608, -1542, + -1542, -1542, -1542, -1542, -1542, -1542, -1542, 411, 411, 411, 411, 411, + 411, 411, 411, -205, -205, -205, -205, -205, -205, -205, -205, -1571, -1571, + -1571, -1571, -1571, -1571, -1571, -1571, + /* For Len=4 */ + 1223, 1223, 1223, 1223, 652, 652, 652, 652, -552, -552, -552, -552, 1015, + 1015, 1015, 1015, -1293, -1293, -1293, -1293, 1491, 1491, 1491, 1491, -282, + -282, -282, -282, -1544, -1544, -1544, -1544, 516, 516, 516, 516, -8, -8, + -8, -8, -320, -320, -320, -320, -666, -666, -666, -666, -1618, -1618, -1618, + -1618, -1162, -1162, -1162, -1162, 126, 126, 126, 126, 1469, 1469, 1469, + 1469, -853, -853, -853, -853, -90, -90, -90, -90, -271, -271, -271, -271, + 830, 830, 830, 830, 107, 107, 107, 107, -1421, -1421, -1421, -1421, -247, + -247, -247, -247, -951, -951, -951, -951, -398, -398, -398, -398, 961, 961, + 961, 961, -1508, -1508, -1508, -1508, -725, -725, -725, -725, 448, 448, 448, + 448, -1065, -1065, -1065, -1065, 677, 677, 677, 677, -1275, -1275, -1275, + -1275, + /* + * For ntt Len=2 + * reorder zeta array, (1, 2, 3, 4) -> (3, 1, 4, 2) + * Transpose z[0], z[1], z[2], z[3] + * -> z[3], z[3], z[1], z[1], z[4], z[4], z[2], z[2] + */ + 555, 555, -1103, -1103, 843, 843, 430, 430, 1550, 1550, -1251, -1251, 105, + 105, 871, 871, 177, 177, 422, 422, -235, -235, 587, 587, 1574, 1574, -291, + -291, 1653, 1653, -460, -460, 1159, 1159, -246, -246, -147, -147, 778, 778, + -602, -602, -777, -777, 1119, 1119, 1483, 1483, -872, -872, -1590, -1590, + 349, 349, 644, 644, -156, -156, 418, 418, -75, -75, 329, 329, 603, 603, 817, + 817, 610, 610, 1097, 1097, -1465, -1465, 1322, 1322, 384, 384, -1285, -1285, + 1218, 1218, -1215, -1215, -1335, -1335, -136, -136, -1187, -1187, -874, + -874, -1659, -1659, 220, 220, -1278, -1278, -1185, -1185, 794, 794, -1530, + -1530, -870, -870, -1510, -1510, 478, 478, -854, -854, 996, 996, -108, -108, + 991, 991, -308, -308, 1522, 1522, 958, 958, 1628, 1628, -1460, -1460, + /* + * For intt Len=2, offset IZETA_NTT_OFFSET127 + * reorder zeta array, (1, 2, 3, 4) -> (3, 1, 4, 2) + * Transpose z[0], z[1], z[2], z[3] + * -> z[3], z[3], z[1], z[1], z[4], z[4], z[2], z[2] + 
*/ + -1460, -1460, 1628, 1628, 958, 958, 1522, 1522, -308, -308, 991, 991, -108, + -108, 996, 996, -854, -854, 478, 478, -1510, -1510, -870, -870, -1530, + -1530, 794, 794, -1185, -1185, -1278, -1278, 220, 220, -1659, -1659, -874, + -874, -1187, -1187, -136, -136, -1335, -1335, -1215, -1215, 1218, 1218, + -1285, -1285, 384, 384, 1322, 1322, -1465, -1465, 1097, 1097, 610, 610, 817, + 817, 603, 603, 329, 329, -75, -75, 418, 418, -156, -156, 644, 644, 349, 349, + -1590, -1590, -872, -872, 1483, 1483, 1119, 1119, -777, -777, -602, -602, + 778, 778, -147, -147, -246, -246, 1159, 1159, -460, -460, 1653, 1653, -291, + -291, 1574, 1574, 587, 587, -235, -235, 422, 422, 177, 177, 871, 871, 105, + 105, -1251, -1251, 1550, 1550, 430, 430, 843, 843, -1103, -1103, 555, 555, + /* For intt Len=4 */ + -1275, -1275, -1275, -1275, 677, 677, 677, 677, -1065, -1065, -1065, -1065, + 448, 448, 448, 448, -725, -725, -725, -725, -1508, -1508, -1508, -1508, 961, + 961, 961, 961, -398, -398, -398, -398, -951, -951, -951, -951, -247, -247, + -247, -247, -1421, -1421, -1421, -1421, 107, 107, 107, 107, 830, 830, 830, + 830, -271, -271, -271, -271, -90, -90, -90, -90, -853, -853, -853, -853, + 1469, 1469, 1469, 1469, 126, 126, 126, 126, -1162, -1162, -1162, -1162, + -1618, -1618, -1618, -1618, -666, -666, -666, -666, -320, -320, -320, -320, + -8, -8, -8, -8, 516, 516, 516, 516, -1544, -1544, -1544, -1544, -282, -282, + -282, -282, 1491, 1491, 1491, 1491, -1293, -1293, -1293, -1293, 1015, 1015, + 1015, 1015, -552, -552, -552, -552, 652, 652, 652, 652, 1223, 1223, 1223, + 1223, + /* For intt Len=8 and others */ + -1571, -1571, -1571, -1571, -1571, -1571, -1571, -1571, -205, -205, -205, + -205, -205, -205, -205, -205, 411, 411, 411, 411, 411, 411, 411, 411, -1542, + -1542, -1542, -1542, -1542, -1542, -1542, -1542, 608, 608, 608, 608, 608, + 608, 608, 608, 732, 732, 732, 732, 732, 732, 732, 732, 1017, 1017, 1017, + 1017, 1017, 1017, 1017, 1017, -681, -681, -681, -681, -681, -681, -681, + -681, -130, -130, -130, -130, -130, -130, -130, -130, -1602, -1602, -1602, + -1602, -1602, -1602, -1602, -1602, 1458, 1458, 1458, 1458, 1458, 1458, 1458, + 1458, -829, -829, -829, -829, -829, -829, -829, -829, 383, 383, 383, 383, + 383, 383, 383, 383, 264, 264, 264, 264, 264, 264, 264, 264, -1325, -1325, + -1325, -1325, -1325, -1325, -1325, -1325, 573, 573, 573, 573, 573, 573, 573, + 573, 1468, 1468, 1468, 1468, 1468, 1468, 1468, 1468, -1474, -1474, -1474, + -1474, -1474, -1474, -1474, -1474, -1202, -1202, -1202, -1202, -1202, -1202, + -1202, -1202, 962, 962, 962, 962, 962, 962, 962, 962, 182, 182, 182, 182, + 182, 182, 182, 182, 1577, 1577, 1577, 1577, 1577, 1577, 1577, 1577, 622, + 622, 622, 622, 622, 622, 622, 622, -171, -171, -171, -171, -171, -171, -171, + -171, 202, 202, 202, 202, 202, 202, 202, 202, 287, 287, 287, 287, 287, 287, + 287, 287, 1422, 1422, 1422, 1422, 1422, 1422, 1422, 1422, 1493, 1493, 1493, + 1493, 1493, 1493, 1493, 1493, -1517, -1517, -1517, -1517, -1517, -1517, + -1517, -1517, -359, -359, -359, -359, -359, -359, -359, -359, -758, -758, + -758, -758, -758, -758, -758, -758}; + +#endif /* MLK_ARITH_BACKEND_PPC64LE_DEFAULT && \ + !MLK_CONFIG_MULTILEVEL_NO_SHARED */ diff --git a/mlkem/src/native/ppc64le/src/consts.h b/mlkem/src/native/ppc64le/src/consts.h new file mode 100644 index 000000000..df5d163f7 --- /dev/null +++ b/mlkem/src/native/ppc64le/src/consts.h @@ -0,0 +1,24 @@ +/* + * Copyright (c) The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +#ifndef 
MLK_NATIVE_PPC64LE_SRC_CONSTS_H +#define MLK_NATIVE_PPC64LE_SRC_CONSTS_H +#include "../../../common.h" + +#define NQ_OFFSET 0 +#define QINV_OFFSET 16 +#define Q_OFFSET 32 +#define C20159_OFFSET 48 +#define C1441_OFFSET 64 +#define C1353_OFFSET 80 +#define ZETA_NTT_OFFSET 96 +#define ZETA_INTT_OFFSET 1104 + +#ifndef __ASSEMBLER__ +#define mlk_ppc_qdata MLK_NAMESPACE(ppc_qdata) +extern const int16_t mlk_ppc_qdata[]; +#endif + +#endif /* !MLK_NATIVE_PPC64LE_SRC_CONSTS_H */ diff --git a/mlkem/src/native/ppc64le/src/intt_ppc.S b/mlkem/src/native/ppc64le/src/intt_ppc.S new file mode 100644 index 000000000..65df15b99 --- /dev/null +++ b/mlkem/src/native/ppc64le/src/intt_ppc.S @@ -0,0 +1,665 @@ +/* + * Copyright (c) The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +# +# Copyright 2025- IBM Corp. +# +#=================================================================================== +# Written by Danny Tsen +# + +#include "../../../common.h" +#if defined(MLK_ARITH_BACKEND_PPC64LE_DEFAULT) && \ + !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED) + +#include "consts.h" + +.machine "any" +.text + +# Barrett reduce constatnts +#define V20159 0 +#define V_25 1 +#define V_26 2 +#define V_MKQ 3 + +# Montgomery reduce constatnts +#define V_QINV 2 +#define V_NMKQ 5 +#define V_Z0 7 +#define V_Z1 8 +#define V_Z2 9 +#define V_Z3 10 +#define V_ZETA 10 +#define V1441 10 + +.macro Compute_4Coeffs + vsubuhm 25, 8, 21 # r[j+len] - t + vsubuhm 26, 12, 22 # r[j+len] - t + vsubuhm 30, 16, 23 # r[j+len] - t + vsubuhm 31, 20, 24 # r[j+len] - t + vadduhm 8, 8, 21 # r[j+len] + t + vadduhm 12, 12, 22 # r[j+len] + t + vadduhm 16, 16, 23 # r[j+len] + t + vadduhm 20, 20, 24 # r[j+len] + t +.endm + +.macro Load_4Coeffs start next step + mr 9, \start # j + add 10, 7, 9 # J + len*2 + addi 16, 9, \next + addi 17, 10, \step + addi 18, 16, \next + addi 19, 17, \step + addi 20, 18, \next + addi 21, 19, \step + lxvd2x 32+8, 3, 10 # r[j+len] + lxvd2x 32+12, 3, 17 # r[j+len] + lxvd2x 32+16, 3, 19 # r[j+len] + lxvd2x 32+20, 3, 21 # r[j+len] + xxpermdi 32+8, 32+8, 32+8, 2 + xxpermdi 32+12, 32+12, 32+12, 2 + xxpermdi 32+16, 32+16, 32+16, 2 + xxpermdi 32+20, 32+20, 32+20, 2 + + lxvd2x 32+21, 3, 9 + lxvd2x 32+22, 3, 16 + lxvd2x 32+23, 3, 18 + lxvd2x 32+24, 3, 20 + xxpermdi 32+21, 32+21, 32+21, 2 + xxpermdi 32+22, 32+22, 32+22, 2 + xxpermdi 32+23, 32+23, 32+23, 2 + xxpermdi 32+24, 32+24, 32+24, 2 + + Compute_4Coeffs +.endm + +# +# Load Coeffients and setup vectors +# aj0, aj1, ajlen2, ajlen3, aj4, aj5, ajlen6, ajlen7 +# aj8, aj9, ajlen10, ajlen11, aj12, aj13, ajlen14, ajlen15 +# +# a[j]= aj0, aj1, aj8, aj9, aj4, aj5, aj12, aj13 +# a[j+len]= ajlen2, ajlen3, ajlen10, ajlen11, ajlen6, ajlen7, ajlen14, ajlen15 +# +.macro Load_L24Coeffs + lxv 32+25, 0(5) # a[j], r[j+len] + lxv 32+26, 16(5) # a[j], r[j+len] + vmrgew 8, 25, 26 + vmrgow 21, 25, 26 + lxv 32+25, 32(5) # a[j], r[j+len] + lxv 32+26, 48(5) # a[j], r[j+len] + vmrgew 12, 25, 26 + vmrgow 22, 25, 26 + lxv 32+25, 64(5) # a[j], r[j+len] + lxv 32+26, 80(5) # a[j], r[j+len] + vmrgew 16, 25, 26 + vmrgow 23, 25, 26 + lxv 32+25, 96(5) # a[j], r[j+len] + lxv 32+26, 112(5) # a[j], r[j+len] + vmrgew 20, 25, 26 + vmrgow 24, 25, 26 +.endm + +# +# Permute +# rj0, rj1, rj2, rj3, rjlen4, rjlen5, rjlen6, rjlen7 +# rj8, rj9, rj10, rj11, rjlen12, rjlen13, rjlen14, rjlen15 +# +# to +# rjlen4 - rjlen7, rjlen12 - rjlen15 +# rj0 - rj4, rj8 - rj11 +# +.macro Load_L44Coeffs + lxv 10, 0(5) # rj0, rj1, rj2, rj3, + # rjlen4, rjlen5, rjlen6, rjlen7 + lxv 11, 16(5) # rj8, rj9, rj10, 
rj11 + # rjlen12, rjlen13, rjlen14, rjlen15 + xxpermdi 32+8, 11, 10, 0 # rjlen4 - rjlen7, rjlen12 - rjlen15 + xxpermdi 32+21, 11, 10, 3 # rj0 - rj4, rj8 - rj11 + lxv 10, 32(5) + lxv 11, 48(5) + xxpermdi 32+12, 11, 10, 0 + xxpermdi 32+22, 11, 10, 3 + lxv 10, 64(5) + lxv 11, 80(5) + xxpermdi 32+16, 11, 10, 0 + xxpermdi 32+23, 11, 10, 3 + lxv 10, 96(5) + lxv 11, 112(5) + xxpermdi 32+20, 11, 10, 0 + xxpermdi 32+24, 11, 10, 3 +.endm + +.macro BREDUCE_4X _v0 _v1 _v2 _v3 + vxor 7, 7, 7 + xxlor 32+3, 6, 6 # V_MKQ + xxlor 32+1, 7, 7 # V_25 + xxlor 32+2, 8, 8 # V_26 + # Multify Odd/Even signed halfword; + # Results word bound by 2^32 in abs value. + vmulosh 6, 8, V20159 + vmulesh 5, 8, V20159 + vmulosh 11, 12, V20159 + vmulesh 10, 12, V20159 + vmulosh 15, 16, V20159 + vmulesh 14, 16, V20159 + vmulosh 19, 20, V20159 + vmulesh 18, 20, V20159 + xxmrglw 32+4, 32+5, 32+6 + xxmrghw 32+5, 32+5, 32+6 + xxmrglw 32+9, 32+10, 32+11 + xxmrghw 32+10, 32+10, 32+11 + xxmrglw 32+13, 32+14, 32+15 + xxmrghw 32+14, 32+14, 32+15 + xxmrglw 32+17, 32+18, 32+19 + xxmrghw 32+18, 32+18, 32+19 + vadduwm 4, 4, V_25 + vadduwm 5, 5, V_25 + vadduwm 9, 9, V_25 + vadduwm 10, 10, V_25 + vadduwm 13, 13, V_25 + vadduwm 14, 14, V_25 + vadduwm 17, 17, V_25 + vadduwm 18, 18, V_25 + # Right shift and pack lower halfword, + # results bond to 2^16 in abs value + vsraw 4, 4, V_26 + vsraw 5, 5, V_26 + vsraw 9, 9, V_26 + vsraw 10, 10, V_26 + vsraw 13, 13, V_26 + vsraw 14, 14, V_26 + vsraw 17, 17, V_26 + vsraw 18, 18, V_26 + vpkuwum 4, 5, 4 + vsubuhm 4, 7, 4 + vpkuwum 9, 10, 9 + vsubuhm 9, 7, 9 + vpkuwum 13, 14, 13 + vsubuhm 13, 7, 13 + vpkuwum 17, 18, 17 + vsubuhm 17, 7, 17 + # Modulo multify-Low unsigned halfword; + # results bond to 2^16 * q in abs value. + vmladduhm \_v0, 4, V_MKQ, 8 + vmladduhm \_v1, 9, V_MKQ, 12 + vmladduhm \_v2, 13, V_MKQ, 16 + vmladduhm \_v3, 17, V_MKQ, 20 +.endm + +#----------------------------------- +# MREDUCE_4X(_vz0, _vz1, _vz2, _vz3, _vo0, _vo1, _vo2, _vo3) +# +.macro MREDUCE_4X _vz0 _vz1 _vz2 _vz3 _vo0 _vo1 _vo2 _vo3 + # Modular multification bond by 2^16 * q in abs value + vmladduhm 15, 25, \_vz0, 3 + vmladduhm 20, 26, \_vz1, 3 + vmladduhm 27, 30, \_vz2, 3 + vmladduhm 28, 31, \_vz3, 3 + + # Signed multiply-high-round; outputs are bound by 2^15 * q in abs value + vmhraddshs 14, 25, \_vz0, 3 + vmhraddshs 19, 26, \_vz1, 3 + vmhraddshs 24, 30, \_vz2, 3 + vmhraddshs 29, 31, \_vz3, 3 + + vmladduhm 15, 15, V_QINV, 3 + vmladduhm 20, 20, V_QINV, 3 + vmladduhm 25, 27, V_QINV, 3 + vmladduhm 30, 28, V_QINV, 3 + + vmhraddshs 15, 15, V_NMKQ, 14 + vmhraddshs 20, 20, V_NMKQ, 19 + vmhraddshs 25, 25, V_NMKQ, 24 + vmhraddshs 30, 30, V_NMKQ, 29 + + vsrah \_vo0, 15, 4 # >> 1 + vsrah \_vo1, 20, 4 # >> 1 + vsrah \_vo2, 25, 4 # >> 1 + vsrah \_vo3, 30, 4 # >> 1 +.endm + +.macro Set_mont_consts + xxlor 32+5, 0, 0 # V_NMKQ + xxlor 32+2, 2, 2 # V_QINV + xxlor 32+3, 3, 3 # 0 + xxlor 32+4, 4, 4 # 1 +.endm + +.macro Load_next_4zetas + lxv 32+V_Z0, 0(14) + lxv 32+V_Z1, 16(14) + lxv 32+V_Z2, 32(14) + lxv 32+V_Z3, 48(14) + addi 14, 14, 64 +.endm + +.macro Write_B4C _vs0 _vs1 _vs2 _vs3 + stxvx \_vs0, 3, 9 + stxvx \_vs1, 3, 16 + stxvx \_vs2, 3, 18 + stxvx \_vs3, 3, 20 +.endm + +.macro Write_M4C _vs0 _vs1 _vs2 _vs3 + stxvx \_vs0, 3, 10 + stxvx \_vs1, 3, 17 + stxvx \_vs2, 3, 19 + stxvx \_vs3, 3, 21 +.endm + +.macro Reload_4coeffs + lxv 32+25, 0(3) + lxv 32+26, 16(3) + lxv 32+30, 32(3) + lxv 32+31, 48(3) + addi 3, 3, 64 +.endm + +.macro MWrite_8X _vs0 _vs1 _vs2 _vs3 _vs4 _vs5 _vs6 _vs7 + stxv \_vs0, -128(3) + stxv \_vs1, -112(3) + stxv \_vs2, -96(3) 
+ stxv \_vs3, -80(3) + stxv \_vs4, -64(3) + stxv \_vs5, -48(3) + stxv \_vs6, -32(3) + stxv \_vs7, -16(3) +.endm + +.macro PermWriteL44 + xxlor 32+14, 10, 10 + xxlor 32+19, 11, 11 + xxlor 32+24, 12, 12 + xxlor 32+29, 13, 13 + xxpermdi 32+10, 32+13, 32+14, 3 + xxpermdi 32+11, 32+13, 32+14, 0 + xxpermdi 32+12, 32+18, 32+19, 3 + xxpermdi 32+13, 32+18, 32+19, 0 + xxpermdi 32+14, 32+23, 32+24, 3 + xxpermdi 32+15, 32+23, 32+24, 0 + xxpermdi 32+16, 32+28, 32+29, 3 + xxpermdi 32+17, 32+28, 32+29, 0 + stxv 32+10, 0(5) + stxv 32+11, 16(5) + stxv 32+12, 32(5) + stxv 32+13, 48(5) + stxv 32+14, 64(5) + stxv 32+15, 80(5) + stxv 32+16, 96(5) + stxv 32+17, 112(5) +.endm + +.macro PermWriteL24 + xxlor 32+14, 10, 10 + xxlor 32+19, 11, 11 + xxlor 32+24, 12, 12 + xxlor 32+29, 13, 13 + vmrgew 10, 13, 14 + vmrgow 11, 13, 14 + vmrgew 12, 18, 19 + vmrgow 13, 18, 19 + vmrgew 14, 23, 24 + vmrgow 15, 23, 24 + vmrgew 16, 28, 29 + vmrgow 17, 28, 29 + stxv 32+10, 0(5) + stxv 32+11, 16(5) + stxv 32+12, 32(5) + stxv 32+13, 48(5) + stxv 32+14, 64(5) + stxv 32+15, 80(5) + stxv 32+16, 96(5) + stxv 32+17, 112(5) +.endm + +.macro INTT_REDUCE_L24 + Load_L24Coeffs + Compute_4Coeffs + BREDUCE_4X 4, 9, 13, 17 + xxlor 10, 32+4, 32+4 + xxlor 11, 32+9, 32+9 + xxlor 12, 32+13, 32+13 + xxlor 13, 32+17, 32+17 + Set_mont_consts + Load_next_4zetas + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 + PermWriteL24 +.endm + +.macro INTT_REDUCE_L44 + Load_L44Coeffs + Compute_4Coeffs + BREDUCE_4X 4, 9, 13, 17 + xxlor 10, 32+4, 32+4 + xxlor 11, 32+9, 32+9 + xxlor 12, 32+13, 32+13 + xxlor 13, 32+17, 32+17 + Set_mont_consts + Load_next_4zetas + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 + PermWriteL44 +.endm + +.macro INTT_REDUCE_4X start next step + Load_4Coeffs \start, \next, \step + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + Load_next_4zetas + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 +.endm + +# intt +# t = r[j]; +# r[j] = barrett_reduce(t + r[j + len]); +# r[j + len] = r[j + len] - t; +# r[j + len] = fqmul(zeta, r[j + len]); + +# +# mlk_intt_ppc(r) +# +.global MLK_ASM_NAMESPACE(intt_ppc) +.align 4 +MLK_ASM_FN_SYMBOL(intt_ppc) + + stdu 1, -352(1) + mflr 0 + std 14, 56(1) + std 15, 64(1) + std 16, 72(1) + std 17, 80(1) + std 18, 88(1) + std 19, 96(1) + std 20, 104(1) + std 21, 112(1) + stxv 32+20, 128(1) + stxv 32+21, 144(1) + stxv 32+22, 160(1) + stxv 32+23, 176(1) + stxv 32+24, 192(1) + stxv 32+25, 208(1) + stxv 32+26, 224(1) + stxv 32+27, 240(1) + stxv 32+28, 256(1) + stxv 32+29, 272(1) + stxv 32+30, 288(1) + stxv 32+31, 304(1) + + # init vectors and constants + # Setup for Montgomery reduce + lxv 0, 0(4) + + lxv 32+V_QINV, QINV_OFFSET(4) # QINV + xxlxor 32+3, 32+3, 32+3 + vspltish 4, 1 + xxlor 2, 32+2, 32+2 # QINV + xxlor 3, 32+3, 32+3 # 0 + xxlor 4, 32+4, 32+4 # 1 + + # Setup for Barrett reduce + lxv 6, Q_OFFSET(4) # V_MKQ + lxv 32+V20159, C20159_OFFSET(4) # V20159 + lxv 7, 0(4) # V_25 + + vspltisw 8, 13 + vadduwm 8, 8, 8 + xxlor 8, 32+8, 32+8 # V_26 store at vs8 + + vspltisw 9, 1 + vsubuwm 10, 8, 9 # 25 + vslw 9, 9, 10 + xxlor 7, 32+9, 32+9 # V_25 syore at vs7 + + # + # Montgomery reduce loops with constant 1441 + # + addi 14, 4, C1441_OFFSET + lvx V1441, 0, 14 + li 15, 4 # loops + mtctr 15 + + Set_mont_consts +intt_ppc__Loopf: + Reload_4coeffs + MREDUCE_4X V1441, V1441, V1441, V1441, 6, 7, 8, 9 + Reload_4coeffs + MREDUCE_4X V1441, V1441, V1441, V1441, 13, 18, 23, 28 + MWrite_8X 32+6, 32+7, 32+8, 32+9, 32+13, 32+18, 32+23, 32+28 + bdnz 
intt_ppc__Loopf + + addi 3, 3, -512 + +.align 4 + # + # 1. len = 2, start = 0, 4, 8, 12,...244, 248, 252 + # Update zetas vectors, each vector has 2 zetas + addi 14, 4, ZETA_INTT_OFFSET + li 7, 4 + li 15, 4 + mtctr 15 + mr 5, 3 +intt_ppc__Loop2: + INTT_REDUCE_L24 + addi 5, 5, 128 + bdnz intt_ppc__Loop2 + +.align 4 + # + # 2. len = 4, start = 0, 8, 16, 24,...232, 240, 248 + mr 5, 3 + li 7, 8 + li 15, 4 # loops + mtctr 15 +intt_ppc__Loop4: + INTT_REDUCE_L44 + addi 5, 5, 128 + bdnz intt_ppc__Loop4 + +.align 4 + # 3. len = 8, start = 0, 16, 32, 48,...208, 224, 240 + li 7, 16 + li 5, 0 + li 15, 4 # loops + mtctr 15 + +intt_ppc__Loop8: + INTT_REDUCE_4X 5, 32, 32 + addi 5, 5, 128 + bdnz intt_ppc__Loop8 + +.align 4 + # + # 4. len = 16, start = 0, 32, 64,,...160, 192, 224 + li 5, 0 + li 7, 32 + + INTT_REDUCE_4X 5, 64, 64 + + li 5, 16 + addi 14, 14, -64 + INTT_REDUCE_4X 5, 64, 64 + + li 5, 256 + INTT_REDUCE_4X 5, 64, 64 + + li 5, 272 + addi 14, 14, -64 + INTT_REDUCE_4X 5, 64, 64 + +.align 4 + # + # 5. len = 32, start = 0, 64, 128, 192 + li 5, 0 + li 7, 64 + + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + li 5, 128 + + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + li 5, 256 + + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + li 5, 384 + + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + +.align 4 + # + # 6. len = 64, start = 0, 128 + li 5, 0 + li 7, 128 + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + li 5, 64 + + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + lxv 32+10, -16(14) + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + li 5, 256 + + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + li 5, 320 + + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + lxv 32+10, -16(14) + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + +.align 4 + # 7. 
len = 128, start = 0 + # + li 5, 0 # start + li 7, 256 # len * 2 + + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + lvx V_ZETA, 0, 14 + xxlor 9, 32+10, 32+10 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + li 5, 64 + + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + xxlor 32+10, 9, 9 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + li 5, 128 + + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + xxlor 32+10, 9, 9 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + li 5, 192 + + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + xxlor 32+10, 9, 9 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + + lxv 32+20, 128(1) + lxv 32+21, 144(1) + lxv 32+22, 160(1) + lxv 32+23, 176(1) + lxv 32+24, 192(1) + lxv 32+25, 208(1) + lxv 32+26, 224(1) + lxv 32+27, 240(1) + lxv 32+28, 256(1) + lxv 32+29, 272(1) + lxv 32+30, 288(1) + lxv 32+31, 304(1) + ld 14, 56(1) + ld 15, 64(1) + ld 16, 72(1) + ld 16, 72(1) + ld 17, 80(1) + ld 18, 88(1) + ld 19, 96(1) + ld 20, 104(1) + ld 21, 112(1) + + mtlr 0 + addi 1, 1, 352 + blr + +/* To facilitate single-compilation-unit (SCU) builds, undefine all macros. + * Don't modify by hand -- this is auto-generated by scripts/autogen. */ +#undef V20159 +#undef V_25 +#undef V_26 +#undef V_MKQ +#undef V_QINV +#undef V_NMKQ +#undef V_Z0 +#undef V_Z1 +#undef V_Z2 +#undef V_Z3 +#undef V_ZETA +#undef V1441 + +#endif /* MLK_ARITH_BACKEND_PPC64LE_DEFAULT && \ + !MLK_CONFIG_MULTILEVEL_NO_SHARED */ diff --git a/mlkem/src/native/ppc64le/src/ntt_ppc.S b/mlkem/src/native/ppc64le/src/ntt_ppc.S new file mode 100644 index 000000000..70e7bf710 --- /dev/null +++ b/mlkem/src/native/ppc64le/src/ntt_ppc.S @@ -0,0 +1,468 @@ +/* + * Copyright (c) The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +# +# Copyright 2025- IBM Corp. 
+# +#=================================================================================== +# Written by Danny Tsen +# + +#include "../../../common.h" +#if defined(MLK_ARITH_BACKEND_PPC64LE_DEFAULT) && \ + !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED) + +#include "consts.h" + +#define V_QINV 2 +#define V_NMKQ 5 +#define V_Z0 7 +#define V_Z1 8 +#define V_Z2 9 +#define V_Z3 10 +#define V_ZETA 10 + +.machine "any" +.text + +.macro Load_4Coeffs start next step + mr 9, \start + add 10, 7, 9 # J + len*2 + addi 16, 9, \next + addi 17, 10, \step + addi 18, 16, \next + addi 19, 17, \step + addi 20, 18, \next + addi 21, 19, \step + lxvd2x 32+13, 3, 10 # r[j+len] + lxvd2x 32+18, 3, 17 # r[j+len] + lxvd2x 32+23, 3, 19 # r[j+len] + lxvd2x 32+28, 3, 21 # r[j+len] + xxpermdi 32+13, 32+13, 32+13, 2 + xxpermdi 32+18, 32+18, 32+18, 2 + xxpermdi 32+23, 32+23, 32+23, 2 + xxpermdi 32+28, 32+28, 32+28, 2 +.endm + +# +# Load Coeffients and setup vectors +# aj0, aj1, ajlen2, ajlen3, aj4, aj5, ajlen6, ajlen7 +# aj8, aj9, ajlen10, ajlen11, aj12, aj13, ajlen14, ajlen15 +# +# a[j]= aj0, aj1, aj8, aj9, aj4, aj5, aj12, aj13 +# a[j+len]= ajlen2, ajlen3, ajlen10, ajlen11, ajlen6, ajlen7, ajlen14, ajlen15 +# +.macro Load_L24Coeffs + lxv 32+25, 0(5) # a[j], r[j+len] + lxv 32+26, 16(5) # a[j], r[j+len] + vmrgew 13, 25, 26 + vmrgow 12, 25, 26 + lxv 32+25, 32(5) # a[j], r[j+len] + lxv 32+26, 48(5) # a[j], r[j+len] + vmrgew 18, 25, 26 + vmrgow 17, 25, 26 + lxv 32+25, 64(5) # a[j], r[j+len] + lxv 32+26, 80(5) # a[j], r[j+len] + vmrgew 23, 25, 26 + vmrgow 22, 25, 26 + lxv 32+25, 96(5) # a[j], r[j+len] + lxv 32+26, 112(5) # a[j], r[j+len] + vmrgew 28, 25, 26 + vmrgow 27, 25, 26 +.endm + +# +# Permute +# rj0, rj1, rj2, rj3, rjlen4, rjlen5, rjlen6, rjlen7 +# rj8, rj9, rj10, rj11, rjlen12, rjlen13, rjlen14, rjlen15 +# +# to +# rjlen4 - rjlen7, rjlen12 - rjlen15 +# rj0 - rj4, rj8 - rj11 +# +.macro Load_L44Coeffs + lxv 1, 0(5) # rj0, rj1, rj2, rj3, + # rjlen4, rjlen5, rjlen6, rjlen7 + lxv 2, 16(5) # rj8, rj9, rj10, rj11 + # rjlen12, rjlen13, rjlen14, rjlen15 + xxpermdi 32+13, 2, 1, 0 # rjlen4 - rjlen7, rjlen12 - rjlen15 + xxpermdi 32+12, 2, 1, 3 # rj0 - rj4, rj8 - rj11 + lxv 3, 32(5) + lxv 4, 48(5) + xxpermdi 32+18, 4, 3, 0 + xxpermdi 32+17, 4, 3, 3 + lxv 1, 64(5) + lxv 2, 80(5) + xxpermdi 32+23, 2, 1, 0 + xxpermdi 32+22, 2, 1, 3 + lxv 3, 96(5) + lxv 4, 112(5) + xxpermdi 32+28, 4, 3, 0 + xxpermdi 32+27, 4, 3, 3 +.endm + +# +# montgomery_reduce +# t = a * QINV +# t = (a - (int32_t)t*_MLKEM_Q) >> 16 +# +#----------------------------------- +# MREDUCE_4X(_vz0, _vz1, _vz2, _vz3) +# +.macro MREDUCE_4X _vz0 _vz1 _vz2 _vz3 + # fqmul = zeta * coefficient + # Modular multification bond by 2^16 * q in abs value + vmladduhm 15, 13, \_vz0, 3 + vmladduhm 20, 18, \_vz1, 3 + vmladduhm 25, 23, \_vz2, 3 + vmladduhm 30, 28, \_vz3, 3 + + # Signed multiply-high-round; outputs are bound by 2^15 * q in abs value + vmhraddshs 14, 13, \_vz0, 3 + vmhraddshs 19, 18, \_vz1, 3 + vmhraddshs 24, 23, \_vz2, 3 + vmhraddshs 29, 28, \_vz3, 3 + + vmladduhm 15, 15, V_QINV, 3 + vmladduhm 20, 20, V_QINV, 3 + vmladduhm 25, 25, V_QINV, 3 + vmladduhm 30, 30, V_QINV, 3 + + vmhraddshs 15, 15, V_NMKQ, 14 + vmhraddshs 20, 20, V_NMKQ, 19 + vmhraddshs 25, 25, V_NMKQ, 24 + vmhraddshs 30, 30, V_NMKQ, 29 + + vsrah 13, 15, 4 # >> 1 + vsrah 18, 20, 4 # >> 1 + vsrah 23, 25, 4 # >> 1 + vsrah 28, 30, 4 # >> 1 + +.endm + +.macro Load_4Aj + lxvd2x 32+12, 3, 9 # r[j] + lxvd2x 32+17, 3, 16 # r[j] + lxvd2x 32+22, 3, 18 # r[j] + lxvd2x 32+27, 3, 20 # r[j] + xxpermdi 32+12, 32+12, 32+12, 2 + xxpermdi 
32+17, 32+17, 32+17, 2 + xxpermdi 32+22, 32+22, 32+22, 2 + xxpermdi 32+27, 32+27, 32+27, 2 +.endm + +.macro Compute_4Coeffs + # Since the result of the Montgomery multiplication is bounded + # by q in absolute value. + # Finally to complete the final update of the results with add/sub + vsubuhm 16, 12, 13 # r - t + vadduhm 15, 13, 12 # r + t + vsubuhm 21, 17, 18 # r - t + vadduhm 20, 18, 17 # r + t + vsubuhm 26, 22, 23 # r - t + vadduhm 25, 23, 22 # r + t + vsubuhm 31, 27, 28 # r - t + vadduhm 30, 28, 27 # r + t +.endm + +.macro NTT_MREDUCE_4X start next step _vz0 _vz1 _vz2 _vz3 + Load_4Coeffs \start, \next, \step + MREDUCE_4x \_vz0, \_vz1, \_vz2, \_vz3 + Load_4Aj + Compute_4Coeffs +.endm + +.macro Write_One + stxvx 32+15, 3, 9 + stxvx 32+16, 3, 10 + stxvx 32+20, 3, 16 + stxvx 32+21, 3, 17 + stxvx 32+25, 3, 18 + stxvx 32+26, 3, 19 + stxvx 32+30, 3, 20 + stxvx 32+31, 3, 21 +.endm + +.macro PermWriteL44 + Compute_4Coeffs + xxpermdi 0, 32+16, 32+15, 3 + xxpermdi 1, 32+16, 32+15, 0 + xxpermdi 2, 32+21, 32+20, 3 + xxpermdi 3, 32+21, 32+20, 0 + xxpermdi 4, 32+26, 32+25, 3 + xxpermdi 5, 32+26, 32+25, 0 + xxpermdi 6, 32+31, 32+30, 3 + xxpermdi 7, 32+31, 32+30, 0 + stxv 0, 0(5) + stxv 1, 16(5) + stxv 2, 32(5) + stxv 3, 48(5) + stxv 4, 64(5) + stxv 5, 80(5) + stxv 6, 96(5) + stxv 7, 112(5) +.endm + +.macro PermWriteL24 + Compute_4Coeffs + vmrgew 10, 16, 15 + vmrgow 11, 16, 15 + vmrgew 12, 21, 20 + vmrgow 13, 21, 20 + vmrgew 14, 26, 25 + vmrgow 15, 26, 25 + vmrgew 16, 31, 30 + vmrgow 17, 31, 30 + stxv 32+10, 0(5) + stxv 32+11, 16(5) + stxv 32+12, 32(5) + stxv 32+13, 48(5) + stxv 32+14, 64(5) + stxv 32+15, 80(5) + stxv 32+16, 96(5) + stxv 32+17, 112(5) +.endm + +.macro Load_next_4zetas + lxv 32+V_Z0, 0(14) + lxv 32+V_Z1, 16(14) + lxv 32+V_Z2, 32(14) + lxv 32+V_Z3, 48(14) + addi 14, 14, 64 +.endm + +# +# mlk_ntt_ppc(int16_t *r) +# +.global MLK_ASM_NAMESPACE(ntt_ppc) +.align 4 +MLK_ASM_FN_SYMBOL(ntt_ppc) + + stdu 1, -352(1) + mflr 0 + std 14, 56(1) + std 15, 64(1) + std 16, 72(1) + std 17, 80(1) + std 18, 88(1) + std 19, 96(1) + std 20, 104(1) + std 21, 112(1) + stxv 32+20, 128(1) + stxv 32+21, 144(1) + stxv 32+22, 160(1) + stxv 32+23, 176(1) + stxv 32+24, 192(1) + stxv 32+25, 208(1) + stxv 32+26, 224(1) + stxv 32+27, 240(1) + stxv 32+28, 256(1) + stxv 32+29, 272(1) + stxv 32+30, 288(1) + stxv 32+31, 304(1) + + # get MLKEM_Q + lvx V_NMKQ,0,4 + + # zetas array + addi 14, 4, ZETA_NTT_OFFSET + + vxor 3, 3, 3 + vspltish 4, 1 + + lxv 32+V_QINV, QINV_OFFSET(4) + +.align 4 + # + # Compute coefficients of the NTT based on the following loop. + # for (len = 128; len ≥ 2; len = len/2) + # + # 1. len = 128, start = 0 + # + li 5, 0 # start + li 7, 256 # len * 2 + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + + NTT_MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + Write_One + li 5, 64 + NTT_MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + Write_One + li 5, 128 + NTT_MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + Write_One + li 5, 192 + NTT_MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + Write_One + +.align 4 + # + # 2. len = 64, start = 0, 128 + # k += 2 + li 5, 0 + li 7, 128 + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + NTT_MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + Write_One + li 5, 64 + NTT_MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + Write_One + li 5, 256 + + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + NTT_MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + Write_One + li 5, 320 + NTT_MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + Write_One + +.align 4 + # + # 3. 
len = 32, start = 0, 64, 128, 192 + # k += 4 + li 5, 0 + li 7, 64 + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + NTT_MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + Write_One + li 5, 128 + + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + NTT_MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + Write_One + li 5, 256 + + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + NTT_MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + Write_One + li 5, 384 + + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + NTT_MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + Write_One + +.align 4 + # + # 4. len = 16, start = 0, 32, 64,,...160, 192, 224 + # k += 8 + li 5, 0 + li 7, 32 + Load_next_4zetas + NTT_MREDUCE_4X 5, 64, 64, V_Z0, V_Z1, V_Z2, V_Z3 + Write_One + li 5, 16 + NTT_MREDUCE_4X 5, 64, 64, V_Z0, V_Z1, V_Z2, V_Z3 + Write_One + + Load_next_4zetas + li 5, 256 + NTT_MREDUCE_4X 5, 64, 64, V_Z0, V_Z1, V_Z2, V_Z3 + Write_One + li 5, 272 + NTT_MREDUCE_4X 5, 64, 64, V_Z0, V_Z1, V_Z2, V_Z3 + Write_One + +.align 4 + # + # 5. len = 8, start = 0, 16, 32, 48,...208, 224, 240 + # k += 16 + li 5, 0 + li 7, 16 + Load_next_4zetas + NTT_MREDUCE_4X 5, 32, 32, V_Z0, V_Z1, V_Z2, V_Z3 + Write_One + li 5, 128 + + Load_next_4zetas + NTT_MREDUCE_4X 5, 32, 32, V_Z0, V_Z1, V_Z2, V_Z3 + Write_One + li 5, 256 + + Load_next_4zetas + NTT_MREDUCE_4X 5, 32, 32, V_Z0, V_Z1, V_Z2, V_Z3 + Write_One + li 5, 384 + + Load_next_4zetas + NTT_MREDUCE_4X 5, 32, 32, V_Z0, V_Z1, V_Z2, V_Z3 + Write_One + + # + # 6. len = 4, start = 0, 8, 16, 24,...232, 240, 248 + # k += 32 + li 15, 4 # loops + mtctr 15 + mr 5, 3 + li 7, 8 +.align 4 +ntt_ppc__Len4: + Load_next_4zetas + Load_L44Coeffs + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3 + PermWriteL44 + addi 5, 5, 128 + + bdnz ntt_ppc__Len4 + + # + # 7. len = 2, start = 0, 4, 8, 12,...244, 248, 252 + # k += 64 + # Update zetas vectors, each vector has 2 zetas + + li 15, 4 + mtctr 15 + mr 5, 3 + li 7, 4 +.align 4 +ntt_ppc__Len2: + Load_next_4zetas + Load_L24Coeffs + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3 + PermWriteL24 + addi 5, 5, 128 + + bdnz ntt_ppc__Len2 + + lxv 32+20, 128(1) + lxv 32+21, 144(1) + lxv 32+22, 160(1) + lxv 32+23, 176(1) + lxv 32+24, 192(1) + lxv 32+25, 208(1) + lxv 32+26, 224(1) + lxv 32+27, 240(1) + lxv 32+28, 256(1) + lxv 32+29, 272(1) + lxv 32+30, 288(1) + lxv 32+31, 304(1) + ld 14, 56(1) + ld 15, 64(1) + ld 16, 72(1) + ld 16, 72(1) + ld 17, 80(1) + ld 18, 88(1) + ld 19, 96(1) + ld 20, 104(1) + ld 21, 112(1) + + mtlr 0 + addi 1, 1, 352 + blr + +/* To facilitate single-compilation-unit (SCU) builds, undefine all macros. + * Don't modify by hand -- this is auto-generated by scripts/autogen. */ +#undef V_QINV +#undef V_NMKQ +#undef V_ZETA + +#endif /* MLK_ARITH_BACKEND_PPC64LE_DEFAULT && \ + !MLK_CONFIG_MULTILEVEL_NO_SHARED */ diff --git a/mlkem/src/native/ppc64le/src/poly_tomont.S b/mlkem/src/native/ppc64le/src/poly_tomont.S new file mode 100644 index 000000000..eb770a631 --- /dev/null +++ b/mlkem/src/native/ppc64le/src/poly_tomont.S @@ -0,0 +1,165 @@ +/* + * Copyright (c) The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +# +# Copyright 2025- IBM Corp. 
+# +#=================================================================================== +# Written by Danny Tsen +# + +# Poly_tomont: Inplace conversion of all coefficients of a polynomial +# from normal domain to Montgomery domain +# +# Arguments:*r: pointer to input/output polynomial +# + +#include "../../../common.h" +#if defined(MLK_ARITH_BACKEND_PPC64LE_DEFAULT) && \ + !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED) + +#include "consts.h" + +#define V1353 0 +#define V_QINV 2 +#define V_NMKQ 5 + +.machine "any" +.text + +# +# montgomery_reduce +# t = a * QINV +# t = (a - (int32_t)t*_MLKEM_Q) >> 16 +# +#----------------------------------- +# MREDUCE_4X(_v0, _v1, _v2, _v3) +# +.macro MREDUCE_4X _v0 _v1 _v2 _v3 + lxvd2x 32+13, 0, 3 + addi 3, 3, 16 + lxvd2x 32+18, 0, 3 + addi 3, 3, 16 + lxvd2x 32+23, 0, 3 + addi 3, 3, 16 + lxvd2x 32+7, 0, 3 + addi 3, 3, 16 + + vmladduhm 15, 13, V1353, 3 + vmladduhm 20, 18, V1353, 3 + vmladduhm 25, 23, V1353, 3 + vmladduhm 9, 7, V1353, 3 + + vmhraddshs 14, 13, V1353, 3 + vmhraddshs 19, 18, V1353, 3 + vmhraddshs 24, 23, V1353, 3 + vmhraddshs 8, 7, V1353, 3 + + vmladduhm 15, 15, V_QINV, 3 + vmladduhm 20, 20, V_QINV, 3 + vmladduhm 25, 25, V_QINV, 3 + vmladduhm 9, 9, V_QINV, 3 + + vmhraddshs 15, 15, V_NMKQ, 14 + vmhraddshs 20, 20, V_NMKQ, 19 + vmhraddshs 25, 25, V_NMKQ, 24 + vmhraddshs 9, 9, V_NMKQ, 8 + + vsrah \_v0, 15, 4 # >> 1 + vsrah \_v1, 20, 4 # >> 1 + vsrah \_v2, 25, 4 # >> 1 + vsrah \_v3, 9, 4 # >> 1 +.endm + +.macro Write_8X + stxvd2x 32+27, 4, 3 + stxvd2x 32+28, 5, 3 + stxvd2x 32+29, 6, 3 + stxvd2x 32+30, 7, 3 + stxvd2x 32+13, 8, 3 + stxvd2x 32+18, 9, 3 + stxvd2x 32+23, 10, 3 + stxvd2x 32+7, 11, 3 +.endm + +.align 4 +.globl MLK_ASM_NAMESPACE(poly_tomont_ppc) +MLK_ASM_FN_SYMBOL(poly_tomont_ppc) + stdu 1, -320(1) + mflr 0 + + stxv 32+20, 128(1) + stxv 32+21, 144(1) + stxv 32+22, 160(1) + stxv 32+23, 176(1) + stxv 32+24, 192(1) + stxv 32+25, 208(1) + stxv 32+26, 224(1) + stxv 32+27, 240(1) + stxv 32+28, 256(1) + stxv 32+29, 272(1) + stxv 32+30, 288(1) + + lxv 32+V_NMKQ, NQ_OFFSET(4) + lxv 32+V_QINV, QINV_OFFSET(4) + lxv 32+V1353, C1353_OFFSET(4) + + vxor 3, 3, 3 + vspltish 4, 1 + + li 4, -128 + li 5, -112 + li 6, -96 + li 7, -80 + li 8, -64 + li 9, -48 + li 10, -32 + li 11, -16 + + MREDUCE_4X 27, 28, 29, 30 + MREDUCE_4X 13, 18, 23, 7 + Write_8X + + MREDUCE_4X 27, 28, 29, 30 + MREDUCE_4X 13, 18, 23, 7 + Write_8X + + MREDUCE_4X 27, 28, 29, 30 + MREDUCE_4X 13, 18, 23, 7 + Write_8X + + MREDUCE_4X 27, 28, 29, 30 + MREDUCE_4X 13, 18, 23, 7 + Write_8X + + lxv 32+20, 128(1) + lxv 32+21, 144(1) + lxv 32+22, 160(1) + lxv 32+23, 176(1) + lxv 32+24, 192(1) + lxv 32+25, 208(1) + lxv 32+26, 224(1) + lxv 32+27, 240(1) + lxv 32+28, 256(1) + lxv 32+29, 272(1) + lxv 32+30, 288(1) + mtlr 0 + addi 1, 1, 320 + blr + +/* To facilitate single-compilation-unit (SCU) builds, undefine all macros. + * Don't modify by hand -- this is auto-generated by scripts/autogen. */ +#undef V1353 +#undef V_QINV +#undef V_NMKQ + +#endif /* MLK_ARITH_BACKEND_PPC64LE_DEFAULT && \ + !MLK_CONFIG_MULTILEVEL_NO_SHARED */ + +/* To facilitate single-compilation-unit (SCU) builds, undefine all macros. + * Don't modify by hand -- this is auto-generated by scripts/autogen. 
*/ +#undef V1353 +#undef V_QINV +#undef V_NMKQ diff --git a/mlkem/src/native/ppc64le/src/reduce.S b/mlkem/src/native/ppc64le/src/reduce.S new file mode 100644 index 000000000..f9681c456 --- /dev/null +++ b/mlkem/src/native/ppc64le/src/reduce.S @@ -0,0 +1,228 @@ +/* + * Copyright (c) The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +# +# Copyright 2025- IBM Corp. +# +#=================================================================================== +# Written by Danny Tsen +# + +# +# poly_reduce: Applies Barrett reduction to all coefficients of a polynomial +# for details of the Barrett reduction +# +# Arguments: *r: pointer to input/output polynomial +# + +#include "../../../common.h" +#if defined(MLK_ARITH_BACKEND_PPC64LE_DEFAULT) && \ + !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED) + +#include "consts.h" + +# Barrett reduce constatnts +#define V20159 0 +#define V_25 1 +#define V_26 2 +#define V_MKQ 3 + +.machine "any" +.text + +.macro BREDUCE_4X _v0 _v1 _v2 _v3 + lxvd2x 32+8, 0, 3 + lxvd2x 32+12, 14, 3 + lxvd2x 32+16, 15, 3 + lxvd2x 32+20, 16, 3 + addi 3, 3, 64 + vmulosh 6, 8, V20159 + vmulesh 5, 8, V20159 + vmulosh 11, 12, V20159 + vmulesh 10, 12, V20159 + vmulosh 15, 16, V20159 + vmulesh 14, 16, V20159 + vmulosh 19, 20, V20159 + vmulesh 18, 20, V20159 + xxmrglw 32+4, 32+5, 32+6 + xxmrghw 32+5, 32+5, 32+6 + xxmrglw 32+9, 32+10, 32+11 + xxmrghw 32+10, 32+10, 32+11 + xxmrglw 32+13, 32+14, 32+15 + xxmrghw 32+14, 32+14, 32+15 + xxmrglw 32+17, 32+18, 32+19 + xxmrghw 32+18, 32+18, 32+19 + vadduwm 4, 4, V_25 + vadduwm 5, 5, V_25 + vadduwm 9, 9, V_25 + vadduwm 10, 10, V_25 + vadduwm 13, 13, V_25 + vadduwm 14, 14, V_25 + vadduwm 17, 17, V_25 + vadduwm 18, 18, V_25 + vsraw 4, 4, V_26 + vsraw 5, 5, V_26 + vsraw 9, 9, V_26 + vsraw 10, 10, V_26 + vsraw 13, 13, V_26 + vsraw 14, 14, V_26 + vsraw 17, 17, V_26 + vsraw 18, 18, V_26 + vpkuwum 4, 5, 4 + vsubuhm 4, 7, 4 + vpkuwum 9, 10, 9 + vsubuhm 9, 7, 9 + vpkuwum 13, 14, 13 + vsubuhm 13, 7, 13 + vpkuwum 17, 18, 17 + vsubuhm 17, 7, 17 + vmladduhm \_v0, 4, V_MKQ, 8 + vmladduhm \_v1, 9, V_MKQ, 12 + vmladduhm \_v2, 13, V_MKQ, 16 + vmladduhm \_v3, 17, V_MKQ, 20 +.endm + +.macro Write_8X + stxvd2x 32+21, 4, 3 + stxvd2x 32+22, 5, 3 + stxvd2x 32+23, 6, 3 + stxvd2x 32+24, 7, 3 + stxvd2x 32+4, 8, 3 + stxvd2x 32+9, 9, 3 + stxvd2x 32+13, 10, 3 + stxvd2x 32+17, 11, 3 +.endm + +# +# Conditional addition to get unsigned canonical representative +# +.macro To_unsigned_16 + lxv 32+12, 0(3) + lxv 32+13, 16(3) + lxv 32+14, 32(3) + lxv 32+15, 48(3) + addi 3, 3, 64 + vsrh 1, 12, 10 + vsrh 0, 13, 10 + vsrh 3, 14, 10 + vsrh 2, 15, 10 + vadduhm 7, 12, 11 + vadduhm 8, 13, 11 + vadduhm 5, 14, 11 + vadduhm 6, 15, 11 + vcmpequh 1, 1, 9 + vcmpequh 0, 0, 9 + vcmpequh 3, 3, 9 + vcmpequh 2, 2, 9 + xxsel 32+1, 32+7,32+12, 32+1 + xxsel 32+0, 32+8,32+13, 32+0 + xxsel 32+3, 32+5,32+14, 32+3 + xxsel 32+2, 32+6,32+15, 32+2 + stxv 32+3, -32(3) + stxv 32+2, -16(3) + stxv 32+1, -64(3) + stxv 32+0, -48(3) +.endm + +.align 4 +.globl MLK_ASM_NAMESPACE(reduce_ppc) +MLK_ASM_FN_SYMBOL(reduce_ppc) + stdu 1, -224(1) + mflr 0 + std 14, 96(1) + std 15, 104(1) + std 16, 112(1) + stxv 32+20, 128(1) + stxv 32+21, 144(1) + stxv 32+22, 160(1) + stxv 32+23, 176(1) + stxv 32+24, 192(1) + + vxor 7, 7, 7 + + lxv 32+V_MKQ, Q_OFFSET(4) + lxv 32+V20159, C20159_OFFSET(4) + + vspltisw V_26, 13 + vadduwm V_26, V_26, V_26 + vspltisw 4, 1 + vsubuwm 5, V_26, 4 + vslw V_25, 4, 5 + + li 4, -128 + li 5, -112 + li 6, -96 + li 7, -80 + li 8, -64 + li 9, -48 + li 10, -32 + li 11, -16 + 
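+	# Note (illustrative scalar form of the vectorized Barrett reduction
+	# performed by the BREDUCE_4X blocks below, using the constants
+	# 20159 (about 2^26 / 3329) and 1 << 25 from consts.c):
+	#   t = ((int32_t)20159 * a + (1 << 25)) >> 26;
+	#   r = a - t * 3329;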
+ li 14, 16 + li 15, 32 + li 16, 48 + + BREDUCE_4X 21, 22, 23, 24 + BREDUCE_4X 4, 9, 13, 17 + Write_8X + + BREDUCE_4X 21, 22, 23, 24 + BREDUCE_4X 4, 9, 13, 17 + Write_8X + + BREDUCE_4X 21, 22, 23, 24 + BREDUCE_4X 4, 9, 13, 17 + Write_8X + + BREDUCE_4X 21, 22, 23, 24 + BREDUCE_4X 4, 9, 13, 17 + Write_8X + + # + # To unsigned canonical + # +.align 4 + addi 3, 3, -512 + vxor 9, 9, 9 + vspltish 10, 15 + vmr 11, V_MKQ + + To_unsigned_16 + To_unsigned_16 + To_unsigned_16 + To_unsigned_16 + To_unsigned_16 + To_unsigned_16 + To_unsigned_16 + To_unsigned_16 + + ld 14, 96(1) + ld 15, 104(1) + ld 16, 112(1) + lxv 32+20, 128(1) + lxv 32+21, 144(1) + lxv 32+22, 160(1) + lxv 32+23, 176(1) + lxv 32+24, 192(1) + mtlr 0 + addi 1, 1, 224 + blr + +/* To facilitate single-compilation-unit (SCU) builds, undefine all macros. + * Don't modify by hand -- this is auto-generated by scripts/autogen. */ +#undef V20159 +#undef V_25 +#undef V_26 +#undef V_MKQ + +#endif /* MLK_ARITH_BACKEND_PPC64LE_DEFAULT && \ + !MLK_CONFIG_MULTILEVEL_NO_SHARED */ + +/* To facilitate single-compilation-unit (SCU) builds, undefine all macros. + * Don't modify by hand -- this is auto-generated by scripts/autogen. */ +#undef V20159 +#undef V_25 +#undef V_26 +#undef V_MKQ diff --git a/test/mk/components.mk b/test/mk/components.mk index cdcc3eb5d..88158f703 100644 --- a/test/mk/components.mk +++ b/test/mk/components.mk @@ -8,6 +8,7 @@ endif SOURCES += $(wildcard mlkem/src/*.c) ifeq ($(OPT),1) SOURCES += $(wildcard mlkem/src/native/aarch64/src/*.[csS]) $(wildcard mlkem/src/native/x86_64/src/*.[csS]) + SOURCES += $(wildcard mlkem/src/native/ppc64le/src/*.[csS]) CFLAGS += -DMLK_CONFIG_USE_NATIVE_BACKEND_ARITH -DMLK_CONFIG_USE_NATIVE_BACKEND_FIPS202 endif